first commit

2026-01-18 20:37:50 +08:00
commit fff9f18287
123 changed files with 1385491 additions and 0 deletions

.clang-format Normal file

@@ -0,0 +1,78 @@
---
AccessModifierOffset: -4
AlignAfterOpenBracket: DontAlign
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlinesLeft: false
AlignOperands: false
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: Empty
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BasedOnStyle: None
BinPackArguments: true
BinPackParameters: true
BreakBeforeBinaryOperators: All
BreakBeforeBraces: Allman
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: true
ColumnLimit: 120
CommentPragmas: '^ IWYU pragma:'
ConstructorInitializerAllOnOneLineOrOnePerLine: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: false
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
IncludeBlocks: Preserve
IncludeCategories:
- Regex: '^"(llvm|llvm-c|clang|clang-c)/'
Priority: 2
- Regex: '^(<|"(gtest|isl|json)/)'
Priority: 3
- Regex: '.*'
Priority: 1
IndentCaseLabels: false
IndentWidth: 4
IndentWrappedFunctionNames: false
KeepEmptyLinesAtTheStartOfBlocks: true
Language: Cpp
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBlockIndentWidth: 4
ObjCSpaceAfterProperty: true
ObjCSpaceBeforeProtocolList: true
PenaltyBreakBeforeFirstCallParameter: 19
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 60
PointerAlignment: Left
PointerBindsToType: false
ReflowComments: true
SortIncludes: true
SpaceAfterCStyleCast: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: false
SpacesInCStyleCastParentheses: false
SpacesInContainerLiterals: true
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Cpp11
TabWidth: 4
UseTab: Never

.gitignore vendored Normal file

@@ -0,0 +1,10 @@
out
*.swp
src/*.o
bin/HPCG-Benchmark_3*.txt
bin/xhpcg
bin/xhpcg-cpu
bin/hpcg20*.txt
.DS_Store
bin/
build/

CONTRIBUTING.md Normal file

@@ -0,0 +1,120 @@
## NVIDIA HPCG Contribution Rules
#### Issue Tracking
* All enhancement, bugfix, or change requests must begin with the creation of a [NVIDIA HPCG Issue Request](https://github.com/NVIDIA/nvidia-hpcg/issues).
* The issue request must be reviewed by NVIDIA HPCG engineers and approved prior to code review.
#### Coding Guidelines
- Please follow the existing conventions in the relevant file, submodule, module, and project when you add new code or when you extend/fix existing functionality.
- To maintain consistency in code formatting and style, you should also run `clang-format` on the modified sources with the provided configuration file. This applies NVIDIA HPCG code formatting rules to:
- class, function/method, and variable/field naming
- comment style
- indentation
- line length
- Format git changes:
```bash
# Commit ID is optional - if unspecified, run format on staged changes.
git-clang-format --style file [commit ID/reference]
```
- Format individual source files:
```bash
# -style=file : Obtain the formatting rules from .clang-format
# -i : In-place modification of the processed file
clang-format -style=file -i -fallback-style=none <file(s) to process>
```
- Format entire codebase (for project maintainers only):
```bash
find samples plugin \( -iname '*.h' -o -iname '*.c' -o -iname '*.cpp' -o -iname '*.hpp' \) \
    | xargs clang-format -style=file -i -fallback-style=none
```
- Try to keep pull requests (PRs) as concise as possible:
- Avoid committing commented-out code.
- Wherever possible, each PR should address a single concern. If there are several otherwise-unrelated things that should be fixed to reach a desired endpoint, our recommendation is to open several PRs and indicate the dependencies in the description. The more complex the changes are in a single PR, the more time it will take to review those changes.
- Write commit titles using imperative mood and [these rules](https://chris.beams.io/posts/git-commit/), and reference the Issue number corresponding to the PR. Following is the recommended format for commit texts:
```
#<Issue Number> - <Commit Title>
<Commit Body>
```
- Ensure that the build log is clean, meaning no warnings or errors should be present.
- All OSS components must contain accompanying documentation (READMEs) describing the functionality, dependencies, and known issues.
- Make sure that you can contribute your work to open source (no license and/or patent conflict is introduced by your code). You will need to [`sign`](#signing-your-work) your commit.
- Thanks in advance for your patience as we review your contributions; we do appreciate them!
#### Pull Requests
Developer workflow for code contributions is as follows:
1. Developers must first [fork](https://help.github.com/en/articles/fork-a-repo) the [upstream](https://github.com/NVIDIA/nvidia-hpcg.git) NVIDIA-HPCG repository.
2. Git clone the forked repository and push changes to the personal fork.
```bash
git clone https://github.com/YOUR_USERNAME/YOUR_FORK.git NVIDIA-HPCG
# Checkout the targeted branch and commit changes
# Push the commits to a branch on the fork (remote).
git push -u origin <local-branch>:<remote-branch>
```
3. Once the code changes are staged on the fork and ready for review, a [Pull Request](https://help.github.com/en/articles/about-pull-requests) (PR) can be [requested](https://help.github.com/en/articles/creating-a-pull-request) to merge the changes from a branch of the fork into a selected branch of upstream.
* Exercise caution when selecting the source and target branches for the PR.
* Creating a PR kicks off the code review process.
* At least one NVIDIA-HPCG engineer will be assigned for the review.
* While under review, mark your PRs as work-in-progress by prefixing the PR title with [WIP].
4. Since there is no CI/CD process in place yet, the PR will be accepted and the corresponding issue closed only after adequate testing has been completed manually by the developer and/or the NVIDIA-HPCG engineer reviewing the code.
#### Signing Your Work
* We require that all contributors "sign-off" on their commits. This certifies that the contribution is your original work, or you have rights to submit it under the same license, or a compatible license.
* Any contribution which contains commits that are not Signed-Off will not be accepted.
* To sign off on a commit you simply use the `--signoff` (or `-s`) option when committing your changes:
```bash
$ git commit -s -m "Add cool feature."
```
This will append the following to your commit message:
```
Signed-off-by: Your Name <your@email.com>
```
* Full text of the DCO:
```
Developer Certificate of Origin
Version 1.1
Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.
```
```
Developer's Certificate of Origin 1.1
By making a contribution to this project, I certify that:
(a) The contribution was created in whole or in part by me and I have the right to submit it under the open source license indicated in the file; or
(b) The contribution is based upon previous work that, to the best of my knowledge, is covered under an appropriate open source license and I have the right under that license to submit that work with modifications, whether created in whole or in part by me, under the same open source license (unless I am permitted to submit under a different license), as indicated in the file; or
(c) The contribution was provided directly to me by some other person who certified (a), (b) or (c) and I have not modified it.
(d) I understand and agree that this project and the contribution are public and that a record of the contribution (including all personal information I submit with it, including my sign-off) is maintained indefinitely and may be redistributed consistent with this project or the open source license(s) involved.
```

COPYING Normal file

@@ -0,0 +1,46 @@
======================================================================
-- High Performance Conjugate Gradients (HPCG) Benchmark
HPCG - 3.1 - March 28, 2019
Michael A. Heroux
Scalable Algorithms Group, Center for Computing Research
Sandia National Laboratories, Albuquerque, NM
Piotr Luszczek
Jack Dongarra
University of Tennessee, Knoxville
Innovative Computing Laboratory
(C) Copyright 2013-2019 All Rights Reserved
-- Copyright notice and Licensing terms:
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. The name of the University, the name of the Laboratory, or the
names of its contributors may not be used to endorse or promote
products derived from this software without specific written
permission.
-- Disclaimer:
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
======================================================================

COPYRIGHT Normal file

@@ -0,0 +1,63 @@
======================================================================
Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
======================================================================
======================================================================
-- High Performance Conjugate Gradients (HPCG) Benchmark
HPCG - 3.1 - March 28, 2019
Michael A. Heroux
Scalable Algorithms Group, Center for Computing Research
Sandia National Laboratories, Albuquerque, NM
Piotr Luszczek
Jack Dongarra
University of Tennessee, Knoxville
Innovative Computing Laboratory
(C) Copyright 2013-2019 All Rights Reserved
-- Copyright notice and Licensing terms:
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. The name of the University, the name of the Laboratory, or the
names of its contributors may not be used to endorse or promote
products derived from this software without specific written
permission.
-- Disclaimer:
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
======================================================================

DASP/LICENSE Normal file

@@ -0,0 +1,661 @@
GNU AFFERO GENERAL PUBLIC LICENSE
Version 3, 19 November 2007
Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The GNU Affero General Public License is a free, copyleft license for
software and other kinds of works, specifically designed to ensure
cooperation with the community in the case of network server software.
The licenses for most software and other practical works are designed
to take away your freedom to share and change the works. By contrast,
our General Public Licenses are intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.
Developers that use our General Public Licenses protect your rights
with two steps: (1) assert copyright on the software, and (2) offer
you this License which gives you legal permission to copy, distribute
and/or modify the software.
A secondary benefit of defending all users' freedom is that
improvements made in alternate versions of the program, if they
receive widespread use, become available for other developers to
incorporate. Many developers of free software are heartened and
encouraged by the resulting cooperation. However, in the case of
software used on network servers, this result may fail to come about.
The GNU General Public License permits making a modified version and
letting the public access it on a server without ever releasing its
source code to the public.
The GNU Affero General Public License is designed specifically to
ensure that, in such cases, the modified source code becomes available
to the community. It requires the operator of a network server to
provide the source code of the modified version running there to the
users of that server. Therefore, public use of a modified version, on
a publicly accessible server, gives the public access to the source
code of the modified version.
An older license, called the Affero General Public License and
published by Affero, was designed to accomplish similar goals. This is
a different license, not a version of the Affero GPL, but Affero has
released a new version of the Affero GPL which permits relicensing under
this license.
The precise terms and conditions for copying, distribution and
modification follow.
TERMS AND CONDITIONS
0. Definitions.
"This License" refers to version 3 of the GNU Affero General Public License.
"Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.
"The Program" refers to any copyrightable work licensed under this
License. Each licensee is addressed as "you". "Licensees" and
"recipients" may be individuals or organizations.
To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy. The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.
A "covered work" means either the unmodified Program or a work based
on the Program.
To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy. Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.
To "convey" a work means any kind of propagation that enables other
parties to make or receive copies. Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.
An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License. If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.
1. Source Code.
The "source code" for a work means the preferred form of the work
for making modifications to it. "Object code" means any non-source
form of a work.
A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.
The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form. A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.
The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities. However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work. For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.
The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.
The Corresponding Source for a work in source code form is that
same work.
2. Basic Permissions.
All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met. This License explicitly affirms your unlimited
permission to run the unmodified Program. The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work. This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.
You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force. You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright. Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.
Conveying under any other circumstances is permitted solely under
the conditions stated below. Sublicensing is not allowed; section 10
makes it unnecessary.
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.
When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.
4. Conveying Verbatim Copies.
You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.
You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.
5. Conveying Modified Source Versions.
You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:
a) The work must carry prominent notices stating that you modified
it, and giving a relevant date.
b) The work must carry prominent notices stating that it is
released under this License and any conditions added under section
7. This requirement modifies the requirement in section 4 to
"keep intact all notices".
c) You must license the entire work, as a whole, under this
License to anyone who comes into possession of a copy. This
License will therefore apply, along with any applicable section 7
additional terms, to the whole of the work, and all its parts,
regardless of how they are packaged. This License gives no
permission to license the work in any other way, but it does not
invalidate such permission if you have separately received it.
d) If the work has interactive user interfaces, each must display
Appropriate Legal Notices; however, if the Program has interactive
interfaces that do not display Appropriate Legal Notices, your
work need not make them do so.
A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit. Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.
6. Conveying Non-Source Forms.
You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:
a) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by the
Corresponding Source fixed on a durable physical medium
customarily used for software interchange.
b) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by a
written offer, valid for at least three years and valid for as
long as you offer spare parts or customer support for that product
model, to give anyone who possesses the object code either (1) a
copy of the Corresponding Source for all the software in the
product that is covered by this License, on a durable physical
medium customarily used for software interchange, for a price no
more than your reasonable cost of physically performing this
conveying of source, or (2) access to copy the
Corresponding Source from a network server at no charge.
c) Convey individual copies of the object code with a copy of the
written offer to provide the Corresponding Source. This
alternative is allowed only occasionally and noncommercially, and
only if you received the object code with such an offer, in accord
with subsection 6b.
d) Convey the object code by offering access from a designated
place (gratis or for a charge), and offer equivalent access to the
Corresponding Source in the same way through the same place at no
further charge. You need not require recipients to copy the
Corresponding Source along with the object code. If the place to
copy the object code is a network server, the Corresponding Source
may be on a different server (operated by you or a third party)
that supports equivalent copying facilities, provided you maintain
clear directions next to the object code saying where to find the
Corresponding Source. Regardless of what server hosts the
Corresponding Source, you remain obligated to ensure that it is
available for as long as needed to satisfy these requirements.
e) Convey the object code using peer-to-peer transmission, provided
you inform other peers where the object code and Corresponding
Source of the work are being offered to the general public at no
charge under subsection 6d.
A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.
A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling. In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage. For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product. A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.
"Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source. The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.
If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information. But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).
The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed. Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.
Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.
7. Additional Terms.
"Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law. If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.
When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it. (Additional permissions may be written to require their own
removal in certain cases when you modify the work.) You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.
Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:
a) Disclaiming warranty or limiting liability differently from the
terms of sections 15 and 16 of this License; or
b) Requiring preservation of specified reasonable legal notices or
author attributions in that material or in the Appropriate Legal
Notices displayed by works containing it; or
c) Prohibiting misrepresentation of the origin of that material, or
requiring that modified versions of such material be marked in
reasonable ways as different from the original version; or
d) Limiting the use for publicity purposes of names of licensors or
authors of the material; or
e) Declining to grant rights under trademark law for use of some
trade names, trademarks, or service marks; or
f) Requiring indemnification of licensors and authors of that
material by anyone who conveys the material (or modified versions of
it) with contractual assumptions of liability to the recipient, for
any liability that these contractual assumptions directly impose on
those licensors and authors.
All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10. If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term. If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.
If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.
Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.
8. Termination.
You may not propagate or modify a covered work except as expressly
provided under this License. Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).
However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.
Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.
Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License. If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.
9. Acceptance Not Required for Having Copies.
You are not required to accept this License in order to receive or
run a copy of the Program. Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance. However,
nothing other than this License grants you permission to propagate or
modify any covered work. These actions infringe copyright if you do
not accept this License. Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.
10. Automatic Licensing of Downstream Recipients.
Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License. You are not responsible
for enforcing compliance by third parties with this License.
An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations. If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.
You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License. For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.
11. Patents.
A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based. The
work thus licensed is called the contributor's "contributor version".
A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version. For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.
Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.
In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement). To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.
If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients. "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.
If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.
A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License. You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.
Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.
12. No Surrender of Others' Freedom.
If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all. For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.
13. Remote Network Interaction; Use with the GNU General Public License.
Notwithstanding any other provision of this License, if you modify the
Program, your modified version must prominently offer all users
interacting with it remotely through a computer network (if your version
supports such interaction) an opportunity to receive the Corresponding
Source of your version by providing access to the Corresponding Source
from a network server at no charge, through some standard or customary
means of facilitating copying of software. This Corresponding Source
shall include the Corresponding Source for any work covered by version 3
of the GNU General Public License that is incorporated pursuant to the
following paragraph.
Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU General Public License into a single
combined work, and to convey the resulting work. The terms of this
License will continue to apply to the part which is the covered work,
but the work with which it is combined will remain governed by version
3 of the GNU General Public License.
14. Revised Versions of this License.
The Free Software Foundation may publish revised and/or new versions of
the GNU Affero General Public License from time to time. Such new versions
will be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the
Program specifies that a certain numbered version of the GNU Affero General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation. If the Program does not specify a version number of the
GNU Affero General Public License, you may choose any version ever published
by the Free Software Foundation.
If the Program specifies that a proxy can decide which future
versions of the GNU Affero General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.
Later license versions may give you additional or different
permissions. However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.
15. Disclaimer of Warranty.
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. Limitation of Liability.
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.
17. Interpretation of Sections 15 and 16.
If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
<one line to give the program's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
If your software can interact with users remotely through a computer
network, you should also make sure that it provides a way for users to
get its source. For example, if your program is a web application, its
interface could display a "Source" link that leads users to an archive
of the code. There are many ways you could offer source, and different
solutions will be better for different programs; see section 13 for the
specific requirements.
You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU AGPL, see
<https://www.gnu.org/licenses/>.
DASP/Makefile Normal file
@@ -0,0 +1,24 @@
#compilers
CC=/usr/local/cuda-12.0/bin/nvcc
NVCC_FLAGS = -O3 -ccbin /usr/local/gcc-12.2/bin -m64 -gencode arch=compute_80,code=sm_80
# #ENVIRONMENT_PARAMETERS
# CUDA_INSTALL_PATH = /usr/local/cuda-12.0
CUDA_LIBS = -lcusparse -lcublas
LIBS = -lineinfo $(CUDA_LIBS)
#options
OPTIONS = -Xcompiler -fopenmp-simd
double:
	$(CC) $(NVCC_FLAGS) src/main_f64.cu -o spmv_double -D f64 $(OPTIONS) $(LIBS)
half:
	$(CC) $(NVCC_FLAGS) src/main_f16.cu -o spmv_half $(OPTIONS) $(LIBS)
clean:
	rm -rf spmv_double
	rm -rf spmv_half
	rm -f data/*.csv
DASP/README.md Normal file
@@ -0,0 +1,43 @@
# DASP
Specific Dense Matrix Multiply-Accumulate Units Accelerated General Sparse Matrix-Vector Multiplication
## Paper
This is the code for our paper published at SC '23:
Yuechen Lu and Weifeng Liu. 2023. DASP: Specific Dense Matrix Multiply-Accumulate Units Accelerated General Sparse Matrix-Vector Multiplication. In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (SC '23). Association for Computing Machinery, New York, NY, USA, Article 73, 1–14. https://doi.org/10.1145/3581784.3607051
## Introduction
Sparse matrix-vector multiplication (SpMV) plays a key role in computational science and engineering, graph processing and machine learning applications. In this work, we propose DASP, a new algorithm using specific dense MMA units for accelerating the compute part of general SpMV. We analyze the row-wise distribution of nonzeros and group the rows into three categories containing long, medium and short rows, respectively. We then organize them into small blocks of proper sizes to meet the requirement of MMA computation. For the three categories, DASP offers different strategies to complete SpMV by efficiently utilizing the MMA units.
## Installation
To better reproduce the experiment results, we suggest an NVIDIA GPU with compute capability 8.0. DASP evaluation requires the CUDA GPU driver, the nvcc CUDA compiler, and the cuSPARSE library, all of which are included in the CUDA Toolkit.
## Execution
Our test programs currently support input files encoded using the matrix market format. All matrix market datasets used in this evaluation are publicly available from the SuiteSparse Matrix Collection.
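For reference, a minimal Matrix Market file in coordinate format looks like the following: the header line names the format, the first data line gives the row count, column count, and number of nonzeros, and each entry line is `row col value` with 1-based indices.

```
%%MatrixMarket matrix coordinate real general
3 3 4
1 1 1.0
2 2 2.0
3 1 3.0
3 3 4.0
```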
1. The command `make <target>` generates an executable file.
`make double`
`make half`
2. Run the code on matrix data. The program takes one parameter: the matrix path.
`./spmv_double matrix.mtx`
3. Example:
`cd test`
`sh run_double.sh`
## Contact us
If you have any questions about running the code, please contact Yuechen Lu.
E-mail: yuechenlu@student.cup.edu.cn
DASP/data/record.csv Normal file
@@ -0,0 +1,2 @@
#
1 #
DASP/src/common.h Normal file
@@ -0,0 +1,60 @@
#include <cuda.h>
#include <cuda_fp16.h>
#include <mma.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <sys/time.h>
#include <math.h>
// #include <helper_cuda.h>
// #include <helper_functions.h>
#include <cusparse.h>
#include <cublas_v2.h>
#include "omp.h"
#include "mmio_highlevel.h"
#ifdef f64
#define MAT_VAL_TYPE double
#else
#define MAT_VAL_TYPE half
#endif
#define WARP_SIZE 32
#define BlockSize 8
#define MMA_M 8
#define MMA_N 8
#define MMA_K 4
#define MAT_PTR_TYPE int
#define NEW_CID_TYPE int
#define GET_BIT_REST(x) (((unsigned int)(x) << 2) >> 2)
#define SET_16_BIT(dst, src, index)                   \
    do {                                              \
        (dst) &= ~(0xffffu << ((index) << 4));        \
        (dst) |= ((unsigned)(src) << ((index) << 4)); \
    } while (0)
#define SET_8_BIT(dst, src, index)                    \
    do {                                              \
        (dst) &= ~(0xffu << ((index) << 3));          \
        (dst) |= ((unsigned)(src) << ((index) << 3)); \
    } while (0)
#define SET_4_BIT(dst, src, index)                    \
    do {                                              \
        (dst) &= ~(0xfu << ((index) << 2));           \
        (dst) |= ((unsigned)(src) << ((index) << 2)); \
    } while (0)
#define SET_2_BIT(dst, src) dst |= src << 30
#define GET_16_BIT(src, index) ((src >> (index << 4)) & 0xffff)
#define GET_8_BIT(src, index) ((src >> (index << 3)) & 0xff)
#define GET_4_BIT(src, index) ((src >> (index << 2)) & 0xf)
#define GET_2_BIT(src) ((src >> 30) & 0b11)
#define omp_valve 1e4
DASP/src/dasp_f16.h Normal file
File diff suppressed because it is too large
DASP/src/dasp_f64.h Normal file
File diff suppressed because it is too large
DASP/src/main_f16.cu Normal file
@@ -0,0 +1,164 @@
#include "dasp_f16.h"
int verify_new(MAT_VAL_TYPE *cusp_val, MAT_VAL_TYPE *cuda_val, int *new_order, int length)
{
for (int i = 0; i < length; i++)
{
int cusp_idx = new_order[i];
float temp_cusp_val = cusp_val[cusp_idx];
float temp_cuda_val = cuda_val[i];
if (fabs(temp_cusp_val - temp_cuda_val) > 1)
{
printf("error in (%d), cusp(%4.2f), cuda(%4.2f), please check your code!\n", i, temp_cusp_val, temp_cuda_val);
return -1;
}
}
printf("Y(%d), compute succeed!\n", length);
return 0;
}
__host__
void cusparse_spmv_all(MAT_VAL_TYPE *cu_ValA, MAT_PTR_TYPE *cu_RowPtrA, int *cu_ColIdxA,
MAT_VAL_TYPE *cu_ValX, MAT_VAL_TYPE *cu_ValY, int rowA, int colA, MAT_PTR_TYPE nnzA,
long long int data_origin1, long long int data_origin2, double *pre_time, double *cu_time, double *cu_gflops, double *cu_bandwidth1, double *cu_bandwidth2)
{
struct timeval t1, t2;
MAT_VAL_TYPE *dA_val, *dX, *dY;
int *dA_cid;
MAT_PTR_TYPE *dA_rpt;
float alpha = 1.0, beta = 0.0;
cudaMalloc((void **)&dA_val, sizeof(MAT_VAL_TYPE) * nnzA);
cudaMalloc((void **)&dA_cid, sizeof(int) * nnzA);
cudaMalloc((void **)&dA_rpt, sizeof(MAT_PTR_TYPE) * (rowA + 1));
cudaMalloc((void **)&dX, sizeof(MAT_VAL_TYPE) * colA);
cudaMalloc((void **)&dY, sizeof(MAT_VAL_TYPE) * rowA);
cudaMemcpy(dA_val, cu_ValA, sizeof(MAT_VAL_TYPE) * nnzA, cudaMemcpyHostToDevice);
cudaMemcpy(dA_cid, cu_ColIdxA, sizeof(int) * nnzA, cudaMemcpyHostToDevice);
cudaMemcpy(dA_rpt, cu_RowPtrA, sizeof(MAT_PTR_TYPE) * (rowA + 1), cudaMemcpyHostToDevice);
cudaMemcpy(dX, cu_ValX, sizeof(MAT_VAL_TYPE) * colA, cudaMemcpyHostToDevice);
// cudaMemset(dY, 0.0, sizeof(MAT_VAL_TYPE) * rowA);
cusparseHandle_t handle = NULL;
cusparseSpMatDescr_t matA;
cusparseDnVecDescr_t vecX, vecY;
void* dBuffer = NULL;
size_t bufferSize = 0;
gettimeofday(&t1, NULL);
cusparseCreate(&handle);
cusparseCreateCsr(&matA, rowA, colA, nnzA, dA_rpt, dA_cid, dA_val,
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_BASE_ZERO, CUDA_R_16F);
cusparseCreateDnVec(&vecX, colA, dX, CUDA_R_16F);
cusparseCreateDnVec(&vecY, rowA, dY, CUDA_R_16F);
cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
&alpha, matA, vecX, &beta, vecY, CUDA_R_32F,
CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize);
cudaMalloc(&dBuffer, bufferSize);
gettimeofday(&t2, NULL);
*pre_time = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0;
// printf("cusparse preprocessing time: %8.4lf ms\n", *pre_time);
for (int i = 0; i < 100; ++i)
{
cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
&alpha, matA, vecX, &beta, vecY, CUDA_R_32F,
CUSPARSE_SPMV_ALG_DEFAULT, dBuffer);
}
cudaDeviceSynchronize();
gettimeofday(&t1, NULL);
for (int i = 0; i < 1000; ++i)
{
cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
&alpha, matA, vecX, &beta, vecY, CUDA_R_32F,
CUSPARSE_SPMV_ALG_DEFAULT, dBuffer);
}
cudaDeviceSynchronize();
gettimeofday(&t2, NULL);
*cu_time = ((t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0) / 1000;
*cu_gflops = (double)((long)nnzA * 2) / (*cu_time * 1e6);
*cu_bandwidth1 = (double)data_origin1 / (*cu_time * 1e6);
*cu_bandwidth2 = (double)data_origin2 / (*cu_time * 1e6);
printf("cusparse:%8.4lf ms, %8.4lf Gflop/s, %9.4lf GB/s, %9.4lf GB/s\n", *cu_time, *cu_gflops, *cu_bandwidth1, *cu_bandwidth2);
cusparseDestroySpMat(matA);
cusparseDestroyDnVec(vecX);
cusparseDestroyDnVec(vecY);
cusparseDestroy(handle);
cudaFree(dBuffer); // release the cusparse workspace buffer
cudaMemcpy(cu_ValY, dY, sizeof(MAT_VAL_TYPE) * rowA, cudaMemcpyDeviceToHost);
cudaFree(dA_val);
cudaFree(dA_cid);
cudaFree(dA_rpt);
cudaFree(dX);
cudaFree(dY);
}
__host__
int main(int argc, char **argv)
{
if (argc < 2)
{
printf("Run the code by './spmv_half matrix.mtx'.\n");
return 0;
}
// struct timeval t1, t2;
int rowA, colA;
MAT_PTR_TYPE nnzA;
int isSymmetricA;
// float *csrValA_f32;
MAT_VAL_TYPE *csrValA;
int *csrColIdxA;
MAT_PTR_TYPE *csrRowPtrA;
char *filename;
filename = argv[1];
// int NUM = atoi(argv[2]);
// int block_longest = atoi(argv[3]);
int NUM = 4;
int block_longest = 256;
double threshold = 0.75;
printf("\n===%s===\n\n", filename);
mmio_allinone(&rowA, &colA, &nnzA, &isSymmetricA, &csrRowPtrA, &csrColIdxA, &csrValA, filename);
MAT_VAL_TYPE *X_val = (MAT_VAL_TYPE *)malloc(sizeof(MAT_VAL_TYPE) * colA);
initVec(X_val, colA);
initVec(csrValA, nnzA);
MAT_VAL_TYPE *dY_val = (MAT_VAL_TYPE *)malloc(sizeof(MAT_VAL_TYPE) * rowA);
MAT_VAL_TYPE *Y_val = (MAT_VAL_TYPE *)malloc(sizeof(MAT_VAL_TYPE) * rowA);
int *new_order = (int *)malloc(sizeof(int) * rowA);
double pre_time = 0, cu_time = 0, cu_gflops = 0, cu_bandwidth1 = 0, cu_bandwidth2 = 0;
long long int data_origin1 = (nnzA + colA + rowA) * sizeof(MAT_VAL_TYPE) + nnzA * sizeof(int) + (rowA + 1) * sizeof(MAT_PTR_TYPE);
long long int data_origin2 = (nnzA + nnzA + rowA) * sizeof(MAT_VAL_TYPE) + nnzA * sizeof(int) + (rowA + 1) * sizeof(MAT_PTR_TYPE);
cusparse_spmv_all(csrValA, csrRowPtrA, csrColIdxA, X_val, dY_val, rowA, colA, nnzA, data_origin1, data_origin2, &pre_time, &cu_time, &cu_gflops, &cu_bandwidth1, &cu_bandwidth2);
spmv_all(filename, csrValA, csrRowPtrA, csrColIdxA, X_val, Y_val, new_order, rowA, colA, nnzA, NUM, threshold, block_longest);
FILE* fout;
fout = fopen("data/spmv_f16_record.csv", "a");
fprintf(fout, "%lld,%lf,%lf,%lf,%lf,%lf\n", data_origin1, pre_time, cu_time, cu_gflops, cu_bandwidth1, cu_bandwidth2);
fclose(fout);
// int result = verify_new(dY_val, Y_val, new_order, rowA);
free(X_val);
free(Y_val);
free(dY_val);
free(csrColIdxA);
free(csrRowPtrA);
free(csrValA);
free(new_order);
return 0;
}
DASP/src/main_f64.cu Normal file
@@ -0,0 +1,168 @@
#include "dasp_f64.h"
int verify_new(MAT_VAL_TYPE *cusp_val, MAT_VAL_TYPE *cuda_val, int *new_order, int length)
{
for (int i = 0; i < length; i++)
{
int cusp_idx = new_order[i];
if (fabs(cusp_val[cusp_idx] - cuda_val[i]) > 1e-5)
{
printf("error in (%d), cusp(%4.2f), cuda(%4.2f), please check your code!\n", i, cusp_val[cusp_idx], cuda_val[i]);
return -1;
}
}
printf("Y(%d), compute succeed!\n", length);
return 0;
}
__host__
void cusparse_spmv_all(MAT_VAL_TYPE *cu_ValA, MAT_PTR_TYPE *cu_RowPtrA, int *cu_ColIdxA,
MAT_VAL_TYPE *cu_ValX, MAT_VAL_TYPE *cu_ValY, int rowA, int colA, MAT_PTR_TYPE nnzA,
long long int data_origin1, long long int data_origin2, double *cu_time, double *cu_gflops, double *cu_bandwidth1, double *cu_bandwidth2, double *cu_pre)
{
struct timeval t1, t2;
MAT_VAL_TYPE *dA_val, *dX, *dY;
int *dA_cid;
MAT_PTR_TYPE *dA_rpt;
MAT_VAL_TYPE alpha = 1.0, beta = 0.0;
cudaMalloc((void **)&dA_val, sizeof(MAT_VAL_TYPE) * nnzA);
cudaMalloc((void **)&dA_cid, sizeof(int) * nnzA);
cudaMalloc((void **)&dA_rpt, sizeof(MAT_PTR_TYPE) * (rowA + 1));
cudaMalloc((void **)&dX, sizeof(MAT_VAL_TYPE) * colA);
cudaMalloc((void **)&dY, sizeof(MAT_VAL_TYPE) * rowA);
cudaMemcpy(dA_val, cu_ValA, sizeof(MAT_VAL_TYPE) * nnzA, cudaMemcpyHostToDevice);
cudaMemcpy(dA_cid, cu_ColIdxA, sizeof(int) * nnzA, cudaMemcpyHostToDevice);
cudaMemcpy(dA_rpt, cu_RowPtrA, sizeof(MAT_PTR_TYPE) * (rowA + 1), cudaMemcpyHostToDevice);
cudaMemcpy(dX, cu_ValX, sizeof(MAT_VAL_TYPE) * colA, cudaMemcpyHostToDevice);
cudaMemset(dY, 0, sizeof(MAT_VAL_TYPE) * rowA);
cusparseHandle_t handle = NULL;
cusparseSpMatDescr_t matA;
cusparseDnVecDescr_t vecX, vecY;
void* dBuffer = NULL;
size_t bufferSize = 0;
gettimeofday(&t1, NULL);
cusparseCreate(&handle);
cusparseCreateCsr(&matA, rowA, colA, nnzA, dA_rpt, dA_cid, dA_val,
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
cusparseCreateDnVec(&vecX, colA, dX, CUDA_R_64F);
cusparseCreateDnVec(&vecY, rowA, dY, CUDA_R_64F);
cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
&alpha, matA, vecX, &beta, vecY, CUDA_R_64F,
CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize);
cudaMalloc(&dBuffer, bufferSize);
// cudaDeviceSynchronize();
gettimeofday(&t2, NULL);
double cusparse_pre = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0;
// printf("cusparse preprocessing time: %8.4lf ms\n", cusparse_pre);
*cu_pre = cusparse_pre;
for (int i = 0; i < 100; ++i)
{
cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
&alpha, matA, vecX, &beta, vecY, CUDA_R_64F,
CUSPARSE_SPMV_ALG_DEFAULT, dBuffer);
}
cudaDeviceSynchronize();
gettimeofday(&t1, NULL);
for (int i = 0; i < 1000; ++i)
{
cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
&alpha, matA, vecX, &beta, vecY, CUDA_R_64F,
CUSPARSE_SPMV_ALG_DEFAULT, dBuffer);
}
cudaDeviceSynchronize();
gettimeofday(&t2, NULL);
*cu_time = ((t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0) / 1000;
*cu_gflops = (double)((long)nnzA * 2) / (*cu_time * 1e6);
*cu_bandwidth1 = (double)data_origin1 / (*cu_time * 1e6);
*cu_bandwidth2 = (double)data_origin2 / (*cu_time * 1e6);
printf("cusparse:%8.4lf ms, %8.4lf Gflop/s, %9.4lf GB/s, %9.4lf GB/s\n", *cu_time, *cu_gflops, *cu_bandwidth1, *cu_bandwidth2);
cusparseDestroySpMat(matA);
cusparseDestroyDnVec(vecX);
cusparseDestroyDnVec(vecY);
cusparseDestroy(handle);
cudaFree(dBuffer); // release the cusparse workspace buffer
cudaMemcpy(cu_ValY, dY, sizeof(MAT_VAL_TYPE) * rowA, cudaMemcpyDeviceToHost);
cudaFree(dA_val);
cudaFree(dA_cid);
cudaFree(dA_rpt);
cudaFree(dX);
cudaFree(dY);
}
__host__
int main(int argc, char **argv)
{
if (argc < 2)
{
printf("Run the code by './spmv_double matrix.mtx'.\n");
return 0;
}
// struct timeval t1, t2;
int rowA, colA;
MAT_PTR_TYPE nnzA;
int isSymmetricA;
MAT_VAL_TYPE *csrValA;
int *csrColIdxA;
MAT_PTR_TYPE *csrRowPtrA;
char *filename;
filename = argv[1];
// int NUM = atoi(argv[2]);
// int block_longest = atoi(argv[3]);
int NUM = 4;
int block_longest = 256;
double threshold = 0.75;
printf("\n===%s===\n\n", filename);
mmio_allinone(&rowA, &colA, &nnzA, &isSymmetricA, &csrRowPtrA, &csrColIdxA, &csrValA, filename);
MAT_VAL_TYPE *X_val = (MAT_VAL_TYPE *)malloc(sizeof(MAT_VAL_TYPE) * colA);
initVec(X_val, colA);
initVec(csrValA, nnzA);
printf("INIT DONE\n");
MAT_VAL_TYPE *dY_val = (MAT_VAL_TYPE *)malloc(sizeof(MAT_VAL_TYPE) * rowA);
MAT_VAL_TYPE *Y_val = (MAT_VAL_TYPE *)malloc(sizeof(MAT_VAL_TYPE) * rowA);
int *new_order = (int *)malloc(sizeof(int) * rowA);
// int warmup = 3, pre_num = 10;
double cu_time = 0, cu_gflops = 0, cu_bandwidth1 = 0, cu_bandwidth2 = 0, cu_pre = 0;
long long int data_origin1 = (nnzA + colA + rowA) * sizeof(MAT_VAL_TYPE) + nnzA * sizeof(int) + (rowA + 1) * sizeof(MAT_PTR_TYPE);
long long int data_origin2 = (nnzA + nnzA + rowA) * sizeof(MAT_VAL_TYPE) + nnzA * sizeof(int) + (rowA + 1) * sizeof(MAT_PTR_TYPE);
cusparse_spmv_all(csrValA, csrRowPtrA, csrColIdxA, X_val, dY_val, rowA, colA, nnzA, data_origin1, data_origin2, &cu_time, &cu_gflops, &cu_bandwidth1, &cu_bandwidth2, &cu_pre);
// double dasp_pre = 0;
spmv_all(filename, csrValA, csrRowPtrA, csrColIdxA, X_val, Y_val, new_order, rowA, colA, nnzA, NUM, threshold, block_longest);
FILE* fout;
fout = fopen("data/spmv_f64_record.csv", "a");
fprintf(fout, "%lld,%lf,%lf,%lf,%lf\n", data_origin1, cu_time, cu_gflops, cu_bandwidth1, cu_bandwidth2);
fclose(fout);
/* verify the result with cusparse */
// int result = verify_new(dY_val, Y_val, new_order, rowA);
free(X_val);
free(Y_val);
free(dY_val);
free(csrColIdxA);
free(csrRowPtrA);
free(csrValA);
free(new_order);
return 0;
}
DASP/src/mmio.h Normal file
File diff suppressed because it is too large
DASP/src/mmio_highlevel.h Normal file
@@ -0,0 +1,778 @@
#ifndef _MMIO_HIGHLEVEL_
#define _MMIO_HIGHLEVEL_
#include "mmio.h"
#include "common.h"
void exclusive_scan(MAT_PTR_TYPE *input, int length)
{
if (length == 0 || length == 1)
return;
MAT_PTR_TYPE old_val, new_val;
old_val = input[0];
input[0] = 0;
for (int i = 1; i < length; i++)
{
new_val = input[i];
input[i] = old_val + input[i - 1];
old_val = new_val;
}
}
// read matrix information from mtx file
int mmio_info(int *m, int *n, int *nnz, int *isSymmetric, char *filename)
{
int m_tmp, n_tmp, nnz_tmp;
int ret_code;
MM_typecode matcode;
FILE *f;
int nnz_mtx_report;
int isInteger = 0, isReal = 0, isPattern = 0, isSymmetric_tmp = 0, isComplex = 0;
// load matrix
if ((f = fopen(filename, "r")) == NULL)
return -1;
if (mm_read_banner(f, &matcode) != 0)
{
printf("Could not process Matrix Market banner.\n");
return -2;
}
if ( mm_is_pattern( matcode ) ) { isPattern = 1; /*printf("type = Pattern\n");*/ }
if ( mm_is_real ( matcode) ) { isReal = 1; /*printf("type = real\n");*/ }
if ( mm_is_complex( matcode ) ) { isComplex = 1; /*printf("type = complex\n");*/ }
if ( mm_is_integer ( matcode ) ) { isInteger = 1; /*printf("type = integer\n");*/ }
/* find out size of sparse matrix .... */
ret_code = mm_read_mtx_crd_size(f, &m_tmp, &n_tmp, &nnz_mtx_report);
if (ret_code != 0)
return -4;
if ( mm_is_symmetric( matcode ) || mm_is_hermitian( matcode ) )
{
isSymmetric_tmp = 1;
//printf("input matrix is symmetric = true\n");
}
else
{
//printf("input matrix is symmetric = false\n");
}
int *csrRowPtr_counter = (int *)malloc((m_tmp+1) * sizeof(int));
memset(csrRowPtr_counter, 0, (m_tmp+1) * sizeof(int));
int *csrRowIdx_tmp = (int *)malloc(nnz_mtx_report * sizeof(int));
int *csrColIdx_tmp = (int *)malloc(nnz_mtx_report * sizeof(int));
MAT_VAL_TYPE *csrVal_tmp = (MAT_VAL_TYPE *)malloc(nnz_mtx_report * sizeof(MAT_VAL_TYPE));
/* NOTE: when reading in doubles, ANSI C requires the use of the "l" */
/* specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
/* (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15) */
for (int i = 0; i < nnz_mtx_report; i++)
{
int idxi, idxj;
double fval, fval_im;
int ival;
int returnvalue;
if (isReal)
{
returnvalue = fscanf(f, "%d %d %lg\n", &idxi, &idxj, &fval);
}
else if (isComplex)
{
returnvalue = fscanf(f, "%d %d %lg %lg\n", &idxi, &idxj, &fval, &fval_im);
}
else if (isInteger)
{
returnvalue = fscanf(f, "%d %d %d\n", &idxi, &idxj, &ival);
fval = ival;
}
else if (isPattern)
{
returnvalue = fscanf(f, "%d %d\n", &idxi, &idxj);
fval = 1.0;
}
// adjust from 1-based to 0-based
idxi--;
idxj--;
csrRowPtr_counter[idxi]++;
csrRowIdx_tmp[i] = idxi;
csrColIdx_tmp[i] = idxj;
csrVal_tmp[i] = fval;
}
if (f != stdin)
fclose(f);
if (isSymmetric_tmp)
{
for (int i = 0; i < nnz_mtx_report; i++)
{
if (csrRowIdx_tmp[i] != csrColIdx_tmp[i])
csrRowPtr_counter[csrColIdx_tmp[i]]++;
}
}
// exclusive scan for csrRowPtr_counter
int old_val, new_val;
old_val = csrRowPtr_counter[0];
csrRowPtr_counter[0] = 0;
for (int i = 1; i <= m_tmp; i++)
{
new_val = csrRowPtr_counter[i];
csrRowPtr_counter[i] = old_val + csrRowPtr_counter[i-1];
old_val = new_val;
}
nnz_tmp = csrRowPtr_counter[m_tmp];
*m = m_tmp;
*n = n_tmp;
*nnz = nnz_tmp;
*isSymmetric = isSymmetric_tmp;
// free tmp space
free(csrColIdx_tmp);
free(csrVal_tmp);
free(csrRowIdx_tmp);
free(csrRowPtr_counter);
return 0;
}
// read matrix data from an mtx file (the CSR output arrays must be pre-allocated)
int mmio_data(int *csrRowPtr, int *csrColIdx, MAT_VAL_TYPE *csrVal, char *filename)
{
int m_tmp, n_tmp, nnz_tmp;
int ret_code;
MM_typecode matcode;
FILE *f;
int nnz_mtx_report;
int isInteger = 0, isReal = 0, isPattern = 0, isSymmetric_tmp = 0, isComplex = 0;
// load matrix
if ((f = fopen(filename, "r")) == NULL)
return -1;
if (mm_read_banner(f, &matcode) != 0)
{
printf("Could not process Matrix Market banner.\n");
return -2;
}
if ( mm_is_pattern( matcode ) ) { isPattern = 1; /*printf("type = Pattern\n");*/ }
if ( mm_is_real ( matcode) ) { isReal = 1; /*printf("type = real\n");*/ }
    if ( mm_is_complex( matcode ) ) { isComplex = 1; /*printf("type = complex\n");*/ }
if ( mm_is_integer ( matcode ) ) { isInteger = 1; /*printf("type = integer\n");*/ }
/* find out size of sparse matrix .... */
ret_code = mm_read_mtx_crd_size(f, &m_tmp, &n_tmp, &nnz_mtx_report);
if (ret_code != 0)
return -4;
if ( mm_is_symmetric( matcode ) || mm_is_hermitian( matcode ) )
{
isSymmetric_tmp = 1;
//printf("input matrix is symmetric = true\n");
}
else
{
//printf("input matrix is symmetric = false\n");
}
int *csrRowPtr_counter = (int *)malloc((m_tmp+1) * sizeof(int));
memset(csrRowPtr_counter, 0, (m_tmp+1) * sizeof(int));
int *csrRowIdx_tmp = (int *)malloc(nnz_mtx_report * sizeof(int));
int *csrColIdx_tmp = (int *)malloc(nnz_mtx_report * sizeof(int));
MAT_VAL_TYPE *csrVal_tmp = (MAT_VAL_TYPE *)malloc(nnz_mtx_report * sizeof(MAT_VAL_TYPE));
/* NOTE: when reading in doubles, ANSI C requires the use of the "l" */
/* specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
/* (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15) */
for (int i = 0; i < nnz_mtx_report; i++)
{
int idxi, idxj;
double fval, fval_im;
int ival;
int returnvalue;
if (isReal)
{
returnvalue = fscanf(f, "%d %d %lg\n", &idxi, &idxj, &fval);
}
else if (isComplex)
{
returnvalue = fscanf(f, "%d %d %lg %lg\n", &idxi, &idxj, &fval, &fval_im);
}
else if (isInteger)
{
returnvalue = fscanf(f, "%d %d %d\n", &idxi, &idxj, &ival);
fval = ival;
}
else if (isPattern)
{
returnvalue = fscanf(f, "%d %d\n", &idxi, &idxj);
fval = 1.0;
}
// adjust from 1-based to 0-based
idxi--;
idxj--;
csrRowPtr_counter[idxi]++;
csrRowIdx_tmp[i] = idxi;
csrColIdx_tmp[i] = idxj;
csrVal_tmp[i] = fval;
}
if (f != stdin)
fclose(f);
if (isSymmetric_tmp)
{
for (int i = 0; i < nnz_mtx_report; i++)
{
if (csrRowIdx_tmp[i] != csrColIdx_tmp[i])
csrRowPtr_counter[csrColIdx_tmp[i]]++;
}
}
// exclusive scan for csrRowPtr_counter
int old_val, new_val;
old_val = csrRowPtr_counter[0];
csrRowPtr_counter[0] = 0;
for (int i = 1; i <= m_tmp; i++)
{
new_val = csrRowPtr_counter[i];
csrRowPtr_counter[i] = old_val + csrRowPtr_counter[i-1];
old_val = new_val;
}
nnz_tmp = csrRowPtr_counter[m_tmp];
memcpy(csrRowPtr, csrRowPtr_counter, (m_tmp+1) * sizeof(int));
memset(csrRowPtr_counter, 0, (m_tmp+1) * sizeof(int));
if (isSymmetric_tmp)
{
for (int i = 0; i < nnz_mtx_report; i++)
{
if (csrRowIdx_tmp[i] != csrColIdx_tmp[i])
{
int offset = csrRowPtr[csrRowIdx_tmp[i]] + csrRowPtr_counter[csrRowIdx_tmp[i]];
csrColIdx[offset] = csrColIdx_tmp[i];
csrVal[offset] = csrVal_tmp[i];
csrRowPtr_counter[csrRowIdx_tmp[i]]++;
offset = csrRowPtr[csrColIdx_tmp[i]] + csrRowPtr_counter[csrColIdx_tmp[i]];
csrColIdx[offset] = csrRowIdx_tmp[i];
csrVal[offset] = csrVal_tmp[i];
csrRowPtr_counter[csrColIdx_tmp[i]]++;
}
else
{
int offset = csrRowPtr[csrRowIdx_tmp[i]] + csrRowPtr_counter[csrRowIdx_tmp[i]];
csrColIdx[offset] = csrColIdx_tmp[i];
csrVal[offset] = csrVal_tmp[i];
csrRowPtr_counter[csrRowIdx_tmp[i]]++;
}
}
}
else
{
for (int i = 0; i < nnz_mtx_report; i++)
{
int offset = csrRowPtr[csrRowIdx_tmp[i]] + csrRowPtr_counter[csrRowIdx_tmp[i]];
csrColIdx[offset] = csrColIdx_tmp[i];
csrVal[offset] = csrVal_tmp[i];
csrRowPtr_counter[csrRowIdx_tmp[i]]++;
}
}
// free tmp space
free(csrColIdx_tmp);
free(csrVal_tmp);
free(csrRowIdx_tmp);
free(csrRowPtr_counter);
return 0;
}
// read matrix information and data (all in one) from an mtx file
int mmio_allinone(int *m, int *n, MAT_PTR_TYPE *nnz, int *isSymmetric,
MAT_PTR_TYPE **csrRowPtr, int **csrColIdx, MAT_VAL_TYPE **csrVal,
char *filename)
{
int m_tmp, n_tmp;
MAT_PTR_TYPE nnz_tmp;
int ret_code;
MM_typecode matcode;
FILE *f;
MAT_PTR_TYPE nnz_mtx_report;
int isInteger = 0, isReal = 0, isPattern = 0, isSymmetric_tmp = 0, isComplex = 0;
// load matrix
if ((f = fopen(filename, "r")) == NULL)
return -1;
if (mm_read_banner(f, &matcode) != 0)
{
printf("Could not process Matrix Market banner.\n");
return -2;
}
if ( mm_is_pattern( matcode ) ) { isPattern = 1; /*printf("type = Pattern\n");*/ }
if ( mm_is_real ( matcode) ) { isReal = 1; /*printf("type = real\n");*/ }
    if ( mm_is_complex( matcode ) ) { isComplex = 1; /*printf("type = complex\n");*/ }
if ( mm_is_integer ( matcode ) ) { isInteger = 1; /*printf("type = integer\n");*/ }
/* find out size of sparse matrix .... */
ret_code = mm_read_mtx_crd_size(f, &m_tmp, &n_tmp, &nnz_mtx_report);
if (ret_code != 0)
return -4;
if ( mm_is_symmetric( matcode ) || mm_is_hermitian( matcode ) )
{
isSymmetric_tmp = 1;
//printf("input matrix is symmetric = true\n");
}
else
{
//printf("input matrix is symmetric = false\n");
}
MAT_PTR_TYPE *csrRowPtr_counter = (MAT_PTR_TYPE *)malloc((m_tmp+1) * sizeof(MAT_PTR_TYPE));
memset(csrRowPtr_counter, 0, (m_tmp+1) * sizeof(MAT_PTR_TYPE));
int *csrRowIdx_tmp = (int *)malloc(nnz_mtx_report * sizeof(int));
int *csrColIdx_tmp = (int *)malloc(nnz_mtx_report * sizeof(int));
MAT_VAL_TYPE *csrVal_tmp = (MAT_VAL_TYPE *)malloc(nnz_mtx_report * sizeof(MAT_VAL_TYPE));
/* NOTE: when reading in doubles, ANSI C requires the use of the "l" */
/* specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
/* (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15) */
for (MAT_PTR_TYPE i = 0; i < nnz_mtx_report; i++)
{
int idxi, idxj;
double fval, fval_im;
int ival;
int returnvalue;
if (isReal)
{
returnvalue = fscanf(f, "%d %d %lg\n", &idxi, &idxj, &fval);
}
else if (isComplex)
{
returnvalue = fscanf(f, "%d %d %lg %lg\n", &idxi, &idxj, &fval, &fval_im);
}
else if (isInteger)
{
returnvalue = fscanf(f, "%d %d %d\n", &idxi, &idxj, &ival);
fval = ival;
}
else if (isPattern)
{
returnvalue = fscanf(f, "%d %d\n", &idxi, &idxj);
fval = 1.0;
}
// adjust from 1-based to 0-based
idxi--;
idxj--;
csrRowPtr_counter[idxi]++;
csrRowIdx_tmp[i] = idxi;
csrColIdx_tmp[i] = idxj;
csrVal_tmp[i] = fval;
}
if (f != stdin)
fclose(f);
if (isSymmetric_tmp)
{
for (MAT_PTR_TYPE i = 0; i < nnz_mtx_report; i++)
{
if (csrRowIdx_tmp[i] != csrColIdx_tmp[i])
csrRowPtr_counter[csrColIdx_tmp[i]]++;
}
}
// exclusive scan for csrRowPtr_counter
exclusive_scan(csrRowPtr_counter, m_tmp+1);
MAT_PTR_TYPE *csrRowPtr_alias = (MAT_PTR_TYPE *)malloc((m_tmp+1) * sizeof(MAT_PTR_TYPE));
nnz_tmp = csrRowPtr_counter[m_tmp];
int *csrColIdx_alias = (int *)malloc(nnz_tmp * sizeof(int));
MAT_VAL_TYPE *csrVal_alias = (MAT_VAL_TYPE *)malloc(nnz_tmp * sizeof(MAT_VAL_TYPE));
memcpy(csrRowPtr_alias, csrRowPtr_counter, (m_tmp+1) * sizeof(MAT_PTR_TYPE));
memset(csrRowPtr_counter, 0, (m_tmp+1) * sizeof(MAT_PTR_TYPE));
if (isSymmetric_tmp)
{
for (MAT_PTR_TYPE i = 0; i < nnz_mtx_report; i++)
{
if (csrRowIdx_tmp[i] != csrColIdx_tmp[i])
{
MAT_PTR_TYPE offset = csrRowPtr_alias[csrRowIdx_tmp[i]] + csrRowPtr_counter[csrRowIdx_tmp[i]];
csrColIdx_alias[offset] = csrColIdx_tmp[i];
csrVal_alias[offset] = csrVal_tmp[i];
csrRowPtr_counter[csrRowIdx_tmp[i]]++;
offset = csrRowPtr_alias[csrColIdx_tmp[i]] + csrRowPtr_counter[csrColIdx_tmp[i]];
csrColIdx_alias[offset] = csrRowIdx_tmp[i];
csrVal_alias[offset] = csrVal_tmp[i];
csrRowPtr_counter[csrColIdx_tmp[i]]++;
}
else
{
MAT_PTR_TYPE offset = csrRowPtr_alias[csrRowIdx_tmp[i]] + csrRowPtr_counter[csrRowIdx_tmp[i]];
csrColIdx_alias[offset] = csrColIdx_tmp[i];
csrVal_alias[offset] = csrVal_tmp[i];
csrRowPtr_counter[csrRowIdx_tmp[i]]++;
}
}
}
else
{
for (MAT_PTR_TYPE i = 0; i < nnz_mtx_report; i++)
{
MAT_PTR_TYPE offset = csrRowPtr_alias[csrRowIdx_tmp[i]] + csrRowPtr_counter[csrRowIdx_tmp[i]];
csrColIdx_alias[offset] = csrColIdx_tmp[i];
csrVal_alias[offset] = csrVal_tmp[i];
csrRowPtr_counter[csrRowIdx_tmp[i]]++;
}
}
*m = m_tmp;
*n = n_tmp;
*nnz = nnz_tmp;
*isSymmetric = isSymmetric_tmp;
*csrRowPtr = csrRowPtr_alias;
*csrColIdx = csrColIdx_alias;
*csrVal = csrVal_alias;
// free tmp space
free(csrColIdx_tmp);
free(csrVal_tmp);
free(csrRowIdx_tmp);
free(csrRowPtr_counter);
return 0;
}
#endif

212
DASP/src/utils.h Normal file
View File

@@ -0,0 +1,212 @@
#include "common.h"
int BinarySearch(int *arr, int len, int target) {
int low = 0;
    int high = len - 1;
int mid = 0;
while (low <= high) {
mid = (low + high) / 2;
if (target < arr[mid]) high = mid - 1;
else if (target > arr[mid]) low = mid + 1;
else return mid;
}
return -1;
}
void swap_key(int *a, int *b)
{
int tmp = *a;
*a = *b;
*b = tmp;
}
// quick sort key (child function)
int partition_key(int *key, int length, int pivot_index)
{
int i = 0;
int small_length = pivot_index;
int pivot = key[pivot_index];
swap_key(&key[pivot_index], &key[pivot_index + (length - 1)]);
for (; i < length; i++)
{
if (key[pivot_index + i] < pivot)
{
swap_key(&key[pivot_index + i], &key[small_length]);
small_length++;
}
}
swap_key(&key[pivot_index + length - 1], &key[small_length]);
return small_length;
}
// quick sort key (child function)
int partition_key_idx(int *key, int *len, int length, int pivot_index)
{
int i = 0;
int small_length = pivot_index;
int pivot = key[pivot_index];
swap_key(&key[pivot_index], &key[pivot_index + (length - 1)]);
swap_key(&len[pivot_index], &len[pivot_index + (length - 1)]);
for (; i < length; i++)
{
if (key[pivot_index + i] < pivot)
{
swap_key(&key[pivot_index + i], &key[small_length]);
swap_key(&len[pivot_index + i], &len[small_length]);
small_length++;
}
}
swap_key(&key[pivot_index + length - 1], &key[small_length]);
swap_key(&len[pivot_index + length - 1], &len[small_length]);
return small_length;
}
// quick sort key (main function)
void quick_sort_key(int *key, int length)
{
if (length == 0 || length == 1)
return;
int small_length = partition_key(key, length, 0);
quick_sort_key(key, small_length);
quick_sort_key(&key[small_length + 1], length - small_length - 1);
}
void quick_sort_key_idx(int *key, int *len, int length)
{
if (length == 0 || length == 1)
return;
int small_length = partition_key_idx(key, len, length, 0);
quick_sort_key_idx(key, len, small_length);
quick_sort_key_idx(&key[small_length + 1], &len[small_length + 1], length - small_length - 1);
}
void initVec(MAT_VAL_TYPE *vec, int length)
{
for (int i = 0; i < length; ++ i)
{
// vec[i] = rand() % 20 * 0.1;
vec[i] = 1;
}
}
#ifdef f64
__device__ __forceinline__ void mma_m8n8k4(MAT_VAL_TYPE *acc, MAT_VAL_TYPE &frag_a, MAT_VAL_TYPE &frag_b)
{
asm volatile(
"mma.sync.aligned.m8n8k4.row.col.f64.f64.f64.f64"
" { %0, %1 }, "
" { %2 }, "
" { %3 }, "
" { %0, %1 };"
: "+d"(acc[0]), "+d"(acc[1]):
"d"(frag_a), "d"(frag_b)
);
}
#endif
int get_max(int *arr, int len)
{
int max = arr[0];
for (int i = 1; i < len; i ++)
{
if (arr[i] > max) max = arr[i];
}
return max;
}
void count_sort(int *arr, int *idx, int len, int exp)
{
int *temp_arr = (int *)malloc(sizeof(int) * len);
int *temp_idx = (int *)malloc(sizeof(int) * len);
int buckets[10] = {0};
for (int i = 0; i < len; i ++)
{
buckets[(arr[i] / exp) % 10] ++;
}
for (int i = 1; i < 10; i ++)
{
buckets[i] += buckets[i - 1];
}
for (int i = 0; i < len; i ++)
{
int offset = len - (buckets[(arr[i] / exp) % 10] - 1) - 1;
temp_arr[offset] = arr[i];
temp_idx[offset] = idx[i];
buckets[(arr[i] / exp) % 10] --;
}
for (int i = 0; i < len; i ++)
{
arr[i] = temp_arr[i];
idx[i] = temp_idx[i];
}
free(temp_arr);
free(temp_idx);
}
void count_sort_asce(int *arr, int *idx, int len, int exp)
{
int *temp_arr = (int *)malloc(sizeof(int) * len);
int *temp_idx = (int *)malloc(sizeof(int) * len);
int buckets[10] = {0};
for (int i = 0; i < len; i ++)
{
buckets[(arr[i] / exp) % 10] ++;
}
for (int i = 1; i < 10; i ++)
{
buckets[i] += buckets[i - 1];
}
    for (int i = len - 1; i >= 0; i --)
{
int offset = buckets[(arr[i] / exp) % 10] - 1;
temp_arr[offset] = arr[i];
temp_idx[offset] = idx[i];
buckets[(arr[i] / exp) % 10] --;
}
for (int i = 0; i < len; i ++)
{
arr[i] = temp_arr[i];
idx[i] = temp_idx[i];
}
free(temp_arr);
free(temp_idx);
}
void radix_sort(int *arr, int *idx, int len)
{
int max = get_max(arr, len);
for (int exp = 1; max / exp > 0; exp *= 10)
{
count_sort(arr, idx, len, exp);
}
}
void radix_sort_asce(int *arr, int *idx, int len)
{
int max = get_max(arr, len);
for (int exp = 1; max / exp > 0; exp *= 10)
{
count_sort_asce(arr, idx, len, exp);
}
}

1362146
DASP/test/cop20k_A.mtx Normal file

File diff suppressed because it is too large

3
DASP/test/run_double.sh Normal file

@@ -0,0 +1,3 @@
#!/bin/bash
cd ..
./spmv_double test/cop20k_A.mtx

3
DASP/test/run_half.sh Normal file

@@ -0,0 +1,3 @@
#!/bin/bash
cd ..
./spmv_half test/cop20k_A.mtx

45
LICENSE Normal file

@@ -0,0 +1,45 @@
Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
Copyright (c) 2013-2019, hpcg-benchmark
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of hpcg nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

54
Makefile Normal file

@@ -0,0 +1,54 @@
# -*- Makefile -*-
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# By default, "arch" is unknown; it should be specified on the command line.
arch = UNKNOWN
setup_file = setup/Make.$(arch)
include $(setup_file)
bin_name='bin/xhpcg'
ifeq ($(USE_CUDA), 0)
ifneq ($(USE_GRACE), 0)
bin_name='bin/xhpcg-cpu'
endif
endif
HPCG_DEPS = src/CG.o src/CG_ref.o src/TestCG.o src/ComputeResidual.o \
src/ExchangeHalo.o src/GenerateGeometry.o src/GenerateProblem.o \
src/GenerateProblem_ref.o src/CheckProblem.o \
src/OptimizeProblem.o src/ReadHpcgDat.o src/ReportResults.o \
src/SetupHalo.o src/SetupHalo_ref.o src/TestSymmetry.o src/TestNorms.o src/WriteProblem.o \
src/YAML_Doc.o src/YAML_Element.o src/ComputeDotProduct.o \
src/ComputeDotProduct_ref.o src/finalize.o src/init.o src/mytimer.o src/ComputeSPMV.o \
src/ComputeSPMV_ref.o src/ComputeSYMGS.o src/ComputeSYMGS_ref.o src/ComputeWAXPBY.o src/ComputeWAXPBY_ref.o \
src/ComputeMG_ref.o src/ComputeMG.o src/ComputeProlongation_ref.o src/ComputeRestriction_ref.o src/GenerateCoarseProblem.o \
src/ComputeOptimalShapeXYZ.o src/MixedBaseCounter.o src/CheckAspectRatio.o src/OutputFile.o \
src/ComputeProlongation.o src/ComputeRestriction.o
$(bin_name): src/main.o $(HPCG_DEPS)
$(LINKER) $(LINKFLAGS) src/main.o $(HPCG_DEPS) -o $(bin_name) $(HPCG_LIBS)
install:
cp build/bin/xhpcg* bin/
clean:
rm -f $(HPCG_DEPS) $(bin_name) src/main.o
.PHONY: clean

221
Makefile.ext Normal file

@@ -0,0 +1,221 @@
# -*- Makefile -*-
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
arch = UNKNOWN
setup_file = setup/Make.$(arch)
include $(setup_file)
bin_name='bin/xhpcg'
ifeq ($(USE_CUDA), 0)
ifneq ($(USE_GRACE), 0)
bin_name='bin/xhpcg-cpu'
endif
endif
HPCG_DEPS = src/CG.o \
src/CG_ref.o \
src/TestCG.o \
src/ComputeResidual.o \
src/ExchangeHalo.o \
src/GenerateGeometry.o \
src/GenerateProblem.o \
src/GenerateProblem_ref.o \
src/CheckProblem.o \
src/MixedBaseCounter.o \
src/OptimizeProblem.o \
src/ReadHpcgDat.o \
src/ReportResults.o \
src/SetupHalo.o \
src/SetupHalo_ref.o \
src/TestSymmetry.o \
src/TestNorms.o \
src/WriteProblem.o \
src/YAML_Doc.o \
src/YAML_Element.o \
src/ComputeDotProduct.o \
src/ComputeDotProduct_ref.o \
src/mytimer.o \
src/ComputeOptimalShapeXYZ.o \
src/ComputeSPMV.o \
src/ComputeSPMV_ref.o \
src/ComputeSYMGS.o \
src/ComputeSYMGS_ref.o \
src/ComputeWAXPBY.o \
src/ComputeWAXPBY_ref.o \
src/ComputeMG_ref.o \
src/ComputeMG.o \
src/ComputeProlongation_ref.o \
src/ComputeRestriction_ref.o \
src/ComputeProlongation.o \
src/ComputeRestriction.o \
src/CheckAspectRatio.o \
src/OutputFile.o \
src/GenerateCoarseProblem.o \
src/init.o \
src/finalize.o \
src/CudaKernels.o \
src/CpuKernels.o
# These header files are included in many source files, so we recompile every file if one or more of these headers is modified.
PRIMARY_HEADERS = HPCG_SRC_PATH/src/Geometry.hpp HPCG_SRC_PATH/src/SparseMatrix.hpp HPCG_SRC_PATH/src/Vector.hpp HPCG_SRC_PATH/src/CGData.hpp \
HPCG_SRC_PATH/src/MGData.hpp HPCG_SRC_PATH/src/hpcg.hpp
all: $(bin_name)
$(bin_name): src/main.o $(HPCG_DEPS)
$(LINKER) $(LINKFLAGS) src/main.o $(HPCG_DEPS) $(HPCG_LIBS) -o $(bin_name)
install:
cp $(bin_name) ../bin/
clean:
rm -f src/*.o $(bin_name)
.PHONY: all clean
src/main.o: HPCG_SRC_PATH/src/main.cpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/CG.o: HPCG_SRC_PATH/src/CG.cpp HPCG_SRC_PATH/src/CG.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/CG_ref.o: HPCG_SRC_PATH/src/CG_ref.cpp HPCG_SRC_PATH/src/CG_ref.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/TestCG.o: HPCG_SRC_PATH/src/TestCG.cpp HPCG_SRC_PATH/src/TestCG.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/ComputeResidual.o: HPCG_SRC_PATH/src/ComputeResidual.cpp HPCG_SRC_PATH/src/ComputeResidual.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/ExchangeHalo.o: HPCG_SRC_PATH/src/ExchangeHalo.cpp HPCG_SRC_PATH/src/ExchangeHalo.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/GenerateGeometry.o: HPCG_SRC_PATH/src/GenerateGeometry.cpp HPCG_SRC_PATH/src/GenerateGeometry.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/GenerateProblem.o: HPCG_SRC_PATH/src/GenerateProblem.cpp HPCG_SRC_PATH/src/GenerateProblem.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/GenerateProblem_ref.o: HPCG_SRC_PATH/src/GenerateProblem_ref.cpp HPCG_SRC_PATH/src/GenerateProblem_ref.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/CheckProblem.o: HPCG_SRC_PATH/src/CheckProblem.cpp HPCG_SRC_PATH/src/CheckProblem.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/MixedBaseCounter.o: HPCG_SRC_PATH/src/MixedBaseCounter.cpp HPCG_SRC_PATH/src/MixedBaseCounter.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/OptimizeProblem.o: HPCG_SRC_PATH/src/OptimizeProblem.cpp HPCG_SRC_PATH/src/OptimizeProblem.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/ReadHpcgDat.o: HPCG_SRC_PATH/src/ReadHpcgDat.cpp HPCG_SRC_PATH/src/ReadHpcgDat.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/ReportResults.o: HPCG_SRC_PATH/src/ReportResults.cpp HPCG_SRC_PATH/src/ReportResults.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/SetupHalo.o: HPCG_SRC_PATH/src/SetupHalo.cpp HPCG_SRC_PATH/src/SetupHalo.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/SetupHalo_ref.o: HPCG_SRC_PATH/src/SetupHalo_ref.cpp HPCG_SRC_PATH/src/SetupHalo_ref.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/TestSymmetry.o: HPCG_SRC_PATH/src/TestSymmetry.cpp HPCG_SRC_PATH/src/TestSymmetry.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/TestNorms.o: HPCG_SRC_PATH/src/TestNorms.cpp HPCG_SRC_PATH/src/TestNorms.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/WriteProblem.o: HPCG_SRC_PATH/src/WriteProblem.cpp HPCG_SRC_PATH/src/WriteProblem.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/YAML_Doc.o: HPCG_SRC_PATH/src/YAML_Doc.cpp HPCG_SRC_PATH/src/YAML_Doc.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/YAML_Element.o: HPCG_SRC_PATH/src/YAML_Element.cpp HPCG_SRC_PATH/src/YAML_Element.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/ComputeDotProduct.o: HPCG_SRC_PATH/src/ComputeDotProduct.cpp HPCG_SRC_PATH/src/ComputeDotProduct.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/ComputeDotProduct_ref.o: HPCG_SRC_PATH/src/ComputeDotProduct_ref.cpp HPCG_SRC_PATH/src/ComputeDotProduct_ref.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/finalize.o: HPCG_SRC_PATH/src/finalize.cpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/init.o: HPCG_SRC_PATH/src/init.cpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/mytimer.o: HPCG_SRC_PATH/src/mytimer.cpp HPCG_SRC_PATH/src/mytimer.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/ComputeOptimalShapeXYZ.o: HPCG_SRC_PATH/src/ComputeOptimalShapeXYZ.cpp HPCG_SRC_PATH/src/ComputeOptimalShapeXYZ.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/ComputeSPMV.o: HPCG_SRC_PATH/src/ComputeSPMV.cpp HPCG_SRC_PATH/src/ComputeSPMV.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/ComputeSPMV_ref.o: HPCG_SRC_PATH/src/ComputeSPMV_ref.cpp HPCG_SRC_PATH/src/ComputeSPMV_ref.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/ComputeSYMGS.o: HPCG_SRC_PATH/src/ComputeSYMGS.cpp HPCG_SRC_PATH/src/ComputeSYMGS.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/ComputeSYMGS_ref.o: HPCG_SRC_PATH/src/ComputeSYMGS_ref.cpp HPCG_SRC_PATH/src/ComputeSYMGS_ref.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/ComputeWAXPBY.o: HPCG_SRC_PATH/src/ComputeWAXPBY.cpp HPCG_SRC_PATH/src/ComputeWAXPBY.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/ComputeWAXPBY_ref.o: HPCG_SRC_PATH/src/ComputeWAXPBY_ref.cpp HPCG_SRC_PATH/src/ComputeWAXPBY_ref.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/ComputeMG_ref.o: HPCG_SRC_PATH/src/ComputeMG_ref.cpp HPCG_SRC_PATH/src/ComputeMG_ref.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/ComputeMG.o: HPCG_SRC_PATH/src/ComputeMG.cpp HPCG_SRC_PATH/src/ComputeMG.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/ComputeProlongation_ref.o: HPCG_SRC_PATH/src/ComputeProlongation_ref.cpp HPCG_SRC_PATH/src/ComputeProlongation_ref.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/ComputeRestriction_ref.o: HPCG_SRC_PATH/src/ComputeRestriction_ref.cpp HPCG_SRC_PATH/src/ComputeRestriction_ref.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/ComputeProlongation.o: HPCG_SRC_PATH/src/ComputeProlongation.cpp HPCG_SRC_PATH/src/ComputeProlongation.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/ComputeRestriction.o: HPCG_SRC_PATH/src/ComputeRestriction.cpp HPCG_SRC_PATH/src/ComputeRestriction.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/GenerateCoarseProblem.o: HPCG_SRC_PATH/src/GenerateCoarseProblem.cpp HPCG_SRC_PATH/src/GenerateCoarseProblem.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/CheckAspectRatio.o: HPCG_SRC_PATH/src/CheckAspectRatio.cpp HPCG_SRC_PATH/src/CheckAspectRatio.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/OutputFile.o: HPCG_SRC_PATH/src/OutputFile.cpp HPCG_SRC_PATH/src/OutputFile.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/CpuKernels.o: HPCG_SRC_PATH/src/CpuKernels.cpp HPCG_SRC_PATH/src/CpuKernels.hpp $(PRIMARY_HEADERS)
$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
src/CudaKernels.o: HPCG_SRC_PATH/src/CudaKernels.cu
nvcc -c -O3 $(CUDA_ARCH) $(HPCG_DEFS) -IHPCG_SRC_PATH/src -I$(CUDA_HOME)/include -I$(CUBLASROOT)/include -I$(MPdir)/include $< -o $@

90
README.md Normal file

@@ -0,0 +1,90 @@
# NVIDIA High Performance Conjugate Gradient Benchmark (HPCG)
NVIDIA HPCG is based on the [HPCG](https://github.com/hpcg-benchmark/hpcg) benchmark and optimized for performance on NVIDIA accelerated HPC systems.
HPCG is a software package that performs a fixed number of multigrid-preconditioned (using a symmetric Gauss-Seidel smoother) conjugate gradient (PCG) iterations using double-precision (64-bit) floating-point values.
## Main Features
* The NVIDIA HPCG benchmark exploits NVIDIA high-performance math libraries: [cuSPARSE](https://docs.nvidia.com/cuda/cusparse/) and [NVPL Sparse](https://docs.nvidia.com/nvpl/_static/sparse/index.html) to achieve the highest possible performance for Sparse Matrix-vector multiplication (SpMV) and Sparse Matrix triangular solvers (SpSV) on NVIDIA GPUs and Grace CPUs.
* The NVIDIA HPCG benchmark supports highly configurable command line parameters to decide:
* Problem sizes for the GPU and Grace CPU
* 3D rank grid shape
* Execution modes: CPU-only, GPU-only and heterogeneous
* Point-to-point communication: MPI_Host (Send/Recv), MPI_Host_Alltoallv, MPI_CUDA_Aware, MPI_CUDA_Aware_Alltoallv, and NCCL
* NUMA-related configurations
See `bin/RUNNING-x86` and `bin/RUNNING-aarch64` for detailed description.
* The supported sparse storage format in the NVIDIA HPCG benchmark is the standard [sliced-ELLPACK format (SELL)](https://docs.nvidia.com/cuda/cusparse/#sliced-ellpack-sell).
## Supported Platforms
* The NVIDIA HPCG benchmark supports GPU-only execution on x86 and NVIDIA Grace CPU systems with NVIDIA Ampere GPU architecture (sm80) and NVIDIA Hopper GPU architecture (sm90), CPU only execution for NVIDIA Grace CPUs, and heterogeneous GPU-Grace execution for NVIDIA Grace Hopper superchips.
* NVIDIA HPCG only supports Linux operating systems.
## Prerequisite
* Git
* MPI, OpenMPI 4.1+ and MPICH 4.0+
* CUDA Toolkit 12.3+, for NVIDIA GPU execution
* cuSPARSE 12.3+, for NVIDIA GPU execution
* cuBLAS 12.2+, for NVIDIA GPU execution
* GCC 13.0+, for NVIDIA Grace CPU execution
* NVPL 24.03+, for NVIDIA Grace CPU execution
* NCCL 2.19+, optional for inter-process communication
## Compile and build
### Cloning the repo
SSH
```
git clone ssh://github.com/NVIDIA/nvidia-hpcg
```
HTTPS
```
git clone https://github.com/NVIDIA/nvidia-hpcg
```
GitHub CLI
```
gh repo clone NVIDIA/nvidia-hpcg
```
### Compile the NVIDIA HPCG benchmark
The `build_sample.sh` script can be used to compile and build the NVIDIA HPCG benchmark. The paths to MPI, CUDA toolkit, CUDA Mathlibs, NCCL, and NVPL Sparse must be exported into `MPI_PATH`, `CUDA_PATH`, `MATHLIBS_PATH`, `NCCL_PATH`, and `NVPL_SPARSE_PATH` before running the `make` command.
The following options can be used to select the target platform:
* `USE_CUDA`, set to 1 to build for NVIDIA GPUs and 0 otherwise.
* `USE_GRACE`, set to 1 to build for NVIDIA Grace CPUs and 0 otherwise. When set to 0, the code builds for x86 platforms.
* `USE_NCCL`, set to 1 to build for NCCL and 0 otherwise.
The `USE_CUDA` and `USE_GRACE` options are used to create binaries that support one of three execution modes as follows:
* For GPU-only, set `USE_CUDA` to 1. When `USE_GRACE=1`, build for `aarch64`. When `USE_GRACE=0`, build for `x86`.
* For Grace-only, set `USE_CUDA` to 0 and `USE_GRACE` to 1.
* For GPU-Grace, set `USE_CUDA` to 1 and `USE_GRACE` to 1.
The `build_sample.sh` script uses `setup/MAKE.CUDA_AARCH64` and `setup/MAKE.CUDA_X86` to compose the include and link lines for the `make` command. These two scripts define compile-time options that are used in the source code; the options are explained in the two `setup/MAKE.CUDA_*` files. The build script creates a `build` directory and stores the NVIDIA HPCG binary in the `build/bin` and `bin` directories (the binary is copied from `build/bin` to `bin`). The build script produces one of the following binaries:
* xhpcg, when `USE_CUDA=1`.
* xhpcg-cpu, when `USE_CUDA=0` and `USE_GRACE=1`.
## Running the NVIDIA HPCG benchmark
The NVIDIA HPCG benchmark uses the same input format as the standard HPCG benchmark; alternatively, benchmark parameters can be passed as command-line options. Please see the HPCG benchmark documentation for an introduction to HPCG software concepts and best practices. The `bin` directory contains scripts to run the NVIDIA HPCG benchmark, along with descriptions and samples. The files `bin/RUNNING-x86` and `bin/RUNNING-aarch64` explain, in detail, how to run the NVIDIA HPCG benchmark on `x86` and `aarch64` platforms, respectively. The `run_sample.sh` script provides four examples that run on `x86` and Grace Hopper x4 platforms.
### Heterogeneous (GPU-Grace) execution mode in depth
The NVIDIA HPCG benchmark can run efficiently on heterogeneous systems that combine GPUs and Grace CPUs, such as Grace Hopper. The approach assigns one MPI rank to each GPU and one or more MPI ranks to the Grace CPU. Because the GPU is significantly faster than the Grace CPU, the strategy is to allocate a larger local problem to the GPU than to the Grace CPU. This ensures that during blocking MPI communication steps such as `MPI_Allreduce`, the GPU's execution is not stalled by the Grace CPU's slower execution.
In the NVIDIA HPCG benchmark, the GPU and Grace local problems are configured to differ in only one dimension while keeping the other dimensions identical. This design enables proper halo exchange across the dimensions that remain identical between the GPU and Grace ranks. The image below depicts an example of this design: the GPU and Grace ranks have the same x and y dimensions, where the halo exchange takes place, while the z dimension differs, which allows assigning different local problem sizes to the GPU and Grace ranks. The NVIDIA HPCG benchmark has the flexibility to choose the 3D shape of the rank grid, select which dimension differs, and configure the sizes of the GPU and Grace ranks. Refer to `bin/RUNNING-aarch64` for more details.
<img src="images/hpcg-gpu-grace-example.png" alt="drawing" width="150"/>
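To make the sizing concrete, here is a small arithmetic sketch. All values below are illustrative assumptions, not required settings: the differing dimension is y, and the GPU rank is given a 16x larger y extent than the Grace rank.

```shell
# Local-problem sizing when the y dimension differs between a GPU rank and
# a Grace rank; all values are illustrative assumptions.
nx=256; nz=288
ny_gpu=1024    # y extent of a GPU rank
ny_grace=64    # y extent of a Grace rank (smaller, since the CPU is slower)

echo "GPU rank rows:   $(( nx * ny_gpu   * nz ))"   # 75497472
echo "Grace rank rows: $(( nx * ny_grace * nz ))"   # 4718592
echo "GPU:Grace ratio: $(( ny_gpu / ny_grace ))"    # 16
```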
### Interpreting the results
By default, the NVIDIA HPCG benchmark writes its logs to standard output (`stdout`). To write to log files instead, set the `--of` option to 1.
Even when a run is valid, there are performance indicators to watch for in the logs (the line numbers below assume output to log files):
* In the iterations summary section (line 68), the number of optimized CG iterations per set (line 72) should be as close as possible to the reference value (i.e., 50 iterations). The user can try different parameters, such as the local problem size and 3D grid shape, to achieve a low iteration count.
* In the GFLOP/s summary (line 100), the value of `Total with convergence and optimization phase overhead` at line 107 should be as close as possible to `Raw Total`. Otherwise, make sure the number of optimized CG iterations per set, the setup time (line 20), and the optimization time (line 82) are reasonable compared to the total execution time. This is especially important when scaling to multiple nodes.
* When scaling on multi-node platforms, most of the DDOT computation time is spent in `MPI_Allreduce`. A high `MPI_Allreduce` time indicates a scaling bottleneck due to a small local problem size, or a configuration or platform problem.
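The line-number checks above can be scripted: the sketch below simply extracts the two named lines from a log file. Since no real HPCG log is available here, it is demonstrated on a synthetic 110-line stand-in.

```shell
# Extract the optimized-iterations line (72) and the overhead-adjusted
# GFLOP/s line (107) from a log file. A synthetic file stands in for a
# real HPCG log here.
log=$(mktemp)
for i in $(seq 1 110); do echo "line $i" >> "$log"; done
sed -n '72p;107p' "$log"
rm -f "$log"
```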
## Support
For questions or to provide feedback, please contact [HPCBenchmarks@nvidia.com](mailto:HPCBenchmarks@nvidia.com)
## License
The license can be found in the [LICENSE](LICENSE) file.

==== build_sample.sh (executable, 104 lines) ====
#! /usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
export CXX_PATH=/usr
export PATH=${CXX_PATH}/bin:${PATH}
if [[ -z "${MPI_PATH}" ]]; then
export MPI_PATH=/path/to/mpi #Change this to correct MPI path
fi
if [[ -z "${MATHLIBS_PATH}" ]]; then
export MATHLIBS_PATH=/path/to/mathlibs #Change this to correct CUDA mathlibs path
fi
if [[ -z "${NCCL_PATH}" ]]; then
export NCCL_PATH=/path/to/nccl #Change to correct NCCL path
fi
if [[ -z "${CUDA_PATH}" ]]; then
export CUDA_PATH=/path/to/cuda #Change this to correct CUDA path
fi
if [[ -z "${NVPL_SPARSE_PATH}" ]]; then
export NVPL_SPARSE_PATH=/path/to/nvpllibs #Change this to correct NVPL mathlibs
fi
export PATH=${CUDA_PATH}/bin:${PATH}
export LD_LIBRARY_PATH=${CUDA_PATH}/lib64:${LD_LIBRARY_PATH}
export LD_LIBRARY_PATH=${NVPL_SPARSE_PATH}/lib:${LD_LIBRARY_PATH}
#xhpcg binary will be located in build/bin
mkdir -p build
cd build
######## USE Nvidia GPU? ############
# 1: Yes
# 0: No
export USE_CUDA=1
if [[ $5 == "0" ]]; then
export USE_CUDA=0
fi
################################################
######## USE Grace CPU? ############
# 1: Yes
# 0: No
export USE_GRACE=1
if [[ $6 == "0" ]]; then
export USE_GRACE=0
fi
################################################
######## USE NCCL? ############
# 1: Yes
# 0: No
export USE_NCCL=1
if [[ $7 == "0" ]]; then
export USE_NCCL=0
fi
################################################
if [[ $USE_GRACE == 1 ]]; then
../configure CUDA_AARCH64
else
../configure CUDA_X86
fi
export build_B100=0
if [[ $8 == "1" ]]; then
export build_B100=1
fi
make -j 16 \
USE_CUDA=${USE_CUDA} \
USE_GRACE=${USE_GRACE} \
USE_NCCL=${USE_NCCL} \
MPdir=${MPI_PATH} \
MPlib=${MPI_PATH}/lib \
Mathdir=${MATHLIBS_PATH} \
NCCLdir=${NCCL_PATH} \
CUDA_HOME=${CUDA_PATH} \
NVPL_PATH=${NVPL_SPARSE_PATH} \
HPCG_ENG_VERSION=${is_ENG_VERSION} \
HPCG_COMMIT_HASH=$2 \
HPCG_VER_MAJOR=$3 \
HPCG_VER_MINOR=$4 \
BUILD_B100=${build_B100}
#Move build/bin/xhpcg to bin/xhpcg
make install

==== configure (vendored, executable, 50 lines) ====
#! /bin/sh
src_path=`echo $0 | sed -e s:/configure$::`
bld_path=`pwd`
#FIXME: need to check whether src and bld are the same (test f1 -ef f2)
if test x"$#" != x"1" -o x"$1" = "x" ; then
echo
echo Please specify '"'arch'"' argument, for example:
echo
echo $0 Unix
echo
exit 127
fi
arg_arch="$1"
setup_file=${src_path}/setup/Make.${arg_arch}
if test ! -f $setup_file ; then
echo
echo Please create the configuration file $setup_file
echo
exit 127
fi
mkfile=${bld_path}/Makefile
if test -d $mkfile -o -f $mkfile ; then
rm -rf $mkfile
fi
sed -e "s:HPCG_ROOT_PATH:${bld_path}:g" ${src_path}/Makefile.ext | sed -e "s:HPCG_SRC_PATH:${src_path}:g" | sed -e "s:UNKNOWN:${arg_arch}:" > $mkfile
# creating missing directories
for path in src testing bin setup
do
if test ! -d $path ; then
mkdir $path
fi
done
# copy hpcg.dat if it doesn't exist
if test ! -f bin/hpcg.dat ; then
cp ${src_path}/bin/hpcg.dat bin/hpcg.dat
fi
# copy the architecture setup file
cp -f $setup_file setup

==== images/hpcg-gpu-grace-example.png (binary image, 28 KiB, not shown) ====

==== run_sample.sh (99 lines) ====
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
export CXX_PATH=/usr
export PATH=${CXX_PATH}/bin:${PATH}
if [[ -z "${MPI_PATH}" ]]; then
export MPI_PATH=/path/to/mpi #Change this to correct MPI path
fi
if [[ -z "${MATHLIBS_PATH}" ]]; then
export MATHLIBS_PATH=/path/to/mathlibs #Change this to correct CUDA mathlibs path
fi
if [[ -z "${NCCL_PATH}" ]]; then
export NCCL_PATH=/path/to/nccl #Change to correct NCCL path
fi
if [[ -z "${CUDA_PATH}" ]]; then
export CUDA_PATH=/path/to/cuda #Change this to correct CUDA path
fi
if [[ -z "${NVPL_SPARSE}" ]]; then
export NVPL_SPARSE=/path/to/nvpllibs #Change this to correct NVPL mathlibs
fi
#Please fix, if needed
export CUDA_BLAS_VERSION=${CUDA_BUILD_VERSION:-12.2}
export LD_LIBRARY_PATH=${MATHLIBS_PATH}/${CUDA_BLAS_VERSION}/lib64/:${LD_LIBRARY_PATH}
export PATH=${CUDA_PATH}/bin:${PATH}
export LD_LIBRARY_PATH=${CUDA_PATH}/lib64:${LD_LIBRARY_PATH}
export LD_LIBRARY_PATH=${NCCL_PATH}/lib:${LD_LIBRARY_PATH}
export LD_LIBRARY_PATH=${NVPL_SPARSE}/lib:${LD_LIBRARY_PATH}
ext="--mca pml ^ucx --mca btl ^openib,smcuda -mca coll_hcoll_enable 0 -x coll_hcoll_np=0 --bind-to none"
#Directory to xhpcg binary
dir="bin/"
#Sample on a Hopper GPU x86
###########################
#Local problem size
nx=512 #Large problem size x
ny=512 #Large problem size y
nz=288 #Large problem size z
mpirun --oversubscribe ${ext} -np 1 ${dir}/hpcg.sh --exec-name ${dir}/xhpcg \
--nx $nx --ny $ny --nz $nz --rt 10 --b 0
########################################################################################
#Sample on Grace Hopper x4
###########################
#Local problem size
nx=256 #Large problem size x, assumed for the GPU
ny=1024 #Large problem size y, assumed for the GPU
nz=288 #Large problem size z, assumed for the GPU
#1 GPUOnly
#---------#
np=4 #Total number of ranks
mpirun --oversubscribe ${ext} -np $np ${dir}/hpcg-aarch64.sh --exec-name ${dir}/xhpcg \
--nx $nx --ny $ny --nz $nz --rt 10 --b 0 --exm 0 --p2p 0 \
--mem-affinity 0:1:2:3 --cpu-affinity 0-71:72-143:144-215:216-287
#2 GraceOnly
#-----------#
np=4 #Total number of ranks
mpirun --oversubscribe ${ext} -np $np ${dir}/hpcg-aarch64.sh --exec-name ${dir}/xhpcg-cpu \
--nx $nx --ny $ny --nz $nz --rt 10 --b 0 --exm 0 --p2p 0 \
--mem-affinity 0:1:2:3 --cpu-affinity 0-71:72-143:144-215:216-287
#3 Heterogeneous (GPU + Grace)
#----------------------------#
np=8 #Total number of ranks (4GPU + 4Grace)
exm=2 #Execution mode GPU+Grace
diff_dim=2 #different dim between GPU and Grace is Y
lpm=1 #Local problem mode (nx/ny/nz are local to GPU, g2c is the Grace different dimension)
g2c=64 #Based on diff_dim=2 and lpm=1 --> Grace rank local problem size is $nx x $g2c x $nz
#3D grid size 4x2x1 (must be equal to np)
npx=4 #number of ranks in the x direction
npy=2 #number of ranks in the y direction
npz=1 #number of ranks in the z direction
mpirun --oversubscribe ${ext} -np $np ${dir}/hpcg-aarch64.sh --exec-name ${dir}/xhpcg \
--nx $nx --ny $ny --nz $nz --rt 10 --b 0 --p2p 0 --exm $exm --lpm $lpm --g2c $g2c --ddm $diff_dim --npx $npx --npy $npy --npz $npz \
--mem-affinity 0:0:1:1:2:2:3:3 --cpu-affinity 0-7:8-71:72-79:80-143:144-151:152-215:216-223:224-287

==== setup/Make.CUDA_AARCH64 (202 lines) ====
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#HEADER
# -- High Performance Conjugate Gradient Benchmark (HPCG)
# HPCG - 3.1 - March 28, 2019
# Michael A. Heroux
# Scalable Algorithms Group, Computing Research Division
# Sandia National Laboratories, Albuquerque, NM
#
# Piotr Luszczek
# Jack Dongarra
# University of Tennessee, Knoxville
# Innovative Computing Laboratory
#
# (C) Copyright 2013-2019 All Rights Reserved
#
#
# -- Copyright notice and Licensing terms:
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions, and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. All advertising materials mentioning features or use of this
# software must display the following acknowledgement:
# This product includes software developed at Sandia National
# Laboratories, Albuquerque, NM and the University of
# Tennessee, Knoxville, Innovative Computing Laboratory.
#
# 4. The name of the University, the name of the Laboratory, or the
# names of its contributors may not be used to endorse or promote
# products derived from this software without specific written
# permission.
#
# -- Disclaimer:
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# ######################################################################
#@HEADER
# ----------------------------------------------------------------------
# - shell --------------------------------------------------------------
# ----------------------------------------------------------------------
#
SHELL = /bin/sh
#
CD = cd
CP = cp
LN_S = ln -s -f
MKDIR = mkdir -p
RM = /bin/rm -f
TOUCH = touch
#
# ----------------------------------------------------------------------
# - HPCG Directory Structure / HPCG library ------------------------------
# ----------------------------------------------------------------------
#
TOPdir = .
SRCdir = $(TOPdir)/src
INCdir = $(TOPdir)/src
BINdir = $(TOPdir)/bin
#
# ----------------------------------------------------------------------
# - Message Passing library (MPI) --------------------------------------
# ----------------------------------------------------------------------
# MPinc tells the C compiler where to find the Message Passing library
# header files, MPlib is defined to be the name of the library to be
# used. The variable MPdir is only used for defining MPinc and MPlib.
#
#MPdir =
#MPinc =
#MPlib =
#
#
# ----------------------------------------------------------------------
# - HPCG includes / libraries / specifics -------------------------------
# ----------------------------------------------------------------------
#
NVPL_SPARSE_INC=$(NVPL_PATH)/include
NVPL_SPARSE_LIB=$(NVPL_PATH)/lib
HPCG_INCLUDES = -I$(INCdir) -I$(INCdir)/$(arch) -I$(MPdir)/include $(CRAY_CUDATOOLKIT_INCLUDE_OPTS)
HPCG_LIBS = -L${MPlib} -lmpi
ifeq ($(USE_CUDA), 1)
HPCG_INCLUDES += -I$(CUDA_HOME)/include -I$(Mathdir)/include
HPCG_LIBS += -L$(Mathdir)/lib -lcuda -lcusparse -lcublas -lcublasLt -L$(CUDA_HOME)/lib64
endif
ifeq ($(USE_GRACE), 1)
HPCG_INCLUDES += -I$(NVPL_SPARSE_INC)
HPCG_LIBS += -L$(NVPL_SPARSE_LIB) -lnvpl_sparse
endif
ifeq ($(USE_NCCL), 1)
HPCG_INCLUDES += -I$(NCCLdir)/include
HPCG_LIBS += -L$(NCCLdir)/lib -lnccl
endif
#
# - Compile time options -----------------------------------------------
#
# -DHPCG_NO_MPI Define to disable MPI
# -DHPCG_NO_OPENMP Define to disable OPENMP
# -DHPCG_CONTIGUOUS_ARRAYS Define to have sparse matrix arrays long and contiguous
# -DHPCG_DEBUG Define to enable debugging output
# -DHPCG_DETAILED_DEBUG Define to enable very detailed debugging output
# -DUSE_CUDA Define to enable GPU execution
# -DUSE_GRACE Define to enable Grace CPU execution
# -DUSE_NCCL Define to enable NCCL P2P communication. Use --p2p=4 for NCCL
# -DUSE_INT64 Define to enable INT64 indexing
# By default HPCG will:
# *) Build with MPI enabled.
# *) Build with OpenMP enabled.
# *) Not generate debugging output.
#
HPCG_OPTS = -DHPCG_CUBIC_RADICAL_SEARCH -DHPCG_CONTIGUOUS_ARRAYS #-DHPCG_DEBUG #-DHPCG_NO_MPI
ifeq ($(USE_CUDA), 1)
HPCG_OPTS += -DUSE_CUDA
endif
ifeq ($(USE_GRACE), 1)
HPCG_OPTS += -DUSE_GRACE
endif
ifeq ($(USE_NCCL), 1)
HPCG_OPTS += -DUSE_NCCL
endif
ifeq ($(HPCG_ENG_VERSION), 1)
HPCG_OPTS += -DHPCG_ENG_VERSION
endif
ifeq ($(USE_INT64), 1)
HPCG_OPTS += -DINDEX_64
endif
#If not set, the default values in src/hpcg.hpp will be used
HPCG_OPTS += -Dmake_HPCG_VER_MAJOR=$(HPCG_VER_MAJOR)
HPCG_OPTS += -Dmake_HPCG_VER_MINOR=$(HPCG_VER_MINOR)
HPCG_OPTS += -DHPCG_COMMIT_HASH=$(HPCG_COMMIT_HASH)
#
# ----------------------------------------------------------------------
#
HPCG_DEFS = $(HPCG_OPTS) $(HPCG_INCLUDES)
#
# ----------------------------------------------------------------------
# - Compilers / linkers - Optimization flags ---------------------------
# ----------------------------------------------------------------------
#
ifeq ($(USE_CUDA), 1)
CUDA_ARCH = -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90
endif
ifeq ($(BUILD_B100), 1)
CUDA_ARCH += --generate-code arch=compute_100,code=sm_100
endif
CPU_ARCH ?= native
CXX = nvcc
CXXFLAGS = $(HPCG_DEFS) -O3 -Xcompiler --std=c++17 -Xcompiler -Ofast,-fopenmp,-mcpu=$(CPU_ARCH),-mtune=$(CPU_ARCH),-ftree-vectorize,-funroll-loops $(CUDA_ARCH)
#
LINKER = $(CXX)
LINKFLAGS = $(CXXFLAGS) $(HPCG_LIBS)
#
ARCHIVER = ar
ARFLAGS = r
RANLIB = echo
#
# ----------------------------------------------------------------------

==== setup/Make.CUDA_X86 (186 lines) ====
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#HEADER
# -- High Performance Conjugate Gradient Benchmark (HPCG)
# HPCG - 3.1 - March 28, 2019
# Michael A. Heroux
# Scalable Algorithms Group, Computing Research Division
# Sandia National Laboratories, Albuquerque, NM
#
# Piotr Luszczek
# Jack Dongarra
# University of Tennessee, Knoxville
# Innovative Computing Laboratory
#
# (C) Copyright 2013-2019 All Rights Reserved
#
#
# -- Copyright notice and Licensing terms:
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions, and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. All advertising materials mentioning features or use of this
# software must display the following acknowledgement:
# This product includes software developed at Sandia National
# Laboratories, Albuquerque, NM and the University of
# Tennessee, Knoxville, Innovative Computing Laboratory.
#
# 4. The name of the University, the name of the Laboratory, or the
# names of its contributors may not be used to endorse or promote
# products derived from this software without specific written
# permission.
#
# -- Disclaimer:
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# ######################################################################
#@HEADER
# ----------------------------------------------------------------------
# - shell --------------------------------------------------------------
# ----------------------------------------------------------------------
#
SHELL = /bin/sh
#
CD = cd
CP = cp
LN_S = ln -s -f
MKDIR = mkdir -p
RM = /bin/rm -f
TOUCH = touch
#
# ----------------------------------------------------------------------
# - HPCG Directory Structure / HPCG library ------------------------------
# ----------------------------------------------------------------------
#
TOPdir = .
SRCdir = $(TOPdir)/src
INCdir = $(TOPdir)/src
BINdir = $(TOPdir)/bin
#
# ----------------------------------------------------------------------
# - Message Passing library (MPI) --------------------------------------
# ----------------------------------------------------------------------
# MPinc tells the C compiler where to find the Message Passing library
# header files, MPlib is defined to be the name of the library to be
# used. The variable MPdir is only used for defining MPinc and MPlib.
#
#MPdir =
#MPinc =
#MPlib =
#
#
# ----------------------------------------------------------------------
# - HPCG includes / libraries / specifics -------------------------------
# ----------------------------------------------------------------------
#
NVPL_SPARSE_INC=$(NVPL_PATH)/include
NVPL_SPARSE_LIB=$(NVPL_PATH)/lib
HPCG_INCLUDES = -I$(INCdir) -I$(INCdir)/$(arch) -I$(MPdir)/include $(CRAY_CUDATOOLKIT_INCLUDE_OPTS)
HPCG_LIBS = -L${MPlib} -lmpi
HPCG_INCLUDES += -I$(CUDA_HOME)/include -I$(Mathdir)/include
HPCG_LIBS += -L$(Mathdir)/lib -lcuda -lcusparse -lcublas -lcublasLt -L$(CUDA_HOME)/lib64
ifeq ($(USE_NCCL), 1)
HPCG_INCLUDES += -I$(NCCLdir)/include
HPCG_LIBS += -L$(NCCLdir)/lib -lnccl
endif
#
# - Compile time options -----------------------------------------------
#
# -DHPCG_NO_MPI Define to disable MPI
# -DHPCG_NO_OPENMP Define to disable OPENMP
# -DHPCG_CONTIGUOUS_ARRAYS Define to have sparse matrix arrays long and contiguous
# -DHPCG_DEBUG Define to enable debugging output
# -DHPCG_DETAILED_DEBUG Define to enable very detailed debugging output
# -DUSE_CUDA Define to enable GPU execution
# -DUSE_NCCL Define to enable NCCL P2P communication. Use --p2p=4 for NCCL
# -DUSE_INT64 Define to enable INT64 indexing
# By default HPCG will:
# *) Build with MPI enabled.
# *) Build with OpenMP enabled.
# *) Not generate debugging output.
#
HPCG_OPTS = -DHPCG_CUBIC_RADICAL_SEARCH -DHPCG_CONTIGUOUS_ARRAYS #-DHPCG_DEBUG #-DHPCG_NO_MPI
HPCG_OPTS += -DUSE_CUDA
ifeq ($(USE_NCCL), 1)
HPCG_OPTS += -DUSE_NCCL
endif
ifeq ($(HPCG_ENG_VERSION), 1)
HPCG_OPTS += -DHPCG_ENG_VERSION
endif
ifeq ($(USE_INT64), 1)
HPCG_OPTS += -DINDEX_64
endif
#If not set, the default values in src/hpcg.hpp will be used
HPCG_OPTS += -Dmake_HPCG_VER_MAJOR=$(HPCG_VER_MAJOR)
HPCG_OPTS += -Dmake_HPCG_VER_MINOR=$(HPCG_VER_MINOR)
HPCG_OPTS += -DHPCG_COMMIT_HASH=$(HPCG_COMMIT_HASH)
#
# ----------------------------------------------------------------------
#
HPCG_DEFS = $(HPCG_OPTS) $(HPCG_INCLUDES)
#
# ----------------------------------------------------------------------
# - Compilers / linkers - Optimization flags ---------------------------
# ----------------------------------------------------------------------
#
CUDA_ARCH = -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90
ifeq ($(BUILD_B100), 1)
CUDA_ARCH += --generate-code arch=compute_100,code=sm_100
endif
CPU_ARCH ?= native
CXX = nvcc
CXXFLAGS = $(HPCG_DEFS) -O3 -Xcompiler --std=c++17 -Xcompiler -Ofast,-fopenmp,-mcpu=$(CPU_ARCH),-mtune=$(CPU_ARCH),-ftree-vectorize,-funroll-loops $(CUDA_ARCH)
#
LINKER = $(CXX)
LINKFLAGS = $(CXXFLAGS) $(HPCG_LIBS)
#
ARCHIVER = ar
ARFLAGS = r
RANLIB = echo
#
# ----------------------------------------------------------------------

==== setup/Make.UNKNOWN (24 lines) ====
# -*- Makefile -*-
arch=UNKNOWN
VERSION = 3.1
UNKNOWN:
@echo
@echo Please specify "'"arch"'" variable, for example:
@echo 1. Create file "'"Make.Unix"'" in the "'"setup"'" directory
@echo 2. Type: "'"make arch=Unix"'"
@echo
#GNUTAR = gnutar # or "gtar" on Linux
GNUTAR = gtar
dist:
@echo Packaging for version $(VERSION)
ln -s -f . hpcg-$(VERSION)
grep :0: /etc/group | sed -e 's/:.*//' | xargs -I '{}' $(GNUTAR) --owner=root --group='{}' -cvhof hpcg-$(VERSION).tar hpcg-$(VERSION)/src/*.[ch]pp hpcg-$(VERSION)/[BCHIQRTV]* hpcg-$(VERSION)/bin/hpcg.dat hpcg-$(VERSION)/setup/Make.* hpcg-$(VERSION)/configure hpcg-$(VERSION)/Makefile hpcg-$(VERSION)/Makefile.ext hpcg-$(VERSION)/tools/hpcg.dox
gzip -v --best hpcg-$(VERSION).tar
rm -f hpcg-$(VERSION)
.PHONY: UNKNOWN dist

==== src/CG.cpp (241 lines) ====
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file CG.cpp
HPCG routine
*/
#include <fstream>
#include <cmath>
#include "hpcg.hpp"
#include "CG.hpp"
#include "ComputeDotProduct.hpp"
#include "ComputeMG.hpp"
#include "ComputeSPMV.hpp"
#include "ComputeWAXPBY.hpp"
#include "mytimer.hpp"
#include <iostream>
#include "CpuKernels.hpp"
#include <mpi.h>
extern int use_output_file;
#define TICKD() t0 = mytimer() //!< record current time in 't0'
#define TOCKD(t) t += mytimer() - t0 //!< store time difference in 't' using time in 't0'
/*!
Routine to compute an approximate solution to Ax = b
@param[inout] A The known system matrix
@param[inout] data The data structure with all necessary CG vectors preallocated
@param[in] b The known right hand side vector
@param[inout] x On entry: the initial guess; on exit: the new approximate solution
@param[in] max_iter The maximum number of iterations to perform, even if tolerance is not met.
@param[in] tolerance The stopping criterion to assert convergence: if norm of residual is <= to tolerance.
@param[out] niters The number of iterations actually performed.
@param[out] normr The 2-norm of the residual vector after the last iteration.
@param[out] normr0 The 2-norm of the residual vector before the first iteration.
@param[out] times The 7-element vector of the timing information accumulated during all of the iterations.
@param[in] doPreconditioning The flag to indicate whether the preconditioner should be invoked at each iteration.
@param[in] flag If nonzero, rank 0 prints the initial and per-iteration residual norms.
@return Returns zero on success and a non-zero value otherwise.
@see CG_ref()
*/
int CG(const SparseMatrix& A, CGData& data, const Vector& b, Vector& x, const int max_iter, const double tolerance,
int& niters, double& normr, double& normr0, double* times, bool doPreconditioning, int flag)
{
double t_begin = mytimer(); // Start timing right away
normr = 0.0;
double rtz = 0.0, oldrtz = 0.0, alpha = 0.0, beta = 0.0, pAp = 0.0;
double t0 = 0.0, t1 = 0.0, t2 = 0.0, t3 = 0.0, t4 = 0.0, t5 = 0.0;
// #ifndef HPCG_NO_MPI
// double t6 = 0.0;
// #endif
local_int_t nrow = A.localNumberOfRows;
Vector& r = data.r; // Residual vector
Vector& z = data.z; // Preconditioned residual vector
Vector& p = data.p; // Direction vector (in MPI mode ncol>=nrow)
Vector& Ap = data.Ap;
if (!doPreconditioning && A.geom->rank == 0)
if (use_output_file)
{
HPCG_fout << "WARNING: PERFORMING UNPRECONDITIONED ITERATIONS" << std::endl;
}
else
{
std::cout << "WARNING: PERFORMING UNPRECONDITIONED ITERATIONS" << std::endl;
}
int print_freq = 1;
if (print_freq > 50)
print_freq = 50;
if (print_freq < 1)
print_freq = 1;
// p is of length ncols, copy x to p for sparse MV operation
if (A.rankType == GPU)
{
#ifdef USE_CUDA
CopyVectorD2D(x, p);
#endif
}
else
{
CopyVector(x, p);
}
TICKD();
ComputeSPMV(A, p, Ap);
TOCKD(t3); // Ap = A*p
TICKD();
ComputeWAXPBY(nrow, 1.0, b, -1.0, Ap, r, A.isWaxpbyOptimized, A.rankType);
TOCKD(t2); // r = b - Ax (x stored in p)
TICKD();
ComputeDotProduct(nrow, r, r, normr, t4, A.isDotProductOptimized, A.rankType);
TOCKD(t1);
normr = sqrt(normr);
if (A.geom->rank == 0 && flag)
if (use_output_file)
{
HPCG_fout << "Initial Residual = " << normr << std::endl;
}
else
{
std::cout << "Initial Residual = " << normr << std::endl;
}
// Record initial residual for convergence testing
normr0 = normr;
// Start iterations
for (int k = 1; k <= max_iter && normr / normr0 * (1.0 + 1.0e-6) > tolerance; k++)
{
TICKD();
if (doPreconditioning)
{
ComputeMG(A, r, z); // Apply preconditioner
if (A.rankType == GPU)
{
#ifdef USE_CUDA
cudaStreamSynchronize(stream);
#endif
}
}
else
{
if (A.rankType == GPU)
{
#ifdef USE_CUDA
CopyVectorD2D(r, z); // copy r to z (no preconditioning)
#endif
}
else
{
CopyVector(r, z); // copy r to z (no preconditioning)
}
}
TOCKD(t5); // Preconditioner apply time
if (k == 1)
{
TICKD();
ComputeWAXPBY(nrow, 1.0, z, 0.0, z, p, A.isWaxpbyOptimized, A.rankType);
TOCKD(t2); // Copy Mr to p
TICKD();
ComputeDotProduct(nrow, r, z, rtz, t4, A.isDotProductOptimized, A.rankType);
TOCKD(t1); // rtz = r'*z
}
else
{
oldrtz = rtz;
TICKD();
ComputeDotProduct(nrow, r, z, rtz, t4, A.isDotProductOptimized, A.rankType);
TOCKD(t1); // rtz = r'*z
beta = rtz / oldrtz;
TICKD();
ComputeWAXPBY(nrow, 1.0, z, beta, p, p, A.isWaxpbyOptimized, A.rankType);
TOCKD(t2); // p = beta*p + z
}
TICKD();
ComputeSPMV(A, p, Ap);
TOCKD(t3); // Ap = A*p
TICKD();
ComputeDotProduct(nrow, p, Ap, pAp, t4, A.isDotProductOptimized, A.rankType);
TOCKD(t1); // alpha = p'*Ap
alpha = rtz / pAp;
TICKD();
ComputeWAXPBY(nrow, 1.0, x, alpha, p, x, A.isWaxpbyOptimized, A.rankType); // x = x + alpha*p
ComputeWAXPBY(nrow, 1.0, r, -alpha, Ap, r, A.isWaxpbyOptimized, A.rankType);
TOCKD(t2); // r = r - alpha*Ap
TICKD();
ComputeDotProduct(nrow, r, r, normr, t4, A.isDotProductOptimized, A.rankType);
TOCKD(t1);
normr = sqrt(normr);
if (flag && A.geom->rank == 0 && (k % print_freq == 0 || k == max_iter))
if (use_output_file)
{
HPCG_fout << "Iteration = " << k << " Scaled Residual = " << normr / normr0 << std::endl;
}
else
{
std::cout << "Iteration = " << k << " Scaled Residual = " << normr / normr0 << std::endl;
}
niters = k;
}
// Store times
times[1] += t1; // dot-product time
times[2] += t2; // WAXPBY time
times[3] += t3; // SPMV time
times[4] += t4; // AllReduce time
times[5] += t5; // preconditioner apply time
// #ifndef HPCG_NO_MPI
// times[6] += t6; // exchange halo time
// #endif
times[0] += mytimer() - t_begin; // Total time. All done...
return 0;
}

src/CG.hpp
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef CG_HPP
#define CG_HPP
#include "CGData.hpp"
#include "SparseMatrix.hpp"
#include "Vector.hpp"
int CG(const SparseMatrix& A, CGData& data, const Vector& b, Vector& x, const int max_iter, const double tolerance,
int& niters, double& normr, double& normr0, double* times, bool doPreconditioning, int flag);
// this function will compute the Conjugate Gradient iterations.
// geom - Domain and processor topology information
// A - Matrix
// b - right hand side vector
// x - on entry: initial guess; on exit: approximate solution
// max_iter - how many times we iterate
// tolerance - Stopping tolerance for preconditioned iterations.
// niters - number of iterations performed
// normr - computed residual norm
// normr0 - Original residual
// times - array of timing information
// doPreconditioning - bool to specify whether or not symmetric GS will be applied.
// flag - when non-zero, residual progress is printed
#endif // CG_HPP

src/CGData.hpp
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file CGData.hpp
HPCG data structure
*/
#ifndef CGDATA_HPP
#define CGDATA_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
struct CGData_STRUCT
{
Vector r; //!< pointer to residual vector
Vector z; //!< pointer to preconditioned residual vector
Vector p; //!< pointer to direction vector
Vector Ap; //!< pointer to Krylov vector
};
typedef struct CGData_STRUCT CGData;
/*!
Constructor for the data structure of CG vectors.
@param[in] A the data structure that describes the problem matrix and its structure
@param[out] data the data structure for CG vectors that will be allocated to get it ready for use in CG iterations
*/
inline void InitializeSparseCGData(SparseMatrix& A, CGData& data)
{
local_int_t nrow = A.localNumberOfRows;
local_int_t ncol = A.localNumberOfColumns;
InitializeVector(data.r, nrow, A.rankType);
InitializeVector(data.z, ncol, A.rankType, true /*Only when rank type is GPU*/);
InitializeVector(data.p, ncol, A.rankType, true);
InitializeVector(data.Ap, nrow, A.rankType);
return;
}
/*!
Destructor for the CG vectors data.
@param[inout] data the CG vectors data structure whose storage is deallocated
*/
inline void DeleteCGData(CGData& data)
{
DeleteVector(data.r);
DeleteVector(data.z);
DeleteVector(data.p);
DeleteVector(data.Ap);
return;
}
#endif // CGDATA_HPP

src/CG_ref.cpp
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file CG_ref.cpp
HPCG routine
*/
#include <cmath>
#include <fstream>
#include <iostream>
#include "hpcg.hpp"
#include "CG_ref.hpp"
#include "ComputeDotProduct_ref.hpp"
#include "ComputeMG_ref.hpp"
#include "ComputeSPMV_ref.hpp"
#include "ComputeWAXPBY_ref.hpp"
#include "mytimer.hpp"
extern int use_output_file;
// Use TICK and TOCK to time a code section in MATLAB-like fashion
#define TICK() t0 = mytimer() //!< record current time in 't0'
#define TOCK(t) t += mytimer() - t0 //!< store time difference in 't' using time in 't0'
/*!
Reference routine to compute an approximate solution to Ax = b
@param[inout] A The known system matrix
@param[inout] data The data structure with all necessary CG vectors preallocated
@param[in] b The known right hand side vector
@param[inout] x On entry: the initial guess; on exit: the new approximate solution
@param[in] max_iter The maximum number of iterations to perform, even if tolerance is not met.
@param[in] tolerance The stopping criterion to assert convergence: if norm of residual is <= to tolerance.
@param[out] niters The number of iterations actually performed.
@param[out] normr The 2-norm of the residual vector after the last iteration.
@param[out] normr0 The 2-norm of the residual vector before the first iteration.
@param[out] times The 7-element vector of the timing information accumulated during all of the iterations.
@param[in] doPreconditioning The flag to indicate whether the preconditioner should be invoked at each iteration.
@return Returns zero on success and a non-zero value otherwise.
@see CG()
*/
int CG_ref(const SparseMatrix& A, CGData& data, const Vector& b, Vector& x, const int max_iter, const double tolerance,
int& niters, double& normr, double& normr0, double* times, bool doPreconditioning, int flag)
{
double t_begin = mytimer(); // Start timing right away
normr = 0.0;
double rtz = 0.0, oldrtz = 0.0, alpha = 0.0, beta = 0.0, pAp = 0.0;
double t0 = 0.0, t1 = 0.0, t2 = 0.0, t3 = 0.0, t4 = 0.0, t5 = 0.0;
// #ifndef HPCG_NO_MPI
// double t6 = 0.0;
// #endif
local_int_t nrow = A.localNumberOfRows;
Vector& r = data.r; // Residual vector
Vector& z = data.z; // Preconditioned residual vector
Vector& p = data.p; // Direction vector (in MPI mode ncol>=nrow)
Vector& Ap = data.Ap;
if (!doPreconditioning && A.geom->rank == 0)
if (use_output_file)
{
HPCG_fout << "WARNING: PERFORMING UNPRECONDITIONED ITERATIONS" << std::endl;
}
else
{
std::cout << "WARNING: PERFORMING UNPRECONDITIONED ITERATIONS" << std::endl;
}
#if 1
// def HPCG_DEBUG
int print_freq = 1;
if (print_freq > 50)
print_freq = 50;
if (print_freq < 1)
print_freq = 1;
#endif
// p is of length ncols, copy x to p for sparse MV operation
CopyVector(x, p);
TICK();
ComputeSPMV_ref(A, p, Ap);
TOCK(t3); // Ap = A*p
TICK();
ComputeWAXPBY_ref(nrow, 1.0, b, -1.0, Ap, r);
TOCK(t2); // r = b - Ax (x stored in p)
TICK();
ComputeDotProduct_ref(nrow, r, r, normr, t4);
TOCK(t1);
normr = sqrt(normr);
#if 1
// def HPCG_DEBUG
if (A.geom->rank == 0 && flag)
if (use_output_file)
{
HPCG_fout << "Initial Residual = " << normr << std::endl;
}
else
{
std::cout << "Initial Residual = " << normr << std::endl;
}
#endif
// Record initial residual for convergence testing
normr0 = normr;
// Start iterations
for (int k = 1; k <= max_iter && normr / normr0 > tolerance; k++)
{
TICK();
if (doPreconditioning)
ComputeMG_ref(A, r, z); // Apply preconditioner
else
ComputeWAXPBY_ref(nrow, 1.0, r, 0.0, r, z); // copy r to z (no preconditioning)
TOCK(t5); // Preconditioner apply time
if (k == 1)
{
TICK();
CopyVector(z, p);
TOCK(t2); // Copy Mr to p
TICK();
ComputeDotProduct_ref(nrow, r, z, rtz, t4);
TOCK(t1); // rtz = r'*z
}
else
{
oldrtz = rtz;
TICK();
ComputeDotProduct_ref(nrow, r, z, rtz, t4);
TOCK(t1); // rtz = r'*z
beta = rtz / oldrtz;
TICK();
ComputeWAXPBY_ref(nrow, 1.0, z, beta, p, p);
TOCK(t2); // p = beta*p + z
}
TICK();
ComputeSPMV_ref(A, p, Ap);
TOCK(t3); // Ap = A*p
TICK();
ComputeDotProduct_ref(nrow, p, Ap, pAp, t4);
TOCK(t1); // pAp = p'*Ap
alpha = rtz / pAp;
TICK();
ComputeWAXPBY_ref(nrow, 1.0, x, alpha, p, x); // x = x + alpha*p
ComputeWAXPBY_ref(nrow, 1.0, r, -alpha, Ap, r);
TOCK(t2); // r = r - alpha*Ap
TICK();
ComputeDotProduct_ref(nrow, r, r, normr, t4);
TOCK(t1);
normr = sqrt(normr);
#if 1
// def HPCG_DEBUG
if (flag && A.geom->rank == 0 && (k % print_freq == 0 || k == max_iter))
if (use_output_file)
{
HPCG_fout << "Iteration = " << k << " Scaled Residual = " << normr / normr0 << std::endl;
}
else
{
std::cout << "Iteration = " << k << " Scaled Residual = " << normr / normr0 << std::endl;
}
#endif
niters = k;
}
// Store times
times[1] += t1; // dot product time
times[2] += t2; // WAXPBY time
times[3] += t3; // SPMV time
times[4] += t4; // AllReduce time
times[5] += t5; // preconditioner apply time
// #ifndef HPCG_NO_MPI
// times[6] += t6; // exchange halo time
// #endif
times[0] += mytimer() - t_begin; // Total time. All done...
return 0;
}

src/CG_ref.hpp
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef CG_REF_HPP
#define CG_REF_HPP
#include "CGData.hpp"
#include "SparseMatrix.hpp"
#include "Vector.hpp"
// The use of CPU and GPU Sparse Matrix is intended to resolve
// the linked list structures for MG coarse levels
// There is no change to the reference code
int CG_ref(const SparseMatrix& A, CGData& data, const Vector& b, Vector& x, const int max_iter, const double tolerance,
int& niters, double& normr, double& normr0, double* times, bool doPreconditioning, int flag);
// this function will compute the Conjugate Gradient iterations.
// geom - Domain and processor topology information
// A - Matrix
// b - right hand side vector
// x - on entry: initial guess; on exit: approximate solution
// max_iter - how many times we iterate
// tolerance - Stopping tolerance for preconditioned iterations.
// niters - number of iterations performed
// normr - computed residual norm
// normr0 - Original residual
// times - array of timing information
// doPreconditioning - bool to specify whether or not symmetric GS will be applied.
// flag - when non-zero, residual progress is printed
#endif // CG_REF_HPP

src/CheckAspectRatio.cpp
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file CheckAspectRatio.cpp
HPCG routine
*/
#include <algorithm>
#include <iostream>
#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif
#include "hpcg.hpp"
#include "CheckAspectRatio.hpp"
extern int use_output_file;
int CheckAspectRatio(double smallest_ratio, int x, int y, int z, const char* what, bool DoIo)
{
double current_ratio = std::min(std::min(x, y), z) / double(std::max(std::max(x, y), z));
if (current_ratio < smallest_ratio)
{ // ratio of the smallest to the largest
if (DoIo)
{
if (use_output_file)
{
HPCG_fout << "The " << what << " sizes (" << x << "," << y << "," << z
<< ") are invalid because the ratio min(x,y,z)/max(x,y,z)=" << current_ratio
<< " is too small (at least " << smallest_ratio << " is required)." << std::endl;
HPCG_fout << "The shape should resemble a 3D cube. Please adjust and try again." << std::endl;
HPCG_fout.flush();
}
else
{
std::cout << "The " << what << " sizes (" << x << "," << y << "," << z
<< ") are invalid because the ratio min(x,y,z)/max(x,y,z)=" << current_ratio
<< " is too small (at least " << smallest_ratio << " is required)." << std::endl;
std::cout << "The shape should resemble a 3D cube. Please adjust and try again." << std::endl
<< std::flush;
}
}
#ifndef HPCG_NO_MPI
MPI_Abort(MPI_COMM_WORLD, 127);
#endif
return 127;
}
return 0;
}

src/CheckAspectRatio.hpp
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef CHECKASPECTRATIO_HPP
#define CHECKASPECTRATIO_HPP
extern int CheckAspectRatio(double smallest_ratio, int x, int y, int z, const char* what, bool DoIo);
#endif // CHECKASPECTRATIO_HPP

src/CheckProblem.cpp
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file CheckProblem.cpp
HPCG routine
*/
#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#if defined(HPCG_DEBUG) || defined(HPCG_DETAILED_DEBUG)
#include <fstream>
using std::endl;
#include "hpcg.hpp"
#endif
#include <cassert>
#include "CheckProblem.hpp"
/*!
Check the contents of the generated sparse matrix to see if values match expected contents.
@param[in] A The known system matrix
@param[inout] b The newly allocated and generated right hand side vector (if b!=0 on entry)
@param[inout] x The newly allocated solution vector with entries set to 0.0 (if x!=0 on entry)
  @param[inout] xexact The newly allocated solution vector with entries set to the exact solution (if xexact!=0 on
entry)
@see GenerateGeometry
*/
void CheckProblem(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact)
{
// Make local copies of geometry information. Use global_int_t since the RHS products in the calculations
// below may result in global range values.
global_int_t nx = A.geom->nx;
global_int_t ny = A.geom->ny;
global_int_t nz = A.geom->nz;
global_int_t gnx = A.geom->gnx;
global_int_t gny = A.geom->gny;
global_int_t gnz = A.geom->gnz;
global_int_t gix0 = A.geom->gix0;
global_int_t giy0 = A.geom->giy0;
global_int_t giz0 = A.geom->giz0;
local_int_t localNumberOfRows = nx * ny * nz; // This is the size of our subblock
global_int_t totalNumberOfRows = gnx * gny * gnz; // Total number of grid points in mesh
double* bv = 0;
double* xv = 0;
double* xexactv = 0;
if (b != 0)
bv = b->values; // Only compute exact solution if requested
if (x != 0)
xv = x->values; // Only compute exact solution if requested
if (xexact != 0)
xexactv = xexact->values; // Only compute exact solution if requested
local_int_t localNumberOfNonzeros = 0;
// TODO: This triply nested loop could be flattened or use nested parallelism
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
for (local_int_t iz = 0; iz < nz; iz++)
{
global_int_t giz = giz0 + iz;
for (local_int_t iy = 0; iy < ny; iy++)
{
global_int_t giy = giy0 + iy;
for (local_int_t ix = 0; ix < nx; ix++)
{
global_int_t gix = gix0 + ix;
local_int_t currentLocalRow = iz * nx * ny + iy * nx + ix;
global_int_t currentGlobalRow = giz * gnx * gny + giy * gnx + gix;
assert(A.localToGlobalMap[currentLocalRow] == currentGlobalRow);
#ifdef HPCG_DETAILED_DEBUG
HPCG_fout << " rank, globalRow, localRow = " << A.geom->rank << " " << currentGlobalRow << " "
<< A.globalToLocalMap.find(currentGlobalRow)->second << endl;
#endif
char numberOfNonzerosInRow = 0;
double* currentValuePointer
= A.matrixValues[currentLocalRow]; // Pointer to current value in current row
global_int_t* currentIndexPointerG
= A.mtxIndG[currentLocalRow]; // Pointer to current index in current row
for (int sz = -1; sz <= 1; sz++)
{
if (giz + sz > -1 && giz + sz < gnz)
{
for (int sy = -1; sy <= 1; sy++)
{
if (giy + sy > -1 && giy + sy < gny)
{
for (int sx = -1; sx <= 1; sx++)
{
if (gix + sx > -1 && gix + sx < gnx)
{
global_int_t curcol = currentGlobalRow + sz * gnx * gny + sy * gnx + sx;
if (curcol == currentGlobalRow)
{
assert(A.matrixDiagonal[currentLocalRow] == currentValuePointer);
assert(*currentValuePointer++ == 26.0);
}
else
{
assert(*currentValuePointer++ == -1.0);
}
assert(*currentIndexPointerG++ == curcol);
numberOfNonzerosInRow++;
} // end x bounds test
} // end sx loop
} // end y bounds test
} // end sy loop
} // end z bounds test
} // end sz loop
assert(A.nonzerosInRow[currentLocalRow] == numberOfNonzerosInRow);
#ifndef HPCG_NO_OPENMP
#pragma omp critical
#endif
localNumberOfNonzeros += numberOfNonzerosInRow; // Protect this with an atomic
if (b != 0)
assert(bv[currentLocalRow] == 26.0 - ((double) (numberOfNonzerosInRow - 1)));
if (x != 0)
assert(xv[currentLocalRow] == 0.0);
if (xexact != 0)
assert(xexactv[currentLocalRow] == 1.0);
} // end ix loop
} // end iy loop
} // end iz loop
#ifdef HPCG_DETAILED_DEBUG
HPCG_fout << "Process " << A.geom->rank << " of " << A.geom->size << " has " << localNumberOfRows << " rows."
<< endl
<< "Process " << A.geom->rank << " of " << A.geom->size << " has " << localNumberOfNonzeros
<< " nonzeros." << endl;
#endif
global_int_t totalNumberOfNonzeros = 0;
#ifndef HPCG_NO_MPI
// Use MPI's reduce function to sum all nonzeros
#ifdef HPCG_NO_LONG_LONG
MPI_Allreduce(&localNumberOfNonzeros, &totalNumberOfNonzeros, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
#else
long long lnnz = localNumberOfNonzeros, gnnz = 0; // convert to 64 bit for MPI call
MPI_Allreduce(&lnnz, &gnnz, 1, MPI_LONG_LONG_INT, MPI_SUM, MPI_COMM_WORLD);
totalNumberOfNonzeros = gnnz; // Copy back
#endif
#else
totalNumberOfNonzeros = localNumberOfNonzeros;
#endif
assert(A.totalNumberOfRows == totalNumberOfRows);
assert(A.totalNumberOfNonzeros == totalNumberOfNonzeros);
assert(A.localNumberOfRows == localNumberOfRows);
assert(A.localNumberOfNonzeros == localNumberOfNonzeros);
return;
}

src/CheckProblem.hpp
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef CHECKPROBLEM_HPP
#define CHECKPROBLEM_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
void CheckProblem(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact);
#endif // CHECKPROBLEM_HPP

src/ComputeDotProduct.cpp
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file ComputeDotProduct.cpp
HPCG routine
*/
#ifndef HPCG_NO_MPI
#include "mytimer.hpp"
#include <mpi.h>
#endif
#include "ComputeDotProduct.hpp"
#include "ComputeDotProduct_ref.hpp"
#ifdef USE_CUDA
#include "Cuda.hpp"
#define CHECK_CUBLAS(x) \
do \
{ \
cublasStatus_t cublasStatus = (x); \
if (cublasStatus != CUBLAS_STATUS_SUCCESS) \
{ \
fprintf(stderr, "CUBLAS: %s = %d at (%s:%d)\n", #x, cublasStatus, __FILE__, __LINE__); \
exit(1); \
} \
} while (0)
#endif
#ifdef USE_GRACE
#include "CpuKernels.hpp"
#endif
/*!
Routine to compute the dot product of two vectors.
    This optimized routine dispatches to a cuBLAS dot product on GPU ranks or
    to a CPU kernel on Grace ranks, depending on the rank type.
@param[in] n the number of vector elements (on this processor)
@param[in] x, y the input vectors
@param[out] result a pointer to scalar value, on exit will contain the result.
@param[out] time_allreduce the time it took to perform the communication between processes
@param[out] isOptimized should be set to false if this routine uses the reference implementation (is not optimized);
otherwise leave it unchanged
@return returns 0 upon success and non-zero otherwise
@see ComputeDotProduct_ref
*/
int ComputeDotProduct(const local_int_t n, const Vector& x, const Vector& y, double& result, double& time_allreduce,
bool& isOptimized, rank_type_t rt)
{
double local_result = 0.0;
if (rt == GPU)
{
#ifdef USE_CUDA
CHECK_CUBLAS(cublasDdot(cublashandle, n, x.values_d, 1, y.values_d, 1, &local_result));
#endif
}
else
{
#ifdef USE_GRACE
// Consider replacing with NVPL BLAS dot product
ComputeDotProductCpu(n, x, y, local_result, isOptimized);
#endif
}
#ifndef HPCG_NO_MPI
// Use MPI's reduce function to collect all partial sums
double t0 = mytimer();
double global_result = 0.0;
MPI_Allreduce(&local_result, &global_result, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
result = global_result;
t0 = mytimer() - t0;
time_allreduce += t0;
#else
time_allreduce += 0.0;
result = local_result;
#endif
return 0;
}

src/ComputeDotProduct.hpp
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef COMPUTEDOTPRODUCT_HPP
#define COMPUTEDOTPRODUCT_HPP
#include "Vector.hpp"
int ComputeDotProduct(const local_int_t n, const Vector& x, const Vector& y, double& result, double& time_allreduce,
bool& isOptimized, rank_type_t rt);
#endif // COMPUTEDOTPRODUCT_HPP

src/ComputeDotProduct_ref.cpp
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file ComputeDotProduct_ref.cpp
HPCG routine
*/
#ifndef HPCG_NO_MPI
#include "mytimer.hpp"
#include <mpi.h>
#endif
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include "ComputeDotProduct_ref.hpp"
#include <cassert>
/*!
    Routine to compute the dot product of two vectors.
This is the reference dot-product implementation. It _CANNOT_ be modified for the
purposes of this benchmark.
@param[in] n the number of vector elements (on this processor)
@param[in] x, y the input vectors
  @param[out] result a pointer to scalar value, on exit will contain the result.
@param[out] time_allreduce the time it took to perform the communication between processes
@return returns 0 upon success and non-zero otherwise
@see ComputeDotProduct
*/
int ComputeDotProduct_ref(const local_int_t n, const Vector& x, const Vector& y, double& result, double& time_allreduce)
{
assert(x.localLength >= n); // Test vector lengths
assert(y.localLength >= n);
double local_result = 0.0;
double* xv = x.values;
double* yv = y.values;
if (yv == xv)
{
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for reduction(+ : local_result)
#endif
for (local_int_t i = 0; i < n; i++)
local_result += xv[i] * xv[i];
}
else
{
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for reduction(+ : local_result)
#endif
for (local_int_t i = 0; i < n; i++)
local_result += xv[i] * yv[i];
}
#ifndef HPCG_NO_MPI
// Use MPI's reduce function to collect all partial sums
double t0 = mytimer();
double global_result = 0.0;
MPI_Allreduce(&local_result, &global_result, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
result = global_result;
time_allreduce += mytimer() - t0;
#else
time_allreduce += 0.0;
result = local_result;
#endif
return 0;
}

src/ComputeDotProduct_ref.hpp
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTEDOTPRODUCT_REF_HPP
#define COMPUTEDOTPRODUCT_REF_HPP
#include "Vector.hpp"
int ComputeDotProduct_ref(
const local_int_t n, const Vector& x, const Vector& y, double& result, double& time_allreduce);
#endif // COMPUTEDOTPRODUCT_REF_HPP

src/ComputeMG.cpp
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file ComputeMG.cpp
HPCG routine
*/
#include "ComputeMG.hpp"
#include "ComputeProlongation.hpp"
#include "ComputeRestriction.hpp"
#include "ComputeSYMGS.hpp"
#include "CudaKernels.hpp"
/*!
@param[in] A the known system matrix
@param[in] r the input vector
@param[inout] x On exit contains the result of the multigrid V-cycle with r as the RHS, x is the approximation to Ax =
r.
@return returns 0 upon success and non-zero otherwise
@see ComputeMG_ref
*/
int ComputeMG(const SparseMatrix& A, const Vector& r, Vector& x)
{
int ierr = 0;
if (A.mgData != 0)
{ // Go to next coarse level if defined
ComputeSYMGS(A, r, x, 1);
if (A.rankType == GPU)
{
#ifdef USE_CUDA
ComputeRestrictionCuda(A, r);
#endif
}
else
{
#ifdef USE_GRACE
ComputeRestriction(A, r);
#endif
}
ierr = ComputeMG(*A.Ac, *A.mgData->rc, *A.mgData->xc);
if (A.rankType == GPU)
{
#ifdef USE_CUDA
ComputeProlongationCuda(A, x);
#endif
}
else
{
#ifdef USE_GRACE
ComputeProlongation(A, x);
#endif
}
ComputeSYMGS(A, r, x, 0);
}
else
{
ComputeSYMGS(A, r, x, 1);
}
return 0;
}

src/ComputeMG.hpp
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTEMG_HPP
#define COMPUTEMG_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
int ComputeMG(const SparseMatrix& A, const Vector& r, Vector& x);
#endif // COMPUTEMG_HPP

src/ComputeMG_ref.cpp
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file ComputeSYMGS_ref.cpp
HPCG routine
*/
#include "ComputeMG_ref.hpp"
#include "ComputeProlongation_ref.hpp"
#include "ComputeRestriction_ref.hpp"
#include "ComputeSPMV_ref.hpp"
#include "ComputeSYMGS_ref.hpp"
#include <cassert>
#include <iostream>
/*!
@param[in] A the known system matrix
@param[in] r the input vector
@param[inout] x On exit contains the result of the multigrid V-cycle with r as the RHS, x is the approximation to Ax =
r.
@return returns 0 upon success and non-zero otherwise
@see ComputeMG
*/
int ComputeMG_ref(const SparseMatrix& A, const Vector& r, Vector& x)
{
assert(x.localLength == A.localNumberOfColumns); // Make sure x contains space for halo values
ZeroVector(x); // initialize x to zero
int ierr = 0;
if (A.mgData != 0)
{ // Go to next coarse level if defined
int numberOfPresmootherSteps = A.mgData->numberOfPresmootherSteps;
for (int i = 0; i < numberOfPresmootherSteps; ++i)
ierr += ComputeSYMGS_ref(A, r, x);
if (ierr != 0)
return ierr;
ierr = ComputeSPMV_ref(A, x, *A.mgData->Axf);
if (ierr != 0)
return ierr;
// Perform restriction operation using simple injection
ierr = ComputeRestriction_ref(A, r);
if (ierr != 0)
return ierr;
ierr = ComputeMG_ref(*A.Ac, *A.mgData->rc, *A.mgData->xc);
if (ierr != 0)
return ierr;
ierr = ComputeProlongation_ref(A, x);
if (ierr != 0)
return ierr;
int numberOfPostsmootherSteps = A.mgData->numberOfPostsmootherSteps;
for (int i = 0; i < numberOfPostsmootherSteps; ++i)
ierr += ComputeSYMGS_ref(A, r, x);
if (ierr != 0)
return ierr;
}
else
{
ierr = ComputeSYMGS_ref(A, r, x);
if (ierr != 0)
return ierr;
}
return 0;
}

src/ComputeMG_ref.hpp Normal file

@@ -0,0 +1,26 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTEMG_REF_HPP
#define COMPUTEMG_REF_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
// The use of CPU and GPU sparse matrices is intended to resolve
// the linked-list structure of the MG coarse levels (A->Ac).
// There is no change to the reference code.
int ComputeMG_ref(const SparseMatrix& A, const Vector& r, Vector& x);
#endif // COMPUTEMG_REF_HPP


@@ -0,0 +1,175 @@
#include <cmath>
#include <cstdlib>
#ifdef HPCG_CUBIC_RADICAL_SEARCH
#include <algorithm>
#endif
#include <map>
#include "ComputeOptimalShapeXYZ.hpp"
#include "MixedBaseCounter.hpp"
#ifdef HPCG_CUBIC_RADICAL_SEARCH
static int min3(int a, int b, int c)
{
return std::min(a, std::min(b, c));
}
static int max3(int a, int b, int c)
{
return std::max(a, std::max(b, c));
}
static void cubic_radical_search(int n, int& x, int& y, int& z)
{
double best = 0.0;
for (int f1 = (int) (pow(n, 1.0 / 3.0) + 0.5); f1 > 0; --f1)
if (n % f1 == 0)
{
int n1 = n / f1;
for (int f2 = (int) (pow(n1, 0.5) + 0.5); f2 > 0; --f2)
if (n1 % f2 == 0)
{
int f3 = n1 / f2;
double current = (double) min3(f1, f2, f3) / max3(f1, f2, f3);
if (current > best)
{
best = current;
x = f1;
y = f2;
z = f3;
}
}
}
}
#else
static void ComputePrimeFactors(int n, std::map<int, int>& factors)
{
int d, sq = int((sqrt(double(n))) + 1L);
div_t r;
// remove 2 as a factor with shifts instead "/" and "%"
for (; n > 1 && (n & 1) == 0; n >>= 1)
{
factors[2]++;
}
// keep removing subsequent odd numbers
for (d = 3; d <= sq; d += 2)
{
while (1)
{
r = div(n, d);
if (r.rem == 0)
{
factors[d]++;
n = r.quot;
continue;
}
break;
}
}
if (n > 1 || factors.size() == 0) // left with a prime or n == 1
factors[n]++;
}
static int pow_i(int x, int p)
{
int v;
if (0 == x || 1 == x)
return x;
if (p < 0)
return 0;
for (v = 1; p; p >>= 1)
{
if (1 & p)
v *= x;
x *= x;
}
return v;
}
#endif
void ComputeOptimalShapeXYZ(int xyz, int& x, int& y, int& z)
{
#ifdef HPCG_CUBIC_RADICAL_SEARCH
cubic_radical_search(xyz, x, y, z);
#else
std::map<int, int> factors;
ComputePrimeFactors(xyz, factors); // factors are sorted: ascending order
std::map<int, int>::iterator iter = factors.begin();
// there is at least one prime factor
x = (iter++)->first; // cache the first factor, move to the next one
y = iter != factors.end() ? (iter++)->first : y; // try to cache the second factor in "y"
if (factors.size() == 1)
{ // only a single factor
z = pow_i(x, factors[x] / 3);
y = pow_i(x, factors[x] / 3 + ((factors[x] % 3) >= 2 ? 1 : 0));
x = pow_i(x, factors[x] / 3 + ((factors[x] % 3) >= 1 ? 1 : 0));
}
else if (factors.size() == 2 && factors[x] == 1 && factors[y] == 1)
{ // two distinct prime factors
z = 1;
}
else if (factors.size() == 2 && factors[x] + factors[y] == 3)
{ // three prime factors, one repeated
z = factors[x] == 2 ? x : y; // test which factor is repeated
}
else if (factors.size() == 3 && factors[x] == 1 && factors[y] == 1 && iter->second == 1)
{ // three distinct and single prime factors
z = iter->first;
}
else
{ // 3 or more prime factors so try all possible 3-subsets
int i, distinct_factors[32 + 1], count_factors[32 + 1];
i = 0;
for (std::map<int, int>::iterator iter = factors.begin(); iter != factors.end(); ++iter, ++i)
{
distinct_factors[i] = iter->first;
count_factors[i] = iter->second;
}
// count total number of prime factors in "c_main" and distribute some factors into "c1"
MixedBaseCounter c_main(count_factors, factors.size()), c1(count_factors, factors.size());
// at the beginning, minimum area is the maximum area
double area, min_area = 2.0 * xyz + 1.0;
for (c1.next(); !c1.is_zero(); c1.next())
{
MixedBaseCounter c2(c_main, c1); // "c2" gets the factors remaining in "c_main" that "c1" doesn't have
for (c2.next(); !c2.is_zero(); c2.next())
{
int tf1 = c1.product(distinct_factors);
int tf2 = c2.product(distinct_factors);
int tf3 = xyz / tf1 / tf2; // we derive the third dimension, we don't keep track of the factors it has
area = tf1 * double(tf2) + tf2 * double(tf3) + tf1 * double(tf3);
if (area < min_area)
{
min_area = area;
x = tf1;
y = tf2;
z = tf3;
}
}
}
}
#endif
}


@@ -0,0 +1,2 @@
void ComputeOptimalShapeXYZ(int xyz, int& x, int& y, int& z);


@@ -0,0 +1,72 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file ComputeProlongation.cpp
HPCG routine
*/
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include "ComputeProlongation.hpp"
/*!
Routine to apply the coarse grid correction to the fine grid solution (prolongation).
@param[in] Af - Fine grid sparse matrix object containing pointers to the current coarse grid correction and the f2c
operator.
@param[inout] xf - Fine grid solution vector, updated with the coarse grid correction.
Prolongation is done by simple injection: each coarse grid value is added to its corresponding fine grid point.
@return Returns zero on success and a non-zero value otherwise.
*/
int ComputeProlongation(const SparseMatrix& Af, Vector& xf)
{
double* xfv = xf.values;
double* xcv = Af.mgData->xc->values;
local_int_t nc = Af.mgData->rc->localLength;
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
for (local_int_t i = 0; i < nc; ++i)
{
xfv[Af.f2cPerm[i]] += xcv[i];
}
return 0;
}


@@ -0,0 +1,20 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTEPROLONGATION_HPP
#define COMPUTEPROLONGATION_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
int ComputeProlongation(const SparseMatrix& Af, Vector& xf);
#endif // COMPUTEPROLONGATION_HPP


@@ -0,0 +1,55 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file ComputeProlongation_ref.cpp
HPCG routine
*/
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include "ComputeProlongation_ref.hpp"
/*!
Routine to apply the coarse grid correction to the fine grid solution (prolongation).
@param[in] Af - Fine grid sparse matrix object containing pointers to the current coarse grid correction and the f2c
operator.
@param[inout] xf - Fine grid solution vector, updated with the coarse grid correction.
Prolongation is done by simple injection: each coarse grid value is added to its corresponding fine grid point.
@return Returns zero on success and a non-zero value otherwise.
*/
int ComputeProlongation_ref(const SparseMatrix& Af, Vector& xf)
{
double* xfv = xf.values;
double* xcv = Af.mgData->xc->values;
local_int_t* f2c = Af.mgData->f2cOperator;
local_int_t nc = Af.mgData->rc->localLength;
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
// This loop can be safely vectorized since f2c has no repeated indices
for (local_int_t i = 0; i < nc; ++i)
xfv[f2c[i]] += xcv[i];
return 0;
}


@@ -0,0 +1,20 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTEPROLONGATION_REF_HPP
#define COMPUTEPROLONGATION_REF_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
int ComputeProlongation_ref(const SparseMatrix& Af, Vector& xf);
#endif // COMPUTEPROLONGATION_REF_HPP

src/ComputeResidual.cpp Normal file

@@ -0,0 +1,95 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file ComputeResidual.cpp
HPCG routine
*/
#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include "Vector.hpp"
#ifdef HPCG_DETAILED_DEBUG
#include "hpcg.hpp"
#include <fstream>
#endif
#include "ComputeResidual.hpp"
#include <cmath> // needed for fabs
#ifdef HPCG_DETAILED_DEBUG
#include <iostream>
#endif
/*!
Routine to compute the inf-norm difference between two vectors where:
@param[in] n number of vector elements (local to this processor)
@param[in] v1, v2 input vectors
@param[out] residual pointer to scalar value; on exit, will contain result: inf-norm difference
@return Returns zero on success and a non-zero value otherwise.
*/
int ComputeResidual(const local_int_t n, const Vector& v1, const Vector& v2, double& residual)
{
double* v1v = v1.values;
double* v2v = v2.values;
double local_residual = 0.0;
#ifndef HPCG_NO_OPENMP
#pragma omp parallel shared(local_residual, v1v, v2v)
{
double threadlocal_residual = 0.0;
#pragma omp for
for (local_int_t i = 0; i < n; i++)
{
double diff = std::fabs(v1v[i] - v2v[i]);
if (diff > threadlocal_residual)
threadlocal_residual = diff;
}
#pragma omp critical
{
if (threadlocal_residual > local_residual)
local_residual = threadlocal_residual;
}
}
#else // No threading
for (local_int_t i = 0; i < n; i++)
{
double diff = std::fabs(v1v[i] - v2v[i]);
if (diff > local_residual)
local_residual = diff;
#ifdef HPCG_DETAILED_DEBUG
HPCG_fout << " Computed, exact, diff = " << v1v[i] << " " << v2v[i] << " " << diff << std::endl;
#endif
}
#endif
#ifndef HPCG_NO_MPI
// Use MPI's reduce function to collect the global maximum of the local residuals
double global_residual = 0;
MPI_Allreduce(&local_residual, &global_residual, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
residual = global_residual;
#else
residual = local_residual;
#endif
return 0;
}

src/ComputeResidual.hpp Normal file

@@ -0,0 +1,19 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTERESIDUAL_HPP
#define COMPUTERESIDUAL_HPP
#include "Vector.hpp"
int ComputeResidual(const local_int_t n, const Vector& v1, const Vector& v2, double& residual);
#endif // COMPUTERESIDUAL_HPP


@@ -0,0 +1,75 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file ComputeRestriction.cpp
HPCG routine
*/
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include "ComputeRestriction.hpp"
/*!
Routine to compute the coarse residual vector.
@param[inout] A - Sparse matrix object containing pointers to mgData->Axf, the fine grid matrix-vector product and
mgData->rc the coarse residual vector.
@param[in] rf - Fine grid RHS.
Note that the fine grid residual is never explicitly constructed.
We only compute it for the fine grid points that will be injected into corresponding coarse grid points.
@return Returns zero on success and a non-zero value otherwise.
*/
int ComputeRestriction(const SparseMatrix& A, const Vector& rf)
{
double* Axfv = A.mgData->Axf->values;
double* rfv = rf.values;
double* rcv = A.mgData->rc->values;
local_int_t nc = A.mgData->rc->localLength;
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
for (local_int_t i = 0; i < nc; ++i)
{
rcv[i] = rfv[A.f2cPerm[i]] - Axfv[A.f2cPerm[i]];
}
return 0;
}


@@ -0,0 +1,20 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTERESTRICTION_HPP
#define COMPUTERESTRICTION_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
int ComputeRestriction(const SparseMatrix& A, const Vector& rf);
#endif // COMPUTERESTRICTION_HPP


@@ -0,0 +1,56 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file ComputeRestriction_ref.cpp
HPCG routine
*/
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include "ComputeRestriction_ref.hpp"
/*!
Routine to compute the coarse residual vector.
@param[inout] A - Sparse matrix object containing pointers to mgData->Axf, the fine grid matrix-vector product and
mgData->rc the coarse residual vector.
@param[in] rf - Fine grid RHS.
Note that the fine grid residual is never explicitly constructed.
We only compute it for the fine grid points that will be injected into corresponding coarse grid points.
@return Returns zero on success and a non-zero value otherwise.
*/
int ComputeRestriction_ref(const SparseMatrix& A, const Vector& rf)
{
double* Axfv = A.mgData->Axf->values;
double* rfv = rf.values;
double* rcv = A.mgData->rc->values;
local_int_t* f2c = A.mgData->f2cOperator;
local_int_t nc = A.mgData->rc->localLength;
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
for (local_int_t i = 0; i < nc; ++i)
rcv[i] = rfv[f2c[i]] - Axfv[f2c[i]];
return 0;
}


@@ -0,0 +1,20 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTERESTRICTION_REF_HPP
#define COMPUTERESTRICTION_REF_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
int ComputeRestriction_ref(const SparseMatrix& A, const Vector& rf);
#endif // COMPUTERESTRICTION_REF_HPP

src/ComputeSPMV.cpp Normal file

@@ -0,0 +1,111 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file ComputeSPMV.cpp
HPCG routine
*/
#include "ComputeSPMV.hpp"
#include "ComputeSPMV_ref.hpp"
#ifndef HPCG_NO_MPI
#include "ExchangeHalo.hpp"
#endif
#ifdef USE_CUDA
#include "Cuda.hpp"
#include "CudaKernels.hpp"
#endif
#include "CpuKernels.hpp"
/*!
Routine to compute sparse matrix vector product y = Ax where:
Precondition: First call exchange_externals to get off-processor values of x
This is the optimized SpMV for the target system; on GPU ranks it dispatches to
cuSPARSE. The reference implementation is available in ComputeSPMV_ref.
@param[in] A the known system matrix
@param[in] x the known vector
@param[out] y On exit, contains the result Ax.
@return returns 0 upon success and non-zero otherwise
@see ComputeSPMV_ref
*/
int ComputeSPMV(const SparseMatrix& A, Vector& x, Vector& y)
{
double one = 1.0, zero = 0.0;
if (A.rankType == GPU)
{
#ifdef USE_CUDA
#ifndef HPCG_NO_MPI
PackSendBufferCuda(A, x, false, copy_stream);
#endif
cusparseDnVecSetValues(A.cusparseOpt.vecX, x.values_d);
cusparseDnVecSetValues(A.cusparseOpt.vecY, y.values_d);
cusparseSpMV(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, A.cusparseOpt.matA, A.cusparseOpt.vecX,
&zero, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, A.bufferMvA);
#ifndef HPCG_NO_MPI
if (A.totalToBeSent > 0)
{
ExchangeHaloCuda(A, x, copy_stream);
ExtSpMVCuda((SparseMatrix&) A, one, x.values_d + A.localNumberOfRows, y.values_d);
}
#endif
cudaStreamSynchronize(stream);
#endif
}
else
{
#ifdef USE_GRACE
nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, x.values);
nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, y.values);
nvpl_sparse_spmv(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matA,
A.nvplSparseOpt.vecX, &zero, A.nvplSparseOpt.vecY, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
NVPL_SPARSE_SPMV_ALG_DEFAULT, A.nvplSparseOpt.spmvADescr);
#ifndef HPCG_NO_MPI
if (A.totalToBeSent > 0)
{
ExchangeHaloCpu(A, x);
ExtSpMVCpu(A, A.localNumberOfRows, 1.0, x.values, y.values);
}
#endif
#endif // USE_GRACE
}
return 0;
}

src/ComputeSPMV.hpp Normal file

@@ -0,0 +1,22 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTESPMV_HPP
#define COMPUTESPMV_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
int ComputeSPMV(const SparseMatrix& A, Vector& x, Vector& y);
#endif // COMPUTESPMV_HPP

src/ComputeSPMV_ref.cpp Normal file

@@ -0,0 +1,74 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file ComputeSPMV_ref.cpp
HPCG routine
*/
#include "ComputeSPMV_ref.hpp"
#ifndef HPCG_NO_MPI
#include "ExchangeHalo.hpp"
#endif
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include <cassert>
/*!
Routine to compute matrix vector product y = Ax where:
Precondition: First call exchange_externals to get off-processor values of x
This is the reference SPMV implementation. It CANNOT be modified for the
purposes of this benchmark.
@param[in] A the known system matrix
@param[in] x the known vector
@param[out] y On exit, contains the result Ax.
@return returns 0 upon success and non-zero otherwise
@see ComputeSPMV
*/
int ComputeSPMV_ref(const SparseMatrix& A, Vector& x, Vector& y)
{
assert(x.localLength >= A.localNumberOfColumns); // Test vector lengths
assert(y.localLength >= A.localNumberOfRows);
#ifndef HPCG_NO_MPI
ExchangeHalo(A, x);
#endif
const double* const xv = x.values;
double* const yv = y.values;
const local_int_t nrow = A.localNumberOfRows;
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
for (local_int_t i = 0; i < nrow; i++)
{
double sum = 0.0;
const double* const cur_vals = A.matrixValues[i];
const local_int_t* const cur_inds = A.mtxIndL[i];
const int cur_nnz = A.nonzerosInRow[i];
for (int j = 0; j < cur_nnz; j++)
sum += cur_vals[j] * xv[cur_inds[j]];
yv[i] = sum;
}
return 0;
}

src/ComputeSPMV_ref.hpp Normal file

@@ -0,0 +1,22 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTESPMV_REF_HPP
#define COMPUTESPMV_REF_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
int ComputeSPMV_ref(const SparseMatrix& A, Vector& x, Vector& y);
#endif // COMPUTESPMV_REF_HPP

src/ComputeSYMGS.cpp Normal file

@@ -0,0 +1,309 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file ComputeSYMGS.cpp
HPCG routine
*/
#ifdef USE_CUDA
#include "Cuda.hpp"
#endif
#ifndef HPCG_NO_MPI
#include "ExchangeHalo.hpp"
#endif
#include "ComputeSPMV.hpp"
#include "ComputeSYMGS.hpp"
#include "CpuKernels.hpp"
#ifdef USE_CUDA
#include "CudaKernels.hpp"
#endif
/*!
Routine to compute one step of symmetric Gauss-Seidel:
Assumption about the structure of matrix A:
- Each row 'i' of the matrix has nonzero diagonal value whose address is matrixDiagonal[i]
- Entries in row 'i' are ordered such that:
- lower triangular terms are stored before the diagonal element.
- upper triangular terms are stored after the diagonal element.
- No other assumptions are made about entry ordering.
Symmetric Gauss-Seidel notes:
- We use the input vector r as the RHS. In the pre-smoothing case (step == 1) the initial guess for x is all zeros.
- We perform one forward sweep. With a zero initial guess we can ignore the upper triangular terms of A.
- We then perform one back sweep.
- For simplicity we include the diagonal contribution in the for-j loop, then correct the sum afterwards.
@param[in] A the known system matrix
@param[in] r the input vector
@param[inout] x On entry, x should contain relevant values, on exit x contains the result of one symmetric GS sweep
with r as the RHS.
@return returns 0 upon success and non-zero otherwise
@warning Early versions of this kernel (Version 1.1 and earlier) had the r and x arguments in reverse order, and out
of sync with other kernels.
@see ComputeSYMGS_ref
*/
#ifdef USE_CUDA
int ComputeSYMGS_Gpu(const SparseMatrix& A, const Vector& r, Vector& x, bool step)
{
double* tmp_d;
if (step == 1 && A.mgData != 0)
{
tmp_d = (*A.mgData->Axf).values_d;
}
else
{
tmp_d = A.tempBuffer;
}
const local_int_t nrow = A.localNumberOfRows;
double alpha = 1.0;
cusparseFillMode_t fillmode_l = CUSPARSE_FILL_MODE_LOWER;
cusparseFillMode_t fillmode_u = CUSPARSE_FILL_MODE_UPPER;
if (step == 1)
{
// TRSV(D+L, r, t)
cusparseDnVecSetValues(A.cusparseOpt.vecX, r.values_d);
cusparseDnVecSetValues(A.cusparseOpt.vecY, tmp_d);
cusparseSpMatSetAttribute(A.cusparseOpt.matA, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
cusparseSpSV_solve(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matA,
A.cusparseOpt.vecX, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A.cusparseOpt.spsvDescrL);
// SPMV(D, t, t)
SpmvDiagCuda(nrow, tmp_d, A.diagonal);
// TRSV(D+U, t, x)
cusparseDnVecSetValues(A.cusparseOpt.vecX, tmp_d);
cusparseDnVecSetValues(A.cusparseOpt.vecY, x.values_d);
cusparseSpMatSetAttribute(A.cusparseOpt.matA, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
cusparseSpSV_solve(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matA,
A.cusparseOpt.vecX, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A.cusparseOpt.spsvDescrU);
if (A.mgData != 0)
{
#ifndef HPCG_NO_MPI
cudaStreamSynchronize(stream);
PackSendBufferCuda(A, x, false, copy_stream);
#endif
// SPMV(L, x, t): t = t + L * x
double alpha = 1.0;
cusparseDnVecSetValues(A.cusparseOpt.vecX, x.values_d);
cusparseDnVecSetValues(A.cusparseOpt.vecY, (*A.mgData->Axf).values_d);
cusparseSpMV(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matL,
A.cusparseOpt.vecX, &alpha, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, A.bufferMvA);
#ifndef HPCG_NO_MPI
if (A.totalToBeSent > 0)
{
ExchangeHaloCuda(A, x, copy_stream);
double one = 1.0;
ExtSpMVCuda((SparseMatrix&) A, one, x.values_d + A.localNumberOfRows, (*A.mgData->Axf).values_d);
}
#endif
}
}
else
{ // step == 0
#ifndef HPCG_NO_MPI
cudaStreamSynchronize(stream);
PackSendBufferCuda(A, x, false, copy_stream);
#endif
// SPMV(U, x, t): t = U * x
double alpha = 1.0, beta = 0.0;
cusparseDnVecSetValues(A.cusparseOpt.vecX, x.values_d);
cusparseDnVecSetValues(A.cusparseOpt.vecY, (*A.mgData->Axf).values_d);
cusparseSpMV(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matU, A.cusparseOpt.vecX,
&beta, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, A.bufferMvA);
// tmp = rv - t
AxpbyCuda(nrow, r.values_d, (*A.mgData->Axf).values_d, tmp_d);
#ifndef HPCG_NO_MPI
if (A.totalToBeSent > 0)
{
// MPI_Ibarrier --> will help improve MPI_Allreduce in dot product
ExchangeHaloCuda(A, x, copy_stream, A.level == 0 ? 1 /*call MPI_Ibarrier*/ : 0);
double mone = -1.0;
ExtSpMVCuda((SparseMatrix&) A, mone, x.values_d + A.localNumberOfRows, tmp_d);
}
#endif
// TRSV(D+L, r-t, x)
cusparseDnVecSetValues(A.cusparseOpt.vecX, tmp_d);
cusparseDnVecSetValues(A.cusparseOpt.vecY, x.values_d);
cusparseSpMatSetAttribute(A.cusparseOpt.matA, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
cusparseSpSV_solve(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matA,
A.cusparseOpt.vecX, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A.cusparseOpt.spsvDescrL);
// SPMV(D, x, t) t += D*x
SpFmaCuda(nrow, x.values_d, A.diagonal, (*A.mgData->Axf).values_d);
// TRSV(D+U, x, x)
cusparseDnVecSetValues(A.cusparseOpt.vecX, (*A.mgData->Axf).values_d);
cusparseDnVecSetValues(A.cusparseOpt.vecY, x.values_d);
cusparseSpMatSetAttribute(A.cusparseOpt.matA, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
cusparseSpSV_solve(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matA,
A.cusparseOpt.vecX, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A.cusparseOpt.spsvDescrU);
}
return 0;
}
#endif
#ifdef USE_GRACE
int ComputeSYMGS_Cpu(const SparseMatrix& A, const Vector& r, Vector& x, bool step)
{
local_int_t nrow = A.localNumberOfRows;
double* temp;
if (step == 1 && A.mgData != 0)
{
temp = (*A.mgData->Axf).values;
}
else
{
temp = A.tempBuffer;
}
double* xv = x.values;
double* rv = r.values;
double one = 1.0, zero = 0.0;
nvpl_sparse_fill_mode_t fillmode_l = NVPL_SPARSE_FILL_MODE_LOWER;
nvpl_sparse_fill_mode_t fillmode_u = NVPL_SPARSE_FILL_MODE_UPPER;
if (step == 1)
{
// TRSV(L, r, x)
nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, r.values);
nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, xv);
nvpl_sparse_sp_mat_set_attribute(
A.nvplSparseOpt.matL, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
nvpl_sparse_spsv_solve(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matL,
A.nvplSparseOpt.vecX, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F, NVPL_SPARSE_SPSV_ALG_DEFAULT,
A.nvplSparseOpt.spsvDescrL);
// SPMV(D, x, t) t = D*x
SpmvDiagCpu(nrow, A.diagonal, xv, temp);
// TRSV(U, x, x)
nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, temp);
nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, xv);
nvpl_sparse_sp_mat_set_attribute(
A.nvplSparseOpt.matU, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
nvpl_sparse_spsv_solve(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matU,
A.nvplSparseOpt.vecX, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F, NVPL_SPARSE_SPSV_ALG_DEFAULT,
A.nvplSparseOpt.spsvDescrU);
if (A.mgData != 0)
{
// SPMV(L, x, t): t += L*x
nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, xv);
nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, temp);
nvpl_sparse_spmv(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matL,
A.nvplSparseOpt.vecX, &one, A.nvplSparseOpt.vecY, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
NVPL_SPARSE_SPMV_ALG_DEFAULT, A.nvplSparseOpt.spmvLDescr);
#ifndef HPCG_NO_MPI
ExchangeHaloCpu(A, x);
if (A.totalToBeSent > 0)
{
ExtSpMVCpu(A, nrow, 1.0, xv, temp);
}
#endif
}
}
else if (step == 0)
{
// SPMV(U, x, t) t = U*x
nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, xv);
nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, (*A.mgData->Axf).values);
nvpl_sparse_spmv(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matU,
A.nvplSparseOpt.vecX, &zero, A.nvplSparseOpt.vecY, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
NVPL_SPARSE_SPMV_ALG_DEFAULT, A.nvplSparseOpt.spmvUDescr);
// axpy: t = r-t
AxpbyCpu(nrow, rv, (*A.mgData->Axf).values, temp);
#ifndef HPCG_NO_MPI
// MPI_Ibarrier --> will help improve MPI_Allreduce in dot product
ExchangeHaloCpu(A, x, A.level == 0 ? 1 /*call MPI_Ibarrier*/ : 0);
if (A.totalToBeSent > 0)
{
ExtSpMVCpu(A, nrow, -1.0, xv, temp);
}
#endif
// TRSV(L, r-t, x)
nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, temp);
nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, xv);
nvpl_sparse_sp_mat_set_attribute(
A.nvplSparseOpt.matL, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
nvpl_sparse_spsv_solve(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matL,
A.nvplSparseOpt.vecX, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F, NVPL_SPARSE_SPSV_ALG_DEFAULT,
A.nvplSparseOpt.spsvDescrL);
// SPMV(D, x, t) t += D*x
SpFmaCpu(nrow, A.diagonal, xv, (*A.mgData->Axf).values);
// TRSV(U, x, x)
nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, (*A.mgData->Axf).values);
nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, xv);
nvpl_sparse_sp_mat_set_attribute(
A.nvplSparseOpt.matU, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
nvpl_sparse_spsv_solve(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matU,
A.nvplSparseOpt.vecX, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F, NVPL_SPARSE_SPSV_ALG_DEFAULT,
A.nvplSparseOpt.spsvDescrU);
}
return 0;
}
#endif // USE_GRACE
int ComputeSYMGS(const SparseMatrix& A, const Vector& r, Vector& x, bool step)
{
if (A.rankType == GPU)
{
#ifdef USE_CUDA
ComputeSYMGS_Gpu(A, r, x, step);
#endif
}
else
{
#ifdef USE_GRACE
ComputeSYMGS_Cpu(A, r, x, step);
#endif
}
return 0;
}

src/ComputeSYMGS.hpp (new file, 39 lines)

//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef COMPUTESYMGS_HPP
#define COMPUTESYMGS_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
int ComputeSYMGS(const SparseMatrix& A, const Vector& r, Vector& x, bool step);
#endif // COMPUTESYMGS_HPP

src/ComputeSYMGS_ref.cpp (new file, 110 lines)

//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file ComputeSYMGS_ref.cpp
HPCG routine
*/
#ifndef HPCG_NO_MPI
#include "ExchangeHalo.hpp"
#endif
#include "ComputeSYMGS_ref.hpp"
#include <cassert>
/*!
Computes one step of symmetric Gauss-Seidel:
Assumption about the structure of matrix A:
- Each row 'i' of the matrix has nonzero diagonal value whose address is matrixDiagonal[i]
- Entries in row 'i' are ordered such that:
- lower triangular terms are stored before the diagonal element.
- upper triangular terms are stored after the diagonal element.
- No other assumptions are made about entry ordering.
Symmetric Gauss-Seidel notes:
- We use the input vector x as the RHS and start with an initial guess for y of all zeros.
- We perform one forward sweep. x should be initially zero on the first GS sweep, but we do not attempt to exploit
this fact.
- We then perform one back sweep.
- For simplicity we include the diagonal contribution in the for-j loop, then correct the sum after the loop
@param[in] A the known system matrix
@param[in] r the input vector
@param[inout] x On entry, x should contain relevant values, on exit x contains the result of one symmetric GS sweep
with r as the RHS.
@warning Early versions of this kernel (Version 1.1 and earlier) had the r and x arguments in reverse order, and out
of sync with other kernels.
@return returns 0 upon success and non-zero otherwise
@see ComputeSYMGS
*/
int ComputeSYMGS_ref(const SparseMatrix& A, const Vector& r, Vector& x)
{
assert(x.localLength == A.localNumberOfColumns); // Make sure x contain space for halo values
#ifndef HPCG_NO_MPI
ExchangeHalo(A, x);
#endif
const local_int_t nrow = A.localNumberOfRows;
double** matrixDiagonal = A.matrixDiagonal; // An array of pointers to the diagonal entries A.matrixValues
const double* const rv = r.values;
double* const xv = x.values;
for (local_int_t i = 0; i < nrow; i++)
{
const double* const currentValues = A.matrixValues[i];
const local_int_t* const currentColIndices = A.mtxIndL[i];
const int currentNumberOfNonzeros = A.nonzerosInRow[i];
const double currentDiagonal = matrixDiagonal[i][0]; // Current diagonal value
double sum = rv[i]; // RHS value
for (int j = 0; j < currentNumberOfNonzeros; j++)
{
local_int_t curCol = currentColIndices[j];
sum -= currentValues[j] * xv[curCol];
}
sum += xv[i] * currentDiagonal; // Remove diagonal contribution from previous loop
xv[i] = sum / currentDiagonal;
}
// Now the back sweep.
for (local_int_t i = nrow - 1; i >= 0; i--)
{
const double* const currentValues = A.matrixValues[i];
const local_int_t* const currentColIndices = A.mtxIndL[i];
const int currentNumberOfNonzeros = A.nonzerosInRow[i];
const double currentDiagonal = matrixDiagonal[i][0]; // Current diagonal value
double sum = rv[i]; // RHS value
for (int j = 0; j < currentNumberOfNonzeros; j++)
{
local_int_t curCol = currentColIndices[j];
sum -= currentValues[j] * xv[curCol];
}
sum += xv[i] * currentDiagonal; // Remove diagonal contribution from previous loop
xv[i] = sum / currentDiagonal;
}
return 0;
}
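The forward and back sweeps above can be condensed into a small dense sketch. The name `SymgsDense` and the dense row storage are assumptions made for illustration only; the benchmark operates on the sparse row layout shown above, where the diagonal is included in the inner loop and corrected afterwards (equivalent to skipping it, as done here):

```cpp
#include <cassert>
#include <cmath>
#include <vector>

// Dense sketch of one symmetric Gauss-Seidel step: a forward sweep over the
// rows followed by a back sweep, updating x in place with r as the RHS.
void SymgsDense(const std::vector<std::vector<double>>& A,
                const std::vector<double>& r, std::vector<double>& x)
{
    const int n = static_cast<int>(r.size());
    for (int i = 0; i < n; ++i) // forward sweep
    {
        double sum = r[i];
        for (int j = 0; j < n; ++j)
            if (j != i)
                sum -= A[i][j] * x[j]; // off-diagonal contributions only
        x[i] = sum / A[i][i];
    }
    for (int i = n - 1; i >= 0; --i) // back sweep
    {
        double sum = r[i];
        for (int j = 0; j < n; ++j)
            if (j != i)
                sum -= A[i][j] * x[j];
        x[i] = sum / A[i][i];
    }
}
```

Repeated application converges for the symmetric positive definite matrices HPCG generates, which is why one such step serves as the multigrid smoother.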

src/ComputeSYMGS_ref.hpp (new file, 22 lines)

//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTESYMGS_REF_HPP
#define COMPUTESYMGS_REF_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
int ComputeSYMGS_ref(const SparseMatrix& A, const Vector& r, Vector& x);
#endif // COMPUTESYMGS_REF_HPP

src/ComputeWAXPBY.cpp (new file, 89 lines)

//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file ComputeWAXPBY.cpp
HPCG routine
*/
#ifndef HPCG_NO_MPI
#include "mytimer.hpp"
#include <mpi.h>
#endif
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#ifdef USE_CUDA
#include "Cuda.hpp"
#endif
#include "ComputeWAXPBY.hpp"
#include "ComputeWAXPBY_ref.hpp"
#include "CpuKernels.hpp"
#include "CudaKernels.hpp"
#include "SparseMatrix.hpp"
/*!
Routine to compute the update of a vector with the sum of two
scaled vectors where: w = alpha*x + beta*y
This routine calls the reference WAXPBY implementation by default, but
can be replaced by a custom, optimized routine suited for
the target system.
@param[in] n the number of vector elements (on this processor)
@param[in] alpha, beta the scalars applied to x and y respectively.
@param[in] x, y the input vectors
@param[out] w the output vector
@param[out] isOptimized should be set to false if this routine uses the reference implementation (is not optimized);
otherwise leave it unchanged
@return returns 0 upon success and non-zero otherwise
@see ComputeWAXPBY_ref
*/
int ComputeWAXPBY(const local_int_t n, const double alpha, const Vector& x, const double beta, const Vector& y,
Vector& w, bool& isOptimized, rank_type_t rt)
{
if (rt == GPU)
{
#ifdef USE_CUDA
ComputeWAXPBYCuda(n, alpha, x, beta, y, w);
#endif
}
else
{
#ifdef USE_GRACE
ComputeWAXPBYCpu(n, alpha, x, beta, y, w, isOptimized);
#endif
}
return 0;
}

src/ComputeWAXPBY.hpp (new file, 39 lines)

//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef COMPUTEWAXPBY_HPP
#define COMPUTEWAXPBY_HPP
#include "Vector.hpp"
int ComputeWAXPBY(const local_int_t n, const double alpha, const Vector& x, const double beta, const Vector& y,
Vector& w, bool& isOptimized, rank_type_t rt);
#endif // COMPUTEWAXPBY_HPP

src/ComputeWAXPBY_ref.cpp (new file, 79 lines)

//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file ComputeWAXPBY_ref.cpp
HPCG routine
*/
#include "ComputeWAXPBY_ref.hpp"
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include <cassert>
/*!
Routine to compute the update of a vector with the sum of two
scaled vectors where: w = alpha*x + beta*y
This is the reference WAXPBY implementation. It CANNOT be modified for the
purposes of this benchmark.
@param[in] n the number of vector elements (on this processor)
@param[in] alpha, beta the scalars applied to x and y respectively.
@param[in] x, y the input vectors
@param[out] w the output vector.
@return returns 0 upon success and non-zero otherwise
@see ComputeWAXPBY
*/
int ComputeWAXPBY_ref(
const local_int_t n, const double alpha, const Vector& x, const double beta, const Vector& y, Vector& w)
{
assert(x.localLength >= n); // Test vector lengths
assert(y.localLength >= n);
const double* const xv = x.values;
const double* const yv = y.values;
double* const wv = w.values;
if (alpha == 1.0)
{
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
for (local_int_t i = 0; i < n; i++)
wv[i] = xv[i] + beta * yv[i];
}
else if (beta == 1.0)
{
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
for (local_int_t i = 0; i < n; i++)
wv[i] = alpha * xv[i] + yv[i];
}
else
{
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
for (local_int_t i = 0; i < n; i++)
wv[i] = alpha * xv[i] + beta * yv[i];
}
return 0;
}
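The three branches above compute the same result; they exist only to drop a multiply when alpha or beta is exactly 1.0. A minimal unspecialized sketch (the name `Waxpby` and the `std::vector` interface are assumptions for illustration, not the benchmark's API):

```cpp
#include <cassert>
#include <vector>

// Unspecialized form of the WAXPBY update: w = alpha*x + beta*y.
std::vector<double> Waxpby(double alpha, const std::vector<double>& x,
                           double beta, const std::vector<double>& y)
{
    std::vector<double> w(x.size());
    for (std::size_t i = 0; i < x.size(); ++i)
        w[i] = alpha * x[i] + beta * y[i];
    return w;
}
```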

src/ComputeWAXPBY_ref.hpp (new file, 20 lines)

//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTEWAXPBY_REF_HPP
#define COMPUTEWAXPBY_REF_HPP
#include "Vector.hpp"
int ComputeWAXPBY_ref(
const local_int_t n, const double alpha, const Vector& x, const double beta, const Vector& y, Vector& w);
#endif // COMPUTEWAXPBY_REF_HPP

src/CpuKernels.cpp (new file, 1351 lines; diff too large to display)

src/CpuKernels.hpp (new file, 92 lines)

/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef CPUKERNELS_HPP
#define CPUKERNELS_HPP
#ifdef USE_GRACE
#include <nvpl_sparse.h>
extern nvpl_sparse_handle_t nvpl_sparse_handle;
#include "SparseMatrix.hpp"
#include "Vector.hpp"
#include <algorithm>
#include <random>
#include <vector>
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif
#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
///////// Deallocate CPU Memory for data structures //
void DeleteMatrixCpu(SparseMatrix& A);
///////// Find the size of CPU reference allocated memory //
size_t EstimateCpuRefMem(SparseMatrix& A);
/*
Translations of a 3D point in all directions: 27 possibilities,
padded with zero rows to 32 entries
*/
constexpr int tid2indCpu[32][4] = {{-1, -1, -1, 0}, {0, -1, -1, 0}, {1, -1, -1, 0}, {-1, 0, -1, 0}, {0, 0, -1, 0},
{1, 0, -1, 0}, {-1, 1, -1, 0}, {0, 1, -1, 0}, {1, 1, -1, 0}, {-1, -1, 0, 0}, {0, -1, 0, 0}, {1, -1, 0, 0},
{-1, 0, 0, 0}, {0, 0, 0, 0}, {1, 0, 0, 0}, {-1, 1, 0, 0}, {0, 1, 0, 0}, {1, 1, 0, 0}, {-1, -1, 1, 0}, {0, -1, 1, 0},
{1, -1, 1, 0}, {-1, 0, 1, 0}, {0, 0, 1, 0}, {1, 0, 1, 0}, {-1, 1, 1, 0}, {0, 1, 1, 0}, {1, 1, 1, 0}, {0, 0, 0, 0},
{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}};
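The 27 meaningful rows of `tid2indCpu` follow a base-3 decoding of the index: entry `tid` holds the offsets `(tid%3 - 1, (tid/3)%3 - 1, (tid/9)%3 - 1)`. A sketch of that decode (the name `Tid2Offset` is an assumption for illustration; the benchmark reads the table directly):

```cpp
// Decode a 0..26 index into (dx, dy, dz) offsets, each in {-1, 0, 1},
// matching the layout of the tid2indCpu table above.
void Tid2Offset(int tid, int* dx, int* dy, int* dz)
{
    *dx = tid % 3 - 1;
    *dy = (tid / 3) % 3 - 1;
    *dz = (tid / 9) % 3 - 1;
}
```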
// Generate Problem
// Inclusive Prefix Sum
void PrefixsumCpu(int* x, int N);
// Optimize Problem
size_t AllocateMemCpu(SparseMatrix& A_in);
void ColorMatrixCpu(SparseMatrix& A, int* num_colors);
void CreateSellPermCpu(SparseMatrix& A);
void F2cPermCpu(local_int_t nrow_c, local_int_t* f2c, local_int_t* f2c_perm, local_int_t* perm_f, local_int_t* iperm_c);
// Permute a vector using coloring buffer
void PermVectorCpu(local_int_t* perm, Vector& x, local_int_t length);
// Test CG
void ReplaceMatrixDiagonalCpu(SparseMatrix& A, Vector diagonal);
// CG Support Kernels
// Dot-product Per single rank
void ComputeDotProductCpu(const local_int_t n, const Vector& x, const Vector& y, double& result, bool& isOptimized);
// WAXPBY
int ComputeWAXPBYCpu(const local_int_t n, const double alpha, const Vector& x, const double beta, const Vector& y,
Vector& w, bool& isOptimized);
// SYMGS
void SpmvDiagCpu(local_int_t n, const double* x, double* y, double* z);
void AxpbyCpu(local_int_t n, double* x, double* y, double* z);
void SpFmaCpu(local_int_t n, const double* x, double* y, double* z);
// External Matrix SpMV + Scatter
void ExtSpMVCpu(const SparseMatrix& A, const local_int_t n, const double alpha, const double* x, double* y);
#endif // USE_GRACE
#endif // CPUKERNELS_HPP
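`PrefixsumCpu` above is declared as an inclusive prefix sum. A minimal serial sketch of that contract (the name `PrefixsumInclusive` is an assumption; the benchmark's version may well be parallelized):

```cpp
// Inclusive prefix sum in place: x[i] becomes x[0] + x[1] + ... + x[i].
void PrefixsumInclusive(int* x, int n)
{
    for (int i = 1; i < n; ++i)
        x[i] += x[i - 1];
}
```

Inclusive scans like this are the standard building block for turning per-row nonzero counts into row offsets.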

src/Cuda.hpp (new file, 87 lines)

/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifdef USE_CUDA
#include "cublas_v2.h"
#include "cuda_runtime_api.h"
#include "cusparse.h"
#include <cuda.h>
#ifdef USE_NCCL
#include "nccl.h"
#endif
#ifdef USE_NVTX
#include <nvToolsExt.h>
#endif
#include <unistd.h>
extern cusparseHandle_t cusparsehandle;
extern cublasHandle_t cublashandle;
extern cudaStream_t stream;
extern cudaEvent_t copy_done;
extern cudaStream_t copy_stream;
extern int* ranktoId; // DEV:Compress rank in MPI_WORLD to Neighbors
extern int* rankToId_h; // HOST:Compress rank in MPI_WORLD to Neighbors
extern int* idToRank_h;
extern bool Use_Compression; /*USE CUDA L2 compression*/
extern bool Use_Hpcg_Mem_Reduction; /*USE HPCG aggressive memory reduction*/
#endif
#ifdef USE_CUDA
#define CHECK_CUDART(x) \
do \
{ \
cudaError_t res = (x); \
if (res != cudaSuccess) \
{ \
char rank_name[1024]; \
gethostname(rank_name, 1024); \
fprintf(stderr, "CUDART: %s = %d (%s) on %s at (%s:%d)\n", #x, res, cudaGetErrorString(res), rank_name, \
__FILE__, __LINE__); \
exit(1); \
} \
} while (0)
// IF NVTX is needed for profiling, please define USE_NVTX
// Then, add PUSH_RANGE and POP_RANGE around the target code block
// See, https://developer.nvidia.com/blog/cuda-pro-tip-generate-custom-application-profile-timelines-nvtx/
// #define USE_NVTX
#ifdef USE_NVTX
const uint32_t colors[] = {0xff00ff00, 0xff0000ff, 0xffffff00, 0xffff00ff, 0xff00ffff, 0xffff0000, 0xffffffff};
const int num_colors = sizeof(colors) / sizeof(uint32_t);
#define PUSH_RANGE(name, cid) \
{ \
int color_id = cid; \
color_id = color_id % num_colors; \
nvtxEventAttributes_t eventAttrib = {0}; \
eventAttrib.version = NVTX_VERSION; \
eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
eventAttrib.colorType = NVTX_COLOR_ARGB; \
eventAttrib.color = colors[color_id]; \
eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
eventAttrib.message.ascii = name; \
nvtxRangePushEx(&eventAttrib); \
}
#define POP_RANGE nvtxRangePop();
#else
#define PUSH_RANGE(name, cid) \
{ \
}
#define POP_RANGE
#endif
#endif
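The `do { ... } while (0)` wrapper in `CHECK_CUDART` above is what lets the macro behave as a single statement, so it is safe after an unbraced `if`. A CUDA-free sketch of the same idiom (`CHECK_OK`, `MightFail`, and `failures_seen` are invented for the illustration):

```cpp
#include <cstdio>

int failures_seen = 0; // counter for the sketch only

// Records a failure for any nonzero status, mimicking an error-returning API.
int MightFail(int status)
{
    if (status != 0)
        ++failures_seen;
    return status;
}

// Single-statement error-check macro, same shape as CHECK_CUDART above.
#define CHECK_OK(x)                                                 \
    do                                                              \
    {                                                               \
        int res = (x);                                              \
        if (res != 0)                                               \
        {                                                           \
            std::fprintf(stderr, "CHECK_OK: %s = %d\n", #x, res);   \
        }                                                           \
    } while (0)
```

Because the macro expands to one statement, `if (cond) CHECK_OK(call); else CHECK_OK(other);` parses as intended, which a bare `{ ... }` block would break.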

src/CudaKernels.cu (new file, 2613 lines; diff too large to display)

src/CudaKernels.hpp (new file, 92 lines)

/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifdef USE_CUDA
#include "SparseMatrix.hpp"
///////// L2 Memory Compression Allocation Support Routines //
cudaError_t setProp(CUmemAllocationProp* prop);
cudaError_t cudaMallocCompressible(void** adr, size_t size);
cudaError_t cudaFreeCompressible(void* ptr, size_t size);
///////// Allocate CUDA Memory for data structures //
local_int_t EstimateLUmem(local_int_t n, local_int_t padded_n, local_int_t level);
void AllocateMemCuda(SparseMatrix& A_in);
void AllocateMemOptCuda(SparseMatrix& A_in);
///////// Deallocate CUDA Memory for data structures //
void DeleteMatrixGpu(SparseMatrix& A);
///////// Generate Problem //
void GenerateProblemCuda(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact);
// Halo Exchange
void SetupHaloCuda(SparseMatrix& A, local_int_t sendbufld, local_int_t* sendlen, local_int_t* sendbuff,
local_int_t* tot_to_send, int* nneighs, int* neighs_h, local_int_t* sendlen_h, local_int_t** elem_to_send_d);
void ExtToLocMapCuda(
local_int_t localNumberOfRows, local_int_t str, local_int_t end, local_int_t* extToLocMap, local_int_t* eltsToRecv);
void ExtTolocCuda(local_int_t localNumberOfRows, int neighborId, local_int_t ext_nnz, local_int_t* csr_ext_columns,
double* csr_ext_values, local_int_t* ext2csr_offsets, local_int_t* extToLocMap, local_int_t* csrColumns);
void PackSendBufferCuda(const SparseMatrix& A, Vector& x, bool cpu_data, cudaStream_t stream1);
void ExchangeHaloCuda(const SparseMatrix& A, Vector& x, cudaStream_t stream1, int use_ibarrier = 0);
// Optimize Problem
void SetVectorAscCuda(local_int_t* arr, local_int_t n);
void ColorMatrixCuda(double* A_vals, local_int_t* A_col, local_int_t* nnzPerRow, local_int_t rows, local_int_t* color,
int* num_colors, int* count_colors, int max_colors, local_int_t* ref2opt, local_int_t* opt2ref, int rank, int nx,
int* rowhash);
void PermElemToSendCuda(local_int_t totalToBeSent, local_int_t* elementsToSend, local_int_t* perm);
void EllPermColumnsValuesCuda(local_int_t localNumberOfRows, local_int_t* nnzPerRow, local_int_t* csrColumns,
double* csrValues, local_int_t* permOffsets, local_int_t* permColumns, double* permValues, local_int_t* opt2ref,
local_int_t* ref2opt, local_int_t* diagonalIdx, local_int_t* permLOffsets, local_int_t* permUOffsets, bool diag);
void TransposeCuda(local_int_t n, local_int_t slice_size, local_int_t* sellCollIndex, double* sellValues);
void EllMaxRowLenPerBlockCuda(local_int_t nrow, int sliceSize, local_int_t* sellLPermOffsets,
local_int_t* sellUPermOffsets, local_int_t* sellLSliceMrl, local_int_t* sellUSliceMrl);
void PrefixsumCuda(local_int_t localNumberOfRows, local_int_t* arr);
void MultiplyBySliceSizeCUDA(local_int_t nrow, int slice_size, local_int_t* arr);
void CreateAMatrixSliceOffsetsCuda(local_int_t nrow, local_int_t slice_size, local_int_t* arr);
void CreateSellLUColumnsValuesCuda(const local_int_t n, int sliceSize, local_int_t* columns, double* values,
local_int_t* sellLSliceOffset, local_int_t* sellLColumns, double* sellLValues, local_int_t* sellUSliceOffset,
local_int_t* sellUColumns, double* sellUValues, int level);
void PermVectorCuda(local_int_t* perm, Vector& x, local_int_t length);
void F2cPermCuda(local_int_t nrow_c, local_int_t* f2c, local_int_t* f2cPerm, local_int_t* permF, local_int_t* ipermC);
// Test CG
void ReplaceMatrixDiagonalCuda(SparseMatrix& A, Vector& diagonal);
void CopyMatrixDiagonalCuda(SparseMatrix& A, Vector& diagonal);
// CG Support Kernels
// 1. MG
void ComputeRestrictionCuda(const SparseMatrix& A, const Vector& r);
void ComputeProlongationCuda(const SparseMatrix& A, Vector& x);
// 2. WAXPBY
void ComputeWAXPBYCuda(
const local_int_t n, const double alpha, const Vector& x, const double beta, const Vector& y, Vector& w);
// 3.SYMGS
void SpmvDiagCuda(local_int_t n, double* x, double* d);
void AxpbyCuda(local_int_t n, double* x, double* y, double* z);
void SpFmaCuda(local_int_t n, double* x, double* y, double* z);
// 4.External Matrix SpMV + Scatter
void ExtSpMVCuda(SparseMatrix& A, double alpha, double* x, double* y);
// Transfer Problem to CPU
size_t CopyDataToHostCuda(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact);
#endif

src/ExchangeHalo.cpp (new file, 205 lines)

//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file ExchangeHalo.cpp
HPCG routine
*/
// Compile this routine only if running with MPI
#ifndef HPCG_NO_MPI
#include "ExchangeHalo.hpp"
#include "Geometry.hpp"
#include <cstdlib>
#include <mpi.h>
extern p2p_comm_mode_t P2P_Mode;
/*!
Communicates data that is at the border of the part of the domain assigned to this processor.
@param[in] A The known system matrix
@param[inout] x On entry: the local vector entries followed by entries to be communicated; on exit: the vector with
non-local entries updated by other processors
*/
void ExchangeHalo(const SparseMatrix& A, Vector& x)
{
local_int_t localNumberOfRows = A.localNumberOfRows;
int num_neighbors = A.numberOfSendNeighbors;
local_int_t * receiveLength = A.receiveLength;
local_int_t * sendLength = A.sendLength;
int * neighbors = A.neighbors;
double * sendBuffer = A.sendBuffer;
local_int_t totalToBeSent = A.totalToBeSent;
local_int_t * elementsToSend = A.elementsToSend;
double * const xv = x.values;
int size, rank; // Number of MPI processes, My process ID
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
//
// first post receives, these are immediate receives
// Do not wait for result to come, will do that at the
// wait call below.
//
int MPI_MY_TAG = 99;
MPI_Request * request = new MPI_Request[num_neighbors];
//
// Externals are at end of locals
//
double * x_external = (double *) xv + localNumberOfRows;
// Post receives first
// TODO: Thread this loop
for (int i = 0; i < num_neighbors; i++) {
local_int_t n_recv = receiveLength[i];
MPI_Irecv(x_external, n_recv, MPI_DOUBLE, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD, request+i);
x_external += n_recv;
}
//
// Fill up send buffer
//
// TODO: Thread this loop
for (local_int_t i=0; i<totalToBeSent; i++) sendBuffer[i] = xv[elementsToSend[i]];
//
// Send to each neighbor
//
// TODO: Thread this loop
for (int i = 0; i < num_neighbors; i++) {
local_int_t n_send = sendLength[i];
MPI_Send(sendBuffer, n_send, MPI_DOUBLE, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD);
sendBuffer += n_send;
}
//
// Complete the reads issued above
//
MPI_Status status;
// TODO: Thread this loop
for (int i = 0; i < num_neighbors; i++) {
if ( MPI_Wait(request+i, &status) ) {
std::exit(-1); // TODO: have better error exit
}
}
delete [] request;
return;
}
/*!
Communicates data that is at the border of the part of the domain assigned to this processor. A more optimized version of ExchangeHalo, used on the Grace (CPU) path.
@param[in] A The known system matrix
@param[inout] x On entry: the local vector entries followed by entries to be communicated; on exit: the vector with
non-local entries updated by other processors
@param[in] use_ibarrier [Experimental] If 1, call MPI_Ibarrier after the communication is complete. This can improve MPI_Allreduce in DDOT
when the barrier is issued once, at the last routine call in MG.
*/
void ExchangeHaloCpu(const SparseMatrix& A, Vector& x, int use_ibarrier)
{
// Extract Matrix pieces
local_int_t localNumberOfRows = A.localNumberOfRows;
int num_neighbors = A.numberOfSendNeighbors;
local_int_t* receiveLength = A.receiveLength;
local_int_t* sendLength = A.sendLength;
int* neighbors = A.neighborsPhysical;
double* sendBuffer = A.sendBuffer;
local_int_t totalToBeSent = A.totalToBeSent;
local_int_t* elementsToSend = A.elementsToSend;
if (P2P_Mode == MPI_CPU)
{
double* const xv = x.values;
double* x_external = (double*) xv + localNumberOfRows;
int MPI_MY_TAG = 99;
MPI_Request* request = new MPI_Request[num_neighbors];
// Post receives first
for (int i = 0; i < num_neighbors; i++)
{
local_int_t n_recv = receiveLength[i];
MPI_Irecv(x_external, n_recv, MPI_DOUBLE, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD, request + i);
x_external += n_recv;
}
for (local_int_t i = 0; i < totalToBeSent; i++)
sendBuffer[i] = xv[elementsToSend[i]];
//
// Send to each neighbor
//
for (int i = 0; i < num_neighbors; i++)
{
local_int_t n_send = sendLength[i];
MPI_Send(sendBuffer, n_send, MPI_DOUBLE, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD);
sendBuffer += n_send;
}
//
// Complete the reads issued above
//
MPI_Waitall(num_neighbors, request, MPI_STATUSES_IGNORE);
//[Experimental] Can improve MPI_Allreduce performance
#if 0
if (use_ibarrier == 1)
MPI_Ibarrier(MPI_COMM_WORLD, request);
#endif
delete[] request;
}
else if (P2P_Mode == MPI_CPU_All2allv)
{
double* const xv = x.values;
double* x_external = (double*) xv + localNumberOfRows;
for (local_int_t i = 0; i < totalToBeSent; i++)
sendBuffer[i] = xv[elementsToSend[i]];
MPI_Alltoallv(
sendBuffer, A.scounts, A.sdispls, MPI_DOUBLE, x_external, A.rcounts, A.rdispls, MPI_DOUBLE, MPI_COMM_WORLD);
}
return;
}
#endif
// ifndef HPCG_NO_MPI

src/ExchangeHalo.hpp (new file, 38 lines)

//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef EXCHANGEHALO_HPP
#define EXCHANGEHALO_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
void ExchangeHalo(const SparseMatrix& A, Vector& x);
void ExchangeHaloCpu(const SparseMatrix& A, Vector& x, int use_ibarrier = 0);
#endif // EXCHANGEHALO_HPP

(new file, 158 lines)

//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file GenerateCoarseProblem.cpp
HPCG routine
*/
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include "GenerateCoarseProblem.hpp"
#include "GenerateGeometry.hpp"
#include "GenerateProblem.hpp"
#include "SetupHalo.hpp"
#include <cassert>
#ifndef HPCG_NO_MPI
// Used to find ranks for CPU and GPU programs
extern int global_total_ranks;
extern int* physical_rank_dims;
#endif
/*!
Routine to construct a prolongation/restriction operator for a given fine grid matrix.
@param[inout] Af - The known system matrix, on output its coarse operator, fine-to-coarse operator and auxiliary
vectors will be defined.
Note that the matrix Af is considered const because the attributes we are modifying are declared as mutable.
*/
void GenerateCoarseProblem(const SparseMatrix& Af)
{
// Make local copies of geometry information. Use global_int_t since the RHS products in the calculations
// below may result in global range values.
global_int_t nxf = Af.geom->nx;
global_int_t nyf = Af.geom->ny;
global_int_t nzf = Af.geom->nz;
local_int_t nxc, nyc, nzc; // Coarse nx, ny, nz
assert(nxf % 2 == 0);
assert(nyf % 2 == 0);
assert(nzf % 2 == 0); // Need fine grid dimensions to be divisible by 2
nxc = nxf / 2;
nyc = nyf / 2;
nzc = nzf / 2;
local_int_t* f2cOperator = new local_int_t[Af.localNumberOfRows];
local_int_t localNumberOfRows = nxc * nyc * nzc; // This is the size of our subblock
// If this assert fails, it most likely means that local_int_t is set to int and should be set to long long
assert(localNumberOfRows > 0); // Fails if the number of rows is not positive (can happen if "int" overflows)
for (int i = 0; i < 3 * global_total_ranks; i++)
physical_rank_dims[i] = physical_rank_dims[i] / 2;
// Construct the geometry and linear system
Geometry* geomc = new Geometry;
GenerateGeometry(Af.geom->size, Af.geom->rank, Af.geom->numThreads, nxc, nyc, nzc, Af.geom->npx, Af.geom->npy,
Af.geom->npz, Af.geom->different_dim, geomc);
Vector* rc = new Vector;
Vector* xc = new Vector;
Vector* Axf = new Vector;
MGData* mgData = new MGData;
if (Af.rankType == GPU)
{
SparseMatrix* Ac = Af.Ac;
Ac->rankType = GPU;
InitializeSparseMatrix(*Ac, geomc);
GenerateProblem(*Ac, 0, 0, 0);
SetupHalo(*Ac);
InitializeVector(*rc, Ac->localNumberOfRows, Ac->rankType);
InitializeVector(*xc, Ac->localNumberOfColumns, Ac->rankType);
InitializeVector(*Axf, Af.localNumberOfColumns, Ac->rankType);
#ifdef USE_CUDA
cudaMemcpy(f2cOperator, Af.gpuAux.f2c, sizeof(local_int_t) * localNumberOfRows, cudaMemcpyDeviceToHost);
#endif
}
else
{
SparseMatrix* Ac = new SparseMatrix;
InitializeSparseMatrix(*Ac, geomc);
Ac->rankType = CPU;
(*Ac).Ac = 0;
GenerateProblem(*Ac, 0, 0, 0);
SetupHalo(*Ac);
InitializeVector(*rc, Ac->localNumberOfRows, Ac->rankType);
InitializeVector(*xc, Ac->localNumberOfColumns, Ac->rankType);
InitializeVector(*Axf, Af.localNumberOfColumns, Ac->rankType);
Af.Ac = Ac;
// Use a parallel loop to do initial assignment:
// distributes the physical placement of arrays of pointers across the memory system
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
for (local_int_t i = 0; i < localNumberOfRows; ++i)
{
f2cOperator[i] = 0;
}
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
for (local_int_t i = 0; i < nzc * nyc * nxc; i++)
{
local_int_t izc = (i / (nxc * nyc));
local_int_t iyc = (i - izc * nxc * nyc) / nxc;
local_int_t ixc = i - (izc * nyc + iyc) * nxc;
local_int_t izf = 2 * izc;
local_int_t iyf = 2 * iyc;
local_int_t ixf = 2 * ixc;
local_int_t currentCoarseRow = izc * nxc * nyc + iyc * nxc + ixc;
local_int_t currentFineRow = izf * nxf * nyf + iyf * nxf + ixf;
f2cOperator[currentCoarseRow] = currentFineRow;
}
}
InitializeMGData(f2cOperator, rc, xc, Axf, *mgData);
Af.mgData = mgData;
return;
}

src/GenerateCoarseProblem.hpp (new file, 19 lines)

@@ -0,0 +1,19 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef GENERATECOARSEPROBLEM_HPP
#define GENERATECOARSEPROBLEM_HPP
#include "SparseMatrix.hpp"
void GenerateCoarseProblem(const SparseMatrix& A);
#endif // GENERATECOARSEPROBLEM_HPP

src/GenerateGeometry.cpp (new file, 801 lines)

@@ -0,0 +1,801 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file GenerateGeometry.cpp
HPCG routine
*/
#include <cassert>
#include <cmath>
#include <cstdlib>
#include "ComputeOptimalShapeXYZ.hpp"
#include "GenerateGeometry.hpp"
#include <cstdio>
#ifdef HPCG_DEBUG
#include "hpcg.hpp"
#include <fstream>
using std::endl;
#endif
#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif
#ifndef HPCG_NO_MPI
// Used to find ranks for CPU and GPU programs
extern int global_total_ranks;
extern int* physical_rank_dims;
extern int* logical_rank_to_phys;
#endif
/*!
Computes the factorization of the total number of processes into a
3-dimensional process grid that is as close as possible to a cube. The
quality of the factorization depends on the prime number structure of the
total number of processes. It then stores this decomposition together with the
parallel parameters of the run in the geometry data structure.
@param[in] size total number of MPI processes
@param[in] rank this process' rank among other MPI processes
@param[in] numThreads number of OpenMP threads in this process
@param[in] nx, ny, nz number of grid points for each local block in the x, y, and z dimensions, respectively
@param[out] geom data structure that will store the above parameters and the factoring of total number of processes
into three dimensions
*/
// Level 0 Generation, we need to decide nx, ny, nz based on
// G2C ratio and npx, npy, npz
// Remap rank IDs to logical IDs to enforce 3D shape correctness when exec_mode is GPUCPU
void GenerateGeometry(HPCG_Params& params, Geometry* geom)
{
int size = params.comm_size, rank = params.comm_rank; // Number of MPI processes, My process ID
int nx = params.nx, ny = params.ny, nz = params.nz;
int npx = params.npx, npy = params.npy, npz = params.npz;
// If npx, npy, and npz are not provided by the user,
// find the optimal shape
if (npx * npy * npz <= 0 || npx * npy * npz > size)
ComputeOptimalShapeXYZ(size, npx, npy, npz);
// When search_for_same0 is true, finds the next rank whose local problem
// size matches rank 0's. When false, finds the next rank whose size differs from rank 0's
auto loop_over_ranks = [](int index, int lp, bool search_for_same0) -> int
{
for (int p = index; p < global_total_ranks; p++)
{
int nnpx = physical_rank_dims[3 * p];
int nnpy = physical_rank_dims[3 * p + 1];
int nnpz = physical_rank_dims[3 * p + 2];
bool same_zero = false;
if (nnpx == physical_rank_dims[0] && nnpy == physical_rank_dims[1] && nnpz == physical_rank_dims[2])
same_zero = true;
if (same_zero == search_for_same0)
{
logical_rank_to_phys[lp] = p;
index = p + 1;
break;
}
}
return index;
};
// Here we decide and broadcast nx, ny, nz
// 1. Check for GPU and CPU execution modes
auto user_diff_dim = NONE;
if (params.exec_mode == GPUCPU)
{
// User defined diff direction between GPU and CPU
// If user decides that nz should be diff between GPU and CPU
// and NPZ is even --> Decide GPU and CPU local size based on
// local_problem_def and g2c
if (params.diff_dim == Z && (npz & 1) == 0)
{
user_diff_dim = Z;
if (params.local_problem_def == GPU_RATIO)
{
if (params.rank_type == CPU)
nz = nz / params.g2c;
}
else if (params.local_problem_def == GPU_ABS)
{
if (params.rank_type == CPU)
nz = params.g2c;
}
else if (params.local_problem_def == GPU_CPU_RATIO)
{
if (params.rank_type == CPU)
nz = nz / params.g2c;
if (params.rank_type == GPU)
nz = nz - (nz / params.g2c);
}
else
{ /*GPU_CPU_ABS*/
if (params.rank_type == CPU)
nz = params.g2c;
if (params.rank_type == GPU)
nz = nz - params.g2c;
}
}
// If user decides that ny should be diff between GPU and CPU
// and NPY is even --> Decide GPU and CPU local size based on
// local_problem_def and g2c
else if (params.diff_dim == Y && (npy & 1) == 0)
{
user_diff_dim = Y;
if (params.local_problem_def == GPU_RATIO)
{
if (params.rank_type == CPU)
ny = ny / params.g2c;
}
else if (params.local_problem_def == GPU_ABS)
{
if (params.rank_type == CPU)
ny = params.g2c;
}
else if (params.local_problem_def == GPU_CPU_RATIO)
{
if (params.rank_type == CPU)
ny = ny / params.g2c;
if (params.rank_type == GPU)
ny = ny - (ny / params.g2c);
}
else
{ /*GPU_CPU_ABS*/
if (params.rank_type == CPU)
ny = params.g2c;
if (params.rank_type == GPU)
ny = ny - params.g2c;
}
}
// If user decides that nx should be diff between GPU and CPU
// and NPX is even --> Decide GPU and CPU local size based on
// local_problem_def and g2c
else if (params.diff_dim == X && (npx & 1) == 0)
{
user_diff_dim = X;
if (params.local_problem_def == GPU_RATIO)
{
if (params.rank_type == CPU)
nx = nx / params.g2c;
}
else if (params.local_problem_def == GPU_ABS)
{
if (params.rank_type == CPU)
nx = params.g2c;
}
else if (params.local_problem_def == GPU_CPU_RATIO)
{
if (params.rank_type == CPU)
nx = nx / params.g2c;
if (params.rank_type == GPU)
nx = nx - (nx / params.g2c);
}
else
{ /*GPU_CPU_ABS*/
if (params.rank_type == CPU)
nx = params.g2c;
if (params.rank_type == GPU)
nx = nx - params.g2c;
}
}
// Automatic partition direction
// When user does not specify the diff dimension
if (user_diff_dim == NONE)
{ // Did not succeed with user choice
if ((npz & 1) == 0)
{
if (params.local_problem_def == GPU_RATIO)
{
if (params.rank_type == CPU)
nz = nz / params.g2c;
}
else if (params.local_problem_def == GPU_ABS)
{
if (params.rank_type == CPU)
nz = params.g2c;
}
else if (params.local_problem_def == GPU_CPU_RATIO)
{
if (params.rank_type == CPU)
nz = nz / params.g2c;
if (params.rank_type == GPU)
nz = nz - (nz / params.g2c);
}
else
{ /*GPU_CPU_ABS*/
if (params.rank_type == CPU)
nz = params.g2c;
if (params.rank_type == GPU)
nz = nz - params.g2c;
}
}
else if ((npy & 1) == 0)
{
if (params.local_problem_def == GPU_RATIO)
{
if (params.rank_type == CPU)
ny = ny / params.g2c;
}
else if (params.local_problem_def == GPU_ABS)
{
if (params.rank_type == CPU)
ny = params.g2c;
}
else if (params.local_problem_def == GPU_CPU_RATIO)
{
if (params.rank_type == CPU)
ny = ny / params.g2c;
if (params.rank_type == GPU)
ny = ny - (ny / params.g2c);
}
else
{ /*GPU_CPU_ABS*/
if (params.rank_type == CPU)
ny = params.g2c;
if (params.rank_type == GPU)
ny = ny - params.g2c;
}
}
else if ((npx & 1) == 0)
{
if (params.local_problem_def == GPU_RATIO)
{
if (params.rank_type == CPU)
nx = nx / params.g2c;
}
else if (params.local_problem_def == GPU_ABS)
{
if (params.rank_type == CPU)
nx = params.g2c;
}
else if (params.local_problem_def == GPU_CPU_RATIO)
{
if (params.rank_type == CPU)
nx = nx / params.g2c;
if (params.rank_type == GPU)
nx = nx - (nx / params.g2c);
}
else
{ /*GPU_CPU_ABS*/
if (params.rank_type == CPU)
nx = params.g2c;
if (params.rank_type == GPU)
nx = nx - params.g2c;
}
}
}
}
// Now let us exchange dimensions
int sendBuf[] = {nx, ny, nz};
#ifndef HPCG_NO_MPI
MPI_Allgather(sendBuf, 3, MPI_INT, physical_rank_dims, 3, MPI_INT, MPI_COMM_WORLD);
#endif
// My logical rank Id
int logical_rank;
// last physical position for the rank that has the same size as 0
int same_as_0_position = 0;
// last physical position for the rank that does not have the same size as 0
int not_same_as_0_position = 0;
auto different_dim = NONE;
bool all_same = true;
int num_ranks_same = 1;
int num_ranks_not_same = 0;
int x0 = physical_rank_dims[0];
int y0 = physical_rank_dims[1];
int z0 = physical_rank_dims[2];
for (int p = 1; p < global_total_ranks; p++)
{
int x = physical_rank_dims[3 * p];
int y = physical_rank_dims[3 * p + 1];
int z = physical_rank_dims[3 * p + 2];
if (x != x0 || y != y0 || z != z0)
num_ranks_not_same++;
else
num_ranks_same++;
}
if (num_ranks_not_same > 0)
all_same = false;
if (!all_same)
{
// try twice: user-based, automatic
for (int i = 0; i < 2; i++)
{
bool z_condition = (i == 0) ? user_diff_dim == Z && (npz & 1) == 0 : (npz & 1) == 0;
bool y_condition = (i == 0) ? user_diff_dim == Y && (npy & 1) == 0 : (npy & 1) == 0;
bool x_condition = (i == 0) ? user_diff_dim == X && (npx & 1) == 0 : (npx & 1) == 0;
// Let us start with Z
if (z_condition)
{ // Z is even
different_dim = Z;
bool x_same = true;
bool y_same = true;
for (int p = 1; p < global_total_ranks; p++)
{
int x = physical_rank_dims[3 * p];
int y = physical_rank_dims[3 * p + 1];
assert(x == x0 && y == y0);
}
}
else if (y_condition)
{ // Y is even
different_dim = Y;
bool x_same = true;
bool z_same = true;
for (int p = 1; p < global_total_ranks; p++)
{
int x = physical_rank_dims[3 * p];
int z = physical_rank_dims[3 * p + 2];
assert(x == x0 && z == z0);
}
}
else if (x_condition)
{
different_dim = X;
bool y_same = true;
bool z_same = true;
for (int p = 1; p < global_total_ranks; p++)
{
int y = physical_rank_dims[3 * p + 1];
int z = physical_rank_dims[3 * p + 2];
assert(z == z0 && y == y0);
}
}
if (z_condition || y_condition || x_condition)
break;
}
}
// When exec_mode is GPUCPU, GPU and CPU ranks can have different dims. Therefore,
// we must rearrange the ranks such that the 3D shape is correct.
int same_rank_counter = 0;
if (different_dim != NONE)
{
for (int iz = 0; iz < npz; iz++)
for (int iy = 0; iy < npy; iy++)
for (int ix = 0; ix < npx; ix++)
{
int logical_position = iz * npy * npx + iy * npx + ix;
// Different dim is Z
// The first NPXxNPY are GPUs, then the next NPXxNPY is CPUs, and so on
if (different_dim == Z)
{
if ((iz & 1) == 0 && same_rank_counter < num_ranks_same)
{ // same as 0
same_as_0_position = loop_over_ranks(same_as_0_position, logical_position, true);
same_rank_counter++;
}
else
{ // Not same as 0
not_same_as_0_position = loop_over_ranks(not_same_as_0_position, logical_position, false);
}
}
// Different dim is Y
// The first NPXxNPZ are GPUs, then the next NPXxNPZ is CPUs, and so on
else if (different_dim == Y)
{
if ((iy & 1) == 0 && same_rank_counter < num_ranks_same)
{ // same as 0
same_as_0_position = loop_over_ranks(same_as_0_position, logical_position, true);
same_rank_counter++;
}
else
{ // Not same as 0
not_same_as_0_position = loop_over_ranks(not_same_as_0_position, logical_position, false);
}
}
// Different dim is X
// The first NPYxNPZ are GPUs, then the next NPYxNPZ is CPUs, and so on
else if (different_dim == X)
{
if ((ix & 1) == 0 && same_rank_counter < num_ranks_same)
{ // same as 0
same_as_0_position = loop_over_ranks(same_as_0_position, logical_position, true);
same_rank_counter++;
}
else
{ // Not same as 0
not_same_as_0_position = loop_over_ranks(not_same_as_0_position, logical_position, false);
}
}
}
}
else
{
// Keep rank Ids the same if all ranks have the same problem size
for (int p = 0; p < global_total_ranks; p++)
logical_rank_to_phys[p] = p;
}
for (int p = 0; p < global_total_ranks; p++)
{
if (rank == logical_rank_to_phys[p])
{
logical_rank = p;
}
}
// Now compute this process's indices in the 3D cube
int ipz = logical_rank / (npx * npy);
int ipy = (logical_rank - ipz * npx * npy) / npx;
int ipx = logical_rank % npx;
#ifdef HPCG_DEBUG
if (rank == 0)
HPCG_fout << "size = " << size << endl
<< "nx = " << nx << endl
<< "ny = " << ny << endl
<< "nz = " << nz << endl
<< "npx = " << npx << endl
<< "npy = " << npy << endl
<< "npz = " << npz << endl;
HPCG_fout << "For rank = " << rank << endl
<< "ipx = " << ipx << endl
<< "ipy = " << ipy << endl
<< "ipz = " << ipz << endl;
assert(size >= npx * npy * npz);
#endif
geom->size = size;
geom->rank = rank;
geom->logical_rank = logical_rank;
geom->different_dim = different_dim;
geom->numThreads = params.numThreads;
geom->nx = nx;
geom->ny = ny;
geom->nz = nz;
geom->npx = npx;
geom->npy = npy;
geom->npz = npz;
geom->ipx = ipx;
geom->ipy = ipy;
geom->ipz = ipz;
// These values should be defined to take into account changes in nx, ny, nz values
// due to variable local grid sizes
global_int_t gnx = 0;
global_int_t gny = 0;
global_int_t gnz = 0;
// Find the global NX, NY, and NZ
// For diff dims, accumulate sequentially
// For similar dims, just multiply rank 3D location by the local dim
if (different_dim == X)
for (int i = 0; i < npx; i++)
{
int r = ipz * npx * npy + ipy * npx + i;
int p = logical_rank_to_phys[r];
gnx += physical_rank_dims[p * 3];
}
else
gnx = npx * nx;
if (different_dim == Y)
for (int i = 0; i < npy; i++)
{
int r = ipz * npx * npy + i * npx + ipx;
int p = logical_rank_to_phys[r];
gny += physical_rank_dims[p * 3 + 1];
}
else
gny = npy * ny;
if (different_dim == Z)
for (int i = 0; i < npz; i++)
{
int r = i * npx * npy + ipy * npx + ipx;
int p = logical_rank_to_phys[r];
gnz += physical_rank_dims[p * 3 + 2];
}
else
gnz = npz * nz;
// Here, we find the initial global indices (gix0, giy0, and giz0)
// for each rank based on its 3D location in the grid.
// Also, along the diff dim, record the previous and next neighbors' local sizes.
// Note that along the diff dim the neighbors may have a different local dimension!
int prev_n = 0;
int next_n = 0;
global_int_t giz0 = 0;
global_int_t gix0 = 0;
global_int_t giy0 = 0;
if (different_dim == X)
{
for (int i = 0; i < ipx; i++)
{
int r = ipz * npx * npy + ipy * npx + i;
int p = logical_rank_to_phys[r];
gix0 += physical_rank_dims[p * 3];
if (i == ipx - 1)
{
prev_n = physical_rank_dims[p * 3];
}
}
if (ipx + 1 < npx)
{
int r = ipz * npx * npy + ipy * npx + (ipx + 1);
int p = logical_rank_to_phys[r];
next_n = physical_rank_dims[p * 3];
}
}
else
gix0 = ipx * nx;
if (different_dim == Y)
{
for (int i = 0; i < ipy; i++)
{
int r = ipz * npx * npy + i * npx + ipx;
int p = logical_rank_to_phys[r];
giy0 += physical_rank_dims[p * 3 + 1];
if (i == ipy - 1)
{
prev_n = physical_rank_dims[p * 3 + 1];
}
}
if (ipy + 1 < npy)
{
int r = ipz * npx * npy + (ipy + 1) * npx + ipx;
int p = logical_rank_to_phys[r];
next_n = physical_rank_dims[p * 3 + 1];
}
}
else
giy0 = ipy * ny;
if (different_dim == Z)
{
for (int i = 0; i < ipz; i++)
{
int r = i * npx * npy + ipy * npx + ipx;
int p = logical_rank_to_phys[r];
giz0 += physical_rank_dims[p * 3 + 2];
if (i == ipz - 1)
{
prev_n = physical_rank_dims[p * 3 + 2];
}
}
if (ipz + 1 < npz)
{
int r = (ipz + 1) * npx * npy + ipy * npx + ipx;
int p = logical_rank_to_phys[r];
next_n = physical_rank_dims[p * 3 + 2];
}
}
else
giz0 = ipz * nz;
// Keep these values for later
geom->gnx = gnx;
geom->gny = gny;
geom->gnz = gnz;
geom->gix0 = gix0;
geom->giy0 = giy0;
geom->giz0 = giz0;
geom->previous_neighbor_dim = prev_n;
geom->next_neighbor_dim = next_n;
return;
}
// Simpler generation for the next/coarse levels:
// no need to find nx, ny, nz for CPU and GPU based on parameters,
// and no need to find logical rank IDs
void GenerateGeometry(int size, int rank, int numThreads, local_int_t nx, local_int_t ny, local_int_t nz, int npx,
int npy, int npz, dim_3d_t different_dim, Geometry* geom)
{
// My logical rank Id
int logical_rank;
for (int p = 0; p < global_total_ranks; p++)
{
if (rank == logical_rank_to_phys[p])
{
logical_rank = p;
}
}
// Now compute this process's indices in the 3D cube
int ipz = logical_rank / (npx * npy);
int ipy = (logical_rank - ipz * npx * npy) / npx;
int ipx = logical_rank % npx;
#ifdef HPCG_DEBUG
if (rank == 0)
HPCG_fout << "size = " << size << endl
<< "nx = " << nx << endl
<< "ny = " << ny << endl
<< "nz = " << nz << endl
<< "npx = " << npx << endl
<< "npy = " << npy << endl
<< "npz = " << npz << endl;
HPCG_fout << "For rank = " << rank << endl
<< "ipx = " << ipx << endl
<< "ipy = " << ipy << endl
<< "ipz = " << ipz << endl;
assert(size >= npx * npy * npz);
#endif
geom->size = size;
geom->rank = rank;
geom->logical_rank = logical_rank;
geom->different_dim = different_dim;
geom->numThreads = numThreads;
geom->nx = nx;
geom->ny = ny;
geom->nz = nz;
geom->npx = npx;
geom->npy = npy;
geom->npz = npz;
geom->ipx = ipx;
geom->ipy = ipy;
geom->ipz = ipz;
// Find the global NX, NY, and NZ
// For diff dims, accumulate sequentially
// For similar dims, just multiply rank 3D location by the local dim
global_int_t gnx = 0;
global_int_t gny = 0;
global_int_t gnz = 0;
if (different_dim == X)
for (int i = 0; i < npx; i++)
{
int r = ipz * npx * npy + ipy * npx + i;
int p = logical_rank_to_phys[r];
gnx += physical_rank_dims[p * 3];
}
else
gnx = npx * nx;
if (different_dim == Y)
for (int i = 0; i < npy; i++)
{
int r = ipz * npx * npy + i * npx + ipx;
int p = logical_rank_to_phys[r];
gny += physical_rank_dims[p * 3 + 1];
}
else
gny = npy * ny;
if (different_dim == Z)
for (int i = 0; i < npz; i++)
{
int r = i * npx * npy + ipy * npx + ipx;
int p = logical_rank_to_phys[r];
gnz += physical_rank_dims[p * 3 + 2];
}
else
gnz = npz * nz;
// Here, we find the initial global indices (gix0, giy0, and giz0)
// for each rank based on its 3D location in the grid.
// Also, along the diff dim, record the previous and next neighbors' local sizes.
// Note that along the diff dim the neighbors may have a different local dimension!
int prev_n = 0;
int next_n = 0;
global_int_t giz0 = 0;
global_int_t gix0 = 0;
global_int_t giy0 = 0;
if (different_dim == X)
{
for (int i = 0; i < ipx; i++)
{
int r = ipz * npx * npy + ipy * npx + i;
int p = logical_rank_to_phys[r];
gix0 += physical_rank_dims[p * 3];
if (i == ipx - 1)
{
prev_n = physical_rank_dims[p * 3];
}
}
if (ipx + 1 < npx)
{
int r = ipz * npx * npy + ipy * npx + (ipx + 1);
int p = logical_rank_to_phys[r];
next_n = physical_rank_dims[p * 3];
}
}
else
gix0 = ipx * nx;
if (different_dim == Y)
{
for (int i = 0; i < ipy; i++)
{
int r = ipz * npx * npy + i * npx + ipx;
int p = logical_rank_to_phys[r];
giy0 += physical_rank_dims[p * 3 + 1];
if (i == ipy - 1)
{
prev_n = physical_rank_dims[p * 3 + 1];
}
}
if (ipy + 1 < npy)
{
int r = ipz * npx * npy + (ipy + 1) * npx + ipx;
int p = logical_rank_to_phys[r];
next_n = physical_rank_dims[p * 3 + 1];
}
}
else
giy0 = ipy * ny;
if (different_dim == Z)
{
for (int i = 0; i < ipz; i++)
{
int r = i * npx * npy + ipy * npx + ipx;
int p = logical_rank_to_phys[r];
giz0 += physical_rank_dims[p * 3 + 2];
if (i == ipz - 1)
{
prev_n = physical_rank_dims[p * 3 + 2];
}
}
if (ipz + 1 < npz)
{
int r = (ipz + 1) * npx * npy + ipy * npx + ipx;
int p = logical_rank_to_phys[r];
next_n = physical_rank_dims[p * 3 + 2];
}
}
else
giz0 = ipz * nz;
// Keep these values for later
geom->gnx = gnx;
geom->gny = gny;
geom->gnz = gnz;
geom->gix0 = gix0;
geom->giy0 = giy0;
geom->giz0 = giz0;
geom->previous_neighbor_dim = prev_n;
geom->next_neighbor_dim = next_n;
return;
}

src/GenerateGeometry.hpp (new file, 39 lines)

@@ -0,0 +1,39 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef GENERATEGEOMETRY_HPP
#define GENERATEGEOMETRY_HPP
#include "Geometry.hpp"
#include "hpcg.hpp"
void GenerateGeometry(HPCG_Params& params, Geometry* geom);
void GenerateGeometry(int size, int rank, int numThreads, local_int_t nx, local_int_t ny, local_int_t nz, int npx,
int npy, int npz, dim_3d_t partition_by, Geometry* geom);
#endif // GENERATEGEOMETRY_HPP

src/GenerateProblem.cpp (new file, 404 lines)

@@ -0,0 +1,404 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file GenerateProblem.cpp
HPCG routine
*/
#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include "mytimer.hpp"
#include "GenerateProblem.hpp"
#include "GenerateProblem_ref.hpp"
#ifdef USE_CUDA
#include "Cuda.hpp"
#include "CudaKernels.hpp"
#endif
#ifdef USE_GRACE
#include "CpuKernels.hpp"
#endif
/*!
Routine to generate a sparse matrix, right hand side, initial guess, and exact solution.
@param[inout] A The generated system matrix
@param[inout] b The newly allocated and generated right hand side vector (if b != 0 on entry)
@param[inout] x The newly allocated solution vector with entries set to 0.0 (if x != 0 on entry)
@param[inout] xexact The newly allocated solution vector with entries set to the exact solution (if xexact != 0
on entry)
@see GenerateGeometry
*/
#ifdef USE_CUDA
void GenerateProblem_Gpu(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact)
{
global_int_t nx = A.geom->nx;
global_int_t ny = A.geom->ny;
global_int_t nz = A.geom->nz;
global_int_t gnx = A.geom->gnx;
global_int_t gny = A.geom->gny;
global_int_t gnz = A.geom->gnz;
global_int_t gix0 = A.geom->gix0;
global_int_t giy0 = A.geom->giy0;
global_int_t giz0 = A.geom->giz0;
local_int_t localNumberOfRows = nx * ny * nz;
local_int_t numberOfNonzerosPerRow = 27;
global_int_t totalNumberOfRows = gnx * gny * gnz;
if (b != 0)
InitializeVector(*b, localNumberOfRows, GPU);
if (x != 0)
InitializeVector(*x, localNumberOfRows, GPU);
if (xexact != 0)
InitializeVector(*xexact, localNumberOfRows, GPU);
GenerateProblemCuda(A, b, x, xexact);
local_int_t localNumberOfNonzeros = A.localNumberOfNonzeros;
global_int_t totalNumberOfNonzeros = 27LL * ((gnx - 2LL) * (gny - 2LL) * (gnz - 2LL))
+ 18LL
* (2LL * ((gnx - 2LL) * (gny - 2LL)) + 2LL * ((gnx - 2LL) * (gnz - 2LL))
+ 2LL * ((gny - 2LL) * (gnz - 2LL)))
+ 12LL * (4LL * (gnx - 2LL) + 4LL * (gny - 2LL) + 4LL * (gnz - 2LL)) + 8LL * 8LL;
A.title = 0;
A.totalNumberOfRows = totalNumberOfRows;
A.totalNumberOfNonzeros = totalNumberOfNonzeros;
A.localNumberOfRows = localNumberOfRows;
A.localNumberOfColumns = localNumberOfRows;
A.localNumberOfNonzeros = localNumberOfNonzeros;
return;
}
#endif
#ifdef USE_GRACE
// Neighbor rank to sequential ID and vice versa
extern int *rankToId_h, *idToRank_h;
// GenerateProblem_Cpu is called once for each of the 4 levels
// Sometimes we need to perform actions based on the level (global across the application)
int global_steps = 0;
void GenerateProblem_Cpu(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact)
{
// Make local copies of geometry information. Use global_int_t since the RHS products in the calculations
// below may result in global range values.
global_int_t nx = A.geom->nx;
global_int_t ny = A.geom->ny;
global_int_t nz = A.geom->nz;
global_int_t gnx = A.geom->gnx;
global_int_t gny = A.geom->gny;
global_int_t gnz = A.geom->gnz;
global_int_t gix0 = A.geom->gix0;
global_int_t giy0 = A.geom->giy0;
global_int_t giz0 = A.geom->giz0;
int npx = A.geom->npx;
int npy = A.geom->npy;
local_int_t localNumberOfRows = nx * ny * nz; // This is the size of our subblock
// If this assert fails, it most likely means that local_int_t is set to int and should be set to long long
assert(localNumberOfRows > 0); // Fails if the number of rows is not positive (can happen if "int" overflows)
local_int_t numberOfNonzerosPerRow
= 27; // We are approximating a 27-point finite element/volume/difference 3D stencil
global_int_t totalNumberOfRows = gnx * gny * gnz; // Total number of grid points in mesh
// If this assert fails, it most likely means that global_int_t is set to int and should be set to long long
assert(totalNumberOfRows > 0); // Fails if the number of rows is not positive (can happen if "int" overflows)
// Allocate arrays that are of length localNumberOfRows
if (global_steps == 0)
{
rankToId_h = new int[A.geom->size + 1];
idToRank_h = new int[27];
global_steps++;
}
local_int_t* nonzerosInRow = new local_int_t[localNumberOfRows];
global_int_t** mtxIndG = new global_int_t*[localNumberOfRows];
local_int_t** mtxIndL = new local_int_t*[localNumberOfRows];
double** matrixValues = new double*[localNumberOfRows];
double** matrixDiagonal = new double*[localNumberOfRows];
if (b != 0)
InitializeVector(*b, localNumberOfRows, CPU);
if (x != 0)
InitializeVector(*x, localNumberOfRows, CPU);
if (xexact != 0)
InitializeVector(*xexact, localNumberOfRows, CPU);
double* bv = 0;
double* xv = 0;
double* xexactv = 0;
if (b != 0)
bv = b->values; // Only compute exact solution if requested
if (x != 0)
xv = x->values; // Only compute exact solution if requested
if (xexact != 0)
xexactv = xexact->values; // Only compute exact solution if requested
A.localToGlobalMap.resize(localNumberOfRows);
// Use a parallel loop to do initial assignment:
// distributes the physical placement of arrays of pointers across the memory system
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
for (local_int_t i = 0; i < localNumberOfRows; ++i)
{
matrixValues[i] = 0;
matrixDiagonal[i] = 0;
mtxIndG[i] = 0;
mtxIndL[i] = 0;
}
if (global_steps == 1)
{
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
for (local_int_t i = 0; i < A.geom->size + 1; i++)
{
rankToId_h[i] = 0;
}
global_steps++;
}
// Now allocate the arrays pointed to
mtxIndL[0] = new local_int_t[localNumberOfRows * numberOfNonzerosPerRow];
matrixValues[0] = new double[localNumberOfRows * numberOfNonzerosPerRow];
mtxIndG[0] = new global_int_t[localNumberOfRows * numberOfNonzerosPerRow];
local_int_t localNumberOfNonzeros = 0;
local_int_t ext_nnz = 0;
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for reduction(+ : localNumberOfNonzeros) reduction(+ : ext_nnz)
#endif
for (local_int_t i = 0; i < localNumberOfRows; i++)
{
mtxIndL[i] = mtxIndL[0] + i * numberOfNonzerosPerRow;
matrixValues[i] = matrixValues[0] + i * numberOfNonzerosPerRow;
mtxIndG[i] = mtxIndG[0] + i * numberOfNonzerosPerRow;
const local_int_t iz = (i / (nx * ny));
const local_int_t iy = (i - iz * nx * ny) / nx;
const local_int_t ix = i - (iz * ny + iy) * nx;
const global_int_t gix = ix + gix0;
const global_int_t giy = iy + giy0;
const global_int_t giz = iz + giz0;
local_int_t currentLocalRow = i;
global_int_t currentGlobalRow = gix + giy * gnx + giz * gnx * gny;
A.localToGlobalMap[currentLocalRow] = currentGlobalRow;
char numberOfNonzerosInRow = 0;
double* currentValuePointer = matrixValues[currentLocalRow];
global_int_t* currentIndexPointerG = mtxIndG[currentLocalRow];
global_int_t curcol;
double* diagonalPointer = nullptr;
// Go through all the neighbors around a 3D point to decide
// which one is a halo and which one is local to the rank
for (int k = 0; k < 27; k++)
{
// Neighbor global IDs
long long int cgix = gix + tid2indCpu[k][0];
long long int cgiy = giy + tid2indCpu[k][1];
long long int cgiz = giz + tid2indCpu[k][2];
// These are used when the point is local to the rank
local_int_t zi = (cgiz) % nz;
local_int_t yi = (cgiy) % ny;
local_int_t xi = (cgix) % nx;
// local column Id
local_int_t lcol = zi * ny * nx + yi * nx + xi;
// Is the global 3D point inside the global problem?
int ok = cgiz > -1 && cgiz < gnz && cgiy > -1 && cgiy < gny && cgix > -1 && cgix < gnx;
if (ok /*Yes this a valid point globally*/)
{
*currentIndexPointerG++ = cgix + cgiy * gnx + cgiz * gnx * gny;
if (k == 13)
{
*currentValuePointer = 26.0;
diagonalPointer = currentValuePointer;
}
else
{
*currentValuePointer = -1.0;
}
// Rank Id in the global domain
int ipz = cgiz / nz;
int ipy = cgiy / ny;
int ipx = cgix / nx;
// For GPUCPU exec mode, when the CPU and GPU have different dimensions in a direction,
// we need to find the point's rank manually: not from its local dimension
// but from its physical location relative to the local problem.
// Note that the halo size is always 1.
if (A.geom->different_dim == Z)
{
long long int local = cgiz - giz0;
if (local >= 0 && local < nz)
ipz = A.geom->ipz;
else if (local < 0)
ipz = A.geom->ipz - 1;
else if (local >= nz)
ipz = A.geom->ipz + 1;
}
else if (A.geom->different_dim == Y)
{
long long int local = cgiy - giy0;
if (local >= 0 && local < ny)
ipy = A.geom->ipy;
else if (local < 0)
ipy = A.geom->ipy - 1;
else if (local >= ny)
ipy = A.geom->ipy + 1;
}
else if (A.geom->different_dim == X)
{
long long int local = cgix - gix0;
if (local >= 0 && local < nx)
ipx = A.geom->ipx;
else if (local < 0)
ipx = A.geom->ipx - 1;
else if (local >= nx)
ipx = A.geom->ipx + 1;
}
// Now, having found the point's rank from its location
// in the 3D grid (rank domain NPX x NPY x NPZ)
int col_rank = ipx + ipy * npx + ipz * npy * npx;
// The neighbor point's rank differs from the current point's rank
if (A.geom->logical_rank != col_rank)
{
if (global_steps == 2)
rankToId_h[col_rank + 1] = 1; // To find its sequential Id (will be prefix summed later)
ext_nnz++;
}
currentValuePointer++;
numberOfNonzerosInRow++;
}
}
matrixDiagonal[currentLocalRow] = diagonalPointer;
nonzerosInRow[currentLocalRow] = numberOfNonzerosInRow;
localNumberOfNonzeros += numberOfNonzerosInRow;
if (b != 0)
bv[currentLocalRow] = 26.0 - ((double) (numberOfNonzerosInRow - 1));
if (x != 0)
xv[currentLocalRow] = 0.0;
if (xexact != 0)
xexactv[currentLocalRow] = 1.0;
}
// Prefix-sum rankToId:
// map physical neighbor ranks to sequential IDs
// for lower memory consumption
if (global_steps == 2)
{
PrefixsumCpu(rankToId_h + 1, A.geom->size);
int counter = 1;
for (int i = 1; i < A.geom->size + 1; i++)
{
if (rankToId_h[i] == counter)
{
idToRank_h[counter - 1] = i - 1;
counter++;
}
}
global_steps++;
}
#ifdef HPCG_DETAILED_DEBUG
HPCG_fout << "Process " << A.geom->rank << " of " << A.geom->size << " has " << localNumberOfRows << " rows."
<< endl
<< "Process " << A.geom->rank << " of " << A.geom->size << " has " << localNumberOfNonzeros
<< " nonzeros." << endl;
#endif
global_int_t totalNumberOfNonzeros = 27LL * ((gnx - 2LL) * (gny - 2LL) * (gnz - 2LL))
+ 18LL
* (2LL * ((gnx - 2LL) * (gny - 2LL)) + 2LL * ((gnx - 2LL) * (gnz - 2LL))
+ 2LL * ((gny - 2LL) * (gnz - 2LL)))
+ 12LL * (4LL * (gnx - 2LL) + 4LL * (gny - 2LL) + 4LL * (gnz - 2LL)) + 8LL * 8LL;
// If this assert fails, it most likely means that the global_int_t is set to int and should be set to long long
// This assert is usually the first to fail as problem size increases beyond the 32-bit integer range.
assert(totalNumberOfNonzeros
> 0); // Throw an exception if the number of nonzeros is not positive (can happen on int overflow)
A.title = 0;
A.totalNumberOfRows = totalNumberOfRows;
A.totalNumberOfNonzeros = totalNumberOfNonzeros;
A.localNumberOfRows = localNumberOfRows;
A.localNumberOfColumns = localNumberOfRows;
A.localNumberOfNonzeros = localNumberOfNonzeros;
A.nonzerosInRow = nonzerosInRow;
A.mtxIndG = mtxIndG;
A.mtxIndL = mtxIndL;
A.matrixValues = matrixValues;
A.matrixDiagonal = matrixDiagonal;
A.extNnz = ext_nnz;
return;
}
#endif // USE_GRACE
void GenerateProblem(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact)
{
if (A.rankType == GPU)
{
#ifdef USE_CUDA
GenerateProblem_Gpu(A, b, x, xexact);
#endif
}
else
{
#ifdef USE_GRACE
GenerateProblem_Cpu(A, b, x, xexact);
#endif
}
}

src/GenerateProblem.hpp Normal file

@@ -0,0 +1,20 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef GENERATEPROBLEM_HPP
#define GENERATEPROBLEM_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
void GenerateProblem(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact);
#endif // GENERATEPROBLEM_HPP

src/GenerateProblem_ref.cpp Normal file

@@ -0,0 +1,251 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file GenerateProblem_ref.cpp
HPCG routine
*/
#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#if defined(HPCG_DEBUG) || defined(HPCG_DETAILED_DEBUG)
#include <fstream>
using std::endl;
#include "hpcg.hpp"
#endif
#include <cassert>
#include "GenerateProblem_ref.hpp"
/*!
Reference version of GenerateProblem to generate the sparse matrix, right hand side, initial guess, and exact
solution.
@param[in] A The known system matrix
@param[inout] b The newly allocated and generated right hand side vector (if b!=0 on entry)
@param[inout] x The newly allocated solution vector with entries set to 0.0 (if x!=0 on entry)
@param[inout] xexact The newly allocated solution vector with entries set to the exact solution (if xexact != 0 on entry)
@see GenerateGeometry
*/
void GenerateProblem_ref(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact)
{
// Make local copies of geometry information. Use global_int_t since the RHS products in the calculations
// below may result in global range values.
global_int_t nx = A.geom->nx;
global_int_t ny = A.geom->ny;
global_int_t nz = A.geom->nz;
global_int_t gnx = A.geom->gnx;
global_int_t gny = A.geom->gny;
global_int_t gnz = A.geom->gnz;
global_int_t gix0 = A.geom->gix0;
global_int_t giy0 = A.geom->giy0;
global_int_t giz0 = A.geom->giz0;
local_int_t localNumberOfRows = nx * ny * nz; // This is the size of our subblock
// If this assert fails, it most likely means that the local_int_t is set to int and should be set to long long
assert(localNumberOfRows
> 0); // Throw an exception if the number of rows is not positive (can happen on int overflow)
local_int_t numberOfNonzerosPerRow
= 27; // We are approximating a 27-point finite element/volume/difference 3D stencil
global_int_t totalNumberOfRows = gnx * gny * gnz; // Total number of grid points in mesh
// If this assert fails, it most likely means that the global_int_t is set to int and should be set to long long
assert(totalNumberOfRows
> 0); // Throw an exception if the number of rows is not positive (can happen on int overflow)
// Allocate arrays that are of length localNumberOfRows
local_int_t* nonzerosInRow = new local_int_t[localNumberOfRows];
global_int_t** mtxIndG = new global_int_t*[localNumberOfRows];
local_int_t** mtxIndL = new local_int_t*[localNumberOfRows];
double** matrixValues = new double*[localNumberOfRows];
double** matrixDiagonal = new double*[localNumberOfRows];
if (b != 0)
InitializeVector(*b, localNumberOfRows, CPU);
if (x != 0)
InitializeVector(*x, localNumberOfRows, CPU);
if (xexact != 0)
InitializeVector(*xexact, localNumberOfRows, CPU);
double* bv = 0;
double* xv = 0;
double* xexactv = 0;
if (b != 0)
bv = b->values; // Only compute exact solution if requested
if (x != 0)
xv = x->values; // Only compute exact solution if requested
if (xexact != 0)
xexactv = xexact->values; // Only compute exact solution if requested
A.localToGlobalMap.resize(localNumberOfRows);
// Use a parallel loop to do initial assignment:
// distributes the physical placement of arrays of pointers across the memory system
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
for (local_int_t i = 0; i < localNumberOfRows; ++i)
{
matrixValues[i] = 0;
matrixDiagonal[i] = 0;
mtxIndG[i] = 0;
mtxIndL[i] = 0;
}
#ifndef HPCG_CONTIGUOUS_ARRAYS
// Now allocate the arrays pointed to
for (local_int_t i = 0; i < localNumberOfRows; ++i)
mtxIndL[i] = new local_int_t[numberOfNonzerosPerRow];
for (local_int_t i = 0; i < localNumberOfRows; ++i)
matrixValues[i] = new double[numberOfNonzerosPerRow];
for (local_int_t i = 0; i < localNumberOfRows; ++i)
mtxIndG[i] = new global_int_t[numberOfNonzerosPerRow];
#else
// Now allocate the arrays pointed to
mtxIndL[0] = new local_int_t[localNumberOfRows * numberOfNonzerosPerRow];
matrixValues[0] = new double[localNumberOfRows * numberOfNonzerosPerRow];
mtxIndG[0] = new global_int_t[localNumberOfRows * numberOfNonzerosPerRow];
for (local_int_t i = 1; i < localNumberOfRows; ++i)
{
mtxIndL[i] = mtxIndL[0] + i * numberOfNonzerosPerRow;
matrixValues[i] = matrixValues[0] + i * numberOfNonzerosPerRow;
mtxIndG[i] = mtxIndG[0] + i * numberOfNonzerosPerRow;
}
#endif
local_int_t localNumberOfNonzeros = 0;
// TODO: This triply nested loop could be flattened or use nested parallelism
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
for (local_int_t iz = 0; iz < nz; iz++)
{
global_int_t giz = giz0 + iz;
for (local_int_t iy = 0; iy < ny; iy++)
{
global_int_t giy = giy0 + iy;
for (local_int_t ix = 0; ix < nx; ix++)
{
global_int_t gix = gix0 + ix;
local_int_t currentLocalRow = iz * nx * ny + iy * nx + ix;
global_int_t currentGlobalRow = giz * gnx * gny + giy * gnx + gix;
#ifndef HPCG_NO_OPENMP
// C++ std::map is not threadsafe for writing
#pragma omp critical
#endif
A.globalToLocalMap[currentGlobalRow] = currentLocalRow;
A.localToGlobalMap[currentLocalRow] = currentGlobalRow;
#ifdef HPCG_DETAILED_DEBUG
HPCG_fout << " rank, globalRow, localRow = " << A.geom->rank << " " << currentGlobalRow << " "
<< A.globalToLocalMap[currentGlobalRow] << endl;
#endif
char numberOfNonzerosInRow = 0;
double* currentValuePointer = matrixValues[currentLocalRow]; // Pointer to current value in current row
global_int_t* currentIndexPointerG
= mtxIndG[currentLocalRow]; // Pointer to current index in current row
for (int sz = -1; sz <= 1; sz++)
{
if (giz + sz > -1 && giz + sz < gnz)
{
for (int sy = -1; sy <= 1; sy++)
{
if (giy + sy > -1 && giy + sy < gny)
{
for (int sx = -1; sx <= 1; sx++)
{
if (gix + sx > -1 && gix + sx < gnx)
{
global_int_t curcol = currentGlobalRow + sz * gnx * gny + sy * gnx + sx;
if (curcol == currentGlobalRow)
{
matrixDiagonal[currentLocalRow] = currentValuePointer;
*currentValuePointer++ = 26.0;
}
else
{
*currentValuePointer++ = -1.0;
}
*currentIndexPointerG++ = curcol;
numberOfNonzerosInRow++;
} // end x bounds test
} // end sx loop
} // end y bounds test
} // end sy loop
} // end z bounds test
} // end sz loop
nonzerosInRow[currentLocalRow] = numberOfNonzerosInRow;
#ifndef HPCG_NO_OPENMP
#pragma omp critical
#endif
localNumberOfNonzeros += numberOfNonzerosInRow; // Protect this with an atomic
if (b != 0)
bv[currentLocalRow] = 26.0 - ((double) (numberOfNonzerosInRow - 1));
if (x != 0)
xv[currentLocalRow] = 0.0;
if (xexact != 0)
xexactv[currentLocalRow] = 1.0;
} // end ix loop
} // end iy loop
} // end iz loop
#ifdef HPCG_DETAILED_DEBUG
HPCG_fout << "Process " << A.geom->rank << " of " << A.geom->size << " has " << localNumberOfRows << " rows."
<< endl
<< "Process " << A.geom->rank << " of " << A.geom->size << " has " << localNumberOfNonzeros
<< " nonzeros." << endl;
#endif
global_int_t totalNumberOfNonzeros = 0;
#ifndef HPCG_NO_MPI
// Use MPI's reduce function to sum all nonzeros
#ifdef HPCG_NO_LONG_LONG
MPI_Allreduce(&localNumberOfNonzeros, &totalNumberOfNonzeros, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
#else
long long lnnz = localNumberOfNonzeros, gnnz = 0; // convert to 64 bit for MPI call
MPI_Allreduce(&lnnz, &gnnz, 1, MPI_LONG_LONG_INT, MPI_SUM, MPI_COMM_WORLD);
totalNumberOfNonzeros = gnnz; // Copy back
#endif
#else
totalNumberOfNonzeros = localNumberOfNonzeros;
#endif
// If this assert fails, it most likely means that the global_int_t is set to int and should be set to long long
// This assert is usually the first to fail as problem size increases beyond the 32-bit integer range.
assert(totalNumberOfNonzeros
> 0); // Throw an exception if the number of nonzeros is not positive (can happen on int overflow)
A.title = 0;
A.totalNumberOfRows = totalNumberOfRows;
A.totalNumberOfNonzeros = totalNumberOfNonzeros;
A.localNumberOfRows = localNumberOfRows;
A.localNumberOfColumns = localNumberOfRows;
A.localNumberOfNonzeros = localNumberOfNonzeros;
A.nonzerosInRow = nonzerosInRow;
A.mtxIndG = mtxIndG;
A.mtxIndL = mtxIndL;
A.matrixValues = matrixValues;
A.matrixDiagonal = matrixDiagonal;
return;
}

src/GenerateProblem_ref.hpp Normal file

@@ -0,0 +1,21 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef GENERATEPROBLEM_REF_HPP
#define GENERATEPROBLEM_REF_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
void GenerateProblem_ref(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact);
#endif // GENERATEPROBLEM_REF_HPP

src/Geometry.hpp Normal file

@@ -0,0 +1,207 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file Geometry.hpp
HPCG data structure for problem geometry
*/
#ifndef GEOMETRY_HPP
#define GEOMETRY_HPP
/*!
This defines the type for integers that have local subdomain dimension.
Define as "long long" when local problem dimension is > 2^31
*/
// #define INDEX_64
#ifndef INDEX_64
typedef int local_int_t;
#else
typedef long long local_int_t;
#endif
/*!
This defines the type for integers that have global dimension
Define as "long long" when global problem dimension is > 2^31
*/
#ifdef HPCG_NO_LONG_LONG
typedef int global_int_t;
#else
typedef long long global_int_t;
#endif
#define HPCG_MAX_ROW_LEN 27
// Enums
typedef enum
{
X = 0,
Y = 1,
Z = 2,
NONE = 3
} dim_3d_t;
typedef enum
{
MPI_CPU,
MPI_CUDA_AWARE,
MPI_GPU_All2allv,
MPI_CPU_All2allv,
NCCL /*GPUONLY*/
} p2p_comm_mode_t;
typedef enum
{
CPU,
GPU
} rank_type_t;
typedef enum
{
GPUONLY = 0,
CPUONLY = 1,
GPUCPU = 2
} exec_mode_t;
typedef enum
{
GPU_RATIO = 0 /*NX, NY, NZ are local to GPU and g2c is a ratio*/,
GPU_ABS = 1 /*NX, NY, NZ are local to GPU and g2c is absolute dimension size*/,
GPU_CPU_RATIO = 2 /*NX, NY, NZ are local to GPU+CPU and g2c is ratio*/,
GPU_CPU_ABS = 3 /*NX, NY, NZ are local to GPU+CPU and g2c is absolute dimension size*/
} local_problem_def_t;
// This macro should be defined if the global_int_t is not long long
// in order to stop complaints from non-C++11 compliant compilers.
// #define HPCG_NO_LONG_LONG
/*!
This is a data structure to contain all processor geometry information
*/
struct Geometry_STRUCT
{
int size; //!< Number of MPI processes
int rank; //!< This process' rank in the range [0 to size - 1]
int logical_rank; //!< This process' logical rank, used in heterogeneous (GPU+CPU) setups
int numThreads; //!< This process' number of threads
local_int_t nx; //!< Number of x-direction grid points for each local subdomain
local_int_t ny; //!< Number of y-direction grid points for each local subdomain
local_int_t nz; //!< Number of z-direction grid points for each local subdomain
int npx; //!< Number of processors in x-direction
int npy; //!< Number of processors in y-direction
int npz; //!< Number of processors in z-direction
int pz; //!< partition ID of z-dimension process that starts the second region of nz values
int npartz; //!< Number of partitions with varying nz values
int* partz_ids; //!< Array of partition ids of processor in z-direction where new value of nz starts (valid values
//!< are 1 to npz)
local_int_t* partz_nz; //!< Array of length npartz containing the nz values for each partition
int ipx; //!< Current rank's x location in the npx by npy by npz processor grid
int ipy; //!< Current rank's y location in the npx by npy by npz processor grid
int ipz; //!< Current rank's z location in the npx by npy by npz processor grid
global_int_t gnx; //!< Global number of x-direction grid points
global_int_t gny; //!< Global number of y-direction grid points
global_int_t gnz; //!< Global number of z-direction grid points
global_int_t gix0; //!< Base global x index for this rank in the npx by npy by npz processor grid
global_int_t giy0; //!< Base global y index for this rank in the npx by npy by npz processor grid
global_int_t giz0; //!< Base global z index for this rank in the npx by npy by npz processor grid
dim_3d_t different_dim; //!< The dimension that the GPU and CPU rank are partitioned along
int previous_neighbor_dim;
int next_neighbor_dim;
};
typedef struct Geometry_STRUCT Geometry;
/*!
Returns the rank of the MPI process that is assigned the global row index
given as the input argument.
@param[in] geom The description of the problem's geometry.
@param[in] index The global row index
@return Returns the MPI rank of the process assigned the row
*/
inline int ComputeRankOfMatrixRow(const Geometry& geom, global_int_t index)
{
global_int_t gnx = geom.gnx;
global_int_t gny = geom.gny;
global_int_t iz = index / (gny * gnx);
global_int_t iy = (index - iz * gny * gnx) / gnx;
global_int_t ix = index % gnx;
// We now permit varying values for nz for any nx-by-ny plane of MPI processes.
// npartz is the number of different groups of nx-by-ny groups of processes.
// partz_ids is an array of length npartz where each value indicates the z process of the last process in the ith
// nx-by-ny group. partz_nz is an array of length npartz containing the value of nz for the ith group.
// With no variation, npartz = 1, partz_ids[0] = npz, partz_nz[0] = nz
int ipz = 0;
int ipartz_ids = 0;
for (int i = 0; i < geom.npartz; ++i)
{
int ipart_nz = geom.partz_nz[i];
ipartz_ids = geom.partz_ids[i] - ipartz_ids;
if (iz <= ipart_nz * ipartz_ids)
{
ipz += iz / ipart_nz;
break;
}
else
{
ipz += ipartz_ids;
iz -= ipart_nz * ipartz_ids;
}
}
// global_int_t ipz = iz/geom.nz;
int ipy = iy / geom.ny;
int ipx = ix / geom.nx;
int rank = ipx + ipy * geom.npx + ipz * geom.npy * geom.npx;
return rank;
}
/*!
Destructor for geometry data.
@param[inout] data the geometry data structure whose storage is deallocated
*/
inline void DeleteGeometry(Geometry& geom)
{
// Not used anymore
// if(geom.partz_nz != 0)
// delete [] geom.partz_nz;
// if(geom.partz_ids != 0)
// delete [] geom.partz_ids;
return;
}
#endif // GEOMETRY_HPP

src/MGData.hpp Normal file

@@ -0,0 +1,81 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file MGData.hpp
HPCG data structure
*/
#ifndef MGDATA_HPP
#define MGDATA_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
#include <cassert>
struct MGData_STRUCT
{
int numberOfPresmootherSteps; // Call ComputeSYMGS this many times prior to coarsening
int numberOfPostsmootherSteps; // Call ComputeSYMGS this many times after coarsening
local_int_t*
f2cOperator; //!< 1D array containing the fine operator local IDs that will be injected into coarse space.
Vector* rc; // coarse grid residual vector
Vector* xc; // coarse grid solution vector
Vector* Axf; // fine grid residual vector
/*!
This is for storing optimized data structures created in OptimizeProblem and
used inside optimized ComputeSPMV().
*/
void* optimizationData;
};
typedef struct MGData_STRUCT MGData;
/*!
Constructor for the data structure of CG vectors.
@param[in] f2cOperator 1D array of fine operator local IDs injected into the coarse space
@param[in] rc Coarse grid residual vector
@param[in] xc Coarse grid solution vector
@param[in] Axf Fine grid residual vector
@param[out] data the data structure for CG vectors that will be allocated to get it ready for use in CG iterations
*/
inline void InitializeMGData(local_int_t* f2cOperator, Vector* rc, Vector* xc, Vector* Axf, MGData& data)
{
data.numberOfPresmootherSteps = 1;
data.numberOfPostsmootherSteps = 1;
data.f2cOperator = f2cOperator; // Space for injection operator
data.rc = rc;
data.xc = xc;
data.Axf = Axf;
return;
}
/*!
Destructor for the CG vectors data.
@param[inout] data the MG data structure whose storage is deallocated
*/
inline void DeleteMGData(MGData& data)
{
delete[] data.f2cOperator;
DeleteVector(*data.Axf);
DeleteVector(*data.rc);
DeleteVector(*data.xc);
delete data.Axf;
delete data.rc;
delete data.xc;
return;
}
#endif // MGDATA_HPP

src/MixedBaseCounter.cpp Normal file

@@ -0,0 +1,66 @@
#include <map>
#include "MixedBaseCounter.hpp"
MixedBaseCounter::MixedBaseCounter(int* counts, int length)
{
this->length = length;
int i;
for (i = 0; i < 32; ++i)
{
this->max_counts[i] = counts[i];
this->cur_counts[i] = 0;
}
// terminate with 0's
this->max_counts[i] = this->cur_counts[i] = 0;
this->max_counts[length] = this->cur_counts[length] = 0;
}
MixedBaseCounter::MixedBaseCounter(MixedBaseCounter& left, MixedBaseCounter& right)
{
this->length = left.length;
for (int i = 0; i < left.length; ++i)
{
this->max_counts[i] = left.max_counts[i] - right.cur_counts[i];
this->cur_counts[i] = 0;
}
}
void MixedBaseCounter::next()
{
for (int i = 0; i < this->length; ++i)
{
this->cur_counts[i]++;
if (this->cur_counts[i] > this->max_counts[i])
{
this->cur_counts[i] = 0;
continue;
}
break;
}
}
int MixedBaseCounter::is_zero()
{
for (int i = 0; i < this->length; ++i)
if (this->cur_counts[i])
return 0;
return 1;
}
int MixedBaseCounter::product(int* multipliers)
{
int k = 0, x = 1;
for (int i = 0; i < this->length; ++i)
for (int j = 0; j < this->cur_counts[i]; ++j)
{
k = 1;
x *= multipliers[i];
}
return x * k;
}

src/MixedBaseCounter.hpp Normal file

@@ -0,0 +1,16 @@
class MixedBaseCounter
{
private:
int length; //!< number of prime factor counts (cannot exceed 32 for a 32-bit integer)
int max_counts[32 + 1]; //!< maximum value for prime factor counts
int cur_counts[32 + 1]; //!< current prime factor counts
public:
MixedBaseCounter(int* counts, int length);
MixedBaseCounter(MixedBaseCounter& left, MixedBaseCounter& right);
void next();
int is_zero();
int product(int* multipliers);
};

src/OptimizeProblem.cpp Normal file

@@ -0,0 +1,427 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file OptimizeProblem.cpp
HPCG routine
*/
#include "OptimizeProblem.hpp"
#include "CpuKernels.hpp"
#include "CudaKernels.hpp"
#include "Cuda.hpp"
#include "WriteProblem.hpp"
#include "mytimer.hpp"
extern bool Use_Hpcg_Mem_Reduction; /*Use HPCG aggressive memory reduction*/
/*!
Optimizes the data structures used for CG iteration to increase the
performance of the benchmark version of the preconditioned CG algorithm.
@param[inout] A The known system matrix, also contains the MG hierarchy in attributes Ac and mgData.
@param[inout] data The data structure with all necessary CG vectors preallocated
@param[inout] b The known right hand side vector
@param[inout] x The solution vector to be computed in future CG iteration
@param[inout] xexact The exact solution vector
@return returns 0 upon success and non-zero otherwise
@see GenerateGeometry
@see GenerateProblem
*/
#ifdef USE_CUDA
size_t OptimizeProblemGpu(SparseMatrix& A_in, CGData& data, Vector& b, Vector& x, Vector& xexact)
{
// This function can be used to completely transform any part of the data structures.
SparseMatrix* A = &A_in;
local_int_t numberOfMgLevels = 4;
local_int_t slice_size = A->slice_size;
for (int level = 0; level < numberOfMgLevels; ++level)
{
const local_int_t nrow = A->localNumberOfRows;
int totalColors = 8;
// Let's deal with perm and iperm
SetVectorAscCuda(A->ref2opt, nrow);
SetVectorAscCuda(A->opt2ref, nrow);
// Let us color the matrix
int num_colors = 0;
ColorMatrixCuda(NULL, A->gpuAux.columns, A->gpuAux.nnzPerRow, A->localNumberOfRows, A->gpuAux.color,
&(num_colors), A->gpuAux.colorCountCpu, 8, A->ref2opt, A->opt2ref, A->geom->rank, A->geom->nx, NULL);
A->totalColors = totalColors;
PermElemToSendCuda(A->totalToBeSent, A->gpuAux.elementsToSend, A->ref2opt);
// Create (S)ELL
local_int_t TranslateIndex = slice_size * HPCG_MAX_ROW_LEN;
local_int_t* translated_ell_col_index = A->sellAPermColumns + TranslateIndex;
double* translated_ell_values = A->sellAPermValues + TranslateIndex;
EllPermColumnsValuesCuda(nrow, A->gpuAux.nnzPerRow, A->gpuAux.columns, A->gpuAux.values,
A->gpuAux.csrAPermOffsets, translated_ell_col_index, translated_ell_values, A->opt2ref, A->ref2opt,
A->gpuAux.sellADiagonalIdx, A->gpuAux.csrLPermOffsets, A->gpuAux.csrUPermOffsets, false);
// Column-major blocked/sliced ELLPACK
TransposeCuda(nrow, slice_size, A->sellAPermColumns, A->sellAPermValues);
// Per block max row len
local_int_t num_slices = (nrow + slice_size - 1) / slice_size;
EllMaxRowLenPerBlockCuda(nrow, slice_size, A->gpuAux.csrLPermOffsets, A->gpuAux.csrUPermOffsets,
A->sellLSliceMrl, A->sellUSliceMrl);
// Find prefix sum for sliced ell
PrefixsumCuda(num_slices, A->sellLSliceMrl);
MultiplyBySliceSizeCUDA(num_slices, slice_size, A->sellLSliceMrl + 1);
PrefixsumCuda(num_slices, A->sellUSliceMrl);
MultiplyBySliceSizeCUDA(num_slices, slice_size, A->sellUSliceMrl + 1);
// Set the general matrix slice_offsets
CreateAMatrixSliceOffsetsCuda(num_slices + 1, A->slice_size, A->sellASliceMrl);
// Lower Upper ELL variant parts
CreateSellLUColumnsValuesCuda(nrow, slice_size, A->sellAPermColumns, A->sellAPermValues, A->sellLSliceMrl,
A->sellLPermColumns, A->sellLPermValues, A->sellUSliceMrl, A->sellUPermColumns, A->sellUPermValues, level);
local_int_t sell_slices = (nrow + slice_size - 1) / slice_size;
const local_int_t half_nnz = (A->localNumberOfNonzeros - nrow - A->extNnz) / 2;
local_int_t sell_l_nnz = 0;
cudaMemcpyAsync(
&sell_l_nnz, &(A->sellLSliceMrl[sell_slices]), sizeof(local_int_t), cudaMemcpyDeviceToHost, stream);
local_int_t sell_u_nnz = 0;
cudaMemcpyAsync(
&sell_u_nnz, &(A->sellUSliceMrl[sell_slices]), sizeof(local_int_t), cudaMemcpyDeviceToHost, stream);
auto INDEX_TYPE = CUSPARSE_INDEX_32I;
#ifdef INDEX_64 // In src/Geometry
INDEX_TYPE = CUSPARSE_INDEX_64I;
#endif
cusparseCreateSlicedEll(&(A->cusparseOpt.matL), nrow, nrow, half_nnz, sell_l_nnz, slice_size,
A->sellLSliceMrl, A->sellLPermColumns, A->sellLPermValues, INDEX_TYPE, INDEX_TYPE, CUSPARSE_INDEX_BASE_ZERO,
CUDA_R_64F);
cusparseCreateSlicedEll(&(A->cusparseOpt.matU), nrow, nrow, half_nnz, sell_u_nnz, slice_size,
A->sellUSliceMrl, A->sellUPermColumns, A->sellUPermValues, INDEX_TYPE, INDEX_TYPE, CUSPARSE_INDEX_BASE_ZERO,
CUDA_R_64F);
local_int_t sell_nnz = sell_slices * slice_size * HPCG_MAX_ROW_LEN;
cusparseCreateSlicedEll(&(A->cusparseOpt.matA), nrow, nrow, A->localNumberOfNonzeros, sell_nnz, slice_size,
A->sellASliceMrl, A->sellAPermColumns, A->sellAPermValues, INDEX_TYPE, INDEX_TYPE, CUSPARSE_INDEX_BASE_ZERO,
CUDA_R_64F);
double alpha = 1.0, beta = 0.0;
size_t e_buf_size = 0;
size_t l_buf_size = 0, u_buf_size = 0, i_buf_size = 0, max_buf_size = 0;
cusparseDnVecDescr_t dummy1, dummy2;
cusparseCreateDnVec(&dummy1, nrow, x.values_d, CUDA_R_64F);
cusparseCreateDnVec(&dummy2, nrow, b.values_d, CUDA_R_64F);
cusparseCreateDnVec(&(A->cusparseOpt.vecX), nrow, x.values_d, CUDA_R_64F);
cusparseCreateDnVec(&(A->cusparseOpt.vecY), nrow, b.values_d, CUDA_R_64F);
max_buf_size = e_buf_size;
// MV
// Lower
cusparseSpMV_bufferSize(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matL, dummy1,
&beta, dummy2, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &l_buf_size);
cusparseSpMV_bufferSize(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matU, dummy1,
&beta, dummy2, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &u_buf_size);
cusparseSpMV_bufferSize(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matA, dummy1,
&beta, dummy2, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &i_buf_size);
max_buf_size = std::max(std::max(i_buf_size, e_buf_size), std::max(u_buf_size, l_buf_size));
// SV
// Lower
size_t buffer_size_sv_l, buffer_size_sv_u;
cusparseFillMode_t fillmode_l = CUSPARSE_FILL_MODE_LOWER;
cusparseFillMode_t fillmode_u = CUSPARSE_FILL_MODE_UPPER;
cusparseDiagType_t diagtype = CUSPARSE_DIAG_TYPE_NON_UNIT;
cusparseSpSV_createDescr(&A->cusparseOpt.spsvDescrL);
cusparseSpSV_createDescr(&A->cusparseOpt.spsvDescrU);
cusparseSpMatSetAttribute(A->cusparseOpt.matL, CUSPARSE_SPMAT_DIAG_TYPE, &(diagtype), sizeof(diagtype));
cusparseSpMatSetAttribute(A->cusparseOpt.matL, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
if (!Use_Hpcg_Mem_Reduction || (nrow % 8 != 0))
{
cusparseSpSV_bufferSize(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matL,
A->cusparseOpt.vecX, A->cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT,
A->cusparseOpt.spsvDescrL, &buffer_size_sv_l);
cudaMalloc(&A->bufferSvL, buffer_size_sv_l);
}
cusparseSpSV_analysis(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matL,
A->cusparseOpt.vecX, A->cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A->cusparseOpt.spsvDescrL,
A->bufferSvL);
cusparseSpSV_updateMatrix(
cusparsehandle, A->cusparseOpt.spsvDescrL, A->diagonal, CUSPARSE_SPSV_UPDATE_DIAGONAL);
cusparseSpMatSetAttribute(A->cusparseOpt.matU, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
if (!Use_Hpcg_Mem_Reduction || (nrow % 8 != 0))
{
cusparseSpSV_bufferSize(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matU,
A->cusparseOpt.vecX, A->cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT,
A->cusparseOpt.spsvDescrU, &buffer_size_sv_u);
cudaMalloc(&A->bufferSvU, buffer_size_sv_u);
}
cusparseSpSV_analysis(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matU,
A->cusparseOpt.vecX, A->cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A->cusparseOpt.spsvDescrU,
A->bufferSvU);
cusparseSpSV_updateMatrix(
cusparsehandle, A->cusparseOpt.spsvDescrU, A->diagonal, CUSPARSE_SPSV_UPDATE_DIAGONAL);
if (max_buf_size > 0)
cudaMalloc(&(A->bufferMvA), max_buf_size);
cusparseDestroyDnVec(dummy1);
cusparseDestroyDnVec(dummy2);
// //////////////////////////////////////////////////////////////////////////
A = A->Ac;
}
A = &A_in;
for (int level = 1; level < numberOfMgLevels; ++level)
{
const local_int_t nrow_c = A->Ac->localNumberOfRows;
const local_int_t nrow_f = A->localNumberOfRows;
F2cPermCuda(nrow_c, A->gpuAux.f2c, A->f2cPerm, A->ref2opt, A->Ac->opt2ref);
A = A->Ac;
}
return 0;
}
#endif
#ifdef USE_GRACE
size_t OptimizeProblemCpu(SparseMatrix& A_in, CGData& data, Vector& b, Vector& x, Vector& xexact)
{
// Initialize data structures
size_t mem = AllocateMemCpu(A_in);
SparseMatrix* A = &A_in;
local_int_t numberOfMgLevels = 4;
local_int_t slice_size = A->slice_size;
for (int level = 0; level < numberOfMgLevels; ++level)
{
// Color the matrix
int num_colors;
ColorMatrixCpu(*A, &num_colors);
A->totalColors = num_colors;
// Compute when each color starts
A->cpuAux.firstRowOfColor[0] = 0;
for (int c = 1; c < A->totalColors; c++)
{
A->cpuAux.firstRowOfColor[c] = A->cpuAux.firstRowOfColor[c - 1] + A->cpuAux.nRowsWithColor[c - 1];
}
// Reorder the matrix
CreateSellPermCpu(*A);
#ifndef HPCG_NO_MPI
// Translate row IDs that will be sent to neighbors
#pragma omp parallel for
for (local_int_t i = 0; i < A->totalToBeSent; i++)
{
local_int_t orig = A->elementsToSend[i];
A->elementsToSend[i] = A->ref2opt[orig];
}
#endif
local_int_t numberOfNonzerosPerRow = HPCG_MAX_ROW_LEN;
local_int_t nrow = A->localNumberOfRows;
local_int_t half_nnz = (A->localNumberOfNonzeros - nrow - A->extNnz) / 2;
local_int_t num_slices = (nrow + slice_size - 1) / slice_size;
local_int_t sell_l_nnz = A->sellLSliceMrl[num_slices];
local_int_t sell_u_nnz = A->sellUSliceMrl[num_slices];
local_int_t sell_nnz = num_slices * slice_size * numberOfNonzerosPerRow;
auto INDEX_TYPE = NVPL_SPARSE_INDEX_32I;
#ifdef INDEX_64 // In src/Geometry
INDEX_TYPE = NVPL_SPARSE_INDEX_64I;
#endif
nvpl_sparse_create_sliced_ell(&(A->nvplSparseOpt.matL), nrow, nrow, half_nnz, sell_l_nnz, slice_size,
A->sellLSliceMrl, A->sellLPermColumns, A->sellLPermValues, INDEX_TYPE, INDEX_TYPE,
NVPL_SPARSE_INDEX_BASE_ZERO, NVPL_SPARSE_R_64F);
nvpl_sparse_create_sliced_ell(&(A->nvplSparseOpt.matU), nrow, nrow, half_nnz, sell_u_nnz, slice_size,
A->sellUSliceMrl, A->sellUPermColumns, A->sellUPermValues, INDEX_TYPE, INDEX_TYPE,
NVPL_SPARSE_INDEX_BASE_ZERO, NVPL_SPARSE_R_64F);
nvpl_sparse_create_sliced_ell(&(A->nvplSparseOpt.matA), nrow, nrow, A->localNumberOfNonzeros, sell_nnz,
slice_size, A->sellASliceMrl, A->sellAPermColumns, A->sellAPermValues, INDEX_TYPE, INDEX_TYPE,
NVPL_SPARSE_INDEX_BASE_ZERO, NVPL_SPARSE_R_64F);
double alpha = 1.0, beta = 0.0;
size_t e_buf_size = 0;
size_t l_buf_size = 0, u_buf_size = 0, i_buf_size = 0, max_buf_size = 0;
nvpl_sparse_create_dn_vec(&(A->nvplSparseOpt.vecX), nrow, x.values, NVPL_SPARSE_R_64F);
nvpl_sparse_create_dn_vec(&(A->nvplSparseOpt.vecY), nrow, b.values, NVPL_SPARSE_R_64F);
max_buf_size = e_buf_size;
// MV
// Lower
nvpl_sparse_spmv_create_descr(&A->nvplSparseOpt.spmvLDescr);
nvpl_sparse_spmv_buffer_size(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
A->nvplSparseOpt.matL, A->nvplSparseOpt.vecX, &beta, A->nvplSparseOpt.vecY, A->nvplSparseOpt.vecY,
NVPL_SPARSE_R_64F, NVPL_SPARSE_SPMV_ALG_DEFAULT, A->nvplSparseOpt.spmvLDescr, &l_buf_size);
// Upper
nvpl_sparse_spmv_create_descr(&A->nvplSparseOpt.spmvUDescr);
nvpl_sparse_spmv_buffer_size(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
A->nvplSparseOpt.matU, A->nvplSparseOpt.vecX, &beta, A->nvplSparseOpt.vecY, A->nvplSparseOpt.vecY,
NVPL_SPARSE_R_64F, NVPL_SPARSE_SPMV_ALG_DEFAULT, A->nvplSparseOpt.spmvUDescr, &u_buf_size);
// L+D+U
nvpl_sparse_spmv_create_descr(&A->nvplSparseOpt.spmvADescr);
nvpl_sparse_spmv_buffer_size(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
A->nvplSparseOpt.matA, A->nvplSparseOpt.vecX, &beta, A->nvplSparseOpt.vecY, A->nvplSparseOpt.vecY,
NVPL_SPARSE_R_64F, NVPL_SPARSE_SPMV_ALG_DEFAULT, A->nvplSparseOpt.spmvADescr, &i_buf_size);
max_buf_size = std::max(std::max(i_buf_size, e_buf_size), std::max(u_buf_size, l_buf_size));
// SV
// Lower
size_t buffer_size_sv_l, buffer_size_sv_u;
nvpl_sparse_fill_mode_t fillmode_l = NVPL_SPARSE_FILL_MODE_LOWER;
nvpl_sparse_fill_mode_t fillmode_u = NVPL_SPARSE_FILL_MODE_UPPER;
nvpl_sparse_diag_type_t diagtype = NVPL_SPARSE_DIAG_TYPE_NON_UNIT;
nvpl_sparse_spsv_create_descr(&A->nvplSparseOpt.spsvDescrL);
nvpl_sparse_spsv_create_descr(&A->nvplSparseOpt.spsvDescrU);
nvpl_sparse_sp_mat_set_attribute(
A->nvplSparseOpt.matL, NVPL_SPARSE_SPMAT_DIAG_TYPE, &(diagtype), sizeof(diagtype));
nvpl_sparse_sp_mat_set_attribute(
A->nvplSparseOpt.matL, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
Vector origDiagA;
InitializeVector(origDiagA, A->localNumberOfRows, CPU);
CopyMatrixDiagonal(*A, origDiagA);
// Pass strictly L, and then update the diagonal
if (!Use_Hpcg_Mem_Reduction || A->localNumberOfRows % 8 != 0)
{
nvpl_sparse_sp_mat_set_attribute(
A->nvplSparseOpt.matA, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
nvpl_sparse_spsv_buffer_size(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
A->nvplSparseOpt.matA, A->nvplSparseOpt.vecX, A->nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
NVPL_SPARSE_SPSV_ALG_DEFAULT, A->nvplSparseOpt.spsvDescrL, &buffer_size_sv_l);
A->bufferSvL = new char[buffer_size_sv_l];
mem += buffer_size_sv_l;
nvpl_sparse_spsv_analysis(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
A->nvplSparseOpt.matA, A->nvplSparseOpt.vecX, A->nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
NVPL_SPARSE_SPSV_ALG_DEFAULT, A->nvplSparseOpt.spsvDescrL, A->bufferSvL);
}
else
{
nvpl_sparse_spsv_analysis(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
A->nvplSparseOpt.matL, A->nvplSparseOpt.vecX, A->nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
NVPL_SPARSE_SPSV_ALG_DEFAULT, A->nvplSparseOpt.spsvDescrL, A->bufferSvL);
nvpl_sparse_spsv_update_matrix(
nvpl_sparse_handle, A->nvplSparseOpt.spsvDescrL, origDiagA.values, NVPL_SPARSE_SPSV_UPDATE_DIAGONAL);
}
// Pass strictly U, and then update the diagonal
nvpl_sparse_sp_mat_set_attribute(
A->nvplSparseOpt.matU, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
if (!Use_Hpcg_Mem_Reduction || A->localNumberOfRows % 8 != 0)
{
nvpl_sparse_sp_mat_set_attribute(
A->nvplSparseOpt.matA, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
nvpl_sparse_spsv_buffer_size(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
A->nvplSparseOpt.matA, A->nvplSparseOpt.vecX, A->nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
NVPL_SPARSE_SPSV_ALG_DEFAULT, A->nvplSparseOpt.spsvDescrU, &buffer_size_sv_u);
A->bufferSvU = new char[buffer_size_sv_u];
mem += buffer_size_sv_u;
nvpl_sparse_spsv_analysis(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
A->nvplSparseOpt.matA, A->nvplSparseOpt.vecX, A->nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
NVPL_SPARSE_SPSV_ALG_DEFAULT, A->nvplSparseOpt.spsvDescrU, A->bufferSvU);
}
else
{
nvpl_sparse_spsv_analysis(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
A->nvplSparseOpt.matU, A->nvplSparseOpt.vecX, A->nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
NVPL_SPARSE_SPSV_ALG_DEFAULT, A->nvplSparseOpt.spsvDescrU, A->bufferSvU);
nvpl_sparse_spsv_update_matrix(
nvpl_sparse_handle, A->nvplSparseOpt.spsvDescrU, origDiagA.values, NVPL_SPARSE_SPSV_UPDATE_DIAGONAL);
}
DeleteVector(origDiagA);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
A = A->Ac;
}
A = &A_in;
for (int level = 1; level < numberOfMgLevels; level++)
{
local_int_t nrow_c = A->Ac->localNumberOfRows;
local_int_t nrow_f = A->localNumberOfRows;
// Permute space injector operator
F2cPermCpu(nrow_c, A->mgData->f2cOperator, A->f2cPerm, A->ref2opt, A->Ac->opt2ref);
A = A->Ac;
}
return mem;
}
#endif // USE_GRACE
size_t OptimizeProblem(SparseMatrix& A_in, CGData& data, Vector& b, Vector& x, Vector& xexact)
{
size_t result = 0;
if (A_in.rankType == GPU)
{
#ifdef USE_CUDA
result = OptimizeProblemGpu(A_in, data, b, x, xexact);
#endif
}
else
{
#ifdef USE_GRACE
result = OptimizeProblemCpu(A_in, data, b, x, xexact);
#endif
}
return result;
}
// Helper function (see OptimizeProblem.hpp for details)
double OptimizeProblemMemoryUse(const SparseMatrix& A)
{
return 0.0;
}
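The Sliced-ELL descriptors created above (`cusparseCreateSlicedEll` and `nvpl_sparse_create_sliced_ell`) consume a per-slice offset array (`sellLSliceMrl`, `sellUSliceMrl`, `sellASliceMrl`), whose entry `num_slices` is the total padded nonzero count passed as `sell_l_nnz`/`sell_u_nnz`/`sell_nnz`. A minimal CPU sketch of how such offsets follow from per-row lengths; `SlicedEllOffsets` is a hypothetical helper for illustration, not part of HPCG:

```cpp
#include <algorithm>
#include <vector>

// Builds the per-slice offset array ("SliceMrl" in the source) for a
// Sliced-ELL layout: entry s is the start of slice s in the packed
// value/column arrays, and entry numSlices is the total padded nnz.
std::vector<long> SlicedEllOffsets(const std::vector<int>& rowLen, int sliceSize)
{
    const int nrow = static_cast<int>(rowLen.size());
    const int numSlices = (nrow + sliceSize - 1) / sliceSize;
    std::vector<long> offsets(numSlices + 1, 0);
    for (int s = 0; s < numSlices; ++s)
    {
        int mrl = 0; // max row length within this slice
        for (int r = s * sliceSize; r < std::min(nrow, (s + 1) * sliceSize); ++r)
            mrl = std::max(mrl, rowLen[r]);
        // every row of the slice (including padding rows) stores mrl entries
        offsets[s + 1] = offsets[s] + static_cast<long>(sliceSize) * mrl;
    }
    return offsets;
}
```

This also shows why `sell_nnz = num_slices * slice_size * HPCG_MAX_ROW_LEN` is an upper bound: it assumes every slice has the maximum row length.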
src/OptimizeProblem.hpp (new file, 30 lines)
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef OPTIMIZEPROBLEM_HPP
#define OPTIMIZEPROBLEM_HPP
#include "CGData.hpp"
#include "SparseMatrix.hpp"
#include "Vector.hpp"
size_t OptimizeProblem(SparseMatrix& A, CGData& data, Vector& b, Vector& x, Vector& xexact);
// This helper function should be implemented in a non-trivial way if OptimizeProblem is non-trivial
// It should return as type double, the total number of bytes allocated and retained after calling OptimizeProblem.
// This value will be used to report Gbytes used in ReportResults (the value returned will be divided by 1000000000.0).
double OptimizeProblemMemoryUse(const SparseMatrix& A);
#endif // OPTIMIZEPROBLEM_HPP
src/OutputFile.cpp (new file, 176 lines)
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <fstream>
#include <iostream>
#include <list>
#include <sstream>
#include <string>
#include "OutputFile.hpp"
using std::string;
using std::stringstream;
using std::list;
using std::ofstream;
extern int use_output_file;
OutputFile::OutputFile(const string& name_arg, const string& version_arg)
: name(name_arg)
, version(version_arg)
, eol("\n")
, keySeparator("::")
{
}
OutputFile::OutputFile(void)
: eol("\n")
, keySeparator("::")
{
}
OutputFile::~OutputFile()
{
for (list<OutputFile*>::iterator it = descendants.begin(); it != descendants.end(); ++it)
{
delete *it;
}
}
void OutputFile::add(const string& key_arg, const string& value_arg)
{
descendants.push_back(allocKeyVal(key_arg, value_arg));
}
void OutputFile::add(const string& key_arg, double value_arg)
{
stringstream ss;
ss << value_arg;
descendants.push_back(allocKeyVal(key_arg, ss.str()));
}
void OutputFile::add(const string& key_arg, int value_arg)
{
stringstream ss;
ss << value_arg;
descendants.push_back(allocKeyVal(key_arg, ss.str()));
}
#ifndef HPCG_NO_LONG_LONG
void OutputFile::add(const string& key_arg, long long value_arg)
{
stringstream ss;
ss << value_arg;
descendants.push_back(allocKeyVal(key_arg, ss.str()));
}
#endif
void OutputFile::add(const string& key_arg, size_t value_arg)
{
stringstream ss;
ss << value_arg;
descendants.push_back(allocKeyVal(key_arg, ss.str()));
}
void OutputFile::setKeyValue(const string& key_arg, const string& value_arg)
{
key = key_arg;
value = value_arg;
}
OutputFile* OutputFile::get(const string& key_arg)
{
for (list<OutputFile*>::iterator it = descendants.begin(); it != descendants.end(); ++it)
{
if ((*it)->key == key_arg)
return *it;
}
return 0;
}
string OutputFile::generateRecursive(string prefix)
{
string result = "";
result += prefix + key + "=" + value + eol;
for (list<OutputFile*>::iterator it = descendants.begin(); it != descendants.end(); ++it)
{
result += (*it)->generateRecursive(prefix + key + keySeparator);
}
return result;
}
string OutputFile::generate(void)
{
string result = name + "\nversion=" + version + eol;
for (list<OutputFile*>::iterator it = descendants.begin(); it != descendants.end(); ++it)
{
result += (*it)->generateRecursive("");
}
time_t rawtime;
time(&rawtime);
tm* ptm = localtime(&rawtime);
char sdate[64];
// use tm_mon+1 because tm_mon is 0 .. 11 instead of 1 .. 12
sprintf(sdate, "%04d-%02d-%02d_%02d-%02d-%02d", ptm->tm_year + 1900, ptm->tm_mon + 1, ptm->tm_mday, ptm->tm_hour,
ptm->tm_min, ptm->tm_sec);
string filename = name + "_" + version + "_";
filename += string(sdate) + ".txt";
if (use_output_file)
{
ofstream myfile(filename.c_str());
myfile << result;
myfile.close();
}
else
{
std::cout << result << std::flush;
}
return result;
}
OutputFile* OutputFile::allocKeyVal(const std::string& key_arg, const std::string& value_arg)
{
OutputFile* of = new OutputFile();
of->setKeyValue(key_arg, value_arg);
return of;
}
src/OutputFile.hpp (new file, 161 lines)
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file OutputFile.hpp
HPCG output file classes
*/
#ifndef OUTPUTFILE_HPP
#define OUTPUTFILE_HPP
#include <list>
#include <string>
//! The OutputFile class for the uniform collecting and reporting of performance data for HPCG
/*!
The OutputFile class facilitates easy collecting and reporting of
key-value-formatted data that can be then registered with the HPCG results
collection website. The keys may have hierarchy key1::key2::key3=val with
double colon :: as a separator. A sample output may look like this (note how
"major" and "micro" keys repeat with different ancestor keys):
\code
version=3.2.1alpha
version::major=3
version::minor=2
version::micro=1
version::release=alpha
axis=xyz
axis::major=x
axis::minor=y
\endcode
*/
class OutputFile
{
protected:
std::list<OutputFile*> descendants; //!< descendant elements
std::string name; //!< name of the benchmark
std::string version; //!< version of the benchmark
std::string key; //!< the key under which the element is stored
std::string value; //!< the value of the stored element
std::string eol; //!< end-of-line character sequence in the output file
std::string keySeparator; //!< character sequence to separate keys in the output file
//! Recursively generate output string from descendant list, and their descendants and so on
std::string generateRecursive(std::string prefix);
public:
static OutputFile* allocKeyVal(const std::string& key, const std::string& value);
//! Constructor: accepts name and version as strings that are used to create a file name for printing results.
/*!
This constructor accepts and name and version number for the benchmark that
are used to form a file name information for results that are generated by
the generate() method.
\param name (in) string containing name of the benchmark
\param version (in) string containing the version of the benchmark
*/
OutputFile(const std::string& name, const std::string& version);
//! Default constructor: no-arguments accepted, should be used for descendant nodes
/*!
This no-argument constructor can be used for descendant nodes to provide
key1::key2::key3=val output. Unlike the root node, descendant nodes do not
have name and version but only store key-value pairs.
*/
OutputFile(void);
~OutputFile();
//! Create and add a descendant element with value of type "string"
/*!
Create and add a descendant element identified by "key" and associated with
"value". The element is added at the end of a list of previously added
elements.
@param[in] key The key that identifies the added element and under which the element is stored
@param[in] value The value stored by the element
*/
void add(const std::string& key, const std::string& value);
//! Create and add a descendant element with value of type "double"
/*!
Create and add a descendant element identified by "key" and associated with
"value". The element is added at the end of a list of previously added
elements.
@param[in] key The key that identifies the added element and under which the element is stored
@param[in] value The value stored by the element
*/
void add(const std::string& key, double value);
//! Create and add a descendant element with value of type "int"
/*!
Create and add a descendant element identified by "key" and associated with
"value". The element is added at the end of a list of previously added
elements.
@param[in] key The key that identifies the added element and under which the element is stored
@param[in] value The value stored by the element
*/
void add(const std::string& key, int value);
#ifndef HPCG_NO_LONG_LONG
//! Create and add a descendant element with value of type "long long"
/*!
Create and add a descendant element identified by "key" and associated with
"value". The element is added at the end of a list of previously added
elements.
@param[in] key The key that identifies the added element and under which the element is stored
@param[in] value The value stored by the element
*/
void add(const std::string& key, long long value);
#endif
//! Create and add a descendant element with value of type "size_t"
/*!
Create and add a descendant element identified by "key" and associated with
"value". The element is added at the end of a list of previously added
elements.
@param[in] key The key that identifies the added element and under which the element is stored
@param[in] value The value stored by the element
*/
void add(const std::string& key, size_t value);
//! Key-Value setter method
/*!
Set the key and the value of this element.
@param[in] key The key that identifies this element and under which the element is stored
@param[in] value The value stored by the element
*/
void setKeyValue(const std::string& key, const std::string& value);
//! Get the element in the list with the given key or return NULL if not found
OutputFile* get(const std::string& key);
//! Generate output string with results based on the stored key-value hierarchy
std::string generate(void);
};
#endif // OUTPUTFILE_HPP
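`generateRecursive` above flattens the descendant tree into `key1::key2=value` lines, appending the parent's key plus the `::` separator to the prefix at each level. A self-contained sketch of the same flattening; `Node` and `Flatten` are illustrative names, not the HPCG classes:

```cpp
#include <string>
#include <vector>

// One element of the key-value hierarchy, mirroring OutputFile's
// key/value/descendants members.
struct Node
{
    std::string key, value;
    std::vector<Node> children;
};

// Emits "prefix + key = value", then recurses with "prefix + key + ::",
// exactly the shape of OutputFile::generateRecursive.
std::string Flatten(const Node& n, const std::string& prefix = "")
{
    std::string out = prefix + n.key + "=" + n.value + "\n";
    for (const Node& c : n.children)
        out += Flatten(c, prefix + n.key + "::"); // descend with "::" separator
    return out;
}
```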
src/ReadHpcgDat.cpp (new file, 79 lines)
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#include <cstdio>
#include "ReadHpcgDat.hpp"
static int SkipUntilEol(FILE* stream)
{
int chOrEof;
bool finished;
do
{
chOrEof = fgetc(stream);
finished = (chOrEof == EOF) || (chOrEof == '\n') || (chOrEof == '\r');
} while (!finished);
if ('\r' == chOrEof)
{ // on Windows, \r might be followed by \n
int chOrEofExtra = fgetc(stream);
if ('\n' == chOrEofExtra || EOF == chOrEofExtra)
chOrEof = chOrEofExtra;
else
ungetc(chOrEofExtra, stream);
}
return chOrEof;
}
int ReadHpcgDat(int* localDimensions, int* secondsPerRun, int* localProcDimensions, char* filename)
{
FILE* hpcgStream = fopen(filename, "r");
if (!hpcgStream)
{
printf("Cannot open input file: %s\n", filename);
return -1;
}
SkipUntilEol(hpcgStream); // skip the first line
SkipUntilEol(hpcgStream); // skip the second line
for (int i = 0; i < 3; ++i)
if (fscanf(hpcgStream, "%d", localDimensions + i) != 1 || localDimensions[i] < 16)
localDimensions[i] = 16;
SkipUntilEol(hpcgStream); // skip the rest of the third line
if (secondsPerRun != 0)
{ // Only read number of seconds if the pointer is non-zero
if (fscanf(hpcgStream, "%d", secondsPerRun) != 1 || secondsPerRun[0] < 0)
secondsPerRun[0] = 30 * 60; // 30 minutes
}
SkipUntilEol(hpcgStream); // skip the rest of the fourth line
for (int i = 0; i < 3; ++i)
// use 0 when the user didn't specify process dimensions (or the values are invalid)
if (fscanf(hpcgStream, "%d", localProcDimensions + i) != 1 || localProcDimensions[i] < 1)
localProcDimensions[i] = 0; // value 0 means: "not specified" and it will be fixed later
fclose(hpcgStream);
return 0;
}
src/ReadHpcgDat.hpp (new file, 20 lines)
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef READHPCGDAT_HPP
#define READHPCGDAT_HPP
int ReadHpcgDat(int* localDimensions, int* secondsPerRun, int* localProcDimensions, char* filename);
#endif // READHPCGDAT_HPP
src/ReportResults.cpp (new file, 512 lines)
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file ReportResults.cpp
HPCG routine
*/
#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif
#include "OptimizeProblem.hpp"
#include "OutputFile.hpp"
#include "ReportResults.hpp"
#include <vector>
#ifdef HPCG_DEBUG
#include <fstream>
using std::endl;
#include "hpcg.hpp"
#endif
extern int use_output_file;
/*!
Creates a YAML file and writes the information about the HPCG run, its results, and validity.
@param[in] A The known system matrix
@param[in] numberOfMgLevels Number of levels in the multigrid V cycle
@param[in] numberOfCgSets Number of CG runs performed
@param[in] refMaxIters Number of reference CG iterations performed per set
@param[in] optMaxIters Number of optimized CG iterations performed per set
@param[in] times Vector of cumulative timings for each of the phases of a preconditioned CG iteration
@param[in] testcg_data the data structure with the results of the CG-correctness test including pass/fail information
@param[in] testsymmetry_data the data structure with the results of the CG symmetry test including pass/fail information
@param[in] testnorms_data the data structure with the results of the CG norm test including pass/fail information
@param[in] global_failure indicates whether a failure occurred during the correctness tests of CG
@param[in] quickPath indicates whether the run used the quickPath (reduced run time) option
@see YAML_Doc
*/
void ReportResults(const SparseMatrix& A, int numberOfMgLevels, int numberOfCgSets, int refMaxIters, int optMaxIters,
double times[], const TestCGData& testcg_data, const TestSymmetryData& testsymmetry_data,
const TestNormsData& testnorms_data, int global_failure, bool quickPath)
{
double minOfficialTime = 1800; // Any official benchmark result must run at least this many seconds
#ifndef HPCG_NO_MPI
double t4 = times[4];
double t4min = 0.0;
double t4max = 0.0;
double t4avg = 0.0;
MPI_Allreduce(&t4, &t4min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
MPI_Allreduce(&t4, &t4max, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
MPI_Allreduce(&t4, &t4avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
t4avg = t4avg / ((double) A.geom->size);
#endif
if (A.geom->rank == 0)
{ // Only PE 0 needs to compute and report timing results
// TODO: Put the FLOP count, Memory BW and Memory Usage models into separate functions
// ======================== FLOP count model =======================================
double fNumberOfCgSets = numberOfCgSets;
double fniters = fNumberOfCgSets * (double) optMaxIters;
double fnrow = A.totalNumberOfRows;
double fnnz = A.totalNumberOfNonzeros;
// Op counts come from implementation of CG in CG.cpp (include 1 extra for the CG preamble ops)
double fnops_ddot = (3.0 * fniters + fNumberOfCgSets) * 2.0 * fnrow; // 3 ddots with nrow adds and nrow mults
double fnops_waxpby
= (3.0 * fniters + fNumberOfCgSets) * 2.0 * fnrow; // 3 WAXPBYs with nrow adds and nrow mults
double fnops_sparsemv = (fniters + fNumberOfCgSets) * 2.0 * fnnz; // 1 SpMV with nnz adds and nnz mults
// Op counts from the multigrid preconditioners
double fnops_precond = 0.0;
const SparseMatrix* Af = &A;
for (int i = 1; i < numberOfMgLevels; ++i)
{
double fnnz_Af = Af->totalNumberOfNonzeros;
double fnumberOfPresmootherSteps = Af->mgData->numberOfPresmootherSteps;
double fnumberOfPostsmootherSteps = Af->mgData->numberOfPostsmootherSteps;
fnops_precond += fnumberOfPresmootherSteps * fniters * 4.0 * fnnz_Af; // number of presmoother flops
fnops_precond += fniters * 2.0 * fnnz_Af; // cost of fine grid residual calculation
fnops_precond += fnumberOfPostsmootherSteps * fniters * 4.0 * fnnz_Af; // number of postsmoother flops
Af = Af->Ac; // Go to next coarse level
}
fnops_precond
+= fniters * 4.0 * ((double) Af->totalNumberOfNonzeros); // One symmetric GS sweep at the coarsest level
double fnops = fnops_ddot + fnops_waxpby + fnops_sparsemv + fnops_precond;
double frefnops = fnops * ((double) refMaxIters) / ((double) optMaxIters);
// ======================== Memory bandwidth model =======================================
// Read/Write counts come from implementation of CG in CG.cpp (include 1 extra for the CG preamble ops)
double fnreads_ddot
= (3.0 * fniters + fNumberOfCgSets) * 2.0 * fnrow * sizeof(double); // 3 ddots with 2 nrow reads
double fnwrites_ddot = (3.0 * fniters + fNumberOfCgSets) * sizeof(double); // 3 ddots with 1 write
double fnreads_waxpby = (3.0 * fniters + fNumberOfCgSets) * 2.0 * fnrow
* sizeof(double); // 3 WAXPBYs with nrow adds and nrow mults
double fnwrites_waxpby
= (3.0 * fniters + fNumberOfCgSets) * fnrow * sizeof(double); // 3 WAXPBYs with nrow adds and nrow mults
double fnreads_sparsemv = (fniters + fNumberOfCgSets)
* (fnnz * (sizeof(double) + sizeof(local_int_t))
+ fnrow * sizeof(double)); // 1 SpMV with nnz reads of values, nnz reads indices,
// plus nrow reads of x
double fnwrites_sparsemv = (fniters + fNumberOfCgSets) * fnrow * sizeof(double); // 1 SpMV nrow writes
// Op counts from the multigrid preconditioners
double fnreads_precond = 0.0;
double fnwrites_precond = 0.0;
Af = &A;
for (int i = 1; i < numberOfMgLevels; ++i)
{
double fnnz_Af = Af->totalNumberOfNonzeros;
double fnrow_Af = Af->totalNumberOfRows;
double fnumberOfPresmootherSteps = Af->mgData->numberOfPresmootherSteps;
double fnumberOfPostsmootherSteps = Af->mgData->numberOfPostsmootherSteps;
fnreads_precond += fnumberOfPresmootherSteps * fniters
* (2.0 * fnnz_Af * (sizeof(double) + sizeof(local_int_t))
+ fnrow_Af * sizeof(double)); // number of presmoother reads
fnwrites_precond
+= fnumberOfPresmootherSteps * fniters * fnrow_Af * sizeof(double); // number of presmoother writes
fnreads_precond += fniters
* (fnnz_Af * (sizeof(double) + sizeof(local_int_t))
+ fnrow_Af * sizeof(double)); // Number of reads for fine grid residual calculation
fnwrites_precond
+= fniters * fnrow_Af * sizeof(double); // Number of writes for fine grid residual calculation
fnreads_precond += fnumberOfPostsmootherSteps * fniters
* (2.0 * fnnz_Af * (sizeof(double) + sizeof(local_int_t))
+ fnrow_Af * sizeof(double)); // number of postsmoother reads
fnwrites_precond
+= fnumberOfPostsmootherSteps * fniters * fnrow_Af * sizeof(double); // number of postsmoother writes
Af = Af->Ac; // Go to next coarse level
}
double fnnz_Af = Af->totalNumberOfNonzeros;
double fnrow_Af = Af->totalNumberOfRows;
fnreads_precond += fniters
* (2.0 * fnnz_Af * (sizeof(double) + sizeof(local_int_t))
+ fnrow_Af * sizeof(double)); // One symmetric GS sweep at the coarsest level
fnwrites_precond += fniters * fnrow_Af * sizeof(double); // One symmetric GS sweep at the coarsest level
double fnreads = fnreads_ddot + fnreads_waxpby + fnreads_sparsemv + fnreads_precond;
double fnwrites = fnwrites_ddot + fnwrites_waxpby + fnwrites_sparsemv + fnwrites_precond;
double frefnreads = fnreads * ((double) refMaxIters) / ((double) optMaxIters);
double frefnwrites = fnwrites * ((double) refMaxIters) / ((double) optMaxIters);
// ======================== Memory usage model =======================================
// Data in GenerateProblem_ref
double numberOfNonzerosPerRow
= 27.0; // We are approximating a 27-point finite element/volume/difference 3D stencil
double size = ((double) A.geom->size); // Needed for estimating size of halo
double fnbytes = ((double) sizeof(Geometry)); // Geometry struct in main.cpp
fnbytes += ((double) sizeof(double) * fNumberOfCgSets); // testnorms_data in main.cpp
// Model for GenerateProblem_ref.cpp
fnbytes += fnrow * sizeof(char); // array nonzerosInRow
fnbytes += fnrow * ((double) sizeof(global_int_t*)); // mtxIndG
fnbytes += fnrow * ((double) sizeof(local_int_t*)); // mtxIndL
fnbytes += fnrow * ((double) sizeof(double*)); // matrixValues
fnbytes += fnrow * ((double) sizeof(double*)); // matrixDiagonal
fnbytes += fnrow * numberOfNonzerosPerRow * ((double) sizeof(local_int_t)); // mtxIndL[1..nrows]
fnbytes += fnrow * numberOfNonzerosPerRow * ((double) sizeof(double)); // matrixValues[1..nrows]
fnbytes += fnrow * numberOfNonzerosPerRow * ((double) sizeof(global_int_t)); // mtxIndG[1..nrows]
fnbytes += fnrow * ((double) 3 * sizeof(double)); // x, b, xexact
// Model for CGData.hpp
double fncol = ((global_int_t) A.localNumberOfColumns)
* size; // Estimate of the global number of columns using the value from rank 0
fnbytes += fnrow * ((double) 2 * sizeof(double)); // r, Ap
fnbytes += fncol * ((double) 2 * sizeof(double)); // z, p
std::vector<double> fnbytesPerLevel(numberOfMgLevels); // Count byte usage per level (level 0 is main CG level)
fnbytesPerLevel[0] = fnbytes;
// Benchmarker-provided model for OptimizeProblem.cpp
double fnbytes_OptimizedProblem = OptimizeProblemMemoryUse(A);
fnbytes += fnbytes_OptimizedProblem;
Af = A.Ac;
for (int i = 1; i < numberOfMgLevels; ++i)
{
double fnrow_Af = Af->totalNumberOfRows;
double fncol_Af = ((global_int_t) Af->localNumberOfColumns)
* size; // Estimate of the global number of columns using the value from rank 0
double fnbytes_Af = 0.0;
// Model for GenerateCoarseProblem.cpp
fnbytes_Af += fnrow_Af * ((double) sizeof(local_int_t)); // f2cOperator
fnbytes_Af += fnrow_Af * ((double) sizeof(double)); // rc
fnbytes_Af += 2.0 * fncol_Af
* ((double) sizeof(double)); // xc, Axf are estimated based on the size of these arrays on rank 0
fnbytes_Af += ((double) (sizeof(Geometry) + sizeof(SparseMatrix) + 3 * sizeof(Vector)
+ sizeof(MGData))); // Account for structs geomc, Ac, rc, xc, Axf - (minor)
// Model for GenerateProblem.cpp (called within GenerateCoarseProblem.cpp)
fnbytes_Af += fnrow_Af * sizeof(char); // array nonzerosInRow
fnbytes_Af += fnrow_Af * ((double) sizeof(global_int_t*)); // mtxIndG
fnbytes_Af += fnrow_Af * ((double) sizeof(local_int_t*)); // mtxIndL
fnbytes_Af += fnrow_Af * ((double) sizeof(double*)); // matrixValues
fnbytes_Af += fnrow_Af * ((double) sizeof(double*)); // matrixDiagonal
fnbytes_Af += fnrow_Af * numberOfNonzerosPerRow * ((double) sizeof(local_int_t)); // mtxIndL[1..nrows]
fnbytes_Af += fnrow_Af * numberOfNonzerosPerRow * ((double) sizeof(double)); // matrixValues[1..nrows]
fnbytes_Af += fnrow_Af * numberOfNonzerosPerRow * ((double) sizeof(global_int_t)); // mtxIndG[1..nrows]
// Model for SetupHalo_ref.cpp
#ifndef HPCG_NO_MPI
fnbytes_Af += ((double) sizeof(double) * Af->totalToBeSent); // sendBuffer
fnbytes_Af += ((double) sizeof(local_int_t) * Af->totalToBeSent); // elementsToSend
fnbytes_Af += ((double) sizeof(int) * Af->numberOfSendNeighbors); // neighbors
fnbytes_Af += ((double) sizeof(local_int_t) * Af->numberOfSendNeighbors); // receiveLength, sendLength
#endif
fnbytesPerLevel[i] = fnbytes_Af;
fnbytes += fnbytes_Af; // Running sum
Af = Af->Ac; // Go to next coarse level
}
assert(Af == 0); // Make sure we got to the lowest grid level
// Count number of bytes used per equation
double fnbytesPerEquation = fnbytes / fnrow;
// Instantiate YAML document
OutputFile doc("HPCG-Benchmark", "3.1");
doc.add("Release date", "March 28, 2019");
doc.add("Machine Summary", "");
doc.get("Machine Summary")->add("Distributed Processes", A.geom->size);
doc.get("Machine Summary")->add("Threads per processes", A.geom->numThreads);
doc.add("Global Problem Dimensions", "");
doc.get("Global Problem Dimensions")->add("Global nx", A.geom->gnx);
doc.get("Global Problem Dimensions")->add("Global ny", A.geom->gny);
doc.get("Global Problem Dimensions")->add("Global nz", A.geom->gnz);
doc.add("Processor Dimensions", "");
doc.get("Processor Dimensions")->add("npx", A.geom->npx);
doc.get("Processor Dimensions")->add("npy", A.geom->npy);
doc.get("Processor Dimensions")->add("npz", A.geom->npz);
doc.add("Local Domain Dimensions", "");
doc.get("Local Domain Dimensions")->add("nx", A.geom->nx);
doc.get("Local Domain Dimensions")->add("ny", A.geom->ny);
doc.get("Local Domain Dimensions")->add("nz", A.geom->nz);
doc.add("########## Problem Summary ##########", "");
doc.add("Setup Information", "");
doc.get("Setup Information")->add("Setup Time", times[9]);
doc.add("Linear System Information", "");
doc.get("Linear System Information")->add("Number of Equations", A.totalNumberOfRows);
doc.get("Linear System Information")->add("Number of Nonzero Terms", A.totalNumberOfNonzeros);
doc.add("Multigrid Information", "");
doc.get("Multigrid Information")->add("Number of coarse grid levels", numberOfMgLevels - 1);
Af = &A;
doc.get("Multigrid Information")->add("Coarse Grids", "");
for (int i = 1; i < numberOfMgLevels; ++i)
{
doc.get("Multigrid Information")->get("Coarse Grids")->add("Grid Level", i);
doc.get("Multigrid Information")
->get("Coarse Grids")
->add("Number of Equations", Af->Ac->totalNumberOfRows);
doc.get("Multigrid Information")
->get("Coarse Grids")
->add("Number of Nonzero Terms", Af->Ac->totalNumberOfNonzeros);
doc.get("Multigrid Information")
->get("Coarse Grids")
->add("Number of Presmoother Steps", Af->mgData->numberOfPresmootherSteps);
doc.get("Multigrid Information")
->get("Coarse Grids")
->add("Number of Postsmoother Steps", Af->mgData->numberOfPostsmootherSteps);
Af = Af->Ac;
}
doc.add("########## Memory Use Summary ##########", "");
doc.add("Memory Use Information", "");
doc.get("Memory Use Information")->add("Total memory used for data (Gbytes)", fnbytes / 1000000000.0);
doc.get("Memory Use Information")
->add("Memory used for OptimizeProblem data (Gbytes)", fnbytes_OptimizedProblem / 1000000000.0);
doc.get("Memory Use Information")
->add("Bytes per equation (Total memory / Number of Equations)", fnbytesPerEquation);
doc.get("Memory Use Information")
->add("Memory used for linear system and CG (Gbytes)", fnbytesPerLevel[0] / 1000000000.0);
doc.get("Memory Use Information")->add("Coarse Grids", "");
for (int i = 1; i < numberOfMgLevels; ++i)
{
doc.get("Memory Use Information")->get("Coarse Grids")->add("Grid Level", i);
doc.get("Memory Use Information")
->get("Coarse Grids")
->add("Memory used", fnbytesPerLevel[i] / 1000000000.0);
}
doc.add("########## V&V Testing Summary ##########", "");
doc.add("Spectral Convergence Tests", "");
if (testcg_data.count_fail == 0)
doc.get("Spectral Convergence Tests")->add("Result", "PASSED");
else
doc.get("Spectral Convergence Tests")->add("Result", "FAILED");
doc.get("Spectral Convergence Tests")->add("Unpreconditioned", "");
doc.get("Spectral Convergence Tests")
->get("Unpreconditioned")
->add("Maximum iteration count", testcg_data.niters_max_no_prec);
doc.get("Spectral Convergence Tests")
->get("Unpreconditioned")
->add("Expected iteration count", testcg_data.expected_niters_no_prec);
doc.get("Spectral Convergence Tests")->add("Preconditioned", "");
doc.get("Spectral Convergence Tests")
->get("Preconditioned")
->add("Maximum iteration count", testcg_data.niters_max_prec);
doc.get("Spectral Convergence Tests")
->get("Preconditioned")
->add("Expected iteration count", testcg_data.expected_niters_prec);
const char DepartureFromSymmetry[] = "Departure from Symmetry |x'Ay-y'Ax|/(2*||x||*||A||*||y||)/epsilon";
doc.add(DepartureFromSymmetry, "");
if (testsymmetry_data.count_fail == 0)
doc.get(DepartureFromSymmetry)->add("Result", "PASSED");
else
doc.get(DepartureFromSymmetry)->add("Result", "FAILED");
doc.get(DepartureFromSymmetry)->add("Departure for SpMV", testsymmetry_data.depsym_spmv);
doc.get(DepartureFromSymmetry)->add("Departure for MG", testsymmetry_data.depsym_mg);
doc.add("########## Iterations Summary ##########", "");
doc.add("Iteration Count Information", "");
if (!global_failure)
doc.get("Iteration Count Information")->add("Result", "PASSED");
else
doc.get("Iteration Count Information")->add("Result", "FAILED");
doc.get("Iteration Count Information")->add("Reference CG iterations per set", refMaxIters);
doc.get("Iteration Count Information")->add("Optimized CG iterations per set", optMaxIters);
doc.get("Iteration Count Information")
->add("Total number of reference iterations", refMaxIters * numberOfCgSets);
doc.get("Iteration Count Information")
->add("Total number of optimized iterations", optMaxIters * numberOfCgSets);
doc.add("########## Reproducibility Summary ##########", "");
doc.add("Reproducibility Information", "");
if (testnorms_data.pass)
doc.get("Reproducibility Information")->add("Result", "PASSED");
else
doc.get("Reproducibility Information")->add("Result", "FAILED");
doc.get("Reproducibility Information")->add("Scaled residual mean", testnorms_data.mean);
doc.get("Reproducibility Information")->add("Scaled residual variance", testnorms_data.variance);
doc.add("########## Performance Summary (times in sec) ##########", "");
doc.add("Benchmark Time Summary", "");
doc.get("Benchmark Time Summary")->add("Optimization phase", times[7]);
doc.get("Benchmark Time Summary")->add("DDOT", times[1]);
doc.get("Benchmark Time Summary")->add("WAXPBY", times[2]);
doc.get("Benchmark Time Summary")->add("SpMV", times[3]);
doc.get("Benchmark Time Summary")->add("MG", times[5]);
doc.get("Benchmark Time Summary")->add("Total", times[0]);
doc.add("Floating Point Operations Summary", "");
doc.get("Floating Point Operations Summary")->add("Raw DDOT", fnops_ddot);
doc.get("Floating Point Operations Summary")->add("Raw WAXPBY", fnops_waxpby);
doc.get("Floating Point Operations Summary")->add("Raw SpMV", fnops_sparsemv);
doc.get("Floating Point Operations Summary")->add("Raw MG", fnops_precond);
doc.get("Floating Point Operations Summary")->add("Total", fnops);
doc.get("Floating Point Operations Summary")->add("Total with convergence overhead", frefnops);
doc.add("GB/s Summary", "");
doc.get("GB/s Summary")->add("Raw Read B/W", fnreads / times[0] / 1.0E9);
doc.get("GB/s Summary")->add("Raw Write B/W", fnwrites / times[0] / 1.0E9);
doc.get("GB/s Summary")->add("Raw Total B/W", (fnreads + fnwrites) / (times[0]) / 1.0E9);
doc.get("GB/s Summary")
->add("Total with convergence and optimization phase overhead",
(frefnreads + frefnwrites) / (times[0] + fNumberOfCgSets * (times[7] / 10.0 + times[9] / 10.0))
/ 1.0E9);
doc.add("GFLOP/s Summary", "");
doc.get("GFLOP/s Summary")->add("Raw DDOT", fnops_ddot / times[1] / 1.0E9);
doc.get("GFLOP/s Summary")->add("Raw WAXPBY", fnops_waxpby / times[2] / 1.0E9);
doc.get("GFLOP/s Summary")->add("Raw SpMV", fnops_sparsemv / (times[3]) / 1.0E9);
doc.get("GFLOP/s Summary")->add("Raw MG", fnops_precond / (times[5]) / 1.0E9);
doc.get("GFLOP/s Summary")->add("Raw Total", fnops / times[0] / 1.0E9);
doc.get("GFLOP/s Summary")->add("Total with convergence overhead", frefnops / times[0] / 1.0E9);
// This final GFLOP/s rating includes the overhead of problem setup and optimizing the data structures vs ten
// sets of 50 iterations of CG
double totalGflops = frefnops / (times[0] + fNumberOfCgSets * (times[7] / 10.0 + times[9] / 10.0)) / 1.0E9;
double totalGflops24 = frefnops / (times[0] + fNumberOfCgSets * times[7] / 10.0) / 1.0E9;
doc.get("GFLOP/s Summary")->add("Total with convergence and optimization phase overhead", totalGflops);
doc.add("User Optimization Overheads", "");
doc.get("User Optimization Overheads")->add("Optimization phase time (sec)", (times[7]));
doc.get("User Optimization Overheads")
->add("Optimization phase time vs reference SpMV+MG time", times[7] / times[8]);
#ifndef HPCG_NO_MPI
doc.add("DDOT Timing Variations", "");
doc.get("DDOT Timing Variations")->add("Min DDOT MPI_Allreduce time", t4min);
doc.get("DDOT Timing Variations")->add("Max DDOT MPI_Allreduce time", t4max);
doc.get("DDOT Timing Variations")->add("Avg DDOT MPI_Allreduce time", t4avg);
// doc.get("Sparse Operations Overheads")->add("Halo exchange time (sec)", (times[6]));
// doc.get("Sparse Operations Overheads")->add("Halo exchange as percentage of SpMV time",
// (times[6])/totalSparseMVTime*100.0);
#endif
doc.add("Final Summary", "");
bool isValidRun = (testcg_data.count_fail == 0) && (testsymmetry_data.count_fail == 0) && (testnorms_data.pass)
&& (!global_failure);
if (isValidRun)
{
doc.get("Final Summary")->add("HPCG result is VALID with a GFLOP/s rating of", totalGflops);
doc.get("Final Summary")->add("HPCG 2.4 rating for historical reasons is", totalGflops24);
if (!A.isDotProductOptimized)
{
doc.get("Final Summary")
->add("Reference version of ComputeDotProduct used",
"Performance results are most likely suboptimal");
}
if (!A.isSpmvOptimized)
{
doc.get("Final Summary")
->add("Reference version of ComputeSPMV used", "Performance results are most likely suboptimal");
}
if (!A.isMgOptimized)
{
if (A.geom->numThreads > 1)
doc.get("Final Summary")
->add("Reference version of ComputeMG used and number of threads greater than 1",
"Performance results are severely suboptimal");
else // numThreads ==1
doc.get("Final Summary")
->add("Reference version of ComputeMG used", "Performance results are most likely suboptimal");
}
if (!A.isWaxpbyOptimized)
{
doc.get("Final Summary")
->add("Reference version of ComputeWAXPBY used", "Performance results are most likely suboptimal");
}
if (times[0] >= minOfficialTime)
{
doc.get("Final Summary")
->add("Please upload results from the YAML file contents to", "http://hpcg-benchmark.org");
}
else
{
doc.get("Final Summary")->add("Results are valid but execution time (sec) is", times[0]);
if (quickPath)
{
doc.get("Final Summary")
->add("You have selected the QuickPath option",
"Results are official for legacy installed systems with confirmation from the HPCG "
"Benchmark leaders.");
doc.get("Final Summary")
->add("After confirmation please upload results from the YAML file contents to",
"http://hpcg-benchmark.org");
}
else
{
doc.get("Final Summary")
->add("Official results execution time (sec) must be at least", minOfficialTime);
}
}
}
else
{
doc.get("Final Summary")->add("HPCG result is", "INVALID.");
doc.get("Final Summary")
->add("Please review the YAML file contents", "You may NOT submit these results for consideration.");
}
std::string yaml = doc.generate();
#ifdef HPCG_DEBUG
HPCG_fout << yaml;
#endif
}
return;
}
src/ReportResults.hpp Normal file
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef REPORTRESULTS_HPP
#define REPORTRESULTS_HPP
#include "SparseMatrix.hpp"
#include "TestCG.hpp"
#include "TestNorms.hpp"
#include "TestSymmetry.hpp"
void ReportResults(const SparseMatrix& A, int numberOfMgLevels, int numberOfCgSets, int refMaxIters, int optMaxIters,
double times[], const TestCGData& testcg_data, const TestSymmetryData& testsymmetry_data,
const TestNormsData& testnorms_data, int global_failure, bool quickPath);
#endif // REPORTRESULTS_HPP
src/SetupHalo.cpp Normal file
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file SetupHalo.cpp
HPCG routine
*/
#ifndef HPCG_NO_MPI
#include <map>
#include <mpi.h>
#include <set>
#endif
#include <algorithm>
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include "SetupHalo.hpp"
#include "SetupHalo_ref.hpp"
#ifdef USE_CUDA
#include "Cuda.hpp"
#include "CudaKernels.hpp"
#endif
#ifdef USE_GRACE
#include "CpuKernels.hpp"
#endif
#ifndef HPCG_NO_MPI
// Used to find ranks for CPU and GPU programs
extern int global_total_ranks;
extern int* physical_rank_dims;
extern int* logical_rank_to_phys;
extern int* rankToId_h;
extern int* idToRank_h;
extern p2p_comm_mode_t P2P_Mode;
#endif
/*!
Prepares the system matrix data structure and creates the data necessary
for communication of boundary values of this process.
@param[inout] A The known system matrix
@see ExchangeHalo
*/
#ifdef USE_CUDA
void SetupHalo_Gpu(SparseMatrix& A)
{
global_int_t nx = A.geom->nx;
global_int_t ny = A.geom->ny;
global_int_t nz = A.geom->nz;
global_int_t gnx = A.geom->gnx;
global_int_t gny = A.geom->gny;
global_int_t gnz = A.geom->gnz;
global_int_t gix0 = A.geom->gix0;
global_int_t giy0 = A.geom->giy0;
global_int_t giz0 = A.geom->giz0;
#ifndef HPCG_NO_MPI
local_int_t localNumberOfRows = A.localNumberOfRows;
local_int_t* send_buffer_d;
local_int_t sendbufld
= std::max(std::max(A.geom->nx * A.geom->ny, A.geom->nx * A.geom->nz), A.geom->ny * A.geom->nz);
int* neighbors = new int[27];
int* neighborsPhysical = new int[27];
CHECK_CUDART(cudaMalloc((void**) &(send_buffer_d), 27 * sendbufld * sizeof(local_int_t)));
local_int_t* sendLength = new local_int_t[27];
local_int_t totalToBeSent = 0;
int neiCount = 0;
int numberOfExternalValues = 0;
local_int_t* sendcounts2 = new local_int_t[27];
local_int_t* receiveLength = new local_int_t[27];
memset(sendcounts2, 0, sizeof(local_int_t) * (27));
local_int_t* sendcounts_d = NULL;
local_int_t* elementsToSendGpu;
cudaMalloc(&sendcounts_d, sizeof(local_int_t) * (27));
cudaMemsetAsync(sendcounts_d, 0, sizeof(local_int_t) * (27), stream);
// Finds elements to send and neighbors
SetupHaloCuda(A, sendbufld, sendcounts_d, send_buffer_d, &totalToBeSent, &neiCount, neighbors, sendLength,
&elementsToSendGpu);
local_int_t* elementsToSend = new local_int_t[totalToBeSent];
double* sendBuffer = nullptr;
if (totalToBeSent > 0)
{
cudaMemcpyAsync(
elementsToSend, elementsToSendGpu, sizeof(local_int_t) * totalToBeSent, cudaMemcpyDeviceToHost, stream);
local_int_t* sendcounts = (local_int_t*) malloc(sizeof(local_int_t) * (A.geom->size + 1));
memset(sendcounts, 0, sizeof(local_int_t) * (A.geom->size + 1));
local_int_t *eltsToRecv_d = NULL, *extToLocMap = NULL;
sendcounts[0] = 0;
for (int i = 0; i < neiCount; i++)
{
receiveLength[i] = sendLength[i];
sendcounts[i + 1] = sendcounts[i] + sendLength[i];
int neighborId = neighbors[i];
neighborsPhysical[i] = logical_rank_to_phys[neighborId];
}
CHECK_CUDART(cudaMalloc(&extToLocMap, sizeof(local_int_t) * localNumberOfRows));
CHECK_CUDART(cudaMalloc(&eltsToRecv_d, sizeof(local_int_t) * totalToBeSent));
CHECK_CUDART(cudaMallocHost(&(sendBuffer), sizeof(double) * totalToBeSent));
CHECK_CUDART(cudaMalloc(&(A.gpuAux.sendBuffer), sizeof(double) * totalToBeSent));
local_int_t* eltsToRecv = new local_int_t[totalToBeSent];
// Exchange elements to send with neighbors
auto INDEX_TYPE = MPI_INT;
#ifdef INDEX_64 // In src/Geometry
INDEX_TYPE = MPI_LONG;
#endif
MPI_Status status;
int MPI_MY_TAG = 93;
MPI_Request* request = new MPI_Request[neiCount];
cudaStreamSynchronize(stream);
local_int_t* recv_ptr = eltsToRecv;
for (int i = 0; i < neiCount; i++)
{
auto n_recv = sendLength[i];
MPI_Irecv(recv_ptr, n_recv, INDEX_TYPE, neighborsPhysical[i], MPI_MY_TAG, MPI_COMM_WORLD, request + i);
recv_ptr += n_recv;
}
local_int_t* elts_ptr = elementsToSend;
for (int i = 0; i < neiCount; i++)
{
auto n_send = sendLength[i];
MPI_Send(elts_ptr, n_send, INDEX_TYPE, neighborsPhysical[i], MPI_MY_TAG, MPI_COMM_WORLD);
elts_ptr += n_send;
}
for (int i = 0; i < neiCount; i++)
{
MPI_Wait(request + i, &status);
}
delete[] request;
cudaMemcpyAsync(
eltsToRecv_d, eltsToRecv, sizeof(local_int_t) * (totalToBeSent), cudaMemcpyHostToDevice, stream);
// Add the sorted indices from neighbors. For each neighbor, add its indices sequentially
// before the next neighbor's indices. The indices will be adjusted to be
// localNumberOfRows + their sequential location
for (int neighborCount = 0; neighborCount < neiCount; ++neighborCount)
{
int neighborId = neighbors[neighborCount];
cudaMemsetAsync(extToLocMap, 0, sizeof(local_int_t) * localNumberOfRows, stream);
local_int_t str = sendcounts[neighborCount];
local_int_t end = sendcounts[neighborCount + 1];
ExtToLocMapCuda(localNumberOfRows, str, end, extToLocMap, eltsToRecv_d);
ExtTolocCuda(localNumberOfRows, neighborId, A.extNnz, A.csrExtColumns, A.csrExtValues,
A.gpuAux.ext2csrOffsets, extToLocMap, A.gpuAux.columns);
}
CHECK_CUDART(cudaFree(sendcounts_d));
CHECK_CUDART(cudaFree(extToLocMap));
CHECK_CUDART(cudaFree(eltsToRecv_d));
// For P2P Alltoallv communication
if (P2P_Mode == MPI_GPU_All2allv || P2P_Mode == MPI_CPU_All2allv)
{
int* sdispls = new int[A.geom->size];
int* rdispls = new int[A.geom->size];
int* scounts = new int[A.geom->size];
int* rcounts = new int[A.geom->size];
int tmp_s = 0, tmp_r = 0;
if (sdispls == NULL || rdispls == NULL || scounts == NULL || rcounts == NULL)
return;
for (local_int_t i = 0; i < A.geom->size; i++)
{
scounts[i] = 0;
rcounts[i] = 0;
sdispls[i] = 0;
rdispls[i] = 0;
}
for (local_int_t i = 0; i < neiCount; i++)
{
local_int_t root = neighborsPhysical[i];
scounts[root] = sendLength[i];
rcounts[root] = receiveLength[i];
sdispls[root] = tmp_s;
tmp_s += sendLength[i];
rdispls[root] = tmp_r;
tmp_r += receiveLength[i];
}
A.scounts = scounts;
A.rcounts = rcounts;
A.sdispls = sdispls;
A.rdispls = rdispls;
}
}
// Store contents in our matrix struct
A.numberOfExternalValues = totalToBeSent;
A.localNumberOfColumns = A.localNumberOfRows + A.numberOfExternalValues;
A.numberOfSendNeighbors = neiCount;
A.totalToBeSent = totalToBeSent;
A.elementsToSend = elementsToSend;
A.gpuAux.elementsToSend = elementsToSendGpu;
A.neighbors = neighbors;
A.neighborsPhysical = neighborsPhysical;
A.receiveLength = receiveLength;
A.sendLength = sendLength;
A.sendBuffer = sendBuffer;
#endif
return;
}
#endif
#ifdef USE_GRACE
void SetupHalo_Cpu(SparseMatrix& A)
{
// Extract Matrix pieces
global_int_t nx = A.geom->nx;
global_int_t ny = A.geom->ny;
global_int_t nz = A.geom->nz;
global_int_t gnx = A.geom->gnx;
global_int_t gny = A.geom->gny;
global_int_t gnz = A.geom->gnz;
global_int_t gix0 = A.geom->gix0;
global_int_t giy0 = A.geom->giy0;
global_int_t giz0 = A.geom->giz0;
int npx = A.geom->npx;
int npy = A.geom->npy;
local_int_t localNumberOfRows = A.localNumberOfRows;
local_int_t* nonzerosInRow = A.nonzerosInRow;
global_int_t** mtxIndG = A.mtxIndG;
local_int_t** mtxIndL = A.mtxIndL;
#ifdef HPCG_NO_MPI // In the non-MPI case we simply copy global indices to local index storage
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
for (local_int_t i = 0; i < localNumberOfRows; i++)
{
int cur_nnz = nonzerosInRow[i];
for (int j = 0; j < cur_nnz; j++)
mtxIndL[i][j] = mtxIndG[i][j];
}
#else // Run this section if compiling for MPI
// Scan global IDs of the nonzeros in the matrix. Determine if the column ID matches a row ID. If not:
// 1) We call the ComputeRankOfMatrixRow function, which tells us the rank of the processor owning the row ID.
// We need to receive this value of the x vector during the halo exchange.
// 2) We record our row ID since we know that the other processor will need this value from us, due to symmetry.
std::map<local_int_t, std::map<global_int_t, local_int_t>> externalToLocalMap;
local_int_t* extTemp = new local_int_t[localNumberOfRows];
// Use flat send buffers in the hot loop below instead of the map
local_int_t sendbufld
= std::max(std::max(A.geom->nx * A.geom->ny, A.geom->nx * A.geom->nz), A.geom->ny * A.geom->nz);
local_int_t* send_buffer = new local_int_t[27 * sendbufld];
char* has_external = new char[localNumberOfRows];
local_int_t* sendcounter = new local_int_t[27];
for (local_int_t i = 0; i < 27; i++)
sendcounter[i] = 0;
// Go through all local rows; for each local point,
// find its 27 3D neighbors (including the point itself).
// For each neighbor, decide whether it is on a different rank (halo) or local.
// If external, add it to the send buffer;
// if local, add the entry to the local matrix.
#pragma omp parallel for
for (local_int_t i = 0; i < localNumberOfRows; i++)
{
const local_int_t iz = (i / (nx * ny));
const local_int_t iy = (i - iz * nx * ny) / nx;
const local_int_t ix = i - (iz * ny + iy) * nx;
const global_int_t gix = ix + gix0;
const global_int_t giy = iy + giy0;
const global_int_t giz = iz + giz0;
global_int_t curcol;
int nnz_c = 0;
bool rank_set[27];
for (int j = 0; j < 27; j++)
{
rank_set[j] = false;
}
has_external[i] = 0;
for (int k = 0; k < 27; k++)
{
long long int cgix = gix + tid2indCpu[k][0];
long long int cgiy = giy + tid2indCpu[k][1];
long long int cgiz = giz + tid2indCpu[k][2];
int ok = cgiz > -1 && cgiz < gnz && cgiy > -1 && cgiy < gny && cgix > -1 && cgix < gnx;
if (ok)
{
int ipz = cgiz / nz;
int ipy = cgiy / ny;
int ipx = cgix / nx;
// For GPU+CPU execution mode, find the 3D rank coordinates.
// When the CPU and GPU local dimensions differ, we cannot
// simply divide by the local dimension to find ipx/ipy/ipz;
// we must compute them from the neighbor's 3D coordinates.
// Note that the halo size is always 1.
if (A.geom->different_dim == Z)
{
long long int local = cgiz - giz0;
if (local >= 0 && local < nz)
ipz = A.geom->ipz;
else if (local < 0)
ipz = A.geom->ipz - 1;
else if (local >= nz)
ipz = A.geom->ipz + 1;
}
else if (A.geom->different_dim == Y)
{
long long int local = cgiy - giy0;
if (local >= 0 && local < ny)
ipy = A.geom->ipy;
else if (local < 0)
ipy = A.geom->ipy - 1;
else if (local >= ny)
ipy = A.geom->ipy + 1;
}
else if (A.geom->different_dim == X)
{
long long int local = cgix - gix0;
if (local >= 0 && local < nx)
ipx = A.geom->ipx;
else if (local < 0)
ipx = A.geom->ipx - 1;
else if (local >= nx)
ipx = A.geom->ipx + 1;
}
// Global rank Id
int col_rank = ipx + ipy * npx + ipz * npy * npx;
// The neighbor point rank is diff than the current point rank
if (A.geom->logical_rank != col_rank)
{
has_external[i] = 1;
int rankId = rankToId_h[col_rank];
local_int_t* p = &(sendcounter[rankId]);
// Add the halo point atomically to send_buffer.
// For all columns in a row that belong to the same rank,
// add the row only once to that rank's buffer.
if (!rank_set[rankId])
{
rank_set[rankId] = true;
local_int_t t;
#pragma omp atomic capture
{
t = *p;
*p += 1;
}
send_buffer[rankId * sendbufld + t] = i;
}
}
else
{
// local neighbor, add it to the local matrix
local_int_t zi = cgiz - giz0;
local_int_t yi = cgiy - giy0;
local_int_t xi = cgix - gix0;
local_int_t lcol = zi * ny * nx + yi * nx + xi;
mtxIndL[i][nnz_c] = lcol;
}
nnz_c++;
}
}
}
// Now external data structures
// 1) Create the elements-to-send buffer (sort the indices for each neighbor)
local_int_t totalToBeSent = 0;
local_int_t* sendcounts = new local_int_t[A.geom->size + 1];
sendcounts[0] = 0;
int neighborCount = 0;
#pragma omp parallel for
for (local_int_t i = 0; i < 27; i++)
{
if (sendcounter[i] > 0)
{
std::sort(send_buffer + i * sendbufld, send_buffer + i * sendbufld + sendcounter[i]);
}
}
for (local_int_t i = 0; i < 27; i++)
{
if (sendcounter[i] > 0)
{
totalToBeSent += sendcounter[i];
sendcounts[neighborCount + 1] = sendcounts[neighborCount] + sendcounter[i];
neighborCount++;
}
}
// 2) Now find neighbor IDs, neighbor physical IDs (see GenerateGeometry), and elements to send
local_int_t sendEntryCount = 0;
local_int_t* receiveLength = new local_int_t[neighborCount];
local_int_t* sendLength = new local_int_t[neighborCount];
// Build the arrays and lists needed by the ExchangeHalo function.
double* sendBuffer = new double[totalToBeSent];
int* neighbors = new int[neighborCount];
int* neighborsPhysical = new int[neighborCount];
local_int_t* elementsToSend = new local_int_t[totalToBeSent];
neighborCount = 0;
for (local_int_t i = 0; i < 27; i++)
{
if (sendcounter[i] > 0)
{
int neighborId = idToRank_h[i]; // logical Id
int phys_neiId = logical_rank_to_phys[neighborId];
neighbors[neighborCount] = neighborId; // store rank ID of current neighbor
neighborsPhysical[neighborCount] = phys_neiId;
receiveLength[neighborCount] = sendcounter[i];
sendLength[neighborCount] = sendcounter[i];
for (int j = 0; j < sendcounter[i]; j++)
{
elementsToSend[sendEntryCount] = send_buffer[i * sendbufld + j];
sendEntryCount++;
}
neighborCount++;
}
}
delete[] send_buffer;
delete[] sendcounter;
// Exchange elements to send with other neighbors
auto INDEX_TYPE = MPI_INT;
#ifdef INDEX_64 // In src/Geometry
INDEX_TYPE = MPI_LONG;
#endif
MPI_Status status;
int MPI_MY_TAG = 93;
MPI_Request* request = new MPI_Request[neighborCount];
local_int_t* eltsToRecv = new local_int_t[totalToBeSent];
local_int_t* recv_ptr = eltsToRecv;
for (int i = 0; i < neighborCount; i++)
{
int n_recv = sendLength[i];
MPI_Irecv(recv_ptr, n_recv, INDEX_TYPE, neighborsPhysical[i], MPI_MY_TAG, MPI_COMM_WORLD, request + i);
recv_ptr += n_recv;
}
local_int_t* elts_ptr = elementsToSend;
for (int i = 0; i < neighborCount; i++)
{
local_int_t n_send = sendLength[i];
MPI_Send(elts_ptr, n_send, INDEX_TYPE, neighborsPhysical[i], MPI_MY_TAG, MPI_COMM_WORLD);
elts_ptr += n_send;
}
for (int i = 0; i < neighborCount; i++)
{
MPI_Wait(request + i, &status);
}
delete[] request;
// Create a map to be used in the optimization step.
// Each external column index is given a sequential ID
// starting after the number of rows (used to access the x vector).
int prev_dim = 0;
for (int nc = 0; nc < neighborCount; ++nc)
{
int neighborId = neighbors[nc];
int phys_neiId = neighborsPhysical[nc];
local_int_t str = sendcounts[nc];
local_int_t end = sendcounts[nc + 1];
for (int j = str; j < end; j++)
{
const local_int_t col = eltsToRecv[j];
externalToLocalMap[neighborId][col] = localNumberOfRows + j;
}
}
delete[] eltsToRecv;
delete[] sendcounts;
if (totalToBeSent > 0)
{
// Last step sort all external IDs per rank Id, elements of neighbor 0 first, then 1, and so on
#pragma omp parallel for
for (local_int_t i = 0; i < localNumberOfRows; i++)
{
int nnz_ext = 0;
if (has_external[i] == 1)
{
const local_int_t iz = (i / (nx * ny));
const local_int_t iy = (i - iz * nx * ny) / nx;
const local_int_t ix = i - (iz * ny + iy) * nx;
const global_int_t gix = ix + gix0;
const global_int_t giy = iy + giy0;
const global_int_t giz = iz + giz0;
int nnz_c = 0;
for (int k = 0; k < 27; k++)
{
long long int cgix = gix + tid2indCpu[k][0];
long long int cgiy = giy + tid2indCpu[k][1];
long long int cgiz = giz + tid2indCpu[k][2];
local_int_t zi = (cgiz) % nz;
local_int_t yi = (cgiy) % ny;
local_int_t xi = (cgix) % nx;
int ok = cgiz > -1 && cgiz < gnz && cgiy > -1 && cgiy < gny && cgix > -1 && cgix < gnx;
int ipz = cgiz / nz;
int ipy = cgiy / ny;
int ipx = cgix / nx;
// The indices sent by a neighbor use the neighbor's nx, ny, and nz, which can
// differ from the current rank's dims. Thus, based on the neighbor's location
// and different_dim, we adjust the indices if needed.
// Also, ipx, ipy, and ipz must be updated accordingly.
global_int_t new_nx = A.geom->nx;
global_int_t new_ny = A.geom->ny;
if (A.geom->different_dim == Z)
{
long long int local = cgiz - giz0;
if (local >= 0 && local < nz)
{
ipz = A.geom->ipz;
zi = local;
}
else if (local < 0)
{
ipz = A.geom->ipz - 1;
zi = A.geom->previous_neighbor_dim - 1;
}
else if (local >= nz)
{
ipz = A.geom->ipz + 1;
zi = 0;
}
}
else if (A.geom->different_dim == Y)
{
long long int local = cgiy - giy0;
if (local >= 0 && local < ny)
{
ipy = A.geom->ipy;
yi = local;
}
else if (local < 0)
{
ipy = A.geom->ipy - 1;
yi = A.geom->previous_neighbor_dim - 1;
new_ny = A.geom->previous_neighbor_dim;
}
else if (local >= ny)
{
ipy = A.geom->ipy + 1;
yi = 0;
new_ny = A.geom->next_neighbor_dim;
}
}
else if (A.geom->different_dim == X)
{
long long int local = cgix - gix0;
if (local >= 0 && local < nx)
{
ipx = A.geom->ipx;
xi = local;
}
else if (local < 0)
{
ipx = A.geom->ipx - 1;
xi = A.geom->previous_neighbor_dim - 1;
new_nx = A.geom->previous_neighbor_dim;
}
else if (local >= nx)
{
ipx = A.geom->ipx + 1;
xi = 0;
new_nx = A.geom->next_neighbor_dim;
}
}
local_int_t lcol = zi * new_ny * new_nx + yi * new_nx + xi;
int row_rank = ipx + ipy * npx + ipz * npy * npx;
if (ok)
{
if (externalToLocalMap.find(row_rank) != externalToLocalMap.end())
{
mtxIndL[i][nnz_c] = externalToLocalMap[row_rank][lcol];
nnz_ext++;
}
nnz_c++;
}
}
}
extTemp[i] = nnz_ext;
}
}
if (P2P_Mode == MPI_CPU_All2allv)
{
// Use nothrow new so the NULL checks below are meaningful; a plain new
// would throw std::bad_alloc instead of returning NULL.
int* sdispls = new (std::nothrow) int[A.geom->size];
int* rdispls = new (std::nothrow) int[A.geom->size];
int* scounts = new (std::nothrow) int[A.geom->size];
int* rcounts = new (std::nothrow) int[A.geom->size];
int tmp_s = 0, tmp_r = 0;
if (sdispls == NULL || rdispls == NULL || scounts == NULL || rcounts == NULL)
{
    // Free whatever was allocated before bailing out to avoid a leak.
    delete[] sdispls;
    delete[] rdispls;
    delete[] scounts;
    delete[] rcounts;
    return;
}
for (local_int_t i = 0; i < A.geom->size; i++)
{
scounts[i] = 0;
rcounts[i] = 0;
sdispls[i] = 0;
rdispls[i] = 0;
}
for (local_int_t i = 0; i < neighborCount; i++)
{
local_int_t root = neighborsPhysical[i];
scounts[root] = sendLength[i];
rcounts[root] = receiveLength[i];
sdispls[root] = tmp_s;
tmp_s += sendLength[i];
rdispls[root] = tmp_r;
tmp_r += receiveLength[i];
}
A.scounts = scounts;
A.rcounts = rcounts;
A.sdispls = sdispls;
A.rdispls = rdispls;
}
delete[] has_external;
// Store contents in our matrix struct
A.numberOfExternalValues = totalToBeSent;
A.localNumberOfColumns = A.localNumberOfRows + A.numberOfExternalValues;
A.numberOfSendNeighbors = neighborCount;
A.totalToBeSent = totalToBeSent;
A.elementsToSend = elementsToSend;
A.neighbors = neighbors;
A.neighborsPhysical = neighborsPhysical;
A.receiveLength = receiveLength;
A.sendLength = sendLength;
A.sendBuffer = sendBuffer;
A.cpuAux.tempIndex = extTemp;
#ifdef HPCG_DETAILED_DEBUG
HPCG_fout << " For rank " << A.geom->rank << " of " << A.geom->size
<< ", number of neighbors = " << A.numberOfSendNeighbors << endl;
for (int i = 0; i < A.numberOfSendNeighbors; i++)
{
HPCG_fout << " rank " << A.geom->rank << " neighbor " << neighbors[i]
<< " send/recv length = " << sendLength[i] << "/" << receiveLength[i] << endl;
for (local_int_t j = 0; j < sendLength[i]; ++j)
HPCG_fout << " rank " << A.geom->rank << " elementsToSend[" << j << "] = " << elementsToSend[j]
<< endl;
}
#endif
#endif // ifndef HPCG_NO_MPI
return;
}
#endif // USE_GRACE
void SetupHalo(SparseMatrix& A)
{
if (A.rankType == GPU)
{
#ifdef USE_CUDA
SetupHalo_Gpu(A);
#endif
}
else
{
#ifdef USE_GRACE
SetupHalo_Cpu(A);
#endif
}
}
