first commit

2026-01-18 20:37:50 +08:00
commit fff9f18287
123 changed files with 1385491 additions and 0 deletions
--- a/.clang-format
+++ b/.clang-format
@@ -0,0 +1,78 @@
 ---
 AccessModifierOffset: -4
 AlignAfterOpenBracket: DontAlign
 AlignConsecutiveAssignments: false
 AlignConsecutiveDeclarations: false
 AlignEscapedNewlinesLeft: false
 AlignOperands:   false
 AlignTrailingComments: true
 AllowAllParametersOfDeclarationOnNextLine: true
 AllowShortBlocksOnASingleLine: false
 AllowShortCaseLabelsOnASingleLine: true
 AllowShortFunctionsOnASingleLine: Empty
 AllowShortIfStatementsOnASingleLine: false
 AllowShortLoopsOnASingleLine: false
 AlwaysBreakAfterDefinitionReturnType: None
 AlwaysBreakAfterReturnType: None
 AlwaysBreakBeforeMultilineStrings: true
 AlwaysBreakTemplateDeclarations: true
 BasedOnStyle: None
 BinPackArguments: true
 BinPackParameters: true
 BreakBeforeBinaryOperators: All
 BreakBeforeBraces: Allman
 BreakBeforeTernaryOperators: true
 BreakConstructorInitializersBeforeComma: true
 ColumnLimit:     120
 CommentPragmas:  '^ IWYU pragma:'
 ConstructorInitializerAllOnOneLineOrOnePerLine: false
 ConstructorInitializerIndentWidth: 4
 ContinuationIndentWidth: 4
 Cpp11BracedListStyle: true
 DerivePointerAlignment: false
 DisableFormat:   false
 ExperimentalAutoDetectBinPacking: false
 ForEachMacros:   [ foreach, Q_FOREACH, BOOST_FOREACH ]
 IncludeBlocks: Preserve
 IncludeCategories:
  - Regex:           '^"(llvm|llvm-c|clang|clang-c)/'
    Priority:        2
  - Regex:           '^(<|"(gtest|isl|json)/)'
    Priority:        3
  - Regex:           '.*'
    Priority:        1
 IndentCaseLabels: false
 IndentWidth:     4
 IndentWrappedFunctionNames: false
 KeepEmptyLinesAtTheStartOfBlocks: true
 Language: Cpp
 MacroBlockBegin: ''
 MacroBlockEnd:   ''
 MaxEmptyLinesToKeep: 1
 NamespaceIndentation: None
 ObjCBlockIndentWidth: 4
 ObjCSpaceAfterProperty: true
 ObjCSpaceBeforeProtocolList: true
 PenaltyBreakBeforeFirstCallParameter: 19
 PenaltyBreakComment: 300
 PenaltyBreakFirstLessLess: 120
 PenaltyBreakString: 1000
 PenaltyExcessCharacter: 1000000
 PenaltyReturnTypeOnItsOwnLine: 60
 PointerAlignment: Left
 PointerBindsToType: false
 ReflowComments:  true
 SortIncludes:    true
 SpaceAfterCStyleCast: true
 SpaceBeforeAssignmentOperators: true
 SpaceBeforeParens: ControlStatements
 SpaceInEmptyParentheses: false
 SpacesBeforeTrailingComments: 1
 SpacesInAngles:  false
 SpacesInCStyleCastParentheses: false
 SpacesInContainerLiterals: true
 SpacesInParentheses: false
 SpacesInSquareBrackets: false
 Standard:        Cpp11
 TabWidth:        4
 UseTab:          Never
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,10 @@
 out
 *.swp
 src/*.o
 bin/HPCG-Benchmark_3*.txt
 bin/xhpcg
 bin/xhpcg-cpu
 bin/hpcg20*.txt
 .DS_Store
 bin/
 build/
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -0,0 +1,120 @@
 ## NVIDIA HPCG Contribution Rules
 #### Issue Tracking
 * All enhancement, bugfix, or change requests must begin with the creation of an [NVIDIA HPCG Issue Request](https://github.com/NVIDIA/nvidia-hpcg/issues).
 * The issue request must be reviewed by NVIDIA HPCG engineers and approved prior to code review.
 #### Coding Guidelines
 - Please follow the existing conventions in the relevant file, submodule, module, and project when you add new code or when you extend/fix existing functionality.
 - To maintain consistency in code formatting and style, you should also run `clang-format` on the modified sources with the provided configuration file. This applies NVIDIA HPCG code formatting rules to:
  - class, function/method, and variable/field naming
  - comment style
  - indentation
  - line length
 - Format git changes:
  ```bash
  # Commit ID is optional - if unspecified, run format on staged changes.
  git-clang-format --style file [commit ID/reference]
  ```
 - Format individual source files:
  ```bash
  # -style=file : Obtain the formatting rules from .clang-format
  # -i : In-place modification of the processed file
  clang-format -style=file -i -fallback-style=none <file(s) to process>
  ```
 - Format entire codebase (for project maintainers only):
  ```bash
  find samples plugin -iname '*.h' -o -iname '*.c' -o -iname '*.cpp' -o -iname '*.hpp' \
  | xargs clang-format -style=file -i -fallback-style=none
  ```
 - Try to keep pull requests (PRs) as concise as possible:
  - Avoid committing commented-out code.
  - Wherever possible, each PR should address a single concern. If there are several otherwise-unrelated things that should be fixed to reach a desired endpoint, our recommendation is to open several PRs and indicate the dependencies in the description. The more complex the changes are in a single PR, the more time it will take to review those changes.
 - Write commit titles using imperative mood and [these rules](https://chris.beams.io/posts/git-commit/), and reference the Issue number corresponding to the PR. Following is the recommended format for commit texts:
 ```
 #<Issue Number> - <Commit Title>
 <Commit Body>
 ```
 - Ensure that the build log is clean, meaning no warnings or errors should be present.
 - All OSS components must contain accompanying documentation (READMEs) describing the functionality, dependencies, and known issues.
 - Make sure that you can contribute your work to open source (no license and/or patent conflict is introduced by your code). You will need to [`sign`](#signing-your-work) your commit.
 - Thanks in advance for your patience as we review your contributions; we do appreciate them!
 #### Pull Requests
 Developer workflow for code contributions is as follows:
 1. Developers must first [fork](https://help.github.com/en/articles/fork-a-repo) the [upstream](https://github.com/NVIDIA/nvidia-hpcg.git) NVIDIA-HPCG repository.
 2. Git clone the forked repository and push changes to the personal fork.
  ```bash
 git clone https://github.com/YOUR_USERNAME/YOUR_FORK.git NVIDIA-HPCG
 # Checkout the targeted branch and commit changes
 # Push the commits to a branch on the fork (remote).
 git push -u origin <local-branch>:<remote-branch>
  ```
 3. Once the code changes are staged on the fork and ready for review, a [Pull Request](https://help.github.com/en/articles/about-pull-requests) (PR) can be [requested](https://help.github.com/en/articles/creating-a-pull-request) to merge the changes from a branch of the fork into a selected branch of upstream.
  * Exercise caution when selecting the source and target branches for the PR.
  * Creating a PR kicks off the code review process.
  * At least one NVIDIA-HPCG engineer will be assigned for the review.
  * While under review, mark your PRs as work-in-progress by prefixing the PR title with [WIP].
 4. Since there is no CI/CD process in place yet, the PR will be accepted and the corresponding issue closed only after adequate testing has been completed, manually, by the developer and/or NVIDIA-HPCG engineer reviewing the code.
 #### Signing Your Work
 * We require that all contributors "sign-off" on their commits. This certifies that the contribution is your original work, or you have rights to submit it under the same license, or a compatible license.
  * Any contribution which contains commits that are not Signed-Off will not be accepted.
 * To sign off on a commit you simply use the `--signoff` (or `-s`) option when committing your changes:
  ```bash
  $ git commit -s -m "Add cool feature."
  ```
  This will append the following to your commit message:
  ```
  Signed-off-by: Your Name <your@email.com>
  ```
 * Full text of the DCO:
  ```
    Developer Certificate of Origin
    Version 1.1
    Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
    Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.
  ```
  ```
    Developer's Certificate of Origin 1.1
    By making a contribution to this project, I certify that:
    (a) The contribution was created in whole or in part by me and I have the right to submit it under the open source license indicated in the file; or
    (b) The contribution is based upon previous work that, to the best of my knowledge, is covered under an appropriate open source license and I have the right under that license to submit that work with modifications, whether created in whole or in part by me, under the same open source license (unless I am permitted to submit under a different license), as indicated in the file; or
    (c) The contribution was provided directly to me by some other person who certified (a), (b) or (c) and I have not modified it.
    (d) I understand and agree that this project and the contribution are public and that a record of the contribution (including all personal information I submit with it, including my sign-off) is maintained indefinitely and may be redistributed consistent with this project or the open source license(s) involved.
  ```
--- a/46
+++ b/46
@@ -0,0 +1,46 @@
 ======================================================================
 -- High Performance Conjugate Gradients (HPCG) Benchmark
    HPCG - 3.1 - March 28, 2019
    Michael A. Heroux
    Scalable Algorithms Group, Center for Computing Research
    Sandia National Laboratories, Albuquerque, NM
    Piotr Luszczek
    Jack Dongarra
    University of Tennessee, Knoxville
    Innovative Computing Laboratory
    (C) Copyright 2013-2019 All Rights Reserved
 -- Copyright notice and Licensing terms:
 Redistribution  and  use in  source and binary forms, with or without
 modification, are  permitted provided  that the following  conditions
 are met:
 1. Redistributions  of  source  code  must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce  the above copyright
 notice, this list of conditions,  and the following disclaimer in the
 documentation and/or other materials provided with the distribution.
 3. The name of the  University,  the name of the  Laboratory,  or the
 names  of  its  contributors  may  not  be used to endorse or promote
 products  derived   from   this  software  without  specific  written
 permission.
 -- Disclaimer:
 THIS  SOFTWARE  IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,  INCLUDING,  BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
 OR  CONTRIBUTORS  BE  LIABLE FOR ANY  DIRECT,  INDIRECT,  INCIDENTAL,
 SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES  (INCLUDING,  BUT NOT
 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 DATA OR PROFITS; OR BUSINESS INTERRUPTION)  HOWEVER CAUSED AND ON ANY
 THEORY OF LIABILITY, WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ======================================================================
--- a/63
+++ b/63
@@ -0,0 +1,63 @@
 ======================================================================
 Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
 All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ======================================================================
 ======================================================================
 -- High Performance Conjugate Gradients (HPCG) Benchmark
    HPCG - 3.1 - March 28, 2019
    Michael A. Heroux
    Scalable Algorithms Group, Center for Computing Research
    Sandia National Laboratories, Albuquerque, NM
    Piotr Luszczek
    Jack Dongarra
    University of Tennessee, Knoxville
    Innovative Computing Laboratory
    (C) Copyright 2013-2019 All Rights Reserved
 -- Copyright notice and Licensing terms:
 Redistribution  and  use in  source and binary forms, with or without
 modification, are  permitted provided  that the following  conditions
 are met:
 1. Redistributions  of  source  code  must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce  the above copyright
 notice, this list of conditions,  and the following disclaimer in the
 documentation and/or other materials provided with the distribution.
 3. The name of the  University,  the name of the  Laboratory,  or the
 names  of  its  contributors  may  not  be used to endorse or promote
 products  derived   from   this  software  without  specific  written
 permission.
 -- Disclaimer:
 THIS  SOFTWARE  IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,  INCLUDING,  BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
 OR  CONTRIBUTORS  BE  LIABLE FOR ANY  DIRECT,  INDIRECT,  INCIDENTAL,
 SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES  (INCLUDING,  BUT NOT
 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 DATA OR PROFITS; OR BUSINESS INTERRUPTION)  HOWEVER CAUSED AND ON ANY
 THEORY OF LIABILITY, WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ======================================================================
--- a/DASP/LICENSE
+++ b/DASP/LICENSE
@@ -0,0 +1,661 @@
                    GNU AFFERO GENERAL PUBLIC LICENSE
                       Version 3, 19 November 2007
 Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.
                            Preamble
  The GNU Affero General Public License is a free, copyleft license for
 software and other kinds of works, specifically designed to ensure
 cooperation with the community in the case of network server software.
  The licenses for most software and other practical works are designed
 to take away your freedom to share and change the works.  By contrast,
 our General Public Licenses are intended to guarantee your freedom to
 share and change all versions of a program--to make sure it remains free
 software for all its users.
  When we speak of free software, we are referring to freedom, not
 price.  Our General Public Licenses are designed to make sure that you
 have the freedom to distribute copies of free software (and charge for
 them if you wish), that you receive source code or can get it if you
 want it, that you can change the software or use pieces of it in new
 free programs, and that you know you can do these things.
  Developers that use our General Public Licenses protect your rights
 with two steps: (1) assert copyright on the software, and (2) offer
 you this License which gives you legal permission to copy, distribute
 and/or modify the software.
  A secondary benefit of defending all users' freedom is that
 improvements made in alternate versions of the program, if they
 receive widespread use, become available for other developers to
 incorporate.  Many developers of free software are heartened and
 encouraged by the resulting cooperation.  However, in the case of
 software used on network servers, this result may fail to come about.
 The GNU General Public License permits making a modified version and
 letting the public access it on a server without ever releasing its
 source code to the public.
  The GNU Affero General Public License is designed specifically to
 ensure that, in such cases, the modified source code becomes available
 to the community.  It requires the operator of a network server to
 provide the source code of the modified version running there to the
 users of that server.  Therefore, public use of a modified version, on
 a publicly accessible server, gives the public access to the source
 code of the modified version.
  An older license, called the Affero General Public License and
 published by Affero, was designed to accomplish similar goals.  This is
 a different license, not a version of the Affero GPL, but Affero has
 released a new version of the Affero GPL which permits relicensing under
 this license.
  The precise terms and conditions for copying, distribution and
 modification follow.
                       TERMS AND CONDITIONS
  0. Definitions.
  "This License" refers to version 3 of the GNU Affero General Public License.
  "Copyright" also means copyright-like laws that apply to other kinds of
 works, such as semiconductor masks.
  "The Program" refers to any copyrightable work licensed under this
 License.  Each licensee is addressed as "you".  "Licensees" and
 "recipients" may be individuals or organizations.
  To "modify" a work means to copy from or adapt all or part of the work
 in a fashion requiring copyright permission, other than the making of an
 exact copy.  The resulting work is called a "modified version" of the
 earlier work or a work "based on" the earlier work.
  A "covered work" means either the unmodified Program or a work based
 on the Program.
  To "propagate" a work means to do anything with it that, without
 permission, would make you directly or secondarily liable for
 infringement under applicable copyright law, except executing it on a
 computer or modifying a private copy.  Propagation includes copying,
 distribution (with or without modification), making available to the
 public, and in some countries other activities as well.
  To "convey" a work means any kind of propagation that enables other
 parties to make or receive copies.  Mere interaction with a user through
 a computer network, with no transfer of a copy, is not conveying.
  An interactive user interface displays "Appropriate Legal Notices"
 to the extent that it includes a convenient and prominently visible
 feature that (1) displays an appropriate copyright notice, and (2)
 tells the user that there is no warranty for the work (except to the
 extent that warranties are provided), that licensees may convey the
 work under this License, and how to view a copy of this License.  If
 the interface presents a list of user commands or options, such as a
 menu, a prominent item in the list meets this criterion.
  1. Source Code.
  The "source code" for a work means the preferred form of the work
 for making modifications to it.  "Object code" means any non-source
 form of a work.
  A "Standard Interface" means an interface that either is an official
 standard defined by a recognized standards body, or, in the case of
 interfaces specified for a particular programming language, one that
 is widely used among developers working in that language.
  The "System Libraries" of an executable work include anything, other
 than the work as a whole, that (a) is included in the normal form of
 packaging a Major Component, but which is not part of that Major
 Component, and (b) serves only to enable use of the work with that
 Major Component, or to implement a Standard Interface for which an
 implementation is available to the public in source code form.  A
 "Major Component", in this context, means a major essential component
 (kernel, window system, and so on) of the specific operating system
 (if any) on which the executable work runs, or a compiler used to
 produce the work, or an object code interpreter used to run it.
  The "Corresponding Source" for a work in object code form means all
 the source code needed to generate, install, and (for an executable
 work) run the object code and to modify the work, including scripts to
 control those activities.  However, it does not include the work's
 System Libraries, or general-purpose tools or generally available free
 programs which are used unmodified in performing those activities but
 which are not part of the work.  For example, Corresponding Source
 includes interface definition files associated with source files for
 the work, and the source code for shared libraries and dynamically
 linked subprograms that the work is specifically designed to require,
 such as by intimate data communication or control flow between those
 subprograms and other parts of the work.
  The Corresponding Source need not include anything that users
 can regenerate automatically from other parts of the Corresponding
 Source.
  The Corresponding Source for a work in source code form is that
 same work.
  2. Basic Permissions.
  All rights granted under this License are granted for the term of
 copyright on the Program, and are irrevocable provided the stated
 conditions are met.  This License explicitly affirms your unlimited
 permission to run the unmodified Program.  The output from running a
 covered work is covered by this License only if the output, given its
 content, constitutes a covered work.  This License acknowledges your
 rights of fair use or other equivalent, as provided by copyright law.
  You may make, run and propagate covered works that you do not
 convey, without conditions so long as your license otherwise remains
 in force.  You may convey covered works to others for the sole purpose
 of having them make modifications exclusively for you, or provide you
 with facilities for running those works, provided that you comply with
 the terms of this License in conveying all material for which you do
 not control copyright.  Those thus making or running the covered works
 for you must do so exclusively on your behalf, under your direction
 and control, on terms that prohibit them from making any copies of
 your copyrighted material outside their relationship with you.
  Conveying under any other circumstances is permitted solely under
 the conditions stated below.  Sublicensing is not allowed; section 10
 makes it unnecessary.
  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
  No covered work shall be deemed part of an effective technological
 measure under any applicable law fulfilling obligations under article
 11 of the WIPO copyright treaty adopted on 20 December 1996, or
 similar laws prohibiting or restricting circumvention of such
 measures.
  When you convey a covered work, you waive any legal power to forbid
 circumvention of technological measures to the extent such circumvention
 is effected by exercising rights under this License with respect to
 the covered work, and you disclaim any intention to limit operation or
 modification of the work as a means of enforcing, against the work's
 users, your or third parties' legal rights to forbid circumvention of
 technological measures.
  4. Conveying Verbatim Copies.
  You may convey verbatim copies of the Program's source code as you
 receive it, in any medium, provided that you conspicuously and
 appropriately publish on each copy an appropriate copyright notice;
 keep intact all notices stating that this License and any
 non-permissive terms added in accord with section 7 apply to the code;
 keep intact all notices of the absence of any warranty; and give all
 recipients a copy of this License along with the Program.
  You may charge any price or no price for each copy that you convey,
 and you may offer support or warranty protection for a fee.
  5. Conveying Modified Source Versions.
  You may convey a work based on the Program, or the modifications to
 produce it from the Program, in the form of source code under the
 terms of section 4, provided that you also meet all of these conditions:
    a) The work must carry prominent notices stating that you modified
    it, and giving a relevant date.
    b) The work must carry prominent notices stating that it is
    released under this License and any conditions added under section
    7.  This requirement modifies the requirement in section 4 to
    "keep intact all notices".
    c) You must license the entire work, as a whole, under this
    License to anyone who comes into possession of a copy.  This
    License will therefore apply, along with any applicable section 7
    additional terms, to the whole of the work, and all its parts,
    regardless of how they are packaged.  This License gives no
    permission to license the work in any other way, but it does not
    invalidate such permission if you have separately received it.
    d) If the work has interactive user interfaces, each must display
    Appropriate Legal Notices; however, if the Program has interactive
    interfaces that do not display Appropriate Legal Notices, your
    work need not make them do so.
  A compilation of a covered work with other separate and independent
 works, which are not by their nature extensions of the covered work,
 and which are not combined with it such as to form a larger program,
 in or on a volume of a storage or distribution medium, is called an
 "aggregate" if the compilation and its resulting copyright are not
 used to limit the access or legal rights of the compilation's users
 beyond what the individual works permit.  Inclusion of a covered work
 in an aggregate does not cause this License to apply to the other
 parts of the aggregate.
  6. Conveying Non-Source Forms.
  You may convey a covered work in object code form under the terms
 of sections 4 and 5, provided that you also convey the
 machine-readable Corresponding Source under the terms of this License,
 in one of these ways:
    a) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by the
    Corresponding Source fixed on a durable physical medium
    customarily used for software interchange.
    b) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by a
    written offer, valid for at least three years and valid for as
    long as you offer spare parts or customer support for that product
    model, to give anyone who possesses the object code either (1) a
    copy of the Corresponding Source for all the software in the
    product that is covered by this License, on a durable physical
    medium customarily used for software interchange, for a price no
    more than your reasonable cost of physically performing this
    conveying of source, or (2) access to copy the
    Corresponding Source from a network server at no charge.
    c) Convey individual copies of the object code with a copy of the
    written offer to provide the Corresponding Source.  This
    alternative is allowed only occasionally and noncommercially, and
    only if you received the object code with such an offer, in accord
    with subsection 6b.
    d) Convey the object code by offering access from a designated
    place (gratis or for a charge), and offer equivalent access to the
    Corresponding Source in the same way through the same place at no
    further charge.  You need not require recipients to copy the
    Corresponding Source along with the object code.  If the place to
    copy the object code is a network server, the Corresponding Source
    may be on a different server (operated by you or a third party)
    that supports equivalent copying facilities, provided you maintain
    clear directions next to the object code saying where to find the
    Corresponding Source.  Regardless of what server hosts the
    Corresponding Source, you remain obligated to ensure that it is
    available for as long as needed to satisfy these requirements.
    e) Convey the object code using peer-to-peer transmission, provided
    you inform other peers where the object code and Corresponding
    Source of the work are being offered to the general public at no
    charge under subsection 6d.
  A separable portion of the object code, whose source code is excluded
 from the Corresponding Source as a System Library, need not be
 included in conveying the object code work.
  A "User Product" is either (1) a "consumer product", which means any
 tangible personal property which is normally used for personal, family,
 or household purposes, or (2) anything designed or sold for incorporation
 into a dwelling.  In determining whether a product is a consumer product,
 doubtful cases shall be resolved in favor of coverage.  For a particular
 product received by a particular user, "normally used" refers to a
 typical or common use of that class of product, regardless of the status
 of the particular user or of the way in which the particular user
 actually uses, or expects or is expected to use, the product.  A product
 is a consumer product regardless of whether the product has substantial
 commercial, industrial or non-consumer uses, unless such uses represent
 the only significant mode of use of the product.
  "Installation Information" for a User Product means any methods,
 procedures, authorization keys, or other information required to install
 and execute modified versions of a covered work in that User Product from
 a modified version of its Corresponding Source.  The information must
 suffice to ensure that the continued functioning of the modified object
 code is in no case prevented or interfered with solely because
 modification has been made.
  If you convey an object code work under this section in, or with, or
 specifically for use in, a User Product, and the conveying occurs as
 part of a transaction in which the right of possession and use of the
 User Product is transferred to the recipient in perpetuity or for a
 fixed term (regardless of how the transaction is characterized), the
 Corresponding Source conveyed under this section must be accompanied
 by the Installation Information.  But this requirement does not apply
 if neither you nor any third party retains the ability to install
 modified object code on the User Product (for example, the work has
 been installed in ROM).
  The requirement to provide Installation Information does not include a
 requirement to continue to provide support service, warranty, or updates
 for a work that has been modified or installed by the recipient, or for
 the User Product in which it has been modified or installed.  Access to a
 network may be denied when the modification itself materially and
 adversely affects the operation of the network or violates the rules and
 protocols for communication across the network.
  Corresponding Source conveyed, and Installation Information provided,
 in accord with this section must be in a format that is publicly
 documented (and with an implementation available to the public in
 source code form), and must require no special password or key for
 unpacking, reading or copying.
  7. Additional Terms.
  "Additional permissions" are terms that supplement the terms of this
 License by making exceptions from one or more of its conditions.
 Additional permissions that are applicable to the entire Program shall
 be treated as though they were included in this License, to the extent
 that they are valid under applicable law.  If additional permissions
 apply only to part of the Program, that part may be used separately
 under those permissions, but the entire Program remains governed by
 this License without regard to the additional permissions.
  When you convey a copy of a covered work, you may at your option
 remove any additional permissions from that copy, or from any part of
 it.  (Additional permissions may be written to require their own
 removal in certain cases when you modify the work.)  You may place
 additional permissions on material, added by you to a covered work,
 for which you have or can give appropriate copyright permission.
  Notwithstanding any other provision of this License, for material you
 add to a covered work, you may (if authorized by the copyright holders of
 that material) supplement the terms of this License with terms:
    a) Disclaiming warranty or limiting liability differently from the
    terms of sections 15 and 16 of this License; or
    b) Requiring preservation of specified reasonable legal notices or
    author attributions in that material or in the Appropriate Legal
    Notices displayed by works containing it; or
    c) Prohibiting misrepresentation of the origin of that material, or
    requiring that modified versions of such material be marked in
    reasonable ways as different from the original version; or
    d) Limiting the use for publicity purposes of names of licensors or
    authors of the material; or
    e) Declining to grant rights under trademark law for use of some
    trade names, trademarks, or service marks; or
    f) Requiring indemnification of licensors and authors of that
    material by anyone who conveys the material (or modified versions of
    it) with contractual assumptions of liability to the recipient, for
    any liability that these contractual assumptions directly impose on
    those licensors and authors.
  All other non-permissive additional terms are considered "further
 restrictions" within the meaning of section 10.  If the Program as you
 received it, or any part of it, contains a notice stating that it is
 governed by this License along with a term that is a further
 restriction, you may remove that term.  If a license document contains
 a further restriction but permits relicensing or conveying under this
 License, you may add to a covered work material governed by the terms
 of that license document, provided that the further restriction does
 not survive such relicensing or conveying.
  If you add terms to a covered work in accord with this section, you
 must place, in the relevant source files, a statement of the
 additional terms that apply to those files, or a notice indicating
 where to find the applicable terms.
  Additional terms, permissive or non-permissive, may be stated in the
 form of a separately written license, or stated as exceptions;
 the above requirements apply either way.
  8. Termination.
  You may not propagate or modify a covered work except as expressly
 provided under this License.  Any attempt otherwise to propagate or
 modify it is void, and will automatically terminate your rights under
 this License (including any patent licenses granted under the third
 paragraph of section 11).
  However, if you cease all violation of this License, then your
 license from a particular copyright holder is reinstated (a)
 provisionally, unless and until the copyright holder explicitly and
 finally terminates your license, and (b) permanently, if the copyright
 holder fails to notify you of the violation by some reasonable means
 prior to 60 days after the cessation.
  Moreover, your license from a particular copyright holder is
 reinstated permanently if the copyright holder notifies you of the
 violation by some reasonable means, this is the first time you have
 received notice of violation of this License (for any work) from that
 copyright holder, and you cure the violation prior to 30 days after
 your receipt of the notice.
  Termination of your rights under this section does not terminate the
 licenses of parties who have received copies or rights from you under
 this License.  If your rights have been terminated and not permanently
 reinstated, you do not qualify to receive new licenses for the same
 material under section 10.
  9. Acceptance Not Required for Having Copies.
  You are not required to accept this License in order to receive or
 run a copy of the Program.  Ancillary propagation of a covered work
 occurring solely as a consequence of using peer-to-peer transmission
 to receive a copy likewise does not require acceptance.  However,
 nothing other than this License grants you permission to propagate or
 modify any covered work.  These actions infringe copyright if you do
 not accept this License.  Therefore, by modifying or propagating a
 covered work, you indicate your acceptance of this License to do so.
  10. Automatic Licensing of Downstream Recipients.
  Each time you convey a covered work, the recipient automatically
 receives a license from the original licensors, to run, modify and
 propagate that work, subject to this License.  You are not responsible
 for enforcing compliance by third parties with this License.
  An "entity transaction" is a transaction transferring control of an
 organization, or substantially all assets of one, or subdividing an
 organization, or merging organizations.  If propagation of a covered
 work results from an entity transaction, each party to that
 transaction who receives a copy of the work also receives whatever
 licenses to the work the party's predecessor in interest had or could
 give under the previous paragraph, plus a right to possession of the
 Corresponding Source of the work from the predecessor in interest, if
 the predecessor has it or can get it with reasonable efforts.
  You may not impose any further restrictions on the exercise of the
 rights granted or affirmed under this License.  For example, you may
 not impose a license fee, royalty, or other charge for exercise of
 rights granted under this License, and you may not initiate litigation
 (including a cross-claim or counterclaim in a lawsuit) alleging that
 any patent claim is infringed by making, using, selling, offering for
 sale, or importing the Program or any portion of it.
  11. Patents.
  A "contributor" is a copyright holder who authorizes use under this
 License of the Program or a work on which the Program is based.  The
 work thus licensed is called the contributor's "contributor version".
  A contributor's "essential patent claims" are all patent claims
 owned or controlled by the contributor, whether already acquired or
 hereafter acquired, that would be infringed by some manner, permitted
 by this License, of making, using, or selling its contributor version,
 but do not include claims that would be infringed only as a
 consequence of further modification of the contributor version.  For
 purposes of this definition, "control" includes the right to grant
 patent sublicenses in a manner consistent with the requirements of
 this License.
  Each contributor grants you a non-exclusive, worldwide, royalty-free
 patent license under the contributor's essential patent claims, to
 make, use, sell, offer for sale, import and otherwise run, modify and
 propagate the contents of its contributor version.
  In the following three paragraphs, a "patent license" is any express
 agreement or commitment, however denominated, not to enforce a patent
 (such as an express permission to practice a patent or covenant not to
 sue for patent infringement).  To "grant" such a patent license to a
 party means to make such an agreement or commitment not to enforce a
 patent against the party.
  If you convey a covered work, knowingly relying on a patent license,
 and the Corresponding Source of the work is not available for anyone
 to copy, free of charge and under the terms of this License, through a
 publicly available network server or other readily accessible means,
 then you must either (1) cause the Corresponding Source to be so
 available, or (2) arrange to deprive yourself of the benefit of the
 patent license for this particular work, or (3) arrange, in a manner
 consistent with the requirements of this License, to extend the patent
 license to downstream recipients.  "Knowingly relying" means you have
 actual knowledge that, but for the patent license, your conveying the
 covered work in a country, or your recipient's use of the covered work
 in a country, would infringe one or more identifiable patents in that
 country that you have reason to believe are valid.
  If, pursuant to or in connection with a single transaction or
 arrangement, you convey, or propagate by procuring conveyance of, a
 covered work, and grant a patent license to some of the parties
 receiving the covered work authorizing them to use, propagate, modify
 or convey a specific copy of the covered work, then the patent license
 you grant is automatically extended to all recipients of the covered
 work and works based on it.
  A patent license is "discriminatory" if it does not include within
 the scope of its coverage, prohibits the exercise of, or is
 conditioned on the non-exercise of one or more of the rights that are
 specifically granted under this License.  You may not convey a covered
 work if you are a party to an arrangement with a third party that is
 in the business of distributing software, under which you make payment
 to the third party based on the extent of your activity of conveying
 the work, and under which the third party grants, to any of the
 parties who would receive the covered work from you, a discriminatory
 patent license (a) in connection with copies of the covered work
 conveyed by you (or copies made from those copies), or (b) primarily
 for and in connection with specific products or compilations that
 contain the covered work, unless you entered into that arrangement,
 or that patent license was granted, prior to 28 March 2007.
  Nothing in this License shall be construed as excluding or limiting
 any implied license or other defenses to infringement that may
 otherwise be available to you under applicable patent law.
  12. No Surrender of Others' Freedom.
  If conditions are imposed on you (whether by court order, agreement or
 otherwise) that contradict the conditions of this License, they do not
 excuse you from the conditions of this License.  If you cannot convey a
 covered work so as to satisfy simultaneously your obligations under this
 License and any other pertinent obligations, then as a consequence you may
 not convey it at all.  For example, if you agree to terms that obligate you
 to collect a royalty for further conveying from those to whom you convey
 the Program, the only way you could satisfy both those terms and this
 License would be to refrain entirely from conveying the Program.
  13. Remote Network Interaction; Use with the GNU General Public License.
  Notwithstanding any other provision of this License, if you modify the
 Program, your modified version must prominently offer all users
 interacting with it remotely through a computer network (if your version
 supports such interaction) an opportunity to receive the Corresponding
 Source of your version by providing access to the Corresponding Source
 from a network server at no charge, through some standard or customary
 means of facilitating copying of software.  This Corresponding Source
 shall include the Corresponding Source for any work covered by version 3
 of the GNU General Public License that is incorporated pursuant to the
 following paragraph.
  Notwithstanding any other provision of this License, you have
 permission to link or combine any covered work with a work licensed
 under version 3 of the GNU General Public License into a single
 combined work, and to convey the resulting work.  The terms of this
 License will continue to apply to the part which is the covered work,
 but the work with which it is combined will remain governed by version
 3 of the GNU General Public License.
  14. Revised Versions of this License.
  The Free Software Foundation may publish revised and/or new versions of
 the GNU Affero General Public License from time to time.  Such new versions
 will be similar in spirit to the present version, but may differ in detail to
 address new problems or concerns.
  Each version is given a distinguishing version number.  If the
 Program specifies that a certain numbered version of the GNU Affero General
 Public License "or any later version" applies to it, you have the
 option of following the terms and conditions either of that numbered
 version or of any later version published by the Free Software
 Foundation.  If the Program does not specify a version number of the
 GNU Affero General Public License, you may choose any version ever published
 by the Free Software Foundation.
  If the Program specifies that a proxy can decide which future
 versions of the GNU Affero General Public License can be used, that proxy's
 public statement of acceptance of a version permanently authorizes you
 to choose that version for the Program.
  Later license versions may give you additional or different
 permissions.  However, no additional obligations are imposed on any
 author or copyright holder as a result of your choosing to follow a
 later version.
  15. Disclaimer of Warranty.
  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
 APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
 HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
 OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
 THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
 IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
 ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
  16. Limitation of Liability.
  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
 WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
 THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
 GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
 USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
 DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
 PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
 EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
 SUCH DAMAGES.
  17. Interpretation of Sections 15 and 16.
  If the disclaimer of warranty and limitation of liability provided
 above cannot be given local legal effect according to their terms,
 reviewing courts shall apply local law that most closely approximates
 an absolute waiver of all civil liability in connection with the
 Program, unless a warranty or assumption of liability accompanies a
 copy of the Program in return for a fee.
                     END OF TERMS AND CONDITIONS
            How to Apply These Terms to Your New Programs
  If you develop a new program, and you want it to be of the greatest
 possible use to the public, the best way to achieve this is to make it
 free software which everyone can redistribute and change under these terms.
  To do so, attach the following notices to the program.  It is safest
 to attach them to the start of each source file to most effectively
 state the exclusion of warranty; and each file should have at least
 the "copyright" line and a pointer to where the full notice is found.
    <one line to give the program's name and a brief idea of what it does.>
    Copyright (C) <year>  <name of author>
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as published
    by the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.
    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
 Also add information on how to contact you by electronic and paper mail.
  If your software can interact with users remotely through a computer
 network, you should also make sure that it provides a way for users to
 get its source.  For example, if your program is a web application, its
 interface could display a "Source" link that leads users to an archive
 of the code.  There are many ways you could offer source, and different
 solutions will be better for different programs; see section 13 for the
 specific requirements.
  You should also get your employer (if you work as a programmer) or school,
 if any, to sign a "copyright disclaimer" for the program, if necessary.
 For more information on this, and how to apply and follow the GNU AGPL, see
 <https://www.gnu.org/licenses/>.
--- a/DASP/Makefile
+++ b/DASP/Makefile
@@ -0,0 +1,24 @@
 #compilers
 CC=/usr/local/cuda-12.0/bin/nvcc
 NVCC_FLAGS = -O3 -ccbin /usr/local/gcc-12.2/bin -m64 -gencode arch=compute_80,code=sm_80
 # #ENVIRONMENT_PARAMETERS
 # CUDA_INSTALL_PATH = /usr/local/cuda-12.0
 CUDA_LIBS = -lcusparse -lcublas
 # NOTE: -lineinfo is an nvcc option (source line info for profiling), not a library;
 # it is kept here so that it is still passed on every nvcc command line.
 LIBS =  -lineinfo $(CUDA_LIBS)
 #options
 OPTIONS = -Xcompiler -fopenmp-simd
 # None of these targets is the name of a file the recipe creates, so mark them
 # phony: a stray file called "double", "half" or "clean" must not disable them.
 .PHONY: double half clean
 # Build the double-precision benchmark (defines f64 so MAT_VAL_TYPE is double).
 double:
 	$(CC) $(NVCC_FLAGS) src/main_f64.cu -o spmv_double  -D f64 $(OPTIONS) $(LIBS) 
 # Build the half-precision benchmark.
 half:
 	$(CC) $(NVCC_FLAGS) src/main_f16.cu -o spmv_half $(OPTIONS) $(LIBS) 
 # Remove the binaries and every CSV in data/; -f keeps "make clean" from
 # failing when there is no CSV file to delete.
 clean:
 	rm -rf spmv_double
 	rm -rf spmv_half
 	rm -f data/*.csv
--- a/DASP/README.md
+++ b/DASP/README.md
@@ -0,0 +1,43 @@
 # DASP
 Specific Dense Matrix Multiply-Accumulate Units Accelerated General Sparse Matrix-Vector Multiplication
 ## Paper
 This is the code of our paper published at SC '23:
 Yuechen Lu and Weifeng Liu. 2023. DASP: Specific Dense Matrix Multiply-Accumulate Units Accelerated General Sparse Matrix-Vector Multiplication. In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (SC '23). Association for Computing Machinery, New York, NY, USA, Article 73, 1–14. https://doi.org/10.1145/3581784.3607051
 ## Introduction
 Sparse matrix-vector multiplication (SpMV) plays a key role in computational science and engineering, graph processing and machine learning applications. In this work, we propose DASP, a new algorithm using specific dense MMA units for accelerating the compute part of general SpMV. We analyze the row-wise distribution of nonzeros and group the rows into three categories containing long, medium and short rows, respectively. We then organize them into small blocks of proper sizes to meet the requirement of MMA computation. For the three categories, DASP offers different strategies to complete SpMV by efficiently utilizing the MMA units.
 ## Installation
 To better reproduce experiment results, we suggest an NVIDIA GPU with compute capability 8.0. DASP evaluation requires the CUDA GPU driver, the nvcc CUDA compiler, and the cuSPARSE library, all of which are included with the CUDA Toolkit.
 ## Execution
 Our test programs currently support input files encoded using the matrix market format. All matrix market datasets used in this evaluation are publicly available from the SuiteSparse Matrix Collection.
 1. The command 'make xxx' generates an executable file.
 `make double`
 `make half`
 2. Run code on matrix data. Running the program requires one parameter: matrix path.
 `./spmv_double matrix.mtx`
 3. Example
 `cd test`
 `sh run_double.sh`
 ## Contact us
 If you have any questions about running the code, please contact Yuechen Lu.
 E-mail: yuechenlu@student.cup.edu.cn
--- a/DASP/data/record.csv
+++ b/DASP/data/record.csv
@@ -0,0 +1,2 @@
 #
--- a/DASP/src/common.h
+++ b/DASP/src/common.h
@@ -0,0 +1,60 @@
 #include <cuda.h>
 #include <cuda_fp16.h>
 #include <mma.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>
 #include <sys/time.h>
 #include <math.h>
 // #include <helper_cuda.h>
 // #include <helper_functions.h>
 #include <cusparse.h>
 #include <cublas_v2.h>
 #include "omp.h"
 #include "mmio_highlevel.h"
 // Element type of matrix and vector values: double when the translation unit
 // is compiled with -D f64, half otherwise.
 #ifdef f64
 #define MAT_VAL_TYPE double
 #else
 #define MAT_VAL_TYPE half
 #endif
 // NOTE(review): WARP_SIZE, BlockSize and MMA_M/MMA_N/MMA_K look like the warp
 // width and the MMA tile shape used by the kernels — confirm against dasp_*.h.
 #define WARP_SIZE 32
 #define BlockSize 8
 #define MMA_M 8
 #define MMA_N 8
 #define MMA_K 4
 // Integer types used for row pointers and (re-ordered) column indices.
 #define MAT_PTR_TYPE int
 #define NEW_CID_TYPE int
 // Bit-field helpers for values packed into one integer word.
 // Every macro argument is parenthesised so that compound arguments such as
 // `a | b` or `i + 1` are evaluated as a unit.  The SET_* macros are single
 // expressions (use them as `SET_8_BIT(word, v, i);`), so they stay correct
 // inside an unbraced if/else.  `dst` and `index` are evaluated more than once:
 // do not pass expressions with side effects.
 //
 // GET_BIT_REST: x with its two most significant bits (of 32) cleared.
 #define GET_BIT_REST(x)  ((unsigned int)((x) << 2) >> 2)
 // SET_n_BIT: overwrite the index-th n-bit field of dst with src.
 // src is not masked; the caller must make sure it fits in n bits.
 #define SET_16_BIT(dst, src, index)  \
    ((dst) = ((dst) & ~(0xffff << ((index) << 4))) | ((src) << ((index) << 4)))
 #define SET_8_BIT(dst, src, index)  \
    ((dst) = ((dst) & ~(0xff << ((index) << 3))) | ((src) << ((index) << 3)))
 #define SET_4_BIT(dst, src, index) \
    ((dst) = ((dst) & ~(0xf << ((index) << 2))) | ((src) << ((index) << 2)))
 // SET_2_BIT: OR src into bits 30..31 of dst (existing bits are not cleared).
 #define SET_2_BIT(dst, src) ((dst) |= (src) << 30)
 // GET_n_BIT: read the index-th n-bit field of src.
 #define GET_16_BIT(src, index) (((src) >> ((index) << 4)) & 0xffff)
 #define GET_8_BIT(src, index) (((src) >> ((index) << 3)) & 0xff)
 #define GET_4_BIT(src, index) (((src) >> ((index) << 2)) & 0xf)
 // GET_2_BIT: read bits 30..31 of src.
 #define GET_2_BIT(src) (((src) >> 30) & 0b11)
 // NOTE(review): presumably a size threshold for switching on OpenMP — confirm at the use sites.
 #define omp_valve 1e4
--- a/DASP/src/dasp_f16.h
+++ b/DASP/src/dasp_f16.h
--- a/DASP/src/dasp_f64.h
+++ b/DASP/src/dasp_f64.h
--- a/DASP/src/main_f16.cu
+++ b/DASP/src/main_f16.cu
@@ -0,0 +1,164 @@
 #include "dasp_f16.h"
 // Compare the re-ordered result vector against the reference vector.
 //
 // cusp_val   reference values, indexed in the original row order
 // cuda_val   values under test; entry i corresponds to cusp_val[new_order[i]]
 // new_order  for every position i of cuda_val, the matching index in cusp_val
 // length     number of entries to compare
 //
 // Returns 0 when every pair differs by at most 1 (absolute), -1 at the first
 // mismatch (which is also printed).
 int verify_new(MAT_VAL_TYPE *cusp_val, MAT_VAL_TYPE *cuda_val, int *new_order, int length)
 {
    for (int i = 0; i < length; i ++)
    {
        int cusp_idx = new_order[i];
        // widen to float so the comparison and the printf below work on a builtin type
        float temp_cusp_val = cusp_val[cusp_idx];
        float temp_cuda_val = cuda_val[i];
        // Identical values always agree; this also accepts two equal infinities,
        // whose difference would otherwise be NaN.
        if (temp_cusp_val == temp_cuda_val)
            continue;
        // Written as !(diff <= tol) rather than (diff > tol) so that a NaN on
        // either side is reported as a mismatch instead of slipping through.
        if (!(fabs(temp_cusp_val - temp_cuda_val) <= 1))
        {
            printf("error in (%d), cusp(%4.2f), cuda(%4.2f),please check your code!\n", i, temp_cusp_val, temp_cuda_val); 
            return -1;
        }
    }
    printf("Y(%d), compute succeed!\n", length);
    return 0;
 }
 // Run the cuSPARSE CSR SpMV (y = A * x) as the reference implementation and
 // report its timing.
 //
 // cu_ValA / cu_RowPtrA / cu_ColIdxA  host CSR arrays of A (nnzA values, rowA + 1 row pointers)
 // cu_ValX                            host input vector, colA entries
 // cu_ValY                            host output vector, rowA entries (written)
 // data_origin1 / data_origin2        byte counts used for the two bandwidth figures
 // pre_time       out: wall time in ms of handle/descriptor creation, the buffer
 //                size query and the buffer allocation
 // cu_time        out: average wall time in ms of one SpMV over the 1000 timed runs
 // cu_gflops      out: 2 * nnzA flops divided by cu_time, in Gflop/s
 // cu_bandwidth1 / cu_bandwidth2  out: data_origin1 / data_origin2 divided by cu_time, in GB/s
 //
 // Return codes of the CUDA and cuSPARSE calls are not checked.
 __host__
 void cusparse_spmv_all(MAT_VAL_TYPE *cu_ValA, MAT_PTR_TYPE *cu_RowPtrA, int *cu_ColIdxA, 
                       MAT_VAL_TYPE *cu_ValX, MAT_VAL_TYPE *cu_ValY, int rowA, int colA, MAT_PTR_TYPE nnzA,
                       long long int data_origin1, long long int data_origin2, double *pre_time, double *cu_time, double *cu_gflops, double *cu_bandwidth1, double *cu_bandwidth2)
 {
    struct timeval t1, t2;
    MAT_VAL_TYPE *dA_val, *dX, *dY;
    int *dA_cid;
    MAT_PTR_TYPE *dA_rpt;
    // scalars are float because the SpMV compute type below is CUDA_R_32F
    float alpha = 1.0, beta = 0.0;
    // copy the CSR matrix and x to the device
    cudaMalloc((void **)&dA_val, sizeof(MAT_VAL_TYPE) * nnzA);
    cudaMalloc((void **)&dA_cid, sizeof(int) * nnzA);
    cudaMalloc((void **)&dA_rpt, sizeof(MAT_PTR_TYPE) * (rowA + 1));
    cudaMalloc((void **)&dX, sizeof(MAT_VAL_TYPE) * colA);
    cudaMalloc((void **)&dY, sizeof(MAT_VAL_TYPE) * rowA);
    cudaMemcpy(dA_val, cu_ValA, sizeof(MAT_VAL_TYPE) * nnzA, cudaMemcpyHostToDevice);
    cudaMemcpy(dA_cid, cu_ColIdxA, sizeof(int) * nnzA, cudaMemcpyHostToDevice);
    cudaMemcpy(dA_rpt, cu_RowPtrA, sizeof(MAT_PTR_TYPE) * (rowA + 1), cudaMemcpyHostToDevice);
    cudaMemcpy(dX, cu_ValX, sizeof(MAT_VAL_TYPE) * colA, cudaMemcpyHostToDevice);
    // cudaMemset(dY, 0.0, sizeof(MAT_VAL_TYPE) * rowA);
    cusparseHandle_t     handle = NULL;
    cusparseSpMatDescr_t matA;
    cusparseDnVecDescr_t vecX, vecY;
    void*                dBuffer = NULL;
    size_t               bufferSize = 0;
    // preprocessing: everything cuSPARSE needs before the first SpMV
    gettimeofday(&t1, NULL);
    cusparseCreate(&handle);
    cusparseCreateCsr(&matA, rowA, colA, nnzA, dA_rpt, dA_cid, dA_val,
                        CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                        CUSPARSE_INDEX_BASE_ZERO, CUDA_R_16F);
    cusparseCreateDnVec(&vecX, colA, dX, CUDA_R_16F);
    cusparseCreateDnVec(&vecY, rowA, dY, CUDA_R_16F);
    cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                            &alpha, matA, vecX, &beta, vecY, CUDA_R_32F,
                            CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize);
    cudaMalloc(&dBuffer, bufferSize);
    gettimeofday(&t2, NULL);
    *pre_time = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0;
    // printf("cusparse preprocessing time: %8.4lf ms\n", *pre_time);
    // warm-up runs, not timed
    for (int i = 0; i < 100; ++i)
    {
        cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                    &alpha, matA, vecX, &beta, vecY, CUDA_R_32F,
                    CUSPARSE_SPMV_ALG_DEFAULT, dBuffer);
    }
    cudaDeviceSynchronize();
    // timed runs; the synchronize calls bracket the launches so the wall clock
    // covers the device work
    gettimeofday(&t1, NULL);
    for (int i = 0; i < 1000; ++i)
    {
        cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                    &alpha, matA, vecX, &beta, vecY, CUDA_R_32F,
                    CUSPARSE_SPMV_ALG_DEFAULT, dBuffer);
    }
    cudaDeviceSynchronize();
    gettimeofday(&t2, NULL);
    // total milliseconds divided by the 1000 iterations
    *cu_time = ((t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0) / 1000;
    *cu_gflops = (double)((long)nnzA * 2) / (*cu_time * 1e6);
    *cu_bandwidth1 = (double)data_origin1 / (*cu_time * 1e6); 
    *cu_bandwidth2 = (double)data_origin2 / (*cu_time * 1e6); 
    printf("cusparse:%8.4lf ms, %8.4lf Gflop/s, %9.4lf GB/s, %9.4lf GB/s\n", *cu_time, *cu_gflops, *cu_bandwidth1, *cu_bandwidth2);
    cusparseDestroySpMat(matA);
    cusparseDestroyDnVec(vecX);
    cusparseDestroyDnVec(vecY);
    cusparseDestroy(handle);
    cudaMemcpy(cu_ValY, dY, sizeof(MAT_VAL_TYPE) * rowA, cudaMemcpyDeviceToHost);
    // release the SpMV work buffer as well; it was previously leaked
    cudaFree(dBuffer);
    cudaFree(dA_val);
    cudaFree(dA_cid);
    cudaFree(dA_rpt);
    cudaFree(dX);
    cudaFree(dY);
 }
 // Half-precision benchmark driver.
 // Reads the Matrix Market file named by argv[1], runs the cuSPARSE reference
 // SpMV and the DASP SpMV on it, and appends one line with the cuSPARSE figures
 // to data/spmv_f16_record.csv.
 __host__
 int main(int argc, char **argv)
 {
    if (argc < 2)
    {
        printf("Run the code by './spmv_half matrix.mtx'.\n");
        return 0;
    }
    // struct timeval t1, t2;
    int rowA, colA;
    MAT_PTR_TYPE nnzA;
    int isSymmetricA;
    // float *csrValA_f32;
    MAT_VAL_TYPE *csrValA;
    int *csrColIdxA;
    MAT_PTR_TYPE *csrRowPtrA;
    char *filename;
    filename = argv[1];
    // int NUM = atoi(argv[2]);
    // int block_longest = atoi(argv[3]);
    int NUM = 4;
    int block_longest = 256;
    double threshold = 0.75;
    printf("\n===%s===\n\n", filename);
    mmio_allinone(&rowA, &colA, &nnzA, &isSymmetricA, &csrRowPtrA, &csrColIdxA, &csrValA, filename);
    MAT_VAL_TYPE *X_val = (MAT_VAL_TYPE *)malloc(sizeof(MAT_VAL_TYPE) * colA);
    // both x and the matrix values are overwritten by initVec
    initVec(X_val, colA);
    initVec(csrValA, nnzA);
    MAT_VAL_TYPE *dY_val = (MAT_VAL_TYPE *)malloc(sizeof(MAT_VAL_TYPE) * rowA);
    MAT_VAL_TYPE *Y_val = (MAT_VAL_TYPE *)malloc(sizeof(MAT_VAL_TYPE) * rowA);
    int *new_order = (int *)malloc(sizeof(int) * rowA);
    double pre_time = 0, cu_time = 0, cu_gflops = 0, cu_bandwidth1 = 0, cu_bandwidth2 = 0;
    // Byte counts used for the bandwidth figures.  The element counts are summed
    // in long long so that nnzA + colA + rowA cannot overflow int on large inputs.
    long long int data_origin1 = ((long long)nnzA + colA + rowA) * sizeof(MAT_VAL_TYPE) + (long long)nnzA * sizeof(int) + ((long long)rowA + 1) * sizeof(MAT_PTR_TYPE);
    long long int data_origin2 = ((long long)nnzA + nnzA + rowA) * sizeof(MAT_VAL_TYPE) + (long long)nnzA * sizeof(int) + ((long long)rowA + 1) * sizeof(MAT_PTR_TYPE);
    cusparse_spmv_all(csrValA, csrRowPtrA, csrColIdxA, X_val, dY_val, rowA, colA, nnzA, data_origin1, data_origin2, &pre_time, &cu_time, &cu_gflops, &cu_bandwidth1, &cu_bandwidth2);
    spmv_all(filename, csrValA, csrRowPtrA, csrColIdxA, X_val, Y_val, new_order, rowA, colA, nnzA, NUM, threshold, block_longest);
    FILE* fout;
    fout = fopen("data/spmv_f16_record.csv", "a");
    // fopen fails e.g. when the program is not started next to the data/ directory;
    // skip the record instead of passing NULL to fprintf
    if (fout != NULL)
    {
        fprintf(fout, "%lld,%lf,%lf,%lf,%lf,%lf\n", data_origin1, pre_time, cu_time, cu_gflops, cu_bandwidth1, cu_bandwidth2);
        fclose(fout);
    }
    else
    {
        printf("Could not open data/spmv_f16_record.csv, record not written.\n");
    }
    // int result = verify_new(dY_val, Y_val, new_order, rowA);
    free(X_val);
    free(Y_val);
    free(dY_val);
    free(csrColIdxA);
    free(csrRowPtrA);
    free(csrValA);
    free(new_order);
    return 0;
 }
--- a/DASP/src/main_f64.cu
+++ b/DASP/src/main_f64.cu
@@ -0,0 +1,168 @@
 #include "dasp_f64.h"
 // Compare the re-ordered result vector against the reference vector.
 //
 // cusp_val   reference values, indexed in the original row order
 // cuda_val   values under test; entry i corresponds to cusp_val[new_order[i]]
 // new_order  for every position i of cuda_val, the matching index in cusp_val
 // length     number of entries to compare
 //
 // Returns 0 when every pair differs by at most 1e-5 (absolute), -1 at the
 // first mismatch (which is also printed).
 int verify_new(MAT_VAL_TYPE *cusp_val, MAT_VAL_TYPE *cuda_val, int *new_order, int length)
 {
    for (int i = 0; i < length; i ++)
    {
        int cusp_idx = new_order[i];
        // Identical values always agree; this also accepts two equal infinities,
        // whose difference would otherwise be NaN.
        if (cusp_val[cusp_idx] == cuda_val[i])
            continue;
        // Written as !(diff <= tol) rather than (diff > tol) so that a NaN on
        // either side is reported as a mismatch instead of slipping through.
        if (!(fabs(cusp_val[cusp_idx] - cuda_val[i]) <= 1e-5))
        {
            printf("error in (%d), cusp(%4.2f), cuda(%4.2f),please check your code!\n", i, cusp_val[cusp_idx], cuda_val[i]);
            return -1;
        }
    }
    printf("Y(%d), compute succeed!\n", length);
    return 0;
 }
 // Run the cuSPARSE CSR SpMV (y = A * x) as the reference implementation and
 // report its timing.
 //
 // cu_ValA / cu_RowPtrA / cu_ColIdxA  host CSR arrays of A (nnzA values, rowA + 1 row pointers)
 // cu_ValX                            host input vector, colA entries
 // cu_ValY                            host output vector, rowA entries (written)
 // data_origin1 / data_origin2        byte counts used for the two bandwidth figures
 // cu_time        out: average wall time in ms of one SpMV over the 1000 timed runs
 // cu_gflops      out: 2 * nnzA flops divided by cu_time, in Gflop/s
 // cu_bandwidth1 / cu_bandwidth2  out: data_origin1 / data_origin2 divided by cu_time, in GB/s
 // cu_pre         out: wall time in ms of handle/descriptor creation, the buffer
 //                size query and the buffer allocation
 //
 // Return codes of the CUDA and cuSPARSE calls are not checked.
 __host__
 void cusparse_spmv_all(MAT_VAL_TYPE *cu_ValA, MAT_PTR_TYPE *cu_RowPtrA, int *cu_ColIdxA, 
                       MAT_VAL_TYPE *cu_ValX, MAT_VAL_TYPE *cu_ValY, int rowA, int colA, MAT_PTR_TYPE nnzA,
                       long long int data_origin1, long long int data_origin2, double *cu_time, double *cu_gflops, double *cu_bandwidth1, double *cu_bandwidth2, double *cu_pre)
 {
    struct timeval t1, t2;
    MAT_VAL_TYPE *dA_val, *dX, *dY;
    int *dA_cid;
    MAT_PTR_TYPE *dA_rpt;
    MAT_VAL_TYPE alpha = 1.0, beta = 0.0;
    // copy the CSR matrix and x to the device
    cudaMalloc((void **)&dA_val, sizeof(MAT_VAL_TYPE) * nnzA);
    cudaMalloc((void **)&dA_cid, sizeof(int) * nnzA);
    cudaMalloc((void **)&dA_rpt, sizeof(MAT_PTR_TYPE) * (rowA + 1));
    cudaMalloc((void **)&dX, sizeof(MAT_VAL_TYPE) * colA);
    cudaMalloc((void **)&dY, sizeof(MAT_VAL_TYPE) * rowA);
    cudaMemcpy(dA_val, cu_ValA, sizeof(MAT_VAL_TYPE) * nnzA, cudaMemcpyHostToDevice);
    cudaMemcpy(dA_cid, cu_ColIdxA, sizeof(int) * nnzA, cudaMemcpyHostToDevice);
    cudaMemcpy(dA_rpt, cu_RowPtrA, sizeof(MAT_PTR_TYPE) * (rowA + 1), cudaMemcpyHostToDevice);
    cudaMemcpy(dX, cu_ValX, sizeof(MAT_VAL_TYPE) * colA, cudaMemcpyHostToDevice);
    // cudaMemset takes an int byte value; all-zero bytes are 0.0 for doubles
    cudaMemset(dY, 0, sizeof(MAT_VAL_TYPE) * rowA);
    cusparseHandle_t     handle = NULL;
    cusparseSpMatDescr_t matA;
    cusparseDnVecDescr_t vecX, vecY;
    void*                dBuffer = NULL;
    size_t               bufferSize = 0;
    // preprocessing: everything cuSPARSE needs before the first SpMV
    gettimeofday(&t1, NULL);
    cusparseCreate(&handle);
    cusparseCreateCsr(&matA, rowA, colA, nnzA, dA_rpt, dA_cid, dA_val,
                        CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                        CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
    cusparseCreateDnVec(&vecX, colA, dX, CUDA_R_64F);
    cusparseCreateDnVec(&vecY, rowA, dY, CUDA_R_64F);
    cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                            &alpha, matA, vecX, &beta, vecY, CUDA_R_64F,
                            CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize);
    cudaMalloc(&dBuffer, bufferSize);
    // cudaDeviceSynchronize();
    gettimeofday(&t2, NULL);
    double cusparse_pre = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0;
    // printf("cusparse preprocessing time: %8.4lf ms\n", cusparse_pre);
    *cu_pre = cusparse_pre;
    // warm-up runs, not timed
    for (int i = 0; i < 100; ++i)
    {
        cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                    &alpha, matA, vecX, &beta, vecY, CUDA_R_64F,
                    CUSPARSE_SPMV_ALG_DEFAULT, dBuffer);
    }
    cudaDeviceSynchronize();
    // timed runs; the synchronize calls bracket the launches so the wall clock
    // covers the device work
    gettimeofday(&t1, NULL);
    for (int i = 0; i < 1000; ++i)
    {
        cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                    &alpha, matA, vecX, &beta, vecY, CUDA_R_64F,
                    CUSPARSE_SPMV_ALG_DEFAULT, dBuffer);
    }
    cudaDeviceSynchronize();
    gettimeofday(&t2, NULL);
    // total milliseconds divided by the 1000 iterations
    *cu_time = ((t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0) / 1000;
    *cu_gflops = (double)((long)nnzA * 2) / (*cu_time * 1e6);
    *cu_bandwidth1 = (double)data_origin1 / (*cu_time * 1e6); 
    *cu_bandwidth2 = (double)data_origin2 / (*cu_time * 1e6); 
    printf("cusparse:%8.4lf ms, %8.4lf Gflop/s, %9.4lf GB/s, %9.4lf GB/s\n", *cu_time, *cu_gflops, *cu_bandwidth1, *cu_bandwidth2);
    cusparseDestroySpMat(matA);
    cusparseDestroyDnVec(vecX);
    cusparseDestroyDnVec(vecY);
    cusparseDestroy(handle);
    cudaMemcpy(cu_ValY, dY, sizeof(MAT_VAL_TYPE) * rowA, cudaMemcpyDeviceToHost);
    // release the SpMV work buffer as well; it was previously leaked
    cudaFree(dBuffer);
    cudaFree(dA_val);
    cudaFree(dA_cid);
    cudaFree(dA_rpt);
    cudaFree(dX);
    cudaFree(dY);
 }
 // Double-precision benchmark driver.
 // Reads the Matrix Market file named by argv[1], runs the cuSPARSE reference
 // SpMV and the DASP SpMV on it, and appends one line with the cuSPARSE figures
 // to data/spmv_f64_record.csv.
 __host__
 int main(int argc, char **argv)
 {
    if (argc < 2)
    {
        printf("Run the code by './spmv_double matrix.mtx'. \n");
        return 0;
    }
    // struct timeval t1, t2;
    int rowA, colA;
    MAT_PTR_TYPE nnzA;
    int isSymmetricA;
    MAT_VAL_TYPE *csrValA;
    int *csrColIdxA;
    MAT_PTR_TYPE *csrRowPtrA;
    char *filename;
    filename = argv[1];
    // int NUM = atoi(argv[2]);
    // int block_longest = atoi(argv[3]);
    int NUM = 4;
    int block_longest = 256;
    double threshold = 0.75;
    printf("\n===%s===\n\n", filename);
    mmio_allinone(&rowA, &colA, &nnzA, &isSymmetricA, &csrRowPtrA, &csrColIdxA, &csrValA, filename);
    MAT_VAL_TYPE *X_val = (MAT_VAL_TYPE *)malloc(sizeof(MAT_VAL_TYPE) * colA);
    // both x and the matrix values are overwritten by initVec
    initVec(X_val, colA);
    initVec(csrValA, nnzA);
    printf("INIT DONE\n");
    MAT_VAL_TYPE *dY_val = (MAT_VAL_TYPE *)malloc(sizeof(MAT_VAL_TYPE) * rowA);
    MAT_VAL_TYPE *Y_val = (MAT_VAL_TYPE *)malloc(sizeof(MAT_VAL_TYPE) * rowA);
    int *new_order = (int *)malloc(sizeof(int) * rowA);
    // int warmup = 3, pre_num = 10;
    double cu_time = 0, cu_gflops = 0, cu_bandwidth1 = 0, cu_bandwidth2 = 0, cu_pre = 0;
    // Byte counts used for the bandwidth figures.  The element counts are summed
    // in long long so that nnzA + colA + rowA cannot overflow int on large inputs.
    long long int data_origin1 = ((long long)nnzA + colA + rowA) * sizeof(MAT_VAL_TYPE) + (long long)nnzA * sizeof(int) + ((long long)rowA + 1) * sizeof(MAT_PTR_TYPE);
    long long int data_origin2 = ((long long)nnzA + nnzA + rowA) * sizeof(MAT_VAL_TYPE) + (long long)nnzA * sizeof(int) + ((long long)rowA + 1) * sizeof(MAT_PTR_TYPE);
    cusparse_spmv_all(csrValA, csrRowPtrA, csrColIdxA, X_val, dY_val, rowA, colA, nnzA, data_origin1, data_origin2, &cu_time, &cu_gflops, &cu_bandwidth1, &cu_bandwidth2, &cu_pre);
    // double dasp_pre = 0;
    spmv_all(filename, csrValA, csrRowPtrA, csrColIdxA, X_val, Y_val, new_order, rowA, colA, nnzA, NUM, threshold, block_longest);
    FILE* fout;
    fout = fopen("data/spmv_f64_record.csv", "a");
    // fopen fails e.g. when the program is not started next to the data/ directory;
    // skip the record instead of passing NULL to fprintf
    if (fout != NULL)
    {
        fprintf(fout, "%lld,%lf,%lf,%lf,%lf\n", data_origin1, cu_time, cu_gflops, cu_bandwidth1, cu_bandwidth2);
        fclose(fout);
    }
    else
    {
        printf("Could not open data/spmv_f64_record.csv, record not written.\n");
    }
    /* verify the result with cusparse */
    // int result = verify_new(dY_val, Y_val, new_order, rowA);
    free(X_val);
    free(Y_val);
    free(dY_val);
    free(csrColIdxA);
    free(csrRowPtrA);
    free(csrValA);
    free(new_order);
    return 0;
 }
--- a/DASP/src/mmio.h
+++ b/DASP/src/mmio.h
--- a/DASP/src/mmio_highlevel.h
+++ b/DASP/src/mmio_highlevel.h
@@ -0,0 +1,778 @@
 #ifndef _MMIO_HIGHLEVEL_
 #define _MMIO_HIGHLEVEL_
 #include "mmio.h"
 #include "common.h"
 // In-place exclusive prefix sum over the first `length` entries of `input`:
 // afterwards input[i] holds the sum of the original input[0..i-1] and
 // input[0] is 0. Arrays of length 0 or 1 are left untouched.
 void exclusive_scan(MAT_PTR_TYPE *input, int length)
 {
    if (length == 0 || length == 1)
        return;
    // Sum of every original entry that precedes the current position.
    MAT_PTR_TYPE running = input[0];
    input[0] = 0;
    for (int pos = 1; pos < length; ++pos)
    {
        const MAT_PTR_TYPE current = input[pos];
        input[pos] = running;
        running += current;
    }
 }
 // Scan a Matrix Market coordinate file and report its dimensions without
 // building the matrix.
 //
 // Outputs (written only when 0 is returned):
 //   *m, *n       - row and column counts taken from the size line
 //   *nnz         - entry count after symmetric expansion: each off-diagonal
 //                  entry of a symmetric/hermitian file is counted twice
 //   *isSymmetric - 1 if the banner declares symmetric or hermitian, else 0
 //
 // Returns 0 on success, otherwise
 //   -1  the file could not be opened
 //   -2  the banner could not be parsed
 //   -4  the size line could not be parsed or reports a negative dimension
 //   -5  an entry line is missing/malformed, or the value type is not one of
 //       real, complex, integer, pattern
 //   -6  an entry index lies outside the declared dimensions
 int mmio_info(int *m, int *n, int *nnz, int *isSymmetric, char *filename)
 {
    int m_tmp, n_tmp, nnz_tmp;
    int ret_code;
    MM_typecode matcode;
    FILE *f;
    int nnz_mtx_report;
    int isInteger = 0, isReal = 0, isPattern = 0, isSymmetric_tmp = 0, isComplex = 0;
    // load matrix
    if ((f = fopen(filename, "r")) == NULL)
        return -1;
    if (mm_read_banner(f, &matcode) != 0)
    {
        printf("Could not process Matrix Market banner.\n");
        fclose(f); // every error path below closes the handle as well
        return -2;
    }
    if ( mm_is_pattern( matcode ) )  { isPattern = 1; }
    if ( mm_is_real ( matcode) )     { isReal = 1; }
    if ( mm_is_complex( matcode ) )  { isComplex = 1; }
    if ( mm_is_integer ( matcode ) ) { isInteger = 1; }
    /* find out size of sparse matrix .... */
    ret_code = mm_read_mtx_crd_size(f, &m_tmp, &n_tmp, &nnz_mtx_report);
    if (ret_code != 0 || m_tmp < 0 || n_tmp < 0)
    {
        fclose(f);
        return -4;
    }
    if ( mm_is_symmetric( matcode ) || mm_is_hermitian( matcode ) )
        isSymmetric_tmp = 1;
    /* NOTE: when reading in doubles, ANSI C requires the use of the "l"  */
    /*   specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
    /*  (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15)            */
    // Only the entry count is needed here, so entries are tallied while they
    // are read instead of being buffered in temporary arrays.
    int status = 0;
    nnz_tmp = 0;
    for (int i = 0; i < nnz_mtx_report; i++)
    {
        int idxi, idxj;
        double fval, fval_im;
        int ival;
        int expected, got;
        if (isReal)
        {
            expected = 3;
            got = fscanf(f, "%d %d %lg\n", &idxi, &idxj, &fval);
        }
        else if (isComplex)
        {
            expected = 4;
            got = fscanf(f, "%d %d %lg %lg\n", &idxi, &idxj, &fval, &fval_im);
        }
        else if (isInteger)
        {
            expected = 3;
            got = fscanf(f, "%d %d %d\n", &idxi, &idxj, &ival);
        }
        else if (isPattern)
        {
            expected = 2;
            got = fscanf(f, "%d %d\n", &idxi, &idxj);
        }
        else
        {
            // Unknown value type: there is no format to parse the line with.
            status = -5;
            break;
        }
        if (got != expected)
        {
            // Truncated file or malformed line; the indices would be garbage.
            status = -5;
            break;
        }
        // adjust from 1-based to 0-based
        idxi--;
        idxj--;
        if (idxi < 0 || idxi >= m_tmp || idxj < 0 || idxj >= n_tmp)
        {
            status = -6;
            break;
        }
        // A symmetric entry is mirrored to (idxj, idxi), so the swapped
        // position has to be inside the matrix as well.
        if (isSymmetric_tmp && (idxj >= m_tmp || idxi >= n_tmp))
        {
            status = -6;
            break;
        }
        nnz_tmp++;
        if (isSymmetric_tmp && idxi != idxj)
            nnz_tmp++;
    }
    fclose(f);
    if (status != 0)
        return status;
    *m = m_tmp;
    *n = n_tmp;
    *nnz = nnz_tmp;
    *isSymmetric = isSymmetric_tmp;
    return 0;
 }
 // Read a Matrix Market coordinate file into caller-provided CSR arrays.
 //
 // csrRowPtr receives rows+1 offsets; csrColIdx and csrVal receive one slot per
 // entry after symmetric expansion (off-diagonal entries of a
 // symmetric/hermitian file are stored twice, once mirrored). The caller must
 // size the arrays accordingly, e.g. from the values reported by mmio_info for
 // the same file. Within a row, entries appear in the order they are
 // encountered in the file; column indices are not sorted. Pattern files get
 // the value 1.0 for every entry and the imaginary part of complex entries is
 // discarded. The caller's arrays are not touched when an error is returned.
 //
 // Returns 0 on success, otherwise
 //   -1  the file could not be opened
 //   -2  the banner could not be parsed
 //   -3  a temporary buffer could not be allocated
 //   -4  the size line could not be parsed or reports a negative dimension
 //   -5  an entry line is missing/malformed, or the value type is not one of
 //       real, complex, integer, pattern
 //   -6  an entry index lies outside the declared dimensions
 int mmio_data(int *csrRowPtr, int *csrColIdx, MAT_VAL_TYPE *csrVal, char *filename)
 {
    int m_tmp, n_tmp;
    int ret_code;
    MM_typecode matcode;
    FILE *f;
    int nnz_mtx_report;
    int isInteger = 0, isReal = 0, isPattern = 0, isSymmetric_tmp = 0, isComplex = 0;
    // load matrix
    if ((f = fopen(filename, "r")) == NULL)
        return -1;
    if (mm_read_banner(f, &matcode) != 0)
    {
        printf("Could not process Matrix Market banner.\n");
        fclose(f); // every error path below closes the handle as well
        return -2;
    }
    if ( mm_is_pattern( matcode ) )  { isPattern = 1; }
    if ( mm_is_real ( matcode) )     { isReal = 1; }
    if ( mm_is_complex( matcode ) )  { isComplex = 1; }
    if ( mm_is_integer ( matcode ) ) { isInteger = 1; }
    /* find out size of sparse matrix .... */
    ret_code = mm_read_mtx_crd_size(f, &m_tmp, &n_tmp, &nnz_mtx_report);
    if (ret_code != 0 || m_tmp < 0 || n_tmp < 0)
    {
        fclose(f);
        return -4;
    }
    if ( mm_is_symmetric( matcode ) || mm_is_hermitian( matcode ) )
        isSymmetric_tmp = 1;
    int *csrRowPtr_counter = (int *)malloc((m_tmp+1) * sizeof(int));
    int *csrRowIdx_tmp = (int *)malloc(nnz_mtx_report * sizeof(int));
    int *csrColIdx_tmp = (int *)malloc(nnz_mtx_report * sizeof(int));
    MAT_VAL_TYPE *csrVal_tmp    = (MAT_VAL_TYPE *)malloc(nnz_mtx_report * sizeof(MAT_VAL_TYPE));
    int status = 0;
    // malloc(0) may legitimately return NULL, so the per-entry buffers only
    // count as failed when entries were actually announced.
    if (csrRowPtr_counter == NULL
        || (nnz_mtx_report > 0 && (csrRowIdx_tmp == NULL || csrColIdx_tmp == NULL || csrVal_tmp == NULL)))
        status = -3;
    else
        memset(csrRowPtr_counter, 0, (m_tmp+1) * sizeof(int));
    /* NOTE: when reading in doubles, ANSI C requires the use of the "l"  */
    /*   specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
    /*  (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15)            */
    for (int i = 0; status == 0 && i < nnz_mtx_report; i++)
    {
        int idxi, idxj;
        double fval = 0.0, fval_im;
        int ival;
        int expected, got;
        if (isReal)
        {
            expected = 3;
            got = fscanf(f, "%d %d %lg\n", &idxi, &idxj, &fval);
        }
        else if (isComplex)
        {
            expected = 4;
            got = fscanf(f, "%d %d %lg %lg\n", &idxi, &idxj, &fval, &fval_im);
        }
        else if (isInteger)
        {
            expected = 3;
            got = fscanf(f, "%d %d %d\n", &idxi, &idxj, &ival);
            if (got == expected)
                fval = ival;
        }
        else if (isPattern)
        {
            expected = 2;
            got = fscanf(f, "%d %d\n", &idxi, &idxj);
            fval = 1.0;
        }
        else
        {
            // Unknown value type: there is no format to parse the line with.
            status = -5;
            break;
        }
        if (got != expected)
        {
            // Truncated file or malformed line; the indices would be garbage.
            status = -5;
            break;
        }
        // adjust from 1-based to 0-based
        idxi--;
        idxj--;
        if (idxi < 0 || idxi >= m_tmp || idxj < 0 || idxj >= n_tmp)
        {
            status = -6;
            break;
        }
        // A symmetric entry is mirrored to (idxj, idxi), so the swapped
        // position has to be inside the matrix as well.
        if (isSymmetric_tmp && (idxj >= m_tmp || idxi >= n_tmp))
        {
            status = -6;
            break;
        }
        csrRowPtr_counter[idxi]++;
        if (isSymmetric_tmp && idxi != idxj)
            csrRowPtr_counter[idxj]++; // row of the mirrored entry
        csrRowIdx_tmp[i] = idxi;
        csrColIdx_tmp[i] = idxj;
        csrVal_tmp[i] = fval;
    }
    fclose(f);
    if (status != 0)
    {
        free(csrColIdx_tmp);
        free(csrVal_tmp);
        free(csrRowIdx_tmp);
        free(csrRowPtr_counter);
        return status;
    }
    // exclusive scan for csrRowPtr_counter: per-row counts -> row offsets
    int old_val, new_val;
    old_val = csrRowPtr_counter[0];
    csrRowPtr_counter[0] = 0;
    for (int i = 1; i <= m_tmp; i++)
    {
        new_val = csrRowPtr_counter[i];
        csrRowPtr_counter[i] = old_val + csrRowPtr_counter[i-1];
        old_val = new_val;
    }
    memcpy(csrRowPtr, csrRowPtr_counter, (m_tmp+1) * sizeof(int));
    // From here on the counter tracks how many slots of each row are filled.
    memset(csrRowPtr_counter, 0, (m_tmp+1) * sizeof(int));
    for (int i = 0; i < nnz_mtx_report; i++)
    {
        const int row = csrRowIdx_tmp[i];
        const int col = csrColIdx_tmp[i];
        int offset = csrRowPtr[row] + csrRowPtr_counter[row];
        csrColIdx[offset] = col;
        csrVal[offset] = csrVal_tmp[i];
        csrRowPtr_counter[row]++;
        if (isSymmetric_tmp && row != col)
        {
            // Mirror the off-diagonal entry into row `col`.
            offset = csrRowPtr[col] + csrRowPtr_counter[col];
            csrColIdx[offset] = row;
            csrVal[offset] = csrVal_tmp[i];
            csrRowPtr_counter[col]++;
        }
    }
    // free tmp space
    free(csrColIdx_tmp);
    free(csrVal_tmp);
    free(csrRowIdx_tmp);
    free(csrRowPtr_counter);
    return 0;
 }
 // read matrix infomation from mtx file
 int mmio_allinone(int *m, int *n, MAT_PTR_TYPE *nnz, int *isSymmetric, 
                  MAT_PTR_TYPE **csrRowPtr, int **csrColIdx, MAT_VAL_TYPE **csrVal, 
                  char *filename)
 {
    int m_tmp, n_tmp;
    MAT_PTR_TYPE nnz_tmp;
    int ret_code;
    MM_typecode matcode;
    FILE *f;
    MAT_PTR_TYPE nnz_mtx_report;
    int isInteger = 0, isReal = 0, isPattern = 0, isSymmetric_tmp = 0, isComplex = 0;
    // load matrix
    if ((f = fopen(filename, "r")) == NULL)
        return -1;
    if (mm_read_banner(f, &matcode) != 0)
    {
        printf("Could not process Matrix Market banner.\n");
        return -2;
    }
    if ( mm_is_pattern( matcode ) )  { isPattern = 1; /*printf("type = Pattern\n");*/ }
    if ( mm_is_real ( matcode) )     { isReal = 1; /*printf("type = real\n");*/ }
    if ( mm_is_complex( matcode ) )  { isComplex = 1; /*printf("type = real\n");*/ }
    if ( mm_is_integer ( matcode ) ) { isInteger = 1; /*printf("type = integer\n");*/ }
    /* find out size of sparse matrix .... */
    ret_code = mm_read_mtx_crd_size(f, &m_tmp, &n_tmp, &nnz_mtx_report);
    if (ret_code != 0)
        return -4;
    if ( mm_is_symmetric( matcode ) || mm_is_hermitian( matcode ) )
    {
        isSymmetric_tmp = 1;
        //printf("input matrix is symmetric = true\n");
    }
    else
    {
        //printf("input matrix is symmetric = false\n");
    }
    MAT_PTR_TYPE *csrRowPtr_counter = (MAT_PTR_TYPE *)malloc((m_tmp+1) * sizeof(MAT_PTR_TYPE));
    memset(csrRowPtr_counter, 0, (m_tmp+1) * sizeof(MAT_PTR_TYPE));
    int *csrRowIdx_tmp = (int *)malloc(nnz_mtx_report * sizeof(int));
    int *csrColIdx_tmp = (int *)malloc(nnz_mtx_report * sizeof(int));
    MAT_VAL_TYPE *csrVal_tmp    = (MAT_VAL_TYPE *)malloc(nnz_mtx_report * sizeof(MAT_VAL_TYPE));
    /* NOTE: when reading in doubles, ANSI C requires the use of the "l"  */
    /*   specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
    /*  (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15)            */
    for (MAT_PTR_TYPE i = 0; i < nnz_mtx_report; i++)
    {
        int idxi, idxj;
        double fval, fval_im;
        int ival;
        int returnvalue;
        if (isReal)
        {
            returnvalue = fscanf(f, "%d %d %lg\n", &idxi, &idxj, &fval);
        }
        else if (isComplex)
        {
            returnvalue = fscanf(f, "%d %d %lg %lg\n", &idxi, &idxj, &fval, &fval_im);
        }
        else if (isInteger)
        {
            returnvalue = fscanf(f, "%d %d %d\n", &idxi, &idxj, &ival);
            fval = ival;
        }
        else if (isPattern)
        {
            returnvalue = fscanf(f, "%d %d\n", &idxi, &idxj);
            fval = 1.0;
        }
        // adjust from 1-based to 0-based
        idxi--;
        idxj--;
        csrRowPtr_counter[idxi]++;
        csrRowIdx_tmp[i] = idxi;
        csrColIdx_tmp[i] = idxj;
        csrVal_tmp[i] = fval;
    }
    if (f != stdin)
        fclose(f);
    if (isSymmetric_tmp)
    {
        for (MAT_PTR_TYPE i = 0; i < nnz_mtx_report; i++)
        {
            if (csrRowIdx_tmp[i] != csrColIdx_tmp[i])
                csrRowPtr_counter[csrColIdx_tmp[i]]++;
        }
    }
    // exclusive scan for csrRowPtr_counter
    exclusive_scan(csrRowPtr_counter, m_tmp+1);
    MAT_PTR_TYPE *csrRowPtr_alias = (MAT_PTR_TYPE *)malloc((m_tmp+1) * sizeof(MAT_PTR_TYPE));
    nnz_tmp = csrRowPtr_counter[m_tmp];
    int *csrColIdx_alias = (int *)malloc(nnz_tmp * sizeof(int));
    MAT_VAL_TYPE *csrVal_alias    = (MAT_VAL_TYPE *)malloc(nnz_tmp * sizeof(MAT_VAL_TYPE));
    memcpy(csrRowPtr_alias, csrRowPtr_counter, (m_tmp+1) * sizeof(MAT_PTR_TYPE));
    memset(csrRowPtr_counter, 0, (m_tmp+1) * sizeof(MAT_PTR_TYPE));
    if (isSymmetric_tmp)
    {
        for (MAT_PTR_TYPE i = 0; i < nnz_mtx_report; i++)
        {
            if (csrRowIdx_tmp[i] != csrColIdx_tmp[i])
            {
                MAT_PTR_TYPE offset = csrRowPtr_alias[csrRowIdx_tmp[i]] + csrRowPtr_counter[csrRowIdx_tmp[i]];
                csrColIdx_alias[offset] = csrColIdx_tmp[i];
                csrVal_alias[offset] = csrVal_tmp[i];
                csrRowPtr_counter[csrRowIdx_tmp[i]]++;
                offset = csrRowPtr_alias[csrColIdx_tmp[i]] + csrRowPtr_counter[csrColIdx_tmp[i]];
                csrColIdx_alias[offset] = csrRowIdx_tmp[i];
                csrVal_alias[offset] = csrVal_tmp[i];
                csrRowPtr_counter[csrColIdx_tmp[i]]++;
            }
            else
            {
                MAT_PTR_TYPE offset = csrRowPtr_alias[csrRowIdx_tmp[i]] + csrRowPtr_counter[csrRowIdx_tmp[i]];
                csrColIdx_alias[offset] = csrColIdx_tmp[i];
                csrVal_alias[offset] = csrVal_tmp[i];
                csrRowPtr_counter[csrRowIdx_tmp[i]]++;
            }
        }
    }
    else
    {
        for (MAT_PTR_TYPE i = 0; i < nnz_mtx_report; i++)
        {            
            MAT_PTR_TYPE offset = csrRowPtr_alias[csrRowIdx_tmp[i]] + csrRowPtr_counter[csrRowIdx_tmp[i]];
            csrColIdx_alias[offset] = csrColIdx_tmp[i];
            csrVal_alias[offset] = csrVal_tmp[i];
            csrRowPtr_counter[csrRowIdx_tmp[i]]++;
        }
    }
    *m = m_tmp;
    *n = n_tmp;
    *nnz = nnz_tmp;
    *isSymmetric = isSymmetric_tmp;
    *csrRowPtr = csrRowPtr_alias;
    *csrColIdx = csrColIdx_alias;
    *csrVal = csrVal_alias;
    // free tmp space
    free(csrColIdx_tmp);
    free(csrVal_tmp);
    free(csrRowIdx_tmp);
    free(csrRowPtr_counter);
    return 0;
 }
 #endif
--- a/DASP/src/utils.h
+++ b/DASP/src/utils.h
@@ -0,0 +1,212 @@
 #include <algorithm>
 #include "common.h"
 // Binary search for `target` in arr[0..len-1], which must be sorted in
 // ascending order. Returns the index of a matching element, or -1 when the
 // value is absent (including when len <= 0).
 int BinarySearch(int *arr, int len, int target) {
 	int low = 0;
 	// Last valid index. Starting at `len` let the probe read arr[len], one
 	// element past the end, whenever target exceeded every element.
 	int high = len - 1;
 	while (low <= high) {
 		// Midpoint computed without low + high, which can overflow int.
 		int mid = low + (high - low) / 2;
 		if (target < arr[mid]) high = mid - 1;
 		else if (target > arr[mid]) low = mid + 1;
 		else return mid;
 	}
 	return -1;
 }
 // Exchange the two ints that `a` and `b` point to.
 void swap_key(int *a, int *b)
 {
    const int held = *b;
    *b = *a;
    *a = held;
 }
 // Partition step of quick_sort_key. Operates on the `length` elements that
 // start at key[pivot_index]; the first of them supplies the pivot value.
 // Elements smaller than the pivot are moved in front of it. Returns the
 // pivot's final index within `key`.
 int partition_key(int *key, int length, int pivot_index)
 {
    const int pivot_value = key[pivot_index];
    const int last = pivot_index + (length - 1);
    int boundary = pivot_index; // next slot for an element smaller than the pivot
    // Park the pivot at the end of the range while the range is scanned.
    swap_key(&key[pivot_index], &key[last]);
    for (int cursor = pivot_index; cursor < pivot_index + length; ++cursor)
    {
        if (!(key[cursor] < pivot_value))
            continue;
        swap_key(&key[cursor], &key[boundary]);
        ++boundary;
    }
    // Drop the pivot between the smaller and the not-smaller elements.
    swap_key(&key[last], &key[boundary]);
    return boundary;
 }
 // Partition step of quick_sort_key_idx. Same scheme as partition_key, but
 // every swap applied to `key` is applied to the companion array `len` too, so
 // key[i] and len[i] stay paired. Operates on the `length` elements starting
 // at pivot_index and returns the pivot's final index.
 int partition_key_idx(int *key, int *len, int length, int pivot_index)
 {
    const int pivot_value = key[pivot_index];
    const int last = pivot_index + (length - 1);
    int boundary = pivot_index; // next slot for an element smaller than the pivot
    // Park the pivot pair at the end of the range while the range is scanned.
    swap_key(&key[pivot_index], &key[last]);
    swap_key(&len[pivot_index], &len[last]);
    for (int cursor = pivot_index; cursor < pivot_index + length; ++cursor)
    {
        if (!(key[cursor] < pivot_value))
            continue;
        swap_key(&key[cursor], &key[boundary]);
        swap_key(&len[cursor], &len[boundary]);
        ++boundary;
    }
    // Drop the pivot pair between the smaller and the not-smaller elements.
    swap_key(&key[last], &key[boundary]);
    swap_key(&len[last], &len[boundary]);
    return boundary;
 }
 // Sort key[0..length-1] in ascending order, in place.
 //
 // Delegates to std::sort. The previous hand-written quicksort always picked
 // the first element as pivot, which costs O(n^2) time and O(n) recursion
 // depth on already sorted input. For plain ints the sorted result is the
 // same either way.
 void quick_sort_key(int *key, int length)
 {
    if (length < 2)
        return;
    std::sort(key, key + length);
 }
 // Quicksort `key` into ascending order while applying the same permutation
 // to the companion array `len`; both hold `length` elements.
 void quick_sort_key_idx(int *key, int *len, int length)
 {
    // The lower part is sorted recursively; the upper part is handled by the
    // next loop iteration on the advanced pointers.
    while (length != 0 && length != 1)
    {
        const int split = partition_key_idx(key, len, length, 0);
        quick_sort_key_idx(key, len, split);
        const int consumed = split + 1; // lower part plus the placed pivot
        key += consumed;
        len += consumed;
        length -= consumed;
    }
 }
 // Set the first `length` entries of `vec` to 1.
 void initVec(MAT_VAL_TYPE *vec, int length)
 {
    int filled = 0;
    while (filled < length)
    {
        // Alternative kept for reference: vec[i] = rand() % 20 * 0.1;
        vec[filled] = 1;
        ++filled;
    }
 }
 #ifdef f64
 // Device-side wrapper that issues one PTX
 // "mma.sync.aligned.m8n8k4.row.col.f64.f64.f64.f64" instruction.
 //
 //   acc    - two values, acc[0] and acc[1], bound as read-write operands
 //            (%0, %1). They appear both as the first and as the last operand
 //            list of the instruction, so the result is accumulated in place.
 //   frag_a - this thread's input for the third operand list (%2), read only
 //   frag_b - this thread's input for the fourth operand list (%3), read only
 //
 // NOTE(review): how matrix elements map onto each thread's acc/frag_a/frag_b
 // is defined by the PTX ISA for this shape, not by this file - confirm
 // against the PTX documentation and the callers.
 __device__ __forceinline__ void mma_m8n8k4(MAT_VAL_TYPE *acc, MAT_VAL_TYPE &frag_a, MAT_VAL_TYPE &frag_b)
 {
    // "+d" marks a 64-bit floating-point register that is read and written;
    // "d" marks a 64-bit floating-point input register.
    asm volatile(
        "mma.sync.aligned.m8n8k4.row.col.f64.f64.f64.f64"
        " { %0, %1 }, "
        " { %2 }, "
        " { %3 }, "
        " { %0, %1 };"
        : "+d"(acc[0]), "+d"(acc[1]):
        "d"(frag_a), "d"(frag_b)
    );
 }
 #endif
 // Return the largest of arr[0..len-1]. arr[0] is read unconditionally, so the
 // array must hold at least one element.
 int get_max(int *arr, int len)
 {
    int best = arr[0];
    int pos = 1;
    while (pos < len)
    {
        best = (best < arr[pos]) ? arr[pos] : best;
        ++pos;
    }
    return best;
 }
 // One stable counting-sort pass over the decimal digit selected by `exp`
 // (1 = ones, 10 = tens, ...). Elements are reordered so that larger digits
 // come first; `idx` is permuted together with `arr`. Values are expected to
 // be non-negative, because the digit is used directly as a bucket index.
 void count_sort(int *arr, int *idx, int len, int exp)
 {
    int *sorted_keys = (int *)malloc(sizeof(int) * len);
    int *sorted_ids = (int *)malloc(sizeof(int) * len);
    int tally[10] = {0};
    // Histogram of the selected digit.
    for (int k = 0; k < len; ++k)
    {
        const int digit = (arr[k] / exp) % 10;
        ++tally[digit];
    }
    // Inclusive prefix sum: tally[d] = number of elements with digit <= d.
    for (int d = 1; d < 10; ++d)
    {
        tally[d] += tally[d - 1];
    }
    // Scatter front to back. len - tally[digit] is the first free slot of the
    // digit's block when blocks are laid out from digit 9 down to digit 0.
    for (int k = 0; k < len; ++k)
    {
        const int digit = (arr[k] / exp) % 10;
        const int slot = len - tally[digit];
        sorted_keys[slot] = arr[k];
        sorted_ids[slot] = idx[k];
        --tally[digit];
    }
    for (int k = 0; k < len; ++k)
    {
        arr[k] = sorted_keys[k];
        idx[k] = sorted_ids[k];
    }
    free(sorted_keys);
    free(sorted_ids);
 }
 // One stable counting-sort pass over the decimal digit selected by `exp`
 // (1 = ones, 10 = tens, ...). Elements are reordered so that smaller digits
 // come first; `idx` is permuted together with `arr`. Values are expected to
 // be non-negative, because the digit is used directly as a bucket index.
 void count_sort_asce(int *arr, int *idx, int len, int exp)
 {
    if (len <= 0)
        return;
    int *temp_arr = (int *)malloc(sizeof(int) * len);
    int *temp_idx = (int *)malloc(sizeof(int) * len);
    int buckets[10] = {0};
    // Histogram of the selected digit.
    for (int i = 0; i < len; i ++)
    {
        buckets[(arr[i] / exp) % 10] ++;
    }
    // Inclusive prefix sum: buckets[d] = number of elements with digit <= d.
    for (int i = 1; i < 10; i ++)
    {
        buckets[i] += buckets[i - 1];
    }
    // Scatter back to front so equal digits keep their relative order. The
    // index must count DOWN: the previous `i ++` ran past the end of the
    // arrays and never terminated normally.
    for (int i = len - 1; i >= 0; i --)
    {
        int offset = buckets[(arr[i] / exp) % 10] - 1;
        temp_arr[offset] = arr[i];
        temp_idx[offset] = idx[i];
        buckets[(arr[i] / exp) % 10] --;
    }
    for (int i = 0; i < len; i ++)
    {
        arr[i] = temp_arr[i];
        idx[i] = temp_idx[i];
    }
    free(temp_arr);
    free(temp_idx);
 }
 // Least-significant-digit radix sort (base 10) built on count_sort: after the
 // call `arr` is in descending order and `idx` has been permuted the same way.
 // Values are expected to be non-negative. Empty input is a no-op.
 void radix_sort(int *arr, int *idx, int len)
 {
    if (len <= 0)
        return; // get_max reads arr[0], which does not exist for empty input
    int max = get_max(arr, len);
    for (int exp = 1; max / exp > 0; exp *= 10)
    {
        count_sort(arr, idx, len, exp);
        // The most significant digit of max has just been processed. Stop
        // before `exp *= 10`, which overflows int once max >= 1e9.
        if (max / exp < 10)
            break;
    }
 }
 // Least-significant-digit radix sort (base 10) built on count_sort_asce,
 // intended to leave `arr` in ascending order with `idx` permuted the same
 // way. Values are expected to be non-negative. Empty input is a no-op.
 void radix_sort_asce(int *arr, int *idx, int len)
 {
    if (len <= 0)
        return; // get_max reads arr[0], which does not exist for empty input
    int max = get_max(arr, len);
    for (int exp = 1; max / exp > 0; exp *= 10)
    {
        count_sort_asce(arr, idx, len, exp);
        // The most significant digit of max has just been processed. Stop
        // before `exp *= 10`, which overflows int once max >= 1e9.
        if (max / exp < 10)
            break;
    }
 }
--- a/DASP/test/cop20k_A.mtx
+++ b/DASP/test/cop20k_A.mtx
--- a/DASP/test/run_double.sh
+++ b/DASP/test/run_double.sh
@@ -0,0 +1,3 @@
 #!/bin/bash
 cd ..
 ./spmv_double  test/cop20k_A.mtx
--- a/DASP/test/run_half.sh
+++ b/DASP/test/run_half.sh
@@ -0,0 +1,3 @@
 #!/bin/bash
 cd ..
 ./spmv_half  test/cop20k_A.mtx
--- a/45
+++ b/45
@@ -0,0 +1,45 @@
 Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
 All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 Copyright (c) 2013-2019, hpcg-benchmark
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
 * Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.
 * Neither the name of hpcg nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/54
+++ b/54
@@ -0,0 +1,54 @@
 # -*- Makefile -*-
 # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # "arch" defaults to UNKNOWN and is meant to be overridden on the command
 # line; it selects which setup/Make.<arch> file is included below.
 arch = UNKNOWN
 setup_file = setup/Make.$(arch)
 include $(setup_file)
 # Name of the executable to link. It becomes bin/xhpcg-cpu when USE_CUDA is 0
 # and USE_GRACE is not 0; otherwise it stays bin/xhpcg.
 bin_name='bin/xhpcg'
 ifeq ($(USE_CUDA), 0)
     ifneq ($(USE_GRACE), 0)
          bin_name='bin/xhpcg-cpu'
     endif
 endif
 # Object files that are linked, together with src/main.o, into $(bin_name).
 HPCG_DEPS = src/CG.o src/CG_ref.o src/TestCG.o src/ComputeResidual.o \
         src/ExchangeHalo.o src/GenerateGeometry.o src/GenerateProblem.o \
         src/GenerateProblem_ref.o src/CheckProblem.o \
 	 src/OptimizeProblem.o src/ReadHpcgDat.o src/ReportResults.o \
 	 src/SetupHalo.o src/SetupHalo_ref.o src/TestSymmetry.o src/TestNorms.o src/WriteProblem.o \
         src/YAML_Doc.o src/YAML_Element.o src/ComputeDotProduct.o \
         src/ComputeDotProduct_ref.o src/finalize.o src/init.o src/mytimer.o src/ComputeSPMV.o \
         src/ComputeSPMV_ref.o src/ComputeSYMGS.o src/ComputeSYMGS_ref.o src/ComputeWAXPBY.o src/ComputeWAXPBY_ref.o \
         src/ComputeMG_ref.o src/ComputeMG.o src/ComputeProlongation_ref.o src/ComputeRestriction_ref.o src/GenerateCoarseProblem.o \
 	 src/ComputeOptimalShapeXYZ.o src/MixedBaseCounter.o src/CheckAspectRatio.o src/OutputFile.o \
     src/ComputeProlongation.o src/ComputeRestriction.o
 # Default target: link the benchmark executable from main.o and HPCG_DEPS.
 $(bin_name): src/main.o $(HPCG_DEPS)
 	$(LINKER) $(LINKFLAGS) src/main.o $(HPCG_DEPS) -o $(bin_name) $(HPCG_LIBS)
 # Copy the executables produced under build/bin into bin/.
 install:
 	cp build/bin/xhpcg* bin/
 # Remove the object files and the linked executable.
 clean:
 	rm -f $(HPCG_DEPS) $(bin_name) src/main.o
 # install creates no file named "install", so it is declared phony as well;
 # otherwise a stray file or directory of that name would disable the target.
 .PHONY: install clean
--- a/Makefile.ext
+++ b/Makefile.ext
@@ -0,0 +1,221 @@
 # -*- Makefile -*-
 # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # "arch" defaults to UNKNOWN and selects which setup/Make.<arch> file is
 # included below.
 arch = UNKNOWN
 setup_file = setup/Make.$(arch)
 include $(setup_file)
 # Name of the executable to link. It becomes bin/xhpcg-cpu when USE_CUDA is 0
 # and USE_GRACE is not 0; otherwise it stays bin/xhpcg.
 bin_name='bin/xhpcg'
 ifeq ($(USE_CUDA), 0)
     ifneq ($(USE_GRACE), 0)
          bin_name='bin/xhpcg-cpu'
     endif
 endif
 # Object files that are linked, together with src/main.o, into $(bin_name).
 # Each one has an explicit compile rule further down in this file.
 HPCG_DEPS = src/CG.o \
 	    src/CG_ref.o \
 	    src/TestCG.o \
 	    src/ComputeResidual.o \
 	    src/ExchangeHalo.o \
 	    src/GenerateGeometry.o \
 	    src/GenerateProblem.o \
 	    src/GenerateProblem_ref.o \
 	    src/CheckProblem.o \
 	    src/MixedBaseCounter.o \
 	    src/OptimizeProblem.o \
 	    src/ReadHpcgDat.o \
 	    src/ReportResults.o \
 	    src/SetupHalo.o \
 	    src/SetupHalo_ref.o \
 	    src/TestSymmetry.o \
 	    src/TestNorms.o \
 	    src/WriteProblem.o \
 	    src/YAML_Doc.o \
 	    src/YAML_Element.o \
 	    src/ComputeDotProduct.o \
 	    src/ComputeDotProduct_ref.o \
 	    src/mytimer.o \
 	    src/ComputeOptimalShapeXYZ.o \
 	    src/ComputeSPMV.o \
 	    src/ComputeSPMV_ref.o \
 	    src/ComputeSYMGS.o \
 	    src/ComputeSYMGS_ref.o \
 	    src/ComputeWAXPBY.o \
 	    src/ComputeWAXPBY_ref.o \
 	    src/ComputeMG_ref.o \
 	    src/ComputeMG.o \
 	    src/ComputeProlongation_ref.o \
 	    src/ComputeRestriction_ref.o \
 		src/ComputeProlongation.o \
 	    src/ComputeRestriction.o \
 	    src/CheckAspectRatio.o \
 	    src/OutputFile.o \
 	    src/GenerateCoarseProblem.o \
 	    src/init.o \
 	    src/finalize.o \
 	    src/CudaKernels.o \
 		src/CpuKernels.o
 # These header files are included by many source files, so every object that
 # lists $(PRIMARY_HEADERS) as a prerequisite is recompiled whenever one of
 # these headers is modified.
 PRIMARY_HEADERS = HPCG_SRC_PATH/src/Geometry.hpp HPCG_SRC_PATH/src/SparseMatrix.hpp HPCG_SRC_PATH/src/Vector.hpp HPCG_SRC_PATH/src/CGData.hpp \
                  HPCG_SRC_PATH/src/MGData.hpp HPCG_SRC_PATH/src/hpcg.hpp
 # Default target: build the benchmark executable.
 all: $(bin_name)
 # Link the executable from main.o and HPCG_DEPS.
 $(bin_name): src/main.o $(HPCG_DEPS)
 	$(LINKER) $(LINKFLAGS) src/main.o $(HPCG_DEPS) $(HPCG_LIBS) -o $(bin_name)
 # Copy the linked executable into ../bin/.
 install:
 	cp $(bin_name) ../bin/
 # Remove all object files under src/ and the linked executable.
 clean:
 	rm -f src/*.o $(bin_name)
 # install creates no file named "install", so it is declared phony as well;
 # otherwise a stray file or directory of that name would disable the target.
 .PHONY: all install clean
 # Explicit compile rules, one per object file.
 # Every C++ object is built from HPCG_SRC_PATH/src/<name>.cpp and depends on
 # that source, on its own header where one is listed, and on
 # $(PRIMARY_HEADERS). In each recipe "$<" is the first prerequisite (the
 # source file) and "$@" is the object being built.
 # NOTE(review): HPCG_SRC_PATH is written literally, not as $(HPCG_SRC_PATH);
 # presumably it is a placeholder substituted before this file is used -
 # confirm against the configure step.
 src/main.o: HPCG_SRC_PATH/src/main.cpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/CG.o: HPCG_SRC_PATH/src/CG.cpp HPCG_SRC_PATH/src/CG.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/CG_ref.o: HPCG_SRC_PATH/src/CG_ref.cpp HPCG_SRC_PATH/src/CG_ref.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/TestCG.o: HPCG_SRC_PATH/src/TestCG.cpp HPCG_SRC_PATH/src/TestCG.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/ComputeResidual.o: HPCG_SRC_PATH/src/ComputeResidual.cpp HPCG_SRC_PATH/src/ComputeResidual.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/ExchangeHalo.o: HPCG_SRC_PATH/src/ExchangeHalo.cpp HPCG_SRC_PATH/src/ExchangeHalo.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/GenerateGeometry.o: HPCG_SRC_PATH/src/GenerateGeometry.cpp HPCG_SRC_PATH/src/GenerateGeometry.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/GenerateProblem.o: HPCG_SRC_PATH/src/GenerateProblem.cpp HPCG_SRC_PATH/src/GenerateProblem.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/GenerateProblem_ref.o: HPCG_SRC_PATH/src/GenerateProblem_ref.cpp HPCG_SRC_PATH/src/GenerateProblem_ref.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/CheckProblem.o: HPCG_SRC_PATH/src/CheckProblem.cpp HPCG_SRC_PATH/src/CheckProblem.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/MixedBaseCounter.o: HPCG_SRC_PATH/src/MixedBaseCounter.cpp HPCG_SRC_PATH/src/MixedBaseCounter.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/OptimizeProblem.o: HPCG_SRC_PATH/src/OptimizeProblem.cpp HPCG_SRC_PATH/src/OptimizeProblem.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/ReadHpcgDat.o: HPCG_SRC_PATH/src/ReadHpcgDat.cpp HPCG_SRC_PATH/src/ReadHpcgDat.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/ReportResults.o: HPCG_SRC_PATH/src/ReportResults.cpp HPCG_SRC_PATH/src/ReportResults.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/SetupHalo.o: HPCG_SRC_PATH/src/SetupHalo.cpp HPCG_SRC_PATH/src/SetupHalo.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/SetupHalo_ref.o: HPCG_SRC_PATH/src/SetupHalo_ref.cpp HPCG_SRC_PATH/src/SetupHalo_ref.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/TestSymmetry.o: HPCG_SRC_PATH/src/TestSymmetry.cpp HPCG_SRC_PATH/src/TestSymmetry.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/TestNorms.o: HPCG_SRC_PATH/src/TestNorms.cpp HPCG_SRC_PATH/src/TestNorms.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/WriteProblem.o: HPCG_SRC_PATH/src/WriteProblem.cpp HPCG_SRC_PATH/src/WriteProblem.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/YAML_Doc.o: HPCG_SRC_PATH/src/YAML_Doc.cpp HPCG_SRC_PATH/src/YAML_Doc.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/YAML_Element.o: HPCG_SRC_PATH/src/YAML_Element.cpp HPCG_SRC_PATH/src/YAML_Element.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/ComputeDotProduct.o: HPCG_SRC_PATH/src/ComputeDotProduct.cpp HPCG_SRC_PATH/src/ComputeDotProduct.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/ComputeDotProduct_ref.o: HPCG_SRC_PATH/src/ComputeDotProduct_ref.cpp HPCG_SRC_PATH/src/ComputeDotProduct_ref.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/finalize.o: HPCG_SRC_PATH/src/finalize.cpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/init.o: HPCG_SRC_PATH/src/init.cpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/mytimer.o: HPCG_SRC_PATH/src/mytimer.cpp HPCG_SRC_PATH/src/mytimer.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/ComputeOptimalShapeXYZ.o: HPCG_SRC_PATH/src/ComputeOptimalShapeXYZ.cpp HPCG_SRC_PATH/src/ComputeOptimalShapeXYZ.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/ComputeSPMV.o: HPCG_SRC_PATH/src/ComputeSPMV.cpp HPCG_SRC_PATH/src/ComputeSPMV.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/ComputeSPMV_ref.o: HPCG_SRC_PATH/src/ComputeSPMV_ref.cpp HPCG_SRC_PATH/src/ComputeSPMV_ref.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/ComputeSYMGS.o: HPCG_SRC_PATH/src/ComputeSYMGS.cpp HPCG_SRC_PATH/src/ComputeSYMGS.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/ComputeSYMGS_ref.o: HPCG_SRC_PATH/src/ComputeSYMGS_ref.cpp HPCG_SRC_PATH/src/ComputeSYMGS_ref.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/ComputeWAXPBY.o: HPCG_SRC_PATH/src/ComputeWAXPBY.cpp HPCG_SRC_PATH/src/ComputeWAXPBY.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/ComputeWAXPBY_ref.o: HPCG_SRC_PATH/src/ComputeWAXPBY_ref.cpp HPCG_SRC_PATH/src/ComputeWAXPBY_ref.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/ComputeMG_ref.o: HPCG_SRC_PATH/src/ComputeMG_ref.cpp HPCG_SRC_PATH/src/ComputeMG_ref.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/ComputeMG.o: HPCG_SRC_PATH/src/ComputeMG.cpp HPCG_SRC_PATH/src/ComputeMG.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/ComputeProlongation_ref.o: HPCG_SRC_PATH/src/ComputeProlongation_ref.cpp HPCG_SRC_PATH/src/ComputeProlongation_ref.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/ComputeRestriction_ref.o: HPCG_SRC_PATH/src/ComputeRestriction_ref.cpp HPCG_SRC_PATH/src/ComputeRestriction_ref.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/ComputeProlongation.o: HPCG_SRC_PATH/src/ComputeProlongation.cpp HPCG_SRC_PATH/src/ComputeProlongation.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/ComputeRestriction.o: HPCG_SRC_PATH/src/ComputeRestriction.cpp HPCG_SRC_PATH/src/ComputeRestriction.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/GenerateCoarseProblem.o: HPCG_SRC_PATH/src/GenerateCoarseProblem.cpp HPCG_SRC_PATH/src/GenerateCoarseProblem.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/CheckAspectRatio.o: HPCG_SRC_PATH/src/CheckAspectRatio.cpp HPCG_SRC_PATH/src/CheckAspectRatio.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/OutputFile.o: HPCG_SRC_PATH/src/OutputFile.cpp HPCG_SRC_PATH/src/OutputFile.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 src/CpuKernels.o: HPCG_SRC_PATH/src/CpuKernels.cpp HPCG_SRC_PATH/src/CpuKernels.hpp $(PRIMARY_HEADERS)
 	$(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@
 # The CUDA object is compiled by nvcc with its own flag set rather than with
 # $(CXX) $(CXXFLAGS). It lists only the .cu file as a prerequisite, so a
 # change to one of the PRIMARY_HEADERS does not trigger a rebuild of it.
 src/CudaKernels.o: HPCG_SRC_PATH/src/CudaKernels.cu
 	nvcc -c -O3 $(CUDA_ARCH) $(HPCG_DEFS) -IHPCG_SRC_PATH/src -I$(CUDA_HOME)/include -I$(CUBLASROOT)/include -I$(MPdir)/include $< -o $@
--- a/README.md
+++ b/README.md
@@ -0,0 +1,90 @@
 # NVIDIA High Performance Conjugate Gradient Benchmark (HPCG)
 NVIDIA HPCG is based on the [HPCG](https://github.com/hpcg-benchmark/hpcg) benchmark and optimized for performance on NVIDIA accelerated HPC systems.
 NVIDIA's HPCG benchmark accelerates the High Performance Conjugate Gradients (HPCG) Benchmark. HPCG is a software package that performs a fixed number of multigrid preconditioned (using a symmetric Gauss-Seidel smoother) conjugate gradient (PCG) iterations using double precision (64 bit) floating point values.
 ## Main Features
 * The NVIDIA HPCG benchmark exploits NVIDIA high-performance math libraries: [cuSPARSE](https://docs.nvidia.com/cuda/cusparse/) and [NVPL Sparse](https://docs.nvidia.com/nvpl/_static/sparse/index.html) to achieve the highest possible performance for Sparse Matrix-vector multiplication (SpMV) and Sparse Matrix triangular solvers (SpSV) on NVIDIA GPUs and Grace CPUs.
 * The NVIDIA HPCG benchmark supports highly configurable command line parameters to decide:
    * Problem sizes for the GPU and Grace CPU
    * 3D rank grid shape
    * Execution modes: CPU-only, GPU-only and heterogeneous
    * Point-to-point communication: MPI_Host (Send/Recv), MPI_Host_Alltoallv, MPI_CUDA_Aware, MPI_CUDA_Aware_Alltoallv, and NCCL
    * NUMA-related configurations
    See `bin/RUNNING-x86` and `bin/RUNNING-aarch64` for detailed description.
 * The supported sparse storage format in the NVIDIA HPCG benchmark is the standard [sliced-ELLPACK format (SELL)](https://docs.nvidia.com/cuda/cusparse/#sliced-ellpack-sell).
 ## Supported Platforms
 * The NVIDIA HPCG benchmark supports GPU-only execution on x86 and NVIDIA Grace CPU systems with NVIDIA Ampere GPU architecture (sm80) and NVIDIA Hopper GPU architecture (sm90), CPU only execution for NVIDIA Grace CPUs, and heterogeneous GPU-Grace execution for NVIDIA Grace Hopper superchips.
 * NVIDIA HPCG only supports Linux operating systems.
 ## Prerequisite
 * Git
 * MPI, OpenMPI 4.1+ and MPICH 4.0+
 * CUDA Toolkit 12.3+, for NVIDIA GPU execution 
 * cuSPARSE 12.3+, for NVIDIA GPU execution
 * cuBLAS 12.2+, for NVIDIA GPU execution
 * GCC 13.0+, for NVIDIA Grace CPU execution
 * NVPL 24.03+, for NVIDIA Grace CPU execution
 * NCCL 2.19+, optional for inter-process communication
 ## Compile and build
 ### Cloning the repo
 SSH
 ```
 git clone ssh://github.com/NVIDIA/nvidia-hpcg
 ```
 HTTPS
 ```
 git clone https://github.com/NVIDIA/nvidia-hpcg
 ```
 GitHub CLI
 ```
 gh repo clone NVIDIA/nvidia-hpcg
 ```
 ### Compile the NVIDIA HPCG benchmark
 The `build_sample.sh` script can be used to compile and build the NVIDIA HPCG benchmark. The paths to MPI, CUDA toolkit, CUDA Mathlibs, NCCL, and NVPL Sparse must be exported into `MPI_PATH`, `CUDA_PATH`, `MATHLIBS_PATH`, `NCCL_PATH`, and `NVPL_SPARSE_PATH` before running the `make` command. 
 The following options can be used to decide the target platform:
 * `USE_CUDA`, set to 1 to build for NVIDIA GPUs and 0 otherwise.
 * `USE_GRACE`, set to 1 to build for NVIDIA Grace CPUs and 0 otherwise. When set to 0, the code builds for x86 platforms.
 * `USE_NCCL`, set to 1 to build for NCCL and 0 otherwise.
 The `USE_CUDA` and `USE_GRACE` options are used to create binaries that support one of three execution modes as follows:
 * For GPU-only, set `USE_CUDA` to 1. When `USE_GRACE=1`, build for `aarch64`. When `USE_GRACE=0`, build for `x86`.
 * For Grace-only, set `USE_CUDA` to 0 and `USE_GRACE` to 1.
 * For GPU-Grace, set `USE_CUDA` to 1 and `USE_GRACE` to 1.
 The `build_sample.sh` script uses `setup/Make.CUDA_AARCH64` and `setup/Make.CUDA_X86` to compose the include and link lines for the `make` command. These two files define compile-time options that are used in the source code. These options are explained in the two `setup/Make.CUDA_*` files. The build script creates the `build` directory and stores the NVIDIA HPCG binary in the `build/bin` and `bin` directories (the binary is copied from `build/bin` to `bin`). The build script can create one of the following binaries:
 * xhpcg, when `USE_CUDA=1`.
 * xhpcg-cpu, when `USE_CUDA=0` and `USE_GRACE=1`.
 ## Running the NVIDIA HPCG benchmark
 The NVIDIA HPCG benchmark uses the same input format as the standard HPCG benchmark; alternatively, the user can pass the benchmark parameters through command line options. Please see the HPCG benchmark for getting started with the HPCG software concepts and best practices. The `bin` directory has scripts to run the NVIDIA HPCG benchmark along with descriptions and samples. Files `bin/RUNNING-x86` and `bin/RUNNING-aarch64` explain, in detail, how to run the NVIDIA HPCG benchmark on `x86` and `aarch64` platforms, respectively. The `run_sample.sh` script provides four examples to run on `x86` and Grace Hopper x4 platforms.
 ### Heterogeneous (GPU-GRACE) execution mode in-depth
 The NVIDIA HPCG benchmark can run efficiently on heterogeneous systems comprising GPUs and Grace CPUs like GRACE HOPPER. The approach involves assigning an MPI rank to each GPU and one or more MPI ranks to the Grace CPU. Given that the GPU performs significantly faster than the Grace CPU, the strategy is to allocate a larger local problem size to the GPU compared to the Grace CPU. This ensures that during MPI blocking communication steps like `MPI_Allreduce`, the GPU's execution is not interrupted by the Grace CPU's slower execution.
 In the NVIDIA HPCG benchmark, the GPU and Grace local problems are configured to differ in only one dimension while keeping the other dimensions identical. This design enables proper halo exchange operations across the dimensions that remain identical between the GPU and Grace ranks. The image below depicts an example of this design. The GPU and Grace ranks have the same x and y dimensions, where the halo exchange takes place. The z dimension is different which enables assigning different local problems for the GPU and Grace ranks. The NVIDIA HPCG benchmark has the flexibility to choose the 3D shape of ranks,  choose the different dimension, and configure the sizes of GPU and Grace ranks. Refer to `bin/RUNNING-aarch64` for more details.
 <img src="images/hpcg-gpu-grace-example.png" alt="drawing" width="150"/>
 ### Interpreting the results
 By default, the NVIDIA HPCG benchmark outputs the logs to the standard output (`stdout`). To print into log files, set knob `--of` to 1.
 Even though a run can be valid, there are performance flags to observe in performance logs (line numbers are considered when output to log files):
 * In the iterations summary section (line 68), the number of optimized CG iterations per set (line 72) should be as close as possible to the reference value (i.e., 50 iterations). The user can try different parameters, such as the local problem size and the 3D grid shape, to achieve a low iteration count.
 * In the GFLOP/s summary (line 100), the value of `Total with convergence and optimization phase overhead` at line 107 should be as close as possible to `Raw Total`. Otherwise, make sure the number of optimized CG iterations per set, the setup time (line 20), and the optimization time (line 82) are reasonable compared to the total execution time. This is important when scaling on multi-node.
 * When scaling on multi-node platforms, most of the DDOT computation time is the time of the `MPI_Allreduce`. High `MPI_Allreduce` time indicates scaling bottlenecks due to a small local problem size or a problem in configurations or platforms.
 ## Support
 For questions or to provide feedback, please contact [HPCBenchmarks@nvidia.com](mailto:HPCBenchmarks@nvidia.com)
 ## License
 The license can be found in the [LICENSE](LICENSE) file.
--- a/build_sample.sh
+++ b/build_sample.sh
@@ -0,0 +1,104 @@
 #! /usr/bin/env bash
 # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Builds the NVIDIA HPCG benchmark in ./build and installs the binary with "make install".
 #
 # Optional positional arguments read by this script:
 #   $2  forwarded to make as HPCG_COMMIT_HASH
 #   $3  forwarded to make as HPCG_VER_MAJOR
 #   $4  forwarded to make as HPCG_VER_MINOR
 #   $5  "0" sets USE_CUDA=0   (default 1)
 #   $6  "0" sets USE_GRACE=0  (default 1)
 #   $7  "0" sets USE_NCCL=0   (default 1)
 #   $8  "1" sets BUILD_B100=1 (default 0)
 #
 # MPI_PATH, MATHLIBS_PATH, NCCL_PATH, CUDA_PATH and NVPL_SPARSE_PATH are honored when
 # already exported; otherwise the placeholder values below must be edited.
 export CXX_PATH=/usr
 export PATH=${CXX_PATH}/bin:${PATH}
 if [[ -z "${MPI_PATH}" ]]; then
    export MPI_PATH=/path/to/mpi #Change this to correct MPI path
 fi
 # Guard on MATHLIBS_PATH itself so that an exported value is never replaced by the placeholder
 if [[ -z "${MATHLIBS_PATH}" ]]; then
    export MATHLIBS_PATH=/path/to/mathlibs #Change this to correct CUDA mathlibs
 fi
 if [[ -z "${NCCL_PATH}" ]]; then
    export NCCL_PATH=/path/to/nccl #Change to correct NCCL path
 fi
 if [[ -z "${CUDA_PATH}" ]]; then
    export CUDA_PATH=/path/to/cuda #Change this to correct CUDA path
 fi
 if [[ -z "${NVPL_SPARSE_PATH}" ]]; then
    export NVPL_SPARSE_PATH=/path/to/nvpllibs #Change this to correct NVPL mathlibs
 fi
 export PATH=${CUDA_PATH}/bin:${PATH}
 export LD_LIBRARY_PATH=${CUDA_PATH}/lib64:${LD_LIBRARY_PATH}
 export LD_LIBRARY_PATH=${NVPL_SPARSE_PATH}/lib:${LD_LIBRARY_PATH}
 #xhpcg binary will be located in build/bin
 mkdir -p build
 # Stop here if the build directory is unusable; otherwise ../configure and make
 # would run relative to the wrong directory
 cd build || exit 1
 ######## USE Nvidia GPU? ############
 # 1:         Yes
 # 0:         No
 export USE_CUDA=1
 if [[ $5 == "0" ]]; then
    export USE_CUDA=0
 fi
 ################################################
 ######## USE Grace CPU? ############
 # 1:         Yes
 # 0:         No
 export USE_GRACE=1
 if [[ $6 == "0" ]]; then
    export USE_GRACE=0
 fi
 ################################################
 ######## USE NCCL? ############
 # 1:         Yes
 # 0:         No
 export USE_NCCL=1
 if [[ $7 == "0" ]]; then
    export USE_NCCL=0
 fi
 ################################################
 # USE_GRACE selects the architecture setup file: setup/Make.CUDA_AARCH64 or setup/Make.CUDA_X86
 if [[ $USE_GRACE == 1 ]]; then
    ../configure CUDA_AARCH64
 else
    ../configure CUDA_X86
 fi
 export build_B100=0
 if [[ $8 == "1" ]]; then
    export build_B100=1
 fi
 # NOTE(review): is_ENG_VERSION is never assigned in this script, so HPCG_ENG_VERSION is
 # empty unless the variable comes from the caller's environment. $1 is the only
 # positional argument not consumed here -- confirm whether it was meant to supply it.
 make -j 16 \
    USE_CUDA=${USE_CUDA} \
    USE_GRACE=${USE_GRACE} \
    USE_NCCL=${USE_NCCL} \
    MPdir=${MPI_PATH} \
    MPlib=${MPI_PATH}/lib \
    Mathdir=${MATHLIBS_PATH} \
    NCCLdir=${NCCL_PATH} \
    CUDA_HOME=${CUDA_PATH} \
    NVPL_PATH=${NVPL_SPARSE_PATH} \
    HPCG_ENG_VERSION=${is_ENG_VERSION} \
    HPCG_COMMIT_HASH=$2 \
    HPCG_VER_MAJOR=$3 \
    HPCG_VER_MINOR=$4 \
    BUILD_B100=${build_B100}
 #Move build/bin/xhpcg to bin/xhpcg
 make install
--- a/50
+++ b/50
@@ -0,0 +1,50 @@
 #! /bin/sh
 # Prepares the current directory as a build tree for one architecture.
 # Usage: <path-to-source>/configure <arch>
 # <arch> selects the setup file <source>/setup/Make.<arch>. The script writes
 # ./Makefile from <source>/Makefile.ext, creates the src, testing, bin and setup
 # directories, and copies bin/hpcg.dat and the setup file into the build tree.
 # Exits with status 127 when the argument is missing or the setup file does not exist.
 #
 # All path expansions are quoted so that source or build paths containing
 # whitespace are handled correctly.
 # The source tree is the invocation path of this script with the trailing "/configure" removed
 src_path=$(echo "$0" | sed -e 's:/configure$::')
 bld_path=$(pwd)
 #FIXME: need to check whether src and bld are the same (test f1 -ef f2)
 # Exactly one non-empty argument (the architecture name) is required
 if [ "$#" -ne 1 ] || [ -z "$1" ] ; then
  echo
  echo Please specify '"'arch'"' argument, for example:
  echo
  echo "$0" Unix
  echo
  exit 127
 fi
 arg_arch="$1"
 setup_file="${src_path}/setup/Make.${arg_arch}"
 if [ ! -f "$setup_file" ] ; then
  echo
  echo Please create the configuration file "$setup_file"
  echo
  exit 127
 fi
 mkfile="${bld_path}/Makefile"
 # Remove any previous Makefile (file or directory) before regenerating it
 if [ -d "$mkfile" ] || [ -f "$mkfile" ] ; then
  rm -rf "$mkfile"
 fi
 # Fill in the placeholders of the Makefile template: build path and source path
 # everywhere, and the first occurrence of the architecture placeholder on each line
 sed -e "s:HPCG_ROOT_PATH:${bld_path}:g" \
     -e "s:HPCG_SRC_PATH:${src_path}:g" \
     -e "s:UNKNOWN:${arg_arch}:" "${src_path}/Makefile.ext" > "$mkfile"
 # creating missing directories
 for path in src testing bin setup
 do
  if [ ! -d "$path" ] ; then
    mkdir "$path"
  fi
 done
 # copy hpcg.dat if it doesn't exist
 if [ ! -f bin/hpcg.dat ] ; then
  cp "${src_path}/bin/hpcg.dat" bin/hpcg.dat
 fi
 # copy the architecture setup file
 cp -f "$setup_file" setup
--- a/images/hpcg-gpu-grace-example.png
+++ b/images/hpcg-gpu-grace-example.png
--- a/run_sample.sh
+++ b/run_sample.sh
@@ -0,0 +1,99 @@
 #!/bin/bash
 # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Sample launch commands for the NVIDIA HPCG benchmark: one run on an x86 Hopper GPU
 # system followed by three runs (GPU-only, Grace-only, heterogeneous) on Grace Hopper x4.
 # MPI_PATH, MATHLIBS_PATH, NCCL_PATH, CUDA_PATH and NVPL_SPARSE are honored when already
 # exported; otherwise the placeholder values below must be edited.
 export CXX_PATH=/usr
 export PATH=${CXX_PATH}/bin:${PATH}
 if [[ -z "${MPI_PATH}" ]]; then
    export MPI_PATH=/path/to/mpi #Change this to correct MPI path
 fi
 # Guard on MATHLIBS_PATH itself so that an exported value is never replaced by the placeholder
 if [[ -z "${MATHLIBS_PATH}" ]]; then
    export MATHLIBS_PATH=/path/to/mathlibs #Change this to correct CUDA mathlibs
 fi
 if [[ -z "${NCCL_PATH}" ]]; then
    export NCCL_PATH=/path/to/nccl #Change to correct NCCL path
 fi
 if [[ -z "${CUDA_PATH}" ]]; then
    export CUDA_PATH=/path/to/cuda #Change this to correct CUDA path
 fi
 if [[ -z "${NVPL_SPARSE}" ]]; then
    # Fall back to NVPL_SPARSE_PATH, the variable name used by build_sample.sh, before using the placeholder
    export NVPL_SPARSE=${NVPL_SPARSE_PATH:-/path/to/nvpllibs} #Change this to correct NVPL mathlibs
 fi
 #Please fix, if needed
 export CUDA_BLAS_VERSION=${CUDA_BUILD_VERSION:-12.2}
 export LD_LIBRARY_PATH=${MATHLIBS_PATH}/${CUDA_BLAS_VERSION}/lib64/:${LD_LIBRARY_PATH}
 export PATH=${CUDA_PATH}/bin:${PATH}
 export LD_LIBRARY_PATH=${CUDA_PATH}/lib64:${LD_LIBRARY_PATH}
 export LD_LIBRARY_PATH=${NCCL_PATH}/lib:${LD_LIBRARY_PATH}
 export LD_LIBRARY_PATH=${NVPL_SPARSE}/lib:${LD_LIBRARY_PATH}
 #Extra mpirun options shared by all sample runs
 ext="--mca pml ^ucx --mca btl ^openib,smcuda -mca coll_hcoll_enable 0 -x coll_hcoll_np=0 --bind-to none"
 #Directory to xhpcg binary
 dir="bin/"
 #Sample on a Hopper GPU x86
 ###########################
 #Local problem size
 nx=512 #Large problem size x
 ny=512 #Large problem size y
 nz=288 #Large problem size z
 mpirun --oversubscribe ${ext} -np 1 ${dir}/hpcg.sh  --exec-name ${dir}/xhpcg \
 --nx $nx --ny $ny --nz $nz --rt 10 --b 0
 ########################################################################################
 #Sample on Grace Hopper x4
 ###########################
 #Local problem size
 nx=256 #Large problem size x, assumed for the GPU
 ny=1024 #Large problem size y, assumed for the GPU
 nz=288 #Large problem size z, assumed for the GPU
 #1 GPUOnly
 #---------#
 np=4  #Total number of ranks
 mpirun --oversubscribe ${ext} -np $np ${dir}/hpcg-aarch64.sh  --exec-name ${dir}/xhpcg \
 --nx $nx --ny $ny --nz $nz --rt 10 --b 0 --exm 0 --p2p 0 \
 --mem-affinity 0:1:2:3 --cpu-affinity 0-71:72-143:144-215:216-287
 #2 GraceOnly
 #-----------#
 np=4  #Total number of ranks
 mpirun --oversubscribe ${ext} -np $np ${dir}/hpcg-aarch64.sh  --exec-name ${dir}/xhpcg-cpu \
 --nx $nx --ny $ny --nz $nz --rt 10 --b 0 --exm 0 --p2p 0 \
 --mem-affinity 0:1:2:3 --cpu-affinity 0-71:72-143:144-215:216-287
 #3 Heterogeneous (GPU + Grace)
 #-----------------------------#
 np=8  #Total number of ranks (4GPU + 4Grace)
 exm=2 #Execution mode GPU+Grace
 diff_dim=2 #different dim between GPU and Grace is Y
 lpm=1 #Local problem mode (nx/ny/nz are local to GPU, g2c is the Grace different dimension)
 g2c=64 #Based on diff_dim=2 and lpm=1 --> Grace rank local problem size is $nx x $g2c x $nz
 #3D grid size 4x2x1 (must be equal to np)
 npx=4 #number of ranks in the x direction
 npy=2 #number of ranks in the y direction
 npz=1 #number of ranks in the z direction
 mpirun --oversubscribe ${ext} -np $np ${dir}/hpcg-aarch64.sh  --exec-name ${dir}/xhpcg \
 --nx $nx --ny $ny --nz $nz --rt 10 --b 0 --p2p 0 --exm $exm --lpm $lpm --g2c $g2c --ddm $diff_dim --npx $npx --npy $npy --npz $npz \
 --mem-affinity 0:0:1:1:2:2:3:3 --cpu-affinity 0-7:8-71:72-79:80-143:144-151:152-215:216-223:224-287
--- a/setup/Make.CUDA_AARCH64
+++ b/setup/Make.CUDA_AARCH64
@@ -0,0 +1,202 @@
 # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #HEADER
 #  -- High Performance Conjugate Gradient Benchmark (HPCG)
 #     HPCG - 3.1 - March 28, 2019
 #     Michael A. Heroux
 #     Scalable Algorithms Group, Computing Research Division
 #     Sandia National Laboratories, Albuquerque, NM
 #
 #     Piotr Luszczek
 #     Jack Dongarra
 #     University of Tennessee, Knoxville
 #     Innovative Computing Laboratory
 #
 #     (C) Copyright 2013-2019 All Rights Reserved
 #
 #
 #  -- Copyright notice and Licensing terms:
 #
 #  Redistribution  and  use in  source and binary forms, with or without
 #  modification, are  permitted provided  that the following  conditions
 #  are met:
 #
 #  1. Redistributions  of  source  code  must retain the above copyright
 #  notice, this list of conditions and the following disclaimer.
 #
 #  2. Redistributions in binary form must reproduce  the above copyright
 #  notice, this list of conditions,  and the following disclaimer in the
 #  documentation and/or other materials provided with the distribution.
 #
 #  3. All  advertising  materials  mentioning  features  or  use of this
 #  software must display the following acknowledgement:
 #  This  product  includes  software  developed  at Sandia National
 #  Laboratories, Albuquerque, NM and the  University  of
 #  Tennessee, Knoxville, Innovative Computing Laboratory.
 #
 #  4. The name of the  University,  the name of the  Laboratory,  or the
 #  names  of  its  contributors  may  not  be used to endorse or promote
 #  products  derived   from   this  software  without  specific  written
 #  permission.
 #
 #  -- Disclaimer:
 #
 #  THIS  SOFTWARE  IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 #  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,  INCLUDING,  BUT NOT
 #  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 #  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
 #  OR  CONTRIBUTORS  BE  LIABLE FOR ANY  DIRECT,  INDIRECT,  INCIDENTAL,
 #  SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES  (INCLUDING,  BUT NOT
 #  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 #  DATA OR PROFITS; OR BUSINESS INTERRUPTION)  HOWEVER CAUSED AND ON ANY
 #  THEORY OF LIABILITY, WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
 #  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 #  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 # ######################################################################
 #@HEADER
 # ----------------------------------------------------------------------
 # - shell --------------------------------------------------------------
 # ----------------------------------------------------------------------
 #
 SHELL        = /bin/sh
 #
 CD           = cd
 CP           = cp
 LN_S         = ln -s -f
 MKDIR        = mkdir -p
 RM           = /bin/rm -f
 TOUCH        = touch
 #
 # ----------------------------------------------------------------------
 # - HPCG Directory Structure / HPCG library ------------------------------
 # ----------------------------------------------------------------------
 #
 TOPdir       = .
 SRCdir       = $(TOPdir)/src
 INCdir       = $(TOPdir)/src
 BINdir       = $(TOPdir)/bin
 #
 # ----------------------------------------------------------------------
 # - Message Passing library (MPI) --------------------------------------
 # ----------------------------------------------------------------------
 # MPinc tells the  C  compiler where to find the Message Passing library
 # header files,  MPlib  is defined  to be the name of  the library to be
 # used. The variable MPdir is only used for defining MPinc and MPlib.
 #
 #MPdir        =
 #MPinc        =
 #MPlib        =
 #
 #
 # ----------------------------------------------------------------------
 # - HPCG includes / libraries / specifics -------------------------------
 # ----------------------------------------------------------------------
 #
 # NVPL Sparse header and library directories, both derived from NVPL_PATH.
 NVPL_SPARSE_INC=$(NVPL_PATH)/include
 NVPL_SPARSE_LIB=$(NVPL_PATH)/lib
 # Baseline include paths (source tree, per-arch directory, MPI headers) and the MPI link line.
 HPCG_INCLUDES = -I$(INCdir) -I$(INCdir)/$(arch) -I$(MPdir)/include  $(CRAY_CUDATOOLKIT_INCLUDE_OPTS)
 HPCG_LIBS     = -L${MPlib} -lmpi
 # USE_CUDA=1: add the CUDA toolkit and math-library headers, and link cuda, cusparse, cublas and cublasLt.
 ifeq ($(USE_CUDA), 1)
    HPCG_INCLUDES += -I$(CUDA_HOME)/include -I$(Mathdir)/include
    HPCG_LIBS += -L$(Mathdir)/lib  -lcuda -lcusparse -lcublas -lcublasLt  -L$(CUDA_HOME)/lib64
 endif
 # USE_GRACE=1: add the NVPL Sparse headers and link nvpl_sparse.
 ifeq ($(USE_GRACE), 1)
    HPCG_INCLUDES += -I$(NVPL_SPARSE_INC)
    HPCG_LIBS += -L$(NVPL_SPARSE_LIB) -lnvpl_sparse
 endif
 # USE_NCCL=1: add the NCCL headers and link nccl.
 ifeq ($(USE_NCCL), 1)
    HPCG_INCLUDES += -I$(NCCLdir)/include
    HPCG_LIBS += -L$(NCCLdir)/lib -lnccl
 endif
 #
 # - Compile time options -----------------------------------------------
 #
 # -DHPCG_NO_MPI	            Define to disable MPI
 # -DHPCG_NO_OPENMP	        Define to disable OPENMP
 # -DHPCG_CONTIGUOUS_ARRAYS  Define to have sparse matrix arrays long and contiguous
 # -DHPCG_DEBUG       	    Define to enable debugging output
 # -DHPCG_DETAILED_DEBUG     Define to enable very detailed debugging output
 # -DUSE_CUDA                Define to enable GPU execution
 # -DUSE_GRACE               Define to enable Grace CPU execution
 # -DUSE_NCCL                Define to enable NCCL P2P communication. Use --p2p=4 for NCCL
 # -DINDEX_64                Define to enable INT64 indexing (added when USE_INT64=1 is passed to make)
 # By default HPCG will:
 #    *) Build with MPI enabled.
 #    *) Build with OpenMP enabled.
 #    *) Not generate debugging output.
 #
 # Preprocessor definitions passed to every compile. The two defines below are always set;
 # the remaining ones are appended only when the matching make variable equals 1.
 HPCG_OPTS  = -DHPCG_CUBIC_RADICAL_SEARCH  -DHPCG_CONTIGUOUS_ARRAYS  #-DHPCG_DEBUG #-DHPCG_NO_MPI
 ifeq ($(USE_CUDA), 1)
    HPCG_OPTS  += -DUSE_CUDA
 endif
 ifeq ($(USE_GRACE), 1)
    HPCG_OPTS  += -DUSE_GRACE
 endif
 ifeq ($(USE_NCCL), 1)
    HPCG_OPTS  += -DUSE_NCCL
 endif
 ifeq ($(HPCG_ENG_VERSION), 1)
    HPCG_OPTS += -DHPCG_ENG_VERSION
 endif
 # Note the name difference: the make variable is USE_INT64, the macro it defines is INDEX_64.
 ifeq ($(USE_INT64), 1)
    HPCG_OPTS += -DINDEX_64
 endif
 #If not set, the default values in src/hpcg.hpp will be used
 # The three defines below are emitted unconditionally; when the make variable is empty
 # the macro is still defined, with an empty value.
 HPCG_OPTS += -Dmake_HPCG_VER_MAJOR=$(HPCG_VER_MAJOR)
 HPCG_OPTS += -Dmake_HPCG_VER_MINOR=$(HPCG_VER_MINOR)
 HPCG_OPTS += -DHPCG_COMMIT_HASH=$(HPCG_COMMIT_HASH)
 #
 # ----------------------------------------------------------------------
 #
 HPCG_DEFS     = $(HPCG_OPTS) $(HPCG_INCLUDES)
 #
 # ----------------------------------------------------------------------
 # - Compilers / linkers - Optimization flags ---------------------------
 # ----------------------------------------------------------------------
 #
 # GPU code generation targets: sm_80 and sm_90 when USE_CUDA=1; BUILD_B100=1 appends sm_100.
 ifeq ($(USE_CUDA), 1)
    CUDA_ARCH = -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90
 endif
 ifeq ($(BUILD_B100), 1)
 CUDA_ARCH += --generate-code arch=compute_100,code=sm_100
 endif
 # Host CPU passed to -mcpu/-mtune; "native" unless CPU_ARCH is already set.
 CPU_ARCH ?= native
 # nvcc is the compiler driver for all sources; host-compiler options are forwarded with -Xcompiler.
 CXX          = nvcc
 CXXFLAGS     = $(HPCG_DEFS) -O3 -Xcompiler --std=c++17 -Xcompiler -Ofast,-fopenmp,-mcpu=$(CPU_ARCH),-mtune=$(CPU_ARCH),-ftree-vectorize,-funroll-loops $(CUDA_ARCH)
 #
 # Linking uses the same driver and flags as compilation, plus the library list.
 LINKER       = $(CXX)
 LINKFLAGS    = $(CXXFLAGS) $(HPCG_LIBS)
 #
 ARCHIVER     = ar
 ARFLAGS      = r
 # RANLIB is set to echo, so the ranlib step only prints its arguments.
 RANLIB       = echo
 #
 # ----------------------------------------------------------------------
--- a/setup/Make.CUDA_X86
+++ b/setup/Make.CUDA_X86
@@ -0,0 +1,186 @@
 # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #HEADER
 #  -- High Performance Conjugate Gradient Benchmark (HPCG)
 #     HPCG - 3.1 - March 28, 2019
 #     Michael A. Heroux
 #     Scalable Algorithms Group, Computing Research Division
 #     Sandia National Laboratories, Albuquerque, NM
 #
 #     Piotr Luszczek
 #     Jack Dongarra
 #     University of Tennessee, Knoxville
 #     Innovative Computing Laboratory
 #
 #     (C) Copyright 2013-2019 All Rights Reserved
 #
 #
 #  -- Copyright notice and Licensing terms:
 #
 #  Redistribution  and  use in  source and binary forms, with or without
 #  modification, are  permitted provided  that the following  conditions
 #  are met:
 #
 #  1. Redistributions  of  source  code  must retain the above copyright
 #  notice, this list of conditions and the following disclaimer.
 #
 #  2. Redistributions in binary form must reproduce  the above copyright
 #  notice, this list of conditions,  and the following disclaimer in the
 #  documentation and/or other materials provided with the distribution.
 #
 #  3. All  advertising  materials  mentioning  features  or  use of this
 #  software must display the following acknowledgement:
 #  This  product  includes  software  developed  at Sandia National
 #  Laboratories, Albuquerque, NM and the  University  of
 #  Tennessee, Knoxville, Innovative Computing Laboratory.
 #
 #  4. The name of the  University,  the name of the  Laboratory,  or the
 #  names  of  its  contributors  may  not  be used to endorse or promote
 #  products  derived   from   this  software  without  specific  written
 #  permission.
 #
 #  -- Disclaimer:
 #
 #  THIS  SOFTWARE  IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 #  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,  INCLUDING,  BUT NOT
 #  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 #  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
 #  OR  CONTRIBUTORS  BE  LIABLE FOR ANY  DIRECT,  INDIRECT,  INCIDENTAL,
 #  SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES  (INCLUDING,  BUT NOT
 #  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 #  DATA OR PROFITS; OR BUSINESS INTERRUPTION)  HOWEVER CAUSED AND ON ANY
 #  THEORY OF LIABILITY, WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
 #  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 #  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 # ######################################################################
 #@HEADER
 # ----------------------------------------------------------------------
 # - shell --------------------------------------------------------------
 # ----------------------------------------------------------------------
 #
 SHELL        = /bin/sh
 #
 CD           = cd
 CP           = cp
 LN_S         = ln -s -f
 MKDIR        = mkdir -p
 RM           = /bin/rm -f
 TOUCH        = touch
 #
 # ----------------------------------------------------------------------
 # - HPCG Directory Structure / HPCG library ------------------------------
 # ----------------------------------------------------------------------
 #
 TOPdir       = .
 SRCdir       = $(TOPdir)/src
 INCdir       = $(TOPdir)/src
 BINdir       = $(TOPdir)/bin
 #
 # ----------------------------------------------------------------------
 # - Message Passing library (MPI) --------------------------------------
 # ----------------------------------------------------------------------
 # MPinc tells the  C  compiler where to find the Message Passing library
 # header files,  MPlib  is defined  to be the name of  the library to be
 # used. The variable MPdir is only used for defining MPinc and MPlib.
 #
 #MPdir        =
 #MPinc        =
 #MPlib        =
 #
 #
 # ----------------------------------------------------------------------
 # - HPCG includes / libraries / specifics -------------------------------
 # ----------------------------------------------------------------------
 #
 # NVPL Sparse directories derived from NVPL_PATH; they are defined here but not
 # referenced by the include/link lines of this section.
 NVPL_SPARSE_INC=$(NVPL_PATH)/include
 NVPL_SPARSE_LIB=$(NVPL_PATH)/lib
 # Baseline include paths (source tree, per-arch directory, MPI headers) and the MPI link line.
 HPCG_INCLUDES = -I$(INCdir) -I$(INCdir)/$(arch) -I$(MPdir)/include  $(CRAY_CUDATOOLKIT_INCLUDE_OPTS)
 HPCG_LIBS     = -L${MPlib} -lmpi
 # CUDA headers and libraries are added unconditionally in this setup file (no USE_CUDA test).
 HPCG_INCLUDES += -I$(CUDA_HOME)/include -I$(Mathdir)/include
 HPCG_LIBS += -L$(Mathdir)/lib  -lcuda -lcusparse -lcublas -lcublasLt  -L$(CUDA_HOME)/lib64
 # USE_NCCL=1: add the NCCL headers and link nccl.
 ifeq ($(USE_NCCL), 1)
    HPCG_INCLUDES += -I$(NCCLdir)/include
    HPCG_LIBS += -L$(NCCLdir)/lib -lnccl
 endif
 #
 # - Compile time options -----------------------------------------------
 #
 # -DHPCG_NO_MPI	             Define to disable MPI
 # -DHPCG_NO_OPENMP	         Define to disable OPENMP
 # -DHPCG_CONTIGUOUS_ARRAYS   Define to have sparse matrix arrays long and contiguous
 # -DHPCG_DEBUG       	     Define to enable debugging output
 # -DHPCG_DETAILED_DEBUG      Define to enable very detailed debugging output
 # -DUSE_CUDA                 Define to enable GPU execution
 # -DUSE_NCCL                 Define to enable NCCL P2P communication. Use --p2p=4 for NCCL
 # -DINDEX_64                 Define to enable INT64 indexing (added when USE_INT64=1 is passed to make)
 # By default HPCG will:
 #    *) Build with MPI enabled.
 #    *) Build with OpenMP enabled.
 #    *) Not generate debugging output.
 #
 # Preprocessor definitions passed to every compile. USE_CUDA is always defined in this
 # setup file; the remaining ones are appended only when the matching make variable equals 1.
 HPCG_OPTS  = -DHPCG_CUBIC_RADICAL_SEARCH  -DHPCG_CONTIGUOUS_ARRAYS  #-DHPCG_DEBUG #-DHPCG_NO_MPI
 HPCG_OPTS  += -DUSE_CUDA
 ifeq ($(USE_NCCL), 1)
    HPCG_OPTS  += -DUSE_NCCL
 endif
 ifeq ($(HPCG_ENG_VERSION), 1)
    HPCG_OPTS += -DHPCG_ENG_VERSION
 endif
 # Note the name difference: the make variable is USE_INT64, the macro it defines is INDEX_64.
 ifeq ($(USE_INT64), 1)
    HPCG_OPTS += -DINDEX_64
 endif
 #If not set, the default values in src/hpcg.hpp will be used
 # The three defines below are emitted unconditionally; when the make variable is empty
 # the macro is still defined, with an empty value.
 HPCG_OPTS += -Dmake_HPCG_VER_MAJOR=$(HPCG_VER_MAJOR)
 HPCG_OPTS += -Dmake_HPCG_VER_MINOR=$(HPCG_VER_MINOR)
 HPCG_OPTS += -DHPCG_COMMIT_HASH=$(HPCG_COMMIT_HASH)
 #
 # ----------------------------------------------------------------------
 #
 HPCG_DEFS     = $(HPCG_OPTS) $(HPCG_INCLUDES)
 #
 # ----------------------------------------------------------------------
 # - Compilers / linkers - Optimization flags ---------------------------
 # ----------------------------------------------------------------------
 #
 # GPU code generation targets: sm_80 and sm_90 always; BUILD_B100=1 appends sm_100.
 CUDA_ARCH = -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90
 ifeq ($(BUILD_B100), 1)
 CUDA_ARCH += --generate-code arch=compute_100,code=sm_100
 endif
 # Host CPU to tune for; "native" unless CPU_ARCH is already set.
 CPU_ARCH ?= native
 # nvcc is the compiler driver for all sources; host-compiler options are forwarded with -Xcompiler.
 CXX          = nvcc
 # On x86 GCC, -mcpu= is only a deprecated alias of -mtune= and warns on every compile,
 # so only -mtune is passed here; the generated code is the same.
 # NOTE(review): add -march=$(CPU_ARCH) if host code should also be allowed to use the
 # build machine's instruction-set extensions.
 CXXFLAGS     = $(HPCG_DEFS) -O3 -Xcompiler --std=c++17 -Xcompiler -Ofast,-fopenmp,-mtune=$(CPU_ARCH),-ftree-vectorize,-funroll-loops $(CUDA_ARCH)
 #
 # Linking uses the same driver and flags as compilation, plus the library list.
 LINKER       = $(CXX)
 LINKFLAGS    = $(CXXFLAGS) $(HPCG_LIBS)
 #
 ARCHIVER     = ar
 ARFLAGS      = r
 # RANLIB is set to echo, so the ranlib step only prints its arguments.
 RANLIB       = echo
 #
 # ----------------------------------------------------------------------
--- a/setup/Make.UNKNOWN
+++ b/setup/Make.UNKNOWN
@@ -0,0 +1,24 @@
 # -*- Makefile -*-
 # Placeholder setup file used when no architecture has been chosen.
 arch=UNKNOWN
 VERSION = 3.1
 # Prints instructions for creating and selecting an architecture setup file.
 UNKNOWN:
 	@echo
 	@echo Please specify "'"arch"'" variable, for example:
 	@echo 1. Create file "'"Make.Unix"'" in the "'"setup"'" directory
 	@echo 2. Type: "'"make arch=Unix"'"
 	@echo
 # Name of the GNU tar executable used by the dist target (alternative kept commented out).
 #GNUTAR = gnutar # or "gtar" on Linux
 GNUTAR = gtar
 # Packages the sources into hpcg-$(VERSION).tar.gz. A temporary symlink named
 # hpcg-$(VERSION) pointing at "." gives every archived path that prefix (tar follows
 # it because of the "h" flag); the symlink is removed afterwards. The archive group is
 # taken from the /etc/group lines containing ":0:" (normally the gid-0 group).
 dist:
 	@echo Packaging for version $(VERSION)
 	ln -s -f . hpcg-$(VERSION)
 	grep :0: /etc/group | sed -e 's/:.*//' | xargs -I '{}' $(GNUTAR) --owner=root --group='{}' -cvhof hpcg-$(VERSION).tar hpcg-$(VERSION)/src/*.[ch]pp hpcg-$(VERSION)/[BCHIQRTV]* hpcg-$(VERSION)/bin/hpcg.dat hpcg-$(VERSION)/setup/Make.* hpcg-$(VERSION)/configure hpcg-$(VERSION)/Makefile hpcg-$(VERSION)/Makefile.ext hpcg-$(VERSION)/tools/hpcg.dox
 	gzip -v --best hpcg-$(VERSION).tar
 	rm -f hpcg-$(VERSION)
 .PHONY: UNKNOWN dist
--- a/src/CG.cpp
+++ b/src/CG.cpp
@@ -0,0 +1,241 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*
 * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /*!
 @file CG.cpp
 HPCG routine
 */
 #include <fstream>
 #include <cmath>
 #include "hpcg.hpp"
 #include "CG.hpp"
 #include "ComputeDotProduct.hpp"
 #include "ComputeMG.hpp"
 #include "ComputeSPMV.hpp"
 #include "ComputeWAXPBY.hpp"
 #include "mytimer.hpp"
 #include <iostream>
 #include "CpuKernels.hpp"
 #include <mpi.h>
 extern int use_output_file;
 #define TICKD() t0 = mytimer()       //!< record current time in 't0'
 #define TOCKD(t) t += mytimer() - t0 //!< store time difference in 't' using time in 't0'
 /*!
  Routine to compute an approximate solution to Ax = b
  @param[in]    geom The description of the problem's geometry.
  @param[inout] A    The known system matrix
  @param[inout] data The data structure with all necessary CG vectors preallocated
  @param[in]    b    The known right hand side vector
  @param[inout] x    On entry: the initial guess; on exit: the new approximate solution
  @param[in]    max_iter  The maximum number of iterations to perform, even if tolerance is not met.
  @param[in]    tolerance The stopping criterion to assert convergence: if norm of residual is <= to tolerance.
  @param[out]   niters    The number of iterations actually performed.
  @param[out]   normr     The 2-norm of the residual vector after the last iteration.
  @param[out]   normr0    The 2-norm of the residual vector before the first iteration.
  @param[out]   times     The 7-element vector of the timing information accumulated during all of the iterations.
  @param[in]    doPreconditioning The flag to indicate whether the preconditioner should be invoked at each iteration.
  @return Returns zero on success and a non-zero value otherwise.
  @see CG_ref()
 */
 int CG(const SparseMatrix& A, CGData& data, const Vector& b, Vector& x, const int max_iter, const double tolerance,
    int& niters, double& normr, double& normr0, double* times, bool doPreconditioning, int flag)
 {
    double t_begin = mytimer(); // Start timing right away
    normr = 0.0;
    double rtz = 0.0, oldrtz = 0.0, alpha = 0.0, beta = 0.0, pAp = 0.0;
    double t0 = 0.0, t1 = 0.0, t2 = 0.0, t3 = 0.0, t4 = 0.0, t5 = 0.0;
    // #ifndef HPCG_NO_MPI
    //   double t6 = 0.0;
    // #endif
    local_int_t nrow = A.localNumberOfRows;
    Vector& r = data.r; // Residual vector
    Vector& z = data.z; // Preconditioned residual vector
    Vector& p = data.p; // Direction vector (in MPI mode ncol>=nrow)
    Vector& Ap = data.Ap;
    if (!doPreconditioning && A.geom->rank == 0)
        if (use_output_file)
        {
            HPCG_fout << "WARNING: PERFORMING UNPRECONDITIONED ITERATIONS" << std::endl;
        }
        else
        {
            std::cout << "WARNING: PERFORMING UNPRECONDITIONED ITERATIONS" << std::endl;
        }
    int print_freq = 1;
    if (print_freq > 50)
        print_freq = 50;
    if (print_freq < 1)
        print_freq = 1;
    // p is of length ncols, copy x to p for sparse MV operation
    if (A.rankType == GPU)
    {
 #ifdef USE_CUDA
        CopyVectorD2D(x, p);
 #endif
    }
    else
    {
        CopyVector(x, p);
    }
    TICKD();
    ComputeSPMV(A, p, Ap);
    TOCKD(t3); // Ap = A*p
    TICKD();
    ComputeWAXPBY(nrow, 1.0, b, -1.0, Ap, r, A.isWaxpbyOptimized, A.rankType);
    TOCKD(t2); // r = b - Ax (x stored in p)
    TICKD();
    ComputeDotProduct(nrow, r, r, normr, t4, A.isDotProductOptimized, A.rankType);
    TOCKD(t1);
    normr = sqrt(normr);
    if (A.geom->rank == 0 && flag)
        if (use_output_file)
        {
            HPCG_fout << "Initial Residual = " << normr << std::endl;
        }
        else
        {
            std::cout << "Initial Residual = " << normr << std::endl;
        }
    // Record initial residual for convergence testing
    normr0 = normr;
    // Start iterations
    for (int k = 1; k <= max_iter && normr / normr0 * (1.0 + 1.0e-6) > tolerance; k++)
    {
        TICKD();
        if (doPreconditioning)
        {
            ComputeMG(A, r, z); // Apply preconditioner
            if (A.rankType == GPU)
            {
 #ifdef USE_CUDA
                cudaStreamSynchronize(stream);
 #endif
            }
        }
        else
        {
            if (A.rankType == GPU)
            {
 #ifdef USE_CUDA
                CopyVectorD2D(r, z); // copy r to z (no preconditioning)
 #endif
            }
            else
            {
                CopyVector(r, z); // copy r to z (no preconditioning)
            }
        }
        TOCKD(t5); // Preconditioner apply time
        if (k == 1)
        {
            TICKD();
            ComputeWAXPBY(nrow, 1.0, z, 0.0, z, p, A.isWaxpbyOptimized, A.rankType);
            TOCKD(t2); // Copy Mr to p
            TICKD();
            ComputeDotProduct(nrow, r, z, rtz, t4, A.isDotProductOptimized, A.rankType);
            TOCKD(t1); // rtz = r'*z
        }
        else
        {
            oldrtz = rtz;
            TICKD();
            ComputeDotProduct(nrow, r, z, rtz, t4, A.isDotProductOptimized, A.rankType);
            TOCKD(t1); // rtz = r'*z
            beta = rtz / oldrtz;
            TICKD();
            ComputeWAXPBY(nrow, 1.0, z, beta, p, p, A.isWaxpbyOptimized, A.rankType);
            TOCKD(t2); // p = beta*p + z
        }
        TICKD();
        ComputeSPMV(A, p, Ap);
        TOCKD(t3); // Ap = A*p
        TICKD();
        ComputeDotProduct(nrow, p, Ap, pAp, t4, A.isDotProductOptimized, A.rankType);
        TOCKD(t1); // alpha = p'*Ap
        alpha = rtz / pAp;
        TICKD();
        ComputeWAXPBY(nrow, 1.0, x, alpha, p, x, A.isWaxpbyOptimized, A.rankType); // x = x + alpha*p
        ComputeWAXPBY(nrow, 1.0, r, -alpha, Ap, r, A.isWaxpbyOptimized, A.rankType);
        TOCKD(t2); // r = r - alpha*Ap
        TICKD();
        ComputeDotProduct(nrow, r, r, normr, t4, A.isDotProductOptimized, A.rankType);
        TOCKD(t1);
        normr = sqrt(normr);
        if (flag && A.geom->rank == 0 && (k % print_freq == 0 || k == max_iter))
            if (use_output_file)
            {
                HPCG_fout << "Iteration = " << k << "   Scaled Residual = " << normr / normr0 << std::endl;
            }
            else
            {
                std::cout << "Iteration = " << k << "   Scaled Residual = " << normr / normr0 << std::endl;
            }
        niters = k;
    }
    // Store times
    times[1] += t1; // dot-product time
    times[2] += t2; // WAXPBY time
    times[3] += t3; // SPMV time
    times[4] += t4; // AllReduce time
    times[5] += t5; // preconditioner apply time
                    // #ifndef HPCG_NO_MPI
    //   times[6] += t6; // exchange halo time
    // #endif
    times[0] += mytimer() - t_begin; // Total time. All done...
    return 0;
 }
--- a/src/CG.hpp
+++ b/src/CG.hpp
@@ -0,0 +1,55 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*
 * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef CG_HPP
 #define CG_HPP
 #include "CGData.hpp"
 #include "SparseMatrix.hpp"
 #include "Vector.hpp"
 int CG(const SparseMatrix& A, CGData& data, const Vector& b, Vector& x, const int max_iter, const double tolerance,
    int& niters, double& normr, double& normr0, double* times, bool doPreconditioning, int flag);
 // this function will compute the Conjugate Gradient iterations.
 // A - Matrix
 // data - preallocated CG work vectors (r, z, p, Ap)
 // b - right hand side vector
 // x - on entry the initial guess; used for return value
 // max_iter - how many times we iterate at most
 // tolerance - Stopping tolerance for the scaled residual norm.
 // niters - number of iterations performed
 // normr - computed residual norm
 // normr0 - Original residual
 // times - array of timing information (accumulated into)
 // doPreconditioning - bool to specify whether or not the preconditioner will be applied.
 // flag - when non-zero, rank 0 prints the initial and per-iteration residuals.
 #endif // CG_HPP
--- a/src/CGData.hpp
+++ b/src/CGData.hpp
@@ -0,0 +1,84 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*
 * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /*!
 @file CGData.hpp
 HPCG data structure
 */
 #ifndef CGDATA_HPP
 #define CGDATA_HPP
 #include "SparseMatrix.hpp"
 #include "Vector.hpp"
 //! Bundle of the work vectors used by the CG iterations; the vectors are held by value.
 struct CGData_STRUCT
 {
    Vector r;  //!< residual vector
    Vector z;  //!< preconditioned residual vector
    Vector p;  //!< direction vector
    Vector Ap; //!< product of the matrix with the direction vector
 };
 using CGData = CGData_STRUCT;
 /*!
  Sets up the four CG work vectors stored in a CGData structure.

  r and Ap are sized to the local row count of A, z and p to its local column count; every vector is
  initialized for the rank type of A.

  @param[in]  A    matrix supplying the local row/column counts and the rank type
  @param[out] data CG vector bundle whose members are initialized by this call
 */
 inline void InitializeSparseCGData(SparseMatrix& A, CGData& data)
 {
    local_int_t rowCount = A.localNumberOfRows;
    local_int_t colCount = A.localNumberOfColumns;

    // Residual: one entry per local row.
    InitializeVector(data.r, rowCount, A.rankType);
    // z and p are column-sized; the trailing 'true' only has an effect when the rank type is GPU.
    InitializeVector(data.z, colCount, A.rankType, true);
    InitializeVector(data.p, colCount, A.rankType, true);
    // A*p: one entry per local row.
    InitializeVector(data.Ap, rowCount, A.rankType);
 }
 /*!
  Releases the storage of every vector held in a CGData structure.

  @param[inout] data CG vector bundle; r, z, p and Ap are passed to DeleteVector in that order
 */
 inline void DeleteCGData(CGData& data)
 {
    Vector* const members[] = {&data.r, &data.z, &data.p, &data.Ap};
    for (Vector* member : members)
    {
        DeleteVector(*member);
    }
 }
 #endif // CGDATA_HPP
--- a/src/CG_ref.cpp
+++ b/src/CG_ref.cpp
@@ -0,0 +1,198 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*!
 @file CG_ref.cpp
 HPCG routine
 */
 #include <cmath>
 #include <fstream>
 #include <iostream>
 #include "hpcg.hpp"
 #include "CG_ref.hpp"
 #include "ComputeDotProduct_ref.hpp"
 #include "ComputeMG_ref.hpp"
 #include "ComputeSPMV_ref.hpp"
 #include "ComputeWAXPBY_ref.hpp"
 #include "mytimer.hpp"
 extern int use_output_file;
 // Use TICK and TOCK to time a code section in MATLAB-like fashion
 #define TICK() t0 = mytimer()       //!< record current time in 't0'
 #define TOCK(t) t += mytimer() - t0 //!< store time difference in 't' using time in 't0'
 /*!
  Reference routine to compute an approximate solution to Ax = b

  @param[in]    A    The known system matrix
  @param[inout] data The data structure with all necessary CG vectors preallocated
  @param[in]    b    The known right hand side vector
  @param[inout] x    On entry: the initial guess; on exit: the new approximate solution
  @param[in]    max_iter  The maximum number of iterations to perform, even if tolerance is not met.
  @param[in]    tolerance The stopping criterion to assert convergence: if norm of residual is <= to tolerance.
  @param[out]   niters    The number of iterations actually performed (0 when the iteration loop never runs).
  @param[out]   normr     The 2-norm of the residual vector after the last iteration.
  @param[out]   normr0    The 2-norm of the residual vector before the first iteration.
  @param[inout] times     Timing array; entries 0 through 5 are incremented by the times measured in this call
                          (0: total, 1: dot product, 2: WAXPBY, 3: SPMV, 4: AllReduce, 5: preconditioner).
  @param[in]    doPreconditioning The flag to indicate whether the preconditioner should be invoked at each iteration.
  @param[in]    flag      When non-zero, rank 0 prints the initial residual and the scaled residual of the iterations.

  @return Returns zero; this routine has no failure path.

  @see CG()
 */
 int CG_ref(const SparseMatrix& A, CGData& data, const Vector& b, Vector& x, const int max_iter, const double tolerance,
    int& niters, double& normr, double& normr0, double* times, bool doPreconditioning, int flag)
 {
    double t_begin = mytimer(); // Start timing right away
    normr = 0.0;
    niters = 0; // Out-parameter: must be well defined even if the loop below performs no iteration
    double rtz = 0.0, oldrtz = 0.0, alpha = 0.0, beta = 0.0, pAp = 0.0;
    double t0 = 0.0, t1 = 0.0, t2 = 0.0, t3 = 0.0, t4 = 0.0, t5 = 0.0;
    // #ifndef HPCG_NO_MPI
    //   double t6 = 0.0;
    // #endif
    local_int_t nrow = A.localNumberOfRows;
    Vector& r = data.r; // Residual vector
    Vector& z = data.z; // Preconditioned residual vector
    Vector& p = data.p; // Direction vector (in MPI mode ncol>=nrow)
    Vector& Ap = data.Ap;

    if (!doPreconditioning && A.geom->rank == 0)
    {
        if (use_output_file)
        {
            HPCG_fout << "WARNING: PERFORMING UNPRECONDITIONED ITERATIONS" << std::endl;
        }
        else
        {
            std::cout << "WARNING: PERFORMING UNPRECONDITIONED ITERATIONS" << std::endl;
        }
    }

 #if 1
    // def HPCG_DEBUG
    // Residual is reported every print_freq iterations (and on the last allowed iteration).
    const int print_freq = 1;
 #endif

    // p is of length ncols, copy x to p for sparse MV operation
    CopyVector(x, p);
    TICK();
    ComputeSPMV_ref(A, p, Ap);
    TOCK(t3); // Ap = A*p
    TICK();
    ComputeWAXPBY_ref(nrow, 1.0, b, -1.0, Ap, r);
    TOCK(t2); // r = b - Ax (x stored in p)
    TICK();
    ComputeDotProduct_ref(nrow, r, r, normr, t4);
    TOCK(t1);
    normr = std::sqrt(normr);
 #if 1
    // def HPCG_DEBUG
    if (A.geom->rank == 0 && flag)
    {
        if (use_output_file)
        {
            HPCG_fout << "Initial Residual = " << normr << std::endl;
        }
        else
        {
            std::cout << "Initial Residual = " << normr << std::endl;
        }
    }
 #endif

    // Record initial residual for convergence testing
    normr0 = normr;

    // Start iterations
    for (int k = 1; k <= max_iter && normr / normr0 > tolerance; k++)
    {
        TICK();
        if (doPreconditioning)
            ComputeMG_ref(A, r, z); // Apply preconditioner
        else
            ComputeWAXPBY_ref(nrow, 1.0, r, 0.0, r, z); // copy r to z (no preconditioning)
        TOCK(t5);                                       // Preconditioner apply time

        if (k == 1)
        {
            // Restart the timer: without this the interval charged to t2 would start before the
            // preconditioner and count its time a second time (it is already accumulated in t5).
            TICK();
            CopyVector(z, p);
            TOCK(t2); // Copy Mr to p
            TICK();
            ComputeDotProduct_ref(nrow, r, z, rtz, t4);
            TOCK(t1); // rtz = r'*z
        }
        else
        {
            oldrtz = rtz;
            TICK();
            ComputeDotProduct_ref(nrow, r, z, rtz, t4);
            TOCK(t1); // rtz = r'*z
            beta = rtz / oldrtz;
            TICK();
            ComputeWAXPBY_ref(nrow, 1.0, z, beta, p, p);
            TOCK(t2); // p = beta*p + z
        }

        TICK();
        ComputeSPMV_ref(A, p, Ap);
        TOCK(t3); // Ap = A*p
        TICK();
        ComputeDotProduct_ref(nrow, p, Ap, pAp, t4);
        TOCK(t1); // alpha = p'*Ap
        alpha = rtz / pAp;
        TICK();
        ComputeWAXPBY_ref(nrow, 1.0, x, alpha, p, x); // x = x + alpha*p
        ComputeWAXPBY_ref(nrow, 1.0, r, -alpha, Ap, r);
        TOCK(t2); // r = r - alpha*Ap
        TICK();
        ComputeDotProduct_ref(nrow, r, r, normr, t4);
        TOCK(t1);
        normr = std::sqrt(normr);
 #if 1
        // def HPCG_DEBUG
        if (flag && A.geom->rank == 0 && (k % print_freq == 0 || k == max_iter))
        {
            if (use_output_file)
            {
                HPCG_fout << "Iteration = " << k << "   Scaled Residual = " << normr / normr0 << std::endl;
            }
            else
            {
                std::cout << "Iteration = " << k << "   Scaled Residual = " << normr / normr0 << std::endl;
            }
        }
 #endif
        niters = k;
    }

    // Store times
    times[1] += t1; // dot product time
    times[2] += t2; // WAXPBY time
    times[3] += t3; // SPMV time
    times[4] += t4; // AllReduce time
    times[5] += t5; // preconditioner apply time
    // #ifndef HPCG_NO_MPI
    //   times[6] += t6; // exchange halo time
    // #endif
    times[0] += mytimer() - t_begin; // Total time. All done...
    return 0;
 }
--- a/src/CG_ref.hpp
+++ b/src/CG_ref.hpp
@@ -0,0 +1,42 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 #ifndef CG_REF_HPP
 #define CG_REF_HPP
 #include "CGData.hpp"
 #include "SparseMatrix.hpp"
 #include "Vector.hpp"
 // The use of CPU and GPU Sparse Matrix is intended to resolve
 // the linked list structures for MG coarse levels.
 // There is no change to the reference code.
 int CG_ref(const SparseMatrix& A, CGData& data, const Vector& b, Vector& x, const int max_iter, const double tolerance,
    int& niters, double& normr, double& normr0, double* times, bool doPreconditioning, int flag);
 // this function will compute the Conjugate Gradient iterations.
 // A - Matrix
 // data - preallocated CG work vectors (r, z, p, Ap)
 // b - right hand side vector
 // x - on entry the initial guess; used for return value
 // max_iter - how many times we iterate at most
 // tolerance - Stopping tolerance for the scaled residual norm.
 // niters - number of iterations performed
 // normr - computed residual norm
 // normr0 - Original residual
 // times - array of timing information (accumulated into)
 // doPreconditioning - bool to specify whether or not the preconditioner will be applied.
 // flag - when non-zero, rank 0 prints the initial and per-iteration residuals.
 #endif // CG_REF_HPP
--- a/src/CheckAspectRatio.cpp
+++ b/src/CheckAspectRatio.cpp
@@ -0,0 +1,84 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*
 * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /*!
 @file CheckAspectRatio.cpp
 HPCG routine
 */
 #include <algorithm>
 #include <iostream>
 #ifndef HPCG_NO_MPI
 #include <mpi.h>
 #endif
 #include "hpcg.hpp"
 #include "CheckAspectRatio.hpp"
 extern int use_output_file;
 /*!
  Checks that the smallest of three sizes is not too small relative to the largest one.

  @param[in] smallest_ratio minimum accepted value of min(x,y,z)/max(x,y,z)
  @param[in] x, y, z        the three sizes to compare
  @param[in] what           label naming the sizes in the diagnostic message
  @param[in] DoIo           when true, a diagnostic is written to HPCG_fout or std::cout on failure

  @return 0 when the ratio is acceptable; 127 otherwise (after MPI_Abort in MPI builds).
 */
 int CheckAspectRatio(double smallest_ratio, int x, int y, int z, const char* what, bool DoIo)
 {
    const int shortest = std::min(x, std::min(y, z));
    const int longest = std::max(x, std::max(y, z));
    const double ratio = shortest / double(longest); // ratio of the smallest to the largest

    // Written as a negated '<' so that a NaN ratio is accepted exactly as before.
    if (!(ratio < smallest_ratio))
    {
        return 0;
    }

    if (DoIo && use_output_file)
    {
        HPCG_fout << "The " << what << " sizes (" << x << "," << y << "," << z
                  << ") are invalid because the ratio min(x,y,z)/max(x,y,z)=" << ratio
                  << " is too small (at least " << smallest_ratio << " is required)." << std::endl;
        HPCG_fout << "The shape should resemble a 3D cube. Please adjust and try again." << std::endl;
        HPCG_fout.flush();
    }
    else if (DoIo)
    {
        std::cout << "The " << what << " sizes (" << x << "," << y << "," << z
                  << ") are invalid because the ratio min(x,y,z)/max(x,y,z)=" << ratio
                  << " is too small (at least " << smallest_ratio << " is required)." << std::endl;
        std::cout << "The shape should resemble a 3D cube. Please adjust and try again." << std::endl
                  << std::flush;
    }

 #ifndef HPCG_NO_MPI
    MPI_Abort(MPI_COMM_WORLD, 127);
 #endif
    return 127;
 }
--- a/src/CheckAspectRatio.hpp
+++ b/src/CheckAspectRatio.hpp
@@ -0,0 +1,18 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 #ifndef CHECKASPECTRATIO_HPP
 #define CHECKASPECTRATIO_HPP
 // Returns 0 when min(x,y,z)/max(x,y,z) is at least smallest_ratio and 127 otherwise.
 // 'what' names the sizes in the diagnostic message; 'DoIo' enables printing that message.
 extern int CheckAspectRatio(double smallest_ratio, int x, int y, int z, const char* what, bool DoIo);
 #endif // CHECKASPECTRATIO_HPP
--- a/src/CheckProblem.cpp
+++ b/src/CheckProblem.cpp
@@ -0,0 +1,192 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*
 * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /*!
 @file CheckProblem.cpp
 HPCG routine
 */
 #ifndef HPCG_NO_MPI
 #include <mpi.h>
 #endif
 #ifndef HPCG_NO_OPENMP
 #include <omp.h>
 #endif
 #if defined(HPCG_DEBUG) || defined(HPCG_DETAILED_DEBUG)
 #include <fstream>
 using std::endl;
 #include "hpcg.hpp"
 #endif
 #include <cassert>
 #include "CheckProblem.hpp"
 /*!
  Check the contents of the generated sparse matrix to see if values match expected contents.

  For every local row the routine walks the 27-point neighbourhood of the corresponding grid point and
  asserts that the stored values (26.0 on the diagonal, -1.0 elsewhere), the global column indices and the
  per-row nonzero count match. The local/global row and nonzero totals are asserted at the end.
  All checks are expressed with assert(), so they are compiled out when NDEBUG is defined.

  @param[in] A      The system matrix to verify
  @param[in] b      Right hand side vector to verify, or 0 to skip; each entry is expected to equal
                    26.0 - (number of nonzeros in the row - 1)
  @param[in] x      Initial guess vector to verify, or 0 to skip; each entry is expected to be 0.0
  @param[in] xexact Exact solution vector to verify, or 0 to skip; each entry is expected to be 1.0

  @see GenerateGeometry
 */
 void CheckProblem(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact)
 {
    // Make local copies of geometry information.  Use global_int_t since the RHS products in the calculations
    // below may result in global range values.
    global_int_t nx = A.geom->nx;
    global_int_t ny = A.geom->ny;
    global_int_t nz = A.geom->nz;
    global_int_t gnx = A.geom->gnx;
    global_int_t gny = A.geom->gny;
    global_int_t gnz = A.geom->gnz;
    global_int_t gix0 = A.geom->gix0;
    global_int_t giy0 = A.geom->giy0;
    global_int_t giz0 = A.geom->giz0;
    local_int_t localNumberOfRows = nx * ny * nz;     // This is the size of our subblock
    global_int_t totalNumberOfRows = gnx * gny * gnz; // Total number of grid points in mesh
    double* bv = 0;
    double* xv = 0;
    double* xexactv = 0;
    if (b != 0)
        bv = b->values; // Only check the right hand side if one was passed
    if (x != 0)
        xv = x->values; // Only check the initial guess if one was passed
    if (xexact != 0)
        xexactv = xexact->values; // Only check the exact solution if one was passed
    local_int_t localNumberOfNonzeros = 0;
    // TODO:  This triply nested loop could be flattened or use nested parallelism
 #ifndef HPCG_NO_OPENMP
 #pragma omp parallel for
 #endif
    for (local_int_t iz = 0; iz < nz; iz++)
    {
        global_int_t giz = giz0 + iz;
        for (local_int_t iy = 0; iy < ny; iy++)
        {
            global_int_t giy = giy0 + iy;
            for (local_int_t ix = 0; ix < nx; ix++)
            {
                global_int_t gix = gix0 + ix;
                // Row index within this subblock and within the global grid (x fastest, then y, then z)
                local_int_t currentLocalRow = iz * nx * ny + iy * nx + ix;
                global_int_t currentGlobalRow = giz * gnx * gny + giy * gnx + gix;
                assert(A.localToGlobalMap[currentLocalRow] == currentGlobalRow);
 #ifdef HPCG_DETAILED_DEBUG
                HPCG_fout << " rank, globalRow, localRow = " << A.geom->rank << " " << currentGlobalRow << " "
                          << A.globalToLocalMap.find(currentGlobalRow)->second << endl;
 #endif
                char numberOfNonzerosInRow = 0;
                double* currentValuePointer
                    = A.matrixValues[currentLocalRow]; // Pointer to current value in current row
                global_int_t* currentIndexPointerG
                    = A.mtxIndG[currentLocalRow]; // Pointer to current index in current row
                // Visit the neighbours at offsets -1..1 in each direction, skipping those outside the
                // global grid. NOTE: the value/index pointers are advanced inside the assert expressions.
                for (int sz = -1; sz <= 1; sz++)
                {
                    if (giz + sz > -1 && giz + sz < gnz)
                    {
                        for (int sy = -1; sy <= 1; sy++)
                        {
                            if (giy + sy > -1 && giy + sy < gny)
                            {
                                for (int sx = -1; sx <= 1; sx++)
                                {
                                    if (gix + sx > -1 && gix + sx < gnx)
                                    {
                                        global_int_t curcol = currentGlobalRow + sz * gnx * gny + sy * gnx + sx;
                                        if (curcol == currentGlobalRow)
                                        {
                                            // Diagonal entry: the stored diagonal pointer must point here
                                            assert(A.matrixDiagonal[currentLocalRow] == currentValuePointer);
                                            assert(*currentValuePointer++ == 26.0);
                                        }
                                        else
                                        {
                                            assert(*currentValuePointer++ == -1.0);
                                        }
                                        assert(*currentIndexPointerG++ == curcol);
                                        numberOfNonzerosInRow++;
                                    } // end x bounds test
                                } // end sx loop
                            } // end y bounds test
                        } // end sy loop
                    } // end z bounds test
                } // end sz loop
                assert(A.nonzerosInRow[currentLocalRow] == numberOfNonzerosInRow);
 #ifndef HPCG_NO_OPENMP
 #pragma omp critical
 #endif
                localNumberOfNonzeros += numberOfNonzerosInRow; // Shared counter: guarded by the critical above
                if (b != 0)
                    assert(bv[currentLocalRow] == 26.0 - ((double) (numberOfNonzerosInRow - 1)));
                if (x != 0)
                    assert(xv[currentLocalRow] == 0.0);
                if (xexact != 0)
                    assert(xexactv[currentLocalRow] == 1.0);
            } // end ix loop
        } // end iy loop
    } // end iz loop
 #ifdef HPCG_DETAILED_DEBUG
    HPCG_fout << "Process " << A.geom->rank << " of " << A.geom->size << " has " << localNumberOfRows << " rows."
              << endl
              << "Process " << A.geom->rank << " of " << A.geom->size << " has " << localNumberOfNonzeros
              << " nonzeros." << endl;
 #endif
    global_int_t totalNumberOfNonzeros = 0;
 #ifndef HPCG_NO_MPI
    // Use MPI's reduce function to sum all nonzeros
 #ifdef HPCG_NO_LONG_LONG
    MPI_Allreduce(&localNumberOfNonzeros, &totalNumberOfNonzeros, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
 #else
    long long lnnz = localNumberOfNonzeros, gnnz = 0; // convert to 64 bit for MPI call
    MPI_Allreduce(&lnnz, &gnnz, 1, MPI_LONG_LONG_INT, MPI_SUM, MPI_COMM_WORLD);
    totalNumberOfNonzeros = gnnz; // Copy back
 #endif
 #else
    // Single process: the local count is the global count
    totalNumberOfNonzeros = localNumberOfNonzeros;
 #endif
    // Compare the recomputed totals with the ones stored in the matrix
    assert(A.totalNumberOfRows == totalNumberOfRows);
    assert(A.totalNumberOfNonzeros == totalNumberOfNonzeros);
    assert(A.localNumberOfRows == localNumberOfRows);
    assert(A.localNumberOfNonzeros == localNumberOfNonzeros);
    return;
 }
--- a/src/CheckProblem.hpp
+++ b/src/CheckProblem.hpp
@@ -0,0 +1,21 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 #ifndef CHECKPROBLEM_HPP
 #define CHECKPROBLEM_HPP
 #include "SparseMatrix.hpp"
 #include "Vector.hpp"
 // Asserts that the generated matrix A has the expected values, indices and counts.
 // b, x and xexact are optional: pass 0 to skip the check of the corresponding vector.
 void CheckProblem(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact);
 #endif // CHECKPROBLEM_HPP
--- a/src/ComputeDotProduct.cpp
+++ b/src/ComputeDotProduct.cpp
@@ -0,0 +1,114 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*
 * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /*!
 @file ComputeDotProduct.cpp
 HPCG routine
 */
 #ifndef HPCG_NO_MPI
 #include "mytimer.hpp"
 #include <mpi.h>
 #endif
 #include "ComputeDotProduct.hpp"
 #include "ComputeDotProduct_ref.hpp"
 #ifdef USE_CUDA
 #include "Cuda.hpp"
 // Evaluates the cuBLAS call `x` exactly once. If the returned status differs from
 // CUBLAS_STATUS_SUCCESS, prints the call text, the numeric status, the file and the line to
 // stderr and terminates the process with exit(1). The do { } while (0) wrapper makes the
 // expansion behave like a single statement.
 #define CHECK_CUBLAS(x)                                                                                                \
    do                                                                                                                 \
    {                                                                                                                  \
        cublasStatus_t cublasStatus = (x);                                                                             \
        if (cublasStatus != CUBLAS_STATUS_SUCCESS)                                                                     \
        {                                                                                                              \
            fprintf(stderr, "CUBLAS: %s = %d at (%s:%d)\n", #x, cublasStatus, __FILE__, __LINE__);                     \
            exit(1);                                                                                                   \
        }                                                                                                              \
    } while (0)
 #endif
 #ifdef USE_GRACE
 #include "CpuKernels.hpp"
 #endif
 /*!
  Routine to compute the dot product of two vectors.
  The local partial sum is computed by cublasDdot when rt == GPU (and USE_CUDA is defined) or by
  ComputeDotProductCpu otherwise (when USE_GRACE is defined). Unless HPCG_NO_MPI is defined the
  partial sums of all ranks are then added with MPI_Allreduce.
  @param[in]  n the number of vector elements (on this processor)
  @param[in]  x, y the input vectors
  @param[out] result on exit contains the dot product (summed over all ranks when MPI is enabled)
  @param[inout] time_allreduce incremented by the time spent in the MPI all-reduce
  @param[out] isOptimized forwarded to the CPU kernel; left untouched on the GPU path
  @param[in]  rt selects the GPU (cuBLAS) or the CPU code path
  @return returns 0 upon success and non-zero if the cuBLAS call reported an error
  @see ComputeDotProduct_ref
 */
 int ComputeDotProduct(const local_int_t n, const Vector& x, const Vector& y, double& result, double& time_allreduce,
    bool& isOptimized, rank_type_t rt)
 {
    int ierr = 0;
    double local_result = 0.0;
    if (rt == GPU)
    {
 #ifdef USE_CUDA
        // Keep the cuBLAS status instead of discarding it, so a failed GPU dot product is
        // reported through the return value.
        const cublasStatus_t status = cublasDdot(cublashandle, n, x.values_d, 1, y.values_d, 1, &local_result);
        if (status != CUBLAS_STATUS_SUCCESS)
            ierr = 1;
 #endif
    }
    else
    {
 #ifdef USE_GRACE
        // Consider replacing with NVPL BLAS dot product
        ComputeDotProductCpu(n, x, y, local_result, isOptimized);
 #endif
    }
 #ifndef HPCG_NO_MPI
    // Use MPI's reduce function to collect all partial sums.
    // The collective is entered even after a local error so that the other ranks are not
    // left waiting inside MPI_Allreduce.
    double t0 = mytimer();
    double global_result = 0.0;
    MPI_Allreduce(&local_result, &global_result, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    result = global_result;
    t0 = mytimer() - t0;
    time_allreduce += t0;
 #else
    time_allreduce += 0.0;
    result = local_result;
 #endif
    return ierr;
 }
--- a/src/ComputeDotProduct.hpp
+++ b/src/ComputeDotProduct.hpp
@@ -0,0 +1,39 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*
 * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef COMPUTEDOTPRODUCT_HPP
 #define COMPUTEDOTPRODUCT_HPP
 #include "Vector.hpp"
 // Dot product of the first n entries of x and y. The value is written to result and the time
 // spent in the MPI all-reduce is added to time_allreduce; rt selects the GPU or CPU code path.
 int ComputeDotProduct(const local_int_t n, const Vector& x, const Vector& y, double& result, double& time_allreduce,
    bool& isOptimized, rank_type_t rt);
 #endif // COMPUTEDOTPRODUCT_HPP
--- a/src/ComputeDotProduct_ref.cpp
+++ b/src/ComputeDotProduct_ref.cpp
@@ -0,0 +1,84 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*!
 @file ComputeDotProduct_ref.cpp
 HPCG routine
 */
 #ifndef HPCG_NO_MPI
 #include "mytimer.hpp"
 #include <mpi.h>
 #endif
 #ifndef HPCG_NO_OPENMP
 #include <omp.h>
 #endif
 #include "ComputeDotProduct_ref.hpp"
 #include <cassert>
 /*!
  Routine to compute the dot product of two vectors where:
  This is the reference dot-product implementation.  It _CANNOT_ be modified for the
  purposes of this benchmark.
  @param[in] n the number of vector elements (on this processor)
  @param[in] x, y the input vectors
  @param[in] result a pointer to scalar value, on exit will contain result.
  @param[out] time_allreduce the time it took to perform the communication between processes
  @return returns 0 upon success and non-zero otherwise
  @see ComputeDotProduct
 */
 int ComputeDotProduct_ref(const local_int_t n, const Vector& x, const Vector& y, double& result, double& time_allreduce)
 {
    assert(x.localLength >= n); // Test vector lengths
    assert(y.localLength >= n);
    double local_result = 0.0;
    double* xv = x.values;
    double* yv = y.values;
    if (yv == xv)
    {
 #ifndef HPCG_NO_OPENMP
 #pragma omp parallel for reduction(+ : local_result)
 #endif
        for (local_int_t i = 0; i < n; i++)
            local_result += xv[i] * xv[i];
    }
    else
    {
 #ifndef HPCG_NO_OPENMP
 #pragma omp parallel for reduction(+ : local_result)
 #endif
        for (local_int_t i = 0; i < n; i++)
            local_result += xv[i] * yv[i];
    }
 #ifndef HPCG_NO_MPI
    // Use MPI's reduce function to collect all partial sums
    double t0 = mytimer();
    double global_result = 0.0;
    MPI_Allreduce(&local_result, &global_result, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    result = global_result;
    time_allreduce += mytimer() - t0;
 #else
    time_allreduce += 0.0;
    result = local_result;
 #endif
    return 0;
 }
--- a/src/ComputeDotProduct_ref.hpp
+++ b/src/ComputeDotProduct_ref.hpp
@@ -0,0 +1,21 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 #ifndef COMPUTEDOTPRODUCT_REF_HPP
 #define COMPUTEDOTPRODUCT_REF_HPP
 #include "Vector.hpp"
 // Reference dot product of the first n entries of x and y. The value is written to result and
 // the time spent in the MPI all-reduce is added to time_allreduce.
 int ComputeDotProduct_ref(
    const local_int_t n, const Vector& x, const Vector& y, double& result, double& time_allreduce);
 #endif // COMPUTEDOTPRODUCT_REF_HPP
--- a/src/ComputeMG.cpp
+++ b/src/ComputeMG.cpp
@@ -0,0 +1,96 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*
 * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /*!
 @file ComputeMG.cpp
 HPCG routine
 */
 #include "ComputeMG.hpp"
 #include "ComputeProlongation.hpp"
 #include "ComputeRestriction.hpp"
 #include "ComputeSYMGS.hpp"
 #include "CudaKernels.hpp"
 /*!
  One multigrid cycle: smooth, restrict, recurse on the coarse matrix A.Ac, prolongate and smooth
  again. When A has no mgData (no coarser level) only a single smoother call is made.
  Restriction and prolongation use the CUDA kernels when A.rankType == GPU (and USE_CUDA is
  defined) and the CPU routines otherwise (when USE_GRACE is defined).
  @param[in] A the known system matrix
  @param[in] r the input vector
  @param[inout] x On exit contains the result of the multigrid V-cycle with r as the RHS, x is the approximation to Ax =
  r.
  @return returns 0 upon success and non-zero if the coarse-level recursion or a CPU
  restriction/prolongation step reported an error
  @see ComputeMG_ref
 */
 int ComputeMG(const SparseMatrix& A, const Vector& r, Vector& x)
 {
    // Status codes are OR-ed together so that any non-zero code survives to the return value
    // (a sum could cancel out). All steps are still executed, exactly as before.
    int ierr = 0;
    if (A.mgData != 0)
    { // Go to next coarse level if defined
        ComputeSYMGS(A, r, x, 1);
        if (A.rankType == GPU)
        {
 #ifdef USE_CUDA
            ComputeRestrictionCuda(A, r);
 #endif
        }
        else
        {
 #ifdef USE_GRACE
            ierr |= ComputeRestriction(A, r);
 #endif
        }
        ierr |= ComputeMG(*A.Ac, *A.mgData->rc, *A.mgData->xc);
        if (A.rankType == GPU)
        {
 #ifdef USE_CUDA
            ComputeProlongationCuda(A, x);
 #endif
        }
        else
        {
 #ifdef USE_GRACE
            ierr |= ComputeProlongation(A, x);
 #endif
        }
        ComputeSYMGS(A, r, x, 0);
    }
    else
    {
        ComputeSYMGS(A, r, x, 1);
    }
    // Previously this was a hard-coded 0, which silently discarded the recursive status.
    return ierr;
 }
--- a/src/ComputeMG.hpp
+++ b/src/ComputeMG.hpp
@@ -0,0 +1,22 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 #ifndef COMPUTEMG_HPP
 #define COMPUTEMG_HPP
 #include "SparseMatrix.hpp"
 #include "Vector.hpp"
 // Multigrid cycle on A with right-hand side r; the result is written to x.
 int ComputeMG(const SparseMatrix& A, const Vector& r, Vector& x);
 #endif // COMPUTEMG_HPP
--- a/src/ComputeMG_ref.cpp
+++ b/src/ComputeMG_ref.cpp
@@ -0,0 +1,81 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*!
 @file ComputeMG_ref.cpp
 HPCG routine
 */
 #include "ComputeMG_ref.hpp"
 #include "ComputeProlongation_ref.hpp"
 #include "ComputeRestriction_ref.hpp"
 #include "ComputeSPMV_ref.hpp"
 #include "ComputeSYMGS_ref.hpp"
 #include <cassert>
 #include <iostream>
 /*!
  Reference multigrid V-cycle.
  x is zeroed first. On the coarsest level (no mgData) a single symmetric Gauss-Seidel sweep is
  applied. Otherwise the routine pre-smooths, computes A*x into mgData->Axf, restricts, recurses
  on A.Ac, prolongates and post-smooths, stopping at the first stage that reports an error.
  @param[in] A the known system matrix
  @param[in] r the input vector
  @param[inout] x On exit contains the result of the multigrid V-cycle with r as the RHS, x is the approximation to Ax =
  r.
  @return returns 0 upon success and non-zero otherwise
  @see ComputeMG
 */
 int ComputeMG_ref(const SparseMatrix& A, const Vector& r, Vector& x)
 {
    // x must be long enough to hold the halo values as well.
    assert(x.localLength == A.localNumberOfColumns);
    ZeroVector(x); // start from a zero initial guess

    // Coarsest level: one smoother sweep and we are done.
    if (A.mgData == 0)
        return ComputeSYMGS_ref(A, r, x);

    int status = 0;

    // Pre-smoothing; the sweeps' status codes are summed and checked once afterwards.
    const int preSweeps = A.mgData->numberOfPresmootherSteps;
    for (int sweep = 0; sweep < preSweeps; ++sweep)
        status += ComputeSYMGS_ref(A, r, x);
    if (status != 0)
        return status;

    // Fine-grid product A*x, stored in Axf for the restriction step.
    status = ComputeSPMV_ref(A, x, *A.mgData->Axf);
    if (status != 0)
        return status;

    // Perform restriction operation using simple injection
    status = ComputeRestriction_ref(A, r);
    if (status != 0)
        return status;

    // Recurse on the next coarser level.
    status = ComputeMG_ref(*A.Ac, *A.mgData->rc, *A.mgData->xc);
    if (status != 0)
        return status;

    // Bring the coarse correction back to the fine grid.
    status = ComputeProlongation_ref(A, x);
    if (status != 0)
        return status;

    // Post-smoothing; status is 0 on entry, so the sum is non-zero only if a sweep failed.
    const int postSweeps = A.mgData->numberOfPostsmootherSteps;
    for (int sweep = 0; sweep < postSweeps; ++sweep)
        status += ComputeSYMGS_ref(A, r, x);
    return status;
 }
--- a/src/ComputeMG_ref.hpp
+++ b/src/ComputeMG_ref.hpp
@@ -0,0 +1,26 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 #ifndef COMPUTEMG_REF_HPP
 #define COMPUTEMG_REF_HPP
 #include "SparseMatrix.hpp"
 #include "Vector.hpp"
 // The use of CPU and GPU Sparse Matrix is intended to resolve
 // the linked list structures for MG coarse levels (A->Ac).
 // There is no change to the reference code.
 int ComputeMG_ref(const SparseMatrix& A, const Vector& r, Vector& x);
 #endif // COMPUTEMG_REF_HPP
--- a/src/ComputeOptimalShapeXYZ.cpp
+++ b/src/ComputeOptimalShapeXYZ.cpp
@@ -0,0 +1,175 @@
 #include <cmath>
 #include <cstdlib>
 #ifdef HPCG_CUBIC_RADICAL_SEARCH
 #include <algorithm>
 #endif
 #include <map>
 #include "ComputeOptimalShapeXYZ.hpp"
 #include "MixedBaseCounter.hpp"
 #ifdef HPCG_CUBIC_RADICAL_SEARCH
 // Smallest of three integers.
 static int min3(int a, int b, int c)
 {
    const int lowerOfFirstTwo = (b < a) ? b : a;
    return (c < lowerOfFirstTwo) ? c : lowerOfFirstTwo;
 }
 // Largest of three integers.
 static int max3(int a, int b, int c)
 {
    const int higherOfFirstTwo = (a < b) ? b : a;
    return (higherOfFirstTwo < c) ? c : higherOfFirstTwo;
 }
 // Searches for a factorization n = x * y * z whose smallest-to-largest factor ratio is as
 // close to 1 as possible. The first factor is tried downwards from the rounded cube root of
 // n, the second downwards from the rounded square root of the remaining quotient.
 // x, y and z are only written when a candidate improves on the best ratio seen so far.
 static void cubic_radical_search(int n, int& x, int& y, int& z)
 {
    double bestRatio = 0.0;
    const int firstStart = (int) (pow(n, 1.0 / 3.0) + 0.5);
    for (int first = firstStart; first > 0; --first)
    {
        if (n % first != 0)
            continue; // not a divisor of n
        const int rest = n / first;
        const int secondStart = (int) (pow(rest, 0.5) + 0.5);
        for (int second = secondStart; second > 0; --second)
        {
            if (rest % second != 0)
                continue; // not a divisor of the remaining quotient
            const int third = rest / second;
            const double ratio = (double) min3(first, second, third) / max3(first, second, third);
            if (ratio > bestRatio)
            {
                bestRatio = ratio;
                x = first;
                y = second;
                z = third;
            }
        }
    }
 }
 #else
 // Adds the prime factorization of n to `factors` (prime -> multiplicity).
 // When nothing has been recorded by the end (e.g. n == 1), n itself is stored with count 1,
 // so the map is never left empty.
 static void ComputePrimeFactors(int n, std::map<int, int>& factors)
 {
    // Upper bound for trial divisors, derived from the original value of n.
    const int limit = int((sqrt(double(n))) + 1L);

    // Strip all factors of two with shifts.
    while (n > 1 && (n & 1) == 0)
    {
        ++factors[2];
        n >>= 1;
    }

    // Trial division by the odd numbers up to the limit.
    for (int candidate = 3; candidate <= limit; candidate += 2)
    {
        while (n % candidate == 0)
        {
            ++factors[candidate];
            n /= candidate;
        }
    }

    // What remains is either a prime above the limit, or n was 1 from the start.
    if (n > 1 || factors.empty())
        ++factors[n];
 }
 // Integer power by repeated squaring.
 // Returns x itself when x is 0 or 1 (whatever p is), 0 when p is negative, and otherwise
 // x raised to the power p.
 static int pow_i(int x, int p)
 {
    if (0 == x || 1 == x)
        return x;
    if (p < 0)
        return 0;
    int v = 1;
    for (; p; p >>= 1)
    {
        if (1 & p)
            v *= x;
        // Square only while higher exponent bits remain. The unconditional square after the
        // last bit was never used and could overflow (undefined behaviour for signed int)
        // even when the result itself fits, e.g. pow_i(50000, 1).
        if (p > 1)
            x *= x;
    }
    return v;
 }
 #endif
 void ComputeOptimalShapeXYZ(int xyz, int& x, int& y, int& z)
 {
 #ifdef HPCG_CUBIC_RADICAL_SEARCH
    cubic_radical_search(xyz, x, y, z);
 #else
    std::map<int, int> factors;
    ComputePrimeFactors(xyz, factors); // factors are sorted: ascending order
    std::map<int, int>::iterator iter = factors.begin();
    // there is at least one prime factor
    x = (iter++)->first; // cache the first factor, move to the next one
    y = iter != factors.end() ? (iter++)->first : y; // try to cache the second factor in "y"
    if (factors.size() == 1)
    { // only a single factor
        z = pow_i(x, factors[x] / 3);
        y = pow_i(x, factors[x] / 3 + ((factors[x] % 3) >= 2 ? 1 : 0));
        x = pow_i(x, factors[x] / 3 + ((factors[x] % 3) >= 1 ? 1 : 0));
    }
    else if (factors.size() == 2 && factors[x] == 1 && factors[y] == 1)
    { // two distinct prime factors
        z = 1;
    }
    else if (factors.size() == 2 && factors[x] + factors[y] == 3)
    {                                // three prime factors, one repeated
        z = factors[x] == 2 ? x : y; // test which factor is repeated
    }
    else if (factors.size() == 3 && factors[x] == 1 && factors[y] == 1 && iter->second == 1)
    { // three distinct and single prime factors
        z = iter->first;
    }
    else
    { // 3 or more prime factors so try all possible 3-subsets
        int i, distinct_factors[32 + 1], count_factors[32 + 1];
        i = 0;
        for (std::map<int, int>::iterator iter = factors.begin(); iter != factors.end(); ++iter, ++i)
        {
            distinct_factors[i] = iter->first;
            count_factors[i] = iter->second;
        }
        // count total number of prime factors in "c_main" and distribute some factors into "c1"
        MixedBaseCounter c_main(count_factors, factors.size()), c1(count_factors, factors.size());
        // at the beginning, minimum area is the maximum area
        double area, min_area = 2.0 * xyz + 1.0;
        for (c1.next(); !c1.is_zero(); c1.next())
        {
            MixedBaseCounter c2(c_main, c1); // "c2" gets the factors remaining in "c_main" that "c1" doesn't have
            for (c2.next(); !c2.is_zero(); c2.next())
            {
                int tf1 = c1.product(distinct_factors);
                int tf2 = c2.product(distinct_factors);
                int tf3 = xyz / tf1 / tf2; // we derive the third dimension, we don't keep track of the factors it has
                area = tf1 * double(tf2) + tf2 * double(tf3) + tf1 * double(tf3);
                if (area < min_area)
                {
                    min_area = area;
                    x = tf1;
                    y = tf2;
                    z = tf3;
                }
            }
        }
    }
 #endif
 }
--- a/src/ComputeOptimalShapeXYZ.hpp
+++ b/src/ComputeOptimalShapeXYZ.hpp
@@ -0,0 +1,2 @@
 // Splits xyz into three integer factors, returned through x, y and z.
 void ComputeOptimalShapeXYZ(int xyz, int& x, int& y, int& z);
--- a/src/ComputeProlongation.cpp
+++ b/src/ComputeProlongation.cpp
@@ -0,0 +1,72 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*
 * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /*!
 @file ComputeProlongation.cpp
 HPCG routine
 */
 #ifndef HPCG_NO_OPENMP
 #include <omp.h>
 #endif
 #include "ComputeProlongation.hpp"
 /*!
  Routine to apply the coarse grid correction to the fine grid solution.
  For each of the mgData->rc->localLength coarse entries i, the coarse value mgData->xc[i] is
  added to the fine entry xf[Af.f2cPerm[i]].
  NOTE(review): f2cPerm is presumably the fine-to-coarse map expressed in the permuted row
  ordering; the parallel loop relies on it containing no repeated indices - confirm.
  @param[in]  Af - Fine grid sparse matrix object containing pointers to current coarse grid correction and the
  f2cPerm index map.
  @param[inout] xf - Fine grid solution vector, update with coarse grid correction.
  @return Returns zero on success and a non-zero value otherwise.
 */
 int ComputeProlongation(const SparseMatrix& Af, Vector& xf)
 {
    double* xfv = xf.values;
    double* xcv = Af.mgData->xc->values;
    // The unused local copy of mgData->f2cOperator was removed: this routine indexes through
    // Af.f2cPerm instead.
    local_int_t nc = Af.mgData->rc->localLength;
 #ifndef HPCG_NO_OPENMP
 #pragma omp parallel for
 #endif
    for (local_int_t i = 0; i < nc; ++i)
    {
        xfv[Af.f2cPerm[i]] += xcv[i];
    }
    return 0;
 }
--- a/src/ComputeProlongation.hpp
+++ b/src/ComputeProlongation.hpp
@@ -0,0 +1,20 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 #ifndef COMPUTEPROLONGATION_HPP
 #define COMPUTEPROLONGATION_HPP
 #include "SparseMatrix.hpp"
 #include "Vector.hpp"
 // Adds the coarse grid correction stored in Af.mgData->xc to the fine grid vector xf.
 int ComputeProlongation(const SparseMatrix& Af, Vector& xf);
 #endif // COMPUTEPROLONGATION_HPP
--- a/src/ComputeProlongation_ref.cpp
+++ b/src/ComputeProlongation_ref.cpp
@@ -0,0 +1,55 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*!
 @file ComputeProlongation_ref.cpp
 HPCG routine
 */
 #ifndef HPCG_NO_OPENMP
 #include <omp.h>
 #endif
 #include "ComputeProlongation_ref.hpp"
 /*!
  Routine to compute the coarse residual vector.
  @param[in]  Af - Fine grid sparse matrix object containing pointers to current coarse grid correction and the f2c
  operator.
  @param[inout] xf - Fine grid solution vector, update with coarse grid correction.
  Note that the fine grid residual is never explicitly constructed.
  We only compute it for the fine grid points that will be injected into corresponding coarse grid points.
  @return Returns zero on success and a non-zero value otherwise.
 */
 int ComputeProlongation_ref(const SparseMatrix& Af, Vector& xf)
 {
    double* xfv = xf.values;
    double* xcv = Af.mgData->xc->values;
    local_int_t* f2c = Af.mgData->f2cOperator;
    local_int_t nc = Af.mgData->rc->localLength;
 #ifndef HPCG_NO_OPENMP
 #pragma omp parallel for
 #endif
    // TODO: Somehow note that this loop can be safely vectorized since f2c has no repeated indices
    for (local_int_t i = 0; i < nc; ++i)
        xfv[f2c[i]] += xcv[i]; // This loop is safe to vectorize
    return 0;
 }
--- a/src/ComputeProlongation_ref.hpp
+++ b/src/ComputeProlongation_ref.hpp
@@ -0,0 +1,20 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 #ifndef COMPUTEPROLONGATION_REF_HPP
 #define COMPUTEPROLONGATION_REF_HPP
 #include "SparseMatrix.hpp"
 #include "Vector.hpp"
 // Reference version: adds the coarse grid correction in Af.mgData->xc to the fine grid vector xf.
 int ComputeProlongation_ref(const SparseMatrix& Af, Vector& xf);
 #endif // COMPUTEPROLONGATION_REF_HPP
--- a/src/ComputeResidual.cpp
+++ b/src/ComputeResidual.cpp
@@ -0,0 +1,95 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*!
 @file ComputeResidual.cpp
 HPCG routine
 */
 #ifndef HPCG_NO_MPI
 #include <mpi.h>
 #endif
 #ifndef HPCG_NO_OPENMP
 #include <omp.h>
 #endif
 #include "Vector.hpp"
 #ifdef HPCG_DETAILED_DEBUG
 #include "hpcg.hpp"
 #include <fstream>
 #endif
 #include "ComputeResidual.hpp"
 #include <cmath> // needed for fabs
 #ifdef HPCG_DETAILED_DEBUG
 #include <iostream>
 #endif
 /*!
  Routine to compute the inf-norm difference between two vectors where:
  @param[in]  n        number of vector elements (local to this processor)
  @param[in]  v1, v2   input vectors
  @param[out] residual pointer to scalar value; on exit, will contain result: inf-norm difference
  @return Returns zero on success and a non-zero value otherwise.
 */
 int ComputeResidual(const local_int_t n, const Vector& v1, const Vector& v2, double& residual)
 {
    double* v1v = v1.values;
    double* v2v = v2.values;
    double local_residual = 0.0;
 #ifndef HPCG_NO_OPENMP
 #pragma omp parallel shared(local_residual, v1v, v2v)
    {
        double threadlocal_residual = 0.0;
 #pragma omp for
        for (local_int_t i = 0; i < n; i++)
        {
            double diff = std::fabs(v1v[i] - v2v[i]);
            if (diff > threadlocal_residual)
                threadlocal_residual = diff;
        }
 #pragma omp critical
        {
            if (threadlocal_residual > local_residual)
                local_residual = threadlocal_residual;
        }
    }
 #else // No threading
    for (local_int_t i = 0; i < n; i++)
    {
        double diff = std::fabs(v1v[i] - v2v[i]);
        if (diff > local_residual)
            local_residual = diff;
 #ifdef HPCG_DETAILED_DEBUG
        HPCG_fout << " Computed, exact, diff = " << v1v[i] << " " << v2v[i] << " " << diff << std::endl;
 #endif
    }
 #endif
 #ifndef HPCG_NO_MPI
    // Use MPI's reduce function to collect all partial sums
    double global_residual = 0;
    MPI_Allreduce(&local_residual, &global_residual, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
    residual = global_residual;
 #else
    residual = local_residual;
 #endif
    return 0;
 }
--- a/src/ComputeResidual.hpp
+++ b/src/ComputeResidual.hpp
@@ -0,0 +1,19 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 #ifndef COMPUTERESIDUAL_HPP
 #define COMPUTERESIDUAL_HPP
 #include "Vector.hpp"
 // Writes to residual the largest absolute difference between the first n entries of v1 and v2.
 int ComputeResidual(const local_int_t n, const Vector& v1, const Vector& v2, double& residual);
 #endif // COMPUTERESIDUAL_HPP
--- a/src/ComputeRestriction.cpp
+++ b/src/ComputeRestriction.cpp
@@ -0,0 +1,75 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*
 * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /*!
 @file ComputeRestriction.cpp
 HPCG routine
 */
 #ifndef HPCG_NO_OPENMP
 #include <omp.h>
 #endif
 #include "ComputeRestriction.hpp"
 /*!
  Routine to compute the coarse residual vector.
  For each of the mgData->rc->localLength coarse entries i, rc[i] is set to
  rf[A.f2cPerm[i]] - Axf[A.f2cPerm[i]].
  NOTE(review): f2cPerm is presumably the fine-to-coarse map expressed in the permuted row
  ordering - confirm against where it is built.
  @param[inout]  A - Sparse matrix object containing pointers to mgData->Axf, the fine grid matrix-vector product and
  mgData->rc the coarse residual vector.
  @param[in]    rf - Fine grid RHS.
  Note that the fine grid residual is never explicitly constructed.
  We only compute it for the fine grid points that will be injected into corresponding coarse grid points.
  @return Returns zero on success and a non-zero value otherwise.
 */
 int ComputeRestriction(const SparseMatrix& A, const Vector& rf)
 {
    double* Axfv = A.mgData->Axf->values;
    double* rfv = rf.values;
    double* rcv = A.mgData->rc->values;
    // The unused local copy of mgData->f2cOperator was removed: this routine indexes through
    // A.f2cPerm instead.
    local_int_t nc = A.mgData->rc->localLength;
 #ifndef HPCG_NO_OPENMP
 #pragma omp parallel for
 #endif
    for (local_int_t i = 0; i < nc; ++i)
    {
        rcv[i] = rfv[A.f2cPerm[i]] - Axfv[A.f2cPerm[i]];
    }
    return 0;
 }
--- a/src/ComputeRestriction.hpp
+++ b/src/ComputeRestriction.hpp
@@ -0,0 +1,20 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 #ifndef COMPUTERESTRICTION_HPP
 #define COMPUTERESTRICTION_HPP
 #include "SparseMatrix.hpp"
 #include "Vector.hpp"
 // Fills A.mgData->rc with rf - Axf sampled at the fine grid points that map to coarse points.
 int ComputeRestriction(const SparseMatrix& A, const Vector& rf);
 #endif // COMPUTERESTRICTION_HPP
--- a/src/ComputeRestriction_ref.cpp
+++ b/src/ComputeRestriction_ref.cpp
@@ -0,0 +1,56 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*!
 @file ComputeRestriction_ref.cpp
 HPCG routine
 */
 #ifndef HPCG_NO_OPENMP
 #include <omp.h>
 #endif
 #include "ComputeRestriction_ref.hpp"
 /*!
  Routine to compute the coarse residual vector.
  @param[inout]  A - Sparse matrix object containing pointers to mgData->Axf, the fine grid matrix-vector product and
  mgData->rc the coarse residual vector.
  @param[in]    rf - Fine grid RHS.
  Note that the fine grid residual is never explicitly constructed.
  We only compute it for the fine grid points that will be injected into corresponding coarse grid points.
  @return Returns zero on success and a non-zero value otherwise.
 */
 int ComputeRestriction_ref(const SparseMatrix& A, const Vector& rf)
 {
    double* Axfv = A.mgData->Axf->values;
    double* rfv = rf.values;
    double* rcv = A.mgData->rc->values;
    local_int_t* f2c = A.mgData->f2cOperator;
    local_int_t nc = A.mgData->rc->localLength;
 #ifndef HPCG_NO_OPENMP
 #pragma omp parallel for
 #endif
    for (local_int_t i = 0; i < nc; ++i)
        rcv[i] = rfv[f2c[i]] - Axfv[f2c[i]];
    return 0;
 }
--- a/src/ComputeRestriction_ref.hpp
+++ b/src/ComputeRestriction_ref.hpp
@@ -0,0 +1,20 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 #ifndef COMPUTERESTRICTION_REF_HPP
 #define COMPUTERESTRICTION_REF_HPP
 #include "SparseMatrix.hpp"
 #include "Vector.hpp"
 /*!
  Reference restriction: fills A.mgData->rc with rf - Axf evaluated at the fine grid points
  selected by A.mgData->f2cOperator.
  @param[in] A  sparse matrix whose mgData provides Axf, rc and f2cOperator
  @param[in] rf fine grid RHS
  @return zero on success and a non-zero value otherwise
 */
 int ComputeRestriction_ref(const SparseMatrix& A, const Vector& rf);
 #endif // COMPUTERESTRICTION_REF_HPP
--- a/src/ComputeSPMV.cpp
+++ b/src/ComputeSPMV.cpp
@@ -0,0 +1,111 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*
 * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /*!
 @file ComputeSPMV.cpp
 HPCG routine
 */
 #include "ComputeSPMV.hpp"
 #include "ComputeSPMV_ref.hpp"
 #ifndef HPCG_NO_MPI
 #include "ExchangeHalo.hpp"
 #endif
 #ifdef USE_CUDA
 #include "Cuda.hpp"
 #include "CudaKernels.hpp"
 #endif
 #include "CpuKernels.hpp"
 /*!
  Routine to compute sparse matrix vector product y = Ax where:
  Precondition: First call exchange_externals to get off-processor values of x
  This routine calls the reference SpMV implementation by default, but
  can be replaced by a custom, optimized routine suited for
  the target system.
  @param[in]  A the known system matrix
  @param[in]  x the known vector
  @param[out] y On exit, contains the result: Ax.
  @return returns 0 upon success and non-zero otherwise
  @see ComputeSPMV_ref
 */
 int ComputeSPMV(const SparseMatrix& A, Vector& x, Vector& y)
 {
    // Scaling factors handed to the sparse libraries by address: y = one * A * x + zero * y.
    double one = 1.0, zero = 0.0;
    if (A.rankType == GPU)
    {
        // The CUDA path is only compiled when USE_CUDA is defined, matching the guarded includes of
        // Cuda.hpp / CudaKernels.hpp and the other dispatchers (ComputeSYMGS, ComputeWAXPBY).
 #ifdef USE_CUDA
 #ifndef HPCG_NO_MPI
        // Pack the halo send buffer on copy_stream before launching the local product
        // (presumably so the packing can overlap with the SpMV -- confirm against PackSendBufferCuda).
        PackSendBufferCuda(A, x, false, copy_stream);
 #endif
        // Local part: y = A * x on the device buffers.
        cusparseDnVecSetValues(A.cusparseOpt.vecX, x.values_d);
        cusparseDnVecSetValues(A.cusparseOpt.vecY, y.values_d);
        cusparseSpMV(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, A.cusparseOpt.matA, A.cusparseOpt.vecX,
            &zero, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, A.bufferMvA);
 #ifndef HPCG_NO_MPI
        if (A.totalToBeSent > 0)
        {
            // Halo part: exchange boundary values, then add the contribution of the external entries of x
            // (stored after the first localNumberOfRows entries) into y.
            ExchangeHaloCuda(A, x, copy_stream);
            ExtSpMVCuda(const_cast<SparseMatrix&>(A), one, x.values_d + A.localNumberOfRows, y.values_d);
        }
 #endif
        cudaStreamSynchronize(stream);
 #endif // USE_CUDA
    }
    else
    {
        // CPU ranks: without this branch a non-GPU rank would return success without ever writing y.
        // NOTE(review): if USE_GRACE is not defined this branch is still empty, as in ComputeSYMGS.
 #ifdef USE_GRACE
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, x.values);
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, y.values);
        nvpl_sparse_spmv(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matA,
            A.nvplSparseOpt.vecX, &zero, A.nvplSparseOpt.vecY, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
            NVPL_SPARSE_SPMV_ALG_DEFAULT, A.nvplSparseOpt.spmvADescr);
 #ifndef HPCG_NO_MPI
        if (A.totalToBeSent > 0)
        {
            ExchangeHaloCpu(A, x);
            ExtSpMVCpu(A, A.localNumberOfRows, 1.0, x.values, y.values);
        }
 #endif
 #endif // USE_GRACE
    }
    // NOTE(review): the status codes of the library calls above are not inspected; 0 is always returned.
    return 0;
 }
--- a/src/ComputeSPMV.hpp
+++ b/src/ComputeSPMV.hpp
@@ -0,0 +1,22 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 #ifndef COMPUTESPMV_HPP
 #define COMPUTESPMV_HPP
 #include "SparseMatrix.hpp"
 #include "Vector.hpp"
 /*!
  Sparse matrix-vector product y = Ax, dispatched on A.rankType (see ComputeSPMV.cpp).
  @param[in]  A the known system matrix
  @param[in]  x the known vector (non-const: the implementation passes it to the halo exchange routines)
  @param[out] y on exit contains the result: Ax
  @return 0 upon success and non-zero otherwise
 */
 int ComputeSPMV(const SparseMatrix& A, Vector& x, Vector& y);
 #endif // COMPUTESPMV_HPP
--- a/src/ComputeSPMV_ref.cpp
+++ b/src/ComputeSPMV_ref.cpp
@@ -0,0 +1,74 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*!
 @file ComputeSPMV_ref.cpp
 HPCG routine
 */
 #include "ComputeSPMV_ref.hpp"
 #ifndef HPCG_NO_MPI
 #include "ExchangeHalo.hpp"
 #endif
 #ifndef HPCG_NO_OPENMP
 #include <omp.h>
 #endif
 #include <cassert>
 /*!
  Routine to compute matrix vector product y = Ax where:
  Precondition: First call exchange_externals to get off-processor values of x
  This is the reference SPMV implementation.  It CANNOT be modified for the
  purposes of this benchmark.
  @param[in]  A the known system matrix
  @param[in]  x the known vector
  @param[out] y On exit, contains the result: Ax.
  @return returns 0 upon success and non-zero otherwise
  @see ComputeSPMV
 */
 int ComputeSPMV_ref(const SparseMatrix& A, Vector& x, Vector& y)
 {
    assert(x.localLength >= A.localNumberOfColumns); // Test vector lengths
    assert(y.localLength >= A.localNumberOfRows);
 #ifndef HPCG_NO_MPI
    ExchangeHalo(A, x);
 #endif
    const double* const xv = x.values;
    double* const yv = y.values;
    const local_int_t nrow = A.localNumberOfRows;
 #ifndef HPCG_NO_OPENMP
 #pragma omp parallel for
 #endif
    for (local_int_t i = 0; i < nrow; i++)
    {
        double sum = 0.0;
        const double* const cur_vals = A.matrixValues[i];
        const local_int_t* const cur_inds = A.mtxIndL[i];
        const int cur_nnz = A.nonzerosInRow[i];
        for (int j = 0; j < cur_nnz; j++)
            sum += cur_vals[j] * xv[cur_inds[j]];
        yv[i] = sum;
    }
    return 0;
 }
--- a/src/ComputeSPMV_ref.hpp
+++ b/src/ComputeSPMV_ref.hpp
@@ -0,0 +1,22 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 #ifndef COMPUTESPMV_REF_HPP
 #define COMPUTESPMV_REF_HPP
 #include "SparseMatrix.hpp"
 #include "Vector.hpp"
 /*!
  Reference sparse matrix-vector product y = Ax (row-by-row dot products over the stored nonzeros).
  @param[in]  A the known system matrix
  @param[in]  x the known vector (non-const: a halo exchange on x runs first when MPI is enabled)
  @param[out] y on exit contains the result: Ax
  @return 0 upon success and non-zero otherwise
 */
 int ComputeSPMV_ref(const SparseMatrix& A, Vector& x, Vector& y);
 #endif // COMPUTESPMV_REF_HPP
--- a/src/ComputeSYMGS.cpp
+++ b/src/ComputeSYMGS.cpp
@@ -0,0 +1,309 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*
 * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /*!
 @file ComputeSYMGS.cpp
 HPCG routine
 */
 #ifdef USE_CUDA
 #include "Cuda.hpp"
 #endif
 #ifndef HPCG_NO_MPI
 #include "ExchangeHalo.hpp"
 #endif
 #include "ComputeSPMV.hpp"
 #include "ComputeSYMGS.hpp"
 #include "CpuKernels.hpp"
 #include "CudaKernels.hpp"
 /*!
  Routine to compute one step of symmetric Gauss-Seidel:
  Assumption about the structure of matrix A:
  - Each row 'i' of the matrix has nonzero diagonal value whose address is matrixDiagonal[i]
  - Entries in row 'i' are ordered such that:
       - lower triangular terms are stored before the diagonal element.
       - upper triangular terms are stored after the diagonal element.
       - No other assumptions are made about entry ordering.
  Symmetric Gauss-Seidel notes:
  - We use the input vector r as the RHS and start with an initial guess for x of all zeros.
  - We perform one forward sweep.  Since x is initially zero we can ignore the upper triangular terms of A.
  - We then perform one back sweep.
       - For simplicity we include the diagonal contribution in the for-j loop, then correct the sum after
  @param[in] A the known system matrix
  @param[in] r the input vector
  @param[inout] x On entry, x should contain relevant values, on exit x contains the result of one symmetric GS sweep
  with r as the RHS.
  @return returns 0 upon success and non-zero otherwise
  @warning Early versions of this kernel (Version 1.1 and earlier) had the r and x arguments in reverse order, and out
  of sync with other kernels.
  @see ComputeSYMGS_ref
 */
 #ifdef USE_CUDA
 /*
  GPU implementation of the symmetric Gauss-Seidel kernel, expressed as a sequence of cuSPARSE
  triangular solves (SpSV) and products (SpMV) on the device buffers of r and x.

  step == 1: lower solve from r, diagonal scaling, upper solve into x; when A.mgData is set, the
             product with matL (plus the halo contribution) is additionally accumulated into Axf.
  step == 0: product with matU into Axf, tmp = r - Axf, lower solve into x, diagonal update of Axf,
             upper solve into x.

  NOTE(review): the step == 0 branch dereferences A.mgData->Axf unconditionally, although the scratch
  buffer selection below treats A.mgData == 0 as possible -- confirm callers never pass step == 0
  for a matrix without mgData.
  NOTE(review): return codes of the cuSPARSE/CUDA calls are not checked; the function always returns 0.
 */
 int ComputeSYMGS_Gpu(const SparseMatrix& A, const Vector& r, Vector& x, bool step)
 {
    // Scratch vector: Axf is reused when it is going to be filled anyway (step == 1 with mgData),
    // otherwise the matrix-owned temporary buffer is used.
    double* tmp_d;
    if (step == 1 && A.mgData != 0)
    {
        tmp_d = (*A.mgData->Axf).values_d;
    }
    else
    {
        tmp_d = A.tempBuffer;
    }
    const local_int_t nrow = A.localNumberOfRows;
    double alpha = 1.0;
    // The same matrix descriptor matA is used for both triangular solves; only its fill mode is switched.
    cusparseFillMode_t fillmode_l = CUSPARSE_FILL_MODE_LOWER;
    cusparseFillMode_t fillmode_u = CUSPARSE_FILL_MODE_UPPER;
    if (step == 1)
    {
        // TRSV(D+L, r, t)
        cusparseDnVecSetValues(A.cusparseOpt.vecX, r.values_d);
        cusparseDnVecSetValues(A.cusparseOpt.vecY, tmp_d);
        cusparseSpMatSetAttribute(A.cusparseOpt.matA, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
        cusparseSpSV_solve(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matA,
            A.cusparseOpt.vecX, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A.cusparseOpt.spsvDescrL);
        // SPMV(D, t, t)
        SpmvDiagCuda(nrow, tmp_d, A.diagonal);
        // TRSV(D+U, t, x)
        cusparseDnVecSetValues(A.cusparseOpt.vecX, tmp_d);
        cusparseDnVecSetValues(A.cusparseOpt.vecY, x.values_d);
        cusparseSpMatSetAttribute(A.cusparseOpt.matA, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
        cusparseSpSV_solve(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matA,
            A.cusparseOpt.vecX, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A.cusparseOpt.spsvDescrU);
        if (A.mgData != 0)
        {
 #ifndef HPCG_NO_MPI
            // Wait for the work queued on `stream` before x is packed on copy_stream.
            cudaStreamSynchronize(stream);
            PackSendBufferCuda(A, x, false, copy_stream);
 #endif
            // SPMV(L, x, t): t = t + L * x
            // (this inner alpha shadows the outer one; it is passed as both scaling factors,
            // so the product is added to the current contents of Axf)
            double alpha = 1.0;
            cusparseDnVecSetValues(A.cusparseOpt.vecX, x.values_d);
            cusparseDnVecSetValues(A.cusparseOpt.vecY, (*A.mgData->Axf).values_d);
            cusparseSpMV(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matL,
                A.cusparseOpt.vecX, &alpha, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, A.bufferMvA);
 #ifndef HPCG_NO_MPI
            if (A.totalToBeSent > 0)
            {
                ExchangeHaloCuda(A, x, copy_stream);
                // NOTE(review): `zero` is declared but never used in this scope.
                double one = 1.0, zero = 0.0;
                // Halo contribution: uses the entries of x stored after the first localNumberOfRows values.
                ExtSpMVCuda((SparseMatrix&) A, one, x.values_d + A.localNumberOfRows, (*A.mgData->Axf).values_d);
            }
 #endif
        }
    }
    else
    { // step == 0
 #ifndef HPCG_NO_MPI
        cudaStreamSynchronize(stream);
        PackSendBufferCuda(A, x, false, copy_stream);
 #endif
        // SPMV(U, x, t): t = U * x
        // (alpha shadows the outer variable of the same name; beta = 0 overwrites Axf)
        double alpha = 1.0, beta = 0.0;
        cusparseDnVecSetValues(A.cusparseOpt.vecX, x.values_d);
        cusparseDnVecSetValues(A.cusparseOpt.vecY, (*A.mgData->Axf).values_d);
        cusparseSpMV(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matU, A.cusparseOpt.vecX,
            &beta, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, A.bufferMvA);
        // tmp = rv - t
        AxpbyCuda(nrow, r.values_d, (*A.mgData->Axf).values_d, tmp_d);
 #ifndef HPCG_NO_MPI
        if (A.totalToBeSent > 0)
        {
            // MPI_Ibarrier --> will help improve MPI_Allreduce in dot product
            ExchangeHaloCuda(A, x, copy_stream, A.level == 0 ? 1 /*call MPI_Ibarrier*/ : 0);
            // NOTE(review): `zero` is declared but never used in this scope.
            double mone = -1.0, zero = 0.0;
            // Halo contribution is applied to tmp with a factor of -1.
            ExtSpMVCuda((SparseMatrix&) A, mone, x.values_d + A.localNumberOfRows, tmp_d);
        }
 #endif
        // TRSV(D+L, r-t, x)
        cusparseDnVecSetValues(A.cusparseOpt.vecX, tmp_d);
        cusparseDnVecSetValues(A.cusparseOpt.vecY, x.values_d);
        cusparseSpMatSetAttribute(A.cusparseOpt.matA, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
        cusparseSpSV_solve(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matA,
            A.cusparseOpt.vecX, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A.cusparseOpt.spsvDescrL);
        // SPMV(D, x, t) t += D*x
        SpFmaCuda(nrow, x.values_d, A.diagonal, (*A.mgData->Axf).values_d);
        // TRSV(D+U, x, x)
        cusparseDnVecSetValues(A.cusparseOpt.vecX, (*A.mgData->Axf).values_d);
        cusparseDnVecSetValues(A.cusparseOpt.vecY, x.values_d);
        cusparseSpMatSetAttribute(A.cusparseOpt.matA, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
        cusparseSpSV_solve(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matA,
            A.cusparseOpt.vecX, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A.cusparseOpt.spsvDescrU);
    }
    return 0;
 }
 #endif
 #ifdef USE_GRACE
 /*
  CPU implementation of the symmetric Gauss-Seidel kernel, expressed as a sequence of NVPL Sparse
  triangular solves (spsv) and products (spmv) on the host buffers of r and x.

  step == 1: lower solve from r into x, t = D*x, upper solve from t into x; when A.mgData is set,
             the product with matL (plus the halo contribution) is additionally accumulated into t.
  step == 0: product with matU into Axf, t = r - Axf, lower solve into x, diagonal update of Axf,
             upper solve into x.

  NOTE(review): the step == 0 branch dereferences A.mgData->Axf unconditionally, although the scratch
  buffer selection below treats A.mgData == 0 as possible -- confirm callers never pass step == 0
  for a matrix without mgData.
  NOTE(review): return codes of the NVPL Sparse calls are not checked; the function always returns 0.
 */
 int ComputeSYMGS_Cpu(const SparseMatrix& A, const Vector& r, Vector& x, bool step)
 {
    local_int_t nrow = A.localNumberOfRows;
    // Scratch vector: Axf is reused when it is going to be filled anyway (step == 1 with mgData),
    // otherwise the matrix-owned temporary buffer is used.
    double* temp;
    if (step == 1 && A.mgData != 0)
    {
        temp = (*A.mgData->Axf).values;
    }
    else
    {
        temp = A.tempBuffer;
    }
    double* xv = x.values;
    double* rv = r.values;
    // Scaling factors handed to the library by address.
    double one = 1.0, zero = 0.0;
    nvpl_sparse_fill_mode_t fillmode_l = NVPL_SPARSE_FILL_MODE_LOWER;
    nvpl_sparse_fill_mode_t fillmode_u = NVPL_SPARSE_FILL_MODE_UPPER;
    if (step == 1)
    {
        // TRSV(L, r, x)
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, r.values);
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, xv);
        nvpl_sparse_sp_mat_set_attribute(
            A.nvplSparseOpt.matL, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
        nvpl_sparse_spsv_solve(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matL,
            A.nvplSparseOpt.vecX, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F, NVPL_SPARSE_SPSV_ALG_DEFAULT,
            A.nvplSparseOpt.spsvDescrL);
        // SPMV(D, x, t) t = D*x
        SpmvDiagCpu(nrow, A.diagonal, xv, temp);
        // TRSV(U, x, x)
        // (the right-hand side of this solve is temp, the result is written to xv)
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, temp);
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, xv);
        nvpl_sparse_sp_mat_set_attribute(
            A.nvplSparseOpt.matU, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
        nvpl_sparse_spsv_solve(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matU,
            A.nvplSparseOpt.vecX, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F, NVPL_SPARSE_SPSV_ALG_DEFAULT,
            A.nvplSparseOpt.spsvDescrU);
        if (A.mgData != 0)
        {
            // SPMV(L, x, t): t += L*x
            // (`one` is passed as both scaling factors, so the product is added to temp)
            nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, xv);
            nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, temp);
            nvpl_sparse_spmv(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matL,
                A.nvplSparseOpt.vecX, &one, A.nvplSparseOpt.vecY, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
                NVPL_SPARSE_SPMV_ALG_DEFAULT, A.nvplSparseOpt.spmvLDescr);
 #ifndef HPCG_NO_MPI
            // The halo exchange is issued regardless of totalToBeSent; only the external
            // product below is conditional.
            ExchangeHaloCpu(A, x);
            if (A.totalToBeSent > 0)
            {
                ExtSpMVCpu(A, nrow, 1.0, xv, temp);
            }
 #endif
        }
    }
    else if (step == 0) // step is a bool, so this is the only remaining case
    {
        // SPMV(U, x, t) t = U*x
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, xv);
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, (*A.mgData->Axf).values);
        nvpl_sparse_spmv(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matU,
            A.nvplSparseOpt.vecX, &zero, A.nvplSparseOpt.vecY, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
            NVPL_SPARSE_SPMV_ALG_DEFAULT, A.nvplSparseOpt.spmvUDescr);
        // axpy: t = r-t
        AxpbyCpu(nrow, rv, (*A.mgData->Axf).values, temp);
 #ifndef HPCG_NO_MPI
        // MPI_Ibarrier --> will help improve MPI_Allreduce in dot product
        ExchangeHaloCpu(A, x, A.level == 0 ? 1 /*call MPI_Ibarrier*/ : 0);
        if (A.totalToBeSent > 0)
        {
            // Halo contribution is applied to temp with a factor of -1.
            ExtSpMVCpu(A, nrow, -1.0, xv, temp);
        }
 #endif
        // TRSV(L, r-t, x)
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, temp);
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, xv);
        nvpl_sparse_sp_mat_set_attribute(
            A.nvplSparseOpt.matL, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
        nvpl_sparse_spsv_solve(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matL,
            A.nvplSparseOpt.vecX, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F, NVPL_SPARSE_SPSV_ALG_DEFAULT,
            A.nvplSparseOpt.spsvDescrL);
        // SPMV(D, x, t) t += D*x
        SpFmaCpu(nrow, A.diagonal, xv, (*A.mgData->Axf).values);
        // TRSV(U, x, x)
        // (the right-hand side of this solve is Axf, the result is written to xv)
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, (*A.mgData->Axf).values);
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, xv);
        nvpl_sparse_sp_mat_set_attribute(
            A.nvplSparseOpt.matU, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
        nvpl_sparse_spsv_solve(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matU,
            A.nvplSparseOpt.vecX, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F, NVPL_SPARSE_SPSV_ALG_DEFAULT,
            A.nvplSparseOpt.spsvDescrU);
    }
    return 0;
 }
 #endif // USE_GRACE
 int ComputeSYMGS(const SparseMatrix& A, const Vector& r, Vector& x, bool step)
 {
    // Dispatch on the kind of rank that owns A and hand the backend's status code back to the
    // caller instead of discarding it.
    if (A.rankType == GPU)
    {
 #ifdef USE_CUDA
        return ComputeSYMGS_Gpu(A, r, x, step);
 #endif
    }
    else
    {
 #ifdef USE_GRACE
        return ComputeSYMGS_Cpu(A, r, x, step);
 #endif
    }
    // Reached only when the backend for this rank type was not compiled in; x is left untouched.
    return 0;
 }
--- a/src/ComputeSYMGS.hpp
+++ b/src/ComputeSYMGS.hpp
@@ -0,0 +1,39 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*
 * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef COMPUTESYMGS_HPP
 #define COMPUTESYMGS_HPP
 #include "SparseMatrix.hpp"
 #include "Vector.hpp"
 /*!
  Symmetric Gauss-Seidel kernel; forwards to the GPU or CPU implementation depending on A.rankType.
  @param[in]    A    the known system matrix
  @param[in]    r    the input vector (RHS)
  @param[inout] x    on exit contains the result of the sweep with r as the RHS
  @param[in]    step selects which of the two variants of the sweep is executed (see ComputeSYMGS.cpp)
  @return 0 upon success and non-zero otherwise
 */
 int ComputeSYMGS(const SparseMatrix& A, const Vector& r, Vector& x, bool step);
 #endif // COMPUTESYMGS_HPP
--- a/src/ComputeSYMGS_ref.cpp
+++ b/src/ComputeSYMGS_ref.cpp
@@ -0,0 +1,110 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*!
 @file ComputeSYMGS_ref.cpp
 HPCG routine
 */
 #ifndef HPCG_NO_MPI
 #include "ExchangeHalo.hpp"
 #endif
 #include "ComputeSYMGS_ref.hpp"
 #include <cassert>
 /*!
  Computes one step of symmetric Gauss-Seidel:
  Assumption about the structure of matrix A:
  - Each row 'i' of the matrix has nonzero diagonal value whose address is matrixDiagonal[i]
  - Entries in row 'i' are ordered such that:
       - lower triangular terms are stored before the diagonal element.
       - upper triangular terms are stored after the diagonal element.
       - No other assumptions are made about entry ordering.
  Symmetric Gauss-Seidel notes:
  - We use the input vector r as the RHS and start with an initial guess for x of all zeros.
  - We perform one forward sweep.  x should be initially zero on the first GS sweep, but we do not attempt to exploit
  this fact.
  - We then perform one back sweep.
  - For simplicity we include the diagonal contribution in the for-j loop, then correct the sum after
  @param[in] A the known system matrix
  @param[in] r the input vector
  @param[inout] x On entry, x should contain relevant values, on exit x contains the result of one symmetric GS sweep
  with r as the RHS.
  @warning Early versions of this kernel (Version 1.1 and earlier) had the r and x arguments in reverse order, and out
  of sync with other kernels.
  @return returns 0 upon success and non-zero otherwise
  @see ComputeSYMGS
 */
 int ComputeSYMGS_ref(const SparseMatrix& A, const Vector& r, Vector& x)
 {
    assert(x.localLength == A.localNumberOfColumns); // Make sure x contain space for halo values
 #ifndef HPCG_NO_MPI
    ExchangeHalo(A, x);
 #endif
    const local_int_t nrow = A.localNumberOfRows;
    double** matrixDiagonal = A.matrixDiagonal; // An array of pointers to the diagonal entries A.matrixValues
    const double* const rv = r.values;
    double* const xv = x.values;
    for (local_int_t i = 0; i < nrow; i++)
    {
        const double* const currentValues = A.matrixValues[i];
        const local_int_t* const currentColIndices = A.mtxIndL[i];
        const int currentNumberOfNonzeros = A.nonzerosInRow[i];
        const double currentDiagonal = matrixDiagonal[i][0]; // Current diagonal value
        double sum = rv[i];                                  // RHS value
        for (int j = 0; j < currentNumberOfNonzeros; j++)
        {
            local_int_t curCol = currentColIndices[j];
            sum -= currentValues[j] * xv[curCol];
        }
        sum += xv[i] * currentDiagonal; // Remove diagonal contribution from previous loop
        xv[i] = sum / currentDiagonal;
    }
    // Now the back sweep.
    for (local_int_t i = nrow - 1; i >= 0; i--)
    {
        const double* const currentValues = A.matrixValues[i];
        const local_int_t* const currentColIndices = A.mtxIndL[i];
        const int currentNumberOfNonzeros = A.nonzerosInRow[i];
        const double currentDiagonal = matrixDiagonal[i][0]; // Current diagonal value
        double sum = rv[i];                                  // RHS value
        for (int j = 0; j < currentNumberOfNonzeros; j++)
        {
            local_int_t curCol = currentColIndices[j];
            sum -= currentValues[j] * xv[curCol];
        }
        sum += xv[i] * currentDiagonal; // Remove diagonal contribution from previous loop
        xv[i] = sum / currentDiagonal;
    }
    return 0;
 }
--- a/src/ComputeSYMGS_ref.hpp
+++ b/src/ComputeSYMGS_ref.hpp
@@ -0,0 +1,22 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 #ifndef COMPUTESYMGS_REF_HPP
 #define COMPUTESYMGS_REF_HPP
 #include "SparseMatrix.hpp"
 #include "Vector.hpp"
 /*!
  Reference symmetric Gauss-Seidel: one forward sweep followed by one back sweep over the local rows.
  @param[in]    A the known system matrix
  @param[in]    r the input vector (RHS)
  @param[inout] x updated in place; its length must equal A.localNumberOfColumns
  @return 0 upon success and non-zero otherwise
 */
 int ComputeSYMGS_ref(const SparseMatrix& A, const Vector& r, Vector& x);
 #endif // COMPUTESYMGS_REF_HPP
--- a/src/ComputeWAXPBY.cpp
+++ b/src/ComputeWAXPBY.cpp
@@ -0,0 +1,89 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*
 * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /*!
 @file ComputeWAXPBY.cpp
 HPCG routine
 */
 #ifndef HPCG_NO_MPI
 #include "mytimer.hpp"
 #include <mpi.h>
 #endif
 #ifndef HPCG_NO_OPENMP
 #include <omp.h>
 #endif
 #ifdef USE_CUDA
 #include "Cuda.hpp"
 #endif
 #include "ComputeWAXPBY.hpp"
 #include "ComputeWAXPBY_ref.hpp"
 #include "CpuKernels.hpp"
 #include "CudaKernels.hpp"
 #include "SparseMatrix.hpp"
 /*!
  Routine to compute the update of a vector with the sum of two
  scaled vectors where: w = alpha*x + beta*y
  This routine calls the reference WAXPBY implementation by default, but
  can be replaced by a custom, optimized routine suited for
  the target system.
  @param[in] n the number of vector elements (on this processor)
  @param[in] alpha, beta the scalars applied to x and y respectively.
  @param[in] x, y the input vectors
  @param[out] w the output vector
  @param[out] isOptimized should be set to false if this routine uses the reference implementation (is not optimized);
  otherwise leave it unchanged
  @return returns 0 upon success and non-zero otherwise
  @see ComputeWAXPBY_ref
 */
 int ComputeWAXPBY(const local_int_t n, const double alpha, const Vector& x, const double beta, const Vector& y,
    Vector& w, bool& isOptimized, rank_type_t rt)
 {
    // Every rank type other than GPU takes the CPU kernel, which is only built with USE_GRACE.
    if (rt != GPU)
    {
 #ifdef USE_GRACE
        ComputeWAXPBYCpu(n, alpha, x, beta, y, w, isOptimized);
 #endif
        return 0;
    }
    // GPU ranks: the CUDA kernel does not receive isOptimized, so the flag is left as passed in.
 #ifdef USE_CUDA
    ComputeWAXPBYCuda(n, alpha, x, beta, y, w);
 #endif
    return 0;
 }
--- a/src/ComputeWAXPBY.hpp
+++ b/src/ComputeWAXPBY.hpp
@@ -0,0 +1,39 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*
 * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef COMPUTEWAXPBY_HPP
 #define COMPUTEWAXPBY_HPP
 #include "Vector.hpp"
 /*!
  Vector update w = alpha*x + beta*y, dispatched to the CUDA or CPU kernel according to rt.
  @param[in]  n           the number of vector elements (on this processor)
  @param[in]  alpha, beta the scalars applied to x and y respectively
  @param[in]  x, y        the input vectors
  @param[out] w           the output vector
  @param[out] isOptimized forwarded to the CPU kernel only (see ComputeWAXPBY.cpp)
  @param[in]  rt          rank type; GPU selects the CUDA kernel, any other value the CPU kernel
  @return 0 upon success and non-zero otherwise
 */
 int ComputeWAXPBY(const local_int_t n, const double alpha, const Vector& x, const double beta, const Vector& y,
    Vector& w, bool& isOptimized, rank_type_t rt);
 #endif // COMPUTEWAXPBY_HPP
--- a/src/ComputeWAXPBY_ref.cpp
+++ b/src/ComputeWAXPBY_ref.cpp
@@ -0,0 +1,79 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*!
 @file ComputeWAXPBY_ref.cpp
 HPCG routine
 */
 #include "ComputeWAXPBY_ref.hpp"
 #ifndef HPCG_NO_OPENMP
 #include <omp.h>
 #endif
 #include <cassert>
 /*!
  Routine to compute the update of a vector with the sum of two
  scaled vectors where: w = alpha*x + beta*y
  This is the reference WAXPBY implementation.  It CANNOT be modified for the
  purposes of this benchmark.
  @param[in] n the number of vector elements (on this processor)
  @param[in] alpha, beta the scalars applied to x and y respectively.
  @param[in] x, y the input vectors
  @param[out] w the output vector.
  @return returns 0 upon success and non-zero otherwise
  @see ComputeWAXPBY
 */
 int ComputeWAXPBY_ref(
    const local_int_t n, const double alpha, const Vector& x, const double beta, const Vector& y, Vector& w)
 {
    assert(x.localLength >= n); // Test vector lengths
    assert(y.localLength >= n);
    const double* const xv = x.values;
    const double* const yv = y.values;
    double* const wv = w.values;
    if (alpha == 1.0)
    {
 #ifndef HPCG_NO_OPENMP
 #pragma omp parallel for
 #endif
        for (local_int_t i = 0; i < n; i++)
            wv[i] = xv[i] + beta * yv[i];
    }
    else if (beta == 1.0)
    {
 #ifndef HPCG_NO_OPENMP
 #pragma omp parallel for
 #endif
        for (local_int_t i = 0; i < n; i++)
            wv[i] = alpha * xv[i] + yv[i];
    }
    else
    {
 #ifndef HPCG_NO_OPENMP
 #pragma omp parallel for
 #endif
        for (local_int_t i = 0; i < n; i++)
            wv[i] = alpha * xv[i] + beta * yv[i];
    }
    return 0;
 }
--- a/src/ComputeWAXPBY_ref.hpp
+++ b/src/ComputeWAXPBY_ref.hpp
@@ -0,0 +1,20 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 #ifndef COMPUTEWAXPBY_REF_HPP
 #define COMPUTEWAXPBY_REF_HPP
 #include "Vector.hpp"
 /*!
  Reference vector update w = alpha*x + beta*y over the first n elements.
  @param[in]  n           the number of vector elements (on this processor)
  @param[in]  alpha, beta the scalars applied to x and y respectively
  @param[in]  x, y        the input vectors; each must hold at least n elements
  @param[out] w           the output vector
  @return 0 upon success and non-zero otherwise
 */
 int ComputeWAXPBY_ref(
    const local_int_t n, const double alpha, const Vector& x, const double beta, const Vector& y, Vector& w);
 #endif // COMPUTEWAXPBY_REF_HPP
--- a/src/CpuKernels.cpp
+++ b/src/CpuKernels.cpp
--- a/src/CpuKernels.hpp
+++ b/src/CpuKernels.hpp
@@ -0,0 +1,92 @@
 /*
 * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef CPUKERNELS_HPP
 #define CPUKERNELS_HPP
 #ifdef USE_GRACE
 #include <nvpl_sparse.h>
 // NVPL Sparse library handle; only declared here (extern) - the definition lives in another translation unit.
 extern nvpl_sparse_handle_t nvpl_sparse_handle;
 #include "SparseMatrix.hpp"
 #include "Vector.hpp"
 #include <algorithm>
 #include <random>
 #include <vector>
 #ifdef __ARM_FEATURE_SVE
 #include <arm_sve.h>
 #endif
 #ifndef HPCG_NO_MPI
 #include <mpi.h>
 #endif
 #ifndef HPCG_NO_OPENMP
 #include <omp.h>
 #endif
 ///////// Deallocate CPU Memory for data structures //
 // NOTE(review): the implementation is not part of this header; which buffers of A are released is not
 // visible from here.
 void DeleteMatrixCpu(SparseMatrix& A);
 ///////// Find the size of CPU reference allocated memory //
 // Returns a size_t; presumably a byte count - TODO confirm the unit against the implementation.
 size_t EstimateCpuRefMem(SparseMatrix& A);
 /*
    Neighbour offsets of a 3D grid point: row t holds {dx, dy, dz, 0} with each offset in {-1, 0, 1}.
    Rows 0..26 enumerate the 27 translations with dx varying fastest, then dy, then dz.
    Rows 27..31 are all zero and bring the table up to 32 rows; the 4th column is always 0.
 */
 constexpr int tid2indCpu[32][4] = {
    // dz = -1
    {-1, -1, -1, 0}, {0, -1, -1, 0}, {1, -1, -1, 0},
    {-1, 0, -1, 0}, {0, 0, -1, 0}, {1, 0, -1, 0},
    {-1, 1, -1, 0}, {0, 1, -1, 0}, {1, 1, -1, 0},
    // dz = 0
    {-1, -1, 0, 0}, {0, -1, 0, 0}, {1, -1, 0, 0},
    {-1, 0, 0, 0}, {0, 0, 0, 0}, {1, 0, 0, 0},
    {-1, 1, 0, 0}, {0, 1, 0, 0}, {1, 1, 0, 0},
    // dz = +1
    {-1, -1, 1, 0}, {0, -1, 1, 0}, {1, -1, 1, 0},
    {-1, 0, 1, 0}, {0, 0, 1, 0}, {1, 0, 1, 0},
    {-1, 1, 1, 0}, {0, 1, 1, 0}, {1, 1, 1, 0},
    // zero rows 27..31
    {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}};
 // Generate Problem
 // Inclusive Prefix Sum
 // Operates on the int array x of N entries; presumably in place, since no separate output is passed - TODO confirm.
 void PrefixsumCpu(int* x, int N);
 // Optimize Problem
 // Returns a size_t; presumably the amount of memory allocated - TODO confirm the unit.
 size_t AllocateMemCpu(SparseMatrix& A_in);
 // num_colors is a pointer, presumably an output receiving the number of colors used - TODO confirm.
 void ColorMatrixCpu(SparseMatrix& A, int* num_colors);
 void CreateSellPermCpu(SparseMatrix& A);
 void F2cPermCpu(local_int_t nrow_c, local_int_t* f2c, local_int_t* f2c_perm, local_int_t* perm_f, local_int_t* iperm_c);
 // Permute a vector using coloring buffer
 void PermVectorCpu(local_int_t* perm, Vector& x, local_int_t length);
 // Test CG
 // NOTE(review): `diagonal` is taken by value here, while ReplaceMatrixDiagonalCuda takes `Vector&`.
 // Whether copying a Vector is cheap/shallow depends on Vector's definition - confirm before changing.
 void ReplaceMatrixDiagonalCpu(SparseMatrix& A, Vector diagonal);
 // CG Support Kernels
 // Dot-product Per single rank
 // The product is returned through `result`; `isOptimized` is a caller-owned flag passed by reference.
 void ComputeDotProductCpu(const local_int_t n, const Vector& x, const Vector& y, double& result, bool& isOptimized);
 // WAXPBY
 // Same argument list as ComputeWAXPBY_ref plus the by-reference `isOptimized` flag.
 int ComputeWAXPBYCpu(const local_int_t n, const double alpha, const Vector& x, const double beta, const Vector& y,
    Vector& w, bool& isOptimized);
 // SYMGS
 // Element-wise helpers over n entries. Which of x, y, z are outputs is only hinted at by constness
 // (x is const in SpmvDiagCpu and SpFmaCpu) - see the implementation for the exact formulas.
 void SpmvDiagCpu(local_int_t n, const double* x, double* y, double* z);
 void AxpbyCpu(local_int_t n, double* x, double* y, double* z);
 void SpFmaCpu(local_int_t n, const double* x, double* y, double* z);
 // External Matrix SpMV + Scatter
 void ExtSpMVCpu(const SparseMatrix& A, const local_int_t n, const double alpha, const double* x, double* y);
 #endif // USE_GRACE
 #endif // CPUKERNELS_HPP
--- a/src/Cuda.hpp
+++ b/src/Cuda.hpp
@@ -0,0 +1,87 @@
 /*
 * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #pragma once
 #ifdef USE_CUDA
 #include "cublas_v2.h"
 #include "cuda_runtime_api.h"
 #include "cusparse.h"
 #include <cuda.h>
 #ifdef USE_NCCL
 #include "nccl.h"
 #endif
 #ifdef USE_NVTX
 #include <nvToolsExt.h>
 #endif
 #include <unistd.h>
 // Library handles, streams and an event shared across translation units; only declared here (extern).
 extern cusparseHandle_t cusparsehandle;
 extern cublasHandle_t cublashandle;
 extern cudaStream_t stream;
 extern cudaEvent_t copy_done;
 extern cudaStream_t copy_stream;
 // NOTE(review): `ranktoId` and `rankToId_h` differ in capitalisation as well as in the suffix - easy to mistype.
 extern int* ranktoId;   // DEV:Compress rank in MPI_WORLD to Neighbors
 extern int* rankToId_h; // HOST:Compress rank in MPI_WORLD to Neighbors
 extern int* idToRank_h; // presumably the host-side inverse of rankToId_h - TODO confirm
 extern bool Use_Compression;        /*USE CUDA L2 compression*/
 extern bool Use_Hpcg_Mem_Reduction; /*USE HPCG aggressive memory reduction*/
 #endif
 #ifdef USE_CUDA
 // Evaluates the CUDA runtime expression `x` exactly once. If the result is not cudaSuccess, prints the
 // failing expression, the numeric error code, the cudaGetErrorString() text, the host name and the source
 // location to stderr, then terminates the process with exit(1).
 // The expansion site needs fprintf, exit and gethostname in scope (<cstdio>, <cstdlib>, <unistd.h>).
 // The locals carry a check_cudart_ prefix so that an argument expression mentioning e.g. `res` is not captured
 // by the macro's own variables. The host-name buffer is zero-filled and one byte is held back, so it stays
 // NUL-terminated even when gethostname() fails or truncates.
 #define CHECK_CUDART(x)                                                                                                \
    do                                                                                                                 \
    {                                                                                                                  \
        const cudaError_t check_cudart_res_ = (x);                                                                     \
        if (check_cudart_res_ != cudaSuccess)                                                                          \
        {                                                                                                              \
            char check_cudart_host_[1024] = {0};                                                                       \
            gethostname(check_cudart_host_, sizeof(check_cudart_host_) - 1);                                           \
            fprintf(stderr, "CUDART: %s = %d (%s) on %s at (%s:%d)\n", #x, static_cast<int>(check_cudart_res_),        \
                cudaGetErrorString(check_cudart_res_), check_cudart_host_, __FILE__, __LINE__);                        \
            exit(1);                                                                                                   \
        }                                                                                                              \
    } while (0)
 // If NVTX is needed for profiling, please define USE_NVTX.
 // Then, add PUSH_RANGE and POP_RANGE around the target code block.
 // See https://developer.nvidia.com/blog/cuda-pro-tip-generate-custom-application-profile-timelines-nvtx/
 // #define USE_NVTX
 #ifdef USE_NVTX
 // Palette indexed by PUSH_RANGE's `cid`; each entry is stored into eventAttrib.color with colorType NVTX_COLOR_ARGB.
 const uint32_t colors[] = {0xff00ff00, 0xff0000ff, 0xffffff00, 0xffff00ff, 0xff00ffff, 0xffff0000, 0xffffffff};
 // Number of palette entries (7).
 const int num_colors = sizeof(colors) / sizeof(uint32_t);
 // Opens an NVTX range labelled `name`, coloured with colors[cid % num_colors].
 // Expands to a plain brace block (not do/while), so it forms a complete statement without a trailing semicolon.
 // NOTE(review): a negative `cid` yields a negative index after `%` and would read outside `colors`.
 #define PUSH_RANGE(name, cid)                                                                                          \
    {                                                                                                                  \
        int color_id = cid;                                                                                            \
        color_id = color_id % num_colors;                                                                              \
        nvtxEventAttributes_t eventAttrib = {0};                                                                       \
        eventAttrib.version = NVTX_VERSION;                                                                            \
        eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;                                                              \
        eventAttrib.colorType = NVTX_COLOR_ARGB;                                                                       \
        eventAttrib.color = colors[color_id];                                                                          \
        eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;                                                             \
        eventAttrib.message.ascii = name;                                                                              \
        nvtxRangePushEx(&eventAttrib);                                                                                 \
    }
 // Closes the most recently pushed range; the expansion already ends with a semicolon.
 #define POP_RANGE nvtxRangePop();
 #else
 // Without USE_NVTX both macros compile to nothing (an empty block / empty expansion).
 #define PUSH_RANGE(name, cid)                                                                                          \
    {                                                                                                                  \
    }
 #define POP_RANGE
 #endif
 #endif
--- a/src/CudaKernels.cu
+++ b/src/CudaKernels.cu
--- a/src/CudaKernels.hpp
+++ b/src/CudaKernels.hpp
@@ -0,0 +1,92 @@
 /*
 * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #pragma once
 #ifdef USE_CUDA
 #include "SparseMatrix.hpp"
 ///////// L2 Memory Compression Allocation Support Routines //
 // These three are declared by this project header (despite the cuda* prefix) and report status as cudaError_t.
 // Note that cudaFreeCompressible needs the allocation size in addition to the pointer.
 cudaError_t setProp(CUmemAllocationProp* prop);
 cudaError_t cudaMallocCompressible(void** adr, size_t size);
 cudaError_t cudaFreeCompressible(void* ptr, size_t size);
 ///////// Allocate CUDA Memory for data structures //
 // Returns a local_int_t estimate; the unit (elements vs bytes) is not visible here - TODO confirm.
 local_int_t EstimateLUmem(local_int_t n, local_int_t padded_n, local_int_t level);
 void AllocateMemCuda(SparseMatrix& A_in);
 void AllocateMemOptCuda(SparseMatrix& A_in);
 ///////// Deallocate CUDA Memory for data structures //
 void DeleteMatrixGpu(SparseMatrix& A);
 ///////// Generate Problem //
 // b, x and xexact are passed as pointers; presumably they may be null (the CPU-side callers in
 // GenerateCoarseProblem pass 0 to GenerateProblem) - TODO confirm for this overload.
 void GenerateProblemCuda(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact);
 // Halo Exchange
 // Setup-side helpers. The *_h parameters are presumably host-side buffers and *_d device-side ones,
 // following the naming used in Cuda.hpp - TODO confirm.
 void SetupHaloCuda(SparseMatrix& A, local_int_t sendbufld, local_int_t* sendlen, local_int_t* sendbuff,
    local_int_t* tot_to_send, int* nneighs, int* neighs_h, local_int_t* sendlen_h, local_int_t** elem_to_send_d);
 void ExtToLocMapCuda(
    local_int_t localNumberOfRows, local_int_t str, local_int_t end, local_int_t* extToLocMap, local_int_t* eltsToRecv);
 void ExtTolocCuda(local_int_t localNumberOfRows, int neighborId, local_int_t ext_nnz, local_int_t* csr_ext_columns,
    double* csr_ext_values, local_int_t* ext2csr_offsets, local_int_t* extToLocMap, local_int_t* csrColumns);
 // Run-time exchange: both take the stream to work on explicitly (stream1).
 void PackSendBufferCuda(const SparseMatrix& A, Vector& x, bool cpu_data, cudaStream_t stream1);
 // use_ibarrier defaults to 0, mirroring ExchangeHaloCpu's parameter of the same name.
 void ExchangeHaloCuda(const SparseMatrix& A, Vector& x, cudaStream_t stream1, int use_ibarrier = 0);
 // Optimize Problem
 // Helpers for the problem-optimization phase (coloring, permutation, ELL/SELL layout construction).
 // All array arguments are raw pointers; whether each lives in host or device memory is not visible in this
 // header - check the implementation in CudaKernels.cu before calling.
 void SetVectorAscCuda(local_int_t* arr, local_int_t n);
 void ColorMatrixCuda(double* A_vals, local_int_t* A_col, local_int_t* nnzPerRow, local_int_t rows, local_int_t* color,
    int* num_colors, int* count_colors, int max_colors, local_int_t* ref2opt, local_int_t* opt2ref, int rank, int nx,
    int* rowhash);
 void PermElemToSendCuda(local_int_t totalToBeSent, local_int_t* elementsToSend, local_int_t* perm);
 void EllPermColumnsValuesCuda(local_int_t localNumberOfRows, local_int_t* nnzPerRow, local_int_t* csrColumns,
    double* csrValues, local_int_t* permOffsets, local_int_t* permColumns, double* permValues, local_int_t* opt2ref,
    local_int_t* ref2opt, local_int_t* diagonalIdx, local_int_t* permLOffsets, local_int_t* permUOffsets, bool diag);
 void TransposeCuda(local_int_t n, local_int_t slice_size, local_int_t* sellCollIndex, double* sellValues);
 void EllMaxRowLenPerBlockCuda(local_int_t nrow, int sliceSize, local_int_t* sellLPermOffsets,
    local_int_t* sellUPermOffsets, local_int_t* sellLSliceMrl, local_int_t* sellUSliceMrl);
 void PrefixsumCuda(local_int_t localNumberOfRows, local_int_t* arr);
 // NOTE(review): the only function in this header spelled with an upper-case CUDA suffix.
 void MultiplyBySliceSizeCUDA(local_int_t nrow, int slice_size, local_int_t* arr);
 void CreateAMatrixSliceOffsetsCuda(local_int_t nrow, local_int_t slice_size, local_int_t* arr);
 void CreateSellLUColumnsValuesCuda(const local_int_t n, int sliceSize, local_int_t* columns, double* values,
    local_int_t* sellLSliceOffset, local_int_t* sellLColumns, double* sellLValues, local_int_t* sellUSliceOffset,
    local_int_t* sellUColumns, double* sellUValues, int level);
 // GPU counterparts of PermVectorCpu / F2cPermCpu (same parameter lists apart from parameter naming).
 void PermVectorCuda(local_int_t* perm, Vector& x, local_int_t length);
 void F2cPermCuda(local_int_t nrow_c, local_int_t* f2c, local_int_t* f2cPerm, local_int_t* permF, local_int_t* ipermC);
 // Test CG
 // Both take the diagonal by non-const reference; which one reads and which one writes `diagonal` is only
 // suggested by the names (Replace vs Copy) - TODO confirm.
 void ReplaceMatrixDiagonalCuda(SparseMatrix& A, Vector& diagonal);
 void CopyMatrixDiagonalCuda(SparseMatrix& A, Vector& diagonal);
 // CG Support Kernels
 // 1. MG
 void ComputeRestrictionCuda(const SparseMatrix& A, const Vector& r);
 void ComputeProlongationCuda(const SparseMatrix& A, Vector& x);
 // 2. WAXPBY
 // Same argument list as ComputeWAXPBY_ref, but returns void instead of an int status.
 void ComputeWAXPBYCuda(
    const local_int_t n, const double alpha, const Vector& x, const double beta, const Vector& y, Vector& w);
 // 3.SYMGS
 // NOTE(review): SpmvDiagCuda takes two arrays (x, d) whereas SpmvDiagCpu takes three (x, y, z).
 void SpmvDiagCuda(local_int_t n, double* x, double* d);
 void AxpbyCuda(local_int_t n, double* x, double* y, double* z);
 void SpFmaCuda(local_int_t n, double* x, double* y, double* z);
 // 4.External Matrix SpMV + Scatter
 void ExtSpMVCuda(SparseMatrix& A, double alpha, double* x, double* y);
 // Transfer Problem to CPU
 // Returns a size_t; presumably the number of bytes copied or allocated on the host - TODO confirm.
 size_t CopyDataToHostCuda(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact);
 #endif
--- a/src/ExchangeHalo.cpp
+++ b/src/ExchangeHalo.cpp
@@ -0,0 +1,205 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*
 * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /*!
 @file ExchangeHalo.cpp
 HPCG routine
 */
 // Compile this routine only if running with MPI
 #ifndef HPCG_NO_MPI
 #include "ExchangeHalo.hpp"
 #include "Geometry.hpp"
 #include <cstdlib>
 #include <mpi.h>
 // Point-to-point communication mode; defined in another translation unit. ExchangeHaloCpu() branches on it.
 extern p2p_comm_mode_t P2P_Mode;
 /*!
  Communicates data that is at the border of the part of the domain assigned to this processor.
  @param[in]    A The known system matrix
  @param[inout] x On entry: the local vector entries followed by entries to be communicated; on exit: the vector with
  non-local entries updated by other processors
 */
 void ExchangeHalo(const SparseMatrix& A, Vector& x)
 {
    local_int_t localNumberOfRows = A.localNumberOfRows;
    int num_neighbors = A.numberOfSendNeighbors;
    local_int_t * receiveLength = A.receiveLength;
    local_int_t * sendLength = A.sendLength;
    int * neighbors = A.neighbors;
    double * sendBuffer = A.sendBuffer;
    local_int_t totalToBeSent = A.totalToBeSent;
    local_int_t * elementsToSend = A.elementsToSend;
    double * const xv = x.values;
    int size, rank; // Number of MPI processes, My process ID
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    //
    //  first post receives, these are immediate receives
    //  Do not wait for result to come, will do that at the
    //  wait call below.
    //
    int MPI_MY_TAG = 99;
    MPI_Request * request = new MPI_Request[num_neighbors];
    //
    // Externals are at end of locals
    //
    double * x_external = (double *) xv + localNumberOfRows;
    // Post receives first
    // TODO: Thread this loop
    for (int i = 0; i < num_neighbors; i++) {
      local_int_t n_recv = receiveLength[i];
      MPI_Irecv(x_external, n_recv, MPI_DOUBLE, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD, request+i);
      x_external += n_recv;
    }
    //
    // Fill up send buffer
    //
    // TODO: Thread this loop
    for (local_int_t i=0; i<totalToBeSent; i++) sendBuffer[i] = xv[elementsToSend[i]];
    //
    // Send to each neighbor
    //
    // TODO: Thread this loop
    for (int i = 0; i < num_neighbors; i++) {
      local_int_t n_send = sendLength[i];
      MPI_Send(sendBuffer, n_send, MPI_DOUBLE, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD);
      sendBuffer += n_send;
    }
    //
    // Complete the reads issued above
    //
    MPI_Status status;
    // TODO: Thread this loop
    for (int i = 0; i < num_neighbors; i++) {
      if ( MPI_Wait(request+i, &status) ) {
        std::exit(-1); // TODO: have better error exit
      }
    }
    delete [] request;
    return;
 }
 /*!
  Communicates data that is at the border of the part of the domain assigned to this processor. A more optimized version of ExchangeHalo that is used for Grace path.
  @param[in]    A The known system matrix
  @param[inout] x On entry: the local vector entries followed by entries to be communicated; on exit: the vector with
  non-local entries updated by other processors
  @param[in]   use_ibarrier [Experimental] If 1, call MPI_Ibarrier after the communication is complete. A smart trick to improve MPI_Allreduce in DDOT, 
    by calling MPI_Ibarrier once at the last routine call in MG.
 */
 void ExchangeHaloCpu(const SparseMatrix& A, Vector& x, int use_ibarrier)
 {
    // Extract Matrix pieces
    local_int_t localNumberOfRows = A.localNumberOfRows;
    int num_neighbors = A.numberOfSendNeighbors;
    local_int_t* receiveLength = A.receiveLength;
    local_int_t* sendLength = A.sendLength;
    int* neighbors = A.neighborsPhysical;
    double* sendBuffer = A.sendBuffer;
    local_int_t totalToBeSent = A.totalToBeSent;
    local_int_t* elementsToSend = A.elementsToSend;
    if (P2P_Mode == MPI_CPU)
    {
        double* const xv = x.values;
        double* x_external = (double*) xv + localNumberOfRows;
        int MPI_MY_TAG = 99;
        MPI_Request* request = new MPI_Request[num_neighbors];
        // Post receives first
        for (int i = 0; i < num_neighbors; i++)
        {
            local_int_t n_recv = receiveLength[i];
            MPI_Irecv(x_external, n_recv, MPI_DOUBLE, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD, request + i);
            x_external += n_recv;
        }
        for (local_int_t i = 0; i < totalToBeSent; i++)
            sendBuffer[i] = xv[elementsToSend[i]];
        //
        // Send to each neighbor
        //
        for (int i = 0; i < num_neighbors; i++)
        {
            local_int_t n_send = sendLength[i];
            MPI_Send(sendBuffer, n_send, MPI_DOUBLE, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD);
            sendBuffer += n_send;
        }
        //
        // Complete the reads issued above
        //
        MPI_Waitall(num_neighbors, request, MPI_STATUSES_IGNORE);
        //[Experimental] Can improve MPI_Allreduce performance
        #if 0
        if (use_ibarrier == 1)
            MPI_Ibarrier(MPI_COMM_WORLD, request);
        #endif
        delete[] request;
    }
    else if (P2P_Mode == MPI_CPU_All2allv)
    {
        double* const xv = x.values;
        double* x_external = (double*) xv + localNumberOfRows;
        for (local_int_t i = 0; i < totalToBeSent; i++)
            sendBuffer[i] = xv[elementsToSend[i]];
        MPI_Alltoallv(
            sendBuffer, A.scounts, A.sdispls, MPI_DOUBLE, x_external, A.rcounts, A.rdispls, MPI_DOUBLE, MPI_COMM_WORLD);
    }
    return;
 }
 #endif
 // ifndef HPCG_NO_MPI
--- a/src/ExchangeHalo.hpp
+++ b/src/ExchangeHalo.hpp
@@ -0,0 +1,38 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*
 * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef EXCHANGEHALO_HPP
 #define EXCHANGEHALO_HPP
 #include "SparseMatrix.hpp"
 #include "Vector.hpp"
 // Reference halo exchange: fills the non-local tail of x with the neighbors' border values.
 // NOTE: ExchangeHalo.cpp defines both functions only when HPCG_NO_MPI is not defined.
 void ExchangeHalo(const SparseMatrix& A, Vector& x);
 // Variant used on the Grace (CPU) path; the transport is chosen by the global P2P_Mode.
 // use_ibarrier is experimental and defaults to 0.
 void ExchangeHaloCpu(const SparseMatrix& A, Vector& x, int use_ibarrier = 0);
 #endif // EXCHANGEHALO_HPP
--- a/src/GenerateCoarseProblem.cpp
+++ b/src/GenerateCoarseProblem.cpp
@@ -0,0 +1,158 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*
 * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /*!
 @file GenerateCoarseProblem.cpp
 HPCG routine
 */
 #ifndef HPCG_NO_OPENMP
 #include <omp.h>
 #endif
 #include "GenerateCoarseProblem.hpp"
 #include "GenerateGeometry.hpp"
 #include "GenerateProblem.hpp"
 #include "SetupHalo.hpp"
 #include <cassert>
 #ifndef HPCG_NO_MPI
 // Used to find ranks for CPU and GPU programs
 extern int global_total_ranks;
 extern int* physical_rank_dims;
 #endif
 /*!
  Routine to construct a prolongation/restriction operator for a given fine grid matrix
  solution (as computed by a direct solver).
  Builds the next-coarser level: a geometry with every local dimension halved, the coarse matrix, the
  fine-to-coarse index map (f2cOperator) and the work vectors rc, xc and Axf, bundled into an MGData object that
  is attached to Af.
  Side effect (MPI builds only): every entry of the global physical_rank_dims array is halved in place.
  @param[inout]  Af - The known system matrix, on output its coarse operator, fine-to-coarse operator and auxiliary
  vectors will be defined. For GPU ranks Af.Ac must already point to a SparseMatrix; for CPU ranks it is allocated
  here.
  Note that the matrix Af is considered const because the attributes we are modifying are declared as mutable.
 */
 void GenerateCoarseProblem(const SparseMatrix& Af)
 {
    // Make local copies of geometry information.  Use global_int_t since the RHS products in the calculations
    // below may result in global range values.
    global_int_t nxf = Af.geom->nx;
    global_int_t nyf = Af.geom->ny;
    global_int_t nzf = Af.geom->nz;
    local_int_t nxc, nyc, nzc; // Coarse nx, ny, nz
    assert(nxf % 2 == 0);
    assert(nyf % 2 == 0);
    assert(nzf % 2 == 0); // Need fine grid dimensions to be divisible by 2
    nxc = nxf / 2;
    nyc = nyf / 2;
    nzc = nzf / 2;
    // Sized for the fine grid; only the first localNumberOfRows (coarse) entries are filled below.
    local_int_t* f2cOperator = new local_int_t[Af.localNumberOfRows];
    local_int_t localNumberOfRows = nxc * nyc * nzc; // This is the size of our subblock
    // If this assert fails, it most likely means that the local_int_t is set to int and should be set to long long
    // (the product above can overflow "int" and become non-positive).
    assert(localNumberOfRows > 0);
 #ifndef HPCG_NO_MPI
    // global_total_ranks / physical_rank_dims are only declared for MPI builds, so the update must be guarded
    // the same way or this file does not compile with HPCG_NO_MPI.
    // The per-rank physical dimensions are halved to describe the coarse level.
    for (int i = 0; i < 3 * global_total_ranks; i++)
        physical_rank_dims[i] = physical_rank_dims[i] / 2;
 #endif
    // Construct the geometry and linear system
    Geometry* geomc = new Geometry;
    GenerateGeometry(Af.geom->size, Af.geom->rank, Af.geom->numThreads, nxc, nyc, nzc, Af.geom->npx, Af.geom->npy,
        Af.geom->npz, Af.geom->different_dim, geomc);
    Vector* rc = new Vector;
    Vector* xc = new Vector;
    Vector* Axf = new Vector;
    MGData* mgData = new MGData;
    if (Af.rankType == GPU)
    {
        // GPU ranks reuse the coarse matrix object already hanging off Af.
        SparseMatrix* Ac = Af.Ac;
        Ac->rankType = GPU;
        InitializeSparseMatrix(*Ac, geomc);
        GenerateProblem(*Ac, 0, 0, 0);
        SetupHalo(*Ac);
        InitializeVector(*rc, Ac->localNumberOfRows, Ac->rankType);
        InitializeVector(*xc, Ac->localNumberOfColumns, Ac->rankType);
        InitializeVector(*Axf, Af.localNumberOfColumns, Ac->rankType);
 #ifdef USE_CUDA
        // The fine-to-coarse map is read back from Af.gpuAux.f2c into the host copy.
        // NOTE(review): without USE_CUDA the host f2cOperator stays uninitialized on this path.
        cudaMemcpy(f2cOperator, Af.gpuAux.f2c, sizeof(local_int_t) * localNumberOfRows, cudaMemcpyDeviceToHost);
 #endif
    }
    else
    {
        // CPU ranks allocate the coarse matrix here and attach it to Af.
        SparseMatrix* Ac = new SparseMatrix;
        InitializeSparseMatrix(*Ac, geomc);
        Ac->rankType = CPU;
        (*Ac).Ac = 0;
        GenerateProblem(*Ac, 0, 0, 0);
        SetupHalo(*Ac);
        InitializeVector(*rc, Ac->localNumberOfRows, Ac->rankType);
        InitializeVector(*xc, Ac->localNumberOfColumns, Ac->rankType);
        InitializeVector(*Axf, Af.localNumberOfColumns, Ac->rankType);
        Af.Ac = Ac;
        // Use a parallel loop to do initial assignment:
        // distributes the physical placement of arrays of pointers across the memory system
 #ifndef HPCG_NO_OPENMP
 #pragma omp parallel for
 #endif
        for (local_int_t i = 0; i < localNumberOfRows; ++i)
        {
            f2cOperator[i] = 0;
        }
        // Map every coarse point (ixc, iyc, izc) to the fine point with doubled coordinates.
 #ifndef HPCG_NO_OPENMP
 #pragma omp parallel for
 #endif
        for (local_int_t i = 0; i < nzc * nyc * nxc; i++)
        {
            local_int_t izc = (i / (nxc * nyc));
            local_int_t iyc = (i - izc * nxc * nyc) / nxc;
            local_int_t ixc = i - (izc * nyc + iyc) * nxc;
            local_int_t izf = 2 * izc;
            local_int_t iyf = 2 * iyc;
            local_int_t ixf = 2 * ixc;
            local_int_t currentCoarseRow = izc * nxc * nyc + iyc * nxc + ixc;
            local_int_t currentFineRow = izf * nxf * nyf + iyf * nxf + ixf;
            f2cOperator[currentCoarseRow] = currentFineRow;
        }
    }
    InitializeMGData(f2cOperator, rc, xc, Axf, *mgData);
    Af.mgData = mgData;
    return;
 }
--- a/src/GenerateCoarseProblem.hpp
+++ b/src/GenerateCoarseProblem.hpp
@@ -0,0 +1,19 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 #ifndef GENERATECOARSEPROBLEM_HPP
 #define GENERATECOARSEPROBLEM_HPP
 #include "SparseMatrix.hpp"
 // Builds the next-coarser multigrid level for A. A is taken by const reference, but the definition still
 // assigns A.mgData (and A.Ac on the CPU path).
 void GenerateCoarseProblem(const SparseMatrix& A);
 #endif // GENERATECOARSEPROBLEM_HPP
--- a/src/GenerateGeometry.cpp
+++ b/src/GenerateGeometry.cpp
@@ -0,0 +1,801 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*
 * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /*!
 @file GenerateGeometry.cpp
 HPCG routine
 */
 #include <cassert>
 #include <cmath>
 #include <cstdlib>
 #include "ComputeOptimalShapeXYZ.hpp"
 #include "GenerateGeometry.hpp"
 #include <cstdio>
 #ifdef HPCG_DEBUG
 #include "hpcg.hpp"
 #include <fstream>
 using std::endl;
 #endif
 #ifndef HPCG_NO_MPI
 #include <mpi.h>
 #endif
 #ifndef HPCG_NO_MPI
 // Used to find ranks for CPU and GPU programs
 extern int global_total_ranks;
 extern int* physical_rank_dims;
 extern int* logical_rank_to_phys;
 #endif
 /*!
  Level-0 (finest grid) geometry generation.
  Factors the total number of processes into a 3-dimensional process grid (computed by
  ComputeOptimalShapeXYZ when the user-supplied npx*npy*npz is non-positive or larger than the
  number of processes), decides this rank's local grid size, remaps physical rank IDs to logical
  IDs so that the 3D shape stays correct when exec_mode is GPUCPU, and stores the decomposition
  together with the parallel parameters of the run in the geometry data structure.
  In GPUCPU mode exactly one local dimension is adjusted per rank type, driven by
  params.local_problem_def and params.g2c. The direction is params.diff_dim when that direction
  has an even number of processes, otherwise the first of Z, Y, X with an even number of processes.
  Uses the file-scope arrays physical_rank_dims (filled here by MPI_Allgather) and
  logical_rank_to_phys (filled here), both walked up to global_total_ranks.
  @param[in]  params run parameters; comm_size, comm_rank, numThreads, nx, ny, nz, npx, npy, npz,
              exec_mode, diff_dim, local_problem_def, rank_type and g2c are read
  @param[out] geom data structure that will store the above parameters and the factoring of total number of
              processes into three dimensions
 */
 void GenerateGeometry(HPCG_Params& params, Geometry* geom)
 {
    int size = params.comm_size; // Number of MPI processes
    int rank = params.comm_rank; // My process ID
    int nx = params.nx, ny = params.ny, nz = params.nz;
    int npx = params.npx, npy = params.npy, npz = params.npz;
    // If npx, npy, and npz are not provided by the user (or need more processes than exist),
    // find the optimal shape
    if (npx * npy * npz <= 0 || npx * npy * npz > size)
        ComputeOptimalShapeXYZ(size, npx, npy, npz);
    // Starting at physical rank `index`, finds the next rank whose local problem size is the same
    // as rank 0's (search_for_same0 == true) or differs from it (search_for_same0 == false),
    // records it as the physical rank of logical position `lp` and returns the position at which
    // the next search should resume. When no such rank is left, nothing is recorded and `index`
    // is returned unchanged.
    auto loop_over_ranks = [](int index, int lp, bool search_for_same0) -> int
    {
        for (int p = index; p < global_total_ranks; p++)
        {
            const bool same_zero = physical_rank_dims[3 * p] == physical_rank_dims[0]
                && physical_rank_dims[3 * p + 1] == physical_rank_dims[1]
                && physical_rank_dims[3 * p + 2] == physical_rank_dims[2];
            if (same_zero == search_for_same0)
            {
                logical_rank_to_phys[lp] = p;
                return p + 1;
            }
        }
        return index;
    };
    // Adjusts one local dimension for the GPU/CPU split:
    //   GPU_RATIO     : CPU ranks get n / g2c, GPU ranks keep n
    //   GPU_ABS       : CPU ranks get g2c,     GPU ranks keep n
    //   GPU_CPU_RATIO : CPU ranks get n / g2c, GPU ranks get n - n / g2c
    //   otherwise     : CPU ranks get g2c,     GPU ranks get n - g2c   (GPU_CPU_ABS)
    auto split_local_dim = [&params](int& n)
    {
        if (params.local_problem_def == GPU_RATIO)
        {
            if (params.rank_type == CPU)
                n = n / params.g2c;
        }
        else if (params.local_problem_def == GPU_ABS)
        {
            if (params.rank_type == CPU)
                n = params.g2c;
        }
        else if (params.local_problem_def == GPU_CPU_RATIO)
        {
            if (params.rank_type == CPU)
                n = n / params.g2c;
            if (params.rank_type == GPU)
                n = n - (n / params.g2c);
        }
        else
        { /*GPU_CPU_ABS*/
            if (params.rank_type == CPU)
                n = params.g2c;
            if (params.rank_type == GPU)
                n = n - params.g2c;
        }
    };
    // Here decide and broadcast nx, ny, nz
    // 1 Check for GPU and CPU execution modes
    auto user_diff_dim = NONE;
    if (params.exec_mode == GPUCPU)
    {
        // User defined diff direction between GPU and CPU: only honored when the
        // number of processes along that direction is even
        if (params.diff_dim == Z && (npz & 1) == 0)
        {
            user_diff_dim = Z;
            split_local_dim(nz);
        }
        else if (params.diff_dim == Y && (npy & 1) == 0)
        {
            user_diff_dim = Y;
            split_local_dim(ny);
        }
        else if (params.diff_dim == X && (npx & 1) == 0)
        {
            user_diff_dim = X;
            split_local_dim(nx);
        }
        // Automatic partition direction when the user choice is absent or unusable:
        // the first of Z, Y, X that has an even number of processes
        if (user_diff_dim == NONE)
        {
            if ((npz & 1) == 0)
                split_local_dim(nz);
            else if ((npy & 1) == 0)
                split_local_dim(ny);
            else if ((npx & 1) == 0)
                split_local_dim(nx);
        }
    }
    // Now let us exchange dimensions
    int sendBuf[] = {nx, ny, nz};
 #ifndef HPCG_NO_MPI
    MPI_Allgather(sendBuf, 3, MPI_INT, physical_rank_dims, 3, MPI_INT, MPI_COMM_WORLD);
 #endif
    // Count the ranks whose local size matches rank 0 (rank 0 included) and those that differ
    int num_ranks_same = 1;
    int num_ranks_not_same = 0;
    const int x0 = physical_rank_dims[0];
    const int y0 = physical_rank_dims[1];
    const int z0 = physical_rank_dims[2];
    for (int p = 1; p < global_total_ranks; p++)
    {
        const bool same_as_0 = physical_rank_dims[3 * p] == x0 && physical_rank_dims[3 * p + 1] == y0
            && physical_rank_dims[3 * p + 2] == z0;
        if (same_as_0)
            num_ranks_same++;
        else
            num_ranks_not_same++;
    }
    // Dimension along which local sizes differ; stays NONE when all ranks have the same size
    // or when no direction has an even number of processes
    auto different_dim = NONE;
    if (num_ranks_not_same > 0)
    {
        // user_diff_dim is only ever set for a direction with an even number of processes,
        // so it can be taken as is; otherwise fall back to the automatic Z, Y, X order
        if (user_diff_dim != NONE)
            different_dim = user_diff_dim;
        else if ((npz & 1) == 0)
            different_dim = Z;
        else if ((npy & 1) == 0)
            different_dim = Y;
        else if ((npx & 1) == 0)
            different_dim = X;
        // Sanity check: the two other dimensions must match rank 0 on every rank
        if (different_dim != NONE)
        {
            for (int p = 1; p < global_total_ranks; p++)
            {
                assert((different_dim == X || physical_rank_dims[3 * p] == x0)
                    && (different_dim == Y || physical_rank_dims[3 * p + 1] == y0)
                    && (different_dim == Z || physical_rank_dims[3 * p + 2] == z0));
            }
        }
    }
    // When exec_mode is GPUCPU, GPU and CPU ranks can have different dims. Therefore,
    // we must rearrange the ranks such that the 3D shape is correct.
    // last physical position for the rank that has the same size as 0
    int same_as_0_position = 0;
    // last physical position for the rank that does not have the same size as 0
    int not_same_as_0_position = 0;
    int same_rank_counter = 0;
    if (different_dim != NONE)
    {
        for (int iz = 0; iz < npz; iz++)
            for (int iy = 0; iy < npy; iy++)
                for (int ix = 0; ix < npx; ix++)
                {
                    int logical_position = iz * npy * npx + iy * npx + ix;
                    // Index of this grid position along the differing dimension: even slabs take
                    // ranks sized like rank 0 (while any remain), odd slabs take the others
                    const int slab = (different_dim == Z) ? iz : (different_dim == Y) ? iy : ix;
                    if ((slab & 1) == 0 && same_rank_counter < num_ranks_same)
                    { // same as 0
                        same_as_0_position = loop_over_ranks(same_as_0_position, logical_position, true);
                        same_rank_counter++;
                    }
                    else
                    { // Not same as 0
                        not_same_as_0_position = loop_over_ranks(not_same_as_0_position, logical_position, false);
                    }
                }
    }
    else
    {
        // Keep rank Ids the same if all ranks have the same problem size
        for (int p = 0; p < global_total_ranks; p++)
            logical_rank_to_phys[p] = p;
    }
    // My logical rank Id. Starts as the physical rank (identity mapping) so the value is
    // well defined even if this rank does not appear in logical_rank_to_phys.
    int logical_rank = rank;
    for (int p = 0; p < global_total_ranks; p++)
    {
        if (rank == logical_rank_to_phys[p])
        {
            logical_rank = p;
        }
    }
    // Now compute this process's indices in the 3D cube
    int ipz = logical_rank / (npx * npy);
    int ipy = (logical_rank - ipz * npx * npy) / npx;
    int ipx = logical_rank % npx;
 #ifdef HPCG_DEBUG
    if (rank == 0)
        HPCG_fout << "size = " << size << endl
                  << "nx  = " << nx << endl
                  << "ny  = " << ny << endl
                  << "nz  = " << nz << endl
                  << "npx = " << npx << endl
                  << "npy = " << npy << endl
                  << "npz = " << npz << endl;
    HPCG_fout << "For rank = " << rank << endl
              << "ipx = " << ipx << endl
              << "ipy = " << ipy << endl
              << "ipz = " << ipz << endl;
    assert(size >= npx * npy * npz);
 #endif
    geom->size = size;
    geom->rank = rank;
    geom->logical_rank = logical_rank;
    geom->different_dim = different_dim;
    geom->numThreads = params.numThreads;
    geom->nx = nx;
    geom->ny = ny;
    geom->nz = nz;
    geom->npx = npx;
    geom->npy = npy;
    geom->npz = npz;
    geom->ipx = ipx;
    geom->ipy = ipy;
    geom->ipz = ipz;
    // Find the global NX, NY, and NZ and the initial global indices (gix0, giy0, and giz0)
    // for this rank based on its 3d location in the grid.
    //  For the diff dim, accumulate the local sizes of the ranks along that axis sequentially
    //  and record the sizes of the previous and next neighbors (they have the different dimension!)
    //  For similar dims, just multiply by the local dim (in global_int_t to avoid int overflow)
    int prev_n = 0;
    int next_n = 0;
    const int my_position = ipz * npx * npy + ipy * npx + ipx;
    // axis: 0 = x, 1 = y, 2 = z; np/ip: process count/index along the axis;
    // stride: logical-position distance between neighbors along the axis
    auto measure_axis = [&](decltype(different_dim) axis_dim, int axis, int np, int ip, int stride, int n_local,
                            global_int_t& gn, global_int_t& gi0)
    {
        gn = 0;
        gi0 = 0;
        if (different_dim == axis_dim)
        {
            // Logical position of the first rank on this rank's line along the axis
            const int line_start = my_position - ip * stride;
            auto extent_at = [&](int i) -> int
            { return physical_rank_dims[logical_rank_to_phys[line_start + i * stride] * 3 + axis]; };
            for (int i = 0; i < np; i++)
                gn += extent_at(i);
            for (int i = 0; i < ip; i++)
                gi0 += extent_at(i);
            if (ip > 0)
                prev_n = extent_at(ip - 1);
            if (ip + 1 < np)
                next_n = extent_at(ip + 1);
        }
        else
        {
            gn = static_cast<global_int_t>(np) * n_local;
            gi0 = static_cast<global_int_t>(ip) * n_local;
        }
    };
    global_int_t gnx = 0, gny = 0, gnz = 0;
    global_int_t gix0 = 0, giy0 = 0, giz0 = 0;
    measure_axis(X, 0, npx, ipx, 1, nx, gnx, gix0);
    measure_axis(Y, 1, npy, ipy, npx, ny, gny, giy0);
    measure_axis(Z, 2, npz, ipz, npx * npy, nz, gnz, giz0);
    // Keep these values for later
    geom->gnx = gnx;
    geom->gny = gny;
    geom->gnz = gnz;
    geom->gix0 = gix0;
    geom->giy0 = giy0;
    geom->giz0 = giz0;
    geom->previous_neighbor_dim = prev_n;
    geom->next_neighbor_dim = next_n;
    return;
 }
 // Simpler generateion for next/coarse levels
 // Do not need to find nx, ny, nz for CPU and GPU based on parameters
 // Do not need to find logical rank IDs
 void GenerateGeometry(int size, int rank, int numThreads, local_int_t nx, local_int_t ny, local_int_t nz, int npx,
    int npy, int npz, dim_3d_t different_dim, Geometry* geom)
 {
    // My logical rank Id
    int logical_rank;
    for (int p = 0; p < global_total_ranks; p++)
    {
        if (rank == logical_rank_to_phys[p])
        {
            logical_rank = p;
        }
    }
    // Now compute this process's indices in the 3D cube
    int ipz = logical_rank / (npx * npy);
    int ipy = (logical_rank - ipz * npx * npy) / npx;
    int ipx = logical_rank % npx;
 #ifdef HPCG_DEBUG
    if (rank == 0)
        HPCG_fout << "size = " << size << endl
                  << "nx  = " << nx << endl
                  << "ny  = " << ny << endl
                  << "nz  = " << nz << endl
                  << "npx = " << npx << endl
                  << "npy = " << npy << endl
                  << "npz = " << npz << endl;
    HPCG_fout << "For rank = " << rank << endl
              << "ipx = " << ipx << endl
              << "ipy = " << ipy << endl
              << "ipz = " << ipz << endl;
    assert(size >= npx * npy * npz);
 #endif
    geom->size = size;
    geom->rank = rank;
    geom->logical_rank = logical_rank;
    geom->different_dim = different_dim;
    geom->numThreads = numThreads;
    geom->nx = nx;
    geom->ny = ny;
    geom->nz = nz;
    geom->npx = npx;
    geom->npy = npy;
    geom->npz = npz;
    geom->ipx = ipx;
    geom->ipy = ipy;
    geom->ipz = ipz;
    // Find the global NX. NY, and NZ
    //  For diff dims, accumulate sequentially
    //  For similar dims, just multiply rank 3D location by the local dim
    global_int_t gnx = 0;
    global_int_t gny = 0;
    global_int_t gnz = 0;
    if (different_dim == X)
        for (int i = 0; i < npx; i++)
        {
            int r = ipz * npx * npy + ipy * npx + i;
            int p = logical_rank_to_phys[r];
            gnx += physical_rank_dims[p * 3];
        }
    else
        gnx = npx * nx;
    if (different_dim == Y)
        for (int i = 0; i < npy; i++)
        {
            int r = ipz * npx * npy + i * npx + ipx;
            int p = logical_rank_to_phys[r];
            gny += physical_rank_dims[p * 3 + 1];
        }
    else
        gny = npy * ny;
    if (different_dim == Z)
        for (int i = 0; i < npz; i++)
        {
            int r = i * npx * npy + ipy * npx + ipx;
            int p = logical_rank_to_phys[r];
            gnz += physical_rank_dims[p * 3 + 2];
        }
    else
        gnz = npz * nz;
    // Here, we find the initial global indices (gix0, giy0, and giz0)
    // for each rank based on its 3d location in the grid
    // Also, for the diff dim find the previous and next neighbor IDs
    // Notice, on the diff dims the previous and next neighbors have
    // the different dimension!
    int prev_n = 0;
    int next_n = 0;
    global_int_t giz0 = 0;
    global_int_t gix0 = 0;
    global_int_t giy0 = 0;
    if (different_dim == X)
    {
        for (int i = 0; i < ipx; i++)
        {
            int r = ipz * npx * npy + ipy * npx + i;
            int p = logical_rank_to_phys[r];
            gix0 += physical_rank_dims[p * 3];
            if (i == ipx - 1)
            {
                prev_n = physical_rank_dims[p * 3];
            }
        }
        if (ipx + 1 < npx)
        {
            int r = ipz * npx * npy + ipy * npx + (ipx + 1);
            int p = logical_rank_to_phys[r];
            next_n = physical_rank_dims[p * 3];
        }
    }
    else
        gix0 = ipx * nx;
    if (different_dim == Y)
    {
        for (int i = 0; i < ipy; i++)
        {
            int r = ipz * npx * npy + i * npx + ipx;
            int p = logical_rank_to_phys[r];
            giy0 += physical_rank_dims[p * 3 + 1];
            if (i == ipy - 1)
            {
                prev_n = physical_rank_dims[p * 3 + 1];
            }
        }
        if (ipy + 1 < npy)
        {
            int r = ipz * npx * npy + (ipy + 1) * npx + ipx;
            int p = logical_rank_to_phys[r];
            next_n = physical_rank_dims[p * 3 + 1];
        }
    }
    else
        giy0 = ipy * ny;
    if (different_dim == Z)
    {
        for (int i = 0; i < ipz; i++)
        {
            int r = i * npx * npy + ipy * npx + ipx;
            int p = logical_rank_to_phys[r];
            giz0 += physical_rank_dims[p * 3 + 2];
            if (i == ipz - 1)
            {
                prev_n = physical_rank_dims[p * 3 + 2];
            }
        }
        if (ipz + 1 < npz)
        {
            int r = (ipz + 1) * npx * npy + ipy * npx + ipx;
            int p = logical_rank_to_phys[r];
            next_n = physical_rank_dims[p * 3 + 2];
        }
    }
    else
        giz0 = ipz * nz;
    // Keep these values for later
    geom->gnx = gnx;
    geom->gny = gny;
    geom->gnz = gnz;
    geom->gix0 = gix0;
    geom->giy0 = giy0;
    geom->giz0 = giz0;
    geom->previous_neighbor_dim = prev_n;
    geom->next_neighbor_dim = next_n;
    return;
 }
--- a/src/GenerateGeometry.hpp
+++ b/src/GenerateGeometry.hpp
@@ -0,0 +1,39 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*
 * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef GENERATEGEOMETRY_HPP
 #define GENERATEGEOMETRY_HPP
 #include "Geometry.hpp"
 #include "hpcg.hpp"
 /*!
  Level-0 geometry generation: decides the process grid and this rank's local grid size from the
  run parameters and stores the resulting decomposition in geom.
  @param[in]  params run parameters
  @param[out] geom   geometry data structure to fill
 */
 void GenerateGeometry(HPCG_Params& params, Geometry* geom);
 /*!
  Geometry generation for the next/coarse levels, where the local sizes, the process grid and the
  differing dimension are supplied by the caller.
  @param[in]  size total number of MPI processes
  @param[in]  rank this process' rank among other MPI processes
  @param[in]  numThreads number of OpenMP threads in this process
  @param[in]  nx, ny, nz number of grid points for each local block in the x, y, and z dimensions, respectively
  @param[in]  npx, npy, npz number of processes along the x, y, and z dimensions of the process grid
  @param[in]  different_dim dimension along which local sizes differ between ranks
  @param[out] geom geometry data structure to fill
 */
 void GenerateGeometry(int size, int rank, int numThreads, local_int_t nx, local_int_t ny, local_int_t nz, int npx,
    int npy, int npz, dim_3d_t different_dim, Geometry* geom);
 #endif // GENERATEGEOMETRY_HPP
--- a/src/GenerateProblem.cpp
+++ b/src/GenerateProblem.cpp
@@ -0,0 +1,404 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*
 * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /*!
 @file GenerateProblem.cpp
 HPCG routine
 */
 #ifndef HPCG_NO_MPI
 #include <mpi.h>
 #endif
 #ifndef HPCG_NO_OPENMP
 #include <omp.h>
 #endif
 #include "mytimer.hpp"
 #include "GenerateProblem.hpp"
 #include "GenerateProblem_ref.hpp"
 #ifdef USE_CUDA
 #include "Cuda.hpp"
 #include "CudaKernels.hpp"
 #endif
 #ifdef USE_GRACE
 #include "CpuKernels.hpp"
 #endif
 /*!
  Routine to generate a sparse matrix, right hand side, initial guess, and exact solution.
  @param[inout] A      The system matrix; its geometry (A.geom) is read and its size and nonzero fields are set
  @param[inout] b      The newly allocated and generated right hand side vector (if b!=0 on entry)
  @param[inout] x      The newly allocated solution vector with entries set to 0.0 (if x!=0 on entry)
  @param[inout] xexact The newly allocated solution vector with entries set to the exact solution (if xexact!=0 on
  entry)
  @see GenerateGeometry
 */
 #ifdef USE_CUDA
 /*!
  GPU variant of GenerateProblem.
  Initializes the requested vectors with the GPU rank type, delegates the actual generation to
  GenerateProblemCuda, and then records the problem sizes in A.
  @param[inout] A      Matrix whose geometry (A.geom) is read and whose size/nonzero fields are written
  @param[inout] b      Right hand side vector, initialized with localNumberOfRows entries if non-null
  @param[inout] x      Initial guess vector, initialized with localNumberOfRows entries if non-null
  @param[inout] xexact Exact solution vector, initialized with localNumberOfRows entries if non-null
 */
 void GenerateProblem_Gpu(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact)
 {
    // Copy the dimensions into global_int_t so the products below are evaluated in the wide type.
    const global_int_t nx = A.geom->nx;
    const global_int_t ny = A.geom->ny;
    const global_int_t nz = A.geom->nz;
    const global_int_t gnx = A.geom->gnx;
    const global_int_t gny = A.geom->gny;
    const global_int_t gnz = A.geom->gnz;

    const local_int_t localNumberOfRows = nx * ny * nz;     // Size of this rank's subblock
    const global_int_t totalNumberOfRows = gnx * gny * gnz; // Total number of grid points in the mesh

    if (b != 0)
        InitializeVector(*b, localNumberOfRows, GPU);
    if (x != 0)
        InitializeVector(*x, localNumberOfRows, GPU);
    if (xexact != 0)
        InitializeVector(*xexact, localNumberOfRows, GPU);

    GenerateProblemCuda(A, b, x, xexact);

    // Read back the local nonzero count only after GenerateProblemCuda has run.
    const local_int_t localNumberOfNonzeros = A.localNumberOfNonzeros;

    // Closed-form global nonzero count of the 27-point stencil on a gnx x gny x gnz box:
    // 27 per interior point, 18 per face point, 12 per edge point and 8 per corner point.
    const long long innerX = gnx - 2LL;
    const long long innerY = gny - 2LL;
    const long long innerZ = gnz - 2LL;
    const global_int_t totalNumberOfNonzeros = 27LL * (innerX * innerY * innerZ)
        + 18LL * (2LL * (innerX * innerY) + 2LL * (innerX * innerZ) + 2LL * (innerY * innerZ))
        + 12LL * (4LL * innerX + 4LL * innerY + 4LL * innerZ) + 8LL * 8LL;

    A.title = 0;
    A.totalNumberOfRows = totalNumberOfRows;
    A.totalNumberOfNonzeros = totalNumberOfNonzeros;
    A.localNumberOfRows = localNumberOfRows;
    A.localNumberOfColumns = localNumberOfRows;
    A.localNumberOfNonzeros = localNumberOfNonzeros;
    return;
 }
 #endif
 #ifdef USE_GRACE
 // Neighbor rank to sequential ID and vice versa.
 // Both arrays are defined elsewhere; GenerateProblem_Cpu allocates them (size + 1 and 27 ints respectively)
 // the first time it runs.
 extern int *rankToId_h, *idToRank_h;
 // GenerateProblem_Cpu is called 4 times for each level
 // Sometimes we need to perform actions based on the level (global across the applications)
 // State counter advanced by GenerateProblem_Cpu:
 //   0 -> rankToId_h/idToRank_h not yet allocated
 //   1 -> allocated, rankToId_h not yet zeroed
 //   2 -> neighbor ranks are being marked in rankToId_h
 //   3 -> rankToId_h prefix-summed and idToRank_h filled; later calls skip all of the above
 int global_steps = 0;
 /*!
  CPU variant of GenerateProblem (built when USE_GRACE is defined).
  Allocates the row arrays of A, fills the global column indices and values of the 27-point stencil for every
  local row, sets b/x/xexact when requested, and counts how many stencil entries belong to another rank.
  The local index storage (mtxIndL) is allocated and split into rows here, but no local indices are written.
  On the first invocation it also builds the neighbor-rank maps rankToId_h / idToRank_h (see global_steps).
  @param[inout] A      Matrix whose geometry (A.geom) is read and whose arrays and size fields are written
  @param[inout] b      Right hand side vector, generated if non-null
  @param[inout] x      Initial guess vector, set to 0.0 if non-null
  @param[inout] xexact Exact solution vector, set to 1.0 if non-null
 */
 void GenerateProblem_Cpu(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact)
 {
    // Make local copies of geometry information.  Use global_int_t since the RHS products in the calculations
    // below may result in global range values.
    global_int_t nx = A.geom->nx;
    global_int_t ny = A.geom->ny;
    global_int_t nz = A.geom->nz;
    global_int_t gnx = A.geom->gnx;
    global_int_t gny = A.geom->gny;
    global_int_t gnz = A.geom->gnz;
    global_int_t gix0 = A.geom->gix0;
    global_int_t giy0 = A.geom->giy0;
    global_int_t giz0 = A.geom->giz0;
    int npx = A.geom->npx;
    int npy = A.geom->npy;
    local_int_t localNumberOfRows = nx * ny * nz; // This is the size of our subblock
    // If this assert fails, it most likely means that the local_int_t is set to int and should be set to long long
    assert(localNumberOfRows
        > 0); // Throw an exception of the number of rows is less than zero (can happen if int overflow)
    local_int_t numberOfNonzerosPerRow
        = 27; // We are approximating a 27-point finite element/volume/difference 3D stencil
    global_int_t totalNumberOfRows = gnx * gny * gnz; // Total number of grid points in mesh
    // If this assert fails, it most likely means that the global_int_t is set to int and should be set to long long
    assert(totalNumberOfRows
        > 0); // Throw an exception of the number of rows is less than zero (can happen if int overflow)
    // Number of entries in the contiguous per-row storage.  Evaluated in global_int_t because
    // localNumberOfRows * 27 can exceed the range of a 32-bit local_int_t even when localNumberOfRows fits.
    const global_int_t localNumberOfEntries = static_cast<global_int_t>(localNumberOfRows) * numberOfNonzerosPerRow;
    // First call only: allocate the neighbor-rank maps
    if (global_steps == 0)
    {
        rankToId_h = new int[A.geom->size + 1];
        idToRank_h = new int[27];
        global_steps++;
    }
    // Allocate arrays that are of length localNumberOfRows
    local_int_t* nonzerosInRow = new local_int_t[localNumberOfRows];
    global_int_t** mtxIndG = new global_int_t*[localNumberOfRows];
    local_int_t** mtxIndL = new local_int_t*[localNumberOfRows];
    double** matrixValues = new double*[localNumberOfRows];
    double** matrixDiagonal = new double*[localNumberOfRows];
    if (b != 0)
        InitializeVector(*b, localNumberOfRows, CPU);
    if (x != 0)
        InitializeVector(*x, localNumberOfRows, CPU);
    if (xexact != 0)
        InitializeVector(*xexact, localNumberOfRows, CPU);
    // Raw value arrays of the requested vectors (left null for vectors that were not requested)
    double* bv = 0;
    double* xv = 0;
    double* xexactv = 0;
    if (b != 0)
        bv = b->values;
    if (x != 0)
        xv = x->values;
    if (xexact != 0)
        xexactv = xexact->values;
    A.localToGlobalMap.resize(localNumberOfRows);
    // Use a parallel loop to do initial assignment:
    // distributes the physical placement of arrays of pointers across the memory system
 #ifndef HPCG_NO_OPENMP
 #pragma omp parallel for
 #endif
    for (local_int_t i = 0; i < localNumberOfRows; ++i)
    {
        matrixValues[i] = 0;
        matrixDiagonal[i] = 0;
        mtxIndG[i] = 0;
        mtxIndL[i] = 0;
    }
    // First call only: clear the neighbor-rank marks
    if (global_steps == 1)
    {
 #ifndef HPCG_NO_OPENMP
 #pragma omp parallel for
 #endif
        for (local_int_t i = 0; i < A.geom->size + 1; i++)
        {
            rankToId_h[i] = 0;
        }
        global_steps++;
    }
    // Now allocate the arrays pointed to.  The base pointers are kept in locals so that the parallel loop below
    // never reads slot 0 of the pointer arrays while the iteration for row 0 is writing it.
    local_int_t* const mtxIndLBase = new local_int_t[localNumberOfEntries];
    double* const matrixValuesBase = new double[localNumberOfEntries];
    global_int_t* const mtxIndGBase = new global_int_t[localNumberOfEntries];
    local_int_t localNumberOfNonzeros = 0;
    local_int_t ext_nnz = 0;
 #ifndef HPCG_NO_OPENMP
 #pragma omp parallel for reduction(+ : localNumberOfNonzeros) reduction(+ : ext_nnz)
 #endif
    for (local_int_t i = 0; i < localNumberOfRows; i++)
    {
        // Offset of row i in the contiguous storage, evaluated in the wide type (see localNumberOfEntries)
        const global_int_t rowOffset = static_cast<global_int_t>(i) * numberOfNonzerosPerRow;
        mtxIndL[i] = mtxIndLBase + rowOffset;
        matrixValues[i] = matrixValuesBase + rowOffset;
        mtxIndG[i] = mtxIndGBase + rowOffset;
        // Local (ix, iy, iz) and global (gix, giy, giz) coordinates of row i
        const local_int_t iz = (i / (nx * ny));
        const local_int_t iy = (i - iz * nx * ny) / nx;
        const local_int_t ix = i - (iz * ny + iy) * nx;
        const global_int_t gix = ix + gix0;
        const global_int_t giy = iy + giy0;
        const global_int_t giz = iz + giz0;
        const global_int_t currentGlobalRow = gix + giy * gnx + giz * gnx * gny;
        A.localToGlobalMap[i] = currentGlobalRow;
        char numberOfNonzerosInRow = 0;
        double* currentValuePointer = matrixValues[i];
        global_int_t* currentIndexPointerG = mtxIndG[i];
        double* diagonalPointer = nullptr;
        // Go through all the neighbors around a 3D point to decide
        //  which one is a halo and which one is local to the rank
        for (int k = 0; k < 27; k++)
        {
            // Neighbor global coordinates
            const long long int cgix = gix + tid2indCpu[k][0];
            const long long int cgiy = giy + tid2indCpu[k][1];
            const long long int cgiz = giz + tid2indCpu[k][2];
            // Is the global 3D point inside the global problem?
            const bool insideGlobalDomain
                = cgiz > -1 && cgiz < gnz && cgiy > -1 && cgiy < gny && cgix > -1 && cgix < gnx;
            if (!insideGlobalDomain)
                continue;
            *currentIndexPointerG++ = cgix + cgiy * gnx + cgiz * gnx * gny;
            // k == 13 is treated as the diagonal entry (presumably the zero offset in tid2indCpu)
            if (k == 13)
            {
                *currentValuePointer = 26.0;
                diagonalPointer = currentValuePointer;
            }
            else
            {
                *currentValuePointer = -1.0;
            }
            // Rank Id in the global domain
            int ipz = static_cast<int>(cgiz / nz);
            int ipy = static_cast<int>(cgiy / ny);
            int ipx = static_cast<int>(cgix / nx);
            // For GPUCPU exec mode, when the CPU and GPU have diff dims in a direction,
            //  we need to find the point rank manually, not based on its local dimension
            //  but based on its physical location to the local problem
            //  Note the halo size is always 1
            if (A.geom->different_dim == Z)
            {
                const long long int local = cgiz - giz0;
                if (local >= 0 && local < nz)
                    ipz = A.geom->ipz;
                else if (local < 0)
                    ipz = A.geom->ipz - 1;
                else
                    ipz = A.geom->ipz + 1;
            }
            else if (A.geom->different_dim == Y)
            {
                const long long int local = cgiy - giy0;
                if (local >= 0 && local < ny)
                    ipy = A.geom->ipy;
                else if (local < 0)
                    ipy = A.geom->ipy - 1;
                else
                    ipy = A.geom->ipy + 1;
            }
            else if (A.geom->different_dim == X)
            {
                const long long int local = cgix - gix0;
                if (local >= 0 && local < nx)
                    ipx = A.geom->ipx;
                else if (local < 0)
                    ipx = A.geom->ipx - 1;
                else
                    ipx = A.geom->ipx + 1;
            }
            // Now, after find the point rank from the location
            //  in the 3D grid (ranks domain NPXxNPYxNPZ)
            const int col_rank = ipx + ipy * npx + ipz * npy * npx;
            // The neighbor point rank is diff than the current point rank
            if (A.geom->logical_rank != col_rank)
            {
                // Every thread that reaches this line stores the same value (1)
                if (global_steps == 2)
                    rankToId_h[col_rank + 1] = 1; // To find its sequential Id (will be prefix summed later)
                ext_nnz++;
            }
            currentValuePointer++;
            numberOfNonzerosInRow++;
        }
        matrixDiagonal[i] = diagonalPointer;
        nonzerosInRow[i] = numberOfNonzerosInRow;
        localNumberOfNonzeros += numberOfNonzerosInRow;
        if (b != 0)
            bv[i] = 26.0 - ((double) (numberOfNonzerosInRow - 1));
        if (x != 0)
            xv[i] = 0.0;
        if (xexact != 0)
            xexactv[i] = 1.0;
    }
    // Prefix sum over rankToId
    // Map physical neighbor ranks to sequential IDs
    //  less memory consumption
    if (global_steps == 2)
    {
        PrefixsumCpu(rankToId_h + 1, A.geom->size);
        int counter = 1;
        for (int i = 1; i < A.geom->size + 1; i++)
        {
            if (rankToId_h[i] == counter)
            {
                idToRank_h[counter - 1] = i - 1;
                counter++;
            }
        }
        global_steps++;
    }
 #ifdef HPCG_DETAILED_DEBUG
    HPCG_fout << "Process " << A.geom->rank << " of " << A.geom->size << " has " << localNumberOfRows << " rows."
              << endl
              << "Process " << A.geom->rank << " of " << A.geom->size << " has " << localNumberOfNonzeros
              << " nonzeros." << endl;
 #endif
    // Closed-form global nonzero count: 27 per interior point, 18 per face point, 12 per edge point, 8 per corner
    global_int_t totalNumberOfNonzeros = 27LL * ((gnx - 2LL) * (gny - 2LL) * (gnz - 2LL))
        + 18LL
            * (2LL * ((gnx - 2LL) * (gny - 2LL)) + 2LL * ((gnx - 2LL) * (gnz - 2LL))
                + 2LL * ((gny - 2LL) * (gnz - 2LL)))
        + 12LL * (4LL * (gnx - 2LL) + 4LL * (gny - 2LL) + 4LL * (gnz - 2LL)) + 8LL * 8LL;
    // If this assert fails, it most likely means that the global_int_t is set to int and should be set to long long
    // This assert is usually the first to fail as problem size increases beyond the 32-bit integer range.
    assert(totalNumberOfNonzeros
        > 0); // Throw an exception of the number of nonzeros is less than zero (can happen if int overflow)
    A.title = 0;
    A.totalNumberOfRows = totalNumberOfRows;
    A.totalNumberOfNonzeros = totalNumberOfNonzeros;
    A.localNumberOfRows = localNumberOfRows;
    A.localNumberOfColumns = localNumberOfRows;
    A.localNumberOfNonzeros = localNumberOfNonzeros;
    A.nonzerosInRow = nonzerosInRow;
    A.mtxIndG = mtxIndG;
    A.mtxIndL = mtxIndL;
    A.matrixValues = matrixValues;
    A.matrixDiagonal = matrixDiagonal;
    A.extNnz = ext_nnz;
    return;
 }
 #endif // USE_GRACE
 // Dispatches problem generation on A.rankType: GPU ranks go to GenerateProblem_Gpu (only when built with
 // USE_CUDA), all other ranks go to GenerateProblem_Cpu (only when built with USE_GRACE).  When the matching
 // macro is not defined the call is a no-op.
 void GenerateProblem(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact)
 {
    if (A.rankType == GPU)
    {
 #ifdef USE_CUDA
        GenerateProblem_Gpu(A, b, x, xexact);
 #endif
        return;
    }
 #ifdef USE_GRACE
    GenerateProblem_Cpu(A, b, x, xexact);
 #endif
 }
--- a/src/GenerateProblem.hpp
+++ b/src/GenerateProblem.hpp
@@ -0,0 +1,20 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 #ifndef GENERATEPROBLEM_HPP
 #define GENERATEPROBLEM_HPP
 #include "SparseMatrix.hpp"
 #include "Vector.hpp"
 // Generates the matrix A and, for each non-null pointer, the vectors b, x and xexact.
 void GenerateProblem(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact);
 #endif // GENERATEPROBLEM_HPP
--- a/src/GenerateProblem_ref.cpp
+++ b/src/GenerateProblem_ref.cpp
@@ -0,0 +1,251 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*!
 @file GenerateProblem_ref.cpp
 HPCG routine
 */
 #ifndef HPCG_NO_MPI
 #include <mpi.h>
 #endif
 #ifndef HPCG_NO_OPENMP
 #include <omp.h>
 #endif
 #if defined(HPCG_DEBUG) || defined(HPCG_DETAILED_DEBUG)
 #include <fstream>
 using std::endl;
 #include "hpcg.hpp"
 #endif
 #include <cassert>
 #include "GenerateProblem_ref.hpp"
 /*!
  Reference version of GenerateProblem to generate the sparse matrix, right hand side, initial guess, and exact
  solution.
  @param[inout] A      The system matrix; its geometry (A.geom) is read and its arrays and size fields are set
  @param[inout] b      The newly allocated and generated right hand side vector (if b!=0 on entry)
  @param[inout] x      The newly allocated solution vector with entries set to 0.0 (if x!=0 on entry)
  @param[inout] xexact The newly allocated solution vector with entries set to the exact solution (if xexact!=0 on
  entry)
  @see GenerateGeometry
 */
 void GenerateProblem_ref(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact)
 {
    // Overview: allocate the per-row arrays of A, walk the local nx x ny x nz grid, and for every point emit the
    // 27-point stencil entries that fall inside the global domain (26.0 on the diagonal, -1.0 elsewhere).
    // b is set to 26 - (nonzeros in row - 1), x to 0.0 and xexact to 1.0 for every vector that was requested.

    // Make local copies of geometry information.  Use global_int_t since the RHS products in the calculations
    // below may result in global range values.
    global_int_t nx = A.geom->nx;
    global_int_t ny = A.geom->ny;
    global_int_t nz = A.geom->nz;
    global_int_t gnx = A.geom->gnx;
    global_int_t gny = A.geom->gny;
    global_int_t gnz = A.geom->gnz;
    global_int_t gix0 = A.geom->gix0;
    global_int_t giy0 = A.geom->giy0;
    global_int_t giz0 = A.geom->giz0;
    local_int_t localNumberOfRows = nx * ny * nz; // This is the size of our subblock
    // If this assert fails, it most likely means that the local_int_t is set to int and should be set to long long
    assert(localNumberOfRows
        > 0); // Throw an exception of the number of rows is less than zero (can happen if int overflow)
    local_int_t numberOfNonzerosPerRow
        = 27; // We are approximating a 27-point finite element/volume/difference 3D stencil
    global_int_t totalNumberOfRows = gnx * gny * gnz; // Total number of grid points in mesh
    // If this assert fails, it most likely means that the global_int_t is set to int and should be set to long long
    assert(totalNumberOfRows
        > 0); // Throw an exception of the number of rows is less than zero (can happen if int overflow)
    // Allocate arrays that are of length localNumberOfRows
    local_int_t* nonzerosInRow = new local_int_t[localNumberOfRows];
    global_int_t** mtxIndG = new global_int_t*[localNumberOfRows];
    local_int_t** mtxIndL = new local_int_t*[localNumberOfRows];
    double** matrixValues = new double*[localNumberOfRows];
    double** matrixDiagonal = new double*[localNumberOfRows];
    if (b != 0)
        InitializeVector(*b, localNumberOfRows, CPU);
    if (x != 0)
        InitializeVector(*x, localNumberOfRows, CPU);
    if (xexact != 0)
        InitializeVector(*xexact, localNumberOfRows, CPU);
    // Raw value arrays of the requested vectors (left null for vectors that were not requested)
    double* bv = 0;
    double* xv = 0;
    double* xexactv = 0;
    if (b != 0)
        bv = b->values;
    if (x != 0)
        xv = x->values;
    if (xexact != 0)
        xexactv = xexact->values;
    A.localToGlobalMap.resize(localNumberOfRows);
    // Use a parallel loop to do initial assignment:
    // distributes the physical placement of arrays of pointers across the memory system
 #ifndef HPCG_NO_OPENMP
 #pragma omp parallel for
 #endif
    for (local_int_t i = 0; i < localNumberOfRows; ++i)
    {
        matrixValues[i] = 0;
        matrixDiagonal[i] = 0;
        mtxIndG[i] = 0;
        mtxIndL[i] = 0;
    }
 #ifndef HPCG_CONTIGUOUS_ARRAYS
    // Now allocate the arrays pointed to
    for (local_int_t i = 0; i < localNumberOfRows; ++i)
        mtxIndL[i] = new local_int_t[numberOfNonzerosPerRow];
    for (local_int_t i = 0; i < localNumberOfRows; ++i)
        matrixValues[i] = new double[numberOfNonzerosPerRow];
    for (local_int_t i = 0; i < localNumberOfRows; ++i)
        mtxIndG[i] = new global_int_t[numberOfNonzerosPerRow];
 #else
    // Now allocate the arrays pointed to.
    // The entry count and the row offsets are evaluated in global_int_t because localNumberOfRows * 27 can
    // exceed the range of a 32-bit local_int_t even when localNumberOfRows itself fits.
    const global_int_t localNumberOfEntries = static_cast<global_int_t>(localNumberOfRows) * numberOfNonzerosPerRow;
    mtxIndL[0] = new local_int_t[localNumberOfEntries];
    matrixValues[0] = new double[localNumberOfEntries];
    mtxIndG[0] = new global_int_t[localNumberOfEntries];
    for (local_int_t i = 1; i < localNumberOfRows; ++i)
    {
        const global_int_t rowOffset = static_cast<global_int_t>(i) * numberOfNonzerosPerRow;
        mtxIndL[i] = mtxIndL[0] + rowOffset;
        matrixValues[i] = matrixValues[0] + rowOffset;
        mtxIndG[i] = mtxIndG[0] + rowOffset;
    }
 #endif
    local_int_t localNumberOfNonzeros = 0;
    // TODO:  This triply nested loop could be flattened or use nested parallelism
    // The nonzero count is accumulated through an OpenMP reduction so it does not contend with the
    // critical section that guards the map insertion below.
 #ifndef HPCG_NO_OPENMP
 #pragma omp parallel for reduction(+ : localNumberOfNonzeros)
 #endif
    for (local_int_t iz = 0; iz < nz; iz++)
    {
        global_int_t giz = giz0 + iz;
        for (local_int_t iy = 0; iy < ny; iy++)
        {
            global_int_t giy = giy0 + iy;
            for (local_int_t ix = 0; ix < nx; ix++)
            {
                global_int_t gix = gix0 + ix;
                local_int_t currentLocalRow = iz * nx * ny + iy * nx + ix;
                global_int_t currentGlobalRow = giz * gnx * gny + giy * gnx + gix;
 #ifndef HPCG_NO_OPENMP
                // C++ std::map is not threadsafe for writing; the debug read-back and the shared output
                // stream are kept inside the same critical section for the same reason.
 #pragma omp critical
 #endif
                {
                    A.globalToLocalMap[currentGlobalRow] = currentLocalRow;
 #ifdef HPCG_DETAILED_DEBUG
                    HPCG_fout << " rank, globalRow, localRow = " << A.geom->rank << " " << currentGlobalRow << " "
                              << A.globalToLocalMap[currentGlobalRow] << endl;
 #endif
                }
                A.localToGlobalMap[currentLocalRow] = currentGlobalRow;
                char numberOfNonzerosInRow = 0;
                double* currentValuePointer = matrixValues[currentLocalRow]; // Pointer to current value in current row
                global_int_t* currentIndexPointerG
                    = mtxIndG[currentLocalRow]; // Pointer to current index in current row
                for (int sz = -1; sz <= 1; sz++)
                {
                    if (giz + sz > -1 && giz + sz < gnz)
                    {
                        for (int sy = -1; sy <= 1; sy++)
                        {
                            if (giy + sy > -1 && giy + sy < gny)
                            {
                                for (int sx = -1; sx <= 1; sx++)
                                {
                                    if (gix + sx > -1 && gix + sx < gnx)
                                    {
                                        global_int_t curcol = currentGlobalRow + sz * gnx * gny + sy * gnx + sx;
                                        if (curcol == currentGlobalRow)
                                        {
                                            matrixDiagonal[currentLocalRow] = currentValuePointer;
                                            *currentValuePointer++ = 26.0;
                                        }
                                        else
                                        {
                                            *currentValuePointer++ = -1.0;
                                        }
                                        *currentIndexPointerG++ = curcol;
                                        numberOfNonzerosInRow++;
                                    } // end x bounds test
                                } // end sx loop
                            } // end y bounds test
                        } // end sy loop
                    } // end z bounds test
                } // end sz loop
                nonzerosInRow[currentLocalRow] = numberOfNonzerosInRow;
                localNumberOfNonzeros += numberOfNonzerosInRow; // Summed across threads by the reduction clause
                if (b != 0)
                    bv[currentLocalRow] = 26.0 - ((double) (numberOfNonzerosInRow - 1));
                if (x != 0)
                    xv[currentLocalRow] = 0.0;
                if (xexact != 0)
                    xexactv[currentLocalRow] = 1.0;
            } // end ix loop
        } // end iy loop
    } // end iz loop
 #ifdef HPCG_DETAILED_DEBUG
    HPCG_fout << "Process " << A.geom->rank << " of " << A.geom->size << " has " << localNumberOfRows << " rows."
              << endl
              << "Process " << A.geom->rank << " of " << A.geom->size << " has " << localNumberOfNonzeros
              << " nonzeros." << endl;
 #endif
    global_int_t totalNumberOfNonzeros = 0;
 #ifndef HPCG_NO_MPI
    // Use MPI's reduce function to sum all nonzeros
 #ifdef HPCG_NO_LONG_LONG
    MPI_Allreduce(&localNumberOfNonzeros, &totalNumberOfNonzeros, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
 #else
    long long lnnz = localNumberOfNonzeros, gnnz = 0; // convert to 64 bit for MPI call
    MPI_Allreduce(&lnnz, &gnnz, 1, MPI_LONG_LONG_INT, MPI_SUM, MPI_COMM_WORLD);
    totalNumberOfNonzeros = gnnz; // Copy back
 #endif
 #else
    totalNumberOfNonzeros = localNumberOfNonzeros;
 #endif
    // If this assert fails, it most likely means that the global_int_t is set to int and should be set to long long
    // This assert is usually the first to fail as problem size increases beyond the 32-bit integer range.
    assert(totalNumberOfNonzeros
        > 0); // Throw an exception of the number of nonzeros is less than zero (can happen if int overflow)
    A.title = 0;
    A.totalNumberOfRows = totalNumberOfRows;
    A.totalNumberOfNonzeros = totalNumberOfNonzeros;
    A.localNumberOfRows = localNumberOfRows;
    A.localNumberOfColumns = localNumberOfRows;
    A.localNumberOfNonzeros = localNumberOfNonzeros;
    A.nonzerosInRow = nonzerosInRow;
    A.mtxIndG = mtxIndG;
    A.mtxIndL = mtxIndL;
    A.matrixValues = matrixValues;
    A.matrixDiagonal = matrixDiagonal;
    return;
 }
--- a/src/GenerateProblem_ref.hpp
+++ b/src/GenerateProblem_ref.hpp
@@ -0,0 +1,21 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 #ifndef GENERATEPROBLEM_REF_HPP
 #define GENERATEPROBLEM_REF_HPP
 #include "SparseMatrix.hpp"
 #include "Vector.hpp"
 // Reference implementation: generates the matrix A and, for each non-null pointer, the vectors b, x and xexact.
 void GenerateProblem_ref(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact);
 #endif // GENERATEPROBLEM_REF_HPP
--- a/src/Geometry.hpp
+++ b/src/Geometry.hpp
@@ -0,0 +1,207 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*
 * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /*!
 @file Geometry.hpp
 HPCG data structure for problem geometry
 */
 #ifndef GEOMETRY_HPP
 #define GEOMETRY_HPP
 /*!
  This defines the type for integers that have local subdomain dimension.
  Define as "long long" when local problem dimension is > 2^31
 */
 // #define INDEX_64
 #ifndef INDEX_64
 typedef int local_int_t;
 #else
 typedef long long local_int_t;
 #endif
 /*!
  This defines the type for integers that have global dimension
  Define as "long long" when global problem dimension is > 2^31
 */
 #ifdef HPCG_NO_LONG_LONG
 typedef int global_int_t;
 #else
 typedef long long global_int_t;
 #endif
 #define HPCG_MAX_ROW_LEN 27
 // Enums
 // Identifies one axis of the 3D grid; used for Geometry::different_dim and the partition_by argument of
 // GenerateGeometry.  Enumerators take the consecutive values 0..3.
 typedef enum
 {
    X,
    Y,
    Z,
    NONE // no axis selected
 } dim_3d_t;
 // Point-to-point communication modes.  Values are spelled out; they match the implicit numbering 0..4.
 typedef enum
 {
    MPI_CPU = 0,
    MPI_CUDA_AWARE = 1,
    MPI_GPU_All2allv = 2,
    MPI_CPU_All2allv = 3,
    NCCL = 4 /*GPUONLY*/
 } p2p_comm_mode_t;
 // Kind of device a rank runs on (CPU is 0, GPU is 1).
 typedef enum
 {
    CPU = 0,
    GPU = 1
 } rank_type_t;
 // Execution mode of the run: GPU ranks only, CPU ranks only, or both.  Enumerators take the values 0..2.
 typedef enum
 {
    GPUONLY,
    CPUONLY,
    GPUCPU
 } exec_mode_t;
 // How NX, NY, NZ and the g2c value are interpreted.  Enumerators take the values 0..3.
 typedef enum
 {
    GPU_RATIO,     // NX, NY, NZ are local to GPU and g2c is a ratio
    GPU_ABS,       // NX, NY, NZ are local to GPU and g2c is absolute dimension size
    GPU_CPU_RATIO, // NX, NY, NZ are local to GPU+CPU and g2c is ratio
    GPU_CPU_ABS    // NX, NY, NZ are local to GPU+CPU and g2c is absolute dimension size
 } local_problem_def_t;
 // This macro should be defined if the global_int_t is not long long
 // in order to stop complaints from non-C++11 compliant compilers.
 // #define HPCG_NO_LONG_LONG
 /*!
  This is a data structure to contain all processor geometry information
 */
 struct Geometry_STRUCT
 {
    int size;         //!< Number of MPI processes
    int rank;         //!< This process' rank in the range [0 to size - 1]
    int logical_rank; //!< For heterogeneous setup: this process' rank in the logical npx by npy by npz ordering
                      //!< (GenerateProblem_Cpu compares it against ipx + ipy * npx + ipz * npy * npx)
    int numThreads;   //!< This process' number of threads
    local_int_t nx;   //!< Number of x-direction grid points for each local subdomain
    local_int_t ny;   //!< Number of y-direction grid points for each local subdomain
    local_int_t nz;   //!< Number of z-direction grid points for each local subdomain
    int npx;          //!< Number of processors in x-direction
    int npy;          //!< Number of processors in y-direction
    int npz;          //!< Number of processors in z-direction
    int pz;           //!< partition ID of z-dimension process that starts the second region of nz values
    int npartz;       //!< Number of partitions with varying nz values
    int* partz_ids;   //!< Array of partition ids of processor in z-direction where new value of nz starts (valid values
                      //!< are 1 to npz)
    local_int_t* partz_nz; //!< Array of length npartz containing the nz values for each partition
    int ipx;               //!< Current rank's x location in the npx by npy by npz processor grid
    int ipy;               //!< Current rank's y location in the npx by npy by npz processor grid
    int ipz;               //!< Current rank's z location in the npx by npy by npz processor grid
    global_int_t gnx;      //!< Global number of x-direction grid points
    global_int_t gny;      //!< Global number of y-direction grid points
    global_int_t gnz;      //!< Global number of z-direction grid points
    global_int_t gix0;     //!< Base global x index for this rank in the npx by npy by npz processor grid
    global_int_t giy0;     //!< Base global y index for this rank in the npx by npy by npz processor grid
    global_int_t giz0;     //!< Base global z index for this rank in the npx by npy by npz processor grid
    dim_3d_t different_dim; //!< The dimension that the GPU and CPU rank are partitioned along
    int previous_neighbor_dim; //!< Set by GenerateGeometry; presumably the local size of the previous neighbor rank
                               //!< along different_dim -- TODO confirm
    int next_neighbor_dim;     //!< Set by GenerateGeometry; presumably the local size of the next neighbor rank
                               //!< along different_dim -- TODO confirm
 };
 typedef struct Geometry_STRUCT Geometry;
 /*!
  Returns the rank of the MPI process that is assigned the global row index
  given as the input argument.
  @param[in] geom  The description of the problem's geometry.
  @param[in] index The global row index
  @return Returns the MPI rank of the process assigned the row
 */
 inline int ComputeRankOfMatrixRow(const Geometry& geom, global_int_t index)
 {
    global_int_t gnx = geom.gnx;
    global_int_t gny = geom.gny;
    global_int_t iz = index / (gny * gnx);
    global_int_t iy = (index - iz * gny * gnx) / gnx;
    global_int_t ix = index % gnx;
    // We now permit varying values for nz for any nx-by-ny plane of MPI processes.
    // npartz is the number of different groups of nx-by-ny groups of processes.
    // partz_ids is an array of length npartz where each value indicates the z process of the last process in the ith
    // nx-by-ny group. partz_nz is an array of length npartz containing the value of nz for the ith group.
    //        With no variation, npartz = 1, partz_ids[0] = npz, partz_nz[0] = nz
    int ipz = 0;
    int ipartz_ids = 0;
    for (int i = 0; i < geom.npartz; ++i)
    {
        int ipart_nz = geom.partz_nz[i];
        ipartz_ids = geom.partz_ids[i] - ipartz_ids;
        if (iz <= ipart_nz * ipartz_ids)
        {
            ipz += iz / ipart_nz;
            break;
        }
        else
        {
            ipz += ipartz_ids;
            iz -= ipart_nz * ipartz_ids;
        }
    }
    //  global_int_t ipz = iz/geom.nz;
    int ipy = iy / geom.ny;
    int ipx = ix / geom.nx;
    int rank = ipx + ipy * geom.npx + ipz * geom.npy * geom.npx;
    return rank;
 }
 /*!
 Destructor for geometry data.
 @param[inout] data the geometry data structure whose storage is deallocated
 */
 inline void DeleteGeometry(Geometry& geom)
 {
    // Deliberately does nothing: the partition arrays are no longer released here.
    // Former body, not used anymore:
    //   if (geom.partz_nz != 0)
    //       delete[] geom.partz_nz;
    //   if (geom.partz_ids != 0)
    //       delete[] geom.partz_ids;
 }
 #endif // GEOMETRY_HPP
--- a/src/MGData.hpp
+++ b/src/MGData.hpp
@@ -0,0 +1,81 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*!
 @file MGData.hpp
 HPCG data structure
 */
 #ifndef MGDATA_HPP
 #define MGDATA_HPP
 #include "SparseMatrix.hpp"
 #include "Vector.hpp"
 #include <cassert>
 //! Per-level multigrid data: smoother step counts, the fine-to-coarse injection map and work vectors.
 //! InitializeMGData() fills the members (except optimizationData); DeleteMGData() releases
 //! f2cOperator, rc, xc and Axf.
 struct MGData_STRUCT
 {
    int numberOfPresmootherSteps;  // Call ComputeSYMGS this many times prior to coarsening
    int numberOfPostsmootherSteps; // Call ComputeSYMGS this many times after coarsening
    local_int_t*
        f2cOperator; //!< 1D array containing the fine operator local IDs that will be injected into coarse space.
    Vector* rc;      // coarse grid residual vector
    Vector* xc;      // coarse grid solution vector
    Vector* Axf;     // fine grid residual vector
    /*!
     This is for storing optimized data structures created in OptimizeProblem and
     used inside optimized ComputeSPMV().
     */
    void* optimizationData;
 };
 typedef struct MGData_STRUCT MGData;
 /*!
  Constructor for the multigrid (MG) data structure.

  Stores the given pointers in the structure (no copies are made) and sets the
  number of pre- and post-smoother steps to 1. DeleteMGData() later releases
  f2cOperator with delete[] and the three vectors with DeleteVector()/delete.

  @param[in] f2cOperator array of fine-grid local IDs that are injected into the coarse space
  @param[in] rc  coarse grid residual vector
  @param[in] xc  coarse grid solution vector
  @param[in] Axf fine grid residual vector
  @param[out] data the MG data structure that is initialized
 */
 inline void InitializeMGData(local_int_t* f2cOperator, Vector* rc, Vector* xc, Vector* Axf, MGData& data)
 {
    data.numberOfPresmootherSteps = 1;
    data.numberOfPostsmootherSteps = 1;
    data.f2cOperator = f2cOperator; // Space for injection operator
    data.rc = rc;
    data.xc = xc;
    data.Axf = Axf;
    // Start from a well-defined "no optimization data" state instead of an indeterminate pointer.
    data.optimizationData = 0;
    return;
 }
 /*!
  Destructor for the multigrid (MG) data.

  Releases the injection operator and the three work vectors referenced by the
  structure. Members that are null are skipped, and every released pointer is
  reset to null so that calling this function again on the same structure is harmless.

  @param[inout] data the MG data structure whose storage is deallocated
 */
 inline void DeleteMGData(MGData& data)
 {
    delete[] data.f2cOperator; // delete[] on a null pointer is a no-op
    data.f2cOperator = 0;
    // Release the storage held inside each vector first (only if the vector exists) ...
    if (data.Axf != 0)
        DeleteVector(*data.Axf);
    if (data.rc != 0)
        DeleteVector(*data.rc);
    if (data.xc != 0)
        DeleteVector(*data.xc);
    // ... then the Vector objects themselves.
    delete data.Axf;
    delete data.rc;
    delete data.xc;
    data.Axf = 0;
    data.rc = 0;
    data.xc = 0;
    return;
 }
 #endif // MGDATA_HPP
--- a/src/MixedBaseCounter.cpp
+++ b/src/MixedBaseCounter.cpp
@@ -0,0 +1,66 @@
 #include <map>
 #include "MixedBaseCounter.hpp"
 /*!
  Creates a counter whose digit i runs from 0 to counts[i] (inclusive), starting at all zeros.

  Only the first `length` entries of `counts` are read; the remaining slots of the
  internal arrays (including the terminating one) are set to 0. `length` is clamped to
  the range [0, 32], the capacity of the internal arrays.

  @param[in] counts maximum value of every digit (at least `length` entries)
  @param[in] length number of digits
 */
 MixedBaseCounter::MixedBaseCounter(int* counts, int length)
 {
    const int capacity = 32; // max_counts/cur_counts hold capacity + 1 entries
    if (length < 0)
        length = 0;
    if (length > capacity)
        length = capacity;
    this->length = length;
    for (int i = 0; i <= capacity; ++i)
    {
        // Never read past the caller's `length` entries; everything else is terminated with 0's.
        this->max_counts[i] = (i < length) ? counts[i] : 0;
        this->cur_counts[i] = 0;
    }
 }
 /*!
  Creates a counter over what remains of `left` after removing the current value of `right`:
  digit i may run from 0 to left.max_counts[i] - right.cur_counts[i]. The new counter has
  left.length digits and starts at all zeros.

  @param[in] left  counter providing the length and the maximum digit values
  @param[in] right counter whose current digit values are subtracted from the maxima
 */
 MixedBaseCounter::MixedBaseCounter(MixedBaseCounter& left, MixedBaseCounter& right)
 {
    this->length = left.length;
    for (int i = 0; i < 32 + 1; ++i)
    {
        // Entries at or beyond `length` are zeroed as well so that no slot is left uninitialized
        // (another counter built from this one may read them).
        this->max_counts[i] = (i < left.length) ? left.max_counts[i] - right.cur_counts[i] : 0;
        this->cur_counts[i] = 0;
    }
 }
 void MixedBaseCounter::next()
 {
    for (int i = 0; i < this->length; ++i)
    {
        this->cur_counts[i]++;
        if (this->cur_counts[i] > this->max_counts[i])
        {
            this->cur_counts[i] = 0;
            continue;
        }
        break;
    }
 }
 int MixedBaseCounter::is_zero()
 {
    for (int i = 0; i < this->length; ++i)
        if (this->cur_counts[i])
            return 0;
    return 1;
 }
 int MixedBaseCounter::product(int* multipliers)
 {
    int k = 0, x = 1;
    for (int i = 0; i < this->length; ++i)
        for (int j = 0; j < this->cur_counts[i]; ++j)
        {
            k = 1;
            x *= multipliers[i];
        }
    return x * k;
 }
--- a/src/MixedBaseCounter.hpp
+++ b/src/MixedBaseCounter.hpp
@@ -0,0 +1,16 @@
 #ifndef MIXEDBASECOUNTER_HPP
 #define MIXEDBASECOUNTER_HPP
 //! Counter with a separate maximum for every digit ("mixed base"), used to enumerate
 //! combinations of prime factor counts.
 class MixedBaseCounter
 {
 private:
    int length;             //!< number of prime factor counts (cannot exceed 32 for a 32-bit integer)
    int max_counts[32 + 1]; //!< maximum value for prime factor counts
    int cur_counts[32 + 1]; //!< current prime factor counts
 public:
    //! Builds a counter with `length` digits whose maxima are taken from `counts`; starts at zero.
    MixedBaseCounter(int* counts, int length);
    //! Builds a zeroed counter whose maxima are left's maxima minus right's current digits.
    MixedBaseCounter(MixedBaseCounter& left, MixedBaseCounter& right);
    //! Advances to the next value, carrying between digits.
    void next();
    //! Returns 1 if all digits are zero, 0 otherwise.
    int is_zero();
    //! Returns the product of multipliers[i]^cur_counts[i], or 0 if all digits are zero.
    int product(int* multipliers);
 };
 #endif // MIXEDBASECOUNTER_HPP
--- a/src/OptimizeProblem.cpp
+++ b/src/OptimizeProblem.cpp
@@ -0,0 +1,427 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*
 * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /*!
 @file OptimizeProblem.cpp
 HPCG routine
 */
 #include "OptimizeProblem.hpp"
 #include "CpuKernels.hpp"
 #include "CudaKernels.hpp"
 #include "Cuda.hpp"
 #include "WriteProblem.hpp"
 #include "mytimer.hpp"
 extern bool Use_Hpcg_Mem_Reduction; /* Use HPCG aggressive memory reduction (defined in another translation unit) */
 /*!
  Optimizes the data structures used for CG iteration to increase the
  performance of the benchmark version of the preconditioned CG algorithm.
  @param[inout] A      The known system matrix, also contains the MG hierarchy in attributes Ac and mgData.
  @param[inout] data   The data structure with all necessary CG vectors preallocated
  @param[inout] b      The known right hand side vector
  @param[inout] x      The solution vector to be computed in future CG iteration
  @param[inout] xexact The exact solution vector
  @return returns 0 upon success and non-zero otherwise
  @see GenerateGeometry
  @see GenerateProblem
 */
 #ifdef USE_CUDA
 size_t OptimizeProblemGpu(SparseMatrix& A_in, CGData& data, Vector& b, Vector& x, Vector& xexact)
 {
    // Builds the GPU-side optimized representation for the matrix and for each coarser level reached
    // through A->Ac (4 levels in total): coloring/permutation, sliced-ELL storage of A, L and U, the
    // cuSPARSE matrix/vector descriptors, and the SpMV/SpSV work buffers. A second pass permutes the
    // fine-to-coarse operators. Always returns 0.
    // NOTE: `data` and `xexact` are not referenced in this function, and the status codes returned by
    // the CUDA/cuSPARSE calls are not checked.
    SparseMatrix* A = &A_in;
    local_int_t numberOfMgLevels = 4;
    local_int_t slice_size = A->slice_size; // taken from the finest level and reused for all levels
    for (int level = 0; level < numberOfMgLevels; ++level)
    {
        const local_int_t nrow = A->localNumberOfRows;
        int totalColors = 8; // fixed value stored in A->totalColors below; num_colors is not used for it
        // Let's deal with perm and iperm
        SetVectorAscCuda(A->ref2opt, nrow);
        SetVectorAscCuda(A->opt2ref, nrow);
        // Let us color the matrix
        int num_colors = 0; // output argument of ColorMatrixCuda; not read afterwards in this function
        ColorMatrixCuda(NULL, A->gpuAux.columns, A->gpuAux.nnzPerRow, A->localNumberOfRows, A->gpuAux.color,
            &(num_colors), A->gpuAux.colorCountCpu, 8, A->ref2opt, A->opt2ref, A->geom->rank, A->geom->nx, NULL);
        A->totalColors = totalColors;
        PermElemToSendCuda(A->totalToBeSent, A->gpuAux.elementsToSend, A->ref2opt);
        // Create (S)ELL
        // The permuted columns/values are written at an offset of one slice worth of entries
        // (slice_size * HPCG_MAX_ROW_LEN) into the sellAPerm arrays; TransposeCuda below is then given
        // the start of those arrays.
        local_int_t TranslateIndex = slice_size * HPCG_MAX_ROW_LEN;
        local_int_t* translated_ell_col_index = A->sellAPermColumns + TranslateIndex;
        double* translated_ell_values = A->sellAPermValues + TranslateIndex;
        EllPermColumnsValuesCuda(nrow, A->gpuAux.nnzPerRow, A->gpuAux.columns, A->gpuAux.values,
            A->gpuAux.csrAPermOffsets, translated_ell_col_index, translated_ell_values, A->opt2ref, A->ref2opt,
            A->gpuAux.sellADiagonalIdx, A->gpuAux.csrLPermOffsets, A->gpuAux.csrUPermOffsets, false);
        // Column-major blocked/sliced ELLPACK
        TransposeCuda(nrow, slice_size, A->sellAPermColumns, A->sellAPermValues);
        // Per block max row len
        local_int_t num_slices = (nrow + slice_size - 1) / slice_size; // ceil(nrow / slice_size)
        EllMaxRowLenPerBlockCuda(nrow, slice_size, A->gpuAux.csrLPermOffsets, A->gpuAux.csrUPermOffsets,
            A->sellLSliceMrl, A->sellUSliceMrl);
        // Find prefix sum for sliced ell
        PrefixsumCuda(num_slices, A->sellLSliceMrl);
        MultiplyBySliceSizeCUDA(num_slices, slice_size, A->sellLSliceMrl + 1);
        PrefixsumCuda(num_slices, A->sellUSliceMrl);
        MultiplyBySliceSizeCUDA(num_slices, slice_size, A->sellUSliceMrl + 1);
        // Set the general matrix slice_offsets
        CreateAMatrixSliceOffsetsCuda(num_slices + 1, A->slice_size, A->sellASliceMrl);
        // Lower Upper ELL variant parts
        CreateSellLUColumnsValuesCuda(nrow, slice_size, A->sellAPermColumns, A->sellAPermValues, A->sellLSliceMrl,
            A->sellLPermColumns, A->sellLPermValues, A->sellUSliceMrl, A->sellUPermColumns, A->sellUPermValues, level);
        local_int_t sell_slices = (nrow + slice_size - 1) / slice_size; // same value as num_slices
        // Nonzeros of one strict triangle: all local nonzeros minus the nrow diagonal entries and the
        // external (extNnz) entries, halved.
        const local_int_t half_nnz = (A->localNumberOfNonzeros - nrow - A->extNnz) / 2;
        // Fetch the last slice offset of L and of U (their padded sizes) from the device.
        // NOTE(review): both values are passed to cusparseCreateSlicedEll right below without an explicit
        // synchronization of `stream` in this function -- confirm the asynchronous copies have completed
        // by then (e.g. because the destinations are pageable host variables).
        local_int_t sell_l_nnz = 0;
        cudaMemcpyAsync(
            &sell_l_nnz, &(A->sellLSliceMrl[sell_slices]), sizeof(local_int_t), cudaMemcpyDeviceToHost, stream);
        local_int_t sell_u_nnz = 0;
        cudaMemcpyAsync(
            &sell_u_nnz, &(A->sellUSliceMrl[sell_slices]), sizeof(local_int_t), cudaMemcpyDeviceToHost, stream);
        auto INDEX_TYPE = CUSPARSE_INDEX_32I;
 #ifdef INDEX_64 // In src/Geometry
        INDEX_TYPE = CUSPARSE_INDEX_64I;
 #endif
        // Sliced-ELL descriptors for L, U and the full matrix A.
        cusparseCreateSlicedEll(&(A->cusparseOpt.matL), nrow, nrow, half_nnz, sell_l_nnz, slice_size,
            A->sellLSliceMrl, A->sellLPermColumns, A->sellLPermValues, INDEX_TYPE, INDEX_TYPE, CUSPARSE_INDEX_BASE_ZERO,
            CUDA_R_64F);
        cusparseCreateSlicedEll(&(A->cusparseOpt.matU), nrow, nrow, half_nnz, sell_u_nnz, slice_size,
            A->sellUSliceMrl, A->sellUPermColumns, A->sellUPermValues, INDEX_TYPE, INDEX_TYPE, CUSPARSE_INDEX_BASE_ZERO,
            CUDA_R_64F);
        local_int_t sell_nnz = sell_slices * slice_size * HPCG_MAX_ROW_LEN;
        cusparseCreateSlicedEll(&(A->cusparseOpt.matA), nrow, nrow, A->localNumberOfNonzeros, sell_nnz, slice_size,
            A->sellASliceMrl, A->sellAPermColumns, A->sellAPermValues, INDEX_TYPE, INDEX_TYPE, CUSPARSE_INDEX_BASE_ZERO,
            CUDA_R_64F);
        double alpha = 1.0, beta = 0.0;
        size_t e_buf_size = 0; // never changed; only takes part in the max() below
        size_t l_buf_size = 0, u_buf_size = 0, i_buf_size = 0, max_buf_size = 0;
        // dummy1/dummy2 are temporary descriptors used only for the buffer-size queries and destroyed at
        // the end of the iteration; vecX/vecY are kept in A->cusparseOpt.
        // NOTE(review): at every level the descriptors wrap x.values_d / b.values_d of the vectors passed
        // to this function, with that level's nrow -- presumably placeholders; confirm against the solver.
        cusparseDnVecDescr_t dummy1, dummy2;
        cusparseCreateDnVec(&dummy1, nrow, x.values_d, CUDA_R_64F);
        cusparseCreateDnVec(&dummy2, nrow, b.values_d, CUDA_R_64F);
        cusparseCreateDnVec(&(A->cusparseOpt.vecX), nrow, x.values_d, CUDA_R_64F);
        cusparseCreateDnVec(&(A->cusparseOpt.vecY), nrow, b.values_d, CUDA_R_64F);
        max_buf_size = e_buf_size;
        // MV
        // Query the SpMV work-buffer sizes for L, U and A; one buffer of the largest size is allocated below.
        cusparseSpMV_bufferSize(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matL, dummy1,
            &beta, dummy2, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &l_buf_size);
        cusparseSpMV_bufferSize(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matU, dummy1,
            &beta, dummy2, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &u_buf_size);
        cusparseSpMV_bufferSize(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matA, dummy1,
            &beta, dummy2, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &i_buf_size);
        max_buf_size = std::max(std::max(i_buf_size, e_buf_size), std::max(u_buf_size, l_buf_size));
        // SV
        // Lower
        size_t buffer_size_sv_l, buffer_size_sv_u;
        cusparseFillMode_t fillmode_l = CUSPARSE_FILL_MODE_LOWER;
        cusparseFillMode_t fillmode_u = CUSPARSE_FILL_MODE_UPPER;
        cusparseDiagType_t diagtype = CUSPARSE_DIAG_TYPE_NON_UNIT;
        cusparseSpSV_createDescr(&A->cusparseOpt.spsvDescrL);
        cusparseSpSV_createDescr(&A->cusparseOpt.spsvDescrU);
        cusparseSpMatSetAttribute(A->cusparseOpt.matL, CUSPARSE_SPMAT_DIAG_TYPE, &(diagtype), sizeof(diagtype));
        cusparseSpMatSetAttribute(A->cusparseOpt.matL, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
        // The SpSV buffer is allocated here only without memory reduction or when nrow is not a multiple of 8.
        // NOTE(review): otherwise A->bufferSvL is used as-is by the analysis below -- presumably it was set
        // up elsewhere; verify.
        if (!Use_Hpcg_Mem_Reduction || (nrow % 8 != 0))
        {
            cusparseSpSV_bufferSize(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matL,
                A->cusparseOpt.vecX, A->cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT,
                A->cusparseOpt.spsvDescrL, &buffer_size_sv_l);
            cudaMalloc(&A->bufferSvL, buffer_size_sv_l);
        }
        cusparseSpSV_analysis(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matL,
            A->cusparseOpt.vecX, A->cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A->cusparseOpt.spsvDescrL,
            A->bufferSvL);
        // After the analysis, hand the diagonal (A->diagonal) to the lower-solve descriptor.
        cusparseSpSV_updateMatrix(
            cusparsehandle, A->cusparseOpt.spsvDescrL, A->diagonal, CUSPARSE_SPSV_UPDATE_DIAGONAL);
        // Upper: same sequence as for L (only the fill mode is set on matU; no diag type attribute is set here).
        cusparseSpMatSetAttribute(A->cusparseOpt.matU, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
        if (!Use_Hpcg_Mem_Reduction || (nrow % 8 != 0))
        {
            cusparseSpSV_bufferSize(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matU,
                A->cusparseOpt.vecX, A->cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT,
                A->cusparseOpt.spsvDescrU, &buffer_size_sv_u);
            cudaMalloc(&A->bufferSvU, buffer_size_sv_u);
        }
        cusparseSpSV_analysis(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matU,
            A->cusparseOpt.vecX, A->cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A->cusparseOpt.spsvDescrU,
            A->bufferSvU);
        cusparseSpSV_updateMatrix(
            cusparsehandle, A->cusparseOpt.spsvDescrU, A->diagonal, CUSPARSE_SPSV_UPDATE_DIAGONAL);
        // One SpMV work buffer sized for the largest of the three queries (skipped when none is needed).
        if (max_buf_size > 0)
            cudaMalloc(&(A->bufferMvA), max_buf_size);
        cusparseDestroyDnVec(dummy1);
        cusparseDestroyDnVec(dummy2);
        // //////////////////////////////////////////////////////////////////////////
        // Continue with the next coarser level.
        A = A->Ac;
    }
    // Second pass over the fine/coarse pairs (3 pairs for 4 levels): permute the fine-to-coarse operator
    // using the fine level's ref2opt and the coarse level's opt2ref.
    A = &A_in;
    for (int level = 1; level < numberOfMgLevels; ++level)
    {
        const local_int_t nrow_c = A->Ac->localNumberOfRows;
        const local_int_t nrow_f = A->localNumberOfRows; // not used below
        F2cPermCuda(nrow_c, A->gpuAux.f2c, A->f2cPerm, A->ref2opt, A->Ac->opt2ref);
        A = A->Ac;
    }
    return 0;
 }
 #endif
 #ifdef USE_GRACE
 size_t OptimizeProblemCpu(SparseMatrix& A_in, CGData& data, Vector& b, Vector& x, Vector& xexact)
 {
    // CPU (USE_GRACE) counterpart of the GPU optimization: for the matrix and each coarser level reached
    // through A->Ac (4 levels in total) it colors and reorders the matrix, creates the NVPL Sparse
    // sliced-ELL descriptors for L, U and A, and runs the SpMV buffer-size queries and the SpSV analysis.
    // A second pass permutes the fine-to-coarse operators.
    // Returns the value of AllocateMemCpu(A_in) plus the sizes of the SpSV buffers allocated here.
    // NOTE: `data` and `xexact` are not referenced in this function, and the status codes returned by
    // the nvpl_sparse_* calls are not checked.
    // Initialize data structures
    size_t mem = AllocateMemCpu(A_in);
    SparseMatrix* A = &A_in;
    local_int_t numberOfMgLevels = 4;
    local_int_t slice_size = A->slice_size; // taken from the finest level and reused for all levels
    for (int level = 0; level < numberOfMgLevels; ++level)
    {
        // Color the matrix
        int num_colors;
        ColorMatrixCpu(*A, &num_colors);
        A->totalColors = num_colors;
        // Compute when each color starts
        // (running sum of the per-color row counts: firstRowOfColor[c] = sum of nRowsWithColor[0..c-1])
        A->cpuAux.firstRowOfColor[0] = 0;
        for (int c = 1; c < A->totalColors; c++)
        {
            A->cpuAux.firstRowOfColor[c] = A->cpuAux.firstRowOfColor[c - 1] + A->cpuAux.nRowsWithColor[c - 1];
        }
        // Reorder the matrix
        CreateSellPermCpu(*A);
 #ifndef HPCG_NO_MPI
        // Translate row IDs that will be sent to neighbours (reference numbering -> optimized numbering)
 #pragma omp parallel for
        for (local_int_t i = 0; i < A->totalToBeSent; i++)
        {
            local_int_t orig = A->elementsToSend[i];
            A->elementsToSend[i] = A->ref2opt[orig];
        }
 #endif
        local_int_t numberOfNonzerosPerRow = HPCG_MAX_ROW_LEN;
        local_int_t nrow = A->localNumberOfRows;
        // Nonzeros of one strict triangle: all local nonzeros minus the nrow diagonal entries and the
        // external (extNnz) entries, halved.
        local_int_t half_nnz = (A->localNumberOfNonzeros - nrow - A->extNnz) / 2;
        local_int_t num_slices = (nrow + slice_size - 1) / slice_size; // ceil(nrow / slice_size)
        // The last slice offset of L / U is used as the padded size of the respective sliced-ELL matrix.
        local_int_t sell_l_nnz = A->sellLSliceMrl[num_slices];
        local_int_t sell_u_nnz = A->sellUSliceMrl[num_slices];
        local_int_t sell_nnz = num_slices * slice_size * numberOfNonzerosPerRow;
        auto INDEX_TYPE = NVPL_SPARSE_INDEX_32I;
 #ifdef INDEX_64 // In src/Geometry
        INDEX_TYPE = NVPL_SPARSE_INDEX_64I;
 #endif
        // Sliced-ELL descriptors for L, U and the full matrix A.
        nvpl_sparse_create_sliced_ell(&(A->nvplSparseOpt.matL), nrow, nrow, half_nnz, sell_l_nnz, slice_size,
            A->sellLSliceMrl, A->sellLPermColumns, A->sellLPermValues, INDEX_TYPE, INDEX_TYPE,
            NVPL_SPARSE_INDEX_BASE_ZERO, NVPL_SPARSE_R_64F);
        nvpl_sparse_create_sliced_ell(&(A->nvplSparseOpt.matU), nrow, nrow, half_nnz, sell_u_nnz, slice_size,
            A->sellUSliceMrl, A->sellUPermColumns, A->sellUPermValues, INDEX_TYPE, INDEX_TYPE,
            NVPL_SPARSE_INDEX_BASE_ZERO, NVPL_SPARSE_R_64F);
        nvpl_sparse_create_sliced_ell(&(A->nvplSparseOpt.matA), nrow, nrow, A->localNumberOfNonzeros, sell_nnz,
            slice_size, A->sellASliceMrl, A->sellAPermColumns, A->sellAPermValues, INDEX_TYPE, INDEX_TYPE,
            NVPL_SPARSE_INDEX_BASE_ZERO, NVPL_SPARSE_R_64F);
        double alpha = 1.0, beta = 0.0;
        size_t e_buf_size = 0; // never changed; only takes part in the max() below
        size_t l_buf_size = 0, u_buf_size = 0, i_buf_size = 0, max_buf_size = 0;
        // NOTE(review): at every level the descriptors wrap x.values / b.values of the vectors passed to
        // this function, with that level's nrow -- presumably placeholders; confirm against the solver.
        nvpl_sparse_create_dn_vec(&(A->nvplSparseOpt.vecX), nrow, x.values, NVPL_SPARSE_R_64F);
        nvpl_sparse_create_dn_vec(&(A->nvplSparseOpt.vecY), nrow, b.values, NVPL_SPARSE_R_64F);
        max_buf_size = e_buf_size;
        // MV: create an SpMV descriptor and query the work-buffer size for L, U and A.
        // //Lower
        nvpl_sparse_spmv_create_descr(&A->nvplSparseOpt.spmvLDescr);
        nvpl_sparse_spmv_buffer_size(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
            A->nvplSparseOpt.matL, A->nvplSparseOpt.vecX, &beta, A->nvplSparseOpt.vecY, A->nvplSparseOpt.vecY,
            NVPL_SPARSE_R_64F, NVPL_SPARSE_SPMV_ALG_DEFAULT, A->nvplSparseOpt.spmvLDescr, &l_buf_size);
        // //Upper
        nvpl_sparse_spmv_create_descr(&A->nvplSparseOpt.spmvUDescr);
        nvpl_sparse_spmv_buffer_size(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
            A->nvplSparseOpt.matU, A->nvplSparseOpt.vecX, &beta, A->nvplSparseOpt.vecY, A->nvplSparseOpt.vecY,
            NVPL_SPARSE_R_64F, NVPL_SPARSE_SPMV_ALG_DEFAULT, A->nvplSparseOpt.spmvUDescr, &u_buf_size);
        // //L+D+U
        nvpl_sparse_spmv_create_descr(&A->nvplSparseOpt.spmvADescr);
        nvpl_sparse_spmv_buffer_size(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
            A->nvplSparseOpt.matA, A->nvplSparseOpt.vecX, &beta, A->nvplSparseOpt.vecY, A->nvplSparseOpt.vecY,
            NVPL_SPARSE_R_64F, NVPL_SPARSE_SPMV_ALG_DEFAULT, A->nvplSparseOpt.spmvADescr, &i_buf_size);
        // NOTE: max_buf_size is computed here but not read again in this function.
        max_buf_size = std::max(std::max(i_buf_size, e_buf_size), std::max(u_buf_size, l_buf_size));
        // //SV
        // //Lower
        size_t buffer_size_sv_l, buffer_size_sv_u;
        nvpl_sparse_fill_mode_t fillmode_l = NVPL_SPARSE_FILL_MODE_LOWER;
        nvpl_sparse_fill_mode_t fillmode_u = NVPL_SPARSE_FILL_MODE_UPPER;
        nvpl_sparse_diag_type_t diagtype = NVPL_SPARSE_DIAG_TYPE_NON_UNIT;
        nvpl_sparse_spsv_create_descr(&A->nvplSparseOpt.spsvDescrL);
        nvpl_sparse_spsv_create_descr(&A->nvplSparseOpt.spsvDescrU);
        nvpl_sparse_sp_mat_set_attribute(
            A->nvplSparseOpt.matL, NVPL_SPARSE_SPMAT_DIAG_TYPE, &(diagtype), sizeof(diagtype));
        nvpl_sparse_sp_mat_set_attribute(
            A->nvplSparseOpt.matL, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
        // Temporary copy of the matrix diagonal, used by the diagonal updates below and released at the
        // end of this iteration.
        Vector origDiagA;
        InitializeVector(origDiagA, A->localNumberOfRows, CPU);
        CopyMatrixDiagonal(*A, origDiagA);
        // Lower triangular solve setup. Two variants:
        //  - without memory reduction (or when the row count is not a multiple of 8): analyse the full
        //    matrix matA with the fill mode set to "lower", using a buffer allocated here;
        //  - otherwise: pass strictly L, and then update the diagonal.
        if (!Use_Hpcg_Mem_Reduction || A->localNumberOfRows % 8 != 0)
        {
            nvpl_sparse_sp_mat_set_attribute(
                A->nvplSparseOpt.matA, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
            nvpl_sparse_spsv_buffer_size(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
                A->nvplSparseOpt.matA, A->nvplSparseOpt.vecX, A->nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
                NVPL_SPARSE_SPSV_ALG_DEFAULT, A->nvplSparseOpt.spsvDescrL, &buffer_size_sv_l);
            A->bufferSvL = new char[buffer_size_sv_l];
            mem += buffer_size_sv_l; // accounted for in the returned byte count
            nvpl_sparse_spsv_analysis(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
                A->nvplSparseOpt.matA, A->nvplSparseOpt.vecX, A->nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
                NVPL_SPARSE_SPSV_ALG_DEFAULT, A->nvplSparseOpt.spsvDescrL, A->bufferSvL);
        }
        else
        {
            // NOTE(review): A->bufferSvL is not allocated in this branch -- presumably it was set up
            // elsewhere (e.g. by AllocateMemCpu); verify.
            nvpl_sparse_spsv_analysis(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
                A->nvplSparseOpt.matL, A->nvplSparseOpt.vecX, A->nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
                NVPL_SPARSE_SPSV_ALG_DEFAULT, A->nvplSparseOpt.spsvDescrL, A->bufferSvL);
            nvpl_sparse_spsv_update_matrix(
                nvpl_sparse_handle, A->nvplSparseOpt.spsvDescrL, origDiagA.values, NVPL_SPARSE_SPSV_UPDATE_DIAGONAL);
        }
        // Upper triangular solve setup, mirroring the lower case: either analyse matA with the fill mode
        // switched to "upper", or pass strictly U and then update the diagonal.
        nvpl_sparse_sp_mat_set_attribute(
            A->nvplSparseOpt.matU, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
        if (!Use_Hpcg_Mem_Reduction || A->localNumberOfRows % 8 != 0)
        {
            nvpl_sparse_sp_mat_set_attribute(
                A->nvplSparseOpt.matA, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
            nvpl_sparse_spsv_buffer_size(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
                A->nvplSparseOpt.matA, A->nvplSparseOpt.vecX, A->nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
                NVPL_SPARSE_SPSV_ALG_DEFAULT, A->nvplSparseOpt.spsvDescrU, &buffer_size_sv_u);
            A->bufferSvU = new char[buffer_size_sv_u];
            mem += buffer_size_sv_u; // accounted for in the returned byte count
            nvpl_sparse_spsv_analysis(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
                A->nvplSparseOpt.matA, A->nvplSparseOpt.vecX, A->nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
                NVPL_SPARSE_SPSV_ALG_DEFAULT, A->nvplSparseOpt.spsvDescrU, A->bufferSvU);
        }
        else
        {
            // NOTE(review): as for L, A->bufferSvU is not allocated in this branch; verify where it is set.
            nvpl_sparse_spsv_analysis(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
                A->nvplSparseOpt.matU, A->nvplSparseOpt.vecX, A->nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
                NVPL_SPARSE_SPSV_ALG_DEFAULT, A->nvplSparseOpt.spsvDescrU, A->bufferSvU);
            nvpl_sparse_spsv_update_matrix(
                nvpl_sparse_handle, A->nvplSparseOpt.spsvDescrU, origDiagA.values, NVPL_SPARSE_SPSV_UPDATE_DIAGONAL);
        }
        DeleteVector(origDiagA);
        //////////////////////////////////////////////////////////////////////////////////////////////////////////
        // Continue with the next coarser level.
        A = A->Ac;
    }
    // Second pass over the fine/coarse pairs (3 pairs for 4 levels).
    A = &A_in;
    for (int level = 1; level < numberOfMgLevels; level++)
    {
        local_int_t nrow_c = A->Ac->localNumberOfRows;
        local_int_t nrow_f = A->localNumberOfRows; // not used below
        // Permute space injector operator
        F2cPermCpu(nrow_c, A->mgData->f2cOperator, A->f2cPerm, A->ref2opt, A->Ac->opt2ref);
        A = A->Ac;
    }
    return mem;
 }
 #endif // USE_GRACE
 /*!
  Dispatches to the optimization routine matching the rank type of the matrix:
  the GPU routine when A_in.rankType is GPU (and USE_CUDA is defined), the CPU
  routine otherwise (when USE_GRACE is defined).

  @return the value returned by the selected routine, or 0 if the matching routine is not compiled in
 */
 size_t OptimizeProblem(SparseMatrix& A_in, CGData& data, Vector& b, Vector& x, Vector& xexact)
 {
 #ifdef USE_CUDA
    if (A_in.rankType == GPU)
        return OptimizeProblemGpu(A_in, data, b, x, xexact);
 #endif
 #ifdef USE_GRACE
    if (A_in.rankType != GPU)
        return OptimizeProblemCpu(A_in, data, b, x, xexact);
 #endif
    // No routine available for this rank type in this build.
    return 0;
 }
 // Helper function (see OptimizeProblem.hpp for details)
 // NOTE(review): this always reports 0.0 and ignores A, although OptimizeProblem() in this file is
 // non-trivial and OptimizeProblemCpu() returns a byte count -- confirm whether the memory report is
 // meant to be produced from that return value instead.
 double OptimizeProblemMemoryUse(const SparseMatrix& A)
 {
    return 0.0;
 }
--- a/src/OptimizeProblem.hpp
+++ b/src/OptimizeProblem.hpp
@@ -0,0 +1,30 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 #ifndef OPTIMIZEPROBLEM_HPP
 #define OPTIMIZEPROBLEM_HPP
 #include "CGData.hpp"
 #include "SparseMatrix.hpp"
 #include "Vector.hpp"
 // Optimizes the data structures of the problem (matrix A including its coarse levels) for the
 // benchmark run. Returns a size_t produced by the selected implementation: in OptimizeProblem.cpp
 // the CPU path returns an accumulated byte count and the GPU path returns 0.
 size_t OptimizeProblem(SparseMatrix& A, CGData& data, Vector& b, Vector& x, Vector& xexact);
 // This helper function should be implemented in a non-trivial way if OptimizeProblem is non-trivial
 // It should return as type double, the total number of bytes allocated and retained after calling OptimizeProblem.
 // This value will be used to report Gbytes used in ReportResults (the value returned will be divided by 1000000000.0).
 double OptimizeProblemMemoryUse(const SparseMatrix& A);
 #endif // OPTIMIZEPROBLEM_HPP
--- a/src/OutputFile.cpp
+++ b/src/OutputFile.cpp
@@ -0,0 +1,176 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*
 * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include <ctime>
 #include <fstream>
 #include <iostream>
 #include <list>
 #include <sstream>
 #include <string>
 #include "OutputFile.hpp"
 using std::string;
 using std::stringstream;
 using std::list;
 using std::ofstream;
 // Defined in another translation unit. Non-zero: OutputFile::generate() writes the report to a file;
 // zero: the report is printed to standard output.
 extern int use_output_file;
 // Root-node constructor: stores the benchmark name and version (both used by generate()) and
 // sets the line terminator to "\n" and the key separator to "::".
 OutputFile::OutputFile(const string& name_arg, const string& version_arg)
    : name(name_arg)
    , version(version_arg)
    , eol("\n")
    , keySeparator("::")
 {
 }
 // Default constructor (used by allocKeyVal() for key/value nodes): name, version, key and value
 // start out empty; only the line terminator and the key separator are set.
 OutputFile::OutputFile(void)
    : eol("\n")
    , keySeparator("::")
 {
 }
 // Destructor: deletes every descendant node (each of which deletes its own descendants in turn).
 OutputFile::~OutputFile()
 {
    while (!descendants.empty())
    {
        delete descendants.front();
        descendants.pop_front();
    }
 }
 void OutputFile::add(const string& key_arg, const string& value_arg)
 {
    descendants.push_back(allocKeyVal(key_arg, value_arg));
 }
 void OutputFile::add(const string& key_arg, double value_arg)
 {
    stringstream ss;
    ss << value_arg;
    descendants.push_back(allocKeyVal(key_arg, ss.str()));
 }
 void OutputFile::add(const string& key_arg, int value_arg)
 {
    stringstream ss;
    ss << value_arg;
    descendants.push_back(allocKeyVal(key_arg, ss.str()));
 }
 #ifndef HPCG_NO_LONG_LONG
 void OutputFile::add(const string& key_arg, long long value_arg)
 {
    stringstream ss;
    ss << value_arg;
    descendants.push_back(allocKeyVal(key_arg, ss.str()));
 }
 #endif
 void OutputFile::add(const string& key_arg, size_t value_arg)
 {
    stringstream ss;
    ss << value_arg;
    descendants.push_back(allocKeyVal(key_arg, ss.str()));
 }
 void OutputFile::setKeyValue(const string& key_arg, const string& value_arg)
 {
    key = key_arg;
    value = value_arg;
 }
 // Returns the first direct descendant whose key equals key_arg, or a null pointer if there is none.
 OutputFile* OutputFile::get(const string& key_arg)
 {
    list<OutputFile*>::iterator it = descendants.begin();
    while (it != descendants.end() && (*it)->key != key_arg)
    {
        ++it;
    }
    if (it == descendants.end())
        return 0;
    return *it;
 }
 // Renders this node as "<prefix><key>=<value><eol>" followed by the rendering of all descendants,
 // each of which receives "<prefix><key><keySeparator>" as its prefix.
 string OutputFile::generateRecursive(string prefix)
 {
    const string childPrefix = prefix + key + keySeparator;
    string result = prefix + key + "=" + value + eol;
    list<OutputFile*>::iterator it = descendants.begin();
    while (it != descendants.end())
    {
        result.append((*it)->generateRecursive(childPrefix));
        ++it;
    }
    return result;
 }
 // Builds the complete report ("<name>\nversion=<version>" followed by all key/value lines) and emits it:
 //  - if use_output_file is non-zero, into the file "<name>_<version>_<YYYY-MM-DD_hh-mm-ss>.txt"
 //    (local time); a failure to open the file is reported on standard error;
 //  - otherwise to standard output.
 // Returns the report text in both cases.
 string OutputFile::generate(void)
 {
    string result = name + "\nversion=" + version + eol;
    for (list<OutputFile*>::iterator it = descendants.begin(); it != descendants.end(); ++it)
    {
        result += (*it)->generateRecursive("");
    }
    if (!use_output_file)
    {
        std::cout << result << std::flush;
        return result;
    }
    // Time stamp for the file name. localtime() may return a null pointer, and strftime() returns 0 when
    // the text does not fit; fall back to a fixed tag in either case instead of using invalid data.
    string stamp = "unknown-time";
    std::time_t rawtime;
    std::time(&rawtime);
    const std::tm* ptm = std::localtime(&rawtime);
    char sdate[64];
    if (ptm != 0 && std::strftime(sdate, sizeof(sdate), "%Y-%m-%d_%H-%M-%S", ptm) > 0)
    {
        stamp = sdate;
    }
    string filename = name + "_" + version + "_";
    filename += stamp + ".txt";
    ofstream myfile(filename.c_str());
    if (!myfile)
    {
        // Do not fail silently: the caller still receives the report as the return value.
        std::cerr << "Warning: could not open output file '" << filename << "' for writing" << std::endl;
    }
    else
    {
        myfile << result;
        myfile.close();
    }
    return result;
 }
 // Allocates a new node with `new` and stores the given key and value in it.
 // add() stores such nodes in `descendants`, where the destructor of the parent node deletes them.
 OutputFile* OutputFile::allocKeyVal(const std::string& key_arg, const std::string& value_arg)
 {
    OutputFile* node = new OutputFile();
    node->key = key_arg;
    node->value = value_arg;
    return node;
 }
--- a/src/OutputFile.hpp
+++ b/src/OutputFile.hpp
@@ -0,0 +1,161 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*!
 @file OutputFile.hpp
 HPCG output file classes
 */
 #ifndef OUTPUTFILE_HPP
 #define OUTPUTFILE_HPP
 #include <list>
 #include <string>
 //! The OutputFile class for the uniform collecting and reporting of performance data for HPCG
 /*!
  The OutputFile class facilitates easy collecting and reporting of
  key-value-formatted data that can be then registered with the HPCG results
  collection website. The keys may have hierarchy key1::key2::key3=val with
  double colon :: as a separator. A sample output may look like this (note how
  "major" and "micro" keys repeat with different ancestor keys):
 \code
 version=3.2.1alpha
 version::major=3
 version::minor=2
 version::micro=1
 version::release=alpha
 axis=xyz
 axis::major=x
 axis::minor=y
 \endcode
 */
 class OutputFile
 {
 protected:
    std::list<OutputFile*> descendants; //!< descendant elements
    std::string name;                   //!< name of the benchmark
    std::string version;                //!< version of the benchmark
    std::string key;                    //!< the key under which the element is stored
    std::string value;                  //!< the value of the stored element
    std::string eol;                    //!< end-of-line character sequence in the output file
    std::string keySeparator;           //!< character sequence to separate keys in the output file
    //! Recursively generate output string from descendant list, and their descendants and so on
    /*!
      Each element contributes one "prefix+key=value" line; its descendants are
      generated with the prefix extended by this element's key and the key separator.
      \param prefix (in) string placed in front of this element's key
    */
    std::string generateRecursive(std::string prefix);
 public:
    //! Allocate a new element with "new" and set its key and value
    /*!
      The add() overloads store the returned pointer in the descendant list.
      @param[in] key   The key that identifies the new element
      @param[in] value The value stored by the new element
      @return pointer to the newly allocated element
    */
    static OutputFile* allocKeyVal(const std::string& key, const std::string& value);
    //! Constructor: accepts name and version as strings that are used to create a file name for printing results.
    /*!
      This constructor accepts a name and version number for the benchmark that
      are used to form a file name information for results that are generated by
      the generate() method.
      \param name (in) string containing name of the benchmark
      \param version (in) string containing the version of the benchmark
    */
    OutputFile(const std::string& name, const std::string& version);
    //! Default constructor: no-arguments accepted, should be used for descendant nodes
    /*!
      This no-argument constructor can be used for descendant nodes to provide
      key1::key2::key3=val output. Unlike the root node, descendant nodes do not
      have name and version but only store key-value pairs.
    */
    OutputFile(void);
    //! Destructor
    /*!
      NOTE(review): defined in OutputFile.cpp; presumably releases the elements
      held in the descendant list -- confirm against the definition.
    */
    ~OutputFile();
    //! Create and add a descendant element with value of type "string"
    /*!
    Create and add a descendant element identified by "key" and associated with
    "value".  The element is added at the end of a list of previously added
    elements.
    @param[in] key   The key that identifies the added element and under which the element is stored
    @param[in] value The value stored by the element
    */
    void add(const std::string& key, const std::string& value);
    //! Create and add a descendant element with value of type "double"
    /*!
    Create and add a descendant element identified by "key" and associated with
    "value".  The element is added at the end of a list of previously added
    elements.
    @param[in] key   The key that identifies the added element and under which the element is stored
    @param[in] value The value stored by the element
    */
    void add(const std::string& key, double value);
    //! Create and add a descendant element with value of type "int"
    /*!
    Create and add a descendant element identified by "key" and associated with
    "value".  The element is added at the end of a list of previously added
    elements.
    @param[in] key   The key that identifies the added element and under which the element is stored
    @param[in] value The value stored by the element
    */
    void add(const std::string& key, int value);
 #ifndef HPCG_NO_LONG_LONG
    //! Create and add a descendant element with value of type "long long"
    /*!
    Create and add a descendant element identified by "key" and associated with
    "value".  The element is added at the end of a list of previously added
    elements.
    @param[in] key   The key that identifies the added element and under which the element is stored
    @param[in] value The value stored by the element
    */
    void add(const std::string& key, long long value);
 #endif
    //! Create and add a descendant element with value of type "size_t"
    /*!
    Create and add a descendant element identified by "key" and associated with
    "value".  The element is added at the end of a list of previously added
    elements.
    @param[in] key   The key that identifies the added element and under which the element is stored
    @param[in] value The value stored by the element
    */
    void add(const std::string& key, size_t value);
    //! Key-Value setter method
    /*!
    Set the key and the value of this element.
    @param[in] key   The key that identifies this element and under which the element is stored
    @param[in] value The value stored by the element
    */
    void setKeyValue(const std::string& key, const std::string& value);
    //! Get the element in the list with the given key or return NULL if not found
    /*!
    Only the direct descendants of this element are searched and the first
    element whose key matches is returned. Callers that chain
    get("...")->add(...) must make sure the key has been added before.
    @param[in] key The key to look for
    */
    OutputFile* get(const std::string& key);
    //! Generate output string with results based on the stored key-value hierarchy
    /*!
    Besides being returned, the generated text is also emitted: either to a
    time-stamped text file or to standard output, selected by the global
    use_output_file flag.
    @return the generated text
    */
    std::string generate(void);
 };
 #endif // OUTPUTFILE_HPP
--- a/src/ReadHpcgDat.cpp
+++ b/src/ReadHpcgDat.cpp
@@ -0,0 +1,79 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 #include <cstdio>
 #include "ReadHpcgDat.hpp"
 //! Consume characters up to and including the next line terminator
 /*!
  Handles "\n", "\r" and "\r\n" line endings.
  @param[in] stream the stream to read from
  @return the last character consumed: '\n', a lone '\r', or EOF when the end
          of the stream was reached (also when EOF directly follows '\r')
 */
 static int SkipUntilEol(FILE* stream)
 {
    int current = fgetc(stream);
    while (current != EOF && current != '\n' && current != '\r')
    {
        current = fgetc(stream);
    }
    if (current != '\r')
    {
        return current;
    }
    // A carriage return may be the first half of a Windows "\r\n" pair: look one character ahead.
    const int following = fgetc(stream);
    if (following == '\n' || following == EOF)
    {
        return following;
    }
    // Lone '\r': the character after it belongs to the next line, so hand it back.
    ungetc(following, stream);
    return current;
 }
 //! Read the run parameters from an HPCG input file
 /*!
  The first two lines of the file are skipped. The third line supplies the
  three local dimensions, the fourth the run time in seconds and the fifth the
  three process dimensions. Values that are missing or out of range are
  replaced by defaults.
  @param[out] localDimensions     three values; each missing or < 16 entry becomes 16
  @param[out] secondsPerRun       if non-null, receives the run time; missing or negative becomes 30 minutes
  @param[out] localProcDimensions three values; each missing or < 1 entry becomes 0 ("not specified")
  @param[in]  filename            path of the input file
  @return 0 on success, -1 when the file cannot be opened
 */
 int ReadHpcgDat(int* localDimensions, int* secondsPerRun, int* localProcDimensions, char* filename)
 {
    FILE* input = fopen(filename, "r");
    if (input == 0)
    {
        printf("Cannot open input file: %s\n", filename);
        return -1;
    }
    // Lines 1 and 2 are skipped entirely.
    SkipUntilEol(input);
    SkipUntilEol(input);
    // Line 3: local dimensions, with a minimum of 16 per direction.
    for (int axis = 0; axis < 3; ++axis)
    {
        int* const dimension = localDimensions + axis;
        const bool parsed = (fscanf(input, "%d", dimension) == 1);
        if (!parsed || *dimension < 16)
        {
            *dimension = 16;
        }
    }
    SkipUntilEol(input); // drop whatever is left of line 3
    // Line 4: run time in seconds, stored only when the caller supplied a destination.
    if (secondsPerRun != 0)
    {
        const bool parsed = (fscanf(input, "%d", secondsPerRun) == 1);
        if (!parsed || *secondsPerRun < 0)
        {
            *secondsPerRun = 30 * 60; // 30 minutes
        }
    }
    SkipUntilEol(input); // drop whatever is left of line 4
    // Line 5: process dimensions; 0 means "not specified" and it will be fixed later.
    for (int axis = 0; axis < 3; ++axis)
    {
        int* const procDimension = localProcDimensions + axis;
        const bool parsed = (fscanf(input, "%d", procDimension) == 1);
        if (!parsed || *procDimension < 1)
        {
            *procDimension = 0;
        }
    }
    fclose(input);
    return 0;
 }
--- a/src/ReadHpcgDat.hpp
+++ b/src/ReadHpcgDat.hpp
@@ -0,0 +1,20 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 #ifndef READHPCGDAT_HPP
 #define READHPCGDAT_HPP
 //! Read the run parameters from an HPCG input file
 /*!
  @param[out] localDimensions     array of three ints; missing values or values below 16 are set to 16
  @param[out] secondsPerRun       run time in seconds; may be null, in which case it is not stored;
                                  a missing or negative value is set to 1800
  @param[out] localProcDimensions array of three ints; missing values or values below 1 are set to 0
  @param[in]  filename            path of the input file
  @return 0 on success, -1 if the file cannot be opened
 */
 int ReadHpcgDat(int* localDimensions, int* secondsPerRun, int* localProcDimensions, char* filename);
 #endif // READHPCGDAT_HPP
--- a/src/ReportResults.cpp
+++ b/src/ReportResults.cpp
@@ -0,0 +1,512 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*
 * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /*!
 @file ReportResults.cpp
 HPCG routine
 */
 #ifndef HPCG_NO_MPI
 #include <mpi.h>
 #endif
 #include "OptimizeProblem.hpp"
 #include "OutputFile.hpp"
 #include "ReportResults.hpp"
 #include <vector>
 #ifdef HPCG_DEBUG
 #include <fstream>
 using std::endl;
 #include "hpcg.hpp"
 #endif
 extern int use_output_file;
 /*!
 Collects the information about the HPCG run, its results, and validity into a key-value report (OutputFile) and
 emits it. Only the process with rank 0 builds and emits the report.
  @param[in] A    The known system matrix; its geom member holds the description of the problem's geometry
  @param[in] numberOfMgLevels Number of levels in multigrid V cycle
  @param[in] numberOfCgSets Number of CG runs performed
  @param[in] refMaxIters Number of reference CG iterations per set
  @param[in] optMaxIters Number of optimized CG iterations per set
  @param[in] times  Vector of cumulative timings for each of the phases of a preconditioned CG iteration
  @param[in] testcg_data    the data structure with the results of the CG-correctness test including pass/fail
 information
  @param[in] testsymmetry_data the data structure with the results of the CG symmetry test including pass/fail
 information
  @param[in] testnorms_data the data structure with the results of the CG norm test including pass/fail information
  @param[in] global_failure indicates whether a failure occurred during the correctness tests of CG
  @param[in] quickPath indicates that the QuickPath option was selected; it changes the final message of a valid run
 that is shorter than the official minimum time
  @see OutputFile
 */
 void ReportResults(const SparseMatrix& A, int numberOfMgLevels, int numberOfCgSets, int refMaxIters, int optMaxIters,
    double times[], const TestCGData& testcg_data, const TestSymmetryData& testsymmetry_data,
    const TestNormsData& testnorms_data, int global_failure, bool quickPath)
 {
    // Entries of times[] read below, as labelled in the report: [0] total, [1] DDOT, [2] WAXPBY, [3] SpMV,
    // [4] DDOT MPI_Allreduce, [5] MG, [7] optimization phase, [8] reference SpMV+MG, [9] setup.
    double minOfficialTime = 1800; // Any official benchmark result must run at least this many seconds
 #ifndef HPCG_NO_MPI
    // Reduce times[4] to its minimum, maximum and average over MPI_COMM_WORLD. These reductions are issued
    // before the rank check below, i.e. by every process that calls this function.
    double t4 = times[4];
    double t4min = 0.0;
    double t4max = 0.0;
    double t4avg = 0.0;
    MPI_Allreduce(&t4, &t4min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
    MPI_Allreduce(&t4, &t4max, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
    MPI_Allreduce(&t4, &t4avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    t4avg = t4avg / ((double) A.geom->size);
 #endif
    if (A.geom->rank == 0)
    { // Only PE 0 needs to compute and report timing results
        // TODO: Put the FLOP count, Memory BW and Memory Usage models into separate functions
        // ======================== FLOP count model =======================================
        double fNumberOfCgSets = numberOfCgSets;
        double fniters = fNumberOfCgSets * (double) optMaxIters;
        double fnrow = A.totalNumberOfRows;
        double fnnz = A.totalNumberOfNonzeros;
        // Op counts come from implementation of CG in CG.cpp (include 1 extra for the CG preamble ops)
        double fnops_ddot = (3.0 * fniters + fNumberOfCgSets) * 2.0 * fnrow; // 3 ddots with nrow adds and nrow mults
        double fnops_waxpby
            = (3.0 * fniters + fNumberOfCgSets) * 2.0 * fnrow;            // 3 WAXPBYs with nrow adds and nrow mults
        double fnops_sparsemv = (fniters + fNumberOfCgSets) * 2.0 * fnnz; // 1 SpMV with nnz adds and nnz mults
        // Op counts from the multigrid preconditioners
        double fnops_precond = 0.0;
        const SparseMatrix* Af = &A;
        for (int i = 1; i < numberOfMgLevels; ++i)
        {
            double fnnz_Af = Af->totalNumberOfNonzeros;
            double fnumberOfPresmootherSteps = Af->mgData->numberOfPresmootherSteps;
            double fnumberOfPostsmootherSteps = Af->mgData->numberOfPostsmootherSteps;
            fnops_precond += fnumberOfPresmootherSteps * fniters * 4.0 * fnnz_Af; // number of presmoother flops
            fnops_precond += fniters * 2.0 * fnnz_Af; // cost of fine grid residual calculation
            fnops_precond += fnumberOfPostsmootherSteps * fniters * 4.0 * fnnz_Af; // number of postsmoother flops
            Af = Af->Ac;                                                           // Go to next coarse level
        }
        fnops_precond
            += fniters * 4.0 * ((double) Af->totalNumberOfNonzeros); // One symmetric GS sweep at the coarsest level
        double fnops = fnops_ddot + fnops_waxpby + fnops_sparsemv + fnops_precond;
        // The "ref" totals scale the optimized-run totals by refMaxIters / optMaxIters
        double frefnops = fnops * ((double) refMaxIters) / ((double) optMaxIters);
        // ======================== Memory bandwidth model =======================================
        // Read/Write counts come from implementation of CG in CG.cpp (include 1 extra for the CG preamble ops)
        double fnreads_ddot
            = (3.0 * fniters + fNumberOfCgSets) * 2.0 * fnrow * sizeof(double);    // 3 ddots with 2 nrow reads
        double fnwrites_ddot = (3.0 * fniters + fNumberOfCgSets) * sizeof(double); // 3 ddots with 1 write
        double fnreads_waxpby = (3.0 * fniters + fNumberOfCgSets) * 2.0 * fnrow
            * sizeof(double); // 3 WAXPBYs with 2 nrow reads
        double fnwrites_waxpby
            = (3.0 * fniters + fNumberOfCgSets) * fnrow * sizeof(double); // 3 WAXPBYs with nrow writes
        double fnreads_sparsemv = (fniters + fNumberOfCgSets)
            * (fnnz * (sizeof(double) + sizeof(local_int_t))
                + fnrow * sizeof(double)); // 1 SpMV with nnz reads of values, nnz reads indices,
        // plus nrow reads of x
        double fnwrites_sparsemv = (fniters + fNumberOfCgSets) * fnrow * sizeof(double); // 1 SpMV nrow writes
        // Read/Write counts from the multigrid preconditioners
        double fnreads_precond = 0.0;
        double fnwrites_precond = 0.0;
        Af = &A;
        for (int i = 1; i < numberOfMgLevels; ++i)
        {
            double fnnz_Af = Af->totalNumberOfNonzeros;
            double fnrow_Af = Af->totalNumberOfRows;
            double fnumberOfPresmootherSteps = Af->mgData->numberOfPresmootherSteps;
            double fnumberOfPostsmootherSteps = Af->mgData->numberOfPostsmootherSteps;
            fnreads_precond += fnumberOfPresmootherSteps * fniters
                * (2.0 * fnnz_Af * (sizeof(double) + sizeof(local_int_t))
                    + fnrow_Af * sizeof(double)); // number of presmoother reads
            fnwrites_precond
                += fnumberOfPresmootherSteps * fniters * fnrow_Af * sizeof(double); // number of presmoother writes
            fnreads_precond += fniters
                * (fnnz_Af * (sizeof(double) + sizeof(local_int_t))
                    + fnrow_Af * sizeof(double)); // Number of reads for fine grid residual calculation
            // NOTE(review): the residual and postsmoother write counts below are scaled by fnnz_Af, whereas the
            // presmoother write count above is scaled by fnrow_Af -- confirm this asymmetry is intended.
            fnwrites_precond
                += fniters * fnnz_Af * sizeof(double); // Number of writes for fine grid residual calculation
            fnreads_precond += fnumberOfPostsmootherSteps * fniters
                * (2.0 * fnnz_Af * (sizeof(double) + sizeof(local_int_t))
                    + fnrow_Af * sizeof(double)); // number of postsmoother reads
            fnwrites_precond
                += fnumberOfPostsmootherSteps * fniters * fnnz_Af * sizeof(double); // number of postsmoother writes
            Af = Af->Ac;                                                            // Go to next coarse level
        }
        double fnnz_Af = Af->totalNumberOfNonzeros;
        double fnrow_Af = Af->totalNumberOfRows;
        fnreads_precond
            += fniters * (2.0 * fnnz_Af * (sizeof(double) + sizeof(local_int_t)) + fnrow_Af * sizeof(double));
        ;                                                        // One symmetric GS sweep at the coarsest level
        fnwrites_precond += fniters * fnrow_Af * sizeof(double); // One symmetric GS sweep at the coarsest level
        double fnreads = fnreads_ddot + fnreads_waxpby + fnreads_sparsemv + fnreads_precond;
        double fnwrites = fnwrites_ddot + fnwrites_waxpby + fnwrites_sparsemv + fnwrites_precond;
        double frefnreads = fnreads * ((double) refMaxIters) / ((double) optMaxIters);
        double frefnwrites = fnwrites * ((double) refMaxIters) / ((double) optMaxIters);
        // ======================== Memory usage model =======================================
        // Data in GenerateProblem_ref
        double numberOfNonzerosPerRow
            = 27.0; // We are approximating a 27-point finite element/volume/difference 3D stencil
        double size = ((double) A.geom->size); // Needed for estimating size of halo
        double fnbytes = ((double) sizeof(Geometry));           // Geometry struct in main.cpp
        fnbytes += ((double) sizeof(double) * fNumberOfCgSets); // testnorms_data in main.cpp
        // Model for GenerateProblem_ref.cpp
        fnbytes += fnrow * sizeof(char);                                             // array nonzerosInRow
        fnbytes += fnrow * ((double) sizeof(global_int_t*));                         // mtxIndG
        fnbytes += fnrow * ((double) sizeof(local_int_t*));                          // mtxIndL
        fnbytes += fnrow * ((double) sizeof(double*));                               // matrixValues
        fnbytes += fnrow * ((double) sizeof(double*));                               // matrixDiagonal
        fnbytes += fnrow * numberOfNonzerosPerRow * ((double) sizeof(local_int_t));  // mtxIndL[1..nrows]
        fnbytes += fnrow * numberOfNonzerosPerRow * ((double) sizeof(double));       // matrixValues[1..nrows]
        fnbytes += fnrow * numberOfNonzerosPerRow * ((double) sizeof(global_int_t)); // mtxIndG[1..nrows]
        fnbytes += fnrow * ((double) 3 * sizeof(double));                            // x, b, xexact
        // Model for CGData.hpp
        double fncol = ((global_int_t) A.localNumberOfColumns)
            * size; // Estimate of the global number of columns using the value from rank 0
        fnbytes += fnrow * ((double) 2 * sizeof(double)); // r, Ap
        fnbytes += fncol * ((double) 2 * sizeof(double)); // z, p
        std::vector<double> fnbytesPerLevel(numberOfMgLevels); // Count byte usage per level (level 0 is main CG level)
        fnbytesPerLevel[0] = fnbytes;
        // Benchmarker-provided model for OptimizeProblem.cpp
        double fnbytes_OptimizedProblem = OptimizeProblemMemoryUse(A);
        fnbytes += fnbytes_OptimizedProblem;
        // Level 0 was accounted for above, so this walk starts at the first coarse matrix
        Af = A.Ac;
        for (int i = 1; i < numberOfMgLevels; ++i)
        {
            double fnrow_Af = Af->totalNumberOfRows;
            double fncol_Af = ((global_int_t) Af->localNumberOfColumns)
                * size; // Estimate of the global number of columns using the value from rank 0
            double fnbytes_Af = 0.0;
            // Model for GenerateCoarseProblem.cpp
            fnbytes_Af += fnrow_Af * ((double) sizeof(local_int_t)); // f2cOperator
            fnbytes_Af += fnrow_Af * ((double) sizeof(double));      // rc
            fnbytes_Af += 2.0 * fncol_Af
                * ((double) sizeof(double)); // xc, Axf are estimated based on the size of these arrays on rank 0
            fnbytes_Af += ((double) (sizeof(Geometry) + sizeof(SparseMatrix) + 3 * sizeof(Vector)
                + sizeof(MGData))); // Account for structs geomc, Ac, rc, xc, Axf - (minor)
            // Model for GenerateProblem.cpp (called within GenerateCoarseProblem.cpp)
            fnbytes_Af += fnrow_Af * sizeof(char);                                             // array nonzerosInRow
            fnbytes_Af += fnrow_Af * ((double) sizeof(global_int_t*));                         // mtxIndG
            fnbytes_Af += fnrow_Af * ((double) sizeof(local_int_t*));                          // mtxIndL
            fnbytes_Af += fnrow_Af * ((double) sizeof(double*));                               // matrixValues
            fnbytes_Af += fnrow_Af * ((double) sizeof(double*));                               // matrixDiagonal
            fnbytes_Af += fnrow_Af * numberOfNonzerosPerRow * ((double) sizeof(local_int_t));  // mtxIndL[1..nrows]
            fnbytes_Af += fnrow_Af * numberOfNonzerosPerRow * ((double) sizeof(double));       // matrixValues[1..nrows]
            fnbytes_Af += fnrow_Af * numberOfNonzerosPerRow * ((double) sizeof(global_int_t)); // mtxIndG[1..nrows]
 // Model for SetupHalo_ref.cpp
 #ifndef HPCG_NO_MPI
            fnbytes_Af += ((double) sizeof(double) * Af->totalToBeSent);              // sendBuffer
            fnbytes_Af += ((double) sizeof(local_int_t) * Af->totalToBeSent);         // elementsToSend
            fnbytes_Af += ((double) sizeof(int) * Af->numberOfSendNeighbors);         // neighbors
            fnbytes_Af += ((double) sizeof(local_int_t) * Af->numberOfSendNeighbors); // receiveLength, sendLength
 #endif
            fnbytesPerLevel[i] = fnbytes_Af;
            fnbytes += fnbytes_Af; // Running sum
            Af = Af->Ac;           // Go to next coarse level
        }
        assert(Af == 0); // Make sure we got to the lowest grid level
        // Count number of bytes used per equation
        double fnbytesPerEquation = fnbytes / fnrow;
        // Instantiate the report document (key-value hierarchy held by OutputFile)
        // NOTE: OutputFile::get() returns 0 for an unknown key, so every doc.get("X")->... below relies on the
        // doc.add("X", "") that precedes it.
        OutputFile doc("HPCG-Benchmark", "3.1");
        doc.add("Release date", "March 28, 2019");
        doc.add("Machine Summary", "");
        doc.get("Machine Summary")->add("Distributed Processes", A.geom->size);
        doc.get("Machine Summary")->add("Threads per processes", A.geom->numThreads);
        doc.add("Global Problem Dimensions", "");
        doc.get("Global Problem Dimensions")->add("Global nx", A.geom->gnx);
        doc.get("Global Problem Dimensions")->add("Global ny", A.geom->gny);
        doc.get("Global Problem Dimensions")->add("Global nz", A.geom->gnz);
        doc.add("Processor Dimensions", "");
        doc.get("Processor Dimensions")->add("npx", A.geom->npx);
        doc.get("Processor Dimensions")->add("npy", A.geom->npy);
        doc.get("Processor Dimensions")->add("npz", A.geom->npz);
        doc.add("Local Domain Dimensions", "");
        doc.get("Local Domain Dimensions")->add("nx", A.geom->nx);
        doc.get("Local Domain Dimensions")->add("ny", A.geom->ny);
        doc.add("########## Problem Summary  ##########", "");
        doc.add("Setup Information", "");
        doc.get("Setup Information")->add("Setup Time", times[9]);
        doc.add("Linear System Information", "");
        doc.get("Linear System Information")->add("Number of Equations", A.totalNumberOfRows);
        doc.get("Linear System Information")->add("Number of Nonzero Terms", A.totalNumberOfNonzeros);
        doc.add("Multigrid Information", "");
        doc.get("Multigrid Information")->add("Number of coarse grid levels", numberOfMgLevels - 1);
        // For grid level i the sizes are read from the coarse matrix Af->Ac, while the smoother step counts are
        // read from Af->mgData (the level above it)
        Af = &A;
        doc.get("Multigrid Information")->add("Coarse Grids", "");
        for (int i = 1; i < numberOfMgLevels; ++i)
        {
            doc.get("Multigrid Information")->get("Coarse Grids")->add("Grid Level", i);
            doc.get("Multigrid Information")
                ->get("Coarse Grids")
                ->add("Number of Equations", Af->Ac->totalNumberOfRows);
            doc.get("Multigrid Information")
                ->get("Coarse Grids")
                ->add("Number of Nonzero Terms", Af->Ac->totalNumberOfNonzeros);
            doc.get("Multigrid Information")
                ->get("Coarse Grids")
                ->add("Number of Presmoother Steps", Af->mgData->numberOfPresmootherSteps);
            doc.get("Multigrid Information")
                ->get("Coarse Grids")
                ->add("Number of Postsmoother Steps", Af->mgData->numberOfPostsmootherSteps);
            Af = Af->Ac;
        }
        doc.add("########## Memory Use Summary  ##########", "");
        doc.add("Memory Use Information", "");
        doc.get("Memory Use Information")->add("Total memory used for data (Gbytes)", fnbytes / 1000000000.0);
        doc.get("Memory Use Information")
            ->add("Memory used for OptimizeProblem data (Gbytes)", fnbytes_OptimizedProblem / 1000000000.0);
        doc.get("Memory Use Information")
            ->add("Bytes per equation (Total memory / Number of Equations)", fnbytesPerEquation);
        doc.get("Memory Use Information")
            ->add("Memory used for linear system and CG (Gbytes)", fnbytesPerLevel[0] / 1000000000.0);
        doc.get("Memory Use Information")->add("Coarse Grids", "");
        for (int i = 1; i < numberOfMgLevels; ++i)
        {
            doc.get("Memory Use Information")->get("Coarse Grids")->add("Grid Level", i);
            doc.get("Memory Use Information")
                ->get("Coarse Grids")
                ->add("Memory used", fnbytesPerLevel[i] / 1000000000.0);
        }
        doc.add("########## V&V Testing Summary  ##########", "");
        doc.add("Spectral Convergence Tests", "");
        if (testcg_data.count_fail == 0)
            doc.get("Spectral Convergence Tests")->add("Result", "PASSED");
        else
            doc.get("Spectral Convergence Tests")->add("Result", "FAILED");
        doc.get("Spectral Convergence Tests")->add("Unpreconditioned", "");
        doc.get("Spectral Convergence Tests")
            ->get("Unpreconditioned")
            ->add("Maximum iteration count", testcg_data.niters_max_no_prec);
        doc.get("Spectral Convergence Tests")
            ->get("Unpreconditioned")
            ->add("Expected iteration count", testcg_data.expected_niters_no_prec);
        doc.get("Spectral Convergence Tests")->add("Preconditioned", "");
        doc.get("Spectral Convergence Tests")
            ->get("Preconditioned")
            ->add("Maximum iteration count", testcg_data.niters_max_prec);
        doc.get("Spectral Convergence Tests")
            ->get("Preconditioned")
            ->add("Expected iteration count", testcg_data.expected_niters_prec);
        const char DepartureFromSymmetry[] = "Departure from Symmetry |x'Ay-y'Ax|/(2*||x||*||A||*||y||)/epsilon";
        doc.add(DepartureFromSymmetry, "");
        if (testsymmetry_data.count_fail == 0)
            doc.get(DepartureFromSymmetry)->add("Result", "PASSED");
        else
            doc.get(DepartureFromSymmetry)->add("Result", "FAILED");
        doc.get(DepartureFromSymmetry)->add("Departure for SpMV", testsymmetry_data.depsym_spmv);
        doc.get(DepartureFromSymmetry)->add("Departure for MG", testsymmetry_data.depsym_mg);
        doc.add("########## Iterations Summary  ##########", "");
        doc.add("Iteration Count Information", "");
        if (!global_failure)
            doc.get("Iteration Count Information")->add("Result", "PASSED");
        else
            doc.get("Iteration Count Information")->add("Result", "FAILED");
        doc.get("Iteration Count Information")->add("Reference CG iterations per set", refMaxIters);
        doc.get("Iteration Count Information")->add("Optimized CG iterations per set", optMaxIters);
        doc.get("Iteration Count Information")
            ->add("Total number of reference iterations", refMaxIters * numberOfCgSets);
        doc.get("Iteration Count Information")
            ->add("Total number of optimized iterations", optMaxIters * numberOfCgSets);
        doc.add("########## Reproducibility Summary  ##########", "");
        doc.add("Reproducibility Information", "");
        if (testnorms_data.pass)
            doc.get("Reproducibility Information")->add("Result", "PASSED");
        else
            doc.get("Reproducibility Information")->add("Result", "FAILED");
        doc.get("Reproducibility Information")->add("Scaled residual mean", testnorms_data.mean);
        doc.get("Reproducibility Information")->add("Scaled residual variance", testnorms_data.variance);
        doc.add("########## Performance Summary (times in sec) ##########", "");
        doc.add("Benchmark Time Summary", "");
        doc.get("Benchmark Time Summary")->add("Optimization phase", times[7]);
        doc.get("Benchmark Time Summary")->add("DDOT", times[1]);
        doc.get("Benchmark Time Summary")->add("WAXPBY", times[2]);
        doc.get("Benchmark Time Summary")->add("SpMV", times[3]);
        doc.get("Benchmark Time Summary")->add("MG", times[5]);
        doc.get("Benchmark Time Summary")->add("Total", times[0]);
        doc.add("Floating Point Operations Summary", "");
        doc.get("Floating Point Operations Summary")->add("Raw DDOT", fnops_ddot);
        doc.get("Floating Point Operations Summary")->add("Raw WAXPBY", fnops_waxpby);
        doc.get("Floating Point Operations Summary")->add("Raw SpMV", fnops_sparsemv);
        doc.get("Floating Point Operations Summary")->add("Raw MG", fnops_precond);
        doc.get("Floating Point Operations Summary")->add("Total", fnops);
        doc.get("Floating Point Operations Summary")->add("Total with convergence overhead", frefnops);
        doc.add("GB/s Summary", "");
        doc.get("GB/s Summary")->add("Raw Read B/W", fnreads / times[0] / 1.0E9);
        doc.get("GB/s Summary")->add("Raw Write B/W", fnwrites / times[0] / 1.0E9);
        doc.get("GB/s Summary")->add("Raw Total B/W", (fnreads + fnwrites) / (times[0]) / 1.0E9);
        doc.get("GB/s Summary")
            ->add("Total with convergence and optimization phase overhead",
                (frefnreads + frefnwrites) / (times[0] + fNumberOfCgSets * (times[7] / 10.0 + times[9] / 10.0))
                    / 1.0E9);
        doc.add("GFLOP/s Summary", "");
        doc.get("GFLOP/s Summary")->add("Raw DDOT", fnops_ddot / times[1] / 1.0E9);
        doc.get("GFLOP/s Summary")->add("Raw WAXPBY", fnops_waxpby / times[2] / 1.0E9);
        doc.get("GFLOP/s Summary")->add("Raw SpMV", fnops_sparsemv / (times[3]) / 1.0E9);
        doc.get("GFLOP/s Summary")->add("Raw MG", fnops_precond / (times[5]) / 1.0E9);
        doc.get("GFLOP/s Summary")->add("Raw Total", fnops / times[0] / 1.0E9);
        doc.get("GFLOP/s Summary")->add("Total with convergence overhead", frefnops / times[0] / 1.0E9);
        // This final GFLOP/s rating includes the overhead of problem setup and optimizing the data structures vs ten
        // sets of 50 iterations of CG
        double totalGflops = frefnops / (times[0] + fNumberOfCgSets * (times[7] / 10.0 + times[9] / 10.0)) / 1.0E9;
        // The "2.4" rating uses the same formula without the setup time times[9] in the denominator
        double totalGflops24 = frefnops / (times[0] + fNumberOfCgSets * times[7] / 10.0) / 1.0E9;
        doc.get("GFLOP/s Summary")->add("Total with convergence and optimization phase overhead", totalGflops);
        doc.add("User Optimization Overheads", "");
        doc.get("User Optimization Overheads")->add("Optimization phase time (sec)", (times[7]));
        doc.get("User Optimization Overheads")
            ->add("Optimization phase time vs reference SpMV+MG time", times[7] / times[8]);
 #ifndef HPCG_NO_MPI
        doc.add("DDOT Timing Variations", "");
        doc.get("DDOT Timing Variations")->add("Min DDOT MPI_Allreduce time", t4min);
        doc.get("DDOT Timing Variations")->add("Max DDOT MPI_Allreduce time", t4max);
        doc.get("DDOT Timing Variations")->add("Avg DDOT MPI_Allreduce time", t4avg);
 // doc.get("Sparse Operations Overheads")->add("Halo exchange time (sec)", (times[6]));
 // doc.get("Sparse Operations Overheads")->add("Halo exchange as percentage of SpMV time",
 // (times[6])/totalSparseMVTime*100.0);
 #endif
        doc.add("Final Summary", "");
        // A run is valid only if all three tests passed and no global failure was flagged
        bool isValidRun = (testcg_data.count_fail == 0) && (testsymmetry_data.count_fail == 0) && (testnorms_data.pass)
            && (!global_failure);
        if (isValidRun)
        {
            doc.get("Final Summary")->add("HPCG result is VALID with a GFLOP/s rating of", totalGflops);
            doc.get("Final Summary")->add("HPCG 2.4 rating for historical reasons is", totalGflops24);
            if (!A.isDotProductOptimized)
            {
                doc.get("Final Summary")
                    ->add("Reference version of ComputeDotProduct used",
                        "Performance results are most likely suboptimal");
            }
            if (!A.isSpmvOptimized)
            {
                doc.get("Final Summary")
                    ->add("Reference version of ComputeSPMV used", "Performance results are most likely suboptimal");
            }
            if (!A.isMgOptimized)
            {
                if (A.geom->numThreads > 1)
                    doc.get("Final Summary")
                        ->add("Reference version of ComputeMG used and number of threads greater than 1",
                            "Performance results are severely suboptimal");
                else // numThreads ==1
                    doc.get("Final Summary")
                        ->add("Reference version of ComputeMG used", "Performance results are most likely suboptimal");
            }
            if (!A.isWaxpbyOptimized)
            {
                doc.get("Final Summary")
                    ->add("Reference version of ComputeWAXPBY used", "Performance results are most likely suboptimal");
            }
            if (times[0] >= minOfficialTime)
            {
                doc.get("Final Summary")
                    ->add("Please upload results from the YAML file contents to", "http://hpcg-benchmark.org");
            }
            else
            {
                // Valid but shorter than the official minimum: the message depends on the QuickPath option
                doc.get("Final Summary")->add("Results are valid but execution time (sec) is", times[0]);
                if (quickPath)
                {
                    doc.get("Final Summary")
                        ->add("You have selected the QuickPath option",
                            "Results are official for legacy installed systems with confirmation from the HPCG "
                            "Benchmark leaders.");
                    doc.get("Final Summary")
                        ->add("After confirmation please upload results from the YAML file contents to",
                            "http://hpcg-benchmark.org");
                }
                else
                {
                    doc.get("Final Summary")
                        ->add("Official results execution time (sec) must be at least", minOfficialTime);
                }
            }
        }
        else
        {
            doc.get("Final Summary")->add("HPCG result is", "INVALID.");
            doc.get("Final Summary")
                ->add("Please review the YAML file contents", "You may NOT submit these results for consideration.");
        }
        // generate() returns the report text and also emits it (to a file or to stdout, see OutputFile::generate)
        std::string yaml = doc.generate();
 #ifdef HPCG_DEBUG
        HPCG_fout << yaml;
 #endif
    }
    return;
 }
--- a/src/ReportResults.hpp
+++ b/src/ReportResults.hpp
@@ -0,0 +1,26 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 #ifndef REPORTRESULTS_HPP
 #define REPORTRESULTS_HPP
 #include "SparseMatrix.hpp"
 #include "TestCG.hpp"
 #include "TestNorms.hpp"
 #include "TestSymmetry.hpp"
 /*!
  Declaration of the routine that builds the benchmark report (see ReportResults.cpp).
  Only the parameters whose use is visible from the end of the definition are described
  precisely; the others are presumably what their names suggest - TODO confirm against
  the full definition.
  @param[in] A                  The system matrix; optimization flags such as A.isWaxpbyOptimized are read from it
  @param[in] numberOfMgLevels   presumably the number of multigrid levels used - TODO confirm
  @param[in] numberOfCgSets     presumably the number of CG sets that were run - TODO confirm
  @param[in] refMaxIters        presumably the iteration count of the reference CG run - TODO confirm
  @param[in] optMaxIters        presumably the iteration count of the optimized CG run - TODO confirm
  @param[in] times              Timing array; times[0] is compared against the minimum official execution time (sec)
  @param[in] testcg_data        Results of the CG test
  @param[in] testsymmetry_data  Results of the symmetry test
  @param[in] testnorms_data     Results of the norms test
  @param[in] global_failure     presumably non-zero when the run must be reported as invalid - TODO confirm
  @param[in] quickPath          When the run is shorter than the official minimum, selects the QuickPath wording
 */
 void ReportResults(const SparseMatrix& A, int numberOfMgLevels, int numberOfCgSets, int refMaxIters, int optMaxIters,
    double times[], const TestCGData& testcg_data, const TestSymmetryData& testsymmetry_data,
    const TestNormsData& testnorms_data, int global_failure, bool quickPath);
 #endif // REPORTRESULTS_HPP
--- a/src/SetupHalo.cpp
+++ b/src/SetupHalo.cpp
@@ -0,0 +1,729 @@
 //@HEADER
 // ***************************************************
 //
 // HPCG: High Performance Conjugate Gradient Benchmark
 //
 // Contact:
 // Michael A. Heroux ( maherou@sandia.gov)
 // Jack Dongarra     (dongarra@eecs.utk.edu)
 // Piotr Luszczek    (luszczek@eecs.utk.edu)
 //
 // ***************************************************
 //@HEADER
 /*
 * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /*!
 @file SetupHalo.cpp
 HPCG routine
 */
 #ifndef HPCG_NO_MPI
 #include <map>
 #include <mpi.h>
 #include <set>
 #endif
 #include <algorithm>
 #ifndef HPCG_NO_OPENMP
 #include <omp.h>
 #endif
 #include "SetupHalo.hpp"
 #include "SetupHalo_ref.hpp"
 #ifdef USE_CUDA
 #include "Cuda.hpp"
 #include "CudaKernels.hpp"
 #endif
 #ifdef USE_GRACE
 #include "CpuKernels.hpp"
 #endif
 #ifndef HPCG_NO_MPI
 // Used to find ranks for CPU and GPU programs
 extern int global_total_ranks;
 extern int* physical_rank_dims;
 extern int* logical_rank_to_phys;
 extern int* rankToId_h;
 extern int* idToRank_h;
 extern p2p_comm_mode_t P2P_Mode;
 #endif
 /*!
  Prepares the system matrix data structure and creates the data necessary
  for communication of boundary values of this process.
  @param[inout] A    The known system matrix
  @see ExchangeHalo
 */
 #ifdef USE_CUDA
 /*!
  GPU (USE_CUDA) implementation of SetupHalo.
  With MPI enabled, this routine:
   - calls SetupHaloCuda to obtain the neighbor list, the per-neighbor send lengths and the
     device-side list of elements to send;
   - exchanges the element lists with every neighbor (MPI_Irecv / MPI_Send / MPI_Wait),
     using the send length as the receive length for each neighbor;
   - for each neighbor, calls ExtToLocMapCuda and ExtTolocCuda on the received indices;
   - fills the Alltoallv counts/displacements when P2P_Mode selects an Alltoallv exchange;
   - stores the resulting halo description in A.
  Without MPI the routine does nothing.
  @param[inout] A    The known system matrix
 */
 void SetupHalo_Gpu(SparseMatrix& A)
 {
 #ifndef HPCG_NO_MPI
    local_int_t localNumberOfRows = A.localNumberOfRows;
    local_int_t* send_buffer_d;
    // Leading dimension of the per-neighbor send buffer: the largest face of the local box
    local_int_t sendbufld
        = std::max(std::max(A.geom->nx * A.geom->ny, A.geom->nx * A.geom->nz), A.geom->ny * A.geom->nz);
    int* neighbors = new int[27];
    int* neighborsPhysical = new int[27];
    // NOTE(review): send_buffer_d is not released in this function; confirm whether
    //  SetupHaloCuda keeps or frees it.
    CHECK_CUDART(cudaMalloc((void**) &(send_buffer_d), 27 * sendbufld * sizeof(local_int_t)));
    local_int_t* sendLength = new local_int_t[27];
    local_int_t totalToBeSent = 0;
    int neiCount = 0;
    local_int_t* receiveLength = new local_int_t[27];
    local_int_t* sendcounts_d = NULL;
    local_int_t* elementsToSendGpu = NULL;
    CHECK_CUDART(cudaMalloc(&sendcounts_d, sizeof(local_int_t) * (27)));
    cudaMemsetAsync(sendcounts_d, 0, sizeof(local_int_t) * (27), stream);
    // Finds elements to send and neighbors
    SetupHaloCuda(A, sendbufld, sendcounts_d, send_buffer_d, &totalToBeSent, &neiCount, neighbors, sendLength,
        &elementsToSendGpu);
    local_int_t* elementsToSend = new local_int_t[totalToBeSent];
    double* sendBuffer = nullptr;
    if (totalToBeSent > 0)
    {
        cudaMemcpyAsync(
            elementsToSend, elementsToSendGpu, sizeof(local_int_t) * totalToBeSent, cudaMemcpyDeviceToHost, stream);
        // Prefix sums of the send lengths: entries of neighbor i live in [sendcounts[i], sendcounts[i + 1])
        local_int_t* sendcounts = new local_int_t[A.geom->size + 1]();
        local_int_t *eltsToRecv_d = NULL, *extToLocMap = NULL;
        for (int i = 0; i < neiCount; i++)
        {
            receiveLength[i] = sendLength[i];
            sendcounts[i + 1] = sendcounts[i] + sendLength[i];
            int neighborId = neighbors[i];
            neighborsPhysical[i] = logical_rank_to_phys[neighborId];
        }
        CHECK_CUDART(cudaMalloc(&extToLocMap, sizeof(local_int_t) * localNumberOfRows));
        CHECK_CUDART(cudaMalloc(&eltsToRecv_d, sizeof(local_int_t) * totalToBeSent));
        CHECK_CUDART(cudaMallocHost(&(sendBuffer), sizeof(double) * totalToBeSent));
        CHECK_CUDART(cudaMalloc(&(A.gpuAux.sendBuffer), sizeof(double) * totalToBeSent));
        local_int_t* eltsToRecv = new local_int_t[totalToBeSent];
        // Exchange elements to send with neighbors
        auto INDEX_TYPE = MPI_INT;
 #ifdef INDEX_64 // In src/Geometry
        INDEX_TYPE = MPI_LONG;
 #endif
        MPI_Status status;
        int MPI_MY_TAG = 93;
        MPI_Request* request = new MPI_Request[neiCount];
        // elementsToSend must be complete on the host before it is handed to MPI
        cudaStreamSynchronize(stream);
        local_int_t* recv_ptr = eltsToRecv;
        for (int i = 0; i < neiCount; i++)
        {
            auto n_recv = sendLength[i];
            MPI_Irecv(recv_ptr, n_recv, INDEX_TYPE, neighborsPhysical[i], MPI_MY_TAG, MPI_COMM_WORLD, request + i);
            recv_ptr += n_recv;
        }
        local_int_t* elts_ptr = elementsToSend;
        for (int i = 0; i < neiCount; i++)
        {
            auto n_send = sendLength[i];
            MPI_Send(elts_ptr, n_send, INDEX_TYPE, neighborsPhysical[i], MPI_MY_TAG, MPI_COMM_WORLD);
            elts_ptr += n_send;
        }
        for (int i = 0; i < neiCount; i++)
        {
            MPI_Wait(request + i, &status);
        }
        delete[] request;
        cudaMemcpyAsync(
            eltsToRecv_d, eltsToRecv, sizeof(local_int_t) * (totalToBeSent), cudaMemcpyHostToDevice, stream);
        // Add the sorted indices from neighbors. For each neighbor, add its indices sequentially
        //  before the next neighbor's indices. The indices will be adjusted to be
        //  localNumberOfRows + its sequential location
        for (int neighborCount = 0; neighborCount < neiCount; ++neighborCount)
        {
            int neighborId = neighbors[neighborCount];
            cudaMemsetAsync(extToLocMap, 0, sizeof(local_int_t) * localNumberOfRows, stream);
            local_int_t str = sendcounts[neighborCount];
            local_int_t end = sendcounts[neighborCount + 1];
            ExtToLocMapCuda(localNumberOfRows, str, end, extToLocMap, eltsToRecv_d);
            ExtTolocCuda(localNumberOfRows, neighborId, A.extNnz, A.csrExtColumns, A.csrExtValues,
                A.gpuAux.ext2csrOffsets, extToLocMap, A.gpuAux.columns);
        }
        delete[] sendcounts;
        // Make sure the host-to-device copy of eltsToRecv has been consumed before the
        //  temporary host and device buffers are released
        CHECK_CUDART(cudaStreamSynchronize(stream));
        CHECK_CUDART(cudaFree(sendcounts_d));
        CHECK_CUDART(cudaFree(extToLocMap));
        CHECK_CUDART(cudaFree(eltsToRecv_d));
        delete[] eltsToRecv;
        // For P2P Alltoallv communication
        if (P2P_Mode == MPI_GPU_All2allv || P2P_Mode == MPI_CPU_All2allv)
        {
            // Value-initialized: ranks that are not neighbors keep count 0 and displacement 0.
            // (operator new[] throws on failure, so no NULL check is needed.)
            int* sdispls = new int[A.geom->size]();
            int* rdispls = new int[A.geom->size]();
            int* scounts = new int[A.geom->size]();
            int* rcounts = new int[A.geom->size]();
            int tmp_s = 0, tmp_r = 0;
            for (local_int_t i = 0; i < neiCount; i++)
            {
                local_int_t root = neighborsPhysical[i];
                scounts[root] = sendLength[i];
                rcounts[root] = receiveLength[i];
                sdispls[root] = tmp_s;
                tmp_s += sendLength[i];
                rdispls[root] = tmp_r;
                tmp_r += receiveLength[i];
            }
            A.scounts = scounts;
            A.rcounts = rcounts;
            A.sdispls = sdispls;
            A.rdispls = rdispls;
        }
    }
    else
    {
        // Nothing to exchange: the device counters are still a temporary and must be released
        CHECK_CUDART(cudaFree(sendcounts_d));
    }
    // Store contents in our matrix struct
    A.numberOfExternalValues = totalToBeSent;
    A.localNumberOfColumns = A.localNumberOfRows + A.numberOfExternalValues;
    A.numberOfSendNeighbors = neiCount;
    A.totalToBeSent = totalToBeSent;
    A.elementsToSend = elementsToSend;
    A.gpuAux.elementsToSend = elementsToSendGpu;
    A.neighbors = neighbors;
    A.neighborsPhysical = neighborsPhysical;
    A.receiveLength = receiveLength;
    A.sendLength = sendLength;
    A.sendBuffer = sendBuffer;
 #endif
    return;
 }
 #endif
 #ifdef USE_GRACE
 /*!
  CPU (USE_GRACE) implementation of SetupHalo.
  Without MPI, the global column indices are copied to the local index storage.
  With MPI, this routine:
   - walks the 27-point stencil of every local row, writes the local column index of each
     neighbor owned by this rank into A.mtxIndL and records, per neighboring rank, the rows
     that have at least one column owned by that rank;
   - sorts the recorded rows per neighbor and exchanges them with the neighbors
     (MPI_Irecv / MPI_Send / MPI_Wait), using the send length as the receive length;
   - maps every received index to localNumberOfRows + its position in the receive order and
     writes that value into A.mtxIndL for the columns owned by another rank;
   - fills the Alltoallv counts/displacements when P2P_Mode is MPI_CPU_All2allv;
   - stores the resulting halo description in A; A.cpuAux.tempIndex receives, per row, the
     number of columns that were mapped to a neighboring rank.
  @param[inout] A    The known system matrix
 */
 void SetupHalo_Cpu(SparseMatrix& A)
 {
    // Extract Matrix pieces
    global_int_t nx = A.geom->nx;
    global_int_t ny = A.geom->ny;
    global_int_t nz = A.geom->nz;
    global_int_t gnx = A.geom->gnx;
    global_int_t gny = A.geom->gny;
    global_int_t gnz = A.geom->gnz;
    global_int_t gix0 = A.geom->gix0;
    global_int_t giy0 = A.geom->giy0;
    global_int_t giz0 = A.geom->giz0;
    int npx = A.geom->npx;
    int npy = A.geom->npy;
    local_int_t localNumberOfRows = A.localNumberOfRows;
    local_int_t* nonzerosInRow = A.nonzerosInRow;
    global_int_t** mtxIndG = A.mtxIndG;
    local_int_t** mtxIndL = A.mtxIndL;
 #ifdef HPCG_NO_MPI // In the non-MPI case we simply copy global indices to local index storage
 #ifndef HPCG_NO_OPENMP
 #pragma omp parallel for
 #endif
    for (local_int_t i = 0; i < localNumberOfRows; i++)
    {
        int cur_nnz = nonzerosInRow[i];
        for (int j = 0; j < cur_nnz; j++)
            mtxIndL[i][j] = mtxIndG[i][j];
    }
 #else // Run this section if compiling for MPI
    // For every neighboring rank, maps an index received from that rank to the local column
    //  index (>= localNumberOfRows) under which its value is addressed.
    std::map<local_int_t, std::map<global_int_t, local_int_t>> externalToLocalMap;
    // Per-row count of external columns. Value-initialized so that the array handed to
    //  A.cpuAux.tempIndex is well defined even when there is nothing to exchange.
    local_int_t* extTemp = new local_int_t[localNumberOfRows]();
    // Leading dimension of the per-neighbor send buffer: the largest face of the local box
    local_int_t sendbufld
        = std::max(std::max(A.geom->nx * A.geom->ny, A.geom->nx * A.geom->nz), A.geom->ny * A.geom->nz);
    local_int_t* send_buffer = new local_int_t[27 * sendbufld];
    char* has_external = new char[localNumberOfRows];
    local_int_t* sendcounter = new local_int_t[27];
    for (local_int_t i = 0; i < 27; i++)
        sendcounter[i] = 0;
 // Goes through all local rows, for each local point
 //  find its 27 3D neighbors (including the point itself).
 //  For each neighbor decide if it is on a different rank (halo) or local
 //  If external, add to the send buffer
 //  If local, create the local matrix
 #pragma omp parallel for
    for (local_int_t i = 0; i < localNumberOfRows; i++)
    {
        const local_int_t iz = (i / (nx * ny));
        const local_int_t iy = (i - iz * nx * ny) / nx;
        const local_int_t ix = i - (iz * ny + iy) * nx;
        const global_int_t gix = ix + gix0;
        const global_int_t giy = iy + giy0;
        const global_int_t giz = iz + giz0;
        int nnz_c = 0;
        // rank_set[id] is true once row i has been queued for the neighbor with that id
        bool rank_set[27] = {false};
        has_external[i] = 0;
        for (int k = 0; k < 27; k++)
        {
            long long int cgix = gix + tid2indCpu[k][0];
            long long int cgiy = giy + tid2indCpu[k][1];
            long long int cgiz = giz + tid2indCpu[k][2];
            int ok = cgiz > -1 && cgiz < gnz && cgiy > -1 && cgiy < gny && cgix > -1 && cgix < gnx;
            if (ok)
            {
                int ipz = cgiz / nz;
                int ipy = cgiy / ny;
                int ipx = cgix / nx;
                // For GPUCPU exec mode, find the 3D rank coordinates.
                //  For diff dim between CPU and GPU, we cannot
                //  just divide on the local dim to find ipx/ipy/ipz
                //  We must find it manually based on neighbor 3d coordinates
                //  Note the halo size is always 1
                if (A.geom->different_dim == Z)
                {
                    long long int local = cgiz - giz0;
                    if (local >= 0 && local < nz)
                        ipz = A.geom->ipz;
                    else if (local < 0)
                        ipz = A.geom->ipz - 1;
                    else if (local >= nz)
                        ipz = A.geom->ipz + 1;
                }
                else if (A.geom->different_dim == Y)
                {
                    long long int local = cgiy - giy0;
                    if (local >= 0 && local < ny)
                        ipy = A.geom->ipy;
                    else if (local < 0)
                        ipy = A.geom->ipy - 1;
                    else if (local >= ny)
                        ipy = A.geom->ipy + 1;
                }
                else if (A.geom->different_dim == X)
                {
                    long long int local = cgix - gix0;
                    if (local >= 0 && local < nx)
                        ipx = A.geom->ipx;
                    else if (local < 0)
                        ipx = A.geom->ipx - 1;
                    else if (local >= nx)
                        ipx = A.geom->ipx + 1;
                }
                // Global rank Id
                int col_rank = ipx + ipy * npx + ipz * npy * npx;
                // The neighbor point rank is diff than the current point rank
                if (A.geom->logical_rank != col_rank)
                {
                    has_external[i] = 1;
                    int rankId = rankToId_h[col_rank];
                    local_int_t* p = &(sendcounter[rankId]);
                    // Add the halo point atomically to send_buffer
                    // For all the cols in a row that has the same rank,
                    //  we add the row once to the rank buffer
                    if (!rank_set[rankId])
                    {
                        rank_set[rankId] = true;
                        local_int_t t;
 #pragma omp atomic capture
                        {
                            t = *p;
                            *p += 1;
                        }
                        send_buffer[rankId * sendbufld + t] = i;
                    }
                }
                else
                {
                    // local neighbor, add it to the local matrix
                    local_int_t zi = cgiz - giz0;
                    local_int_t yi = cgiy - giy0;
                    local_int_t xi = cgix - gix0;
                    local_int_t lcol = zi * ny * nx + yi * nx + xi;
                    mtxIndL[i][nnz_c] = lcol;
                }
                nnz_c++;
            }
        }
    }
    // Now external data structures
    // 1 Create elements to send buffer (Sort the indices for each neighbor)
    local_int_t totalToBeSent = 0;
    local_int_t* sendcounts = new local_int_t[A.geom->size + 1];
    sendcounts[0] = 0;
    int neighborCount = 0;
 #pragma omp parallel for
    for (local_int_t i = 0; i < 27; i++)
    {
        if (sendcounter[i] > 0)
        {
            std::sort(send_buffer + i * sendbufld, send_buffer + i * sendbufld + sendcounter[i]);
        }
    }
    // Prefix sums over the non-empty neighbors: entries of neighbor n live in
    //  [sendcounts[n], sendcounts[n + 1])
    for (local_int_t i = 0; i < 27; i++)
    {
        if (sendcounter[i] > 0)
        {
            totalToBeSent += sendcounter[i];
            sendcounts[neighborCount + 1] = sendcounts[neighborCount] + sendcounter[i];
            neighborCount++;
        }
    }
    // 2 Now find neighbor Ids, neighbor physical Ids (see GenerateGeometry), and elements to send
    local_int_t sendEntryCount = 0;
    local_int_t* receiveLength = new local_int_t[neighborCount];
    local_int_t* sendLength = new local_int_t[neighborCount];
    // Build the arrays and lists needed by the ExchangeHalo function.
    double* sendBuffer = new double[totalToBeSent];
    int* neighbors = new int[neighborCount];
    int* neighborsPhysical = new int[neighborCount];
    local_int_t* elementsToSend = new local_int_t[totalToBeSent];
    neighborCount = 0;
    for (local_int_t i = 0; i < 27; i++)
    {
        if (sendcounter[i] > 0)
        {
            int neighborId = idToRank_h[i]; // logical Id
            int phys_neiId = logical_rank_to_phys[neighborId];
            neighbors[neighborCount] = neighborId; // store rank ID of current neighbor
            neighborsPhysical[neighborCount] = phys_neiId;
            receiveLength[neighborCount] = sendcounter[i];
            sendLength[neighborCount] = sendcounter[i];
            for (int j = 0; j < sendcounter[i]; j++)
            {
                elementsToSend[sendEntryCount] = send_buffer[i * sendbufld + j];
                sendEntryCount++;
            }
            neighborCount++;
        }
    }
    delete[] send_buffer;
    delete[] sendcounter;
    // Exchange elements to send with other neighbors
    auto INDEX_TYPE = MPI_INT;
 #ifdef INDEX_64 // In src/Geometry
    INDEX_TYPE = MPI_LONG;
 #endif
    MPI_Status status;
    int MPI_MY_TAG = 93;
    MPI_Request* request = new MPI_Request[neighborCount];
    local_int_t* eltsToRecv = new local_int_t[totalToBeSent];
    local_int_t* recv_ptr = eltsToRecv;
    for (int i = 0; i < neighborCount; i++)
    {
        int n_recv = sendLength[i];
        MPI_Irecv(recv_ptr, n_recv, INDEX_TYPE, neighborsPhysical[i], MPI_MY_TAG, MPI_COMM_WORLD, request + i);
        recv_ptr += n_recv;
    }
    local_int_t* elts_ptr = elementsToSend;
    for (int i = 0; i < neighborCount; i++)
    {
        local_int_t n_send = sendLength[i];
        MPI_Send(elts_ptr, n_send, INDEX_TYPE, neighborsPhysical[i], MPI_MY_TAG, MPI_COMM_WORLD);
        elts_ptr += n_send;
    }
    for (int i = 0; i < neighborCount; i++)
    {
        MPI_Wait(request + i, &status);
    }
    delete[] request;
    // Create a map to be used in the optimization step
    //  Any external column index will be given a sequential Id
    //  after the number of rows (Will be used to access x vector)
    for (int nc = 0; nc < neighborCount; ++nc)
    {
        int neighborId = neighbors[nc];
        local_int_t str = sendcounts[nc];
        local_int_t end = sendcounts[nc + 1];
        for (int j = str; j < end; j++)
        {
            const local_int_t col = eltsToRecv[j];
            externalToLocalMap[neighborId][col] = localNumberOfRows + j;
        }
    }
    delete[] eltsToRecv;
    delete[] sendcounts;
    if (totalToBeSent > 0)
    {
        // The map is only read from here on. Going through a const reference and find()
        //  keeps the threads of the parallel loop from calling the non-const operator[],
        //  which may insert and is not safe to call concurrently.
        const std::map<local_int_t, std::map<global_int_t, local_int_t>>& extMap = externalToLocalMap;
 // Last step sort all external IDs per rank Id, elements of neighbor 0 first, then 1, and so on
 #pragma omp parallel for
        for (local_int_t i = 0; i < localNumberOfRows; i++)
        {
            int nnz_ext = 0;
            if (has_external[i] == 1)
            {
                const local_int_t iz = (i / (nx * ny));
                const local_int_t iy = (i - iz * nx * ny) / nx;
                const local_int_t ix = i - (iz * ny + iy) * nx;
                const global_int_t gix = ix + gix0;
                const global_int_t giy = iy + giy0;
                const global_int_t giz = iz + giz0;
                int nnz_c = 0;
                for (int k = 0; k < 27; k++)
                {
                    long long int cgix = gix + tid2indCpu[k][0];
                    long long int cgiy = giy + tid2indCpu[k][1];
                    long long int cgiz = giz + tid2indCpu[k][2];
                    local_int_t zi = (cgiz) % nz;
                    local_int_t yi = (cgiy) % ny;
                    local_int_t xi = (cgix) % nx;
                    int ok = cgiz > -1 && cgiz < gnz && cgiy > -1 && cgiy < gny && cgix > -1 && cgix < gnx;
                    int ipz = cgiz / nz;
                    int ipy = cgiy / ny;
                    int ipx = cgix / nx;
                    // The indices sent by the neighbor use the neighbor's nx, ny, and nz which can
                    // be different from the current rank's dims. Thus, based on the neighbor location
                    // and different_dim we adjust the indices if needed.
                    // Also, the ipx, ipy, and ipz must be updated accordingly
                    global_int_t new_nx = A.geom->nx;
                    global_int_t new_ny = A.geom->ny;
                    if (A.geom->different_dim == Z)
                    {
                        long long int local = cgiz - giz0;
                        if (local >= 0 && local < nz)
                        {
                            ipz = A.geom->ipz;
                            zi = local;
                        }
                        else if (local < 0)
                        {
                            ipz = A.geom->ipz - 1;
                            zi = A.geom->previous_neighbor_dim - 1;
                        }
                        else if (local >= nz)
                        {
                            ipz = A.geom->ipz + 1;
                            zi = 0;
                        }
                    }
                    else if (A.geom->different_dim == Y)
                    {
                        long long int local = cgiy - giy0;
                        if (local >= 0 && local < ny)
                        {
                            ipy = A.geom->ipy;
                            yi = local;
                        }
                        else if (local < 0)
                        {
                            ipy = A.geom->ipy - 1;
                            yi = A.geom->previous_neighbor_dim - 1;
                            new_ny = A.geom->previous_neighbor_dim;
                        }
                        else if (local >= ny)
                        {
                            ipy = A.geom->ipy + 1;
                            yi = 0;
                            new_ny = A.geom->next_neighbor_dim;
                        }
                    }
                    else if (A.geom->different_dim == X)
                    {
                        long long int local = cgix - gix0;
                        if (local >= 0 && local < nx)
                        {
                            ipx = A.geom->ipx;
                            xi = local;
                        }
                        else if (local < 0)
                        {
                            ipx = A.geom->ipx - 1;
                            xi = A.geom->previous_neighbor_dim - 1;
                            new_nx = A.geom->previous_neighbor_dim;
                        }
                        else if (local >= nx)
                        {
                            ipx = A.geom->ipx + 1;
                            xi = 0;
                            new_nx = A.geom->next_neighbor_dim;
                        }
                    }
                    local_int_t lcol = zi * new_ny * new_nx + yi * new_nx + xi;
                    int row_rank = ipx + ipy * npx + ipz * npy * npx;
                    if (ok)
                    {
                        // Only ranks we received indices from are in the map; columns owned by
                        //  this rank were already written by the first pass and are skipped here.
                        const auto rankIt = extMap.find(row_rank);
                        if (rankIt != extMap.end())
                        {
                            // A missing index yields 0, the value a default-inserting
                            //  operator[] lookup would have produced.
                            const auto colIt = rankIt->second.find(lcol);
                            mtxIndL[i][nnz_c] = (colIt != rankIt->second.end()) ? colIt->second : 0;
                            nnz_ext++;
                        }
                        nnz_c++;
                    }
                }
            }
            extTemp[i] = nnz_ext;
        }
    }
    if (P2P_Mode == MPI_CPU_All2allv)
    {
        // Value-initialized: ranks that are not neighbors keep count 0 and displacement 0.
        // (operator new[] throws on failure, so no NULL check is needed.)
        int* sdispls = new int[A.geom->size]();
        int* rdispls = new int[A.geom->size]();
        int* scounts = new int[A.geom->size]();
        int* rcounts = new int[A.geom->size]();
        int tmp_s = 0, tmp_r = 0;
        for (local_int_t i = 0; i < neighborCount; i++)
        {
            local_int_t root = neighborsPhysical[i];
            scounts[root] = sendLength[i];
            rcounts[root] = receiveLength[i];
            sdispls[root] = tmp_s;
            tmp_s += sendLength[i];
            rdispls[root] = tmp_r;
            tmp_r += receiveLength[i];
        }
        A.scounts = scounts;
        A.rcounts = rcounts;
        A.sdispls = sdispls;
        A.rdispls = rdispls;
    }
    delete[] has_external;
    // Store contents in our matrix struct
    A.numberOfExternalValues = totalToBeSent;
    A.localNumberOfColumns = A.localNumberOfRows + A.numberOfExternalValues;
    A.numberOfSendNeighbors = neighborCount;
    A.totalToBeSent = totalToBeSent;
    A.elementsToSend = elementsToSend;
    A.neighbors = neighbors;
    A.neighborsPhysical = neighborsPhysical;
    A.receiveLength = receiveLength;
    A.sendLength = sendLength;
    A.sendBuffer = sendBuffer;
    A.cpuAux.tempIndex = extTemp;
 #ifdef HPCG_DETAILED_DEBUG
    // NOTE(review): j restarts at 0 for every neighbor, so the same leading entries of
    //  elementsToSend are printed for each of them - confirm whether a per-neighbor offset is intended.
    HPCG_fout << " For rank " << A.geom->rank << " of " << A.geom->size
              << ", number of neighbors = " << A.numberOfSendNeighbors << endl;
    for (int i = 0; i < A.numberOfSendNeighbors; i++)
    {
        HPCG_fout << "     rank " << A.geom->rank << " neighbor " << neighbors[i]
                  << " send/recv length = " << sendLength[i] << "/" << receiveLength[i] << endl;
        for (local_int_t j = 0; j < sendLength[i]; ++j)
            HPCG_fout << "       rank " << A.geom->rank << " elementsToSend[" << j << "] = " << elementsToSend[j]
                      << endl;
    }
 #endif
 #endif
    // ifdef HPCG_NO_MPI
    return;
 }
 #endif // USE_GRACE
 /*!
  Entry point: forwards to the halo setup that matches the rank type of A.
  GPU ranks use SetupHalo_Gpu (only when built with USE_CUDA); every other rank
  uses SetupHalo_Cpu (only when built with USE_GRACE). When the matching backend
  is not compiled in, the call does nothing.
  @param[inout] A    The known system matrix
 */
 void SetupHalo(SparseMatrix& A)
 {
    const bool runsOnGpu = (A.rankType == GPU);
    (void) runsOnGpu; // stays referenced when neither backend is compiled in
 #ifdef USE_CUDA
    if (runsOnGpu)
    {
        SetupHalo_Gpu(A);
    }
 #endif
 #ifdef USE_GRACE
    if (!runsOnGpu)
    {
        SetupHalo_Cpu(A);
    }
 #endif
 }
--- a/Show More
+++ b/Show More
		`@@ -0,0 +1,2 @@`

							`void ComputeOptimalShapeXYZ(int xyz, int& x, int& y, int& z);`