first commit

2026-01-18 20:37:50 +08:00
commit fff9f18287
123 changed files with 1385491 additions and 0 deletions
--- a/run_sample.sh
+++ b/run_sample.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+export CXX_PATH=/usr
+export PATH=${CXX_PATH}/bin:${PATH}
+
+if [[ -z "${MPI_PATH}" ]]; then
+    export MPI_PATH=/path/to/mpi #Change this to correct MPI path
+fi
+
+if [[ -z "${CUDA_PATH}" ]]; then
+    export MATHLIBS_PATH=/path/to/mathlibs #Change this to correct CUDA mathlibs
+fi
+
+if [[ -z "${NCCL_PATH}" ]]; then
+    export NCCL_PATH=/path/to/nccl #Change to correct NCCL path
+fi
+
+if [[ -z "${CUDA_PATH}" ]]; then
+    export CUDA_PATH=/path/to/cuda #Change this to correct CUDA path
+fi
+
+if [[ -z "${NVPL_SPARSE}" ]]; then
+    export NVPL_SPARSE=/path/to/nvpllibs #Change this to correct NVPL mathlibs
+fi
+
+#Please fix, if needed
+export CUDA_BLAS_VERSION=${CUDA_BUILD_VERSION:-12.2}
+export LD_LIBRARY_PATH=${MATHLIBS_PATH}/${CUDA_BLAS_VERSION}/lib64/:${LD_LIBRARY_PATH}
+export PATH=${CUDA_PATH}/bin:${PATH}
+export LD_LIBRARY_PATH=${CUDA_PATH}/lib64:${LD_LIBRARY_PATH}
+export LD_LIBRARY_PATH=${NCCL_PATH}/lib:${LD_LIBRARY_PATH}
+export LD_LIBRARY_PATH=${NVPL_SPARSE}/lib:${LD_LIBRARY_PATH}
+
+ext="--mca pml ^ucx --mca btl ^openib,smcuda -mca coll_hcoll_enable 0 -x coll_hcoll_np=0 --bind-to none"
+
+#Directory to xhpcg binary
+dir="bin/"
+
+#Sample on a Hopper GPU x86
+###########################
+#Local problem size
+nx=512 #Large problem size x
+ny=512 #Large problem size y
+nz=288 #Large problem size z
+mpirun --oversubscribe ${ext} -np 1 ${dir}/hpcg.sh  --exec-name ${dir}/xhpcg \
+ --nx $nx --ny $ny --nz $nz --rt 10 --b 0
+########################################################################################
+
+#Sample on Grace Hopper x4
+###########################
+#Local problem size
+nx=256 #Large problem size x, assumed for the GPU
+ny=1024 #Large problem size y, assumed for the GPU
+nz=288 #Large problem size z, assumed for the GPU
+
+#1 GPUOnly
+#---------#
+np=4  #Total number of ranks
+mpirun --oversubscribe ${ext} -np $np ${dir}/hpcg-aarch64.sh  --exec-name ${dir}/xhpcg \
+ --nx $nx --ny $ny --nz $nz --rt 10 --b 0 --exm 0 --p2p 0 \
+ --mem-affinity 0:1:2:3 --cpu-affinity 0-71:72-143:144-215:216-287
+
+#2 GraceOnly
+#-----------#
+np=4  #Total number of ranks
+mpirun --oversubscribe ${ext} -np $np ${dir}/hpcg-aarch64.sh  --exec-name ${dir}/xhpcg-cpu \
+ --nx $nx --ny $ny --nz $nz --rt 10 --b 0 --exm 0 --p2p 0 \
+ --mem-affinity 0:1:2:3 --cpu-affinity 0-71:72-143:144-215:216-287
+
+#3 Hetrogeneous (GPU + Grace)
+#----------------------------#
+np=8  #Total number of ranks (4GPU + 4Grace)
+exm=2 #Execution mode GPU+Grace
+diff_dim=2 #different dim between GPU and Grace is Y
+lpm=1 #Local problem mode (nx/ny/nz are local to GPU, g2c is the Grace different dimension)
+g2c=64 #Based on dif_dim=2 and lpm=1 --> Grace rank local problem size is $nx x $g2c x $nz
+
+#3D grid size 4x2x1 (must be equal to np)
+npx=4 #number of ranks in the x direction
+npy=2 #number of ranks in the y direction
+npz=1 #number of ranks in the z direction
+mpirun --oversubscribe ${ext} -np $np ${dir}/hpcg-aarch64.sh  --exec-name ${dir}/xhpcg \
+ --nx $nx --ny $ny --nz $nz --rt 10 --b 0 --p2p 0 --exm $exm --lpm $lpm --g2c $g2c --ddm $diff_dim --npx $npx --npy $npy --npz $npz \
+ --mem-affinity 0:0:1:1:2:2:3:3 --cpu-affinity 0-7:8-71:72-79:80-143:144-151:152-215:216-223:224-287