Compare commits
15 Commits
gcc-legacy
...
aocc-legacy
| Author | SHA1 | Date | |
|---|---|---|---|
| | 29c16e7333 | | |
| | 8bc89c900f | | |
| | 1769d48428 | | |
| | 2f518a03bb | | |
| | 74849effb7 | | |
| | b1e80f9778 | | |
| | 320a3694c2 | | |
| | ac92b75dc1 | | |
| | 29b2406d57 | | |
| | b84561426e | | |
| | e1e4b1d0fa | | |
| | 3d3a3ba759 | | |
| | b13a187219 | | |
| | bf74f2f688 | | |
| | 96c5b79a23 | | |
@@ -58,14 +58,10 @@ POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
|
|||||||
TRANSFER_CACHE_FLAG = -DBSSN_USE_TRANSFER_CACHE=$(EFFECTIVE_USE_TRANSFER_CACHE)
|
TRANSFER_CACHE_FLAG = -DBSSN_USE_TRANSFER_CACHE=$(EFFECTIVE_USE_TRANSFER_CACHE)
|
||||||
ESCALAR_KERNEL_FLAG = -DBSSN_USE_ESCALAR_C_KERNEL=$(EFFECTIVE_USE_CXX_ESCALAR_KERNEL)
|
ESCALAR_KERNEL_FLAG = -DBSSN_USE_ESCALAR_C_KERNEL=$(EFFECTIVE_USE_CXX_ESCALAR_KERNEL)
|
||||||
|
|
||||||
|
## AMD AOCC build flags optimized for EPYC Zen 4 (-march=znver4)
|
||||||
|
## make -> opt (PGO-guided, maximum performance)
|
||||||
|
## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data)
|
||||||
|
|
||||||
## GCC build flags (optimized for x86-64-v4)
|
|
||||||
## PGO disabled (it made performance worse on Intel; not tested on GCC)
|
|
||||||
CXXAPPFLAGS = -O3 -march=x86-64-v4 -ffast-math -mfma -flto \
|
|
||||||
-Dfortran3 -Dnewc $(INTERP_LB_FLAGS) \
|
|
||||||
$(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) $(EM_KERNEL_FLAG)
|
|
||||||
f90appflags = -O3 -march=x86-64-v4 -ffast-math -mfma -flto \
|
|
||||||
-cpp $(POLINT6_FLAG)
|
|
||||||
|
|
||||||
.SUFFIXES: .o .f90 .C .for .cu
|
.SUFFIXES: .o .f90 .C .for .cu
|
||||||
|
|
||||||
@@ -73,11 +69,11 @@ f90appflags = -O3 -march=x86-64-v4 -ffast-math -mfma -flto \
|
|||||||
$(f90) $(f90appflags) -c $< -o $@
|
$(f90) $(f90appflags) -c $< -o $@
|
||||||
|
|
||||||
.C.o:
|
.C.o:
|
||||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
|
|
||||||
# ShellPatch.C uses OpenMP for setupintintstuff search loops
|
# ShellPatch.C uses OpenMP for setupintintstuff search loops
|
||||||
ShellPatch.o: ShellPatch.C
|
ShellPatch.o: ShellPatch.C
|
||||||
${CXX} $(CXXAPPFLAGS) $(OMP_FLAG) -c $< $(filein) -o $@
|
$(CXX) $(CXXAPPFLAGS) $(OMP_FLAG) -c $< $(filein) -o $@
|
||||||
|
|
||||||
.for.o:
|
.for.o:
|
||||||
$(f77) -c $< -o $@
|
$(f77) -c $< -o $@
|
||||||
@@ -87,59 +83,59 @@ ShellPatch.o: ShellPatch.C
|
|||||||
|
|
||||||
# C rewrite of BSSN RHS kernel and helpers
|
# C rewrite of BSSN RHS kernel and helpers
|
||||||
bssn_rhs_c.o: bssn_rhs_c.C
|
bssn_rhs_c.o: bssn_rhs_c.C
|
||||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
|
|
||||||
fderivs_c.o: fderivs_c.C
|
fderivs_c.o: fderivs_c.C
|
||||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
|
|
||||||
fdderivs_c.o: fdderivs_c.C
|
fdderivs_c.o: fdderivs_c.C
|
||||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
|
|
||||||
kodiss_c.o: kodiss_c.C
|
kodiss_c.o: kodiss_c.C
|
||||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
|
|
||||||
lopsided_c.o: lopsided_c.C
|
lopsided_c.o: lopsided_c.C
|
||||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
|
|
||||||
lopsided_kodis_c.o: lopsided_kodis_c.C
|
lopsided_kodis_c.o: lopsided_kodis_c.C
|
||||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
|
|
||||||
# C rewrite of shell-patch derivative kernels
|
# C rewrite of shell-patch derivative kernels
|
||||||
fderivs_sh_c.o: fderivs_sh_c.C
|
fderivs_sh_c.o: fderivs_sh_c.C
|
||||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
|
|
||||||
fdderivs_sh_c.o: fdderivs_sh_c.C
|
fdderivs_sh_c.o: fdderivs_sh_c.C
|
||||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
|
|
||||||
fderivs_shc_c.o: fderivs_shc_c.C
|
fderivs_shc_c.o: fderivs_shc_c.C
|
||||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
|
|
||||||
fdderivs_shc_c.o: fdderivs_shc_c.C
|
fdderivs_shc_c.o: fdderivs_shc_c.C
|
||||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
|
|
||||||
kodiss_sh_c.o: kodiss_sh_c.C
|
kodiss_sh_c.o: kodiss_sh_c.C
|
||||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
|
|
||||||
|
|
||||||
bssn_em_rhs_c.o: bssn_em_rhs_c.C
|
bssn_em_rhs_c.o: bssn_em_rhs_c.C
|
||||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
z4c_rhs_c.o: z4c_rhs_c.C
|
z4c_rhs_c.o: z4c_rhs_c.C
|
||||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
|
|
||||||
#interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
|
#interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
|
||||||
# ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
# $(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
|
|
||||||
## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
|
## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
|
||||||
TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
|
TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
|
||||||
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||||
-fprofile-instr-use=$(TP_PROFDATA) \
|
-fprofile-instr-use=$(TP_PROFDATA) \
|
||||||
-Dfortran3 -Dnewc $(filein_real)
|
-Dfortran3 -Dnewc -I$(AOCL_ROOT)/include
|
||||||
|
|
||||||
TwoPunctures.o: TwoPunctures.C
|
TwoPunctures.o: TwoPunctures.C
|
||||||
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
|
$(CXX) $(TP_OPTFLAGS) -qopenmp -c $< -o $@
|
||||||
|
|
||||||
TwoPunctureABE.o: TwoPunctureABE.C
|
TwoPunctureABE.o: TwoPunctureABE.C
|
||||||
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
|
$(CXX) $(TP_OPTFLAGS) -qopenmp -c $< -o $@
|
||||||
|
|
||||||
# Input files
|
# Input files
|
||||||
|
|
||||||
|
|||||||
@@ -1,26 +1,21 @@
|
|||||||
## GCC version with OpenMPI and OpenBLAS
|
## AMD AOCC version with AOCL (Optimized for AMD EPYC Zen 4)
|
||||||
OMPI_ROOT = /usr/mpi/gcc/openmpi-4.1.9a1
|
|
||||||
|
|
||||||
## Ensure mpicxx and final executables find OpenMPI libs at build- and runtime
|
## AOCL root path for includes and libraries
|
||||||
export LD_LIBRARY_PATH := $(OMPI_ROOT)/lib64:$(LD_LIBRARY_PATH)
|
AOCL_ROOT ?= /home/aocc/aocl/5.2.0/aocc
|
||||||
|
|
||||||
filein = -I/usr/include/ -I$(OMPI_ROOT)/include
|
## AOCC-built OpenMPI prefix
|
||||||
|
OMPI_PREFIX ?= /home/aocc/openmpi-5.0.10
|
||||||
|
|
||||||
## OpenBLAS (OpenMP variant) + gfortran runtime
|
filein = -I/usr/include/ -I$(AOCL_ROOT)/include
|
||||||
## -Wl,-rpath ensures ABE / TwoPunctureABE find libmpi at runtime without LD_LIBRARY_PATH
|
|
||||||
LDLIBS = -Wl,-rpath,$(OMPI_ROOT)/lib64 -lopenblaso -lgfortran -lpthread -lm -ldl -lgomp
|
## Using AOCL BLIS + libFLAME for BLAS/LAPACK
|
||||||
|
## AOCC Fortran runtime: -lflang (includes FortranRuntime)
|
||||||
|
## AOCC OpenMP runtime: -lomp (LLVM OpenMP)
|
||||||
|
LDLIBS = -L$(AOCL_ROOT)/lib -lblis -lflame -lamdlibm -lflang -lpgmath -lpthread -lm -ldl -lomp
|
||||||
|
|
||||||
# OpenMP flag for selective compilation
|
# OpenMP flag for selective compilation
|
||||||
OMP_FLAG = -fopenmp
|
OMP_FLAG = -fopenmp
|
||||||
|
|
||||||
## Memory allocator switch
|
|
||||||
## 0 (default) : use system default allocator (ptmalloc)
|
|
||||||
## 1 : use jemalloc (install jemalloc-devel first)
|
|
||||||
USE_JEMALLOC ?= 0
|
|
||||||
ifeq ($(USE_JEMALLOC),1)
|
|
||||||
LDLIBS := -ljemalloc $(LDLIBS)
|
|
||||||
endif
|
|
||||||
|
|
||||||
## Interp_Points load balance profiling mode
|
## Interp_Points load balance profiling mode
|
||||||
## off : (default) no load balance instrumentation
|
## off : (default) no load balance instrumentation
|
||||||
## profile : Pass 1 — instrument Interp_Points to collect timing profile
|
## profile : Pass 1 — instrument Interp_Points to collect timing profile
|
||||||
@@ -68,11 +63,11 @@ USE_TRANSFER_CACHE ?= auto
|
|||||||
## 0 : use original Fortran rungekutta4_rout.o
|
## 0 : use original Fortran rungekutta4_rout.o
|
||||||
USE_CXX_RK4 ?= 1
|
USE_CXX_RK4 ?= 1
|
||||||
|
|
||||||
f90 = gfortran
|
f90 = flang
|
||||||
f77 = gfortran
|
f77 = flang
|
||||||
CXX = g++
|
CXX = clang++
|
||||||
CC = gcc
|
CC = clang
|
||||||
CLINKER = mpicxx
|
CLINKER = $(OMPI_PREFIX)/bin/mpicxx
|
||||||
|
|
||||||
Cu = nvcc
|
Cu = nvcc
|
||||||
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
|
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
|
||||||
|
|||||||
Reference in New Issue
Block a user