Compare commits
15 Commits
oneapi-leg
...
aocc-legac
| Author | SHA1 | Date | |
|---|---|---|---|
| 29c16e7333 | |||
| 8bc89c900f | |||
| 1769d48428 | |||
| 2f518a03bb | |||
| 74849effb7 | |||
| b1e80f9778 | |||
| 320a3694c2 | |||
| ac92b75dc1 | |||
| 29b2406d57 | |||
| b84561426e | |||
| e1e4b1d0fa | |||
| 3d3a3ba759 | |||
| b13a187219 | |||
| bf74f2f688 | |||
| 96c5b79a23 |
@@ -59,7 +59,7 @@ bool shell_fast_interp_enabled()
|
||||
if (enabled < 0)
|
||||
{
|
||||
const char *env = getenv("AMSS_SHELL_FAST_INTERP");
|
||||
enabled = (!env || atoi(env) != 0) ? 1 : 0;
|
||||
enabled = (env && atoi(env) != 0) ? 1 : 0;
|
||||
}
|
||||
return enabled != 0;
|
||||
}
|
||||
@@ -70,7 +70,7 @@ bool shell_parallel_interp_enabled()
|
||||
if (enabled < 0)
|
||||
{
|
||||
const char *env = getenv("AMSS_SHELL_PARALLEL_INTERP");
|
||||
enabled = (!env || atoi(env) != 0) ? 1 : 0;
|
||||
enabled = (env && atoi(env) != 0) ? 1 : 0;
|
||||
}
|
||||
return enabled != 0;
|
||||
}
|
||||
|
||||
@@ -27,7 +27,7 @@ using namespace std;
|
||||
#endif
|
||||
|
||||
#include "TwoPunctures.h"
|
||||
#include <mkl_cblas.h>
|
||||
#include <cblas.h>
|
||||
|
||||
TwoPunctures::TwoPunctures(double mp, double mm, double b,
|
||||
double P_plusx, double P_plusy, double P_plusz,
|
||||
|
||||
@@ -18,7 +18,7 @@ using namespace std;
|
||||
#endif
|
||||
|
||||
// Intel oneMKL LAPACK interface
|
||||
#include <mkl_lapacke.h>
|
||||
#include <lapacke.h>
|
||||
/* Linear equation solution using Intel oneMKL LAPACK.
|
||||
a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input
|
||||
containing the right-hand side vectors. On output a is
|
||||
|
||||
@@ -58,30 +58,10 @@ POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
|
||||
TRANSFER_CACHE_FLAG = -DBSSN_USE_TRANSFER_CACHE=$(EFFECTIVE_USE_TRANSFER_CACHE)
|
||||
ESCALAR_KERNEL_FLAG = -DBSSN_USE_ESCALAR_C_KERNEL=$(EFFECTIVE_USE_CXX_ESCALAR_KERNEL)
|
||||
|
||||
## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
|
||||
## AMD AOCC build flags optimized for EPYC Zen 4 (-march=znver4)
|
||||
## make -> opt (PGO-guided, maximum performance)
|
||||
## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data)
|
||||
PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
|
||||
|
||||
ifeq ($(PGO_MODE),instrument)
|
||||
## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability
|
||||
CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
|
||||
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) \
|
||||
$(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) $(EM_KERNEL_FLAG)
|
||||
f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
|
||||
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
|
||||
else
|
||||
## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \
|
||||
## PGO has been turned off, now tested and found to be negative optimization
|
||||
## INTERP_LB_FLAGS has been turned off too, now tested and found to be negative optimization
|
||||
|
||||
|
||||
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) \
|
||||
$(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) $(EM_KERNEL_FLAG)
|
||||
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
|
||||
endif
|
||||
|
||||
.SUFFIXES: .o .f90 .C .for .cu
|
||||
|
||||
@@ -89,11 +69,11 @@ endif
|
||||
$(f90) $(f90appflags) -c $< -o $@
|
||||
|
||||
.C.o:
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
# ShellPatch.C uses OpenMP for setupintintstuff search loops
|
||||
ShellPatch.o: ShellPatch.C
|
||||
${CXX} $(CXXAPPFLAGS) $(OMP_FLAG) -c $< $(filein) -o $@
|
||||
$(CXX) $(CXXAPPFLAGS) $(OMP_FLAG) -c $< $(filein) -o $@
|
||||
|
||||
.for.o:
|
||||
$(f77) -c $< -o $@
|
||||
@@ -103,59 +83,59 @@ ShellPatch.o: ShellPatch.C
|
||||
|
||||
# C rewrite of BSSN RHS kernel and helpers
|
||||
bssn_rhs_c.o: bssn_rhs_c.C
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
fderivs_c.o: fderivs_c.C
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
fdderivs_c.o: fdderivs_c.C
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
kodiss_c.o: kodiss_c.C
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
lopsided_c.o: lopsided_c.C
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
lopsided_kodis_c.o: lopsided_kodis_c.C
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
# C rewrite of shell-patch derivative kernels
|
||||
fderivs_sh_c.o: fderivs_sh_c.C
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
fdderivs_sh_c.o: fdderivs_sh_c.C
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
fderivs_shc_c.o: fderivs_shc_c.C
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
fdderivs_shc_c.o: fdderivs_shc_c.C
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
kodiss_sh_c.o: kodiss_sh_c.C
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
|
||||
bssn_em_rhs_c.o: bssn_em_rhs_c.C
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
z4c_rhs_c.o: z4c_rhs_c.C
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
#interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
|
||||
# ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
# $(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
|
||||
TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
|
||||
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||
-fprofile-instr-use=$(TP_PROFDATA) \
|
||||
-Dfortran3 -Dnewc -I${MKLROOT}/include
|
||||
-Dfortran3 -Dnewc -I$(AOCL_ROOT)/include
|
||||
|
||||
TwoPunctures.o: TwoPunctures.C
|
||||
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
|
||||
$(CXX) $(TP_OPTFLAGS) -qopenmp -c $< -o $@
|
||||
|
||||
TwoPunctureABE.o: TwoPunctureABE.C
|
||||
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
|
||||
$(CXX) $(TP_OPTFLAGS) -qopenmp -c $< -o $@
|
||||
|
||||
# Input files
|
||||
|
||||
|
||||
@@ -1,33 +1,20 @@
|
||||
## GCC version (commented out)
|
||||
## filein = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
|
||||
## filein = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
|
||||
## LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
|
||||
## AMD AOCC version with AOCL (Optimized for AMD EPYC Zen 4)
|
||||
|
||||
## Intel oneAPI version with oneMKL (Optimized for performance)
|
||||
filein = -I/usr/include/ -I${MKLROOT}/include
|
||||
## AOCL root path for includes and libraries
|
||||
AOCL_ROOT ?= /home/aocc/aocl/5.2.0/aocc
|
||||
|
||||
## Using sequential MKL (OpenMP disabled for better single-threaded performance)
|
||||
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
|
||||
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
|
||||
## AOCC-built OpenMPI prefix
|
||||
OMPI_PREFIX ?= /home/aocc/openmpi-5.0.10
|
||||
|
||||
## Memory allocator switch
|
||||
## 1 (default) : link Intel oneTBB allocator (libtbbmalloc)
|
||||
## 0 : use system default allocator (ptmalloc)
|
||||
USE_TBBMALLOC ?= 1
|
||||
TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
|
||||
ifneq ($(wildcard $(TBBMALLOC_SO)),)
|
||||
TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
|
||||
else
|
||||
TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
|
||||
endif
|
||||
ifeq ($(USE_TBBMALLOC),1)
|
||||
LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
|
||||
endif
|
||||
filein = -I/usr/include/ -I$(AOCL_ROOT)/include
|
||||
|
||||
## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
|
||||
## opt : (default) maximum performance with PGO profile-guided optimization
|
||||
## instrument : PGO Phase 1 instrumentation to collect fresh profile data
|
||||
PGO_MODE ?= opt
|
||||
## Using AOCL BLIS + libFLAME for BLAS/LAPACK
|
||||
## AOCC Fortran runtime: -lflang (includes FortranRuntime)
|
||||
## AOCC OpenMP runtime: -lomp (LLVM OpenMP)
|
||||
LDLIBS = -L$(AOCL_ROOT)/lib -lblis -lflame -lamdlibm -lflang -lpgmath -lpthread -lm -ldl -lomp
|
||||
|
||||
# OpenMP flag for selective compilation
|
||||
OMP_FLAG = -fopenmp
|
||||
|
||||
## Interp_Points load balance profiling mode
|
||||
## off : (default) no load balance instrumentation
|
||||
@@ -76,13 +63,12 @@ USE_TRANSFER_CACHE ?= auto
|
||||
## 0 : use original Fortran rungekutta4_rout.o
|
||||
USE_CXX_RK4 ?= 1
|
||||
|
||||
f90 = ifx
|
||||
f77 = ifx
|
||||
CXX = icpx
|
||||
CC = icx
|
||||
CLINKER = mpiicpx
|
||||
f90 = flang
|
||||
f77 = flang
|
||||
CXX = clang++
|
||||
CC = clang
|
||||
CLINKER = $(OMPI_PREFIX)/bin/mpicxx
|
||||
|
||||
Cu = nvcc
|
||||
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
|
||||
#CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
|
||||
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc
|
||||
|
||||
Reference in New Issue
Block a user