Compare commits
16 Commits
oneapi-leg
...
gcc-legacy
| Author | SHA1 | Date | |
|---|---|---|---|
| 12bf08a2a1 | |||
| 9b4f98e237 | |||
| 2bbde059db | |||
| 3b8774c1b1 | |||
| 23b52e30d6 | |||
| e8f590a742 | |||
| 632173ea10 | |||
| eed2ff2be8 | |||
| b904f6cf56 | |||
| c4b9bd3788 | |||
| 276b36ea25 | |||
| baf248c3bc | |||
| 70b6496ed3 | |||
| 6ca9fece2e | |||
| 516cdea502 | |||
| 9687d9a3dd |
@@ -59,7 +59,7 @@ bool shell_fast_interp_enabled()
|
||||
if (enabled < 0)
|
||||
{
|
||||
const char *env = getenv("AMSS_SHELL_FAST_INTERP");
|
||||
enabled = (!env || atoi(env) != 0) ? 1 : 0;
|
||||
enabled = (env && atoi(env) != 0) ? 1 : 0;
|
||||
}
|
||||
return enabled != 0;
|
||||
}
|
||||
@@ -70,7 +70,7 @@ bool shell_parallel_interp_enabled()
|
||||
if (enabled < 0)
|
||||
{
|
||||
const char *env = getenv("AMSS_SHELL_PARALLEL_INTERP");
|
||||
enabled = (!env || atoi(env) != 0) ? 1 : 0;
|
||||
enabled = (env && atoi(env) != 0) ? 1 : 0;
|
||||
}
|
||||
return enabled != 0;
|
||||
}
|
||||
|
||||
@@ -27,7 +27,7 @@ using namespace std;
|
||||
#endif
|
||||
|
||||
#include "TwoPunctures.h"
|
||||
#include <mkl_cblas.h>
|
||||
#include <cblas.h>
|
||||
|
||||
TwoPunctures::TwoPunctures(double mp, double mm, double b,
|
||||
double P_plusx, double P_plusy, double P_plusz,
|
||||
|
||||
@@ -18,7 +18,7 @@ using namespace std;
|
||||
#endif
|
||||
|
||||
// Intel oneMKL LAPACK interface
|
||||
#include <mkl_lapacke.h>
|
||||
#include <lapacke.h>
|
||||
/* Linear equation solution using Intel oneMKL LAPACK.
|
||||
a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input
|
||||
containing the right-hand side vectors. On output a is
|
||||
|
||||
@@ -58,30 +58,14 @@ POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
|
||||
TRANSFER_CACHE_FLAG = -DBSSN_USE_TRANSFER_CACHE=$(EFFECTIVE_USE_TRANSFER_CACHE)
|
||||
ESCALAR_KERNEL_FLAG = -DBSSN_USE_ESCALAR_C_KERNEL=$(EFFECTIVE_USE_CXX_ESCALAR_KERNEL)
|
||||
|
||||
## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
|
||||
## make -> opt (PGO-guided, maximum performance)
|
||||
## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data)
|
||||
PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
|
||||
|
||||
ifeq ($(PGO_MODE),instrument)
|
||||
## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability
|
||||
CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
|
||||
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) \
|
||||
## GCC build flags (optimized for x86-64-v4)
|
||||
## PGO disabled (used negative optimization on Intel; not tested on GCC)
|
||||
CXXAPPFLAGS = -O3 -march=x86-64-v4 -ffast-math -mfma -flto \
|
||||
-Dfortran3 -Dnewc $(INTERP_LB_FLAGS) \
|
||||
$(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) $(EM_KERNEL_FLAG)
|
||||
f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
|
||||
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
|
||||
else
|
||||
## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \
|
||||
## PGO has been turned off, now tested and found to be negative optimization
|
||||
## INTERP_LB_FLAGS has been turned off too, now tested and found to be negative optimization
|
||||
|
||||
|
||||
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) \
|
||||
$(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) $(EM_KERNEL_FLAG)
|
||||
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
|
||||
endif
|
||||
f90appflags = -O3 -march=x86-64-v4 -ffast-math -mfma -flto \
|
||||
-cpp $(POLINT6_FLAG)
|
||||
|
||||
.SUFFIXES: .o .f90 .C .for .cu
|
||||
|
||||
@@ -149,7 +133,7 @@ z4c_rhs_c.o: z4c_rhs_c.C
|
||||
TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
|
||||
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||
-fprofile-instr-use=$(TP_PROFDATA) \
|
||||
-Dfortran3 -Dnewc -I${MKLROOT}/include
|
||||
-Dfortran3 -Dnewc $(filein_real)
|
||||
|
||||
TwoPunctures.o: TwoPunctures.C
|
||||
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
|
||||
|
||||
@@ -1,33 +1,25 @@
|
||||
## GCC version (commented out)
|
||||
## filein = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
|
||||
## filein = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
|
||||
## LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
|
||||
## GCC version with OpenMPI and OpenBLAS
|
||||
OMPI_ROOT = /usr/mpi/gcc/openmpi-4.1.9a1
|
||||
|
||||
## Intel oneAPI version with oneMKL (Optimized for performance)
|
||||
filein = -I/usr/include/ -I${MKLROOT}/include
|
||||
## Ensure mpicxx and final executables find OpenMPI libs at build- and runtime
|
||||
export LD_LIBRARY_PATH := $(OMPI_ROOT)/lib64:$(LD_LIBRARY_PATH)
|
||||
|
||||
## Using sequential MKL (OpenMP disabled for better single-threaded performance)
|
||||
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
|
||||
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
|
||||
filein = -I/usr/include/ -I$(OMPI_ROOT)/include
|
||||
|
||||
## OpenBLAS (OpenMP variant) + gfortran runtime
|
||||
## -Wl,-rpath ensures ABE / TwoPunctureABE find libmpi at runtime without LD_LIBRARY_PATH
|
||||
LDLIBS = -Wl,-rpath,$(OMPI_ROOT)/lib64 -lopenblaso -lgfortran -lpthread -lm -ldl -lgomp
|
||||
|
||||
# OpenMP flag for selective compilation
|
||||
OMP_FLAG = -fopenmp
|
||||
|
||||
## Memory allocator switch
|
||||
## 1 (default) : link Intel oneTBB allocator (libtbbmalloc)
|
||||
## 0 : use system default allocator (ptmalloc)
|
||||
USE_TBBMALLOC ?= 1
|
||||
TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
|
||||
ifneq ($(wildcard $(TBBMALLOC_SO)),)
|
||||
TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
|
||||
else
|
||||
TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
|
||||
## 0 (default) : use system default allocator (ptmalloc)
|
||||
## 1 : use jemalloc (install jemalloc-devel first)
|
||||
USE_JEMALLOC ?= 0
|
||||
ifeq ($(USE_JEMALLOC),1)
|
||||
LDLIBS := -ljemalloc $(LDLIBS)
|
||||
endif
|
||||
ifeq ($(USE_TBBMALLOC),1)
|
||||
LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
|
||||
endif
|
||||
|
||||
## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
|
||||
## opt : (default) maximum performance with PGO profile-guided optimization
|
||||
## instrument : PGO Phase 1 instrumentation to collect fresh profile data
|
||||
PGO_MODE ?= opt
|
||||
|
||||
## Interp_Points load balance profiling mode
|
||||
## off : (default) no load balance instrumentation
|
||||
@@ -76,13 +68,12 @@ USE_TRANSFER_CACHE ?= auto
|
||||
## 0 : use original Fortran rungekutta4_rout.o
|
||||
USE_CXX_RK4 ?= 1
|
||||
|
||||
f90 = ifx
|
||||
f77 = ifx
|
||||
CXX = icpx
|
||||
CC = icx
|
||||
CLINKER = mpiicpx
|
||||
f90 = gfortran
|
||||
f77 = gfortran
|
||||
CXX = g++
|
||||
CC = gcc
|
||||
CLINKER = mpicxx
|
||||
|
||||
Cu = nvcc
|
||||
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
|
||||
#CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
|
||||
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc
|
||||
|
||||
Reference in New Issue
Block a user