Compare commits
16 Commits
oneapi-leg
...
gcc-legacy
| Author | SHA1 | Date | |
|---|---|---|---|
| 12bf08a2a1 | |||
| 9b4f98e237 | |||
| 2bbde059db | |||
| 3b8774c1b1 | |||
| 23b52e30d6 | |||
| e8f590a742 | |||
| 632173ea10 | |||
| eed2ff2be8 | |||
| b904f6cf56 | |||
| c4b9bd3788 | |||
| 276b36ea25 | |||
| baf248c3bc | |||
| 70b6496ed3 | |||
| 6ca9fece2e | |||
| 516cdea502 | |||
| 9687d9a3dd |
@@ -59,7 +59,7 @@ bool shell_fast_interp_enabled()
|
|||||||
if (enabled < 0)
|
if (enabled < 0)
|
||||||
{
|
{
|
||||||
const char *env = getenv("AMSS_SHELL_FAST_INTERP");
|
const char *env = getenv("AMSS_SHELL_FAST_INTERP");
|
||||||
enabled = (!env || atoi(env) != 0) ? 1 : 0;
|
enabled = (env && atoi(env) != 0) ? 1 : 0;
|
||||||
}
|
}
|
||||||
return enabled != 0;
|
return enabled != 0;
|
||||||
}
|
}
|
||||||
@@ -70,7 +70,7 @@ bool shell_parallel_interp_enabled()
|
|||||||
if (enabled < 0)
|
if (enabled < 0)
|
||||||
{
|
{
|
||||||
const char *env = getenv("AMSS_SHELL_PARALLEL_INTERP");
|
const char *env = getenv("AMSS_SHELL_PARALLEL_INTERP");
|
||||||
enabled = (!env || atoi(env) != 0) ? 1 : 0;
|
enabled = (env && atoi(env) != 0) ? 1 : 0;
|
||||||
}
|
}
|
||||||
return enabled != 0;
|
return enabled != 0;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ using namespace std;
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "TwoPunctures.h"
|
#include "TwoPunctures.h"
|
||||||
#include <mkl_cblas.h>
|
#include <cblas.h>
|
||||||
|
|
||||||
TwoPunctures::TwoPunctures(double mp, double mm, double b,
|
TwoPunctures::TwoPunctures(double mp, double mm, double b,
|
||||||
double P_plusx, double P_plusy, double P_plusz,
|
double P_plusx, double P_plusy, double P_plusz,
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ using namespace std;
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Intel oneMKL LAPACK interface
|
// LAPACKE C interface to LAPACK (previously Intel oneMKL's <mkl_lapacke.h>)
|
||||||
#include <mkl_lapacke.h>
|
#include <lapacke.h>
|
||||||
/* Linear equation solution using Intel oneMKL LAPACK.
|
/* Linear equation solution using Intel oneMKL LAPACK.
|
||||||
a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input
|
a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input
|
||||||
containing the right-hand side vectors. On output a is
|
containing the right-hand side vectors. On output a is
|
||||||
|
|||||||
@@ -58,30 +58,14 @@ POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
|
|||||||
TRANSFER_CACHE_FLAG = -DBSSN_USE_TRANSFER_CACHE=$(EFFECTIVE_USE_TRANSFER_CACHE)
|
TRANSFER_CACHE_FLAG = -DBSSN_USE_TRANSFER_CACHE=$(EFFECTIVE_USE_TRANSFER_CACHE)
|
||||||
ESCALAR_KERNEL_FLAG = -DBSSN_USE_ESCALAR_C_KERNEL=$(EFFECTIVE_USE_CXX_ESCALAR_KERNEL)
|
ESCALAR_KERNEL_FLAG = -DBSSN_USE_ESCALAR_C_KERNEL=$(EFFECTIVE_USE_CXX_ESCALAR_KERNEL)
|
||||||
|
|
||||||
## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
|
|
||||||
## make -> opt (PGO-guided, maximum performance)
|
|
||||||
## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data)
|
|
||||||
PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
|
|
||||||
|
|
||||||
ifeq ($(PGO_MODE),instrument)
|
## GCC build flags (optimized for x86-64-v4)
|
||||||
## Phase 1: instrumentation — omits -fp-model fast=2 for numerical stability (note: -ipo is still passed below, although the intent was to drop it for a faster build)
|
## PGO disabled (it proved to be a negative optimization on Intel; not tested on GCC)
|
||||||
CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
|
CXXAPPFLAGS = -O3 -march=x86-64-v4 -ffast-math -mfma -flto \
|
||||||
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) \
|
-Dfortran3 -Dnewc $(INTERP_LB_FLAGS) \
|
||||||
$(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) $(EM_KERNEL_FLAG)
|
$(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) $(EM_KERNEL_FLAG)
|
||||||
f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
|
f90appflags = -O3 -march=x86-64-v4 -ffast-math -mfma -flto \
|
||||||
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
|
-cpp $(POLINT6_FLAG)
|
||||||
else
|
|
||||||
## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \
|
|
||||||
## PGO has been turned off: testing showed it to be a negative optimization
|
|
||||||
## INTERP_LB_FLAGS has been turned off too: testing showed it to be a negative optimization
|
|
||||||
|
|
||||||
|
|
||||||
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
|
||||||
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) \
|
|
||||||
$(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) $(EM_KERNEL_FLAG)
|
|
||||||
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
|
||||||
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
|
|
||||||
endif
|
|
||||||
|
|
||||||
.SUFFIXES: .o .f90 .C .for .cu
|
.SUFFIXES: .o .f90 .C .for .cu
|
||||||
|
|
||||||
@@ -149,7 +133,7 @@ z4c_rhs_c.o: z4c_rhs_c.C
|
|||||||
TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
|
TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
|
||||||
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||||
-fprofile-instr-use=$(TP_PROFDATA) \
|
-fprofile-instr-use=$(TP_PROFDATA) \
|
||||||
-Dfortran3 -Dnewc -I${MKLROOT}/include
|
-Dfortran3 -Dnewc $(filein_real)
|
||||||
|
|
||||||
TwoPunctures.o: TwoPunctures.C
|
TwoPunctures.o: TwoPunctures.C
|
||||||
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
|
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
|
||||||
|
|||||||
@@ -1,33 +1,25 @@
|
|||||||
## GCC version (commented out)
|
## GCC version with OpenMPI and OpenBLAS
|
||||||
## filein = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
|
OMPI_ROOT = /usr/mpi/gcc/openmpi-4.1.9a1
|
||||||
## filein = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
|
|
||||||
## LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
|
|
||||||
|
|
||||||
## Intel oneAPI version with oneMKL (Optimized for performance)
|
## Ensure mpicxx and final executables find OpenMPI libs at build- and runtime
|
||||||
filein = -I/usr/include/ -I${MKLROOT}/include
|
export LD_LIBRARY_PATH := $(OMPI_ROOT)/lib64:$(LD_LIBRARY_PATH)
|
||||||
|
|
||||||
## Using sequential MKL (OpenMP disabled for better single-threaded performance)
|
filein = -I/usr/include/ -I$(OMPI_ROOT)/include
|
||||||
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
|
|
||||||
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
|
## OpenBLAS (OpenMP variant) + gfortran runtime
|
||||||
|
## -Wl,-rpath ensures ABE / TwoPunctureABE find libmpi at runtime without LD_LIBRARY_PATH
|
||||||
|
LDLIBS = -Wl,-rpath,$(OMPI_ROOT)/lib64 -lopenblaso -lgfortran -lpthread -lm -ldl -lgomp
|
||||||
|
|
||||||
|
# OpenMP flag for selective compilation
|
||||||
|
OMP_FLAG = -fopenmp
|
||||||
|
|
||||||
## Memory allocator switch
|
## Memory allocator switch
|
||||||
## 1 (default) : link Intel oneTBB allocator (libtbbmalloc)
|
## 0 (default) : use system default allocator (ptmalloc)
|
||||||
## 0 : use system default allocator (ptmalloc)
|
## 1 : use jemalloc (install jemalloc-devel first)
|
||||||
USE_TBBMALLOC ?= 1
|
USE_JEMALLOC ?= 0
|
||||||
TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
|
ifeq ($(USE_JEMALLOC),1)
|
||||||
ifneq ($(wildcard $(TBBMALLOC_SO)),)
|
LDLIBS := -ljemalloc $(LDLIBS)
|
||||||
TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
|
|
||||||
else
|
|
||||||
TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
|
|
||||||
endif
|
endif
|
||||||
ifeq ($(USE_TBBMALLOC),1)
|
|
||||||
LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
|
|
||||||
endif
|
|
||||||
|
|
||||||
## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
|
|
||||||
## opt : (default) maximum performance with PGO profile-guided optimization
|
|
||||||
## instrument : PGO Phase 1 instrumentation to collect fresh profile data
|
|
||||||
PGO_MODE ?= opt
|
|
||||||
|
|
||||||
## Interp_Points load balance profiling mode
|
## Interp_Points load balance profiling mode
|
||||||
## off : (default) no load balance instrumentation
|
## off : (default) no load balance instrumentation
|
||||||
@@ -76,13 +68,12 @@ USE_TRANSFER_CACHE ?= auto
|
|||||||
## 0 : use original Fortran rungekutta4_rout.o
|
## 0 : use original Fortran rungekutta4_rout.o
|
||||||
USE_CXX_RK4 ?= 1
|
USE_CXX_RK4 ?= 1
|
||||||
|
|
||||||
f90 = ifx
|
f90 = gfortran
|
||||||
f77 = ifx
|
f77 = gfortran
|
||||||
CXX = icpx
|
CXX = g++
|
||||||
CC = icx
|
CC = gcc
|
||||||
CLINKER = mpiicpx
|
CLINKER = mpicxx
|
||||||
|
|
||||||
Cu = nvcc
|
Cu = nvcc
|
||||||
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
|
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
|
||||||
#CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
|
|
||||||
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc
|
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc
|
||||||
|
|||||||
Reference in New Issue
Block a user