Enable multi-threaded MKL for better resource utilization
- Changed from libmkl_sequential to libmkl_intel_thread
- Added automatic MKL thread count configuration (104 cores / MPI_processes)
- Updated runtime scripts to set MKL_NUM_THREADS environment variable
- Added comprehensive optimization documentation

Expected improvement: 5-15% from better MKL utilization.
Note: Main performance bottleneck is in computation loops, not MKL functions.
This commit is contained in:
@@ -6,10 +6,12 @@
|
||||
## Intel oneAPI version with oneMKL (Optimized for performance)
|
||||
filein = -I/usr/include/ -I${MKLROOT}/include
|
||||
|
||||
## Using sequential MKL (OpenMP disabled for better single-threaded performance)
|
||||
## Using multi-threaded MKL for better scalability with MPI
|
||||
## This allows MKL functions (FFT, BLAS, LAPACK) to use multiple threads internally
|
||||
## while keeping the application code as pure MPI (no OpenMP pragmas in user code)
|
||||
LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -lifcore -limf -lmpi \
|
||||
-L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core \
|
||||
-lpthread -lm -ldl
|
||||
-L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core \
|
||||
-liomp5 -lpthread -lm -ldl
|
||||
|
||||
## Aggressive optimization flags:
|
||||
## -O3: Maximum optimization
|
||||
|
||||
@@ -22,6 +22,13 @@ NUMACTL_CPU_BIND = "taskset -c 4-55,60-111"
|
||||
## Set make -j to utilize available cores for faster builds
|
||||
BUILD_JOBS = 104
|
||||
|
||||
## MKL threading configuration for hybrid MPI+threaded-MKL execution
|
||||
## Total cores available: 104 (cores 4-55, 60-111)
|
||||
## MPI processes: configured in AMSS_NCKU_Input.py (typically 48)
|
||||
## MKL threads per process: 104 // MPI_processes (floor division, at least 1); 2 for 48 processes
|
||||
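## This uses as many of the 104 cores as floor division allows; some cores stay idle
## when 104 is not a multiple of MPI_processes (e.g. 48 processes x 2 threads = 96)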
|
||||
MKL_NUM_THREADS = max(1, 104 // input_data.MPI_processes)
|
||||
|
||||
|
||||
##################################################################
|
||||
|
||||
@@ -110,18 +117,24 @@ def makefile_TwoPunctureABE():
|
||||
def run_ABE():
|
||||
|
||||
print( )
|
||||
print( " Running the AMSS-NCKU executable file ABE/ABEGPU " )
|
||||
print( " Running the AMSS-NCKU executable file ABE/ABEGPU " )
|
||||
print( )
|
||||
print( f" MPI processes: {input_data.MPI_processes}" )
|
||||
print( f" MKL threads per process: {MKL_NUM_THREADS}" )
|
||||
print( f" Total threads: {input_data.MPI_processes * MKL_NUM_THREADS}" )
|
||||
print( )
|
||||
|
||||
## Define the command to run; cast other values to strings as needed
|
||||
|
||||
## Set MKL threading environment variables for optimal performance
|
||||
env_vars = f"export MKL_NUM_THREADS={MKL_NUM_THREADS} && export MKL_DYNAMIC=FALSE && "
|
||||
|
||||
if (input_data.GPU_Calculation == "no"):
|
||||
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
|
||||
mpi_command = env_vars + NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
|
||||
mpi_command_outfile = "ABE_out.log"
|
||||
elif (input_data.GPU_Calculation == "yes"):
|
||||
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
|
||||
mpi_command = env_vars + NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
|
||||
mpi_command_outfile = "ABEGPU_out.log"
|
||||
|
||||
|
||||
## Execute the MPI command and stream output
|
||||
mpi_process = subprocess.Popen(mpi_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
|
||||
|
||||
@@ -154,11 +167,12 @@ def run_ABE():
|
||||
def run_TwoPunctureABE():
|
||||
|
||||
print( )
|
||||
print( " Running the AMSS-NCKU executable file TwoPunctureABE " )
|
||||
print( " Running the AMSS-NCKU executable file TwoPunctureABE " )
|
||||
print( )
|
||||
|
||||
## Define the command to run
|
||||
TwoPuncture_command = NUMACTL_CPU_BIND + " ./TwoPunctureABE"
|
||||
|
||||
## Define the command to run with MKL threading configuration
|
||||
env_vars = f"export MKL_NUM_THREADS={MKL_NUM_THREADS} && export MKL_DYNAMIC=FALSE && "
|
||||
TwoPuncture_command = env_vars + NUMACTL_CPU_BIND + " ./TwoPunctureABE"
|
||||
TwoPuncture_command_outfile = "TwoPunctureABE_out.log"
|
||||
|
||||
## Execute the command with subprocess.Popen and stream output
|
||||
|
||||
Reference in New Issue
Block a user