diff --git a/AMSS_NCKU_source/makefile.inc b/AMSS_NCKU_source/makefile.inc index d9fa726..823aab5 100755 --- a/AMSS_NCKU_source/makefile.inc +++ b/AMSS_NCKU_source/makefile.inc @@ -6,10 +6,12 @@ ## Intel oneAPI version with oneMKL (Optimized for performance) filein = -I/usr/include/ -I${MKLROOT}/include -## Using sequential MKL (OpenMP disabled for better single-threaded performance) +## Using multi-threaded MKL for better scalability with MPI +## This allows MKL functions (FFT, BLAS, LAPACK) to use multiple threads internally +## while keeping the application code as pure MPI (no OpenMP pragmas in user code) LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -lifcore -limf -lmpi \ - -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core \ - -lpthread -lm -ldl + -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core \ + -liomp5 -lpthread -lm -ldl ## Aggressive optimization flags: ## -O3: Maximum optimization diff --git a/makefile_and_run.py b/makefile_and_run.py index 6140f99..3a763cc 100755 --- a/makefile_and_run.py +++ b/makefile_and_run.py @@ -22,6 +22,13 @@ NUMACTL_CPU_BIND = "taskset -c 4-55,60-111" ## Set make -j to utilize available cores for faster builds BUILD_JOBS = 104 +## MKL threading configuration for hybrid MPI+threaded-MKL execution +## Total cores available: 104 (cores 4-55, 60-111) +## MPI processes: configured in AMSS_NCKU_Input.py (typically 48) +## MKL threads per process: 104 / MPI_processes ≈ 2 +## This ensures full utilization of available cores +MKL_NUM_THREADS = max(1, 104 // input_data.MPI_processes) + ################################################################## @@ -110,18 +117,24 @@ def makefile_TwoPunctureABE(): def run_ABE(): print( ) - print( " Running the AMSS-NCKU executable file ABE/ABEGPU " ) + print( " Running the AMSS-NCKU executable file ABE/ABEGPU " ) + print( ) + print( f" MPI processes: {input_data.MPI_processes}" ) + print( f" MKL threads per process: {MKL_NUM_THREADS}" ) + print( f" Total threads: {input_data.MPI_processes * MKL_NUM_THREADS}" ) print( ) ## Define the command to run; cast other values to strings as needed - + ## Set MKL threading environment variables for optimal performance + env_vars = f"export MKL_NUM_THREADS={MKL_NUM_THREADS} && export MKL_DYNAMIC=FALSE && " + if (input_data.GPU_Calculation == "no"): - mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" + mpi_command = env_vars + NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" mpi_command_outfile = "ABE_out.log" elif (input_data.GPU_Calculation == "yes"): - mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU" + mpi_command = env_vars + NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU" mpi_command_outfile = "ABEGPU_out.log" - + ## Execute the MPI command and stream output mpi_process = subprocess.Popen(mpi_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) @@ -154,11 +167,12 @@ def run_ABE(): def run_TwoPunctureABE(): print( ) - print( " Running the AMSS-NCKU executable file TwoPunctureABE " ) + print( " Running the AMSS-NCKU executable file TwoPunctureABE " ) print( ) - - ## Define the command to run - TwoPuncture_command = NUMACTL_CPU_BIND + " ./TwoPunctureABE" + + ## Define the command to run with MKL threading configuration + env_vars = f"export MKL_NUM_THREADS={MKL_NUM_THREADS} && export MKL_DYNAMIC=FALSE && " + TwoPuncture_command = env_vars + NUMACTL_CPU_BIND + " ./TwoPunctureABE" TwoPuncture_command_outfile = "TwoPunctureABE_out.log" ## Execute the command with subprocess.Popen and stream output