diff --git a/AMSS_NCKU_source/makefile.inc b/AMSS_NCKU_source/makefile.inc
index d9fa726..823aab5 100755
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -6,10 +6,12 @@
 ## Intel oneAPI version with oneMKL (Optimized for performance)
 filein  = -I/usr/include/ -I${MKLROOT}/include
 
-## Using sequential MKL (OpenMP disabled for better single-threaded performance)
+## Using multi-threaded MKL for better scalability with MPI
+## This allows MKL functions (FFT, BLAS, LAPACK) to use multiple threads internally
+## while keeping the application code as pure MPI (no OpenMP pragmas in user code)
 LDLIBS  = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -lifcore -limf -lmpi \
-          -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core \
-          -lpthread -lm -ldl
+          -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core \
+          -liomp5 -lpthread -lm -ldl
 
 ## Aggressive optimization flags:
 ## -O3: Maximum optimization
diff --git a/makefile_and_run.py b/makefile_and_run.py
index 6140f99..3a763cc 100755
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -22,6 +22,13 @@ NUMACTL_CPU_BIND = "taskset -c 4-55,60-111"
 ## Set make -j to utilize available cores for faster builds
 BUILD_JOBS = 104
 
+## MKL threading configuration for hybrid MPI+threaded-MKL execution
+## Total cores available: 104 (cores 4-55, 60-111)
+## MPI processes: configured in AMSS_NCKU_Input.py (typically 48)
+## MKL threads per process: 104 // MPI_processes (floor division; 2 for 48 processes)
+## Cores are left idle whenever 104 is not an exact multiple of the MPI process count
+MKL_NUM_THREADS = max(1, 104 // input_data.MPI_processes)
+
 
 ##################################################################
 
@@ -110,18 +117,24 @@ def makefile_TwoPunctureABE():
 def run_ABE():
 
     print(                                                      )
-    print( " Running the AMSS-NCKU executable file ABE/ABEGPU " ) 
+    print( " Running the AMSS-NCKU executable file ABE/ABEGPU " )
+    print(                                                      )
+    print( f" MPI processes: {input_data.MPI_processes}" )
+    print( f" MKL threads per process: {MKL_NUM_THREADS}" )
+    print( f" Total threads: {input_data.MPI_processes * MKL_NUM_THREADS}" )
     print(                                                      )
 
     ## Define the command to run; cast other values to strings as needed
-    
+    ## Fix the MKL thread count for the run: export MKL_NUM_THREADS and set MKL_DYNAMIC=FALSE
+    env_vars = f"export MKL_NUM_THREADS={MKL_NUM_THREADS} && export MKL_DYNAMIC=FALSE && "
+
     if (input_data.GPU_Calculation == "no"):
-        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
+        mpi_command         = env_vars + NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
         mpi_command_outfile = "ABE_out.log"
     elif (input_data.GPU_Calculation == "yes"):
-        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
+        mpi_command         = env_vars + NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
         mpi_command_outfile = "ABEGPU_out.log"
- 
+
     ## Execute the MPI command and stream output
     mpi_process = subprocess.Popen(mpi_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
 
@@ -154,11 +167,12 @@ def run_ABE():
 def run_TwoPunctureABE():
 
     print(                                                          )
-    print( " Running the AMSS-NCKU executable file TwoPunctureABE " ) 
+    print( " Running the AMSS-NCKU executable file TwoPunctureABE " )
     print(                                                          )
-    
-    ## Define the command to run
-    TwoPuncture_command         = NUMACTL_CPU_BIND + " ./TwoPunctureABE"
+
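+    ## Define the command to run (single process, no mpirun) with MKL threading configuration; NOTE(review): MKL_NUM_THREADS is sized per MPI rank -- confirm it is intended here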
+    env_vars = f"export MKL_NUM_THREADS={MKL_NUM_THREADS} && export MKL_DYNAMIC=FALSE && "
+    TwoPuncture_command         = env_vars + NUMACTL_CPU_BIND + " ./TwoPunctureABE"
     TwoPuncture_command_outfile = "TwoPunctureABE_out.log"
 
     ## Execute the command with subprocess.Popen and stream output