From 3a7bce3af24f52e0982c2b6dd1f0bd409527c98f Mon Sep 17 00:00:00 2001
From: CGH0S7 <chengjinyu@hifuu.ink>
Date: Sat, 17 Jan 2026 20:41:02 +0800
Subject: [PATCH] Update Intel oneAPI configuration and CPU binding settings

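   - Update makefile.inc: add -xHost, -qopenmp, -fp-model fast=2 and -fma to the
     Intel oneAPI compiler flags, and link the threaded oneMKL layer
     (mkl_intel_thread + iomp5) instead of mkl_sequential
   - Replace numactl with taskset for CPU binding to the nohz_full cores (4-55, 60-111)
   - Run all builds under that binding with make -j104 (previously -j4 or serial)
   - Raise the MPI process count from 8 to 48 in the input configuration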
---
 AMSS_NCKU_Input.py            |  2 +-
 AMSS_NCKU_source/makefile.inc | 18 ++++++++++++++----
 makefile_and_run.py           | 23 +++++++++++++++--------
 3 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/AMSS_NCKU_Input.py b/AMSS_NCKU_Input.py
index 6bf3589..f288e2a 100755
--- a/AMSS_NCKU_Input.py
+++ b/AMSS_NCKU_Input.py
@@ -16,7 +16,7 @@ import numpy
 File_directory   = "GW150914"                    ## output file directory
 Output_directory = "binary_output"               ## binary data file directory
                                                  ## The file directory name should not be too long
-MPI_processes    = 8                             ## number of mpi processes used in the simulation
+MPI_processes    = 48                             ## number of mpi processes used in the simulation
 
 GPU_Calculation  = "no"                          ## Use GPU or not 
                                                  ## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
diff --git a/AMSS_NCKU_source/makefile.inc b/AMSS_NCKU_source/makefile.inc
index a0bd81f..f881737 100755
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -3,14 +3,24 @@
 ## filein  = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
 ## LDLIBS  = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
 
-## Intel oneAPI version with oneMKL
+## Intel oneAPI version with oneMKL (Optimized for performance)
 filein  = -I/usr/include/ -I${MKLROOT}/include
 
+## Use the Intel OpenMP threading layer of oneMKL (mkl_intel_thread with the iomp5 runtime) for better performance
 LDLIBS  = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -lifcore -limf -lmpi \
-          -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl
+          -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core \
+          -liomp5 -lpthread -lm -ldl
 
-CXXAPPFLAGS  = -O3 -Dfortran3 -Dnewc -I${MKLROOT}/include
-f90appflags  = -O3 -fpp -I${MKLROOT}/include
+## Aggressive optimization flags:
+## -O3: Maximum optimization
+## -xHost: Optimize for the CPU architecture of the build host (Intel/AMD compatible); the binary may not run on CPUs lacking those instructions
+## -qopenmp: Enable OpenMP parallelization
+## -fp-model fast=2: Aggressive floating-point optimizations (relaxes strict floating-point semantics, so results may differ slightly from a precise build)
+## -fma: Enable fused multiply-add instructions
+CXXAPPFLAGS  = -O3 -xHost -qopenmp -fp-model fast=2 -fma \
+               -Dfortran3 -Dnewc -I${MKLROOT}/include
+f90appflags  = -O3 -xHost -qopenmp -fp-model fast=2 -fma \
+               -fpp -I${MKLROOT}/include
 f90          = ifx
 f77          = ifx
 CXX          = icpx
diff --git a/makefile_and_run.py b/makefile_and_run.py
index a814dee..6140f99 100755
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -11,9 +11,16 @@
 import AMSS_NCKU_Input as input_data
 import subprocess
 
-## CPU core binding configuration using numactl
-## Avoid cores 0-3 and 56-59, use cores 4-55 and 60-111
-NUMACTL_CPU_BIND = "numactl --physcpubind=4-55,60-111"
+## CPU core binding configuration using taskset (the variable keeps its earlier numactl-based name)
+## Child processes inherit the CPU affinity mask that taskset sets on the command it launches
+## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111)
+## Format: taskset -c <cpu-list> <command>; the list 4-55,60-111 restricts processes to these cores
+NUMACTL_CPU_BIND = "taskset -c 4-55,60-111"
+
+## Build parallelism configuration
+## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores
+## Set make -j to utilize available cores for faster builds
+BUILD_JOBS = 104
 
 
 ##################################################################
@@ -30,11 +37,11 @@ def makefile_ABE():
     print( " Compiling the AMSS-NCKU executable file ABE/ABEGPU " ) 
     print(                                                        )
 
-    ## Build command
+    ## Build command with CPU binding to nohz_full cores
     if (input_data.GPU_Calculation == "no"):
-        makefile_command  = "make -j4" + " ABE"
+        makefile_command  = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABE"
     elif (input_data.GPU_Calculation == "yes"):
-        makefile_command  = "make -j4" + " ABEGPU"
+        makefile_command  = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU"
     else:
         print( " CPU/GPU numerical calculation setting is wrong " )
         print(                                                    )
@@ -71,8 +78,8 @@ def makefile_TwoPunctureABE():
     print( " Compiling the AMSS-NCKU executable file TwoPunctureABE " )
     print(                                                            )
     
-    ## Build command
-    makefile_command = "make" + " TwoPunctureABE"
+    ## Build command with CPU binding to nohz_full cores
+    makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} TwoPunctureABE"
 
     ## Execute the command with subprocess.Popen and stream output
     makefile_process = subprocess.Popen(makefile_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)