import os
import subprocess


def _run_lscpu():
    """Return the raw output of ``lscpu --parse=NODE,CPU``, or "" on failure."""
    try:
        result = subprocess.run(
            ["lscpu", "--parse=NODE,CPU"],
            capture_output=True,
            text=True,
            check=True,
        )
    except (OSError, subprocess.SubprocessError):
        # lscpu is missing or exited non-zero; the caller falls back.
        return ""
    return result.stdout


def _parse_node_cpus(lscpu_output):
    """Parse ``NODE,CPU`` lines into a dict: node id -> sorted list of CPU ids."""
    node_cpus = {}
    for line in lscpu_output.splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        parts = line.split(",")
        if len(parts) < 2:
            continue
        try:
            cpu_id = int(parts[1])
        except ValueError:
            continue
        try:
            node_id = int(parts[0])
        except ValueError:
            # An empty NODE column means no NUMA info; use a single node 0.
            node_id = 0
        node_cpus.setdefault(node_id, set()).add(cpu_id)
    return {node: sorted(cpus) for node, cpus in node_cpus.items()}


def _fallback_cpus():
    """Return the CPU ids usable by this process when lscpu gives no topology."""
    try:
        return sorted(os.sched_getaffinity(0))
    except AttributeError:
        # sched_getaffinity is not available on every platform.
        return list(range(os.cpu_count() or 1))


def _format_cpu_ranges(cpus):
    """Compress a sorted, non-empty list of CPU ids into ``a-b,c,d-e`` form."""
    ranges = []
    start = prev = cpus[0]
    for cpu in cpus[1:]:
        if cpu != prev + 1:
            ranges.append((start, prev))
            start = cpu
        prev = cpu
    ranges.append((start, prev))
    return ",".join(str(lo) if lo == hi else f"{lo}-{hi}" for lo, hi in ranges)


def _count_bound_cpus(binding):
    """Count the CPUs named in a ``taskset -c <cpu-list>`` string."""
    cpu_list = binding.rsplit(" ", 1)[-1]
    count = 0
    for segment in cpu_list.split(","):
        low, _, high = segment.partition("-")
        count += int(high or low) - int(low) + 1
    return count


def get_last_n_cores_per_socket(n=32, *, lscpu_output=None):
    """
    Return a ``taskset -c`` prefix selecting the last `n` CPUs of each NUMA node.

    The topology is read from ``lscpu --parse=NODE,CPU``. Each node's CPU ids
    are sorted and the highest `n` are kept. Non-contiguous ids (for example
    with SMT numbering) are emitted as several ranges, so the list never
    includes CPUs that belong to another node.

    Example: 2 nodes x 56 CPUs each, n=32 -> node0: 24-55, node1: 80-111
             -> "taskset -c 24-55,80-111"

    Args:
        n: Number of CPUs to keep per node; must be >= 1. A node with fewer
            than `n` CPUs contributes all of them.
        lscpu_output: Optional pre-captured ``lscpu --parse=NODE,CPU`` text.
            When None, lscpu is executed.

    Returns:
        A string of the form ``"taskset -c <cpu-list>"``.

    Raises:
        ValueError: If `n` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n}")

    if lscpu_output is None:
        lscpu_output = _run_lscpu()

    node_cpus = _parse_node_cpus(lscpu_output)
    if not node_cpus:
        # No usable topology: treat every CPU this process may use as one node.
        print(" CPU binding: lscpu topology unavailable, using a single node")
        node_cpus = {0: _fallback_cpus()}

    segments = []
    total = 0
    for node_id in sorted(node_cpus):
        selected = node_cpus[node_id][-n:]  # highest n CPU ids of this node
        total += len(selected)
        segments.append(_format_cpu_ranges(selected))

    cpu_str = ",".join(segments)
    print(f" CPU binding: taskset -c {cpu_str} ({total} cores, last {n} per socket)")
    return f"taskset -c {cpu_str}"


## CPU core binding: dynamically select the last 32 cores of each socket
NUMACTL_CPU_BIND = get_last_n_cores_per_socket(n=32)

## Build parallelism: match the number of bound cores
## (64 on a machine with two sockets of at least 32 cores each)
BUILD_JOBS = _count_bound_cpus(NUMACTL_CPU_BIND)

##################################################################