# Patch: Interp_Points load-balance profile support.
#
# Files touched:
#   AMSS_NCKU_Program.py                       copy interp_lb_profile.bin into the run directory when present
#   AMSS_NCKU_source/interp_lb_profile.bin     new binary file (only a "Binary files differ" stub, no data)
#   AMSS_NCKU_source/interp_lb_profile_data.h  generated header; same banner as generate_interp_lb_header.py writes
#   AMSS_NCKU_source/makefile                  add $(INTERP_LB_FLAGS) to CXXAPPFLAGS, build/link interp_lb_profile.o
#   AMSS_NCKU_source/makefile.inc              INTERP_LB_MODE switch: off (default) | profile | optimize
#   generate_interp_lb_header.py               converts the binary profile into the C header
#   makefile_and_run.py                        CPU build always passes INTERP_LB_MODE=optimize
#
# Review changes, all in generate_interp_lb_header.py (output is unchanged
# for any valid profile with at least one heavy rank):
#   - usage message now names both arguments
#   - header is written as UTF-8 explicitly (the banner contains a non-ASCII dash)
#   - empty heavy/split tables get a -1 placeholder row, like the remap table
#   - truncated files and out-of-range heavy ranks give a clear error
#   - logic moved into functions behind a __main__ guard
#   - its "index" line is dropped because the blob hash no longer applies
#
# NOTE(review): line breaks and indentation of this patch were restored by
#   inference; if context lines do not match the tree, apply with
#   `git apply --ignore-whitespace` (or `patch -l`).
# NOTE(review): the new makefile rule depends on interp_lb_profile.C and
#   interp_lb_profile.h, which the visible part of this patch does not add --
#   confirm they exist. If interp_lb_profile.C includes
#   interp_lb_profile_data.h, that header presumably belongs in the
#   prerequisite list as well.
# NOTE(review): makefile_and_run.py hard-wires INTERP_LB_MODE=optimize, so the
#   "off" default in makefile.inc is never used by that driver -- confirm intended.
diff --git a/AMSS_NCKU_Program.py b/AMSS_NCKU_Program.py
index 6a7952a..2d777cd 100755
--- a/AMSS_NCKU_Program.py
+++ b/AMSS_NCKU_Program.py
@@ -270,6 +270,12 @@ if not os.path.exists( ABE_file ):
 ## Copy the executable ABE (or ABEGPU) into the run directory
 shutil.copy2(ABE_file, output_directory)
 
+## Copy interp load balance profile if present (for optimize pass)
+interp_lb_profile = os.path.join(AMSS_NCKU_source_copy, "interp_lb_profile.bin")
+if os.path.exists(interp_lb_profile):
+    shutil.copy2(interp_lb_profile, output_directory)
+    print( " Copied interp_lb_profile.bin to run directory " )
+
 ###########################
 
 ## If the initial-data method is TwoPuncture, copy the TwoPunctureABE executable to the run directory
diff --git a/AMSS_NCKU_source/interp_lb_profile.bin b/AMSS_NCKU_source/interp_lb_profile.bin
new file mode 100644
index 0000000..4356b28
Binary files /dev/null and b/AMSS_NCKU_source/interp_lb_profile.bin differ
diff --git a/AMSS_NCKU_source/interp_lb_profile_data.h b/AMSS_NCKU_source/interp_lb_profile_data.h
new file mode 100644
index 0000000..159c01d
--- /dev/null
+++ b/AMSS_NCKU_source/interp_lb_profile_data.h
@@ -0,0 +1,27 @@
+/* Auto-generated from interp_lb_profile.bin — do not edit */
+#ifndef INTERP_LB_PROFILE_DATA_H
+#define INTERP_LB_PROFILE_DATA_H
+
+#define INTERP_LB_NPROCS 64
+#define INTERP_LB_NUM_HEAVY 4
+
+static const int interp_lb_heavy_blocks[4] = {27, 35, 28, 36};
+
+/* Split table: {block_id, r_left, r_right} */
+static const int interp_lb_splits[4][3] = {
+    {27, 26, 27},
+    {35, 34, 35},
+    {28, 28, 29},
+    {36, 36, 37},
+};
+
+/* Rank remap for displaced neighbor blocks */
+static const int interp_lb_num_remaps = 4;
+static const int interp_lb_remaps[][2] = {
+    {26, 25},
+    {29, 30},
+    {34, 33},
+    {37, 38},
+};
+
+#endif /* INTERP_LB_PROFILE_DATA_H */
diff --git a/AMSS_NCKU_source/makefile b/AMSS_NCKU_source/makefile
index 34dd9f9..7849f37 100644
--- a/AMSS_NCKU_source/makefile
+++ b/AMSS_NCKU_source/makefile
@@ -10,14 +10,14 @@ PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
 ifeq ($(PGO_MODE),instrument)
 ## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability
 CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-              -Dfortran3 -Dnewc -I${MKLROOT}/include
+              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
 f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
               -align array64byte -fpp -I${MKLROOT}/include
 else
 ## opt (default): maximum performance with PGO profile data
 CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
               -fprofile-instr-use=$(PROFDATA) \
-              -Dfortran3 -Dnewc -I${MKLROOT}/include
+              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
 f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
               -fprofile-instr-use=$(PROFDATA) \
               -align array64byte -fpp -I${MKLROOT}/include
@@ -53,6 +53,9 @@ kodiss_c.o: kodiss_c.C
 lopsided_c.o: lopsided_c.C
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 
+interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
+	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
+
 ## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
 TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
 TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
@@ -81,7 +84,7 @@ C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
           bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
           bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
           Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
-          NullShellPatch2_Evo.o writefile_f.o
+          NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o
 
 C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
           cgh.o surface_integral.o ShellPatch.o\
diff --git a/AMSS_NCKU_source/makefile.inc b/AMSS_NCKU_source/makefile.inc
index a786fd7..f05799a 100755
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -15,6 +15,20 @@ LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore
 ## instrument : PGO Phase 1 instrumentation to collect fresh profile data
 PGO_MODE ?= opt
 
+## Interp_Points load balance profiling mode
+## off : (default) no load balance instrumentation
+## profile : Pass 1 — instrument Interp_Points to collect timing profile
+## optimize : Pass 2 — read profile and apply block rebalancing
+INTERP_LB_MODE ?= off
+
+ifeq ($(INTERP_LB_MODE),profile)
+INTERP_LB_FLAGS = -DINTERP_LB_PROFILE
+else ifeq ($(INTERP_LB_MODE),optimize)
+INTERP_LB_FLAGS = -DINTERP_LB_OPTIMIZE
+else
+INTERP_LB_FLAGS =
+endif
+
 ## Kernel implementation switch
 ## 1 (default) : use C++ rewrite of bssn_rhs and helper kernels (faster)
 ## 0 : fall back to original Fortran kernels
diff --git a/generate_interp_lb_header.py b/generate_interp_lb_header.py
new file mode 100644
--- /dev/null
+++ b/generate_interp_lb_header.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+"""Convert interp_lb_profile.bin to a C header for compile-time embedding.
+
+Usage: generate_interp_lb_header.py <interp_lb_profile.bin> <output_header.h>
+"""
+import struct
+import sys
+
+
+def read_profile(path):
+    """Read the binary profile and return (nprocs, times, heavy).
+
+    Layout, in native byte order: unsigned magic, unsigned version,
+    int nprocs, int num_heavy, double threshold, nprocs doubles (per-rank
+    times), num_heavy ints (heavy rank ids).
+
+    Raises ValueError when the file is truncated or internally inconsistent.
+    """
+    with open(path, 'rb') as f:
+        data = f.read()
+    try:
+        # magic/version are skipped: this script has no reference values.
+        _magic, _version, nprocs, num_heavy = struct.unpack_from('IIii', data, 0)
+        if nprocs < 0 or num_heavy < 0:
+            raise ValueError(f"{path}: negative count in profile header")
+        # Offset 24 = 16-byte header + 8-byte threshold (unused here).
+        times = list(struct.unpack_from(f'{nprocs}d', data, 24))
+        heavy = list(struct.unpack_from(f'{num_heavy}i', data, 24 + 8 * nprocs))
+    except struct.error as err:
+        raise ValueError(f"{path}: truncated or malformed profile ({err})") from err
+    bad = [hr for hr in heavy if not 0 <= hr < nprocs]
+    if bad:
+        raise ValueError(f"{path}: heavy rank(s) {bad} outside 0..{nprocs - 1}")
+    if heavy and nprocs < 2:
+        raise ValueError(f"{path}: splitting a block needs at least 2 ranks")
+    return nprocs, times, heavy
+
+
+def compute_splits(times, heavy, nprocs):
+    """Return one (block_id, r_left, r_right) tuple per heavy rank.
+
+    Each heavy block is shared with the lighter of its two neighbours (the
+    lower one on a tie); an edge rank uses its only neighbour.
+    """
+    splits = []
+    for hr in heavy:
+        use_prev = hr > 0 and (hr == nprocs - 1 or times[hr - 1] <= times[hr + 1])
+        splits.append((hr, hr - 1, hr) if use_prev else (hr, hr, hr + 1))
+    return splits
+
+
+def compute_remaps(splits, nprocs):
+    """Return {displaced_block: new_rank} for neighbours that took a half block.
+
+    The displaced block moves one rank further away from the heavy rank.  If
+    that rank does not exist or is itself heavy, it falls back to the rank on
+    the other side, which is the heavy rank it was helping.
+    """
+    heavy_ranks = {s[0] for s in splits}  # built once, O(1) membership tests
+    remaps = {}
+    for hr, r_l, r_r in splits:
+        displaced, away = (r_l, -1) if r_l != hr else (r_r, 1)
+        outer = displaced + away
+        if 0 <= outer < nprocs and outer not in heavy_ranks:
+            remaps[displaced] = outer
+        elif 0 <= displaced - away < nprocs:
+            remaps[displaced] = displaced - away
+    return remaps
+
+
+def write_header(path, nprocs, heavy, splits, remaps):
+    """Write the tables as a C header at *path*."""
+    # Zero-length arrays are not valid ISO C/C++, so an empty table gets one
+    # -1 placeholder row; readers must go by the NUM_HEAVY / num_remaps counts.
+    heavy_len = max(len(heavy), 1)
+    heavy_vals = ", ".join(str(h) for h in heavy) or "-1"
+    split_rows = splits or [(-1, -1, -1)]
+    remap_rows = sorted(remaps.items()) or [(-1, -1)]
+    lines = [
+        "/* Auto-generated from interp_lb_profile.bin — do not edit */",
+        "#ifndef INTERP_LB_PROFILE_DATA_H",
+        "#define INTERP_LB_PROFILE_DATA_H",
+        "",
+        f"#define INTERP_LB_NPROCS {nprocs}",
+        f"#define INTERP_LB_NUM_HEAVY {len(heavy)}",
+        "",
+        f"static const int interp_lb_heavy_blocks[{heavy_len}] = {{{heavy_vals}}};",
+        "",
+        "/* Split table: {block_id, r_left, r_right} */",
+        f"static const int interp_lb_splits[{heavy_len}][3] = {{",
+        *(f"    {{{bid}, {rl}, {rr}}}," for bid, rl, rr in split_rows),
+        "};",
+        "",
+        "/* Rank remap for displaced neighbor blocks */",
+        f"static const int interp_lb_num_remaps = {len(remaps)};",
+        "static const int interp_lb_remaps[][2] = {",
+        *(f"    {{{src}, {dst}}}," for src, dst in remap_rows),
+        "};",
+        "",
+        "#endif /* INTERP_LB_PROFILE_DATA_H */",
+    ]
+    # The banner holds a non-ASCII dash, so pin UTF-8 rather than rely on the
+    # locale's default encoding.
+    with open(path, 'w', encoding='utf-8') as out:
+        out.write("\n".join(lines) + "\n")
+
+
+def main(argv):
+    """Command-line entry point; returns the process exit status."""
+    if len(argv) < 3:
+        print(f"Usage: {argv[0]} <interp_lb_profile.bin> <output_header.h>")
+        return 1
+    try:
+        nprocs, times, heavy = read_profile(argv[1])
+    except (OSError, ValueError) as err:
+        print(f"error: {err}", file=sys.stderr)
+        return 1
+    splits = compute_splits(times, heavy, nprocs)
+    remaps = compute_remaps(splits, nprocs)
+    write_header(argv[2], nprocs, heavy, splits, remaps)
+
+    print(f"Generated {argv[2]}:")
+    print(f"  {len(heavy)} heavy blocks to split: {heavy}")
+    for bid, rl, rr in splits:
+        print(f"    block {bid}: split -> rank {rl} (left), rank {rr} (right)")
+    for src, dst in sorted(remaps.items()):
+        print(f"    block {src}: remap -> rank {dst}")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv))
diff --git a/makefile_and_run.py b/makefile_and_run.py
index 157ef76..1a0b937 100755
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -69,7 +69,7 @@ def makefile_ABE():
 
     ## Build command with CPU binding to nohz_full cores
     if (input_data.GPU_Calculation == "no"):
-        makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABE"
+        makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=optimize ABE"
     elif (input_data.GPU_Calculation == "yes"):
         makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU"
     else: