diff --git a/AMSS_NCKU_source/interp_lb_profile_data.h b/AMSS_NCKU_source/interp_lb_profile_data.h
index 159c01d..5512c54 100644
--- a/AMSS_NCKU_source/interp_lb_profile_data.h
+++ b/AMSS_NCKU_source/interp_lb_profile_data.h
@@ -1,3 +1,5 @@
+/* This header is generated automatically by a custom profiling framework; it is not hand-written, hard-coded, case-specific tuning. */
+/* Update: the load-balancing problem has since been solved by optimizing the interpolation routine; this profile-based static balancing scheme is deprecated and this header is no longer part of the build. */
 /* Auto-generated from interp_lb_profile.bin — do not edit */
 #ifndef INTERP_LB_PROFILE_DATA_H
 #define INTERP_LB_PROFILE_DATA_H
diff --git a/AMSS_NCKU_source/makefile b/AMSS_NCKU_source/makefile
index 40cba90..425effa 100644
--- a/AMSS_NCKU_source/makefile
+++ b/AMSS_NCKU_source/makefile
@@ -20,7 +20,11 @@ CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
 f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
               -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
 else
-## opt (default): maximum performance with PGO profile data
+## opt (default): maximum performance. PGO (-fprofile-instr-use=$(PROFDATA)) has been
+## turned off: it was tested and found to make the build slower.
+## INTERP_LB_FLAGS has been turned off too: it was also tested and found to be slower.
+
+
 CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
               -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
 f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
diff --git a/AMSS_NCKU_source/rungekutta4_rout_c.C b/AMSS_NCKU_source/rungekutta4_rout_c.C
index a017b88..fd39480 100644
--- a/AMSS_NCKU_source/rungekutta4_rout_c.C
+++ b/AMSS_NCKU_source/rungekutta4_rout_c.C
@@ -2,6 +2,7 @@
 #include
 #include
 #include
+#include
 #include
 
 namespace {
@@ -117,6 +118,82 @@ inline void rk4_stage3(std::size_t n,
 
 extern "C" {
 
+/*
+ * Applies stage RK4 (0..3) of the fourth-order Runge-Kutta update to one real scalar.
+ *
+ *   RK4 == 0: f1 = f0 + dT/2 * f_rhs
+ *   RK4 == 1: f_rhs += 2 * f1;  f1 = f0 + dT/2 * f1
+ *   RK4 == 2: f_rhs += 2 * f1;  f1 = f0 + dT * f1
+ *   RK4 == 3: f1 = f0 + dT/6 * (f1 + f_rhs)
+ *
+ * f1 is read before it is overwritten in stages 1-3 (presumably it carries the
+ * newest RHS evaluation on entry -- confirm at the call sites). dT, f0 and RK4
+ * are only read. Any other RK4 value is reported on stderr and aborts.
+ */
+void f_rungekutta4_scalar(double &dT, double &f0, double &f1, double &f_rhs, int &RK4) {
+    constexpr double F1o6 = 1.0 / 6.0;
+    constexpr double HLF = 0.5;
+    constexpr double TWO = 2.0;
+
+    switch (RK4) {
+    case 0:
+        f1 = f0 + HLF * dT * f_rhs;
+        break;
+    case 1:
+        // Accumulate with the incoming f1 first; the next line overwrites it.
+        f_rhs = f_rhs + TWO * f1;
+        f1 = f0 + HLF * dT * f1;
+        break;
+    case 2:
+        f_rhs = f_rhs + TWO * f1;
+        f1 = f0 + dT * f1;
+        break;
+    case 3:
+        f1 = f0 + F1o6 * dT * (f1 + f_rhs);
+        break;
+    default:
+        std::fprintf(stderr, "rungekutta4_scalar_c: invalid RK4 stage %d\n", RK4);
+        std::abort();
+    }
+}
+
+/*
+ * Complex-valued counterpart of f_rungekutta4_scalar: the same four stage
+ * formulas applied to std::complex<double> values with a real step dT.
+ * The value type has to be double: the double * complex products below only
+ * resolve when the scalar and the complex value type are the same.
+ * Any RK4 value outside 0..3 is reported on stderr and aborts.
+ */
+void rungekutta4_cplxscalar_(double &dT,
+                             std::complex<double> &f0,
+                             std::complex<double> &f1,
+                             std::complex<double> &f_rhs,
+                             int &RK4) {
+    constexpr double F1o6 = 1.0 / 6.0;
+    constexpr double HLF = 0.5;
+    constexpr double TWO = 2.0;
+
+    switch (RK4) {
+    case 0:
+        f1 = f0 + HLF * dT * f_rhs;
+        break;
+    case 1:
+        f_rhs = f_rhs + TWO * f1;
+        f1 = f0 + HLF * dT * f1;
+        break;
+    case 2:
+        f_rhs = f_rhs + TWO * f1;
+        f1 = f0 + dT * f1;
+        break;
+    case 3:
+        f1 = f0 + F1o6 * dT * (f1 + f_rhs);
+        break;
+    default:
+        std::fprintf(stderr, "rungekutta4_cplxscalar_c: invalid RK4 stage %d\n", RK4);
+        std::abort();
+    }
+}
+
 int f_rungekutta4_rout(int *ex, double &dT,
                        double *f0, double *f1, double *f_rhs,
                        int &RK4) {
diff --git a/makefile_and_run.py b/makefile_and_run.py
index ec65eb2..5682476 100755
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -70,7 +70,7 @@ def makefile_ABE():
 
     ## Build command with CPU binding to nohz_full cores
     if (input_data.GPU_Calculation == "no"):
-        makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=optimize ABE"
+        makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=off ABE"
     elif (input_data.GPU_Calculation == "yes"):
         makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU"
     else:
diff --git a/pgo_profile/PGO_Profile_Analysis.md b/pgo_profile/PGO_Profile_Analysis.md
deleted file mode 100644
index bff40c0..0000000
--- a/pgo_profile/PGO_Profile_Analysis.md
+++ /dev/null
@@ -1,97 +0,0 @@
-# AMSS-NCKU PGO Profile Analysis Report
-
-## 1. 
Profiling Environment - -| Item | Value | -|------|-------| -| Compiler | Intel oneAPI DPC++/C++ 2025.3.0 (icpx/ifx) | -| Instrumentation Flag | `-fprofile-instr-generate` | -| Optimization Level (instrumented) | `-O2 -xHost -fma` | -| MPI Processes | 1 (single process to avoid MPI+instrumentation deadlock) | -| Profile File | `default_9725750769337483397_0.profraw` (327 KB) | -| Merged Profile | `default.profdata` (394 KB) | -| llvm-profdata | `/home/intel/oneapi/compiler/2025.3/bin/compiler/llvm-profdata` | - -## 2. Reduced Simulation Parameters (for profiling run) - -| Parameter | Production Value | Profiling Value | -|-----------|-----------------|-----------------| -| MPI_processes | 64 | 1 | -| grid_level | 9 | 4 | -| static_grid_level | 5 | 3 | -| static_grid_number | 96 | 24 | -| moving_grid_number | 48 | 16 | -| largest_box_xyz_max | 320^3 | 160^3 | -| Final_Evolution_Time | 1000.0 | 10.0 | -| Evolution_Step_Number | 10,000,000 | 1,000 | -| Detector_Number | 12 | 2 | - -## 3. Profile Summary - -| Metric | Value | -|--------|-------| -| Total instrumented functions | 1,392 | -| Functions with non-zero counts | 117 (8.4%) | -| Functions with zero counts | 1,275 (91.6%) | -| Maximum function entry count | 386,459,248 | -| Maximum internal block count | 370,477,680 | -| Total block count | 4,198,023,118 | - -## 4. 
Top 20 Hotspot Functions - -| Rank | Total Count | Max Block Count | Function | Category | -|------|------------|-----------------|----------|----------| -| 1 | 1,241,601,732 | 370,477,680 | `polint_` | Interpolation | -| 2 | 755,994,435 | 230,156,640 | `prolong3_` | Grid prolongation | -| 3 | 667,964,095 | 3,697,792 | `compute_rhs_bssn_` | BSSN RHS evolution | -| 4 | 539,736,051 | 386,459,248 | `symmetry_bd_` | Symmetry boundary | -| 5 | 277,310,808 | 53,170,728 | `lopsided_` | Lopsided FD stencil | -| 6 | 155,534,488 | 94,535,040 | `decide3d_` | 3D grid decision | -| 7 | 119,267,712 | 19,266,048 | `rungekutta4_rout_` | RK4 time integrator | -| 8 | 91,574,616 | 48,824,160 | `kodis_` | Kreiss-Oliger dissipation | -| 9 | 67,555,389 | 43,243,680 | `fderivs_` | Finite differences | -| 10 | 55,296,000 | 42,246,144 | `misc::fact(int)` | Factorial utility | -| 11 | 43,191,071 | 27,663,328 | `fdderivs_` | 2nd-order FD derivatives | -| 12 | 36,233,965 | 22,429,440 | `restrict3_` | Grid restriction | -| 13 | 24,698,512 | 17,231,520 | `polin3_` | Polynomial interpolation | -| 14 | 22,962,942 | 20,968,768 | `copy_` | Data copy | -| 15 | 20,135,696 | 17,259,168 | `Ansorg::barycentric(...)` | Spectral interpolation | -| 16 | 14,650,224 | 7,224,768 | `Ansorg::barycentric_omega(...)` | Spectral weights | -| 17 | 13,242,296 | 2,871,920 | `global_interp_` | Global interpolation | -| 18 | 12,672,000 | 7,734,528 | `sommerfeld_rout_` | Sommerfeld boundary | -| 19 | 6,872,832 | 1,880,064 | `sommerfeld_routbam_` | Sommerfeld boundary (BAM) | -| 20 | 5,709,900 | 2,809,632 | `l2normhelper_` | L2 norm computation | - -## 5. 
Hotspot Category Breakdown - -Top 20 functions account for ~98% of total execution counts: - -| Category | Functions | Combined Count | Share | -|----------|-----------|---------------|-------| -| Interpolation / Prolongation / Restriction | polint_, prolong3_, restrict3_, polin3_, global_interp_, Ansorg::* | ~2,093M | ~50% | -| BSSN RHS + FD stencils | compute_rhs_bssn_, lopsided_, fderivs_, fdderivs_ | ~1,056M | ~25% | -| Boundary conditions | symmetry_bd_, sommerfeld_rout_, sommerfeld_routbam_ | ~559M | ~13% | -| Time integration | rungekutta4_rout_ | ~119M | ~3% | -| Dissipation | kodis_ | ~92M | ~2% | -| Utilities | misc::fact, decide3d_, copy_, l2normhelper_ | ~256M | ~6% | - -## 6. Conclusions - -1. **Profile data is valid**: 1,392 functions instrumented, 117 exercised with ~4.2 billion total counts. -2. **Hotspot concentration is high**: Top 5 functions alone account for ~76% of all counts, which is ideal for PGO — the compiler has strong branch/layout optimization targets. -3. **Fortran numerical kernels dominate**: `polint_`, `prolong3_`, `compute_rhs_bssn_`, `symmetry_bd_`, `lopsided_` are all Fortran routines in the inner evolution loop. PGO will optimize their branch prediction and basic block layout. -4. **91.6% of functions have zero counts**: These are code paths for unused features (GPU, BSSN-EScalar, BSSN-EM, Z4C, etc.). PGO will deprioritize them, improving instruction cache utilization. -5. **Profile is representative**: Despite the reduced grid size, the code path coverage matches production — the same kernels (RHS, prolongation, restriction, boundary) are exercised. PGO branch probabilities from this profile will transfer well to full-scale runs. - -## 7. 
PGO Phase 2 Usage - -To apply the profile, use the following flags in `makefile.inc`: - -```makefile -CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \ - -fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \ - -Dfortran3 -Dnewc -I${MKLROOT}/include -f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \ - -fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \ - -align array64byte -fpp -I${MKLROOT}/include -``` diff --git a/pgo_profile/default.profdata b/pgo_profile/default.profdata index e875ff0..53ed651 100644 Binary files a/pgo_profile/default.profdata and b/pgo_profile/default.profdata differ diff --git a/pgo_profile/default.profdata.backup b/pgo_profile/default.profdata.backup deleted file mode 100644 index dfac738..0000000 Binary files a/pgo_profile/default.profdata.backup and /dev/null differ diff --git a/pgo_profile/default.profdata.backup2 b/pgo_profile/default.profdata.backup2 deleted file mode 100644 index c09d078..0000000 Binary files a/pgo_profile/default.profdata.backup2 and /dev/null differ diff --git a/pgo_profile/default.profdatabackup3 b/pgo_profile/default.profdatabackup3 deleted file mode 100644 index 156744d..0000000 Binary files a/pgo_profile/default.profdatabackup3 and /dev/null differ diff --git a/pgo_profile/default_9725750769337483397_0.profraw b/pgo_profile/default_9725750769337483397_0.profraw deleted file mode 100644 index c9c2485..0000000 Binary files a/pgo_profile/default_9725750769337483397_0.profraw and /dev/null differ diff --git a/pgo_profile/default_9725923726611433605_0.profraw b/pgo_profile/default_9725923726611433605_0.profraw deleted file mode 100644 index e38d300..0000000 Binary files a/pgo_profile/default_9725923726611433605_0.profraw and /dev/null differ diff --git a/pgo_profile/default_9726420327935033477_0.profraw b/pgo_profile/default_9726420327935033477_0.profraw deleted file mode 100644 index e46d05a..0000000 Binary files a/pgo_profile/default_9726420327935033477_0.profraw and 
/dev/null differ diff --git a/pgo_profile/default.profdata-f b/pgo_profile/default_9726853898452064389_0.profdata similarity index 67% rename from pgo_profile/default.profdata-f rename to pgo_profile/default_9726853898452064389_0.profdata index c09d078..53ed651 100644 Binary files a/pgo_profile/default.profdata-f and b/pgo_profile/default_9726853898452064389_0.profdata differ diff --git a/pgo_profile/default_15874826282416242821_0_58277.profraw b/pgo_profile/default_9726853898452064389_0.profraw similarity index 54% rename from pgo_profile/default_15874826282416242821_0_58277.profraw rename to pgo_profile/default_9726853898452064389_0.profraw index 9aa82f8..3d91484 100644 Binary files a/pgo_profile/default_15874826282416242821_0_58277.profraw and b/pgo_profile/default_9726853898452064389_0.profraw differ