From 5f664716ab0e4be77bff75e6fc16286ee4712359 Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Mon, 30 Mar 2026 20:34:34 +0800 Subject: [PATCH] Enable OpenMP task parallelism for C kernels --- AMSS_NCKU_Input.py | 19 +++-- AMSS_NCKU_source/bssn_rhs_c.C | 125 ++++++++++++++++------------ AMSS_NCKU_source/fdderivs_c.C | 4 +- AMSS_NCKU_source/fderivs_c.C | 4 +- AMSS_NCKU_source/lopsided_kodis_c.C | 10 ++- AMSS_NCKU_source/makefile | 4 +- AMSS_NCKU_source/makefile.inc | 3 + makefile_and_run.py | 29 ++++++- setup.py | 18 ++-- 9 files changed, 137 insertions(+), 79 deletions(-) diff --git a/AMSS_NCKU_Input.py b/AMSS_NCKU_Input.py index fe25a50..4919de6 100755 --- a/AMSS_NCKU_Input.py +++ b/AMSS_NCKU_Input.py @@ -13,14 +13,15 @@ import numpy ## Setting MPI processes and the output file directory -File_directory = "GW150914" ## output file directory -Output_directory = "binary_output" ## binary data file directory - ## The file directory name should not be too long -MPI_processes = 64 ## number of mpi processes used in the simulation - -GPU_Calculation = "no" ## Use GPU or not - ## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface) -CPU_Part = 1.0 +File_directory = "GW150914" ## output file directory +Output_directory = "binary_output" ## binary data file directory + ## The file directory name should not be too long +MPI_processes = 8 ## number of mpi processes used in the simulation +OMP_Threads = 16 ## number of OpenMP threads used by each MPI process + +GPU_Calculation = "no" ## Use GPU or not + ## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface) +CPU_Part = 1.0 GPU_Part = 0.0 ################################################# @@ -50,7 +51,7 @@ Check_Time = 100.0 Dump_Time = 100.0 ## time inteval dT for dumping binary data D2_Dump_Time = 100.0 ## dump the ascii data for 2d surface after dT' Analysis_Time = 0.1 ## dump the puncture position and GW psi4 after dT" -Evolution_Step_Number = 10000000 ## stop the calculation after the maximal step number +Evolution_Step_Number = 10000000 ## stop the calculation after the maximal step number Courant_Factor = 0.5 ## Courant Factor Dissipation = 0.15 ## Kreiss-Oliger Dissipation Strength diff --git a/AMSS_NCKU_source/bssn_rhs_c.C b/AMSS_NCKU_source/bssn_rhs_c.C index 4354866..c72a8b5 100644 --- a/AMSS_NCKU_source/bssn_rhs_c.C +++ b/AMSS_NCKU_source/bssn_rhs_c.C @@ -2,6 +2,24 @@ #include "bssn_rhs.h" #include "share_func.h" #include "tool.h" + +#ifdef _OPENMP +#define BSSN_OMP_TASK_GROUP_BEGIN \ + _Pragma("omp parallel") \ + { \ + _Pragma("omp single nowait") \ + { +#define BSSN_OMP_TASK_CALL(...) \ + _Pragma("omp task") { __VA_ARGS__; } +#define BSSN_OMP_TASK_GROUP_END \ + _Pragma("omp taskwait") \ + } \ + } +#else +#define BSSN_OMP_TASK_GROUP_BEGIN { +#define BSSN_OMP_TASK_CALL(...) { __VA_ARGS__; } +#define BSSN_OMP_TASK_GROUP_END } +#endif // 0-based i,j,k // #define IDX_F(i,j,k,nx,ny) ((i) + (j)*(nx) + (k)*(nx)*(ny)) // ex(1)=nx, ex(2)=ny, ex(3)=nz @@ -108,18 +126,20 @@ int f_compute_rhs_bssn(int *ex, double &T, chin1[i] = chi[i] + 1.0; } // 9ms // - fderivs(ex,betax,betaxx,betaxy,betaxz,X,Y,Z,ANTI, SYM, SYM,Symmetry,Lev); - fderivs(ex,betay,betayx,betayy,betayz,X,Y,Z, SYM,ANTI, SYM,Symmetry,Lev); - fderivs(ex,betaz,betazx,betazy,betazz,X,Y,Z, SYM, SYM,ANTI,Symmetry,Lev); - fderivs(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev); - fderivs(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev); - fderivs(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev); - fderivs(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev); - fderivs(ex,dyy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev); - fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev); - fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev); - fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev); - fderivs(ex,trK,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev); + BSSN_OMP_TASK_GROUP_BEGIN + BSSN_OMP_TASK_CALL(fderivs(ex,betax,betaxx,betaxy,betaxz,X,Y,Z,ANTI, SYM, SYM,Symmetry,Lev)) + BSSN_OMP_TASK_CALL(fderivs(ex,betay,betayx,betayy,betayz,X,Y,Z, SYM,ANTI, SYM,Symmetry,Lev)) + BSSN_OMP_TASK_CALL(fderivs(ex,betaz,betazx,betazy,betazz,X,Y,Z, SYM, SYM,ANTI,Symmetry,Lev)) + BSSN_OMP_TASK_CALL(fderivs(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)) + BSSN_OMP_TASK_CALL(fderivs(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)) + BSSN_OMP_TASK_CALL(fderivs(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev)) + BSSN_OMP_TASK_CALL(fderivs(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev)) + BSSN_OMP_TASK_CALL(fderivs(ex,dyy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)) + BSSN_OMP_TASK_CALL(fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev)) + BSSN_OMP_TASK_CALL(fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)) + BSSN_OMP_TASK_CALL(fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)) + BSSN_OMP_TASK_CALL(fderivs(ex,trK,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)) + BSSN_OMP_TASK_GROUP_END // 3ms // for(int i=0;i 1) for(int i=0;i cap) { free(fh); diff --git a/AMSS_NCKU_source/fderivs_c.C b/AMSS_NCKU_source/fderivs_c.C index 90f10a1..9c32635 100644 --- a/AMSS_NCKU_source/fderivs_c.C +++ b/AMSS_NCKU_source/fderivs_c.C @@ -50,8 +50,8 @@ void fderivs(const int ex[3], const size_t ny = (size_t)ex2 + 2; const size_t nz = (size_t)ex3 + 2; const size_t fh_size = nx * ny * nz; - static double *fh = NULL; - static size_t cap = 0; + static thread_local double *fh = NULL; + static thread_local size_t cap = 0; if (fh_size > cap) { free(fh); diff --git a/AMSS_NCKU_source/lopsided_kodis_c.C b/AMSS_NCKU_source/lopsided_kodis_c.C index 970d0e7..4e20de7 100644 --- a/AMSS_NCKU_source/lopsided_kodis_c.C +++ b/AMSS_NCKU_source/lopsided_kodis_c.C @@ -43,7 +43,13 @@ void lopsided_kodis(const int ex[3], const size_t nz = (size_t)ex3 + 3; const size_t fh_size = nx * ny * nz; - double *fh = (double*)malloc(fh_size * sizeof(double)); + static thread_local double *fh = NULL; + static thread_local size_t cap = 0; + if (fh_size > cap) { + free(fh); + fh = (double*)aligned_alloc(64, fh_size * sizeof(double)); + cap = fh_size; + } if (!fh) return; symmetry_bd(3, ex, f, fh, SoA); @@ -243,6 +249,4 @@ void lopsided_kodis(const int ex[3], } } } - - free(fh); } diff --git a/AMSS_NCKU_source/makefile b/AMSS_NCKU_source/makefile index 72b9cbd..f39a4fd 100644 --- a/AMSS_NCKU_source/makefile +++ b/AMSS_NCKU_source/makefile @@ -16,7 +16,7 @@ PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata ifeq ($(PGO_MODE),instrument) ## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \ - -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) + -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) $(OPENMP_FLAGS) f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \ -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG) else @@ -26,7 +26,7 @@ else CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \ - -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) + -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) $(OPENMP_FLAGS) f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \ -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG) endif diff --git a/AMSS_NCKU_source/makefile.inc b/AMSS_NCKU_source/makefile.inc index 331cff1..cdbefb6 100755 --- a/AMSS_NCKU_source/makefile.inc +++ b/AMSS_NCKU_source/makefile.inc @@ -29,6 +29,9 @@ endif ## instrument : PGO Phase 1 instrumentation to collect fresh profile data PGO_MODE ?= opt +## OpenMP switch for C/C++ kernels +OPENMP_FLAGS ?= -qopenmp + ## Interp_Points load balance profiling mode ## off : (default) no load balance instrumentation ## profile : Pass 1 — instrument Interp_Points to collect timing profile diff --git a/makefile_and_run.py b/makefile_and_run.py index 5682476..8fc622e 100755 --- a/makefile_and_run.py +++ b/makefile_and_run.py @@ -9,6 +9,7 @@ import AMSS_NCKU_Input as input_data +import os import subprocess import time @@ -54,6 +55,14 @@ NUMACTL_CPU_BIND = get_last_n_cores_per_socket(n=32) BUILD_JOBS = 64 +def build_runtime_env(): + """Inject OpenMP runtime settings from the Python input file.""" + runtime_env = os.environ.copy() + omp_threads = max(1, int(getattr(input_data, "OMP_Threads", 1))) + runtime_env["OMP_NUM_THREADS"] = str(omp_threads) + return runtime_env + + ################################################################## @@ -143,6 +152,8 @@ def run_ABE(): print( ) print( " Running the AMSS-NCKU executable file ABE/ABEGPU " ) print( ) + print( f" MPI processes = {input_data.MPI_processes}, OMP threads per process = {max(1, int(getattr(input_data, 'OMP_Threads', 1)))}" ) + print( ) ## Define the command to run; cast other values to strings as needed @@ -155,7 +166,14 @@ def run_ABE(): mpi_command_outfile = "ABEGPU_out.log" ## Execute the MPI command and stream output - mpi_process = subprocess.Popen(mpi_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) + mpi_process = subprocess.Popen( + mpi_command, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + env=build_runtime_env(), + ) ## Write ABE run output to file while printing to stdout with open(mpi_command_outfile, 'w') as file0: @@ -195,7 +213,14 @@ def run_TwoPunctureABE(): TwoPuncture_command_outfile = "TwoPunctureABE_out.log" ## Execute the command with subprocess.Popen and stream output - TwoPuncture_process = subprocess.Popen(TwoPuncture_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) + TwoPuncture_process = subprocess.Popen( + TwoPuncture_command, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + env=build_runtime_env(), + ) ## Write TwoPunctureABE run output to file while printing to stdout with open(TwoPuncture_command_outfile, 'w') as file0: diff --git a/setup.py b/setup.py index d418031..a7ee65f 100755 --- a/setup.py +++ b/setup.py @@ -65,10 +65,11 @@ def print_input_data( File_directory ): print( "------------------------------------------------------------------------------------------" ) print( ) - print( " Printing the basic parameter and setting in the AMSS-NCKU simulation " ) - print( ) - print( " The number of MPI processes in the AMSS-NCKU simulation = ", input_data.MPI_processes ) - print( ) + print( " Printing the basic parameter and setting in the AMSS-NCKU simulation " ) + print( ) + print( " The number of MPI processes in the AMSS-NCKU simulation = ", input_data.MPI_processes ) + print( " The number of OMP threads per MPI process = ", input_data.OMP_Threads ) + print( ) print( " The form of computational equation = ", input_data.Equation_Class ) print( " The initial data in this simulation = ", input_data.Initial_Data_Method ) print( ) @@ -140,10 +141,11 @@ def print_input_data( File_directory ): file0 = open(filepath, 'w') print( file=file0 ) - print( " Printing the basic parameter and setting in the AMSS-NCKU simulation ", file=file0 ) - print( file=file0 ) - print( " The number of MPI processes in the AMSS-NCKU simulation = ", input_data.MPI_processes, file=file0 ) - print( file=file0 ) + print( " Printing the basic parameter and setting in the AMSS-NCKU simulation ", file=file0 ) + print( file=file0 ) + print( " The number of MPI processes in the AMSS-NCKU simulation = ", input_data.MPI_processes, file=file0 ) + print( " The number of OMP threads per MPI process = ", input_data.OMP_Threads, file=file0 ) + print( file=file0 ) print( " The form of computational equation = ", input_data.Equation_Class, file=file0 ) print( " The initial data in this simulation = ", input_data.Initial_Data_Method, file=file0 ) print( file=file0 )