Switch legacy build to GCC and OpenMPI

2026-04-13 19:39:30 +08:00
parent 9c31384b2f
commit 3f3f16e881
7 changed files with 224 additions and 186 deletions
--- a/AMSS_NCKU_source/FFT.f90
+++ b/AMSS_NCKU_source/FFT.f90
@@ -37,51 +37,56 @@ close(77)
 end program checkFFT
 #endif
 !-------------
 ! Optimized FFT using Intel oneMKL DFTI
 ! Mathematical equivalence: Standard DFT definition
 !   Forward (isign=1):  X[k] = sum_{n=0}^{N-1} x[n] * exp(-2*pi*i*k*n/N)
 !   Backward (isign=-1): X[k] = sum_{n=0}^{N-1} x[n] * exp(+2*pi*i*k*n/N)
 ! Input/Output: dataa is interleaved complex array [Re(0),Im(0),Re(1),Im(1),...]
 !-------------
 SUBROUTINE four1(dataa,nn,isign)
 use MKL_DFTI
 implicit none
-INTEGER, intent(in) :: isign, nn
+INTEGER::isign,nn
-DOUBLE PRECISION, dimension(2*nn), intent(inout) :: dataa
+double precision,dimension(2*nn)::dataa
-
+INTEGER::i,istep,j,m,mmax,n
-type(DFTI_DESCRIPTOR), pointer :: desc
+double precision::tempi,tempr
-integer :: status
+DOUBLE PRECISION::theta,wi,wpi,wpr,wr,wtemp
-
+n=2*nn
-! Create DFTI descriptor for 1D complex-to-complex transform
+j=1
-status = DftiCreateDescriptor(desc, DFTI_DOUBLE, DFTI_COMPLEX, 1, nn)
+do i=1,n,2
-if (status /= 0) return
+  if(j.gt.i)then
-
+     tempr=dataa(j)
-! Set input/output storage as interleaved complex (default)
+     tempi=dataa(j+1)
-status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_INPLACE)
+     dataa(j)=dataa(i)
-if (status /= 0) then
+     dataa(j+1)=dataa(i+1)
-   status = DftiFreeDescriptor(desc)
+     dataa(i)=tempr
-   return
+     dataa(i+1)=tempi
  endif
-
+  m=nn
-! Commit the descriptor
+1 if ((m.ge.2).and.(j.gt.m)) then
-status = DftiCommitDescriptor(desc)
+  j=j-m
-if (status /= 0) then
+  m=m/2
-   status = DftiFreeDescriptor(desc)
+goto 1
   return
  endif
-
+j=j+m
-! Execute FFT based on direction
+enddo
-if (isign == 1) then
+mmax=2
-   ! Forward FFT: exp(-2*pi*i*k*n/N)
+2  if (n.gt.mmax) then
-   status = DftiComputeForward(desc, dataa)
+     istep=2*mmax
-else
+     theta=6.28318530717959d0/(isign*mmax)
-   ! Backward FFT: exp(+2*pi*i*k*n/N)
+     wpr=-2.d0*sin(0.5d0*theta)**2
-   status = DftiComputeBackward(desc, dataa)
+     wpi=sin(theta)
     wr=1.d0
     wi=0.d0
     do m=1,mmax,2
       do i=m,n,istep
         j=i+mmax
         tempr=sngl(wr)*dataa(j)-sngl(wi)*dataa(j+1)
         tempi=sngl(wr)*dataa(j+1)+sngl(wi)*dataa(j)
         dataa(j)=dataa(i)-tempr
         dataa(j+1)=dataa(i+1)-tempi
         dataa(i)=dataa(i)+tempr
         dataa(i+1)=dataa(i+1)+tempi
       enddo
          wtemp=wr
          wr=wr*wpr-wi*wpi+wr
          wi=wi*wpr+wtemp*wpi+wi
     enddo
 mmax=istep
 goto 2
 endif
 ! Free descriptor
 status = DftiFreeDescriptor(desc)
 return
 END SUBROUTINE four1
--- a/AMSS_NCKU_source/TwoPunctures.C
+++ b/AMSS_NCKU_source/TwoPunctures.C
@@ -27,7 +27,21 @@ using namespace std;
 #endif
 #include "TwoPunctures.h"
-#include <mkl_cblas.h>
+
 // Hand-written declarations of the CBLAS routines referenced by this file.
 // They stand in for the vendor header <mkl_cblas.h>, which is no longer
 // included, so the symbols are resolved by the BLAS library given at link time.
 extern "C" {
 // Double-precision dot product of two strided vectors.
 double cblas_ddot(const int, const double *, const int, const double *, const int);
 // Double-precision Euclidean norm of a strided vector.
 double cblas_dnrm2(const int, const double *, const int);
 // Double-precision general matrix-matrix multiply.
 // NOTE(review): the first three arguments (layout, transA, transB) are
 // declared as plain int here; the reference cblas.h declares them as enum
 // types, so this relies on those enums being int-sized -- confirm for the
 // BLAS library in use.
 void cblas_dgemm(const int, const int, const int,
                  const int, const int, const int,
                  const double, const double *, const int,
                  const double *, const int, const double,
                  double *, const int);
 }
 // Local copies of the two CBLAS constants needed here, presumably passed as
 // the layout/transpose arguments of cblas_dgemm.
 // NOTE(review): 101 and 111 are the values used by the reference cblas.h for
 // CblasRowMajor and CblasNoTrans -- confirm against the linked library's header.
 enum {
  CblasRowMajor = 101,
  CblasNoTrans = 111
 };
 TwoPunctures::TwoPunctures(double mp, double mm, double b,
                           double P_plusx, double P_plusy, double P_plusz,
--- a/AMSS_NCKU_source/gaussj.C
+++ b/AMSS_NCKU_source/gaussj.C
@@ -17,65 +17,103 @@ using namespace std;
 #include <math.h>
 #endif
-// Intel oneMKL LAPACK interface
+/* Linear equation solution by Gauss-Jordan elimination.
 #include <mkl_lapacke.h>
 /* Linear equation solution using Intel oneMKL LAPACK.
 a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input
 containing the right-hand side vectors. On output a is
 replaced by its matrix inverse, and b is replaced by the
-corresponding set of solution vectors.
+corresponding set of solution vectors. */
 Mathematical equivalence:
  Solves: A * x = b  =>  x = A^(-1) * b
  Original Gauss-Jordan and LAPACK dgesv/dgetri produce identical results
  within numerical precision. */
 int gaussj(double *a, double *b, int n)
 {
-  // Allocate pivot array and workspace
+  double swap;
  lapack_int *ipiv = new lapack_int[n];
  lapack_int info;
-  // Make a copy of matrix a for solving (dgesv modifies it to LU form)
+  int *indxc, *indxr, *ipiv;
-  double *a_copy = new double[n * n];
+  indxc = new int[n];
-  for (int i = 0; i < n * n; i++) {
+  indxr = new int[n];
-    a_copy[i] = a[i];
+  ipiv = new int[n];
  int i, icol, irow, j, k, l, ll;
  double big, dum, pivinv;
  for (j = 0; j < n; j++)
    ipiv[j] = 0;
  for (i = 0; i < n; i++)
  {
    big = 0.0;
    for (j = 0; j < n; j++)
      if (ipiv[j] != 1)
        for (k = 0; k < n; k++)
        {
          if (ipiv[k] == 0)
          {
            if (fabs(a[j * n + k]) >= big)
            {
              big = fabs(a[j * n + k]);
              irow = j;
              icol = k;
            }
          }
          else if (ipiv[k] > 1)
          {
            cout << "gaussj: Singular Matrix-1" << endl;
            return 1;
          }
        }
-  // Step 1: Solve linear system A*x = b using LU decomposition
+    ipiv[icol] = ipiv[icol] + 1;
-  // LAPACKE_dgesv uses column-major by default, but we use row-major
+    if (irow != icol)
-  info = LAPACKE_dgesv(LAPACK_ROW_MAJOR, n, 1, a_copy, n, ipiv, b, 1);
+    {
      for (l = 0; l < n; l++)
      {
        swap = a[irow * n + l];
        a[irow * n + l] = a[icol * n + l];
        a[icol * n + l] = swap;
      }
-  if (info != 0) {
+      swap = b[irow];
-    cout << "gaussj: Singular Matrix (dgesv info=" << info << ")" << endl;
+      b[irow] = b[icol];
-    delete[] ipiv;
+      b[icol] = swap;
-    delete[] a_copy;
+    }
    indxr[i] = irow;
    indxc[i] = icol;
    if (a[icol * n + icol] == 0.0)
    {
      cout << "gaussj: Singular Matrix-2" << endl;
      return 1;
    }
-  // Step 2: Compute matrix inverse A^(-1) using LU factorization
+    pivinv = 1.0 / a[icol * n + icol];
-  // First do LU factorization of original matrix a
+    a[icol * n + icol] = 1.0;
-  info = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, n, n, a, n, ipiv);
+    for (l = 0; l < n; l++)
-
+      a[icol * n + l] *= pivinv;
-  if (info != 0) {
+    b[icol] *= pivinv;
-    cout << "gaussj: Singular Matrix (dgetrf info=" << info << ")" << endl;
+    for (ll = 0; ll < n; ll++)
-    delete[] ipiv;
+      if (ll != icol)
-    delete[] a_copy;
+      {
-    return 1;
+        dum = a[ll * n + icol];
        a[ll * n + icol] = 0.0;
        for (l = 0; l < n; l++)
          a[ll * n + l] -= a[icol * n + l] * dum;
        b[ll] -= b[icol] * dum;
      }
  }
-  // Then compute inverse from LU factorization
+  for (l = n - 1; l >= 0; l--)
-  info = LAPACKE_dgetri(LAPACK_ROW_MAJOR, n, a, n, ipiv);
+  {
-
+    if (indxr[l] != indxc[l])
-  if (info != 0) {
+      for (k = 0; k < n; k++)
-    cout << "gaussj: Singular Matrix (dgetri info=" << info << ")" << endl;
+      {
-    delete[] ipiv;
+        swap = a[k * n + indxr[l]];
-    delete[] a_copy;
+        a[k * n + indxr[l]] = a[k * n + indxc[l]];
-    return 1;
+        a[k * n + indxc[l]] = swap;
      }
  }
  delete[] indxc;
  delete[] indxr;
  delete[] ipiv;
  delete[] a_copy;
  return 0;
 }
--- a/AMSS_NCKU_source/makefile
+++ b/AMSS_NCKU_source/makefile
@@ -8,27 +8,16 @@ include makefile.inc
 POLINT6_USE_BARY ?= 1
 POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
-## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
+## Legacy GNU/OpenMPI flags
-##   make                        -> opt  (PGO-guided, maximum performance)
+CXXBASEFLAGS = -O3 -march=native -Wno-deprecated -Dfortran3 -Dnewc $(INTERP_LB_FLAGS)
-##   make PGO_MODE=instrument    -> instrument (Phase 1: collect fresh profile data)
+F90BASEFLAGS = -O3 -march=native -cpp -fallow-argument-mismatch $(POLINT6_FLAG)
 PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
 ifeq ($(PGO_MODE),instrument)
-## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability
+CXXAPPFLAGS = $(CXXBASEFLAGS)
-CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
+f90appflags = $(F90BASEFLAGS)
              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
 f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
              -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
 else
-## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \
+CXXAPPFLAGS = $(CXXBASEFLAGS)
-## PGO has been turned off, now tested and found to be negative optimization
+f90appflags = $(F90BASEFLAGS)
 ## INTERP_LB_FLAGS has been turned off too, now tested and found to be negative optimization
 CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
 f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
              -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
 endif
 .SUFFIXES: .o .f90 .C .for .cu
@@ -68,16 +57,13 @@ lopsided_kodis_c.o: lopsided_kodis_c.C
 #	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 ## TwoPunctureABE is built with the shared base flags plus $(TP_OPENMP_FLAGS); it no longer uses a PGO profile
-TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
+TP_OPTFLAGS = $(CXXBASEFLAGS) $(TP_OPENMP_FLAGS)
 TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
              -fprofile-instr-use=$(TP_PROFDATA) \
              -Dfortran3 -Dnewc -I${MKLROOT}/include
 TwoPunctures.o: TwoPunctures.C
-	${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
+	${CXX} $(TP_OPTFLAGS) -c $< -o $@
 TwoPunctureABE.o: TwoPunctureABE.C
-	${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
+	${CXX} $(TP_OPTFLAGS) -c $< -o $@
 # Input files
@@ -185,7 +171,7 @@ ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILE
 	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
 TwoPunctureABE: $(TwoPunctureFILES)
-	$(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)
+	$(CLINKER) $(TP_OPTFLAGS) -o $@ $(TwoPunctureFILES) $(LDLIBS)
 clean:
 	rm *.o ABE ABEGPU TwoPunctureABE make.log -f
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -1,33 +1,27 @@
-## GCC version (commented out)
+## Legacy GNU/OpenMPI toolchain configuration
 ## filein  = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
 ## filein  = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
 ## LDLIBS  = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
-## Intel oneAPI version with oneMKL (Optimized for performance)
+## OpenMPI wrappers are installed but may not be on PATH.
-filein  = -I/usr/include/ -I${MKLROOT}/include
+OMPI_BIN ?= /usr/lib64/openmpi/bin
-## Using sequential MKL (OpenMP disabled for better single-threaded performance)
+## Wrapper compilers
-## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
+f90          = $(OMPI_BIN)/mpifort
-LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
+f77          = $(OMPI_BIN)/mpifort
 CXX          = $(OMPI_BIN)/mpicxx
 CC           = $(OMPI_BIN)/mpicc
 CLINKER      = $(OMPI_BIN)/mpicxx
-## Memory allocator switch
+## Extra include flags are not needed when using the OpenMPI wrappers.
-##   1 (default) : link Intel oneTBB allocator (libtbbmalloc)
+filein       =
 ##   0           : use system default allocator (ptmalloc)
 USE_TBBMALLOC ?= 1
 TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
 ifneq ($(wildcard $(TBBMALLOC_SO)),)
 TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
 else
 TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
 endif
 ifeq ($(USE_TBBMALLOC),1)
 LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
 endif
-## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
+## BLAS/LAPACK backend:
-##   opt        : (default) maximum performance with PGO profile-guided optimization
+## OpenBLAS on this system provides BLAS, CBLAS and LAPACK symbols.
-##   instrument : PGO Phase 1 instrumentation to collect fresh profile data
+BLAS_LAPACK_LIB ?= /lib64/libopenblaso.so.0
-PGO_MODE ?= opt
+LDLIBS  = $(BLAS_LAPACK_LIB) -lgfortran -lpthread -lm -ldl
 ## PGO build mode switch
 ##   off        : default legacy GNU build without PGO
 ##   instrument : accepted for compatibility, currently same as off
 PGO_MODE ?= off
 ## Interp_Points load balance profiling mode
 ##   off        : (default) no load balance instrumentation
@@ -49,17 +43,13 @@ endif
 USE_CXX_KERNELS ?= 1
 ## RK4 kernel implementation switch
-##   1 (default) : use C/C++ rewrite of rungekutta4_rout (for optimization experiments)
+##   1 (default) : use C/C++ rewrite of rungekutta4_rout
 ##   0           : use original Fortran rungekutta4_rout.o
 USE_CXX_RK4 ?= 1
-f90          = ifx
+## OpenMP is only used for TwoPunctures on the legacy toolchain.
-f77          = ifx
+TP_OPENMP_FLAGS ?= -fopenmp
 CXX          = icpx
 CC           = icx
 CLINKER      = mpiicpx
 Cu = nvcc
 CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
 #CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
 CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc
--- a/README.md
+++ b/README.md
@@ -97,7 +97,9 @@ Here, we take the Ubuntu 22.04 system as an example
    Modify the makefile.inc file in the AMSS_NCKU_source directory and change the settings according to your computer.
-    The settings for the Ubuntu 22.04 system do not need to be modified.
+    The default configuration in this branch uses GNU compilers through the OpenMPI wrappers under `/usr/lib64/openmpi/bin`.
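    If your OpenMPI installation is in another location, update `OMPI_BIN` in `AMSS_NCKU_source/makefile.inc` (the directory of the compiler wrappers used for the build) and export `AMSS_OPENMPI_BIN` (the directory in which the Python launcher looks for `mpirun`) before running the Python launcher.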
 1.  Enter the AMSS-NCKU Python code folder and modify the input.
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -9,6 +9,7 @@
 import AMSS_NCKU_Input as input_data
 import os
 import subprocess
 import time
@@ -52,6 +53,8 @@ NUMACTL_CPU_BIND = get_last_n_cores_per_socket(n=32)
 ## Build parallelism: match the number of bound cores
 BUILD_JOBS = 64
 OPENMPI_BIN = os.environ.get("AMSS_OPENMPI_BIN", "/usr/lib64/openmpi/bin")
 MPI_RUNNER = os.path.join(OPENMPI_BIN, "mpirun")
 ##################################################################
@@ -147,11 +150,11 @@ def run_ABE():
    ## Define the command to run; cast other values to strings as needed
    if (input_data.GPU_Calculation == "no"):
-        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
+        mpi_command         = NUMACTL_CPU_BIND + " " + MPI_RUNNER + " -np " + str(input_data.MPI_processes) + " ./ABE"
        #mpi_command         = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
        mpi_command_outfile = "ABE_out.log"
    elif (input_data.GPU_Calculation == "yes"):
-        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
+        mpi_command         = NUMACTL_CPU_BIND + " " + MPI_RUNNER + " -np " + str(input_data.MPI_processes) + " ./ABEGPU"
        mpi_command_outfile = "ABEGPU_out.log"
    ## Execute the MPI command and stream output