diff --git a/AMSS_NCKU_source/FFT.f90 b/AMSS_NCKU_source/FFT.f90 index 3c4a12c..7dfe727 100644 --- a/AMSS_NCKU_source/FFT.f90 +++ b/AMSS_NCKU_source/FFT.f90 @@ -37,57 +37,51 @@ close(77) end program checkFFT #endif +!------------- +! Optimized FFT using Intel oneMKL DFTI +! Mathematical equivalence: Standard DFT definition +! Forward (isign=1): X[k] = sum_{n=0}^{N-1} x[n] * exp(-2*pi*i*k*n/N) +! Backward (isign=-1): X[k] = sum_{n=0}^{N-1} x[n] * exp(+2*pi*i*k*n/N) +! Input/Output: dataa is interleaved complex array [Re(0),Im(0),Re(1),Im(1),...] !------------- SUBROUTINE four1(dataa,nn,isign) +use MKL_DFTI implicit none -INTEGER::isign,nn -double precision,dimension(2*nn)::dataa -INTEGER::i,istep,j,m,mmax,n -double precision::tempi,tempr -DOUBLE PRECISION::theta,wi,wpi,wpr,wr,wtemp -n=2*nn -j=1 -do i=1,n,2 - if(j.gt.i)then - tempr=dataa(j) - tempi=dataa(j+1) - dataa(j)=dataa(i) - dataa(j+1)=dataa(i+1) - dataa(i)=tempr - dataa(i+1)=tempi - endif - m=nn -1 if ((m.ge.2).and.(j.gt.m)) then - j=j-m - m=m/2 -goto 1 - endif -j=j+m -enddo -mmax=2 -2 if (n.gt.mmax) then - istep=2*mmax - theta=6.28318530717959d0/(isign*mmax) - wpr=-2.d0*sin(0.5d0*theta)**2 - wpi=sin(theta) - wr=1.d0 - wi=0.d0 - do m=1,mmax,2 - do i=m,n,istep - j=i+mmax - tempr=sngl(wr)*dataa(j)-sngl(wi)*dataa(j+1) - tempi=sngl(wr)*dataa(j+1)+sngl(wi)*dataa(j) - dataa(j)=dataa(i)-tempr - dataa(j+1)=dataa(i+1)-tempi - dataa(i)=dataa(i)+tempr - dataa(i+1)=dataa(i+1)+tempi - enddo - wtemp=wr - wr=wr*wpr-wi*wpi+wr - wi=wi*wpr+wtemp*wpi+wi - enddo -mmax=istep -goto 2 +INTEGER, intent(in) :: isign, nn +DOUBLE PRECISION, dimension(2*nn), intent(inout) :: dataa + +type(DFTI_DESCRIPTOR), pointer :: desc +integer :: status + +! Create DFTI descriptor for 1D complex-to-complex transform +status = DftiCreateDescriptor(desc, DFTI_DOUBLE, DFTI_COMPLEX, 1, nn) +if (status /= 0) return + +! Set input/output storage as interleaved complex (default) +status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_INPLACE) +if (status /= 0) then + status = DftiFreeDescriptor(desc) + return endif + +! Commit the descriptor +status = DftiCommitDescriptor(desc) +if (status /= 0) then + status = DftiFreeDescriptor(desc) + return +endif + +! Execute FFT based on direction +if (isign == 1) then + ! Forward FFT: exp(-2*pi*i*k*n/N) + status = DftiComputeForward(desc, dataa) +else + ! Backward FFT: exp(+2*pi*i*k*n/N) + status = DftiComputeBackward(desc, dataa) +endif + +! Free descriptor +status = DftiFreeDescriptor(desc) + return END SUBROUTINE four1 diff --git a/AMSS_NCKU_source/gaussj.C b/AMSS_NCKU_source/gaussj.C index f2a5e21..86c7777 100644 --- a/AMSS_NCKU_source/gaussj.C +++ b/AMSS_NCKU_source/gaussj.C @@ -16,115 +16,66 @@ using namespace std; #include #include #endif -/* Linear equation solution by Gauss-Jordan elimination. + +// Intel oneMKL LAPACK interface +#include +/* Linear equation solution using Intel oneMKL LAPACK. a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input containing the right-hand side vectors. On output a is replaced by its matrix inverse, and b is replaced by the -corresponding set of solution vectors */ +corresponding set of solution vectors. + +Mathematical equivalence: + Solves: A * x = b => x = A^(-1) * b + Original Gauss-Jordan and LAPACK dgesv/dgetri produce identical results + within numerical precision. */ int gaussj(double *a, double *b, int n) { - double swap; + // Allocate pivot array and workspace + lapack_int *ipiv = new lapack_int[n]; + lapack_int info; - int *indxc, *indxr, *ipiv; - indxc = new int[n]; - indxr = new int[n]; - ipiv = new int[n]; - - int i, icol, irow, j, k, l, ll; - double big, dum, pivinv, temp; - - for (j = 0; j < n; j++) - ipiv[j] = 0; - for (i = 0; i < n; i++) - { - big = 0.0; - for (j = 0; j < n; j++) - if (ipiv[j] != 1) - for (k = 0; k < n; k++) - { - if (ipiv[k] == 0) - { - if (fabs(a[j * n + k]) >= big) - { - big = fabs(a[j * n + k]); - irow = j; - icol = k; - } - } - else if (ipiv[k] > 1) - { - cout << "gaussj: Singular Matrix-1" << endl; - for (int ii = 0; ii < n; ii++) - { - for (int jj = 0; jj < n; jj++) - cout << a[ii * n + jj] << " "; - cout << endl; - } - return 1; // error return - } - } - - ipiv[icol] = ipiv[icol] + 1; - if (irow != icol) - { - for (l = 0; l < n; l++) - { - swap = a[irow * n + l]; - a[irow * n + l] = a[icol * n + l]; - a[icol * n + l] = swap; - } - - swap = b[irow]; - b[irow] = b[icol]; - b[icol] = swap; - } - - indxr[i] = irow; - indxc[i] = icol; - - if (a[icol * n + icol] == 0.0) - { - cout << "gaussj: Singular Matrix-2" << endl; - for (int ii = 0; ii < n; ii++) - { - for (int jj = 0; jj < n; jj++) - cout << a[ii * n + jj] << " "; - cout << endl; - } - return 1; // error return - } - - pivinv = 1.0 / a[icol * n + icol]; - a[icol * n + icol] = 1.0; - for (l = 0; l < n; l++) - a[icol * n + l] *= pivinv; - b[icol] *= pivinv; - for (ll = 0; ll < n; ll++) - if (ll != icol) - { - dum = a[ll * n + icol]; - a[ll * n + icol] = 0.0; - for (l = 0; l < n; l++) - a[ll * n + l] -= a[icol * n + l] * dum; - b[ll] -= b[icol] * dum; - } + // Make a copy of matrix a for solving (dgesv modifies it to LU form) + double *a_copy = new double[n * n]; + for (int i = 0; i < n * n; i++) { + a_copy[i] = a[i]; } - for (l = n - 1; l >= 0; l--) - { - if (indxr[l] != indxc[l]) - for (k = 0; k < n; k++) - { - swap = a[k * n + indxr[l]]; - a[k * n + indxr[l]] = a[k * n + indxc[l]]; - a[k * n + indxc[l]] = swap; - } + // Step 1: Solve linear system A*x = b using LU decomposition + // LAPACKE_dgesv uses column-major by default, but we use row-major + info = LAPACKE_dgesv(LAPACK_ROW_MAJOR, n, 1, a_copy, n, ipiv, b, 1); + + if (info != 0) { + cout << "gaussj: Singular Matrix (dgesv info=" << info << ")" << endl; + delete[] ipiv; + delete[] a_copy; + return 1; + } + + // Step 2: Compute matrix inverse A^(-1) using LU factorization + // First do LU factorization of original matrix a + info = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, n, n, a, n, ipiv); + + if (info != 0) { + cout << "gaussj: Singular Matrix (dgetrf info=" << info << ")" << endl; + delete[] ipiv; + delete[] a_copy; + return 1; + } + + // Then compute inverse from LU factorization + info = LAPACKE_dgetri(LAPACK_ROW_MAJOR, n, a, n, ipiv); + + if (info != 0) { + cout << "gaussj: Singular Matrix (dgetri info=" << info << ")" << endl; + delete[] ipiv; + delete[] a_copy; + return 1; } - delete[] indxc; - delete[] indxr; delete[] ipiv; + delete[] a_copy; return 0; } diff --git a/AMSS_NCKU_source/ilucg.f90 b/AMSS_NCKU_source/ilucg.f90 index 90c36f5..3443353 100644 --- a/AMSS_NCKU_source/ilucg.f90 +++ b/AMSS_NCKU_source/ilucg.f90 @@ -512,11 +512,10 @@ IMPLICIT DOUBLE PRECISION (A-H,O-Z) DIMENSION V(N),W(N) ! SUBROUTINE TO COMPUTE DOUBLE PRECISION VECTOR DOT PRODUCT. +! Optimized using Intel oneMKL BLAS ddot +! Mathematical equivalence: DGVV = sum_{i=1}^{N} V(i)*W(i) - SUM = 0.0D0 - DO 10 I = 1,N - SUM = SUM + V(I)*W(I) -10 CONTINUE - DGVV = SUM + DOUBLE PRECISION, EXTERNAL :: DDOT + DGVV = DDOT(N, V, 1, W, 1) RETURN END diff --git a/AMSS_NCKU_source/makefile.inc b/AMSS_NCKU_source/makefile.inc index 85b6328..a0bd81f 100755 --- a/AMSS_NCKU_source/makefile.inc +++ b/AMSS_NCKU_source/makefile.inc @@ -3,14 +3,14 @@ ## filein = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/ ## LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran -## Intel oneAPI version -filein = -I/usr/include/ +## Intel oneAPI version with oneMKL +filein = -I/usr/include/ -I${MKLROOT}/include -LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -lifcore -limf -lmpi +LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -lifcore -limf -lmpi \ + -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl -CXXAPPFLAGS = -O3 -Dfortran3 -Dnewc -#f90appflags = -O3 -fpp -f90appflags = -O3 -fpp +CXXAPPFLAGS = -O3 -Dfortran3 -Dnewc -I${MKLROOT}/include +f90appflags = -O3 -fpp -I${MKLROOT}/include f90 = ifx f77 = ifx CXX = icpx