Performance optimization for the TwoPunctures module

* Re-enabled OpenMP.

1. Batch spectral derivatives (Chebyshev & Fourier) via precomputed matrices: the Chebyshev/Fourier transforms and derivatives are precomputed as explicit physical-space operator matrices. A batched DGEMM is now applied to entire tensor fields; this is mathematically identical to the original per-line transforms but vastly faster.
2. Gauss-Seidel relaxation & tridiagonal solver workspace reuse: per-thread reusable workspaces replace the per-call heap new/delete in all tridiagonal and relaxation routines.
3. Efficient OpenMP multithreading throughout the relaxation/derivative routines: relax_omp and related routines parallelize over grouped lines/planes, maximizing threading efficiency and memory independence.

Co-authored-by: copilot-swe-agent[bot] <198982749+copilot@users.noreply.github.com>
2026-02-07 14:46:46 +08:00
parent f5ed23d687
commit f345b0e520
3 changed files with 917 additions and 215 deletions
--- a/AMSS_NCKU_source/TwoPunctures.C
+++ b/AMSS_NCKU_source/TwoPunctures.C
--- a/AMSS_NCKU_source/TwoPunctures.h
+++ b/AMSS_NCKU_source/TwoPunctures.h
@@ -1,7 +1,8 @@
 #ifndef TWO_PUNCTURES_H
 #define TWO_PUNCTURES_H
 #include <omp.h>
 #define StencilSize 19
 #define N_PlaneRelax 1
 #define NRELAX 200
@@ -42,6 +43,18 @@ private:
       int ntotal;
       // ===== Precomputed spectral derivative matrices =====
       double *D1_A, *D2_A;
       double *D1_B, *D2_B;
       double *DF1_phi, *DF2_phi;
       // ===== Pre-allocated workspace for LineRelax (per-thread) =====
       int max_threads;
       double **ws_diag_be, **ws_e_be, **ws_f_be, **ws_b_be, **ws_x_be;
       double **ws_l_be, **ws_u_be, **ws_d_be, **ws_y_be;
       double **ws_diag_al, **ws_e_al, **ws_f_al, **ws_b_al, **ws_x_al;
       double **ws_l_al, **ws_u_al, **ws_d_al, **ws_y_al;
       struct parameters
       {
              int nvar, n1, n2, n3;
@@ -58,6 +71,28 @@ public:
                    int Newtonmaxit);
       ~TwoPunctures();
       // 02/07: New/modified methods
       void allocate_workspace();
       void free_workspace();
       void precompute_derivative_matrices();
       void build_cheb_deriv_matrices(int n, double *D1, double *D2);
       void build_fourier_deriv_matrices(int N, double *DF1, double *DF2);
       void Derivatives_AB3_MatMul(int nvar, int n1, int n2, int n3, derivs v);
       void ThomasAlgorithm_ws(int N, double *b, double *a, double *c, double *x, double *q,
                                double *l, double *u_ws, double *d, double *y);
       void LineRelax_be_omp(double *dv,
                         int const i, int const k, int const nvar,
                         int const n1, int const n2, int const n3,
                         double const *rhs, int const *ncols, int **cols,
                         double **JFD, int tid);
       void LineRelax_al_omp(double *dv,
                         int const j, int const k, int const nvar,
                         int const n1, int const n2, int const n3,
                         double const *rhs, int const *ncols,
                         int **cols, double **JFD, int tid);
       void relax_omp(double *dv, int const nvar, int const n1, int const n2, int const n3,
                  double const *rhs, int const *ncols, int **cols, double **JFD);
       void Solve();
       void set_initial_guess(derivs v);
       int index(int i, int j, int k, int l, int a, int b, int c, int d);
@@ -116,23 +151,11 @@ public:
       double BY_KKofxyz(double x, double y, double z);
       void SetMatrix_JFD(int nvar, int n1, int n2, int n3, derivs u, int *ncols, int **cols, double **Matrix);
       void J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, double *Jdv, derivs u);
       void relax(double *dv, int const nvar, int const n1, int const n2, int const n3,
                  double const *rhs, int const *ncols, int **cols, double **JFD);
       void LineRelax_be(double *dv,
                         int const i, int const k, int const nvar,
                         int const n1, int const n2, int const n3,
                         double const *rhs, int const *ncols, int **cols,
                         double **JFD);
       void JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2,
                         int n3, derivs dv, derivs u, double *values);
       void LinEquations(double A, double B, double X, double R,
                         double x, double r, double phi,
                         double y, double z, derivs dU, derivs U, double *values);
       void LineRelax_al(double *dv,
                         int const j, int const k, int const nvar,
                         int const n1, int const n2, int const n3,
                         double const *rhs, int const *ncols,
                         int **cols, double **JFD);
       void ThomasAlgorithm(int N, double *b, double *a, double *c, double *x, double *q);
       void Save(char *fname);
       // provided by Vasileios Paschalidis (vpaschal@illinois.edu)
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -15,10 +15,9 @@ LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore
 ## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible)
 ## -fp-model fast=2: Aggressive floating-point optimizations
 ## -fma: Enable fused multiply-add instructions
-## Note: OpenMP has been disabled (-qopenmp removed) due to performance issues
+CXXAPPFLAGS  = -O3 -xHost -fp-model fast=2 -fma -ipo -qopenmp \
-CXXAPPFLAGS  = -O3 -xHost -fp-model fast=2 -fma -ipo \
               -Dfortran3 -Dnewc -I${MKLROOT}/include
-f90appflags  = -O3 -xHost -fp-model fast=2 -fma -ipo \
+f90appflags  = -O3 -xHost -fp-model fast=2 -fma -ipo -qopenmp \
               -align array64byte -fpp -I${MKLROOT}/include
 f90          = ifx
 f77          = ifx