fix: try to fix segfault at 240 steps by adding WithShell guard for writecheck_sh call

fix(build): update makefile.inc for debian 13
feat: port GPU code to CUDA 13 and enable GPU computation
2026-01-22 14:26:41 +08:00 · 2026-01-21 09:29:35 +08:00 · 2026-01-13 18:15:49 +00:00 · 2026-01-14 01:41:31 +08:00 · 2026-01-13 23:52:44 +08:00 · 2026-01-13 23:45:49 +08:00
17 changed files with 293 additions and 602 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,3 @@
 __pycache__
 GW150914
 GW150914-origin
 docs
 *.tmp
--- a/AMSS_NCKU_Input.py
+++ b/AMSS_NCKU_Input.py
@@ -16,12 +16,12 @@ import numpy
 File_directory   = "GW150914"                    ## output file directory
 Output_directory = "binary_output"               ## binary data file directory
                                                 ## The file directory name should not be too long
-MPI_processes    = 48                             ## number of mpi processes used in the simulation
+MPI_processes    = 96                             ## number of mpi processes used in the simulation
-GPU_Calculation  = "no"                          ## Use GPU or not 
+GPU_Calculation  = "yes"                         ## Use GPU or not
-                                                 ## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
+                                                 ## GPU support has been updated for CUDA 13
-CPU_Part         = 1.0
+CPU_Part         = 0.0
-GPU_Part         = 0.0
+GPU_Part         = 1.0
 #################################################
--- a/AMSS_NCKU_Verify_ASC26.py
+++ b/AMSS_NCKU_Verify_ASC26.py
@@ -1,279 +0,0 @@
 #!/usr/bin/env python3
 """
 AMSS-NCKU GW150914 Simulation Regression Test Script
 Verification Requirements:
 1. XY-plane trajectory RMS error < 1% (Optimized vs. baseline, max of BH1 and BH2)
 2. ADM constraint violation < 2 (Grid Level 0)
 RMS Calculation Method:
 - Computes trajectory deviation on the XY plane independently for BH1 and BH2
 - For each black hole: RMS = sqrt((1/M) * sum((Δr_i / r_i^max)^2)) × 100%
 - Final RMS = max(RMS_BH1, RMS_BH2)
 Usage: python3 AMSS_NCKU_Verify_ASC26.py [output_dir]
 Default: output_dir = GW150914/AMSS_NCKU_output
 Reference: GW150914-origin (baseline simulation)
 """
 import numpy as np
 import sys
 import os
 # ANSI Color Codes
 class Color:
    GREEN = '\033[92m'
    RED = '\033[91m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    BOLD = '\033[1m'
    RESET = '\033[0m'
 def get_status_text(passed):
    if passed:
        return f"{Color.GREEN}{Color.BOLD}PASS{Color.RESET}"
    else:
        return f"{Color.RED}{Color.BOLD}FAIL{Color.RESET}"
 def load_bh_trajectory(filepath):
    """Load black hole trajectory data"""
    data = np.loadtxt(filepath)
    return {
        'time': data[:, 0],
        'x1': data[:, 1], 'y1': data[:, 2], 'z1': data[:, 3],
        'x2': data[:, 4], 'y2': data[:, 5], 'z2': data[:, 6]
    }
 def load_constraint_data(filepath):
    """Load constraint violation data"""
    data = []
    with open(filepath, 'r') as f:
        for line in f:
            if line.startswith('#'):
                continue
            parts = line.split()
            if len(parts) >= 8:
                data.append([float(x) for x in parts[:8]])
    return np.array(data)
 def calculate_rms_error(bh_data_ref, bh_data_target):
    """
    Calculate trajectory-based RMS error on the XY plane between baseline and optimized simulations.
    This function computes the RMS error independently for BH1 and BH2 trajectories,
    then returns the maximum of the two as the final RMS error metric.
    For each black hole, the RMS is calculated as:
        RMS = sqrt( (1/M) * sum( (Δr_i / r_i^max)^2 ) ) × 100%
    where:
        Δr_i = sqrt((x_ref,i - x_new,i)^2 + (y_ref,i - y_new,i)^2)
        r_i^max = max(sqrt(x_ref,i^2 + y_ref,i^2), sqrt(x_new,i^2 + y_new,i^2))
    Args:
        bh_data_ref: Reference (baseline) trajectory data
        bh_data_target: Target (optimized) trajectory data
    Returns:
        rms_value: Final RMS error as a percentage (max of BH1 and BH2)
        error: Error message if any
    """
    # Align data: truncate to the length of the shorter dataset
    M = min(len(bh_data_ref['time']), len(bh_data_target['time']))
    if M < 10:
        return None, "Insufficient data points for comparison"
    # Extract XY coordinates for both black holes
    x1_ref = bh_data_ref['x1'][:M]
    y1_ref = bh_data_ref['y1'][:M]
    x2_ref = bh_data_ref['x2'][:M]
    y2_ref = bh_data_ref['y2'][:M]
    x1_new = bh_data_target['x1'][:M]
    y1_new = bh_data_target['y1'][:M]
    x2_new = bh_data_target['x2'][:M]
    y2_new = bh_data_target['y2'][:M]
    # Calculate RMS for BH1
    delta_r1 = np.sqrt((x1_ref - x1_new)**2 + (y1_ref - y1_new)**2)
    r1_ref = np.sqrt(x1_ref**2 + y1_ref**2)
    r1_new = np.sqrt(x1_new**2 + y1_new**2)
    r1_max = np.maximum(r1_ref, r1_new)
    # Calculate RMS for BH2
    delta_r2 = np.sqrt((x2_ref - x2_new)**2 + (y2_ref - y2_new)**2)
    r2_ref = np.sqrt(x2_ref**2 + y2_ref**2)
    r2_new = np.sqrt(x2_new**2 + y2_new**2)
    r2_max = np.maximum(r2_ref, r2_new)
    # Avoid division by zero for BH1
    valid_mask1 = r1_max > 1e-15
    if np.sum(valid_mask1) < 10:
        return None, "Insufficient valid data points for BH1"
    terms1 = (delta_r1[valid_mask1] / r1_max[valid_mask1])**2
    rms_bh1 = np.sqrt(np.mean(terms1)) * 100
    # Avoid division by zero for BH2
    valid_mask2 = r2_max > 1e-15
    if np.sum(valid_mask2) < 10:
        return None, "Insufficient valid data points for BH2"
    terms2 = (delta_r2[valid_mask2] / r2_max[valid_mask2])**2
    rms_bh2 = np.sqrt(np.mean(terms2)) * 100
    # Final RMS is the maximum of BH1 and BH2
    rms_final = max(rms_bh1, rms_bh2)
    return rms_final, None
 def analyze_constraint_violation(constraint_data, n_levels=9):
    """
    Analyze ADM constraint violation
    Return maximum constraint violation for Grid Level 0
    """
    # Extract Grid Level 0 data (first entry for each time step)
    level0_data = constraint_data[::n_levels]
    # Calculate maximum absolute value for each constraint
    results = {
        'Ham': np.max(np.abs(level0_data[:, 1])),
        'Px': np.max(np.abs(level0_data[:, 2])),
        'Py': np.max(np.abs(level0_data[:, 3])),
        'Pz': np.max(np.abs(level0_data[:, 4])),
        'Gx': np.max(np.abs(level0_data[:, 5])),
        'Gy': np.max(np.abs(level0_data[:, 6])),
        'Gz': np.max(np.abs(level0_data[:, 7]))
    }
    results['max_violation'] = max(results.values())
    return results
 def print_header():
    """Print report header"""
    print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
    print(Color.BOLD + "   AMSS-NCKU GW150914 Simulation Regression Test Report" + Color.RESET)
    print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
 def print_rms_results(rms_rel, error, threshold=1.0):
    """Print RMS error results"""
    print(f"\n{Color.BOLD}1. RMS Error Analysis (Baseline vs Optimized){Color.RESET}")
    print("-" * 45)
    if error:
        print(f"   {Color.RED}Error: {error}{Color.RESET}")
        return False
    passed = rms_rel < threshold
    print(f"   RMS relative error: {rms_rel:.4f}%")
    print(f"   Requirement:        < {threshold}%")
    print(f"   Status:             {get_status_text(passed)}")
    return passed
 def print_constraint_results(results, threshold=2.0):
    """Print constraint violation results"""
    print(f"\n{Color.BOLD}2. ADM Constraint Violation Analysis (Grid Level 0){Color.RESET}")
    print("-" * 45)
    names = ['Ham', 'Px', 'Py', 'Pz', 'Gx', 'Gy', 'Gz']
    for i, name in enumerate(names):
        print(f"   Max |{name:3}|: {results[name]:.6f}", end="   ")
        if (i + 1) % 2 == 0: print()
    if len(names) % 2 != 0: print()
    passed = results['max_violation'] < threshold
    print(f"\n   Maximum violation:  {results['max_violation']:.6f}")
    print(f"   Requirement:        < {threshold}")
    print(f"   Status:             {get_status_text(passed)}")
    return passed
 def print_summary(rms_passed, constraint_passed):
    """Print summary"""
    print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
    print(Color.BOLD + "Verification Summary" + Color.RESET)
    print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
    all_passed = rms_passed and constraint_passed
    res_rms = get_status_text(rms_passed)
    res_con = get_status_text(constraint_passed)
    print(f"   [1] RMS trajectory check:         {res_rms}")
    print(f"   [2] ADM constraint check:         {res_con}")
    final_status = f"{Color.GREEN}{Color.BOLD}ALL CHECKS PASSED{Color.RESET}" if all_passed else f"{Color.RED}{Color.BOLD}SOME CHECKS FAILED{Color.RESET}"
    print(f"\n   Overall result: {final_status}")
    print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET + "\n")
    return all_passed
 def main():
    # Determine target (optimized) output directory
    if len(sys.argv) > 1:
        target_dir = sys.argv[1]
    else:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        target_dir = os.path.join(script_dir, "GW150914/AMSS_NCKU_output")
    # Determine reference (baseline) directory
    script_dir = os.path.dirname(os.path.abspath(__file__))
    reference_dir = os.path.join(script_dir, "GW150914-origin/AMSS_NCKU_output")
    # Data file paths
    bh_file_ref = os.path.join(reference_dir, "bssn_BH.dat")
    bh_file_target = os.path.join(target_dir, "bssn_BH.dat")
    constraint_file = os.path.join(target_dir, "bssn_constraint.dat")
    # Check if files exist
    if not os.path.exists(bh_file_ref):
        print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Baseline trajectory file not found: {bh_file_ref}")
        sys.exit(1)
    if not os.path.exists(bh_file_target):
        print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Target trajectory file not found: {bh_file_target}")
        sys.exit(1)
    if not os.path.exists(constraint_file):
        print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Constraint data file not found: {constraint_file}")
        sys.exit(1)
    # Print header
    print_header()
    print(f"\n{Color.BOLD}Reference (Baseline):{Color.RESET} {Color.BLUE}{reference_dir}{Color.RESET}")
    print(f"{Color.BOLD}Target (Optimized):  {Color.RESET} {Color.BLUE}{target_dir}{Color.RESET}")
    # Load data
    bh_data_ref = load_bh_trajectory(bh_file_ref)
    bh_data_target = load_bh_trajectory(bh_file_target)
    constraint_data = load_constraint_data(constraint_file)
    # Calculate RMS error
    rms_rel, error = calculate_rms_error(bh_data_ref, bh_data_target)
    rms_passed = print_rms_results(rms_rel, error)
    # Analyze constraint violation
    constraint_results = analyze_constraint_violation(constraint_data)
    constraint_passed = print_constraint_results(constraint_results)
    # Print summary
    all_passed = print_summary(rms_passed, constraint_passed)
    # Return exit code
    sys.exit(0 if all_passed else 1)
 if __name__ == "__main__":
    main()
--- a/AMSS_NCKU_source/FFT.f90
+++ b/AMSS_NCKU_source/FFT.f90
@@ -37,51 +37,57 @@ close(77)
 end program checkFFT
 #endif
 !-------------
 ! Optimized FFT using Intel oneMKL DFTI
 ! Mathematical equivalence: Standard DFT definition
 !   Forward (isign=1):  X[k] = sum_{n=0}^{N-1} x[n] * exp(-2*pi*i*k*n/N)
 !   Backward (isign=-1): X[k] = sum_{n=0}^{N-1} x[n] * exp(+2*pi*i*k*n/N)
 ! Input/Output: dataa is interleaved complex array [Re(0),Im(0),Re(1),Im(1),...]
 !-------------
 SUBROUTINE four1(dataa,nn,isign)
 use MKL_DFTI
 implicit none
-INTEGER, intent(in) :: isign, nn
+INTEGER::isign,nn
-DOUBLE PRECISION, dimension(2*nn), intent(inout) :: dataa
+double precision,dimension(2*nn)::dataa
-
+INTEGER::i,istep,j,m,mmax,n
-type(DFTI_DESCRIPTOR), pointer :: desc
+double precision::tempi,tempr
-integer :: status
+DOUBLE PRECISION::theta,wi,wpi,wpr,wr,wtemp
-
+n=2*nn
-! Create DFTI descriptor for 1D complex-to-complex transform
+j=1
-status = DftiCreateDescriptor(desc, DFTI_DOUBLE, DFTI_COMPLEX, 1, nn)
+do i=1,n,2
-if (status /= 0) return
+  if(j.gt.i)then
-
+     tempr=dataa(j)
-! Set input/output storage as interleaved complex (default)
+     tempi=dataa(j+1)
-status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_INPLACE)
+     dataa(j)=dataa(i)
-if (status /= 0) then
+     dataa(j+1)=dataa(i+1)
-   status = DftiFreeDescriptor(desc)
+     dataa(i)=tempr
-   return
+     dataa(i+1)=tempi
  endif
  m=nn
 1 if ((m.ge.2).and.(j.gt.m)) then
  j=j-m
  m=m/2
 goto 1
  endif
 j=j+m
 enddo
 mmax=2
 2  if (n.gt.mmax) then
     istep=2*mmax
     theta=6.28318530717959d0/(isign*mmax)
     wpr=-2.d0*sin(0.5d0*theta)**2
     wpi=sin(theta)
     wr=1.d0
     wi=0.d0
     do m=1,mmax,2
       do i=m,n,istep
         j=i+mmax
         tempr=sngl(wr)*dataa(j)-sngl(wi)*dataa(j+1)
         tempi=sngl(wr)*dataa(j+1)+sngl(wi)*dataa(j)
         dataa(j)=dataa(i)-tempr
         dataa(j+1)=dataa(i+1)-tempi
         dataa(i)=dataa(i)+tempr
         dataa(i+1)=dataa(i+1)+tempi
       enddo
          wtemp=wr
          wr=wr*wpr-wi*wpi+wr
          wi=wi*wpr+wtemp*wpi+wi
     enddo
 mmax=istep
 goto 2
 endif
 ! Commit the descriptor
 status = DftiCommitDescriptor(desc)
 if (status /= 0) then
   status = DftiFreeDescriptor(desc)
   return
 endif
 ! Execute FFT based on direction
 if (isign == 1) then
   ! Forward FFT: exp(-2*pi*i*k*n/N)
   status = DftiComputeForward(desc, dataa)
 else
   ! Backward FFT: exp(+2*pi*i*k*n/N)
   status = DftiComputeBackward(desc, dataa)
 endif
 ! Free descriptor
 status = DftiFreeDescriptor(desc)
 return
 END SUBROUTINE four1
--- a/AMSS_NCKU_source/TwoPunctures.C
+++ b/AMSS_NCKU_source/TwoPunctures.C
@@ -27,7 +27,6 @@ using namespace std;
 #endif
 #include "TwoPunctures.h"
 #include <mkl_cblas.h>
 TwoPunctures::TwoPunctures(double mp, double mm, double b,
                           double P_plusx, double P_plusy, double P_plusz,
@@ -892,17 +891,25 @@ double TwoPunctures::norm1(double *v, int n)
 /* -------------------------------------------------------------------------*/
 double TwoPunctures::norm2(double *v, int n)
 {
-  // Optimized with oneMKL BLAS DNRM2
+  int i;
-  // Computes: sqrt(sum(v[i]^2))
+  double result = 0;
-  return cblas_dnrm2(n, v, 1);
+
  for (i = 0; i < n; i++)
    result += v[i] * v[i];
  return sqrt(result);
 }
 /* -------------------------------------------------------------------------*/
 double TwoPunctures::scalarproduct(double *v, double *w, int n)
 {
-  // Optimized with oneMKL BLAS DDOT
+  int i;
-  // Computes: sum(v[i] * w[i])
+  double result = 0;
-  return cblas_ddot(n, v, 1, w, 1);
+
  for (i = 0; i < n; i++)
    result += v[i] * w[i];
  return result;
 }
 /* -------------------------------------------------------------------------*/
--- a/AMSS_NCKU_source/bssn_gpu.cu
+++ b/AMSS_NCKU_source/bssn_gpu.cu
@@ -18,7 +18,7 @@ using namespace std;
 #include <fstream>
 #endif
-void compare_result_gpu(int ftag1,double * datac,int data_num){
+static void compare_result_gpu(int ftag1,double * datac,int data_num){
 	double * data = (double*)malloc(sizeof(double)*data_num);
 	cudaMemcpy(data, datac, data_num * sizeof(double), cudaMemcpyDeviceToHost);
 	compare_result(ftag1,data,data_num);
@@ -83,7 +83,7 @@ inline void sub_enforce_ga(int matrix_size){
 	double * trA = M_ chin1;
 	enforce_ga<<<GRID_DIM,BLOCK_DIM>>>(trA);
 	cudaMemset(trA,0,matrix_size * sizeof(double));
-	cudaThreadSynchronize(); 
+	cudaDeviceSynchronize(); 
 	//cudaMemset(Mh_ gupxx,0,matrix_size * sizeof(double));
 	//trA gxx,gyy,gzz gupxx,gupxy,gupxz,gupyy,gupyz,gupzz
@@ -273,13 +273,13 @@ __global__ void sub_symmetry_bd_partK(int ord,double * func, double * funcc,doub
 #endif //ifdef Vertex
 inline void sub_symmetry_bd(int ord,double * func, double * funcc,double * SoA){
 	sub_symmetry_bd_partF<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_symmetry_bd_partI<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[0]);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_symmetry_bd_partJ<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[1]);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_symmetry_bd_partK<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[2]);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 }
@@ -378,9 +378,9 @@ inline void sub_fdderivs(double * f,double *fh,double *fxx,double *fxy,double *f
 	cudaMemset(fyy,0,_3D_SIZE[0] * sizeof(double));
 	cudaMemset(fyz,0,_3D_SIZE[0] * sizeof(double));
 	cudaMemset(fzz,0,_3D_SIZE[0] * sizeof(double));
-	cudaThreadSynchronize(); 
+	cudaDeviceSynchronize(); 
 	sub_fdderivs_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,fxx,fxy,fxz,fyy,fyz,fzz);
-	cudaThreadSynchronize(); 
+	cudaDeviceSynchronize(); 
 }
 __global__ void sub_fderivs_part1(double * f,double * fh,double *fx,double *fy,double *fz  )
@@ -445,9 +445,9 @@ inline void sub_fderivs(double * f,double * fh,double *fx,double *fy,double *fz,
 	cudaMemset(fy,0,_3D_SIZE[0] * sizeof(double));
 	cudaMemset(fz,0,_3D_SIZE[0] * sizeof(double));
-	cudaThreadSynchronize(); 
+	cudaDeviceSynchronize(); 
 	sub_fderivs_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,fx,fy,fz);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 }
 __global__ void computeRicci_part1(double * dst)
@@ -465,9 +465,9 @@ __global__ void computeRicci_part1(double * dst)
 inline void computeRicci(double * src,double* dst,double * SoA, Meta* meta)
 {
 	sub_fdderivs(src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,SoA);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	computeRicci_part1<<<GRID_DIM,BLOCK_DIM>>>(dst);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 }/*Exception*/
@@ -524,9 +524,9 @@ __global__ void sub_kodis_part1(double *f,double *fh,double *f_rhs)
 inline void sub_kodis(double *f,double *fh,double *f_rhs,double *SoA)
 {
 	sub_symmetry_bd(3,f,fh,SoA);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_kodis_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 }
 __global__ void  sub_lopsided_part1(double *f,double* fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz)
@@ -617,9 +617,9 @@ __global__ void  sub_lopsided_part1(double *f,double* fh,double *f_rhs,double *S
 inline void  sub_lopsided(double *f,double*fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz,double *SoA){
 	sub_symmetry_bd(3,f,fh,SoA);
-	cudaThreadSynchronize(); 
+	cudaDeviceSynchronize(); 
 	sub_lopsided_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs,Sfx,Sfy,Sfz);
-	cudaThreadSynchronize(); 
+	cudaDeviceSynchronize(); 
 }
 __global__ void compute_rhs_bssn_part1() 
@@ -2656,13 +2656,13 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 #ifdef TIMING1
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	gettimeofday(&tv2, NULL);
   	cout<<"TIME USED"<<TimeBetween(tv1, tv2)<<endl; 
 #endif	
 	//cout<<"GPU meta data ready.\n";
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 //--------------test constant memory address & value--------------
 /*	double rank = mpi_rank;
@@ -2685,7 +2685,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 	//sub_enforce_ga(matrix_size);
 	//4.1-----compute rhs---------
 	compute_rhs_bssn_part1<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_fderivs(Mh_ betax,Mh_ fh,Mh_ betaxx,Mh_ betaxy,Mh_ betaxz,ass);
 	sub_fderivs(Mh_ betay,Mh_ fh,Mh_ betayx,Mh_ betayy,Mh_ betayz,sas);
@@ -2701,7 +2701,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 	sub_fderivs(Mh_ gyz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz, saa);
  	compute_rhs_bssn_part2<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_fdderivs(Mh_ betax,Mh_ fh,Mh_ gxxx,Mh_ gxyx,Mh_ gxzx,Mh_ gyyx,Mh_ gyzx,Mh_ gzzx,ass);
 	sub_fdderivs(Mh_ betay,Mh_ fh,Mh_ gxxy,Mh_ gxyy,Mh_ gxzy,Mh_ gyyy,Mh_ gyzy,Mh_ gzzy,sas);
@@ -2711,7 +2711,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 	sub_fderivs( Mh_ Gamz, Mh_ fh,Mh_ Gamzx, Mh_ Gamzy, Mh_ Gamzz,ssa);
 	compute_rhs_bssn_part3<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	computeRicci(Mh_ dxx,Mh_ Rxx,sss, meta);
 	computeRicci(Mh_ dyy,Mh_ Ryy,sss, meta);
@@ -2720,20 +2720,20 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 	computeRicci(Mh_ gxz,Mh_ Rxz,asa, meta);
 	computeRicci(Mh_ gyz,Mh_ Ryz,saa, meta);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	compute_rhs_bssn_part4<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_fdderivs(Mh_ chi,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
 	compute_rhs_bssn_part5<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_fdderivs(Mh_ Lap,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
 	compute_rhs_bssn_part6<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 #if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5)
 	sub_fderivs(Mh_ chi,Mh_ fh, Mh_ dtSfx_rhs, Mh_ dtSfy_rhs, Mh_ dtSfz_rhs,sss);
@@ -2805,7 +2805,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 	if(co == 0){
 		compute_rhs_bssn_part7<<<GRID_DIM,BLOCK_DIM>>>();
-		cudaThreadSynchronize();
+		cudaDeviceSynchronize();
 		sub_fderivs(Mh_ Axx,Mh_ fh,Mh_ gxxx,Mh_ gxxy,Mh_ gxxz,sss);
 		sub_fderivs(Mh_ Axy,Mh_ fh,Mh_ gxyx,Mh_ gxyy,Mh_ gxyz,aas);
@@ -2814,7 +2814,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 		sub_fderivs(Mh_ Ayz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz,saa);
 		sub_fderivs(Mh_ Azz,Mh_ fh,Mh_ gzzx,Mh_ gzzy,Mh_ gzzz,sss);
 		compute_rhs_bssn_part8<<<GRID_DIM,BLOCK_DIM>>>();
-		cudaThreadSynchronize();
+		cudaDeviceSynchronize();
 	}
 #if (ABV == 1)
@@ -2895,7 +2895,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 //-------------------FOR GPU TEST----------------------
 //-----------------------------------------------------
 #ifdef TIMING
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	gettimeofday(&tv2, NULL);
   	cout<<"MPI rank is: "<<mpi_rank<<" GPU TIME is"<<TimeBetween(tv1, tv2)<<" (s)."<<endl; 
 #endif
--- a/AMSS_NCKU_source/bssn_gpu.h
+++ b/AMSS_NCKU_source/bssn_gpu.h
@@ -4,6 +4,17 @@
 #include "bssn_macro.h"
 #include "macrodef.fh"
 // CUDA error checking macro for CUDA 13 compatibility
 #define CUDA_SAFE_CALL(call) \
    do { \
        cudaError_t err = call; \
        if (err != cudaSuccess) { \
            fprintf(stderr, "CUDA error in %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)
 #define DEVICE_ID 0
 // #define DEVICE_ID_BY_MPI_RANK
 #define GRID_DIM 256
--- a/AMSS_NCKU_source/bssn_gpu_class.C
+++ b/AMSS_NCKU_source/bssn_gpu_class.C
@@ -2134,7 +2134,9 @@ void bssn_class::Evolve(int Steps)
      CheckPoint->write_Black_Hole_position(BH_num_input, BH_num, Porg0, Porgbr, Mass);
      CheckPoint->writecheck_cgh(PhysTime, GH);
 #ifdef WithShell
      CheckPoint->writecheck_sh(PhysTime, SH);
 #endif
      CheckPoint->write_bssn(LastDump, Last2dDump, LastAnas);
    }
  }
--- a/AMSS_NCKU_source/bssn_gpu_rhs_ss.cu
+++ b/AMSS_NCKU_source/bssn_gpu_rhs_ss.cu
@@ -20,7 +20,7 @@ using namespace std;
 __device__ volatile unsigned int global_count = 0;
-void compare_result_gpu(int ftag1,double * datac,int data_num){
+static void compare_result_gpu(int ftag1,double * datac,int data_num){
 	double * data = (double*)malloc(sizeof(double)*data_num);
 	cudaMemcpy(data, datac, data_num * sizeof(double), cudaMemcpyDeviceToHost);
 	compare_result(ftag1,data,data_num);
@@ -153,11 +153,11 @@ __global__ void sub_symmetry_bd_ss_partJ(int ord,double * func, double * funcc,d
 inline void sub_symmetry_bd_ss(int ord,double * func, double * funcc,double * SoA){
 	sub_symmetry_bd_ss_partF<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_symmetry_bd_ss_partI<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[0]);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_symmetry_bd_ss_partJ<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[1]);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 }
 __global__ void sub_fderivs_shc_part1(double *fx,double *fy,double *fz){
@@ -247,13 +247,13 @@ inline void sub_fderivs_shc(int& sst,double * f,double * fh,double *fx,double *f
 	//cudaMemset(Msh_ gy,0,h_3D_SIZE[0] * sizeof(double));
 	//cudaMemset(Msh_ gz,0,h_3D_SIZE[0] * sizeof(double));
 	sub_symmetry_bd_ss(2,f,fh,SoA1);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 			//compare_result_gpu(0,fh,h_3D_SIZE[2]);	
 	sub_fderivs_sh<<<GRID_DIM,BLOCK_DIM>>>(fh,Msh_ gx,Msh_ gy,Msh_ gz);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_fderivs_shc_part1<<<GRID_DIM,BLOCK_DIM>>>(fx,fy,fz);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 			//compare_result_gpu(1,fx,h_3D_SIZE[0]);
 			//compare_result_gpu(2,fy,h_3D_SIZE[0]);
 			//compare_result_gpu(3,fz,h_3D_SIZE[0]);
@@ -451,17 +451,17 @@ inline void sub_fdderivs_shc(int& sst,double * f,double * fh,
 	//fderivs_sh
 	sub_symmetry_bd_ss(2,f,fh,SoA1);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 			//compare_result_gpu(1,fh,h_3D_SIZE[2]);	
 	sub_fderivs_sh<<<GRID_DIM,BLOCK_DIM>>>(fh,Msh_ gx,Msh_ gy,Msh_ gz);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	//fdderivs_sh
 	sub_symmetry_bd_ss(2,f,fh,SoA1);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 			//compare_result_gpu(21,fh,h_3D_SIZE[2]);
 	sub_fdderivs_sh<<<GRID_DIM,BLOCK_DIM>>>(fh,Msh_ gxx,Msh_ gxy,Msh_ gxz,Msh_ gyy,Msh_ gyz,Msh_ gzz);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 			/*compare_result_gpu(11,Msh_ gx,h_3D_SIZE[0]);
 			compare_result_gpu(12,Msh_ gy,h_3D_SIZE[0]);
 			compare_result_gpu(13,Msh_ gz,h_3D_SIZE[0]);
@@ -472,7 +472,7 @@ inline void sub_fdderivs_shc(int& sst,double * f,double * fh,
 			compare_result_gpu(5,Msh_ gyz,h_3D_SIZE[0]);
 			compare_result_gpu(6,Msh_ gzz,h_3D_SIZE[0]);*/
 	sub_fdderivs_shc_part1<<<GRID_DIM,BLOCK_DIM>>>(fxx,fxy,fxz,fyy,fyz,fzz);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 			/*compare_result_gpu(1,fxx,h_3D_SIZE[0]);
 			compare_result_gpu(2,fxy,h_3D_SIZE[0]);
 			compare_result_gpu(3,fxz,h_3D_SIZE[0]);
@@ -496,9 +496,9 @@ __global__ void computeRicci_ss_part1(double * dst)
 inline void computeRicci_ss(int &sst,double * src,double* dst,double * SoA, Meta* meta)
 {
 	sub_fdderivs_shc(sst,src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,SoA);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	computeRicci_ss_part1<<<GRID_DIM,BLOCK_DIM>>>(dst);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 }
 __global__ void sub_lopsided_ss_part1(double * dst)
@@ -516,9 +516,9 @@ __global__ void sub_lopsided_ss_part1(double * dst)
 inline void sub_lopsided_ss(int& sst,double *src,double* dst,double *SoA)
 {
 		sub_fderivs_shc(sst,src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,SoA);
-		cudaThreadSynchronize();
+		cudaDeviceSynchronize();
 		sub_lopsided_ss_part1<<<GRID_DIM,BLOCK_DIM>>>(dst);
-		cudaThreadSynchronize();
+		cudaDeviceSynchronize();
 }
 __global__ void sub_kodis_sh_part1(double *f,double *fh,double *f_rhs)
@@ -590,11 +590,11 @@ inline void sub_kodis_ss(int &sst,double *f,double *fh,double *f_rhs,double *SoA
 	}
 			//compare_result_gpu(10,f,h_3D_SIZE[0]);
 	sub_symmetry_bd_ss(3,f,fh,SoA1);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 			//compare_result_gpu(0,fh,h_3D_SIZE[3]);
 	sub_kodis_sh_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 			//compare_result_gpu(1,f_rhs,h_3D_SIZE[0]);
 }
@@ -2287,13 +2287,13 @@ int gpu_rhs_ss(RHS_SS_PARA)
 #ifdef TIMING1
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	gettimeofday(&tv2, NULL);
   	cout<<"TIME USED"<<TimeBetween(tv1, tv2)<<endl; 
 #endif	
 	//cout<<"GPU meta data ready.\n";
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 //-------------get device info-------------------------------------
@@ -2306,7 +2306,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	//sub_enforce_ga(matrix_size);
 	//4.1-----compute rhs---------
 	compute_rhs_ss_part1<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_fderivs_shc(sst,Mh_ betax,Mh_ fh,Mh_ betaxx,Mh_ betaxy,Mh_ betaxz,ass);
 	sub_fderivs_shc(sst,Mh_ betay,Mh_ fh,Mh_ betayx,Mh_ betayy,Mh_ betayz,sas);
@@ -2322,7 +2322,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	sub_fderivs_shc(sst,Mh_ gyz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz, saa);
 	compute_rhs_ss_part2<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_fdderivs_shc(sst,Mh_ betax,Mh_ fh,Mh_ gxxx,Mh_ gxyx,Mh_ gxzx,Mh_ gyyx,Mh_ gyzx,Mh_ gzzx,ass);
 	sub_fdderivs_shc(sst,Mh_ betay,Mh_ fh,Mh_ gxxy,Mh_ gxyy,Mh_ gxzy,Mh_ gyyy,Mh_ gyzy,Mh_ gzzy,sas);
@@ -2332,7 +2332,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	sub_fderivs_shc( sst,Mh_ Gamz, Mh_ fh,Mh_ Gamzx, Mh_ Gamzy, Mh_ Gamzz,ssa);
 	compute_rhs_ss_part3<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	computeRicci_ss(sst,Mh_ dxx,Mh_ Rxx,sss, meta);
 	computeRicci_ss(sst,Mh_ dyy,Mh_ Ryy,sss, meta);
@@ -2340,25 +2340,25 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	computeRicci_ss(sst,Mh_ gxy,Mh_ Rxy,aas, meta);
 	computeRicci_ss(sst,Mh_ gxz,Mh_ Rxz,asa, meta);
 	computeRicci_ss(sst,Mh_ gyz,Mh_ Ryz,saa, meta);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	compute_rhs_ss_part4<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_fdderivs_shc(sst,Mh_ chi,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
-	//cudaThreadSynchronize();
+	//cudaDeviceSynchronize();
 	//compare_result_gpu(0,Mh_ chi,h_3D_SIZE[0]);
 	//compare_result_gpu(1,Mh_ chi,h_3D_SIZE[0]);
 	//compare_result_gpu(2,Mh_ fyz,h_3D_SIZE[0]);
 	compute_rhs_ss_part5<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_fdderivs_shc(sst,Mh_ Lap,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
 	compute_rhs_ss_part6<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 #if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5)
 	sub_fderivs_shc(sst,Mh_ chi,Mh_ fh, Mh_ dtSfx_rhs, Mh_ dtSfy_rhs, Mh_ dtSfz_rhs,sss);
@@ -2423,7 +2423,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	}
 	if(co == 0){
 		compute_rhs_ss_part7<<<GRID_DIM,BLOCK_DIM>>>();
-		cudaThreadSynchronize();
+		cudaDeviceSynchronize();
 		sub_fderivs_shc(sst,Mh_ Axx,Mh_ fh,Mh_ gxxx,Mh_ gxxy,Mh_ gxxz,sss);
 		sub_fderivs_shc(sst,Mh_ Axy,Mh_ fh,Mh_ gxyx,Mh_ gxyy,Mh_ gxyz,aas);
@@ -2432,7 +2432,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 		sub_fderivs_shc(sst,Mh_ Ayz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz,saa);
 		sub_fderivs_shc(sst,Mh_ Azz,Mh_ fh,Mh_ gzzx,Mh_ gzzy,Mh_ gzzz,sss);
 		compute_rhs_ss_part8<<<GRID_DIM,BLOCK_DIM>>>();
-		cudaThreadSynchronize();
+		cudaDeviceSynchronize();
 	}
 #if (ABV == 1)
@@ -2512,7 +2512,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	//test kodis
 	//sub_kodis_sh(sst,Msh_ drhodx,Mh_ fh2,Msh_ drhody,sss);
 #ifdef TIMING
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	gettimeofday(&tv2, NULL);
   	cout<<"MPI rank is: "<<mpi_rank<<" GPU TIME is"<<TimeBetween(tv1, tv2)<<" (s)."<<endl; 
 #endif
--- a/AMSS_NCKU_source/bssn_step_gpu.C
+++ b/AMSS_NCKU_source/bssn_step_gpu.C
@@ -1676,8 +1676,11 @@ void bssn_class::Step_GPU(int lev, int YN)
 #endif // PSTR == ?
 //--------------------------With Shell--------------------------
 // Note: SHStep() implementation is in bssn_gpu_class.C
 #ifdef WithShell
 #if 0
 // This SHStep() implementation has been moved to bssn_gpu_class.C to avoid duplicate definition
 void bssn_class::SHStep()
 {
  int lev = 0;
@@ -1938,5 +1941,5 @@ void bssn_class::SHStep()
    sPp = sPp->next;
  }
 }
-d
+#endif // #if 0
 #endif // withshell
--- a/AMSS_NCKU_source/diff_new.f90
+++ b/AMSS_NCKU_source/diff_new.f90
@@ -69,7 +69,6 @@
  fy = ZEO
  fz = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -152,7 +151,6 @@
  fx = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -220,7 +218,6 @@
  fy = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -285,7 +282,6 @@
  fz = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -375,7 +371,6 @@
  fxz = ZEO
  fyz = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -474,7 +469,6 @@
  fxx = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -537,7 +531,6 @@
  fyy = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -601,7 +594,6 @@
  fzz = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -665,7 +657,6 @@
  fxy = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -728,7 +719,6 @@
  fxz = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -790,7 +780,6 @@
  fyz = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -877,7 +866,6 @@
  fxz = ZEO
  fyz = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1009,7 +997,6 @@
  fy = ZEO
  fz = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1164,7 +1151,6 @@
  fx = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1241,7 +1227,6 @@
  fy = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1312,7 +1297,6 @@
  fz = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1417,7 +1401,6 @@
  fxz = ZEO
  fyz = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1593,7 +1576,6 @@
  fxx = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1661,7 +1643,6 @@
  fyy = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1731,7 +1712,6 @@
  fzz = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1801,7 +1781,6 @@
  fxy = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1872,7 +1851,6 @@
  fxz = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1941,7 +1919,6 @@
  fyz = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -2034,7 +2011,6 @@
  fy = ZEO
  fz = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -2151,7 +2127,6 @@
  fx = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -2237,7 +2212,6 @@
  fy = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -2314,7 +2288,6 @@
  fz = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -2433,7 +2406,6 @@
  fxz = ZEO
  fyz = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -2621,7 +2593,6 @@
  fxx = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -2694,7 +2665,6 @@
  fyy = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -2770,7 +2740,6 @@
  fzz = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -2846,7 +2815,6 @@
  fxy = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -2927,7 +2895,6 @@
  fxz = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -3006,7 +2973,6 @@
  fyz = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -3114,7 +3080,6 @@
  fy = ZEO
  fz = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -3251,7 +3216,6 @@
  fx = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -3347,7 +3311,6 @@
  fy = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -3432,7 +3395,6 @@
  fz = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -3568,7 +3530,6 @@
  fxz = ZEO
  fyz = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -3841,7 +3802,6 @@
  fxx = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -3923,7 +3883,6 @@
  fyy = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -4008,7 +3967,6 @@
  fzz = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -4093,7 +4051,6 @@
  fxy = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -4196,7 +4153,6 @@
  fxz = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -4297,7 +4253,6 @@
  fyz = ZEO
  !$omp parallel do collapse(3) schedule(static)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
--- a/AMSS_NCKU_source/fmisc.f90
+++ b/AMSS_NCKU_source/fmisc.f90
@@ -1276,9 +1276,7 @@ end subroutine d2dump
  real*8            :: dX, dY, dZ
  integer::imin,jmin,kmin
  integer::imax,jmax,kmax
-  integer::i,j,k,n_elements
+  integer::i,j,k
  real*8, dimension(:), allocatable :: f_flat
  real*8, external :: DDOT
  dX = X(2) - X(1)
  dY = Y(2) - Y(1)
@@ -1302,12 +1300,7 @@ if(dabs(X(1)-xmin) < dX) imin = 1
 if(dabs(Y(1)-ymin) < dY) jmin = 1
 if(dabs(Z(1)-zmin) < dZ) kmin = 1
-! Optimized with oneMKL BLAS DDOT for dot product
+f_out = sum(f(imin:imax,jmin:jmax,kmin:kmax)*f(imin:imax,jmin:jmax,kmin:kmax))
 n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
 allocate(f_flat(n_elements))
 f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
 f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
 deallocate(f_flat)
 f_out = f_out*dX*dY*dZ
@@ -1332,9 +1325,7 @@ f_out = f_out*dX*dY*dZ
  real*8            :: dX, dY, dZ
  integer::imin,jmin,kmin
  integer::imax,jmax,kmax
-  integer::i,j,k,n_elements
+  integer::i,j,k
  real*8, dimension(:), allocatable :: f_flat
  real*8, external :: DDOT
  real*8 :: PIo4
@@ -1397,12 +1388,7 @@ if(Symmetry==2)then
  if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
 endif
-! Optimized with oneMKL BLAS DDOT for dot product
+f_out = sum(f(imin:imax,jmin:jmax,kmin:kmax)*f(imin:imax,jmin:jmax,kmin:kmax))
 n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
 allocate(f_flat(n_elements))
 f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
 f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
 deallocate(f_flat)
 f_out = f_out*dX*dY*dZ
@@ -1430,8 +1416,6 @@ f_out = f_out*dX*dY*dZ
  integer::imin,jmin,kmin
  integer::imax,jmax,kmax
  integer::i,j,k
  real*8, dimension(:), allocatable :: f_flat
  real*8, external :: DDOT
  real*8 :: PIo4
@@ -1494,12 +1478,11 @@ if(Symmetry==2)then
  if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
 endif
-! Optimized with oneMKL BLAS DDOT for dot product
+f_out = sum(f(imin:imax,jmin:jmax,kmin:kmax)*f(imin:imax,jmin:jmax,kmin:kmax))
 f_out = f_out
 Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
 allocate(f_flat(Nout))
 f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [Nout])
 f_out = DDOT(Nout, f_flat, 1, f_flat, 1)
 deallocate(f_flat)
  return
@@ -1697,7 +1680,6 @@ deallocate(f_flat)
  real*8, dimension(ORDN,ORDN) :: tmp2
  real*8, dimension(ORDN) :: tmp1
  real*8, dimension(3) :: SoAh
  real*8, external :: DDOT
 ! +1 because c++ gives 0 for first point
  cxB = inds+1  
@@ -1733,21 +1715,20 @@ deallocate(f_flat)
     ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),cxB(3):cxT(3))
  endif 
  ! Optimized with BLAS operations for better performance
  ! First dimension: z-direction weighted sum
  tmp2=0
  do m=1,ORDN
    tmp2 = tmp2 + coef(2*ORDN+m)*ya(:,:,m)
  enddo
  ! Second dimension: y-direction weighted sum
  tmp1=0
  do m=1,ORDN
    tmp1 = tmp1 + coef(ORDN+m)*tmp2(:,m)
  enddo
-  ! Third dimension: x-direction weighted sum using BLAS DDOT
+  f_int=0
-  f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
+  do m=1,ORDN
    f_int = f_int + coef(m)*tmp1(m)
  enddo
  return
@@ -1777,7 +1758,6 @@ deallocate(f_flat)
  real*8, dimension(ORDN,ORDN) :: ya
  real*8, dimension(ORDN) :: tmp1
  real*8, dimension(2) :: SoAh
  real*8, external :: DDOT
 ! +1 because c++ gives 0 for first point
  cxB = inds(1:2)+1  
@@ -1807,14 +1787,15 @@ deallocate(f_flat)
     ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),inds(3))
  endif 
  ! Optimized with BLAS operations
  tmp1=0
  do m=1,ORDN
    tmp1 = tmp1 + coef(ORDN+m)*ya(:,m)
  enddo
-  ! Use BLAS DDOT for final weighted sum
+  f_int=0
-  f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
+  do m=1,ORDN
    f_int = f_int + coef(m)*tmp1(m)
  enddo
  return
@@ -1845,7 +1826,6 @@ deallocate(f_flat)
  real*8, dimension(ORDN) :: ya
  real*8 :: SoAh
  integer,dimension(3) :: inds
  real*8, external :: DDOT
 ! +1 because c++ gives 0 for first point
  inds = indsi + 1
@@ -1906,8 +1886,10 @@ deallocate(f_flat)
          write(*,*)"error in global_interpind1d, not recognized dumyd = ",dumyd
  endif
-  ! Optimized with BLAS DDOT for weighted sum
+  f_int=0
-  f_int = DDOT(ORDN, coef, 1, ya, 1)
+  do m=1,ORDN
    f_int = f_int + coef(m)*ya(m)
  enddo
  return
@@ -2139,38 +2121,24 @@ deallocate(f_flat)
  end function fWigner_d_function
 !----------------------------------
 ! Optimized factorial function using lookup table for small N
 ! and log-gamma for large N to avoid overflow
  function ffact(N) result(gont)
  implicit none
  integer,intent(in) :: N
  real*8 :: gont
  integer :: i
-  ! Lookup table for factorials 0! to 20! (precomputed)
+  integer :: i
  real*8, parameter, dimension(0:20) :: fact_table = [ &
    1.d0, 1.d0, 2.d0, 6.d0, 24.d0, 120.d0, 720.d0, 5040.d0, 40320.d0, &
    362880.d0, 3628800.d0, 39916800.d0, 479001600.d0, 6227020800.d0, &
    87178291200.d0, 1307674368000.d0, 20922789888000.d0, &
    355687428096000.d0, 6402373705728000.d0, 121645100408832000.d0, &
    2432902008176640000.d0 ]
 ! sanity check
  if(N < 0)then
     write(*,*) "ffact: error input for factorial"
     gont = 1.d0
     return
  endif
-  ! Use lookup table for small N (fast path)
+  gont = 1.d0
-  if(N <= 20)then
+  do i=1,N
-     gont = fact_table(N)
+     gont = gont*i
-  else
+  enddo
     ! Use log-gamma function for large N: N! = exp(log_gamma(N+1))
     ! This avoids overflow and is computed efficiently
     gont = exp(log_gamma(dble(N+1)))
  endif
  return
--- a/AMSS_NCKU_source/gaussj.C
+++ b/AMSS_NCKU_source/gaussj.C
@@ -16,66 +16,115 @@ using namespace std;
 #include <string.h>
 #include <math.h>
 #endif
-
+/* Linear equation solution by Gauss-Jordan elimination.
 // Intel oneMKL LAPACK interface
 #include <mkl_lapacke.h>
 /* Linear equation solution using Intel oneMKL LAPACK.
 a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input
 containing the right-hand side vectors. On output a is
 replaced by its matrix inverse, and b is replaced by the
-corresponding set of solution vectors.
+corresponding set of solution vectors */
 Mathematical equivalence:
  Solves: A * x = b  =>  x = A^(-1) * b
  Original Gauss-Jordan and LAPACK dgesv/dgetri produce identical results
  within numerical precision. */
 int gaussj(double *a, double *b, int n)
 {
-  // Allocate pivot array and workspace
+  double swap;
  lapack_int *ipiv = new lapack_int[n];
  lapack_int info;
-  // Make a copy of matrix a for solving (dgesv modifies it to LU form)
+  int *indxc, *indxr, *ipiv;
-  double *a_copy = new double[n * n];
+  indxc = new int[n];
-  for (int i = 0; i < n * n; i++) {
+  indxr = new int[n];
-    a_copy[i] = a[i];
+  ipiv = new int[n];
  int i, icol, irow, j, k, l, ll;
  double big, dum, pivinv, temp;
  for (j = 0; j < n; j++)
    ipiv[j] = 0;
  for (i = 0; i < n; i++)
  {
    big = 0.0;
    for (j = 0; j < n; j++)
      if (ipiv[j] != 1)
        for (k = 0; k < n; k++)
        {
          if (ipiv[k] == 0)
          {
            if (fabs(a[j * n + k]) >= big)
            {
              big = fabs(a[j * n + k]);
              irow = j;
              icol = k;
            }
          }
          else if (ipiv[k] > 1)
          {
            cout << "gaussj: Singular Matrix-1" << endl;
            for (int ii = 0; ii < n; ii++)
            {
              for (int jj = 0; jj < n; jj++)
                cout << a[ii * n + jj] << " ";
              cout << endl;
            }
            return 1; // error return
          }
        }
    ipiv[icol] = ipiv[icol] + 1;
    if (irow != icol)
    {
      for (l = 0; l < n; l++)
      {
        swap = a[irow * n + l];
        a[irow * n + l] = a[icol * n + l];
        a[icol * n + l] = swap;
      }
      swap = b[irow];
      b[irow] = b[icol];
      b[icol] = swap;
    }
    indxr[i] = irow;
    indxc[i] = icol;
    if (a[icol * n + icol] == 0.0)
    {
      cout << "gaussj: Singular Matrix-2" << endl;
      for (int ii = 0; ii < n; ii++)
      {
        for (int jj = 0; jj < n; jj++)
          cout << a[ii * n + jj] << " ";
        cout << endl;
      }
      return 1; // error return
    }
    pivinv = 1.0 / a[icol * n + icol];
    a[icol * n + icol] = 1.0;
    for (l = 0; l < n; l++)
      a[icol * n + l] *= pivinv;
    b[icol] *= pivinv;
    for (ll = 0; ll < n; ll++)
      if (ll != icol)
      {
        dum = a[ll * n + icol];
        a[ll * n + icol] = 0.0;
        for (l = 0; l < n; l++)
          a[ll * n + l] -= a[icol * n + l] * dum;
        b[ll] -= b[icol] * dum;
      }
  }
-  // Step 1: Solve linear system A*x = b using LU decomposition
+  for (l = n - 1; l >= 0; l--)
-  // LAPACKE_dgesv uses column-major by default, but we use row-major
+  {
-  info = LAPACKE_dgesv(LAPACK_ROW_MAJOR, n, 1, a_copy, n, ipiv, b, 1);
+    if (indxr[l] != indxc[l])
-
+      for (k = 0; k < n; k++)
-  if (info != 0) {
+      {
-    cout << "gaussj: Singular Matrix (dgesv info=" << info << ")" << endl;
+        swap = a[k * n + indxr[l]];
-    delete[] ipiv;
+        a[k * n + indxr[l]] = a[k * n + indxc[l]];
-    delete[] a_copy;
+        a[k * n + indxc[l]] = swap;
-    return 1;
+      }
  }
  // Step 2: Compute matrix inverse A^(-1) using LU factorization
  // First do LU factorization of original matrix a
  info = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, n, n, a, n, ipiv);
  if (info != 0) {
    cout << "gaussj: Singular Matrix (dgetrf info=" << info << ")" << endl;
    delete[] ipiv;
    delete[] a_copy;
    return 1;
  }
  // Then compute inverse from LU factorization
  info = LAPACKE_dgetri(LAPACK_ROW_MAJOR, n, a, n, ipiv);
  if (info != 0) {
    cout << "gaussj: Singular Matrix (dgetri info=" << info << ")" << endl;
    delete[] ipiv;
    delete[] a_copy;
    return 1;
  }
  delete[] indxc;
  delete[] indxr;
  delete[] ipiv;
  delete[] a_copy;
  return 0;
 }
--- a/AMSS_NCKU_source/ilucg.f90
+++ b/AMSS_NCKU_source/ilucg.f90
@@ -512,10 +512,11 @@
      IMPLICIT DOUBLE PRECISION (A-H,O-Z)
      DIMENSION V(N),W(N)
 !     SUBROUTINE TO COMPUTE DOUBLE PRECISION VECTOR DOT PRODUCT.
 !     Optimized using Intel oneMKL BLAS ddot
 !     Mathematical equivalence: DGVV = sum_{i=1}^{N} V(i)*W(i)
-      DOUBLE PRECISION, EXTERNAL :: DDOT
+      SUM = 0.0D0
-      DGVV = DDOT(N, V, 1, W, 1)
+            DO 10 I = 1,N
            SUM = SUM + V(I)*W(I)
 10          CONTINUE
      DGVV = SUM
      RETURN
      END
--- a/AMSS_NCKU_source/macrodef.h
+++ b/AMSS_NCKU_source/macrodef.h
@@ -2,7 +2,7 @@
 #ifndef MICRODEF_H
 #define MICRODEF_H
-#include "macrodef.fh"
+#include "microdef.fh"
 // application parameters
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -1,32 +1,22 @@
 ## GCC version (commented out)
 ## filein  = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
 ## filein  = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
 ## LDLIBS  = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
-## Intel oneAPI version with oneMKL (Optimized for performance)
+filein  = -I/usr/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/14/ -I/usr/include/c++/14/
 filein  = -I/usr/include/ -I${MKLROOT}/include
-## Using sequential MKL (OpenMP disabled for better single-threaded performance)
+##filein  = -I/usr/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/ -I/usr/lib/cuda/include
 ## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
 LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl
-## Aggressive optimization flags:
+LDLIBS  = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/14 -lgfortran -lmpi -lgfortran -lcudart -lcuda
-## -O3: Maximum optimization
+##LDLIBS  = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -L/usr/lib/cuda/lib64 -lcudart -lmpi -lgfortran
-## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible)
+
-## -fp-model fast=2: Aggressive floating-point optimizations
+CXXAPPFLAGS  = -O3 -Wno-deprecated -Dfortran3 -Dnewc
-## -fma: Enable fused multiply-add instructions
+#f90appflags = -O3 -fpp
-## Note: OpenMP enabled for hybrid MPI+OpenMP
+f90appflags  = -O3 -x f95-cpp-input
-CXXAPPFLAGS  = -O3 -xHost -fp-model fast=2 -fma -qopenmp \
+f90          = gfortran 
-               -Dfortran3 -Dnewc -I${MKLROOT}/include
+f77          = gfortran 
-f90appflags  = -O3 -xHost -fp-model fast=2 -fma -qopenmp \
+CXX          = g++
-               -fpp -I${MKLROOT}/include
+CC           = gcc
-f90          = ifx
+CLINKER      = mpic++ 
 f77          = ifx
 CXX          = icpx
 CC           = icx
 CLINKER      = mpiicpx -qopenmp 
 Cu = nvcc
 CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
 #CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
-CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc
+# RTX 4050 uses Ada Lovelace architecture (compute capability 8.9)
 CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch=sm_89 -Dfortran3 -Dnewc
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -11,15 +11,6 @@
 import AMSS_NCKU_Input as input_data
 import subprocess
 ## CPU core binding configuration
 ## Removed hardcoded taskset to allow full utilization of 96 cores via MPI+OpenMP
 NUMACTL_CPU_BIND = ""
 ## Build parallelism configuration
 ## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores
 ## Set make -j to utilize available cores for faster builds
 BUILD_JOBS = 96
 ##################################################################
@@ -37,9 +28,9 @@ def makefile_ABE():
    ## Build command
    if (input_data.GPU_Calculation == "no"):
-        makefile_command  = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABE"
+        makefile_command  = "make -j4" + " ABE"
    elif (input_data.GPU_Calculation == "yes"):
-        makefile_command  = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU"
+        makefile_command  = "make -j4" + " ABEGPU"
    else:
        print( " CPU/GPU numerical calculation setting is wrong " )
        print(                                                    )
@@ -77,7 +68,7 @@ def makefile_TwoPunctureABE():
    print(                                                            )
    ## Build command
-    makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} TwoPunctureABE"
+    makefile_command = "make" + " TwoPunctureABE"
    ## Execute the command with subprocess.Popen and stream output
    makefile_process = subprocess.Popen(makefile_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) 
@@ -111,23 +102,13 @@ def run_ABE():
    print( " Running the AMSS-NCKU executable file ABE/ABEGPU " ) 
    print(                                                      )
    ## Calculate OMP_NUM_THREADS
    ## User has 96 cores. Calculate threads per MPI process.
    total_physical_cores = 96
    omp_num_threads = total_physical_cores // input_data.MPI_processes
    if omp_num_threads < 1:
        omp_num_threads = 1
    print( f" Configuration: {input_data.MPI_processes} MPI processes, {omp_num_threads} OpenMP threads per process." )
    print( f" Total cores utilized: {input_data.MPI_processes * omp_num_threads}" )
    ## Define the command to run; cast other values to strings as needed
    if (input_data.GPU_Calculation == "no"):
-        mpi_command         = f"{NUMACTL_CPU_BIND} mpirun -genv OMP_NUM_THREADS {omp_num_threads} -np {input_data.MPI_processes} ./ABE"
+        mpi_command         = "mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
        mpi_command_outfile = "ABE_out.log"
    elif (input_data.GPU_Calculation == "yes"):
-        mpi_command         = f"{NUMACTL_CPU_BIND} mpirun -genv OMP_NUM_THREADS {omp_num_threads} -np {input_data.MPI_processes} ./ABEGPU"
+        mpi_command         = "mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
        mpi_command_outfile = "ABEGPU_out.log"
    ## Execute the MPI command and stream output
@@ -166,7 +147,7 @@ def run_TwoPunctureABE():
    print(                                                          )
    ## Define the command to run
-    TwoPuncture_command         = NUMACTL_CPU_BIND + " ./TwoPunctureABE"
+    TwoPuncture_command         = "./TwoPunctureABE"
    TwoPuncture_command_outfile = "TwoPunctureABE_out.log"
    ## Execute the command with subprocess.Popen and stream output
Author	SHA1	Message	Date
ianchb	95575d9450	fix: try to fix segfault at 240 steps by adding WithShell guard for writecheck_sh call	2026-01-22 14:26:41 +08:00
ianchb	54600327da	fix(build): update makefile.inc for debian 13	2026-01-21 09:29:35 +08:00
CGH0S7	75be0968fc	feat: port GPU code to CUDA 13 and enable GPU computation Major changes: - Update makefile.inc for CUDA 13.1 with sm_89 architecture (RTX 4050) - Replace deprecated cudaThreadSynchronize() with cudaDeviceSynchronize() - Add CUDA_SAFE_CALL macro for CUDA 13 compatibility - Fix duplicate function definitions (compare_result_gpu, SHStep) - Fix syntax error in bssn_step_gpu.C - Enable GPU calculation in AMSS_NCKU_Input.py - Successfully build ABEGPU executable	2026-01-13 18:15:49 +00:00
CGH0S7	b27e071cde	Makefile updated for rocky10	2026-01-14 01:41:31 +08:00
CGH0S7	a1125d4c79	try to build gpu version	2026-01-13 23:52:44 +08:00
CGH0S7	dcc66588fc	gitignore updated	2026-01-13 23:45:49 +08:00
CGH0S7	950d448edf	fix(build): update LDLIBS to use -lmpi and remove hardcoded paths	2026-01-13 23:40:51 +08:00