feat: port GPU code to CUDA 13 and enable GPU computation

Major changes:
- Update makefile.inc for CUDA 13.1 with the sm_89 architecture (RTX 4050)
- Replace the deprecated cudaThreadSynchronize() with cudaDeviceSynchronize()
- Add a CUDA_SAFE_CALL macro for CUDA 13 compatibility
- Fix duplicate function definitions (compare_result_gpu, SHStep)
- Fix a syntax error in bssn_step_gpu.C
- Enable GPU calculation in AMSS_NCKU_Input.py
- Successfully build the ABEGPU executable
Makefile updated for rocky10
2026-01-13 18:15:49 +00:00 · 2026-01-14 01:41:31 +08:00 · 2026-01-13 23:52:44 +08:00 · 2026-01-13 23:45:49 +08:00 · 2026-01-13 23:40:51 +08:00
29 changed files with 779 additions and 2519 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,3 @@
 __pycache__
 GW150914
 GW150914-origin
 docs
 *.tmp
--- a/AMSS_NCKU_Input.py
+++ b/AMSS_NCKU_Input.py
@@ -16,12 +16,12 @@ import numpy
 File_directory   = "GW150914"                    ## output file directory
 Output_directory = "binary_output"               ## binary data file directory
                                                 ## The file directory name should not be too long
-MPI_processes    = 64                             ## number of mpi processes used in the simulation
+MPI_processes    = 96                             ## number of mpi processes used in the simulation
-GPU_Calculation  = "no"                          ## Use GPU or not 
+GPU_Calculation  = "yes"                         ## Use GPU or not
-                                                 ## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
+                                                 ## GPU support has been updated for CUDA 13
-CPU_Part         = 1.0
+CPU_Part         = 0.0
-GPU_Part         = 0.0
+GPU_Part         = 1.0
 #################################################
--- a/AMSS_NCKU_Program.py
+++ b/AMSS_NCKU_Program.py
@@ -8,14 +8,6 @@
 ##
 ##################################################################
 ## Guard against re-execution by multiprocessing child processes.
 ## Without this, using 'spawn' or 'forkserver' context would cause every
 ## worker to re-run the entire script, spawning exponentially more
 ## workers (fork bomb).
 if __name__ != '__main__':
    import sys as _sys
    _sys.exit(0)
 ##################################################################
@@ -432,31 +424,26 @@ print(
 import plot_xiaoqu
 import plot_GW_strain_amplitude_xiaoqu
 from parallel_plot_helper import run_plot_tasks_parallel
 plot_tasks = []
 ## Plot black hole trajectory
-plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot,   (binary_results_directory, figure_directory) ) )
+plot_xiaoqu.generate_puncture_orbit_plot(   binary_results_directory, figure_directory )
-plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot3D, (binary_results_directory, figure_directory) ) )
+plot_xiaoqu.generate_puncture_orbit_plot3D( binary_results_directory, figure_directory )
 ## Plot black hole separation vs. time
-plot_tasks.append( ( plot_xiaoqu.generate_puncture_distence_plot, (binary_results_directory, figure_directory) ) )
+plot_xiaoqu.generate_puncture_distence_plot( binary_results_directory, figure_directory )
 ## Plot gravitational waveforms (psi4 and strain amplitude)
 for i in range(input_data.Detector_Number):
-    plot_tasks.append( ( plot_xiaoqu.generate_gravitational_wave_psi4_plot, (binary_results_directory, figure_directory, i) ) )
+    plot_xiaoqu.generate_gravitational_wave_psi4_plot( binary_results_directory, figure_directory, i )
-    plot_tasks.append( ( plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot, (binary_results_directory, figure_directory, i) ) )
+    plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot( binary_results_directory, figure_directory, i )
 ## Plot ADM mass evolution
 for i in range(input_data.Detector_Number):
-    plot_tasks.append( ( plot_xiaoqu.generate_ADMmass_plot, (binary_results_directory, figure_directory, i) ) )
+    plot_xiaoqu.generate_ADMmass_plot( binary_results_directory, figure_directory, i )
 ## Plot Hamiltonian constraint violation over time
 for i in range(input_data.grid_level):
-    plot_tasks.append( ( plot_xiaoqu.generate_constraint_check_plot, (binary_results_directory, figure_directory, i) ) )
+    plot_xiaoqu.generate_constraint_check_plot( binary_results_directory, figure_directory, i )
 run_plot_tasks_parallel(plot_tasks)
 ## Plot stored binary data
 plot_xiaoqu.generate_binary_data_plot( binary_results_directory, figure_directory )
--- a/AMSS_NCKU_Verify_ASC26.py
+++ b/AMSS_NCKU_Verify_ASC26.py
@@ -1,279 +0,0 @@
 #!/usr/bin/env python3
 """
 AMSS-NCKU GW150914 Simulation Regression Test Script
 Verification Requirements:
 1. XY-plane trajectory RMS error < 1% (Optimized vs. baseline, max of BH1 and BH2)
 2. ADM constraint violation < 2 (Grid Level 0)
 RMS Calculation Method:
 - Computes trajectory deviation on the XY plane independently for BH1 and BH2
 - For each black hole: RMS = sqrt((1/M) * sum((Δr_i / r_i^max)^2)) × 100%
 - Final RMS = max(RMS_BH1, RMS_BH2)
 Usage: python3 AMSS_NCKU_Verify_ASC26.py [output_dir]
 Default: output_dir = GW150914/AMSS_NCKU_output
 Reference: GW150914-origin (baseline simulation)
 """
 import numpy as np
 import sys
 import os
 # ANSI Color Codes
 class Color:
    """ANSI escape sequences used to colorize and embolden the terminal report."""
    # Bright foreground colors (SGR codes 91-94)
    GREEN = '\033[92m'
    RED = '\033[91m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    # Bold text attribute (SGR code 1)
    BOLD = '\033[1m'
    # Reset all attributes (SGR code 0); appended after every colored span
    RESET = '\033[0m'
 def get_status_text(passed):
    """Return a bold colored label: green "PASS" if *passed* is truthy, red "FAIL" otherwise."""
    if not passed:
        return f"{Color.RED}{Color.BOLD}FAIL{Color.RESET}"
    return f"{Color.GREEN}{Color.BOLD}PASS{Color.RESET}"
 def load_bh_trajectory(filepath):
    """Load the trajectories of both black holes from a numeric text file.

    The file is parsed with ``numpy.loadtxt``. Columns 0-6 are read as the
    time followed by the (x, y, z) position of BH1 and of BH2.

    Args:
        filepath: Path of the trajectory file.

    Returns:
        dict mapping 'time', 'x1', 'y1', 'z1', 'x2', 'y2', 'z2' to 1-D arrays
        holding one value per row of the file.
    """
    # ndmin=2 keeps a file with a single data row two-dimensional; without it
    # loadtxt returns a 1-D array and the column slicing raises IndexError.
    data = np.loadtxt(filepath, ndmin=2)
    columns = ('time', 'x1', 'y1', 'z1', 'x2', 'y2', 'z2')
    return {name: data[:, index] for index, name in enumerate(columns)}
 def load_constraint_data(filepath):
    """Load constraint violation rows from a text file.

    Blank lines, comment lines (first non-blank character is '#') and lines
    with fewer than 8 whitespace-separated fields are skipped. Only the first
    8 fields of each remaining line are converted to float.

    Args:
        filepath: Path of the constraint data file.

    Returns:
        numpy array with one 8-column row per accepted line (an empty 1-D
        array when no line is accepted).

    Raises:
        ValueError: If one of the first 8 fields of a data line is not numeric.
    """
    rows = []
    with open(filepath, 'r') as f:
        for line in f:
            fields = line.split()
            # split() drops leading blanks, so checking the first field also
            # recognizes indented comment lines (previously these reached
            # float() and raised ValueError when they had 8+ tokens).
            if not fields or fields[0].startswith('#'):
                continue
            if len(fields) >= 8:
                rows.append([float(value) for value in fields[:8]])
    return np.array(rows)
 def calculate_rms_error(bh_data_ref, bh_data_target):
    """
    Compare two puncture trajectories on the XY plane and return a relative RMS error.

    Both data sets are truncated to their common length M, so samples are
    paired by index. For each black hole:
        RMS = sqrt( mean( (dr_i / r_i^max)^2 ) ) * 100
    where dr_i is the XY distance between the reference and target positions
    and r_i^max is the larger of their two XY distances from the origin.
    Samples with r_i^max <= 1e-15 are left out of the mean.

    Args:
        bh_data_ref: Reference (baseline) trajectory dict with 'time', 'x1', 'y1', 'x2', 'y2'
        bh_data_target: Target (optimized) trajectory dict with the same keys

    Returns:
        (rms, None) where rms is max(RMS_BH1, RMS_BH2) in percent, or
        (None, message) when fewer than 10 samples are available overall
        or fewer than 10 usable samples remain for one black hole.
    """
    n_common = min(len(bh_data_ref['time']), len(bh_data_target['time']))
    if n_common < 10:
        return None, "Insufficient data points for comparison"

    def relative_rms(x_key, y_key):
        """Percent RMS deviation for one black hole, or None if too few usable samples."""
        x_ref = bh_data_ref[x_key][:n_common]
        y_ref = bh_data_ref[y_key][:n_common]
        x_new = bh_data_target[x_key][:n_common]
        y_new = bh_data_target[y_key][:n_common]

        deviation = np.sqrt((x_ref - x_new)**2 + (y_ref - y_new)**2)
        radius = np.maximum(np.sqrt(x_ref**2 + y_ref**2),
                            np.sqrt(x_new**2 + y_new**2))

        # Drop samples at (numerically) zero radius to avoid dividing by zero
        usable = radius > 1e-15
        if np.sum(usable) < 10:
            return None
        return np.sqrt(np.mean((deviation[usable] / radius[usable])**2)) * 100

    rms_bh1 = relative_rms('x1', 'y1')
    if rms_bh1 is None:
        return None, "Insufficient valid data points for BH1"

    rms_bh2 = relative_rms('x2', 'y2')
    if rms_bh2 is None:
        return None, "Insufficient valid data points for BH2"

    # The reported metric is the worse of the two black holes
    return max(rms_bh1, rms_bh2), None
 def analyze_constraint_violation(constraint_data, n_levels=9):
    """
    Summarize the constraint violation on Grid Level 0.

    Every n_levels-th row (starting with row 0) is taken as the Grid Level 0
    entry of a time step. Columns 1-7 of those rows are the Ham, Px, Py, Pz,
    Gx, Gy and Gz values.

    Returns a dict with the maximum absolute value of each of the seven
    quantities, plus 'max_violation', the largest of those seven maxima.
    """
    level0_rows = constraint_data[::n_levels]
    labels = ('Ham', 'Px', 'Py', 'Pz', 'Gx', 'Gy', 'Gz')
    results = {
        label: np.max(np.abs(level0_rows[:, column]))
        for column, label in enumerate(labels, start=1)
    }
    results['max_violation'] = max(results.values())
    return results
 def print_header():
    """Print the banner that opens the regression test report."""
    rule = Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET
    title = Color.BOLD + "   AMSS-NCKU GW150914 Simulation Regression Test Report" + Color.RESET
    print("\n" + rule)
    print(title)
    print(rule)
 def print_rms_results(rms_rel, error, threshold=1.0):
    """Print the RMS section of the report and return whether the check passed.

    When *error* is truthy only the error message is printed and False is
    returned; otherwise the check passes if rms_rel < threshold.
    """
    print(f"\n{Color.BOLD}1. RMS Error Analysis (Baseline vs Optimized){Color.RESET}")
    print("-" * 45)

    if not error:
        passed = rms_rel < threshold
        report_lines = (
            f"   RMS relative error: {rms_rel:.4f}%",
            f"   Requirement:        < {threshold}%",
            f"   Status:             {get_status_text(passed)}",
        )
        for report_line in report_lines:
            print(report_line)
        return passed

    print(f"   {Color.RED}Error: {error}{Color.RESET}")
    return False
 def print_constraint_results(results, threshold=2.0):
    """Print the constraint section of the report and return whether the check passed.

    The seven per-quantity maxima are printed two per line; the check passes
    if results['max_violation'] < threshold.
    """
    print(f"\n{Color.BOLD}2. ADM Constraint Violation Analysis (Grid Level 0){Color.RESET}")
    print("-" * 45)

    names = ['Ham', 'Px', 'Py', 'Pz', 'Gx', 'Gy', 'Gz']
    for position, name in enumerate(names, start=1):
        print(f"   Max |{name:3}|: {results[name]:.6f}", end="   ")
        # Break the line after every second entry
        if position % 2 == 0:
            print()
    # An odd number of entries leaves the last line unterminated
    if len(names) % 2 != 0:
        print()

    passed = results['max_violation'] < threshold
    summary_lines = (
        f"\n   Maximum violation:  {results['max_violation']:.6f}",
        f"   Requirement:        < {threshold}",
        f"   Status:             {get_status_text(passed)}",
    )
    for summary_line in summary_lines:
        print(summary_line)
    return passed
 def print_summary(rms_passed, constraint_passed):
    """Print the closing summary and return `rms_passed and constraint_passed`."""
    rule = Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET
    print("\n" + rule)
    print(Color.BOLD + "Verification Summary" + Color.RESET)
    print(rule)

    res_rms = get_status_text(rms_passed)
    res_con = get_status_text(constraint_passed)
    print(f"   [1] RMS trajectory check:         {res_rms}")
    print(f"   [2] ADM constraint check:         {res_con}")

    all_passed = rms_passed and constraint_passed
    if all_passed:
        final_status = f"{Color.GREEN}{Color.BOLD}ALL CHECKS PASSED{Color.RESET}"
    else:
        final_status = f"{Color.RED}{Color.BOLD}SOME CHECKS FAILED{Color.RESET}"
    print(f"\n   Overall result: {final_status}")
    print(rule + "\n")
    return all_passed
 def main():
    """Run both regression checks and exit with status 0 if they pass, 1 otherwise.

    The target output directory is the first command-line argument, or
    GW150914/AMSS_NCKU_output next to this script. The baseline is always
    GW150914-origin/AMSS_NCKU_output next to this script. The process exits
    with status 1 before printing the report if a required file is missing.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))

    # Target (optimized) and reference (baseline) directories
    target_dir = sys.argv[1] if len(sys.argv) > 1 else os.path.join(script_dir, "GW150914/AMSS_NCKU_output")
    reference_dir = os.path.join(script_dir, "GW150914-origin/AMSS_NCKU_output")

    # Input files
    bh_file_ref = os.path.join(reference_dir, "bssn_BH.dat")
    bh_file_target = os.path.join(target_dir, "bssn_BH.dat")
    constraint_file = os.path.join(target_dir, "bssn_constraint.dat")

    # Each required file paired with the message shown when it is missing
    required_files = (
        (bh_file_ref, f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Baseline trajectory file not found: {bh_file_ref}"),
        (bh_file_target, f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Target trajectory file not found: {bh_file_target}"),
        (constraint_file, f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Constraint data file not found: {constraint_file}"),
    )
    for required_path, missing_message in required_files:
        if not os.path.exists(required_path):
            print(missing_message)
            sys.exit(1)

    print_header()
    print(f"\n{Color.BOLD}Reference (Baseline):{Color.RESET} {Color.BLUE}{reference_dir}{Color.RESET}")
    print(f"{Color.BOLD}Target (Optimized):  {Color.RESET} {Color.BLUE}{target_dir}{Color.RESET}")

    bh_data_ref = load_bh_trajectory(bh_file_ref)
    bh_data_target = load_bh_trajectory(bh_file_target)
    constraint_data = load_constraint_data(constraint_file)

    # Check 1: trajectory RMS error
    rms_rel, error = calculate_rms_error(bh_data_ref, bh_data_target)
    rms_passed = print_rms_results(rms_rel, error)

    # Check 2: constraint violation
    constraint_passed = print_constraint_results(analyze_constraint_violation(constraint_data))

    # The exit status mirrors the overall result
    sys.exit(0 if print_summary(rms_passed, constraint_passed) else 1)
 if __name__ == "__main__":
    main()
--- a/AMSS_NCKU_source/FFT.f90
+++ b/AMSS_NCKU_source/FFT.f90
@@ -37,51 +37,57 @@ close(77)
 end program checkFFT
 #endif
 !-------------
 ! Optimized FFT using Intel oneMKL DFTI
 ! Mathematical equivalence: Standard DFT definition
 !   Forward (isign=1):  X[k] = sum_{n=0}^{N-1} x[n] * exp(-2*pi*i*k*n/N)
 !   Backward (isign=-1): X[k] = sum_{n=0}^{N-1} x[n] * exp(+2*pi*i*k*n/N)
 ! Input/Output: dataa is interleaved complex array [Re(0),Im(0),Re(1),Im(1),...]
 !-------------
 SUBROUTINE four1(dataa,nn,isign)
 use MKL_DFTI
 implicit none
-INTEGER, intent(in) :: isign, nn
+INTEGER::isign,nn
-DOUBLE PRECISION, dimension(2*nn), intent(inout) :: dataa
+double precision,dimension(2*nn)::dataa
-
+INTEGER::i,istep,j,m,mmax,n
-type(DFTI_DESCRIPTOR), pointer :: desc
+double precision::tempi,tempr
-integer :: status
+DOUBLE PRECISION::theta,wi,wpi,wpr,wr,wtemp
-
+n=2*nn
-! Create DFTI descriptor for 1D complex-to-complex transform
+j=1
-status = DftiCreateDescriptor(desc, DFTI_DOUBLE, DFTI_COMPLEX, 1, nn)
+do i=1,n,2
-if (status /= 0) return
+  if(j.gt.i)then
-
+     tempr=dataa(j)
-! Set input/output storage as interleaved complex (default)
+     tempi=dataa(j+1)
-status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_INPLACE)
+     dataa(j)=dataa(i)
-if (status /= 0) then
+     dataa(j+1)=dataa(i+1)
-   status = DftiFreeDescriptor(desc)
+     dataa(i)=tempr
-   return
+     dataa(i+1)=tempi
  endif
-
+  m=nn
-! Commit the descriptor
+1 if ((m.ge.2).and.(j.gt.m)) then
-status = DftiCommitDescriptor(desc)
+  j=j-m
-if (status /= 0) then
+  m=m/2
-   status = DftiFreeDescriptor(desc)
+goto 1
   return
  endif
-
+j=j+m
-! Execute FFT based on direction
+enddo
-if (isign == 1) then
+mmax=2
-   ! Forward FFT: exp(-2*pi*i*k*n/N)
+2  if (n.gt.mmax) then
-   status = DftiComputeForward(desc, dataa)
+     istep=2*mmax
-else
+     theta=6.28318530717959d0/(isign*mmax)
-   ! Backward FFT: exp(+2*pi*i*k*n/N)
+     wpr=-2.d0*sin(0.5d0*theta)**2
-   status = DftiComputeBackward(desc, dataa)
+     wpi=sin(theta)
     wr=1.d0
     wi=0.d0
     do m=1,mmax,2
       do i=m,n,istep
         j=i+mmax
         tempr=sngl(wr)*dataa(j)-sngl(wi)*dataa(j+1)
         tempi=sngl(wr)*dataa(j+1)+sngl(wi)*dataa(j)
         dataa(j)=dataa(i)-tempr
         dataa(j+1)=dataa(i+1)-tempi
         dataa(i)=dataa(i)+tempr
         dataa(i+1)=dataa(i+1)+tempi
       enddo
          wtemp=wr
          wr=wr*wpr-wi*wpi+wr
          wi=wi*wpr+wtemp*wpi+wi
     enddo
 mmax=istep
 goto 2
 endif
 ! Free descriptor
 status = DftiFreeDescriptor(desc)
 return
 END SUBROUTINE four1
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -3756,358 +3756,6 @@ void Parallel::Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
  delete[] transfer_src;
  delete[] transfer_dst;
 }
 //
 // Async Sync: split into SyncBegin (initiate MPI) and SyncEnd (wait + unpack)
 // This allows overlapping MPI communication with computation.
 //
 // Start the data exchange described by *ts without waiting for completion.
 // Allocates the request/status arrays and the per-rank send/receive buffers
 // in *ts, performs the rank-local copy immediately, and posts one MPI_Isend
 // and/or MPI_Irecv per remote rank that has data to exchange.
 // transfer_end() must be called afterwards to wait, unpack and free.
 static void transfer_begin(Parallel::TransferState *ts)
 {
  int myrank;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  int cpusize = ts->cpusize;
  // At most one send and one receive request per rank
  ts->reqs = new MPI_Request[2 * cpusize];
  ts->stats = new MPI_Status[2 * cpusize];
  ts->req_no = 0;
  ts->send_data = new double *[cpusize];
  ts->rec_data = new double *[cpusize];
  int length;
  for (int node = 0; node < cpusize; node++)
  {
    // Null entries mark "nothing to send/receive" for transfer_end
    ts->send_data[node] = ts->rec_data[node] = 0;
    if (node == myrank)
    {
      // Local copy: pack then immediately unpack (no MPI needed)
      // A data_packer call with a null buffer is used as a size query:
      // its return value is the number of doubles to allocate.
      if ((length = Parallel::data_packer(0, ts->transfer_src[myrank], ts->transfer_dst[myrank],
                                          node, PACK, ts->VarList1, ts->VarList2, ts->Symmetry)))
      {
        double *local_data = new double[length];
        // NOTE(review): a standard throwing `new` never returns null, so this
        // branch is presumably unreachable - confirm the build's allocator.
        if (!local_data)
        {
          cout << "out of memory in transfer_begin, local copy" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        Parallel::data_packer(local_data, ts->transfer_src[myrank], ts->transfer_dst[myrank],
                              node, PACK, ts->VarList1, ts->VarList2, ts->Symmetry);
        Parallel::data_packer(local_data, ts->transfer_src[node], ts->transfer_dst[node],
                              node, UNPACK, ts->VarList1, ts->VarList2, ts->Symmetry);
        delete[] local_data;
      }
    }
    else
    {
      // send from this cpu to cpu#node
      if ((length = Parallel::data_packer(0, ts->transfer_src[myrank], ts->transfer_dst[myrank],
                                          node, PACK, ts->VarList1, ts->VarList2, ts->Symmetry)))
      {
        // The buffer must stay alive until the request completes; it is
        // released in transfer_end.
        ts->send_data[node] = new double[length];
        if (!ts->send_data[node])
        {
          cout << "out of memory in transfer_begin, send" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        Parallel::data_packer(ts->send_data[node], ts->transfer_src[myrank], ts->transfer_dst[myrank],
                              node, PACK, ts->VarList1, ts->VarList2, ts->Symmetry);
        MPI_Isend((void *)ts->send_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD,
                  ts->reqs + ts->req_no++);
      }
      // receive from cpu#node to this cpu
      if ((length = Parallel::data_packer(0, ts->transfer_src[node], ts->transfer_dst[node],
                                          node, UNPACK, ts->VarList1, ts->VarList2, ts->Symmetry)))
      {
        // Unpacking of this buffer is deferred to transfer_end
        ts->rec_data[node] = new double[length];
        if (!ts->rec_data[node])
        {
          cout << "out of memory in transfer_begin, recv" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        MPI_Irecv((void *)ts->rec_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD,
                  ts->reqs + ts->req_no++);
      }
    }
  }
  // NOTE: MPI_Waitall is NOT called here - that happens in transfer_end
 }
 //
 // Finish an exchange started by transfer_begin: wait for all posted
 // requests, unpack what was received, then free every buffer that
 // transfer_begin allocated in *ts.
 static void transfer_end(Parallel::TransferState *ts)
 {
  // Block until every posted send/receive has completed
  MPI_Waitall(ts->req_no, ts->reqs, ts->stats);

  const int nranks = ts->cpusize;

  // Copy the data received from each remote rank into place
  for (int rank = 0; rank < nranks; ++rank)
  {
    double *incoming = ts->rec_data[rank];
    if (incoming)
      Parallel::data_packer(incoming, ts->transfer_src[rank], ts->transfer_dst[rank],
                            rank, UNPACK, ts->VarList1, ts->VarList2, ts->Symmetry);
  }

  // Release the per-rank buffers; delete[] on a null pointer is a no-op
  for (int rank = 0; rank < nranks; ++rank)
  {
    delete[] ts->send_data[rank];
    delete[] ts->rec_data[rank];
  }

  delete[] ts->reqs;
  delete[] ts->stats;
  delete[] ts->send_data;
  delete[] ts->rec_data;
 }
 //
 // Start an asynchronous ghost-zone sync of VarList on a single patch.
 // Returns a handle holding one transfer state that owns its grid segment
 // lists; pass it to SyncEnd to complete the sync and release everything.
 Parallel::SyncHandle *Parallel::SyncBegin(Patch *Pat, MyList<var> *VarList, int Symmetry)
 {
  int nranks;
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);

  SyncHandle *result = new SyncHandle;
  result->num_states = 1;
  result->states = new TransferState[1];

  TransferState &st = result->states[0];
  st.cpusize = nranks;
  st.VarList1 = VarList;
  st.VarList2 = VarList;
  st.Symmetry = Symmetry;
  st.owns_gsl = true;

  // Destination: ghost segments of the patch; sources: segments owned by each rank
  st.dst = build_ghost_gsl(Pat);
  st.src = new MyList<Parallel::gridseg> *[nranks];
  st.transfer_src = new MyList<Parallel::gridseg> *[nranks];
  st.transfer_dst = new MyList<Parallel::gridseg> *[nranks];
  for (int rank = 0; rank < nranks; ++rank)
  {
    st.src[rank] = build_owned_gsl0(Pat, rank);
    build_gstl(st.src[rank], st.dst, &st.transfer_src[rank], &st.transfer_dst[rank]);
  }

  transfer_begin(&st);
  return result;
 }
 //
 // Start an asynchronous sync of VarList on a list of patches.
 // The handle holds one transfer state per patch (ghost zones inside the
 // patch) followed by one state for the buffer-zone exchange between
 // patches. All states own their grid segment lists; SyncEnd releases them.
 Parallel::SyncHandle *Parallel::SyncBegin(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
 {
  int nranks;
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);

  int patch_count = 0;
  for (MyList<Patch> *p = PatL; p; p = p->next)
    ++patch_count;

  SyncHandle *result = new SyncHandle;
  result->num_states = patch_count + 1; // one per patch + one inter-patch
  result->states = new TransferState[result->num_states];

  // Intra-patch transfers, started one patch at a time
  int slot = 0;
  for (MyList<Patch> *p = PatL; p; p = p->next, ++slot)
  {
    TransferState &st = result->states[slot];
    st.cpusize = nranks;
    st.VarList1 = VarList;
    st.VarList2 = VarList;
    st.Symmetry = Symmetry;
    st.owns_gsl = true;

    st.dst = build_ghost_gsl(p->data);
    st.src = new MyList<Parallel::gridseg> *[nranks];
    st.transfer_src = new MyList<Parallel::gridseg> *[nranks];
    st.transfer_dst = new MyList<Parallel::gridseg> *[nranks];
    for (int rank = 0; rank < nranks; ++rank)
    {
      st.src[rank] = build_owned_gsl0(p->data, rank);
      build_gstl(st.src[rank], st.dst, &st.transfer_src[rank], &st.transfer_dst[rank]);
    }
    transfer_begin(&st);
  }

  // Inter-patch transfer; it occupies the last slot of the handle
  TransferState &inter = result->states[slot];
  inter.cpusize = nranks;
  inter.VarList1 = VarList;
  inter.VarList2 = VarList;
  inter.Symmetry = Symmetry;
  inter.owns_gsl = true;

  inter.dst = build_buffer_gsl(PatL);
  inter.src = new MyList<Parallel::gridseg> *[nranks];
  inter.transfer_src = new MyList<Parallel::gridseg> *[nranks];
  inter.transfer_dst = new MyList<Parallel::gridseg> *[nranks];
  for (int rank = 0; rank < nranks; ++rank)
  {
    inter.src[rank] = build_owned_gsl(PatL, rank, 5, Symmetry);
    build_gstl(inter.src[rank], inter.dst, &inter.transfer_src[rank], &inter.transfer_dst[rank]);
  }
  transfer_begin(&inter);

  return result;
 }
 //
 // Complete a sync started by SyncBegin / SyncBeginWithPlan and free the
 // handle. A null handle is ignored. Grid segment lists are destroyed only
 // for states that own them (owns_gsl); borrowed lists are left untouched.
 void Parallel::SyncEnd(SyncHandle *handle)
 {
  if (handle == 0)
    return;

  for (int s = 0; s < handle->num_states; ++s)
  {
    TransferState &st = handle->states[s];

    // Wait for the pending requests, unpack, and free the MPI buffers
    transfer_end(&st);

    if (!st.owns_gsl)
      continue;

    if (st.dst)
      st.dst->destroyList();
    for (int rank = 0; rank < st.cpusize; ++rank)
    {
      if (st.src[rank])
        st.src[rank]->destroyList();
      if (st.transfer_src[rank])
        st.transfer_src[rank]->destroyList();
      if (st.transfer_dst[rank])
        st.transfer_dst[rank]->destroyList();
    }
    delete[] st.src;
    delete[] st.transfer_src;
    delete[] st.transfer_dst;
  }

  delete[] handle->states;
  delete handle;
 }
 //
 // SyncPreparePlan: Pre-build grid segment lists for a patch list.
 // The plan can be reused across multiple SyncBeginWithPlan calls
 // as long as the mesh topology does not change (no regridding).
 //
 // Build the grid segment lists for a patch list without starting any
 // communication. The plan has one entry per patch (ghost zones inside the
 // patch) followed by one entry for the buffer zones between patches.
 // Release it with SyncFreePlan.
 Parallel::SyncPlan *Parallel::SyncPreparePlan(MyList<Patch> *PatL, int Symmetry)
 {
  int nranks;
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);

  int patch_count = 0;
  for (MyList<Patch> *p = PatL; p; p = p->next)
    ++patch_count;

  SyncPlan *result = new SyncPlan;
  result->num_entries = patch_count + 1; // one per patch + one inter-patch
  result->Symmetry = Symmetry;
  result->entries = new SyncPlanEntry[result->num_entries];

  // Intra-patch entries
  int slot = 0;
  for (MyList<Patch> *p = PatL; p; p = p->next, ++slot)
  {
    SyncPlanEntry &entry = result->entries[slot];
    entry.cpusize = nranks;
    entry.dst = build_ghost_gsl(p->data);
    entry.src = new MyList<Parallel::gridseg> *[nranks];
    entry.transfer_src = new MyList<Parallel::gridseg> *[nranks];
    entry.transfer_dst = new MyList<Parallel::gridseg> *[nranks];
    for (int rank = 0; rank < nranks; ++rank)
    {
      entry.src[rank] = build_owned_gsl0(p->data, rank);
      build_gstl(entry.src[rank], entry.dst, &entry.transfer_src[rank], &entry.transfer_dst[rank]);
    }
  }

  // Inter-patch entry; it occupies the last slot of the plan
  SyncPlanEntry &inter = result->entries[slot];
  inter.cpusize = nranks;
  inter.dst = build_buffer_gsl(PatL);
  inter.src = new MyList<Parallel::gridseg> *[nranks];
  inter.transfer_src = new MyList<Parallel::gridseg> *[nranks];
  inter.transfer_dst = new MyList<Parallel::gridseg> *[nranks];
  for (int rank = 0; rank < nranks; ++rank)
  {
    inter.src[rank] = build_owned_gsl(PatL, rank, 5, Symmetry);
    build_gstl(inter.src[rank], inter.dst, &inter.transfer_src[rank], &inter.transfer_dst[rank]);
  }

  return result;
 }
 //
 // Destroy a plan created by SyncPreparePlan, including every grid segment
 // list it holds. A null plan is ignored.
 void Parallel::SyncFreePlan(SyncPlan *plan)
 {
  if (plan == 0)
    return;

  for (int e = 0; e < plan->num_entries; ++e)
  {
    SyncPlanEntry &entry = plan->entries[e];

    if (entry.dst)
      entry.dst->destroyList();
    for (int rank = 0; rank < entry.cpusize; ++rank)
    {
      if (entry.src[rank])
        entry.src[rank]->destroyList();
      if (entry.transfer_src[rank])
        entry.transfer_src[rank]->destroyList();
      if (entry.transfer_dst[rank])
        entry.transfer_dst[rank]->destroyList();
    }
    delete[] entry.src;
    delete[] entry.transfer_src;
    delete[] entry.transfer_dst;
  }

  delete[] plan->entries;
  delete plan;
 }
 //
 // SyncBeginWithPlan: Use pre-built GSLs from a SyncPlan to initiate async transfer.
 // This avoids the O(cpusize * blocks^2) cost of rebuilding GSLs on every call.
 //
 // Convenience overload: forwards to the two-list overload with the same
 // variable list passed as both VarList1 and VarList2.
 Parallel::SyncHandle *Parallel::SyncBeginWithPlan(SyncPlan *plan, MyList<var> *VarList)
 {
  return SyncBeginWithPlan(plan, VarList, VarList);
 }
 //
 // Start an asynchronous sync using the grid segment lists stored in *plan.
 // One transfer state is created per plan entry. The states only borrow the
 // plan's lists (owns_gsl is false), so SyncEnd does not destroy them and
 // the plan must outlive the returned handle.
 Parallel::SyncHandle *Parallel::SyncBeginWithPlan(SyncPlan *plan, MyList<var> *VarList1, MyList<var> *VarList2)
 {
  SyncHandle *result = new SyncHandle;
  result->num_states = plan->num_entries;
  result->states = new TransferState[result->num_states];

  for (int e = 0; e < plan->num_entries; ++e)
  {
    const SyncPlanEntry &entry = plan->entries[e];
    TransferState &st = result->states[e];

    st.cpusize = entry.cpusize;
    st.VarList1 = VarList1;
    st.VarList2 = VarList2;
    st.Symmetry = plan->Symmetry;

    // Borrowed pointers: the plan remains the owner of these lists
    st.owns_gsl = false;
    st.transfer_src = entry.transfer_src;
    st.transfer_dst = entry.transfer_dst;
    st.src = entry.src;
    st.dst = entry.dst;

    transfer_begin(&st);
  }

  return result;
 }
 // collect buffer grid segments or blocks for the periodic boundary condition of given patch
 // ---------------------------------------------------
 // |con |                                       |con |
--- a/AMSS_NCKU_source/Parallel.h
+++ b/AMSS_NCKU_source/Parallel.h
@@ -81,53 +81,6 @@ namespace Parallel
                   int Symmetry);
  void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry);
  void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
  // Async Sync: overlap MPI communication with computation
  // State of one in-flight exchange between transfer_begin and transfer_end.
  struct TransferState
  {
    // Request and status arrays, allocated with 2 * cpusize slots in transfer_begin
    MPI_Request *reqs;
    MPI_Status *stats;
    // Number of requests actually posted (count passed to MPI_Waitall)
    int req_no;
    // Per-rank send / receive buffers; a null entry means nothing is exchanged with that rank
    double **send_data;
    double **rec_data;
    // Number of ranks; length of all per-rank arrays in this struct
    int cpusize;
    // Per-rank source / destination segment lists handed to data_packer
    MyList<gridseg> **transfer_src;
    MyList<gridseg> **transfer_dst;
    // Per-rank owned segments and the destination segments the transfer lists were built from
    MyList<gridseg> **src;
    MyList<gridseg> *dst;
    // Variable lists handed to data_packer when packing and unpacking
    MyList<var> *VarList1;
    MyList<var> *VarList2;
    // Symmetry flag forwarded to data_packer
    int Symmetry;
    bool owns_gsl; // true if this state owns and should free the GSLs
  };
  // Handle returned by SyncBegin / SyncBeginWithPlan and consumed by SyncEnd.
  struct SyncHandle
  {
    // Array of num_states transfer states, allocated with new[]
    TransferState *states;
    int num_states;
  };
  SyncHandle *SyncBegin(Patch *Pat, MyList<var> *VarList, int Symmetry);
  SyncHandle *SyncBegin(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
  void SyncEnd(SyncHandle *handle);
  // Cached GSL plan: pre-build grid segment lists once, reuse across multiple Sync calls
  // Grid segment lists for one transfer of a SyncPlan (built by SyncPreparePlan).
  struct SyncPlanEntry
  {
    // Number of ranks; length of the per-rank arrays below
    int cpusize;
    // Per-rank source / destination segment lists produced by build_gstl
    MyList<gridseg> **transfer_src;
    MyList<gridseg> **transfer_dst;
    // Per-rank owned segments and the destination segments the lists above were built from
    MyList<gridseg> **src;
    MyList<gridseg> *dst;
  };
  // Reusable set of plan entries for a patch list; freed with SyncFreePlan.
  struct SyncPlan
  {
    // Array of num_entries entries: one per patch followed by one inter-patch entry
    SyncPlanEntry *entries;
    int num_entries;
    // Symmetry value the plan was built with; copied into each TransferState
    int Symmetry;
  };
  SyncPlan *SyncPreparePlan(MyList<Patch> *PatL, int Symmetry);
  void SyncFreePlan(SyncPlan *plan);
  SyncHandle *SyncBeginWithPlan(SyncPlan *plan, MyList<var> *VarList);
  SyncHandle *SyncBeginWithPlan(SyncPlan *plan, MyList<var> *VarList1, MyList<var> *VarList2);
  void OutBdLow2Hi(Patch *Patc, Patch *Patf,
                   MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
                   int Symmetry);
--- a/AMSS_NCKU_source/TwoPunctures.C
+++ b/AMSS_NCKU_source/TwoPunctures.C
--- a/AMSS_NCKU_source/TwoPunctures.h
+++ b/AMSS_NCKU_source/TwoPunctures.h
@@ -1,8 +1,7 @@
 #ifndef TWO_PUNCTURES_H
 #define TWO_PUNCTURES_H
 #include <omp.h>
 #define StencilSize 19
 #define N_PlaneRelax 1
 #define NRELAX 200
@@ -43,18 +42,6 @@ private:
       int ntotal;
       // ===== Precomputed spectral derivative matrices =====
       double *D1_A, *D2_A;
       double *D1_B, *D2_B;
       double *DF1_phi, *DF2_phi;
       // ===== Pre-allocated workspace for LineRelax (per-thread) =====
       int max_threads;
       double **ws_diag_be, **ws_e_be, **ws_f_be, **ws_b_be, **ws_x_be;
       double **ws_l_be, **ws_u_be, **ws_d_be, **ws_y_be;
       double **ws_diag_al, **ws_e_al, **ws_f_al, **ws_b_al, **ws_x_al;
       double **ws_l_al, **ws_u_al, **ws_d_al, **ws_y_al;
       struct parameters
       {
              int nvar, n1, n2, n3;
@@ -71,28 +58,6 @@ public:
                    int Newtonmaxit);
       ~TwoPunctures();
       // 02/07: New/modified methods
       void allocate_workspace();
       void free_workspace();
       void precompute_derivative_matrices();
       void build_cheb_deriv_matrices(int n, double *D1, double *D2);
       void build_fourier_deriv_matrices(int N, double *DF1, double *DF2);
       void Derivatives_AB3_MatMul(int nvar, int n1, int n2, int n3, derivs v);
       void ThomasAlgorithm_ws(int N, double *b, double *a, double *c, double *x, double *q,
                                double *l, double *u_ws, double *d, double *y);
       void LineRelax_be_omp(double *dv,
                         int const i, int const k, int const nvar,
                         int const n1, int const n2, int const n3,
                         double const *rhs, int const *ncols, int **cols,
                         double **JFD, int tid);
       void LineRelax_al_omp(double *dv,
                         int const j, int const k, int const nvar,
                         int const n1, int const n2, int const n3,
                         double const *rhs, int const *ncols,
                         int **cols, double **JFD, int tid);
       void relax_omp(double *dv, int const nvar, int const n1, int const n2, int const n3,
                  double const *rhs, int const *ncols, int **cols, double **JFD);
       void Solve();
       void set_initial_guess(derivs v);
       int index(int i, int j, int k, int l, int a, int b, int c, int d);
@@ -151,11 +116,23 @@ public:
       double BY_KKofxyz(double x, double y, double z);
       void SetMatrix_JFD(int nvar, int n1, int n2, int n3, derivs u, int *ncols, int **cols, double **Matrix);
       void J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, double *Jdv, derivs u);
       void relax(double *dv, int const nvar, int const n1, int const n2, int const n3,
                  double const *rhs, int const *ncols, int **cols, double **JFD);
       void LineRelax_be(double *dv,
                         int const i, int const k, int const nvar,
                         int const n1, int const n2, int const n3,
                         double const *rhs, int const *ncols, int **cols,
                         double **JFD);
       void JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2,
                         int n3, derivs dv, derivs u, double *values);
       void LinEquations(double A, double B, double X, double R,
                         double x, double r, double phi,
                         double y, double z, derivs dU, derivs U, double *values);
       void LineRelax_al(double *dv,
                         int const j, int const k, int const nvar,
                         int const n1, int const n2, int const n3,
                         double const *rhs, int const *ncols,
                         int **cols, double **JFD);
       void ThomasAlgorithm(int N, double *b, double *a, double *c, double *x, double *q);
       void Save(char *fname);
       // provided by Vasileios Paschalidis (vpaschal@illinois.edu)
--- a/AMSS_NCKU_source/Z4c_class.C
+++ b/AMSS_NCKU_source/Z4c_class.C
@@ -186,12 +186,6 @@ void Z4c_class::Step(int lev, int YN)
  int ERROR = 0;
  MyList<ss_patch> *sPp;
  // Pre-build grid segment lists once for this level's patches.
  // These are reused across predictor + 3 corrector SyncBegin calls,
  // avoiding O(cpusize * blocks^2) rebuild each time.
  Parallel::SyncPlan *sync_plan = Parallel::SyncPreparePlan(GH->PatL[lev], Symmetry);
  // Predictor
  MyList<Patch> *Pp = GH->PatL[lev];
  while (Pp)
@@ -327,17 +321,13 @@ void Z4c_class::Step(int lev, int YN)
    }
    Pp = Pp->next;
  }
-  // Start async ghost zone exchange - overlaps with error check and Shell computation
+  // check error information
  Parallel::SyncHandle *sync_pre = Parallel::SyncBeginWithPlan(sync_plan, SynchList_pre);
  // check error information (overlaps with MPI transfer)
  {
    int erh = ERROR;
    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  }
  if (ERROR)
  {
    Parallel::SyncEnd(sync_pre); sync_pre = 0;
    Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
@@ -485,7 +475,6 @@ void Z4c_class::Step(int lev, int YN)
  }
  if (ERROR)
  {
    Parallel::SyncEnd(sync_pre); sync_pre = 0;
    SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
@@ -496,8 +485,7 @@ void Z4c_class::Step(int lev, int YN)
  }
 #endif
-  // Complete async ghost zone exchange
+  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
  if (sync_pre) Parallel::SyncEnd(sync_pre);
 #ifdef WithShell
  if (lev == 0)
@@ -705,17 +693,13 @@ void Z4c_class::Step(int lev, int YN)
      Pp = Pp->next;
    }
-    // Start async ghost zone exchange - overlaps with error check and Shell computation
+    // check error information
    Parallel::SyncHandle *sync_cor = Parallel::SyncBeginWithPlan(sync_plan, SynchList_cor);
    // check error information (overlaps with MPI transfer)
    {
      int erh = ERROR;
      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    }
    if (ERROR)
    {
      Parallel::SyncEnd(sync_cor); sync_cor = 0;
      Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
@@ -873,7 +857,6 @@ void Z4c_class::Step(int lev, int YN)
    }
    if (ERROR)
    {
      Parallel::SyncEnd(sync_cor); sync_cor = 0;
      SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
@@ -885,8 +868,7 @@ void Z4c_class::Step(int lev, int YN)
    }
 #endif
-    // Complete async ghost zone exchange
+    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
    if (sync_cor) Parallel::SyncEnd(sync_cor);
 #ifdef WithShell
    if (lev == 0)
@@ -1060,8 +1042,6 @@ void Z4c_class::Step(int lev, int YN)
      Porg0[ithBH][2] = Porg1[ithBH][2];
    }
  }
  Parallel::SyncFreePlan(sync_plan);
 }
 #else
 // for constraint preserving boundary (CPBC)
@@ -1095,10 +1075,6 @@ void Z4c_class::Step(int lev, int YN)
  int ERROR = 0;
  MyList<ss_patch> *sPp;
  // Pre-build grid segment lists once for this level's patches.
  Parallel::SyncPlan *sync_plan = Parallel::SyncPreparePlan(GH->PatL[lev], Symmetry);
  // Predictor
  MyList<Patch> *Pp = GH->PatL[lev];
  while (Pp)
@@ -1566,17 +1542,13 @@ void Z4c_class::Step(int lev, int YN)
  }
 #endif
  }
-  // Start async ghost zone exchange - overlaps with error check
+  // check error information
  Parallel::SyncHandle *sync_pre = Parallel::SyncBeginWithPlan(sync_plan, SynchList_pre);
  // check error information (overlaps with MPI transfer)
  {
    int erh = ERROR;
    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  }
  if (ERROR)
  {
    Parallel::SyncEnd(sync_pre); sync_pre = 0;
    SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
@@ -1586,8 +1558,7 @@ void Z4c_class::Step(int lev, int YN)
    }
  }
-  // Complete async ghost zone exchange
+  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
  if (sync_pre) Parallel::SyncEnd(sync_pre);
  if (lev == 0)
  {
@@ -2132,17 +2103,13 @@ void Z4c_class::Step(int lev, int YN)
        sPp = sPp->next;
      }
    }
-    // Start async ghost zone exchange - overlaps with error check
+    // check error information
    Parallel::SyncHandle *sync_cor = Parallel::SyncBeginWithPlan(sync_plan, SynchList_cor);
    // check error information (overlaps with MPI transfer)
    {
      int erh = ERROR;
      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    }
    if (ERROR)
    {
      Parallel::SyncEnd(sync_cor); sync_cor = 0;
      SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
@@ -2153,8 +2120,7 @@ void Z4c_class::Step(int lev, int YN)
      }
    }
-    // Complete async ghost zone exchange
+    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
    if (sync_cor) Parallel::SyncEnd(sync_cor);
    if (lev == 0)
    {
@@ -2380,8 +2346,6 @@ void Z4c_class::Step(int lev, int YN)
 	  DG_List->clearList();
 	}
 #endif
  Parallel::SyncFreePlan(sync_plan);
 }
 #endif
 #undef MRBD
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -3035,12 +3035,6 @@ void bssn_class::Step(int lev, int YN)
  int ERROR = 0;
  MyList<ss_patch> *sPp;
  // Pre-build grid segment lists once for this level's patches.
  // These are reused across predictor + 3 corrector SyncBegin calls,
  // avoiding O(cpusize * blocks^2) rebuild each time.
  Parallel::SyncPlan *sync_plan = Parallel::SyncPreparePlan(GH->PatL[lev], Symmetry);
  // Predictor
  MyList<Patch> *Pp = GH->PatL[lev];
  while (Pp)
@@ -3164,18 +3158,13 @@ void bssn_class::Step(int lev, int YN)
    }
    Pp = Pp->next;
  }
-
+  // check error information
  // Start async ghost zone exchange - overlaps with error check and Shell computation
  Parallel::SyncHandle *sync_pre = Parallel::SyncBeginWithPlan(sync_plan, SynchList_pre);
  // check error information (overlaps with MPI transfer)
  {
    int erh = ERROR;
    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  }
  if (ERROR)
  {
    Parallel::SyncEnd(sync_pre); sync_pre = 0;
    Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
@@ -3335,7 +3324,6 @@ void bssn_class::Step(int lev, int YN)
  if (ERROR)
  {
    Parallel::SyncEnd(sync_pre); sync_pre = 0;
    SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
@@ -3346,8 +3334,7 @@ void bssn_class::Step(int lev, int YN)
  }
 #endif
-  // Complete async ghost zone exchange
+  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
  if (sync_pre) Parallel::SyncEnd(sync_pre);
 #ifdef WithShell
  if (lev == 0)
@@ -3541,10 +3528,7 @@ void bssn_class::Step(int lev, int YN)
      Pp = Pp->next;
    }
-    // Start async ghost zone exchange - overlaps with error check and Shell computation
+    // check error information
    Parallel::SyncHandle *sync_cor = Parallel::SyncBeginWithPlan(sync_plan, SynchList_cor);
    // check error information (overlaps with MPI transfer)
    {
      int erh = ERROR;
      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
@@ -3552,7 +3536,6 @@ void bssn_class::Step(int lev, int YN)
    if (ERROR)
    {
      Parallel::SyncEnd(sync_cor); sync_cor = 0;
      Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
@@ -3709,7 +3692,6 @@ void bssn_class::Step(int lev, int YN)
    }
    if (ERROR)
    {
      Parallel::SyncEnd(sync_cor); sync_cor = 0;
      SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
@@ -3722,8 +3704,7 @@ void bssn_class::Step(int lev, int YN)
    }
 #endif
-    // Complete async ghost zone exchange
+    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
    if (sync_cor) Parallel::SyncEnd(sync_cor);
 #ifdef WithShell
    if (lev == 0)
@@ -3914,8 +3895,6 @@ void bssn_class::Step(int lev, int YN)
      Porg0[ithBH][2] = Porg1[ithBH][2];
    }
  }
  Parallel::SyncFreePlan(sync_plan);
 }
 //================================================================================================
@@ -4838,12 +4817,6 @@ void bssn_class::Step(int lev, int YN)
  int ERROR = 0;
  MyList<ss_patch> *sPp;
  // Pre-build grid segment lists once for this level's patches.
  // These are reused across predictor + 3 corrector SyncBegin calls,
  // avoiding O(cpusize * blocks^2) rebuild each time.
  Parallel::SyncPlan *sync_plan = Parallel::SyncPreparePlan(GH->PatL[lev], Symmetry);
  // Predictor
  MyList<Patch> *Pp = GH->PatL[lev];
  while (Pp)
@@ -4970,17 +4943,13 @@ void bssn_class::Step(int lev, int YN)
  //   misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Predictor rhs calculation");
-  // Start async ghost zone exchange - overlaps with error check and BH position
+  // check error information
  Parallel::SyncHandle *sync_pre = Parallel::SyncBeginWithPlan(sync_plan, SynchList_pre);
  // check error information (overlaps with MPI transfer)
  {
    int erh = ERROR;
    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev]);
  }
  if (ERROR)
  {
    Parallel::SyncEnd(sync_pre); sync_pre = 0;
    Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
@@ -4992,8 +4961,7 @@ void bssn_class::Step(int lev, int YN)
  //   misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor sync");
-  // Complete async ghost zone exchange
+  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
  if (sync_pre) Parallel::SyncEnd(sync_pre);
 #if (MAPBH == 0)
  // for black hole position
@@ -5172,17 +5140,13 @@ void bssn_class::Step(int lev, int YN)
    //   misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector error check");
-    // Start async ghost zone exchange - overlaps with error check and BH position
+    // check error information
    Parallel::SyncHandle *sync_cor = Parallel::SyncBeginWithPlan(sync_plan, SynchList_cor);
    // check error information (overlaps with MPI transfer)
    {
      int erh = ERROR;
      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev]);
    }
    if (ERROR)
    {
      Parallel::SyncEnd(sync_cor); sync_cor = 0;
      Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
@@ -5196,8 +5160,7 @@ void bssn_class::Step(int lev, int YN)
    //    misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector sync");
-    // Complete async ghost zone exchange
+    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
    if (sync_cor) Parallel::SyncEnd(sync_cor);
    //    misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Corrector sync");
@@ -5313,8 +5276,6 @@ void bssn_class::Step(int lev, int YN)
  //     if(myrank==GH->start_rank[lev]) cout<<GH->mylev<<endl;
  //     misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"complet GH Step");
  Parallel::SyncFreePlan(sync_plan);
 }
 //================================================================================================
--- a/AMSS_NCKU_source/bssn_gpu.cu
+++ b/AMSS_NCKU_source/bssn_gpu.cu
@@ -18,7 +18,7 @@ using namespace std;
 #include <fstream>
 #endif
-void compare_result_gpu(int ftag1,double * datac,int data_num){
+static void compare_result_gpu(int ftag1,double * datac,int data_num){
 	double * data = (double*)malloc(sizeof(double)*data_num);
 	cudaMemcpy(data, datac, data_num * sizeof(double), cudaMemcpyDeviceToHost);
 	compare_result(ftag1,data,data_num);
@@ -83,7 +83,7 @@ inline void sub_enforce_ga(int matrix_size){
 	double * trA = M_ chin1;
 	enforce_ga<<<GRID_DIM,BLOCK_DIM>>>(trA);
 	cudaMemset(trA,0,matrix_size * sizeof(double));
-	cudaThreadSynchronize(); 
+	cudaDeviceSynchronize(); 
 	//cudaMemset(Mh_ gupxx,0,matrix_size * sizeof(double));
 	//trA gxx,gyy,gzz gupxx,gupxy,gupxz,gupyy,gupyz,gupzz
@@ -273,13 +273,13 @@ __global__ void sub_symmetry_bd_partK(int ord,double * func, double * funcc,doub
 #endif //ifdef Vertex
 inline void sub_symmetry_bd(int ord,double * func, double * funcc,double * SoA){
 	sub_symmetry_bd_partF<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_symmetry_bd_partI<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[0]);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_symmetry_bd_partJ<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[1]);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_symmetry_bd_partK<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[2]);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 }
@@ -378,9 +378,9 @@ inline void sub_fdderivs(double * f,double *fh,double *fxx,double *fxy,double *f
 	cudaMemset(fyy,0,_3D_SIZE[0] * sizeof(double));
 	cudaMemset(fyz,0,_3D_SIZE[0] * sizeof(double));
 	cudaMemset(fzz,0,_3D_SIZE[0] * sizeof(double));
-	cudaThreadSynchronize(); 
+	cudaDeviceSynchronize(); 
 	sub_fdderivs_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,fxx,fxy,fxz,fyy,fyz,fzz);
-	cudaThreadSynchronize(); 
+	cudaDeviceSynchronize(); 
 }
 __global__ void sub_fderivs_part1(double * f,double * fh,double *fx,double *fy,double *fz  )
@@ -445,9 +445,9 @@ inline void sub_fderivs(double * f,double * fh,double *fx,double *fy,double *fz,
 	cudaMemset(fy,0,_3D_SIZE[0] * sizeof(double));
 	cudaMemset(fz,0,_3D_SIZE[0] * sizeof(double));
-	cudaThreadSynchronize(); 
+	cudaDeviceSynchronize(); 
 	sub_fderivs_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,fx,fy,fz);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 }
 __global__ void computeRicci_part1(double * dst)
@@ -465,9 +465,9 @@ __global__ void computeRicci_part1(double * dst)
 inline void computeRicci(double * src,double* dst,double * SoA, Meta* meta)
 {
 	sub_fdderivs(src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,SoA);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	computeRicci_part1<<<GRID_DIM,BLOCK_DIM>>>(dst);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 }/*Exception*/
@@ -524,9 +524,9 @@ __global__ void sub_kodis_part1(double *f,double *fh,double *f_rhs)
 inline void sub_kodis(double *f,double *fh,double *f_rhs,double *SoA)
 {
 	sub_symmetry_bd(3,f,fh,SoA);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_kodis_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 }
 __global__ void  sub_lopsided_part1(double *f,double* fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz)
@@ -617,9 +617,9 @@ __global__ void  sub_lopsided_part1(double *f,double* fh,double *f_rhs,double *S
 inline void  sub_lopsided(double *f,double*fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz,double *SoA){
 	sub_symmetry_bd(3,f,fh,SoA);
-	cudaThreadSynchronize(); 
+	cudaDeviceSynchronize(); 
 	sub_lopsided_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs,Sfx,Sfy,Sfz);
-	cudaThreadSynchronize(); 
+	cudaDeviceSynchronize(); 
 }
 __global__ void compute_rhs_bssn_part1() 
@@ -2656,13 +2656,13 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 #ifdef TIMING1
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	gettimeofday(&tv2, NULL);
   	cout<<"TIME USED"<<TimeBetween(tv1, tv2)<<endl; 
 #endif	
 	//cout<<"GPU meta data ready.\n";
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 //--------------test constant memory address & value--------------
 /*	double rank = mpi_rank;
@@ -2685,7 +2685,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 	//sub_enforce_ga(matrix_size);
 	//4.1-----compute rhs---------
 	compute_rhs_bssn_part1<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_fderivs(Mh_ betax,Mh_ fh,Mh_ betaxx,Mh_ betaxy,Mh_ betaxz,ass);
 	sub_fderivs(Mh_ betay,Mh_ fh,Mh_ betayx,Mh_ betayy,Mh_ betayz,sas);
@@ -2701,7 +2701,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 	sub_fderivs(Mh_ gyz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz, saa);
  	compute_rhs_bssn_part2<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_fdderivs(Mh_ betax,Mh_ fh,Mh_ gxxx,Mh_ gxyx,Mh_ gxzx,Mh_ gyyx,Mh_ gyzx,Mh_ gzzx,ass);
 	sub_fdderivs(Mh_ betay,Mh_ fh,Mh_ gxxy,Mh_ gxyy,Mh_ gxzy,Mh_ gyyy,Mh_ gyzy,Mh_ gzzy,sas);
@@ -2711,7 +2711,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 	sub_fderivs( Mh_ Gamz, Mh_ fh,Mh_ Gamzx, Mh_ Gamzy, Mh_ Gamzz,ssa);
 	compute_rhs_bssn_part3<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	computeRicci(Mh_ dxx,Mh_ Rxx,sss, meta);
 	computeRicci(Mh_ dyy,Mh_ Ryy,sss, meta);
@@ -2720,20 +2720,20 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 	computeRicci(Mh_ gxz,Mh_ Rxz,asa, meta);
 	computeRicci(Mh_ gyz,Mh_ Ryz,saa, meta);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	compute_rhs_bssn_part4<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_fdderivs(Mh_ chi,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
 	compute_rhs_bssn_part5<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_fdderivs(Mh_ Lap,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
 	compute_rhs_bssn_part6<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 #if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5)
 	sub_fderivs(Mh_ chi,Mh_ fh, Mh_ dtSfx_rhs, Mh_ dtSfy_rhs, Mh_ dtSfz_rhs,sss);
@@ -2805,7 +2805,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 	if(co == 0){
 		compute_rhs_bssn_part7<<<GRID_DIM,BLOCK_DIM>>>();
-		cudaThreadSynchronize();
+		cudaDeviceSynchronize();
 		sub_fderivs(Mh_ Axx,Mh_ fh,Mh_ gxxx,Mh_ gxxy,Mh_ gxxz,sss);
 		sub_fderivs(Mh_ Axy,Mh_ fh,Mh_ gxyx,Mh_ gxyy,Mh_ gxyz,aas);
@@ -2814,7 +2814,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 		sub_fderivs(Mh_ Ayz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz,saa);
 		sub_fderivs(Mh_ Azz,Mh_ fh,Mh_ gzzx,Mh_ gzzy,Mh_ gzzz,sss);
 		compute_rhs_bssn_part8<<<GRID_DIM,BLOCK_DIM>>>();
-		cudaThreadSynchronize();
+		cudaDeviceSynchronize();
 	}
 #if (ABV == 1)
@@ -2895,7 +2895,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 //-------------------FOR GPU TEST----------------------
 //-----------------------------------------------------
 #ifdef TIMING
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	gettimeofday(&tv2, NULL);
   	cout<<"MPI rank is: "<<mpi_rank<<" GPU TIME is"<<TimeBetween(tv1, tv2)<<" (s)."<<endl; 
 #endif
--- a/AMSS_NCKU_source/bssn_gpu.h
+++ b/AMSS_NCKU_source/bssn_gpu.h
@@ -4,6 +4,17 @@
 #include "bssn_macro.h"
 #include "macrodef.fh"
 // CUDA error-checking wrapper (added for the CUDA 13 port).
 // Evaluates `call` exactly once. If the result is not cudaSuccess, prints the
 // source file, line number and cudaGetErrorString() text to stderr and
 // terminates the process with EXIT_FAILURE.
 // The status is kept in a macro-private name so that an argument expression
 // which mentions a caller variable called `err` is not shadowed by the
 // macro's own local, and the argument is parenthesized before use.
 // NOTE(review): relies on fprintf/exit being declared by the including
 // translation unit (<cstdio>/<cstdlib>) - confirm.
 #define CUDA_SAFE_CALL(call) \
    do { \
        cudaError_t cuda_safe_call_status_ = (call); \
        if (cuda_safe_call_status_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error in %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(cuda_safe_call_status_)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)
 #define DEVICE_ID 0
 // #define DEVICE_ID_BY_MPI_RANK
 #define GRID_DIM 256
--- a/AMSS_NCKU_source/bssn_gpu_rhs_ss.cu
+++ b/AMSS_NCKU_source/bssn_gpu_rhs_ss.cu
@@ -20,7 +20,7 @@ using namespace std;
 __device__ volatile unsigned int global_count = 0;
-void compare_result_gpu(int ftag1,double * datac,int data_num){
+static void compare_result_gpu(int ftag1,double * datac,int data_num){
 	double * data = (double*)malloc(sizeof(double)*data_num);
 	cudaMemcpy(data, datac, data_num * sizeof(double), cudaMemcpyDeviceToHost);
 	compare_result(ftag1,data,data_num);
@@ -153,11 +153,11 @@ __global__ void sub_symmetry_bd_ss_partJ(int ord,double * func, double * funcc,d
 inline void sub_symmetry_bd_ss(int ord,double * func, double * funcc,double * SoA){
 	sub_symmetry_bd_ss_partF<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_symmetry_bd_ss_partI<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[0]);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_symmetry_bd_ss_partJ<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[1]);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 }
 __global__ void sub_fderivs_shc_part1(double *fx,double *fy,double *fz){
@@ -247,13 +247,13 @@ inline void sub_fderivs_shc(int& sst,double * f,double * fh,double *fx,double *f
 	//cudaMemset(Msh_ gy,0,h_3D_SIZE[0] * sizeof(double));
 	//cudaMemset(Msh_ gz,0,h_3D_SIZE[0] * sizeof(double));
 	sub_symmetry_bd_ss(2,f,fh,SoA1);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 			//compare_result_gpu(0,fh,h_3D_SIZE[2]);	
 	sub_fderivs_sh<<<GRID_DIM,BLOCK_DIM>>>(fh,Msh_ gx,Msh_ gy,Msh_ gz);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_fderivs_shc_part1<<<GRID_DIM,BLOCK_DIM>>>(fx,fy,fz);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 			//compare_result_gpu(1,fx,h_3D_SIZE[0]);
 			//compare_result_gpu(2,fy,h_3D_SIZE[0]);
 			//compare_result_gpu(3,fz,h_3D_SIZE[0]);
@@ -451,17 +451,17 @@ inline void sub_fdderivs_shc(int& sst,double * f,double * fh,
 	//fderivs_sh
 	sub_symmetry_bd_ss(2,f,fh,SoA1);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 			//compare_result_gpu(1,fh,h_3D_SIZE[2]);	
 	sub_fderivs_sh<<<GRID_DIM,BLOCK_DIM>>>(fh,Msh_ gx,Msh_ gy,Msh_ gz);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	//fdderivs_sh
 	sub_symmetry_bd_ss(2,f,fh,SoA1);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 			//compare_result_gpu(21,fh,h_3D_SIZE[2]);
 	sub_fdderivs_sh<<<GRID_DIM,BLOCK_DIM>>>(fh,Msh_ gxx,Msh_ gxy,Msh_ gxz,Msh_ gyy,Msh_ gyz,Msh_ gzz);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 			/*compare_result_gpu(11,Msh_ gx,h_3D_SIZE[0]);
 			compare_result_gpu(12,Msh_ gy,h_3D_SIZE[0]);
 			compare_result_gpu(13,Msh_ gz,h_3D_SIZE[0]);
@@ -472,7 +472,7 @@ inline void sub_fdderivs_shc(int& sst,double * f,double * fh,
 			compare_result_gpu(5,Msh_ gyz,h_3D_SIZE[0]);
 			compare_result_gpu(6,Msh_ gzz,h_3D_SIZE[0]);*/
 	sub_fdderivs_shc_part1<<<GRID_DIM,BLOCK_DIM>>>(fxx,fxy,fxz,fyy,fyz,fzz);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 			/*compare_result_gpu(1,fxx,h_3D_SIZE[0]);
 			compare_result_gpu(2,fxy,h_3D_SIZE[0]);
 			compare_result_gpu(3,fxz,h_3D_SIZE[0]);
@@ -496,9 +496,9 @@ __global__ void computeRicci_ss_part1(double * dst)
 inline void computeRicci_ss(int &sst,double * src,double* dst,double * SoA, Meta* meta)
 {
 	sub_fdderivs_shc(sst,src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,SoA);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	computeRicci_ss_part1<<<GRID_DIM,BLOCK_DIM>>>(dst);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 }
 __global__ void sub_lopsided_ss_part1(double * dst)
@@ -516,9 +516,9 @@ __global__ void sub_lopsided_ss_part1(double * dst)
 inline void sub_lopsided_ss(int& sst,double *src,double* dst,double *SoA)
 {
 		sub_fderivs_shc(sst,src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,SoA);
-		cudaThreadSynchronize();
+		cudaDeviceSynchronize();
 		sub_lopsided_ss_part1<<<GRID_DIM,BLOCK_DIM>>>(dst);
-		cudaThreadSynchronize();
+		cudaDeviceSynchronize();
 }
 __global__ void sub_kodis_sh_part1(double *f,double *fh,double *f_rhs)
@@ -590,11 +590,11 @@ inline void sub_kodis_ss(int &sst,double *f,double *fh,double *f_rhs,double *SoA
 	}
 			//compare_result_gpu(10,f,h_3D_SIZE[0]);
 	sub_symmetry_bd_ss(3,f,fh,SoA1);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 			//compare_result_gpu(0,fh,h_3D_SIZE[3]);
 	sub_kodis_sh_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 			//compare_result_gpu(1,f_rhs,h_3D_SIZE[0]);
 }
@@ -2287,13 +2287,13 @@ int gpu_rhs_ss(RHS_SS_PARA)
 #ifdef TIMING1
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	gettimeofday(&tv2, NULL);
   	cout<<"TIME USED"<<TimeBetween(tv1, tv2)<<endl; 
 #endif	
 	//cout<<"GPU meta data ready.\n";
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 //-------------get device info-------------------------------------
@@ -2306,7 +2306,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	//sub_enforce_ga(matrix_size);
 	//4.1-----compute rhs---------
 	compute_rhs_ss_part1<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_fderivs_shc(sst,Mh_ betax,Mh_ fh,Mh_ betaxx,Mh_ betaxy,Mh_ betaxz,ass);
 	sub_fderivs_shc(sst,Mh_ betay,Mh_ fh,Mh_ betayx,Mh_ betayy,Mh_ betayz,sas);
@@ -2322,7 +2322,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	sub_fderivs_shc(sst,Mh_ gyz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz, saa);
 	compute_rhs_ss_part2<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_fdderivs_shc(sst,Mh_ betax,Mh_ fh,Mh_ gxxx,Mh_ gxyx,Mh_ gxzx,Mh_ gyyx,Mh_ gyzx,Mh_ gzzx,ass);
 	sub_fdderivs_shc(sst,Mh_ betay,Mh_ fh,Mh_ gxxy,Mh_ gxyy,Mh_ gxzy,Mh_ gyyy,Mh_ gyzy,Mh_ gzzy,sas);
@@ -2332,7 +2332,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	sub_fderivs_shc( sst,Mh_ Gamz, Mh_ fh,Mh_ Gamzx, Mh_ Gamzy, Mh_ Gamzz,ssa);
 	compute_rhs_ss_part3<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	computeRicci_ss(sst,Mh_ dxx,Mh_ Rxx,sss, meta);
 	computeRicci_ss(sst,Mh_ dyy,Mh_ Ryy,sss, meta);
@@ -2340,25 +2340,25 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	computeRicci_ss(sst,Mh_ gxy,Mh_ Rxy,aas, meta);
 	computeRicci_ss(sst,Mh_ gxz,Mh_ Rxz,asa, meta);
 	computeRicci_ss(sst,Mh_ gyz,Mh_ Ryz,saa, meta);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	compute_rhs_ss_part4<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_fdderivs_shc(sst,Mh_ chi,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
-	//cudaThreadSynchronize();
+	//cudaDeviceSynchronize();
 	//compare_result_gpu(0,Mh_ chi,h_3D_SIZE[0]);
 	//compare_result_gpu(1,Mh_ chi,h_3D_SIZE[0]);
 	//compare_result_gpu(2,Mh_ fyz,h_3D_SIZE[0]);
 	compute_rhs_ss_part5<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_fdderivs_shc(sst,Mh_ Lap,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
 	compute_rhs_ss_part6<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 #if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5)
 	sub_fderivs_shc(sst,Mh_ chi,Mh_ fh, Mh_ dtSfx_rhs, Mh_ dtSfy_rhs, Mh_ dtSfz_rhs,sss);
@@ -2423,7 +2423,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	}
 	if(co == 0){
 		compute_rhs_ss_part7<<<GRID_DIM,BLOCK_DIM>>>();
-		cudaThreadSynchronize();
+		cudaDeviceSynchronize();
 		sub_fderivs_shc(sst,Mh_ Axx,Mh_ fh,Mh_ gxxx,Mh_ gxxy,Mh_ gxxz,sss);
 		sub_fderivs_shc(sst,Mh_ Axy,Mh_ fh,Mh_ gxyx,Mh_ gxyy,Mh_ gxyz,aas);
@@ -2432,7 +2432,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 		sub_fderivs_shc(sst,Mh_ Ayz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz,saa);
 		sub_fderivs_shc(sst,Mh_ Azz,Mh_ fh,Mh_ gzzx,Mh_ gzzy,Mh_ gzzz,sss);
 		compute_rhs_ss_part8<<<GRID_DIM,BLOCK_DIM>>>();
-		cudaThreadSynchronize();
+		cudaDeviceSynchronize();
 	}
 #if (ABV == 1)
@@ -2512,7 +2512,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	//test kodis
 	//sub_kodis_sh(sst,Msh_ drhodx,Mh_ fh2,Msh_ drhody,sss);
 #ifdef TIMING
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	gettimeofday(&tv2, NULL);
   	cout<<"MPI rank is: "<<mpi_rank<<" GPU TIME is"<<TimeBetween(tv1, tv2)<<" (s)."<<endl; 
 #endif
--- a/AMSS_NCKU_source/bssn_rhs.f90
+++ b/AMSS_NCKU_source/bssn_rhs.f90
@@ -106,8 +106,7 @@
  call getpbh(BHN,Porg,Mass)
 #endif
-!!! sanity check (disabled in production builds for performance)
+!!! sanity check
 #ifdef DEBUG
  dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) &
      +sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz)                   &
      +sum(Gamx)+sum(Gamy)+sum(Gamz)                                           &
@@ -137,7 +136,6 @@
     gont = 1
     return
  endif
 #endif
  PI = dacos(-ONE)
@@ -945,60 +943,103 @@
  SSA(2)=SYM
  SSA(3)=ANTI
-!!!!!!!!!advection term + Kreiss-Oliger dissipation (merged for cache efficiency)
+!!!!!!!!!advection term part
 ! lopsided_kodis shares the symmetry_bd buffer between advection and
 ! dissipation, eliminating redundant full-grid copies. For metric variables
 ! gxx/gyy/gzz (=dxx/dyy/dzz+1): kodis stencil coefficients sum to zero,
 ! so the constant offset has no effect on dissipation.
-  call lopsided_kodis(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS)
-  call lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
+  call lopsided(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS)
-  call lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps)
+  call lopsided(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA)
-  call lopsided_kodis(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS)
-  call lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps)
+  call lopsided(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA)
-  call lopsided_kodis(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS)
-  call lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS)
-  call lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
+  call lopsided(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS)
-  call lopsided_kodis(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA,eps)
+  call lopsided(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA)
-  call lopsided_kodis(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS)
-  call lopsided_kodis(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA,eps)
+  call lopsided(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA)
-  call lopsided_kodis(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS)
-  call lopsided_kodis(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS)
-  call lopsided_kodis(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS)
-  call lopsided_kodis(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS,eps)
+  call lopsided(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS)
-  call lopsided_kodis(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS,eps)
+  call lopsided(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS)
-  call lopsided_kodis(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA,eps)
+  call lopsided(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA)
-
+!!
 #if 1 
 !! bam does not apply dissipation on gauge variables; this branch (#if 1) does, the #else branch is advection only
  call lopsided_kodis(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS,eps)
 #if (GAUGE == 0 || GAUGE == 1 || GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7)
  call lopsided_kodis(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS,eps)
  call lopsided_kodis(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS,eps)
  call lopsided_kodis(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA,eps)
 #endif
 #if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
  call lopsided_kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS,eps)
  call lopsided_kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS,eps)
  call lopsided_kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA,eps)
 #endif
 #else
 ! No dissipation on gauge variables (advection only)
  call lopsided(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS)
 #if (GAUGE == 0 || GAUGE == 1 || GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7)
  call lopsided(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS)
  call lopsided(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS)
  call lopsided(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA)
 #endif
 #if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
  call lopsided(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS)
  call lopsided(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS)
  call lopsided(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA)
 #endif
  if(eps>0)then 
 ! usual Kreiss-Oliger dissipation      
  call kodis(ex,X,Y,Z,chi,chi_rhs,SSS,Symmetry,eps)
  call kodis(ex,X,Y,Z,trK,trK_rhs,SSS,Symmetry,eps)
  call kodis(ex,X,Y,Z,dxx,gxx_rhs,SSS,Symmetry,eps)
  call kodis(ex,X,Y,Z,gxy,gxy_rhs,AAS,Symmetry,eps)
  call kodis(ex,X,Y,Z,gxz,gxz_rhs,ASA,Symmetry,eps)
  call kodis(ex,X,Y,Z,dyy,gyy_rhs,SSS,Symmetry,eps)
  call kodis(ex,X,Y,Z,gyz,gyz_rhs,SAA,Symmetry,eps)
  call kodis(ex,X,Y,Z,dzz,gzz_rhs,SSS,Symmetry,eps)
 #if 0
 #define i 42
 #define j 40
 #define k 40
 if(Lev == 1)then
 write(*,*) X(i),Y(j),Z(k)
 write(*,*) "before",Axx_rhs(i,j,k)
 endif
 #undef i
 #undef j
 #undef k
 !!stop
 #endif
  call kodis(ex,X,Y,Z,Axx,Axx_rhs,SSS,Symmetry,eps)
 #if 0
 #define i 42
 #define j 40
 #define k 40
 if(Lev == 1)then
 write(*,*) X(i),Y(j),Z(k)
 write(*,*) "after",Axx_rhs(i,j,k)
 endif
 #undef i
 #undef j
 #undef k
 !!stop
 #endif
  call kodis(ex,X,Y,Z,Axy,Axy_rhs,AAS,Symmetry,eps)
  call kodis(ex,X,Y,Z,Axz,Axz_rhs,ASA,Symmetry,eps)
  call kodis(ex,X,Y,Z,Ayy,Ayy_rhs,SSS,Symmetry,eps)
  call kodis(ex,X,Y,Z,Ayz,Ayz_rhs,SAA,Symmetry,eps)
  call kodis(ex,X,Y,Z,Azz,Azz_rhs,SSS,Symmetry,eps)
  call kodis(ex,X,Y,Z,Gamx,Gamx_rhs,ASS,Symmetry,eps)
  call kodis(ex,X,Y,Z,Gamy,Gamy_rhs,SAS,Symmetry,eps)
  call kodis(ex,X,Y,Z,Gamz,Gamz_rhs,SSA,Symmetry,eps)
 #if 1 
 !! bam does not apply dissipation on gauge variables; with "#if 1" this code does (use "#if 0" to skip it)
  call kodis(ex,X,Y,Z,Lap,Lap_rhs,SSS,Symmetry,eps)
  call kodis(ex,X,Y,Z,betax,betax_rhs,ASS,Symmetry,eps)
  call kodis(ex,X,Y,Z,betay,betay_rhs,SAS,Symmetry,eps)
  call kodis(ex,X,Y,Z,betaz,betaz_rhs,SSA,Symmetry,eps)
 #if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
  call kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,ASS,Symmetry,eps)
  call kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,SAS,Symmetry,eps)
  call kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,SSA,Symmetry,eps)
 #endif
 #endif
  endif
  if(co == 0)then
 ! ham_Res = trR + 2/3 * K^2 - A_ij * A^ij - 16 * PI * rho
--- a/AMSS_NCKU_source/bssn_step_gpu.C
+++ b/AMSS_NCKU_source/bssn_step_gpu.C
@@ -1676,8 +1676,11 @@ void bssn_class::Step_GPU(int lev, int YN)
 #endif // PSTR == ?
 //--------------------------With Shell--------------------------
 // Note: SHStep() implementation is in bssn_gpu_class.C
 #ifdef WithShell
 #if 0
 // This SHStep() implementation has been moved to bssn_gpu_class.C to avoid duplicate definition
 void bssn_class::SHStep()
 {
  int lev = 0;
@@ -1938,5 +1941,5 @@ void bssn_class::SHStep()
    sPp = sPp->next;
  }
 }
-d
+#endif // #if 0
 #endif // withshell
--- a/AMSS_NCKU_source/enforce_algebra.f90
+++ b/AMSS_NCKU_source/enforce_algebra.f90
@@ -19,60 +19,48 @@
 !~~~~~~~> Local variable:
-  integer :: i,j,k
+  real*8, dimension(ex(1),ex(2),ex(3)) :: trA,detg
-  real*8 :: lgxx,lgyy,lgzz,ldetg
+  real*8, dimension(ex(1),ex(2),ex(3)) :: gxx,gyy,gzz 
-  real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz
+  real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz,gupyy,gupyz,gupzz
  real*8 :: ltrA,lscale
  real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0
 !~~~~~~>
-  do k=1,ex(3)
+  gxx = dxx + ONE
-  do j=1,ex(2)
+  gyy = dyy + ONE
-  do i=1,ex(1)
+  gzz = dzz + ONE
-    lgxx = dxx(i,j,k) + ONE
+  detg =  gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - &
-    lgyy = dyy(i,j,k) + ONE
+          gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz
-    lgzz = dzz(i,j,k) + ONE
+  gupxx =   ( gyy * gzz - gyz * gyz ) / detg
  gupxy = - ( gxy * gzz - gyz * gxz ) / detg
  gupxz =   ( gxy * gyz - gyy * gxz ) / detg
  gupyy =   ( gxx * gzz - gxz * gxz ) / detg
  gupyz = - ( gxx * gyz - gxy * gxz ) / detg
  gupzz =   ( gxx * gyy - gxy * gxy ) / detg
-    ldetg =  lgxx * lgyy * lgzz &
+  trA =         gupxx * Axx + gupyy * Ayy + gupzz * Azz &
-           + gxy(i,j,k) * gyz(i,j,k) * gxz(i,j,k) &
+       + TWO * (gupxy * Axy + gupxz * Axz + gupyz * Ayz)
           + gxz(i,j,k) * gxy(i,j,k) * gyz(i,j,k) &
           - gxz(i,j,k) * lgyy * gxz(i,j,k) &
           - gxy(i,j,k) * gxy(i,j,k) * lgzz &
           - lgxx * gyz(i,j,k) * gyz(i,j,k)
-    lgupxx =   ( lgyy * lgzz - gyz(i,j,k) * gyz(i,j,k) ) / ldetg
+  Axx = Axx - F1o3 * gxx * trA
-    lgupxy = - ( gxy(i,j,k) * lgzz - gyz(i,j,k) * gxz(i,j,k) ) / ldetg
+  Axy = Axy - F1o3 * gxy * trA
-    lgupxz =   ( gxy(i,j,k) * gyz(i,j,k) - lgyy * gxz(i,j,k) ) / ldetg
+  Axz = Axz - F1o3 * gxz * trA
-    lgupyy =   ( lgxx * lgzz - gxz(i,j,k) * gxz(i,j,k) ) / ldetg
+  Ayy = Ayy - F1o3 * gyy * trA
-    lgupyz = - ( lgxx * gyz(i,j,k) - gxy(i,j,k) * gxz(i,j,k) ) / ldetg
+  Ayz = Ayz - F1o3 * gyz * trA
-    lgupzz =   ( lgxx * lgyy - gxy(i,j,k) * gxy(i,j,k) ) / ldetg
+  Azz = Azz - F1o3 * gzz * trA
-    ltrA =         lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) &
+  detg = ONE / ( detg ** F1o3 ) 
                 + lgupzz * Azz(i,j,k) &
         + TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) &
                 + lgupyz * Ayz(i,j,k))
-    Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA
+  gxx = gxx * detg
-    Axy(i,j,k) = Axy(i,j,k) - F1o3 * gxy(i,j,k) * ltrA
+  gxy = gxy * detg
-    Axz(i,j,k) = Axz(i,j,k) - F1o3 * gxz(i,j,k) * ltrA
+  gxz = gxz * detg
-    Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA
+  gyy = gyy * detg
-    Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * gyz(i,j,k) * ltrA
+  gyz = gyz * detg
-    Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA
+  gzz = gzz * detg
-    lscale = ONE / ( ldetg ** F1o3 )
+  dxx = gxx - ONE
-
+  dyy = gyy - ONE
-    dxx(i,j,k) = lgxx * lscale - ONE
+  dzz = gzz - ONE
    gxy(i,j,k) = gxy(i,j,k) * lscale
    gxz(i,j,k) = gxz(i,j,k) * lscale
    dyy(i,j,k) = lgyy * lscale - ONE
    gyz(i,j,k) = gyz(i,j,k) * lscale
    dzz(i,j,k) = lgzz * lscale - ONE
  enddo
  enddo
  enddo
  return
@@ -95,70 +83,50 @@
 !~~~~~~~> Local variable:
-  integer :: i,j,k
+  real*8, dimension(ex(1),ex(2),ex(3)) :: trA
-  real*8 :: lgxx,lgyy,lgzz,lscale
+  real*8, dimension(ex(1),ex(2),ex(3)) :: gxx,gyy,gzz 
-  real*8 :: lgxy,lgxz,lgyz
+  real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz,gupyy,gupyz,gupzz
  real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz
  real*8 :: ltrA
  real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0
 !~~~~~~>
-  do k=1,ex(3)
+  gxx = dxx + ONE
-  do j=1,ex(2)
+  gyy = dyy + ONE
-  do i=1,ex(1)
+  gzz = dzz + ONE
 ! for g
  gupzz =  gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - &
           gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz
-! for g: normalize determinant first
+  gupzz = ONE / ( gupzz ** F1o3 ) 
    lgxx = dxx(i,j,k) + ONE
    lgyy = dyy(i,j,k) + ONE
    lgzz = dzz(i,j,k) + ONE
    lgxy = gxy(i,j,k)
    lgxz = gxz(i,j,k)
    lgyz = gyz(i,j,k)
-    lscale =  lgxx * lgyy * lgzz + lgxy * lgyz * lgxz &
+  gxx = gxx * gupzz
-            + lgxz * lgxy * lgyz - lgxz * lgyy * lgxz &
+  gxy = gxy * gupzz
-            - lgxy * lgxy * lgzz - lgxx * lgyz * lgyz
+  gxz = gxz * gupzz
  gyy = gyy * gupzz
  gyz = gyz * gupzz
  gzz = gzz * gupzz
-    lscale = ONE / ( lscale ** F1o3 )
+  dxx = gxx - ONE
  dyy = gyy - ONE
  dzz = gzz - ONE
 ! for A  
-    lgxx = lgxx * lscale
+  gupxx =   ( gyy * gzz - gyz * gyz )
-    lgxy = lgxy * lscale
+  gupxy = - ( gxy * gzz - gyz * gxz )
-    lgxz = lgxz * lscale
+  gupxz =   ( gxy * gyz - gyy * gxz )
-    lgyy = lgyy * lscale
+  gupyy =   ( gxx * gzz - gxz * gxz )
-    lgyz = lgyz * lscale
+  gupyz = - ( gxx * gyz - gxy * gxz )
-    lgzz = lgzz * lscale
+  gupzz =   ( gxx * gyy - gxy * gxy )
-    dxx(i,j,k) = lgxx - ONE
+  trA =         gupxx * Axx + gupyy * Ayy + gupzz * Azz &
-    gxy(i,j,k) = lgxy
+       + TWO * (gupxy * Axy + gupxz * Axz + gupyz * Ayz)
    gxz(i,j,k) = lgxz
    dyy(i,j,k) = lgyy - ONE
    gyz(i,j,k) = lgyz
    dzz(i,j,k) = lgzz - ONE
-! for A: trace-free using normalized metric (det=1, no division needed)
+  Axx = Axx - F1o3 * gxx * trA
-    lgupxx =   ( lgyy * lgzz - lgyz * lgyz )
+  Axy = Axy - F1o3 * gxy * trA
-    lgupxy = - ( lgxy * lgzz - lgyz * lgxz )
+  Axz = Axz - F1o3 * gxz * trA
-    lgupxz =   ( lgxy * lgyz - lgyy * lgxz )
+  Ayy = Ayy - F1o3 * gyy * trA
-    lgupyy =   ( lgxx * lgzz - lgxz * lgxz )
+  Ayz = Ayz - F1o3 * gyz * trA
-    lgupyz = - ( lgxx * lgyz - lgxy * lgxz )
+  Azz = Azz - F1o3 * gzz * trA
    lgupzz =   ( lgxx * lgyy - lgxy * lgxy )
    ltrA =         lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) &
                 + lgupzz * Azz(i,j,k) &
         + TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) &
                 + lgupyz * Ayz(i,j,k))
    Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA
    Axy(i,j,k) = Axy(i,j,k) - F1o3 * lgxy * ltrA
    Axz(i,j,k) = Axz(i,j,k) - F1o3 * lgxz * ltrA
    Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA
    Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * lgyz * ltrA
    Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA
  enddo
  enddo
  enddo
  return
--- a/AMSS_NCKU_source/fmisc.f90
+++ b/AMSS_NCKU_source/fmisc.f90
@@ -324,6 +324,7 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
  integer::i
  funcc = 0.d0
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -349,6 +350,7 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
  integer::i
  funcc = 0.d0
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -377,6 +379,7 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA)
  integer::i
  funcc = 0.d0
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -883,6 +886,7 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
  integer::i
  funcc = 0.d0
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
@@ -908,6 +912,7 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
  integer::i
  funcc = 0.d0
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
@@ -936,6 +941,7 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA)
  integer::i
  funcc = 0.d0
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
@@ -1113,16 +1119,25 @@ end subroutine d2dump
 !------------------------------------------------------------------------------
  subroutine polint(xa,ya,x,y,dy,ordn)
  implicit none
 !~~~~~~> Input Parameter:
  integer,intent(in) :: ordn
  real*8, dimension(ordn), intent(in) :: xa,ya
  real*8, intent(in) :: x
  real*8, intent(out) :: y,dy
-  integer :: i, m, ns, n_m
+!~~~~~~> Other parameter:
-  real*8, dimension(ordn) :: c, d, ho
+
-  real*8 :: dif, dift, hp, h, den_val
+  integer :: m,n,ns
  real*8, dimension(ordn) :: c,d,den,ho
  real*8 :: dif,dift
 !~~~~~~>
  n=ordn
  m=ordn
  c=ya
  d=ya
@@ -1130,38 +1145,27 @@ end subroutine d2dump
  ns=1
  dif=abs(x-xa(1))
-
+  do m=1,n
-  do i = 2, ordn
+   dift=abs(x-xa(m))
    dift = abs(x - xa(i))
   if(dift < dif) then
-      ns = i
+    ns=m
    dif=dift
   end if
  end do
  y=ya(ns)
  ns=ns-1
-
+  do m=1,n-1
-  do m = 1, ordn - 1
+    den(1:n-m)=ho(1:n-m)-ho(1+m:n)
-    n_m = ordn - m
+    if (any(den(1:n-m) == 0.0))then
    do i = 1, n_m
      hp = ho(i)
      h  = ho(i+m)
      den_val = hp - h
      if (den_val == 0.0d0) then
      write(*,*) 'failure in polint for point',x
      write(*,*) 'with input points: ',xa
      stop
    endif
-
+    den(1:n-m)=(c(2:n-m+1)-d(1:n-m))/den(1:n-m)
-      den_val = (c(i+1) - d(i)) / den_val
+    d(1:n-m)=ho(1+m:n)*den(1:n-m)
-
+    c(1:n-m)=ho(1:n-m)*den(1:n-m)
-      d(i) = h * den_val
+    if (2*ns < n-m) then
      c(i) = hp * den_val
    end do
    if (2 * ns < n_m) then
      dy=c(ns+1)
    else
      dy=d(ns)
@@ -1171,6 +1175,7 @@ end subroutine d2dump
  end do
  return
  end subroutine polint
 !------------------------------------------------------------------------------
 !
@@ -1178,37 +1183,35 @@ end subroutine d2dump
 !
 !------------------------------------------------------------------------------
  subroutine polin2(x1a,x2a,ya,x1,x2,y,dy,ordn)
  implicit none
 !~~~~~~> Input parameters:
  integer,intent(in) :: ordn
  real*8, dimension(1:ordn), intent(in) :: x1a,x2a
  real*8, dimension(1:ordn,1:ordn), intent(in) :: ya
  real*8, intent(in) :: x1,x2
  real*8, intent(out) :: y,dy
-#ifdef POLINT_LEGACY_ORDER
+!~~~~~~> Other parameters:
  integer  :: i,m
  real*8, dimension(ordn) :: ymtmp
  real*8, dimension(ordn) :: yntmp
  m=size(x1a)
  do i=1,m
    yntmp=ya(i,:)
    call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
  end do
  call polint(x1a,ymtmp,x1,y,dy,ordn)
 #else
  integer  :: j
  real*8, dimension(ordn) :: ymtmp
  real*8 :: dy_temp
  do j=1,ordn
    call polint(x1a, ya(:,j), x1, ymtmp(j), dy_temp, ordn)
  end do
-  call polint(x2a, ymtmp, x2, y, dy, ordn)
+
-#endif
+  call polint(x1a,ymtmp,x1,y,dy,ordn)
  return
  end subroutine polin2
 !------------------------------------------------------------------------------
 !
@@ -1216,15 +1219,18 @@ end subroutine d2dump
 !
 !------------------------------------------------------------------------------
  subroutine polin3(x1a,x2a,x3a,ya,x1,x2,x3,y,dy,ordn)
  implicit none
 !~~~~~~> Input parameters:
  integer,intent(in) :: ordn
  real*8, dimension(1:ordn), intent(in) :: x1a,x2a,x3a
  real*8, dimension(1:ordn,1:ordn,1:ordn), intent(in) :: ya
  real*8, intent(in) :: x1,x2,x3
  real*8, intent(out) :: y,dy
-#ifdef POLINT_LEGACY_ORDER
+!~~~~~~> Other parameters:
  integer  :: i,j,m,n
  real*8, dimension(ordn,ordn) :: yatmp
  real*8, dimension(ordn) :: ymtmp
@@ -1233,33 +1239,24 @@ end subroutine d2dump
  m=size(x1a)
  n=size(x2a)
  do i=1,m
   do j=1,n
    yqtmp=ya(i,j,:)
    call polint(x3a,yqtmp,x3,yatmp(i,j),dy,ordn)
   end do
    yntmp=yatmp(i,:)
    call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
  end do
  call polint(x1a,ymtmp,x1,y,dy,ordn)
 #else
  integer  :: j, k
  real*8, dimension(ordn,ordn) :: yatmp
  real*8, dimension(ordn) :: ymtmp
  real*8 :: dy_temp
  do k=1,ordn
    do j=1,ordn
      call polint(x1a, ya(:,j,k), x1, yatmp(j,k), dy_temp, ordn)
  end do
-  end do
+
-  do k=1,ordn
+  call polint(x1a,ymtmp,x1,y,dy,ordn)
    call polint(x2a, yatmp(:,k), x2, ymtmp(k), dy_temp, ordn)
  end do
  call polint(x3a, ymtmp, x3, y, dy, ordn)
 #endif
  return
  end subroutine polin3
 !--------------------------------------------------------------------------------------
 ! calculate L2norm  
@@ -1279,9 +1276,7 @@ end subroutine d2dump
  real*8            :: dX, dY, dZ
  integer::imin,jmin,kmin
  integer::imax,jmax,kmax
-  integer::i,j,k,n_elements
+  integer::i,j,k
  real*8, dimension(:), allocatable :: f_flat
  real*8, external :: DDOT
  dX = X(2) - X(1)
  dY = Y(2) - Y(1)
@@ -1305,12 +1300,7 @@ if(dabs(X(1)-xmin) < dX) imin = 1
 if(dabs(Y(1)-ymin) < dY) jmin = 1
 if(dabs(Z(1)-zmin) < dZ) kmin = 1
-! Optimized with oneMKL BLAS DDOT for dot product
+f_out = sum(f(imin:imax,jmin:jmax,kmin:kmax)*f(imin:imax,jmin:jmax,kmin:kmax))
 n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
 allocate(f_flat(n_elements))
 f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
 f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
 deallocate(f_flat)
 f_out = f_out*dX*dY*dZ
@@ -1335,9 +1325,7 @@ f_out = f_out*dX*dY*dZ
  real*8            :: dX, dY, dZ
  integer::imin,jmin,kmin
  integer::imax,jmax,kmax
-  integer::i,j,k,n_elements
+  integer::i,j,k
  real*8, dimension(:), allocatable :: f_flat
  real*8, external :: DDOT
  real*8 :: PIo4
@@ -1400,12 +1388,7 @@ if(Symmetry==2)then
  if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
 endif
-! Optimized with oneMKL BLAS DDOT for dot product
+f_out = sum(f(imin:imax,jmin:jmax,kmin:kmax)*f(imin:imax,jmin:jmax,kmin:kmax))
 n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
 allocate(f_flat(n_elements))
 f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
 f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
 deallocate(f_flat)
 f_out = f_out*dX*dY*dZ
@@ -1433,8 +1416,6 @@ f_out = f_out*dX*dY*dZ
  integer::imin,jmin,kmin
  integer::imax,jmax,kmax
  integer::i,j,k
  real*8, dimension(:), allocatable :: f_flat
  real*8, external :: DDOT
  real*8 :: PIo4
@@ -1497,12 +1478,11 @@ if(Symmetry==2)then
  if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
 endif
-! Optimized with oneMKL BLAS DDOT for dot product
+f_out = sum(f(imin:imax,jmin:jmax,kmin:kmax)*f(imin:imax,jmin:jmax,kmin:kmax))
 f_out = f_out
 Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
 allocate(f_flat(Nout))
 f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [Nout])
 f_out = DDOT(Nout, f_flat, 1, f_flat, 1)
 deallocate(f_flat)
  return
@@ -1700,7 +1680,6 @@ deallocate(f_flat)
  real*8, dimension(ORDN,ORDN) :: tmp2
  real*8, dimension(ORDN) :: tmp1
  real*8, dimension(3) :: SoAh
  real*8, external :: DDOT
 ! +1 because C++ gives index 0 for the first point
  cxB = inds+1  
@@ -1736,21 +1715,20 @@ deallocate(f_flat)
     ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),cxB(3):cxT(3))
  endif 
  ! Optimized with BLAS operations for better performance
  ! First dimension: z-direction weighted sum
  tmp2=0
  do m=1,ORDN
    tmp2 = tmp2 + coef(2*ORDN+m)*ya(:,:,m)
  enddo
  ! Second dimension: y-direction weighted sum
  tmp1=0
  do m=1,ORDN
    tmp1 = tmp1 + coef(ORDN+m)*tmp2(:,m)
  enddo
-  ! Third dimension: x-direction weighted sum using BLAS DDOT
+  f_int=0
-  f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
+  do m=1,ORDN
    f_int = f_int + coef(m)*tmp1(m)
  enddo
  return
@@ -1780,7 +1758,6 @@ deallocate(f_flat)
  real*8, dimension(ORDN,ORDN) :: ya
  real*8, dimension(ORDN) :: tmp1
  real*8, dimension(2) :: SoAh
  real*8, external :: DDOT
 ! +1 because C++ gives index 0 for the first point
  cxB = inds(1:2)+1  
@@ -1810,14 +1787,15 @@ deallocate(f_flat)
     ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),inds(3))
  endif 
  ! Optimized with BLAS operations
  tmp1=0
  do m=1,ORDN
    tmp1 = tmp1 + coef(ORDN+m)*ya(:,m)
  enddo
-  ! Use BLAS DDOT for final weighted sum
+  f_int=0
-  f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
+  do m=1,ORDN
    f_int = f_int + coef(m)*tmp1(m)
  enddo
  return
@@ -1848,7 +1826,6 @@ deallocate(f_flat)
  real*8, dimension(ORDN) :: ya
  real*8 :: SoAh
  integer,dimension(3) :: inds
  real*8, external :: DDOT
 ! +1 because C++ gives index 0 for the first point
  inds = indsi + 1
@@ -1909,8 +1886,10 @@ deallocate(f_flat)
          write(*,*)"error in global_interpind1d, not recognized dumyd = ",dumyd
  endif
-  ! Optimized with BLAS DDOT for weighted sum
+  f_int=0
-  f_int = DDOT(ORDN, coef, 1, ya, 1)
+  do m=1,ORDN
    f_int = f_int + coef(m)*ya(m)
  enddo
  return
@@ -2142,38 +2121,24 @@ deallocate(f_flat)
  end function fWigner_d_function
 !----------------------------------
 ! Optimized factorial function using lookup table for small N
 ! and log-gamma for large N to avoid overflow
  function ffact(N) result(gont)
  implicit none
  integer,intent(in) :: N
  real*8 :: gont
  integer :: i
-  ! Lookup table for factorials 0! to 20! (precomputed)
+  integer :: i
  real*8, parameter, dimension(0:20) :: fact_table = [ &
    1.d0, 1.d0, 2.d0, 6.d0, 24.d0, 120.d0, 720.d0, 5040.d0, 40320.d0, &
    362880.d0, 3628800.d0, 39916800.d0, 479001600.d0, 6227020800.d0, &
    87178291200.d0, 1307674368000.d0, 20922789888000.d0, &
    355687428096000.d0, 6402373705728000.d0, 121645100408832000.d0, &
    2432902008176640000.d0 ]
 ! sanity check
  if(N < 0)then
     write(*,*) "ffact: error input for factorial"
     gont = 1.d0
     return
  endif
-  ! Use lookup table for small N (fast path)
+  gont = 1.d0
-  if(N <= 20)then
+  do i=1,N
-     gont = fact_table(N)
+     gont = gont*i
-  else
+  enddo
     ! Use log-gamma function for large N: N! = exp(log_gamma(N+1))
     ! This avoids overflow and is computed efficiently
     gont = exp(log_gamma(dble(N+1)))
  endif
  return
--- a/AMSS_NCKU_source/gaussj.C
+++ b/AMSS_NCKU_source/gaussj.C
@@ -16,66 +16,115 @@ using namespace std;
 #include <string.h>
 #include <math.h>
 #endif
-
+/* Linear equation solution by Gauss-Jordan elimination.
 // Intel oneMKL LAPACK interface
 #include <mkl_lapacke.h>
 /* Linear equation solution using Intel oneMKL LAPACK.
 a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input
 containing the right-hand side vectors. On output a is
 replaced by its matrix inverse, and b is replaced by the
-corresponding set of solution vectors.
+corresponding set of solution vectors */
 Mathematical equivalence:
  Solves: A * x = b  =>  x = A^(-1) * b
  Original Gauss-Jordan and LAPACK dgesv/dgetri produce identical results
  within numerical precision. */
 int gaussj(double *a, double *b, int n)
 {
-  // Allocate pivot array and workspace
+  double swap;
  lapack_int *ipiv = new lapack_int[n];
  lapack_int info;
-  // Make a copy of matrix a for solving (dgesv modifies it to LU form)
+  int *indxc, *indxr, *ipiv;
-  double *a_copy = new double[n * n];
+  indxc = new int[n];
-  for (int i = 0; i < n * n; i++) {
+  indxr = new int[n];
-    a_copy[i] = a[i];
+  ipiv = new int[n];
  int i, icol, irow, j, k, l, ll;
  double big, dum, pivinv, temp;
  for (j = 0; j < n; j++)
    ipiv[j] = 0;
  for (i = 0; i < n; i++)
  {
    big = 0.0;
    for (j = 0; j < n; j++)
      if (ipiv[j] != 1)
        for (k = 0; k < n; k++)
        {
          if (ipiv[k] == 0)
          {
            if (fabs(a[j * n + k]) >= big)
            {
              big = fabs(a[j * n + k]);
              irow = j;
              icol = k;
            }
          }
          else if (ipiv[k] > 1)
          {
            cout << "gaussj: Singular Matrix-1" << endl;
            for (int ii = 0; ii < n; ii++)
            {
              for (int jj = 0; jj < n; jj++)
                cout << a[ii * n + jj] << " ";
              cout << endl;
            }
            return 1; // error return
          }
        }
-  // Step 1: Solve linear system A*x = b using LU decomposition
+    ipiv[icol] = ipiv[icol] + 1;
-  // LAPACKE_dgesv uses column-major by default, but we use row-major
+    if (irow != icol)
-  info = LAPACKE_dgesv(LAPACK_ROW_MAJOR, n, 1, a_copy, n, ipiv, b, 1);
+    {
-
+      for (l = 0; l < n; l++)
-  if (info != 0) {
+      {
-    cout << "gaussj: Singular Matrix (dgesv info=" << info << ")" << endl;
+        swap = a[irow * n + l];
-    delete[] ipiv;
+        a[irow * n + l] = a[icol * n + l];
-    delete[] a_copy;
+        a[icol * n + l] = swap;
    return 1;
      }
-  // Step 2: Compute matrix inverse A^(-1) using LU factorization
+      swap = b[irow];
-  // First do LU factorization of original matrix a
+      b[irow] = b[icol];
-  info = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, n, n, a, n, ipiv);
+      b[icol] = swap;
  if (info != 0) {
    cout << "gaussj: Singular Matrix (dgetrf info=" << info << ")" << endl;
    delete[] ipiv;
    delete[] a_copy;
    return 1;
    }
-  // Then compute inverse from LU factorization
+    indxr[i] = irow;
-  info = LAPACKE_dgetri(LAPACK_ROW_MAJOR, n, a, n, ipiv);
+    indxc[i] = icol;
-  if (info != 0) {
+    if (a[icol * n + icol] == 0.0)
-    cout << "gaussj: Singular Matrix (dgetri info=" << info << ")" << endl;
+    {
-    delete[] ipiv;
+      cout << "gaussj: Singular Matrix-2" << endl;
-    delete[] a_copy;
+      for (int ii = 0; ii < n; ii++)
-    return 1;
+      {
        for (int jj = 0; jj < n; jj++)
          cout << a[ii * n + jj] << " ";
        cout << endl;
      }
      return 1; // error return
    }
    pivinv = 1.0 / a[icol * n + icol];
    a[icol * n + icol] = 1.0;
    for (l = 0; l < n; l++)
      a[icol * n + l] *= pivinv;
    b[icol] *= pivinv;
    for (ll = 0; ll < n; ll++)
      if (ll != icol)
      {
        dum = a[ll * n + icol];
        a[ll * n + icol] = 0.0;
        for (l = 0; l < n; l++)
          a[ll * n + l] -= a[icol * n + l] * dum;
        b[ll] -= b[icol] * dum;
      }
  }
  for (l = n - 1; l >= 0; l--)
  {
    if (indxr[l] != indxc[l])
      for (k = 0; k < n; k++)
      {
        swap = a[k * n + indxr[l]];
        a[k * n + indxr[l]] = a[k * n + indxc[l]];
        a[k * n + indxc[l]] = swap;
      }
  }
  delete[] indxc;
  delete[] indxr;
  delete[] ipiv;
  delete[] a_copy;
  return 0;
 }
--- a/AMSS_NCKU_source/ilucg.f90
+++ b/AMSS_NCKU_source/ilucg.f90
@@ -512,10 +512,11 @@
      IMPLICIT DOUBLE PRECISION (A-H,O-Z)
      DIMENSION V(N),W(N)
 !     SUBROUTINE TO COMPUTE DOUBLE PRECISION VECTOR DOT PRODUCT.
 !     Optimized using Intel oneMKL BLAS ddot
 !     Mathematical equivalence: DGVV = sum_{i=1}^{N} V(i)*W(i)
-      DOUBLE PRECISION, EXTERNAL :: DDOT
+      SUM = 0.0D0
-      DGVV = DDOT(N, V, 1, W, 1)
+            DO 10 I = 1,N
            SUM = SUM + V(I)*W(I)
 10          CONTINUE
      DGVV = SUM
      RETURN
      END
--- a/AMSS_NCKU_source/lopsidediff.f90
+++ b/AMSS_NCKU_source/lopsidediff.f90
@@ -487,201 +487,6 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
  end subroutine lopsided
 !-----------------------------------------------------------------------------
 ! Combined advection (lopsided) + Kreiss-Oliger dissipation (kodis)
 ! Shares the symmetry_bd buffer fh, eliminating one full-grid copy per call.
 ! Mathematically identical to calling lopsided then kodis separately.
 !-----------------------------------------------------------------------------
 subroutine lopsided_kodis(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA,eps)
 ! Accumulates two contributions into f_rhs, both evaluated on one shared
 ! ghost-extended copy fh of f:
 !   1. advection terms Sfx*df/dx + Sfy*df/dy + Sfz*df/dz, using 5-point
 !      stencils whose bias is chosen from the sign of each S component
 !      and from the distance to the index bounds
 !   2. a 7-point dissipation term scaled by eps/cof, added only if eps > 0
 !
 ! Arguments:
 !   ex(1:3)     in    number of grid points along x, y, z
 !   X, Y, Z     in    coordinate arrays; the spacing is taken from the first
 !                     two entries (assumes a uniform grid - TODO confirm)
 !   f           in    field whose derivatives are taken
 !   f_rhs       inout right hand side; terms are added to the incoming value
 !   Sfx,Sfy,Sfz in    advection coefficients per direction (presumably the
 !                     shift vector components; verify against caller)
 !   Symmetry    in    symmetry mode, compared with NO_SYMM and EQ_SYMM below
 !   SoA         in    passed unchanged to symmetry_bd
 !   eps         in    dissipation strength; nothing is added when eps <= 0
  implicit none
 !~~~~~~> Input parameters:
  integer, intent(in)  :: ex(1:3),Symmetry
  real*8,  intent(in)  :: X(1:ex(1)),Y(1:ex(2)),Z(1:ex(3))
  real*8,dimension(ex(1),ex(2),ex(3)),intent(in)   :: f,Sfx,Sfy,Sfz
  real*8,dimension(ex(1),ex(2),ex(3)),intent(inout):: f_rhs
  real*8,dimension(3),intent(in) ::SoA
  real*8,intent(in) :: eps
 !~~~~~~> local variables:
 ! fh is declared with lower bound -2 in every direction, i.e. three extra
 ! layers (indices -2,-1,0) below the range of f
  real*8,dimension(-2:ex(1),-2:ex(2),-2:ex(3))   :: fh
  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
  real*8 :: dX,dY,dZ
  real*8 :: d12dx,d12dy,d12dz,d2dx,d2dy,d2dz
  real*8,  parameter :: ZEO=0.d0,ONE=1.d0, F3=3.d0
  real*8,  parameter :: TWO=2.d0,F6=6.0d0,F18=1.8d1
  real*8,  parameter :: F12=1.2d1, F10=1.d1,EIT=8.d0
  integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
 ! coefficients of the 7-point dissipation stencil (1,-6,15,-20,15,-6,1)
 ! and its normalisation
  real*8, parameter :: SIX=6.d0,FIT=1.5d1,TWT=2.d1
  real*8, parameter :: cof=6.4d1   ! 2^6
 ! grid spacing from the first two coordinates in each direction
  dX = X(2)-X(1)
  dY = Y(2)-Y(1)
  dZ = Z(2)-Z(1)
 ! 1/(12*h) prefactors used by all advection stencils
  d12dx = ONE/F12/dX
  d12dy = ONE/F12/dY
  d12dz = ONE/F12/dZ
 ! 1/(2*h) prefactors; assigned here but not referenced again in this routine
  d2dx = ONE/TWO/dX
  d2dy = ONE/TWO/dY
  d2dz = ONE/TWO/dZ
 ! default index bounds: the range of f itself
  imax = ex(1)
  jmax = ex(2)
  kmax = ex(3)
  imin = 1
  jmin = 1
  kmin = 1
 ! When a symmetry is active and the first coordinate lies within one grid
 ! spacing of zero, the lower bound is moved to -2 so that stencils may read
 ! the extra layers of fh. Z is extended for any Symmetry > NO_SYMM, X and Y
 ! only for Symmetry > EQ_SYMM.
  if(Symmetry > NO_SYMM .and. dabs(Z(1)) < dZ) kmin = -2
  if(Symmetry > EQ_SYMM .and. dabs(X(1)) < dX) imin = -2
  if(Symmetry > EQ_SYMM .and. dabs(Y(1)) < dY) jmin = -2
 ! Single symmetry_bd call shared by both advection and dissipation
 ! (presumably fills fh, including the extra layers, from f using SoA;
 ! verify in symmetry_bd)
  call symmetry_bd(3,ex,f,fh,SoA)
 ! ---- Advection (lopsided) loop ----
 ! The loops stop at ex-1, so any point with an index equal to ex receives
 ! no advection update here.
 ! NOTE(review): the original note says the ex-1 bound is only for efficiency
 ! because the body would add nothing at ex. The guards confirm this for the
 ! positive-S branches at the upper bound, but the negative-S branches could
 ! still fire there - confirm the intent.
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
 ! x direction
 ! S > 0: use the stencil biased toward +x (points i-1..i+3) when it fits,
 !        else the centered stencil (i-2..i+2), else the stencil biased
 !        toward -x (i-3..i+1).
 ! S < 0: mirror image, with the fallbacks triggered near imin.
 ! S = 0: no contribution.
    if(Sfx(i,j,k) > ZEO)then
      if(i+3 <= imax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfx(i,j,k)*d12dx*(-F3*fh(i-1,j,k)-F10*fh(i,j,k)+F18*fh(i+1,j,k) &
                                    -F6*fh(i+2,j,k)+    fh(i+3,j,k))
     elseif(i+2 <= imax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfx(i,j,k)*d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
     elseif(i+1 <= imax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfx(i,j,k)*d12dx*(-F3*fh(i+1,j,k)-F10*fh(i,j,k)+F18*fh(i-1,j,k) &
                                    -F6*fh(i-2,j,k)+    fh(i-3,j,k))
     endif
   elseif(Sfx(i,j,k) < ZEO)then
      if(i-3 >= imin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfx(i,j,k)*d12dx*(-F3*fh(i+1,j,k)-F10*fh(i,j,k)+F18*fh(i-1,j,k) &
                                    -F6*fh(i-2,j,k)+    fh(i-3,j,k))
     elseif(i-2 >= imin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfx(i,j,k)*d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
     elseif(i-1 >= imin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfx(i,j,k)*d12dx*(-F3*fh(i-1,j,k)-F10*fh(i,j,k)+F18*fh(i+1,j,k) &
                                    -F6*fh(i+2,j,k)+    fh(i+3,j,k))
     endif
   endif
 ! y direction: same stencil selection as for x, applied along j with
 ! bounds jmin and jmax
    if(Sfy(i,j,k) > ZEO)then
      if(j+3 <= jmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfy(i,j,k)*d12dy*(-F3*fh(i,j-1,k)-F10*fh(i,j,k)+F18*fh(i,j+1,k) &
                                    -F6*fh(i,j+2,k)+    fh(i,j+3,k))
     elseif(j+2 <= jmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfy(i,j,k)*d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
     elseif(j+1 <= jmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfy(i,j,k)*d12dy*(-F3*fh(i,j+1,k)-F10*fh(i,j,k)+F18*fh(i,j-1,k) &
                                    -F6*fh(i,j-2,k)+    fh(i,j-3,k))
     endif
   elseif(Sfy(i,j,k) < ZEO)then
      if(j-3 >= jmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfy(i,j,k)*d12dy*(-F3*fh(i,j+1,k)-F10*fh(i,j,k)+F18*fh(i,j-1,k) &
                                    -F6*fh(i,j-2,k)+    fh(i,j-3,k))
     elseif(j-2 >= jmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfy(i,j,k)*d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
     elseif(j-1 >= jmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfy(i,j,k)*d12dy*(-F3*fh(i,j-1,k)-F10*fh(i,j,k)+F18*fh(i,j+1,k) &
                                    -F6*fh(i,j+2,k)+    fh(i,j+3,k))
     endif
   endif
 ! z direction: same stencil selection as for x, applied along k with
 ! bounds kmin and kmax
    if(Sfz(i,j,k) > ZEO)then
      if(k+3 <= kmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k-1)-F10*fh(i,j,k)+F18*fh(i,j,k+1) &
                                    -F6*fh(i,j,k+2)+    fh(i,j,k+3))
     elseif(k+2 <= kmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfz(i,j,k)*d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
     elseif(k+1 <= kmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k+1)-F10*fh(i,j,k)+F18*fh(i,j,k-1) &
                                    -F6*fh(i,j,k-2)+    fh(i,j,k-3))
     endif
   elseif(Sfz(i,j,k) < ZEO)then
      if(k-3 >= kmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k+1)-F10*fh(i,j,k)+F18*fh(i,j,k-1) &
                                    -F6*fh(i,j,k-2)+    fh(i,j,k-3))
     elseif(k-2 >= kmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfz(i,j,k)*d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
     elseif(k-1 >= kmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k-1)-F10*fh(i,j,k)+F18*fh(i,j,k+1) &
                                    -F6*fh(i,j,k+2)+    fh(i,j,k+3))
     endif
   endif
  enddo
  enddo
  enddo
 ! ---- Dissipation (kodis) loop ----
 ! Skipped entirely unless eps is strictly positive. The loops run over the
 ! full range 1..ex, but a point is updated only if the whole 7-point stencil
 ! (offsets -3..+3) lies inside the index bounds in all three directions.
 ! Each directional stencil sum is divided by the corresponding grid spacing
 ! and the total is scaled by eps/cof.
  if(eps > ZEO) then
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
  if(i-3 >= imin .and. i+3 <= imax .and. &
     j-3 >= jmin .and. j+3 <= jmax .and. &
     k-3 >= kmin .and. k+3 <= kmax) then
   f_rhs(i,j,k)       = f_rhs(i,j,k) + eps/cof *( (     &
                              (fh(i-3,j,k)+fh(i+3,j,k)) - &
                          SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + &
                          FIT*(fh(i-1,j,k)+fh(i+1,j,k)) - &
                          TWT* fh(i,j,k)            )/dX + &
                                                  (     &
                              (fh(i,j-3,k)+fh(i,j+3,k)) - &
                          SIX*(fh(i,j-2,k)+fh(i,j+2,k)) + &
                          FIT*(fh(i,j-1,k)+fh(i,j+1,k)) - &
                          TWT* fh(i,j,k)            )/dY + &
                                                  (     &
                              (fh(i,j,k-3)+fh(i,j,k+3)) - &
                          SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + &
                          FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - &
                          TWT* fh(i,j,k)            )/dZ )
  endif
  enddo
  enddo
  enddo
  endif
  return
  end subroutine lopsided_kodis
 #elif (ghost_width == 4)
 ! sixth order code
 ! Compute advection terms in right hand sides of field equations
--- a/AMSS_NCKU_source/macrodef.h
+++ b/AMSS_NCKU_source/macrodef.h
@@ -2,7 +2,7 @@
 #ifndef MICRODEF_H
 #define MICRODEF_H
-#include "macrodef.fh"
+#include "microdef.fh"
 // application parameters
--- a/AMSS_NCKU_source/makefile
+++ b/AMSS_NCKU_source/makefile
@@ -16,12 +16,6 @@ include makefile.inc
 .cu.o:
 	$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
 TwoPunctures.o: TwoPunctures.C
 	${CXX} $(CXXAPPFLAGS) -qopenmp -c $< -o $@
 TwoPunctureABE.o: TwoPunctureABE.C
 	${CXX} $(CXXAPPFLAGS) -qopenmp -c $< -o $@
 # Input files
 C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
           cgh.o bssn_class.o surface_integral.o ShellPatch.o\
@@ -102,7 +96,7 @@ ABEGPU: $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
 	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
 TwoPunctureABE: $(TwoPunctureFILES)
-	$(CLINKER) $(CXXAPPFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)
+	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(TwoPunctureFILES) $(LDLIBS)
 clean:
 	rm *.o ABE ABEGPU TwoPunctureABE make.log -f
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -1,31 +1,22 @@
 ## GCC version (commented out)
 ## filein  = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
 ## filein  = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
 ## LDLIBS  = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
-## Intel oneAPI version with oneMKL (Optimized for performance)
+filein  = -I/usr/include -I/usr/include/openmpi-x86_64 -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
 filein  = -I/usr/include/ -I${MKLROOT}/include
-## Using sequential MKL (OpenMP disabled for better single-threaded performance)
+##filein  = -I/usr/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/ -I/usr/lib/cuda/include
 ## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
 LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl
-## Aggressive optimization flags:
+LDLIBS  = -L/usr/lib64/openmpi/lib -Wl,-rpath,/usr/lib64/openmpi/lib -lmpi -lgfortran -L/usr/local/cuda-13.1/lib64 -Wl,-rpath,/usr/local/cuda-13.1/lib64 -lcudart -lcuda 
-## -O3: Maximum optimization
+##LDLIBS  = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -L/usr/lib/cuda/lib64 -lcudart -lmpi -lgfortran
 ## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible)
 ## -fp-model fast=2: Aggressive floating-point optimizations
 ## -fma: Enable fused multiply-add instructions
 CXXAPPFLAGS  = -O3 -xHost -fp-model fast=2 -fma -ipo \
               -Dfortran3 -Dnewc -I${MKLROOT}/include
 f90appflags  = -O3 -xHost -fp-model fast=2 -fma -ipo \
               -align array64byte -fpp -I${MKLROOT}/include
 f90          = ifx
 f77          = ifx
 CXX          = icpx
 CC           = icx
 CLINKER      = mpiicpx 
-Cu = nvcc
+CXXAPPFLAGS  = -O3 -Wno-deprecated -Dfortran3 -Dnewc
-CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
+#f90appflags = -O3 -fpp
 f90appflags  = -O3 -x f95-cpp-input
 f90          = gfortran 
 f77          = gfortran 
 CXX          = g++
 CC           = gcc
 CLINKER      = mpic++ 
 Cu = /usr/local/cuda-13.1/bin/nvcc
 CUDA_LIB_PATH = -L/usr/local/cuda-13.1/lib64 -I/usr/include -I/usr/local/cuda-13.1/include
 #CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
-CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc
+# RTX 4050 uses Ada Lovelace architecture (compute capability 8.9)
 CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch=sm_89 -Dfortran3 -Dnewc
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -10,17 +10,6 @@
 import AMSS_NCKU_Input as input_data
 import subprocess
 import time
 ## CPU core binding configuration using taskset.
 ## taskset sets a CPU affinity mask that all child processes inherit, so make
 ## and every compiler process it spawns stay on the listed cores.
 ## To restrict them to the nohz_full cores (4-55, 60-111) only, use: taskset -c 4-55,60-111
 ## NOTE: the mask set below is 0-111, which also includes cores 0-3 and 56-59.
 NUMACTL_CPU_BIND = "taskset -c 0-111"
 ## Build parallelism configuration
 ## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores
 ## Set make -j to utilize available cores for faster builds
 BUILD_JOBS = 104
 ##################################################################
@@ -37,11 +26,11 @@ def makefile_ABE():
    print( " Compiling the AMSS-NCKU executable file ABE/ABEGPU " ) 
    print(                                                        )
-    ## Build command with CPU binding to nohz_full cores
+    ## Build command
    if (input_data.GPU_Calculation == "no"):
-        makefile_command  = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABE"
+        makefile_command  = "make -j4" + " ABE"
    elif (input_data.GPU_Calculation == "yes"):
-        makefile_command  = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU"
+        makefile_command  = "make -j4" + " ABEGPU"
    else:
        print( " CPU/GPU numerical calculation setting is wrong " )
        print(                                                    )
@@ -78,8 +67,8 @@ def makefile_TwoPunctureABE():
    print( " Compiling the AMSS-NCKU executable file TwoPunctureABE " )
    print(                                                            )
-    ## Build command with CPU binding to nohz_full cores
+    ## Build command
-    makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} TwoPunctureABE"
+    makefile_command = "make" + " TwoPunctureABE"
    ## Execute the command with subprocess.Popen and stream output
    makefile_process = subprocess.Popen(makefile_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) 
@@ -116,10 +105,10 @@ def run_ABE():
    ## Define the command to run; cast other values to strings as needed
    if (input_data.GPU_Calculation == "no"):
-        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
+        mpi_command         = "mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
        mpi_command_outfile = "ABE_out.log"
    elif (input_data.GPU_Calculation == "yes"):
-        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
+        mpi_command         = "mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
        mpi_command_outfile = "ABEGPU_out.log"
    ## Execute the MPI command and stream output
@@ -152,13 +141,13 @@ def run_ABE():
 ## Run the AMSS-NCKU TwoPuncture program TwoPunctureABE
 def run_TwoPunctureABE():
-    tp_time1=time.time()
+
    print(                                                          )
    print( " Running the AMSS-NCKU executable file TwoPunctureABE " ) 
    print(                                                          )
    ## Define the command to run
-    TwoPuncture_command         = NUMACTL_CPU_BIND + " ./TwoPunctureABE"
+    TwoPuncture_command         = "./TwoPunctureABE"
    TwoPuncture_command_outfile = "TwoPunctureABE_out.log"
    ## Execute the command with subprocess.Popen and stream output
@@ -179,9 +168,7 @@ def run_TwoPunctureABE():
    print(                                               )
    print( " The TwoPunctureABE simulation is finished " ) 
    print(                                               )
-    tp_time2=time.time()
+    
    et=tp_time2-tp_time1
    print(f"Used time: {et}")
    return
 ##################################################################
--- a/parallel_plot_helper.py
+++ b/parallel_plot_helper.py
@@ -1,29 +0,0 @@
 import multiprocessing
 def run_plot_task(task):
    """Unpack one plotting job and invoke it.

    Parameters
    ----------
    task : tuple
        Two-element sequence ``(function, args_tuple)``; ``function`` is
        called with the items of ``args_tuple`` as positional arguments.

    Returns
    -------
    object
        Whatever ``function(*args_tuple)`` returns.
    """
    plotter, positional = task
    outcome = plotter(*positional)
    return outcome
 def run_plot_tasks_parallel(plot_tasks):
    """Run a batch of independent plotting jobs on a process pool.

    The pool is created from the 'fork' multiprocessing context, so the
    workers are forked from the current process and the main script is
    NOT re-imported/re-executed in them.

    Parameters
    ----------
    plot_tasks : list of tuples
        Each element is (function, args_tuple), as accepted by
        ``run_plot_task``.
    """
    fork_context = multiprocessing.get_context('fork')
    workers = fork_context.Pool()
    ## The context manager shuts the pool down once every job has been mapped.
    with workers:
        workers.map(run_plot_task, plot_tasks)
--- a/plot_GW_strain_amplitude_xiaoqu.py
+++ b/plot_GW_strain_amplitude_xiaoqu.py
@@ -11,8 +11,6 @@
 import numpy                               ## numpy for array operations
 import scipy                               ## scipy for interpolation and signal processing
 import math
 import matplotlib
 matplotlib.use('Agg')                      ## use non-interactive backend for multiprocessing safety
 import matplotlib.pyplot    as     plt     ## matplotlib for plotting
 import os                                  ## os for system/file operations
--- a/plot_binary_data.py
+++ b/plot_binary_data.py
@@ -8,23 +8,16 @@
 ##
 #################################################
 ## Restrict OpenMP to one thread per process so that running
 ## many workers in parallel does not create an O(workers * BLAS_threads)
 ## thread explosion.  The variable MUST be set before numpy/scipy
 ## are imported, because the BLAS library reads it only at load time.
 import os
 os.environ.setdefault("OMP_NUM_THREADS",        "1")
 import numpy
 import scipy
 import matplotlib
 matplotlib.use('Agg')                      ## use non-interactive backend for multiprocessing safety
 import matplotlib.pyplot    as     plt
 from   matplotlib.colors    import LogNorm
 from   mpl_toolkits.mplot3d import Axes3D
 ## import torch
 import AMSS_NCKU_Input      as input_data
 import os
 #########################################################################################
@@ -199,19 +192,3 @@ def get_data_xy( Rmin, Rmax, n, data0, time, figure_title, figure_outdir ):
 ####################################################################################
 ####################################################################################
 ## Allow this module to be run as a standalone script so that each
 ## binary-data plot can be executed in a fresh subprocess whose BLAS
 ## environment variables (set above) take effect before numpy loads.
 ##
 ## Usage:  python3 plot_binary_data.py <filename> <binary_outdir> <figure_outdir>
 ####################################################################################
 if __name__ == '__main__':
    ## Command-line entry point: runs only when this file is executed as a script.
    ## sys is imported locally for argv and exit.
    import sys
    ## Exactly three arguments are required after the script name;
    ## otherwise print the usage line and exit with status 1.
    if len(sys.argv) != 4:
        print(f"Usage: {sys.argv[0]} <filename> <binary_outdir> <figure_outdir>")
        sys.exit(1)
    ## Arguments are forwarded positionally: filename, binary_outdir, figure_outdir.
    plot_binary_data(sys.argv[1], sys.argv[2], sys.argv[3])
--- a/plot_xiaoqu.py
+++ b/plot_xiaoqu.py
@@ -8,8 +8,6 @@
 #################################################
 import numpy                               ## numpy for array operations
 import matplotlib
 matplotlib.use('Agg')                      ## use non-interactive backend for multiprocessing safety
 import matplotlib.pyplot    as     plt     ## matplotlib for plotting
 from   mpl_toolkits.mplot3d import Axes3D  ## needed for 3D plots
 import glob
@@ -17,9 +15,6 @@ import os                                  ## operating system utilities
 import plot_binary_data
 import AMSS_NCKU_Input as input_data
 import subprocess
 import sys
 import multiprocessing
 # plt.rcParams['text.usetex'] = True  ## enable LaTeX fonts in plots
@@ -55,40 +50,10 @@ def generate_binary_data_plot( binary_outdir, figure_outdir ):
        file_list.append(x)
        print(x)
-    ## Plot each file in parallel using subprocesses.
+    ## Plot each file in the list
    ## Each subprocess is a fresh Python process where the BLAS thread-count
    ## environment variables (set at the top of plot_binary_data.py) take
    ## effect before numpy is imported.  This avoids the thread explosion
    ## that occurs when multiprocessing.Pool with 'fork' context inherits
    ## already-initialized multi-threaded BLAS from the parent.
    script = os.path.join( os.path.dirname(__file__), "plot_binary_data.py" )
    max_workers = min( multiprocessing.cpu_count(), len(file_list) ) if file_list else 0
    running = []
    failed  = []
    for filename in file_list:
        print(filename)
-        proc = subprocess.Popen(
+        plot_binary_data.plot_binary_data(filename, binary_outdir, figure_outdir)
            [sys.executable, script, filename, binary_outdir, figure_outdir],
        )
        running.append( (proc, filename) )
        ## Keep at most max_workers subprocesses active at a time
        if len(running) >= max_workers:
            p, fn = running.pop(0)
            p.wait()
            if p.returncode != 0:
                failed.append(fn)
    ## Wait for all remaining subprocesses to finish
    for p, fn in running:
        p.wait()
        if p.returncode != 0:
            failed.append(fn)
    if failed:
        print( " WARNING: the following binary data plots failed:" )
        for fn in failed:
            print( "   ", fn )
    print(                        )
    print( " Binary Data Plot Has been Finished " )
Author	SHA1	Message	Date
CGH0S7	75be0968fc	feat: port GPU code to CUDA 13 and enable GPU computation Major changes: - Update makefile.inc for CUDA 13.1 with sm_89 architecture (RTX 4050) - Replace deprecated cudaThreadSynchronize() with cudaDeviceSynchronize() - Add CUDA_SAFE_CALL macro for CUDA 13 compatibility - Fix duplicate function definitions (compare_result_gpu, SHStep) - Fix syntax error in bssn_step_gpu.C - Enable GPU calculation in AMSS_NCKU_Input.py - Successfully build ABEGPU executable	2026-01-13 18:15:49 +00:00
CGH0S7	b27e071cde	Makefile updated for rocky10	2026-01-14 01:41:31 +08:00
CGH0S7	a1125d4c79	try to build gpu version	2026-01-13 23:52:44 +08:00
CGH0S7	dcc66588fc	gitignore updated	2026-01-13 23:45:49 +08:00
CGH0S7	950d448edf	fix(build): update LDLIBS to use -lmpi and remove hardcoded paths	2026-01-13 23:40:51 +08:00