Fix potential division by zero in reta_val calculation and enable NaN checks

Added a safety check for the denominator in the reta_val calculation to prevent division by zero when chi approaches zero (e.g., at far-field boundaries). Also enabled DEBUG_NAN_CHECK macro to catch invalid inputs early. Initialized output arrays to zero to prevent uninitialized memory access.
Fix boundary handling in bssn_rhs_opt.f90 to prevent NaNs
2026-01-19 20:29:48 +08:00 · 2026-01-19 20:03:22 +08:00 · 2026-01-19 19:22:52 +08:00 · 2026-01-19 17:14:28 +08:00 · 2026-01-19 16:39:24 +08:00
30 changed files with 3534 additions and 4278 deletions
--- a/AMSS_NCKU_Input.py
+++ b/AMSS_NCKU_Input.py
@@ -16,7 +16,7 @@ import numpy
 File_directory   = "GW150914"                    ## output file directory
 Output_directory = "binary_output"               ## binary data file directory
                                                 ## The file directory name should not be too long
-MPI_processes    = 64                             ## number of mpi processes used in the simulation
+MPI_processes    = 48                             ## number of mpi processes used in the simulation
 GPU_Calculation  = "no"                          ## Use GPU or not 
                                                 ## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
--- a/AMSS_NCKU_Program.py
+++ b/AMSS_NCKU_Program.py
@@ -8,14 +8,6 @@
 ##
 ##################################################################
 ## Guard against re-execution by multiprocessing child processes.
 ## Without this, using 'spawn' or 'forkserver' context would cause every
 ## worker to re-run the entire script, spawning exponentially more
 ## workers (fork bomb).
 if __name__ != '__main__':
    import sys as _sys
    _sys.exit(0)
 ##################################################################
@@ -432,31 +424,26 @@ print(
 import plot_xiaoqu
 import plot_GW_strain_amplitude_xiaoqu
 from parallel_plot_helper import run_plot_tasks_parallel
 plot_tasks = []
 ## Plot black hole trajectory
-plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot,   (binary_results_directory, figure_directory) ) )
+plot_xiaoqu.generate_puncture_orbit_plot(   binary_results_directory, figure_directory )
-plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot3D, (binary_results_directory, figure_directory) ) )
+plot_xiaoqu.generate_puncture_orbit_plot3D( binary_results_directory, figure_directory )
 ## Plot black hole separation vs. time
-plot_tasks.append( ( plot_xiaoqu.generate_puncture_distence_plot, (binary_results_directory, figure_directory) ) )
+plot_xiaoqu.generate_puncture_distence_plot( binary_results_directory, figure_directory )
 ## Plot gravitational waveforms (psi4 and strain amplitude)
 for i in range(input_data.Detector_Number):
-    plot_tasks.append( ( plot_xiaoqu.generate_gravitational_wave_psi4_plot, (binary_results_directory, figure_directory, i) ) )
+    plot_xiaoqu.generate_gravitational_wave_psi4_plot( binary_results_directory, figure_directory, i )
-    plot_tasks.append( ( plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot, (binary_results_directory, figure_directory, i) ) )
+    plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot( binary_results_directory, figure_directory, i )
 ## Plot ADM mass evolution
 for i in range(input_data.Detector_Number):
-    plot_tasks.append( ( plot_xiaoqu.generate_ADMmass_plot, (binary_results_directory, figure_directory, i) ) )
+    plot_xiaoqu.generate_ADMmass_plot( binary_results_directory, figure_directory, i )
 ## Plot Hamiltonian constraint violation over time
 for i in range(input_data.grid_level):
-    plot_tasks.append( ( plot_xiaoqu.generate_constraint_check_plot, (binary_results_directory, figure_directory, i) ) )
+    plot_xiaoqu.generate_constraint_check_plot( binary_results_directory, figure_directory, i )
 run_plot_tasks_parallel(plot_tasks)
 ## Plot stored binary data
 plot_xiaoqu.generate_binary_data_plot( binary_results_directory, figure_directory )
--- a/AMSS_NCKU_source/MPatch.C
+++ b/AMSS_NCKU_source/MPatch.C
@@ -341,9 +341,8 @@ void Patch::Interp_Points(MyList<var> *VarList,
                          double *Shellf, int Symmetry)
 {
  // NOTE: we do not synchronize variables here; make sure they are synchronized before calling this routine
-  int myrank, nprocs;
+  int myrank;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
  int ordn = 2 * ghost_width;
  MyList<var> *varl;
@@ -355,18 +354,24 @@ void Patch::Interp_Points(MyList<var> *VarList,
    varl = varl->next;
  }
-  memset(Shellf, 0, sizeof(double) * NN * num_var);
+  double *shellf;
  shellf = new double[NN * num_var];
  memset(shellf, 0, sizeof(double) * NN * num_var);
-  // owner_rank[j] records which MPI rank owns point j
+  // we use weight to monitor code, later some day we can move it for optimization
-  // All ranks traverse the same block list so they all agree on ownership
+  int *weight;
-  int *owner_rank;
+  weight = new int[NN];
-  owner_rank = new int[NN];
+  memset(weight, 0, sizeof(int) * NN);
-  for (int j = 0; j < NN; j++)
+
-    owner_rank[j] = -1;
+  double *DH, *llb, *uub;
  DH = new double[dim];
  double DH[dim], llb[dim], uub[dim];
  for (int i = 0; i < dim; i++)
  {
    DH[i] = getdX(i);
  }
  llb = new double[dim];
  uub = new double[dim];
  for (int j = 0; j < NN; j++) // run along points
  {
@@ -398,6 +403,12 @@ void Patch::Interp_Points(MyList<var> *VarList,
      bool flag = true;
      for (int i = 0; i < dim; i++)
      {
 // NOTE: our dividing structure is (exclude ghost)
 // -1 0
 //       1  2
 // so (0,1) does not belong to any part for vertex structure
 // here we put (0,0.5) to left part and (0.5,1) to right part
 // BUT for cell structure the bbox is (-1.5,0.5) and (0.5,2.5), there is no missing region at all
 #ifdef Vertex
 #ifdef Cell
 #error Both Cell and Vertex are defined
@@ -422,7 +433,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
      if (flag)
      {
        notfind = false;
        owner_rank[j] = BP->rank;
        if (myrank == BP->rank)
        {
          //---> interpolation
@@ -430,11 +440,14 @@ void Patch::Interp_Points(MyList<var> *VarList,
          int k = 0;
          while (varl) // run along variables
          {
-            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
+            //              shellf[j*num_var+k] = Parallel::global_interp(dim,BP->shape,BP->X,BP->fgfs[varl->data->sgfn],
            //	  		                                    pox,ordn,varl->data->SoA,Symmetry);
            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[j * num_var + k],
                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
            varl = varl->next;
            k++;
          }
          weight[j] = 1;
        }
      }
      if (Bp == ble)
@@ -443,327 +456,103 @@ void Patch::Interp_Points(MyList<var> *VarList,
    }
  }
-  // Replace MPI_Allreduce with per-owner MPI_Bcast:
+  MPI_Allreduce(shellf, Shellf, NN * num_var, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-  // Group consecutive points by owner rank and broadcast each group.
+  int *Weight;
-  // Since each point's data is non-zero only on the owner rank,
+  Weight = new int[NN];
-  // Bcast from owner is equivalent to Allreduce(MPI_SUM) but much cheaper.
+  MPI_Allreduce(weight, Weight, NN, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  //  misc::tillherecheck("print me");
  for (int i = 0; i < NN; i++)
  {
-    int j = 0;
+    if (Weight[i] > 1)
    while (j < NN)
    {
-      int cur_owner = owner_rank[j];
+      if (myrank == 0)
-      if (cur_owner < 0)
+        cout << "WARNING: Patch::Interp_Points meets multiple weight" << endl;
-      {
+      for (int j = 0; j < num_var; j++)
-        if (myrank == 0)
+        Shellf[j + i * num_var] = Shellf[j + i * num_var] / Weight[i];
        {
          cout << "ERROR: Patch::Interp_Points fails to find point (";
          for (int d = 0; d < dim; d++)
          {
            cout << XX[d][j];
            if (d < dim - 1)
              cout << ",";
            else
              cout << ")";
          }
          cout << " on Patch (";
          for (int d = 0; d < dim; d++)
          {
            cout << bbox[d] << "+" << lli[d] * DH[d];
            if (d < dim - 1)
              cout << ",";
            else
              cout << ")--";
          }
          cout << "(";
          for (int d = 0; d < dim; d++)
          {
            cout << bbox[dim + d] << "-" << uui[d] * DH[d];
            if (d < dim - 1)
              cout << ",";
            else
              cout << ")" << endl;
          }
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        j++;
        continue;
      }
      // Find contiguous run of points with the same owner
      int jstart = j;
      while (j < NN && owner_rank[j] == cur_owner)
        j++;
      int count = (j - jstart) * num_var;
      MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner, MPI_COMM_WORLD);
    }
-  }
+    else if (Weight[i] == 0 && myrank == 0)
  delete[] owner_rank;
 }
 void Patch::Interp_Points(MyList<var> *VarList,
                          int NN, double **XX,
                          double *Shellf, int Symmetry,
                          int Nmin_consumer, int Nmax_consumer)
 {
  // Targeted point-to-point overload: each owner sends each point only to
  // the one rank that needs it for integration (consumer), reducing
  // communication volume by ~nprocs times compared to the Bcast version.
  int myrank, nprocs;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
  int ordn = 2 * ghost_width;
  MyList<var> *varl;
  int num_var = 0;
  varl = VarList;
  while (varl)
  {
    num_var++;
    varl = varl->next;
  }
  memset(Shellf, 0, sizeof(double) * NN * num_var);
  // owner_rank[j] records which MPI rank owns point j
  int *owner_rank;
  owner_rank = new int[NN];
  for (int j = 0; j < NN; j++)
    owner_rank[j] = -1;
  double DH[dim], llb[dim], uub[dim];
  for (int i = 0; i < dim; i++)
    DH[i] = getdX(i);
  // --- Interpolation phase (identical to original) ---
  for (int j = 0; j < NN; j++)
  {
    double pox[dim];
    for (int i = 0; i < dim; i++)
    {
      pox[i] = XX[i][j];
      if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
      {
        cout << "Patch::Interp_Points: point (";
        for (int k = 0; k < dim; k++)
        {
          cout << XX[k][j];
          if (k < dim - 1)
            cout << ",";
          else
            cout << ") is out of current Patch." << endl;
        }
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
    MyList<Block> *Bp = blb;
    bool notfind = true;
    while (notfind && Bp)
    {
      Block *BP = Bp->data;
      bool flag = true;
      for (int i = 0; i < dim; i++)
      {
 #ifdef Vertex
 #ifdef Cell
 #error Both Cell and Vertex are defined
 #endif
        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
 #else
 #ifdef Cell
        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
 #else
 #error Not define Vertex nor Cell
 #endif
 #endif
        if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
        {
          flag = false;
          break;
        }
      }
      if (flag)
      {
        notfind = false;
        owner_rank[j] = BP->rank;
        if (myrank == BP->rank)
        {
          varl = VarList;
          int k = 0;
          while (varl)
          {
            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
            varl = varl->next;
            k++;
          }
        }
      }
      if (Bp == ble)
        break;
      Bp = Bp->next;
    }
  }
  // --- Error check for unfound points ---
  for (int j = 0; j < NN; j++)
  {
    if (owner_rank[j] < 0 && myrank == 0)
    {
      cout << "ERROR: Patch::Interp_Points fails to find point (";
-      for (int d = 0; d < dim; d++)
+      for (int j = 0; j < dim; j++)
      {
-        cout << XX[d][j];
+        cout << XX[j][i];
-        if (d < dim - 1)
+        if (j < dim - 1)
          cout << ",";
        else
          cout << ")";
      }
      cout << " on Patch (";
-      for (int d = 0; d < dim; d++)
+      for (int j = 0; j < dim; j++)
      {
-        cout << bbox[d] << "+" << lli[d] * DH[d];
+        cout << bbox[j] << "+" << lli[j] * getdX(j);
-        if (d < dim - 1)
+        if (j < dim - 1)
          cout << ",";
        else
          cout << ")--";
      }
      cout << "(";
-      for (int d = 0; d < dim; d++)
+      for (int j = 0; j < dim; j++)
      {
-        cout << bbox[dim + d] << "-" << uui[d] * DH[d];
+        cout << bbox[dim + j] << "-" << uui[j] * getdX(j);
-        if (d < dim - 1)
+        if (j < dim - 1)
          cout << ",";
        else
          cout << ")" << endl;
      }
 #if 0
       checkBlock();
 #else
      cout << "splited domains:" << endl;
      {
        MyList<Block> *Bp = blb;
        while (Bp)
        {
          Block *BP = Bp->data;
          for (int i = 0; i < dim; i++)
          {
 #ifdef Vertex
 #ifdef Cell
 #error Both Cell and Vertex are defined
 #endif
            llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
            uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
 #else
 #ifdef Cell
            llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
            uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
 #else
 #error Not define Vertex nor Cell
 #endif
 #endif
          }
          cout << "(";
          for (int j = 0; j < dim; j++)
          {
            cout << llb[j] << ":" << uub[j];
            if (j < dim - 1)
              cout << ",";
            else
              cout << ")" << endl;
          }
          if (Bp == ble)
            break;
          Bp = Bp->next;
        }
      }
 #endif
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
-  // --- Targeted point-to-point communication phase ---
+  delete[] shellf;
-  // Compute consumer_rank[j] using the same deterministic formula as surface_integral
+  delete[] weight;
-  int *consumer_rank = new int[NN];
+  delete[] Weight;
-  {
+  delete[] DH;
-    int mp = NN / nprocs;
+  delete[] llb;
-    int Lp = NN - nprocs * mp;
+  delete[] uub;
    for (int j = 0; j < NN; j++)
    {
      if (j < Lp * (mp + 1))
        consumer_rank[j] = j / (mp + 1);
      else
        consumer_rank[j] = Lp + (j - Lp * (mp + 1)) / mp;
    }
  }
  // Count sends and recvs per rank
  int *send_count = new int[nprocs];
  int *recv_count = new int[nprocs];
  memset(send_count, 0, sizeof(int) * nprocs);
  memset(recv_count, 0, sizeof(int) * nprocs);
  for (int j = 0; j < NN; j++)
  {
    int own = owner_rank[j];
    int con = consumer_rank[j];
    if (own == con)
      continue; // local — no communication needed
    if (own == myrank)
      send_count[con]++;
    if (con == myrank)
      recv_count[own]++;
  }
  // Build send buffers: for each destination rank, pack (index, data) pairs
  // Each entry: 1 int (point index j) + num_var doubles
  int total_send = 0, total_recv = 0;
  int *send_offset = new int[nprocs];
  int *recv_offset = new int[nprocs];
  for (int r = 0; r < nprocs; r++)
  {
    send_offset[r] = total_send;
    total_send += send_count[r];
    recv_offset[r] = total_recv;
    total_recv += recv_count[r];
  }
  // Pack send buffers: each message contains (j, data[0..num_var-1]) per point
  int stride = 1 + num_var; // 1 double for index + num_var doubles for data
  double *sendbuf = new double[total_send * stride];
  double *recvbuf = new double[total_recv * stride];
  // Temporary counters for packing
  int *pack_pos = new int[nprocs];
  memset(pack_pos, 0, sizeof(int) * nprocs);
  for (int j = 0; j < NN; j++)
  {
    int own = owner_rank[j];
    int con = consumer_rank[j];
    if (own != myrank || con == myrank)
      continue;
    int pos = (send_offset[con] + pack_pos[con]) * stride;
    sendbuf[pos] = (double)j; // point index
    for (int v = 0; v < num_var; v++)
      sendbuf[pos + 1 + v] = Shellf[j * num_var + v];
    pack_pos[con]++;
  }
  // Post non-blocking recvs and sends
  int n_req = 0;
  for (int r = 0; r < nprocs; r++)
  {
    if (recv_count[r] > 0) n_req++;
    if (send_count[r] > 0) n_req++;
  }
  MPI_Request *reqs = new MPI_Request[n_req];
  int req_idx = 0;
  for (int r = 0; r < nprocs; r++)
  {
    if (recv_count[r] > 0)
    {
      MPI_Irecv(recvbuf + recv_offset[r] * stride,
                recv_count[r] * stride, MPI_DOUBLE,
                r, 0, MPI_COMM_WORLD, &reqs[req_idx++]);
    }
  }
  for (int r = 0; r < nprocs; r++)
  {
    if (send_count[r] > 0)
    {
      MPI_Isend(sendbuf + send_offset[r] * stride,
                send_count[r] * stride, MPI_DOUBLE,
                r, 0, MPI_COMM_WORLD, &reqs[req_idx++]);
    }
  }
  if (n_req > 0)
    MPI_Waitall(n_req, reqs, MPI_STATUSES_IGNORE);
  // Unpack recv buffers into Shellf
  for (int i = 0; i < total_recv; i++)
  {
    int pos = i * stride;
    int j = (int)recvbuf[pos];
    for (int v = 0; v < num_var; v++)
      Shellf[j * num_var + v] = recvbuf[pos + 1 + v];
  }
  delete[] reqs;
  delete[] sendbuf;
  delete[] recvbuf;
  delete[] pack_pos;
  delete[] send_offset;
  delete[] recv_offset;
  delete[] send_count;
  delete[] recv_count;
  delete[] consumer_rank;
  delete[] owner_rank;
 }
 void Patch::Interp_Points(MyList<var> *VarList,
                          int NN, double **XX,
@@ -784,22 +573,24 @@ void Patch::Interp_Points(MyList<var> *VarList,
    varl = varl->next;
  }
-  memset(Shellf, 0, sizeof(double) * NN * num_var);
+  double *shellf;
  shellf = new double[NN * num_var];
  memset(shellf, 0, sizeof(double) * NN * num_var);
-  // owner_rank[j] stores the global rank that owns point j
+  // we use weight to monitor code, later some day we can move it for optimization
-  int *owner_rank;
+  int *weight;
-  owner_rank = new int[NN];
+  weight = new int[NN];
-  for (int j = 0; j < NN; j++)
+  memset(weight, 0, sizeof(int) * NN);
    owner_rank[j] = -1;
-  // Build global-to-local rank translation for Comm_here
+  double *DH, *llb, *uub;
-  MPI_Group world_group, local_group;
+  DH = new double[dim];
  MPI_Comm_group(MPI_COMM_WORLD, &world_group);
  MPI_Comm_group(Comm_here, &local_group);
  double DH[dim], llb[dim], uub[dim];
  for (int i = 0; i < dim; i++)
  {
    DH[i] = getdX(i);
  }
  llb = new double[dim];
  uub = new double[dim];
  for (int j = 0; j < NN; j++) // run along points
  {
@@ -831,6 +622,12 @@ void Patch::Interp_Points(MyList<var> *VarList,
      bool flag = true;
      for (int i = 0; i < dim; i++)
      {
 // NOTE: our dividing structure is (exclude ghost)
 // -1 0
 //       1  2
 // so (0,1) does not belong to any part for vertex structure
 // here we put (0,0.5) to left part and (0.5,1) to right part
 // BUT for cell structure the bbox is (-1.5,0.5) and (0.5,2.5), there is no missing region at all
 #ifdef Vertex
 #ifdef Cell
 #error Both Cell and Vertex are defined
@@ -855,7 +652,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
      if (flag)
      {
        notfind = false;
        owner_rank[j] = BP->rank;
        if (myrank == BP->rank)
        {
          //---> interpolation
@@ -863,11 +659,14 @@ void Patch::Interp_Points(MyList<var> *VarList,
          int k = 0;
          while (varl) // run along variables
          {
-            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
+            //              shellf[j*num_var+k] = Parallel::global_interp(dim,BP->shape,BP->X,BP->fgfs[varl->data->sgfn],
            //	  		                                    pox,ordn,varl->data->SoA,Symmetry);
            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[j * num_var + k],
                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
            varl = varl->next;
            k++;
          }
          weight[j] = 1;
        }
      }
      if (Bp == ble)
@@ -876,35 +675,97 @@ void Patch::Interp_Points(MyList<var> *VarList,
    }
  }
-  // Collect unique global owner ranks and translate to local ranks in Comm_here
+  MPI_Allreduce(shellf, Shellf, NN * num_var, MPI_DOUBLE, MPI_SUM, Comm_here);
-  // Then broadcast each owner's points via MPI_Bcast on Comm_here
+  int *Weight;
-  {
+  Weight = new int[NN];
-    int j = 0;
+  MPI_Allreduce(weight, Weight, NN, MPI_INT, MPI_SUM, Comm_here);
    while (j < NN)
    {
      int cur_owner_global = owner_rank[j];
      if (cur_owner_global < 0)
      {
        // Point not found — skip (error check disabled for sub-communicator levels)
        j++;
        continue;
      }
      // Translate global rank to local rank in Comm_here
      int cur_owner_local;
      MPI_Group_translate_ranks(world_group, 1, &cur_owner_global, local_group, &cur_owner_local);
-      // Find contiguous run of points with the same owner
+  //  misc::tillherecheck("print me");
-      int jstart = j;
+  //  if(lmyrank == 0) cout<<"myrank = "<<myrank<<"print me"<<endl;
-      while (j < NN && owner_rank[j] == cur_owner_global)
+
-        j++;
+  for (int i = 0; i < NN; i++)
-      int count = (j - jstart) * num_var;
+  {
-      MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner_local, Comm_here);
+    if (Weight[i] > 1)
    {
      if (lmyrank == 0)
        cout << "WARNING: Patch::Interp_Points meets multiple weight" << endl;
      for (int j = 0; j < num_var; j++)
        Shellf[j + i * num_var] = Shellf[j + i * num_var] / Weight[i];
    }
 #if 0 // for not involved levels, this may fail     
     else if(Weight[i] == 0 && lmyrank == 0)
     {
       cout<<"ERROR: Patch::Interp_Points fails to find point (";
       for(int j=0;j<dim;j++)
       {
 	  cout<<XX[j][i];
 	  if(j<dim-1) cout<<",";
 	  else        cout<<")";
       }
       cout<<" on Patch (";
       for(int j=0;j<dim;j++)
       {
 	  cout<<bbox[j]<<"+"<<lli[j]*getdX(j);
 	  if(j<dim-1) cout<<",";
 	  else        cout<<")--";
       }
       cout<<"(";
       for(int j=0;j<dim;j++)
       {
 	  cout<<bbox[dim+j]<<"-"<<uui[j]*getdX(j);
 	  if(j<dim-1) cout<<",";
 	  else        cout<<")"<<endl;
       }
 #if 0
       checkBlock();
 #else
  cout<<"splited domains:"<<endl;
  {
     MyList<Block> *Bp=blb;
     while(Bp)
     {
 	Block *BP=Bp->data;
 	for(int i=0;i<dim;i++)
 	{
 #ifdef Vertex
 #ifdef Cell
 #error Both Cell and Vertex are defined
 #endif    
          llb[i] = (feq(BP->bbox[i]    ,bbox[i]    ,DH[i]/2)) ? BP->bbox[i]+lli[i]*DH[i]     : BP->bbox[i]    +(ghost_width-0.5)*DH[i];
          uub[i] = (feq(BP->bbox[dim+i],bbox[dim+i],DH[i]/2)) ? BP->bbox[dim+i]-uui[i]*DH[i] : BP->bbox[dim+i]-(ghost_width-0.5)*DH[i];
 #else
 #ifdef Cell
          llb[i] = (feq(BP->bbox[i]    ,bbox[i]    ,DH[i]/2)) ? BP->bbox[i]+lli[i]*DH[i]     : BP->bbox[i]    +ghost_width*DH[i];
          uub[i] = (feq(BP->bbox[dim+i],bbox[dim+i],DH[i]/2)) ? BP->bbox[dim+i]-uui[i]*DH[i] : BP->bbox[dim+i]-ghost_width*DH[i];
 #else
 #error Not define Vertex nor Cell
 #endif
 #endif 
 	}       
       cout<<"(";
       for(int j=0;j<dim;j++)
       {
 	  cout<<llb[j]<<":"<<uub[j];
 	  if(j<dim-1) cout<<",";
 	  else        cout<<")"<<endl;
       }
 	if(Bp == ble) break;
 	Bp=Bp->next;
     }
  }
 #endif       
       MPI_Abort(MPI_COMM_WORLD,1);
     }
 #endif
  }
-  MPI_Group_free(&world_group);
+  delete[] shellf;
-  MPI_Group_free(&local_group);
+  delete[] weight;
-  delete[] owner_rank;
+  delete[] Weight;
  delete[] DH;
  delete[] llb;
  delete[] uub;
 }
 void Patch::checkBlock()
 {
--- a/AMSS_NCKU_source/MPatch.h
+++ b/AMSS_NCKU_source/MPatch.h
@@ -39,10 +39,6 @@ public:
   bool Find_Point(double *XX);
   void Interp_Points(MyList<var> *VarList,
                      int NN, double **XX,
                      double *Shellf, int Symmetry,
                      int Nmin_consumer, int Nmax_consumer);
   void Interp_Points(MyList<var> *VarList,
                      int NN, double **XX,
                      double *Shellf, int Symmetry, MPI_Comm Comm_here);
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -3756,502 +3756,6 @@ void Parallel::Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
  delete[] transfer_src;
  delete[] transfer_dst;
 }
 // Merged Sync: collect all intra-patch and inter-patch grid segment lists,
 // then issue a single transfer() call instead of N+1 separate ones.
 //
 // Parameters:
 //   PatL     - head of the patch list; walked once for the per-patch ghost
 //              segments and passed whole to the buffer/owned list builders.
 //   VarList  - variable list, passed to transfer() as both variable lists.
 //   Symmetry - forwarded unchanged to build_owned_gsl() and transfer().
 //
 // All per-rank arrays are sized by the size of MPI_COMM_WORLD and are freed
 // before returning; nothing is cached between calls.
 void Parallel::Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
 {
  int cpusize;
  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
  // One source list and one destination list per rank, initially empty.
  MyList<Parallel::gridseg> **combined_src = new MyList<Parallel::gridseg> *[cpusize];
  MyList<Parallel::gridseg> **combined_dst = new MyList<Parallel::gridseg> *[cpusize];
  for (int node = 0; node < cpusize; node++)
    combined_src[node] = combined_dst[node] = 0;
  // Phase A: Intra-patch ghost exchange segments
  MyList<Patch> *Pp = PatL;
  while (Pp)
  {
    Patch *Pat = Pp->data;
    MyList<Parallel::gridseg> *dst_ghost = build_ghost_gsl(Pat);
    for (int node = 0; node < cpusize; node++)
    {
      MyList<Parallel::gridseg> *src_owned = build_owned_gsl0(Pat, node);
      MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
      build_gstl(src_owned, dst_ghost, &tsrc, &tdst);
      // Attach the new segments to this rank's combined lists; the first
      // non-empty result becomes the list head, later ones go through catList().
      if (tsrc)
      {
        if (combined_src[node])
          combined_src[node]->catList(tsrc);
        else
          combined_src[node] = tsrc;
      }
      if (tdst)
      {
        if (combined_dst[node])
          combined_dst[node]->catList(tdst);
        else
          combined_dst[node] = tdst;
      }
      // The temporary owned list is released here; tsrc/tdst are kept in the
      // combined lists. NOTE(review): presumably build_gstl() produces lists
      // that do not alias src_owned/dst_ghost - confirm.
      if (src_owned)
        src_owned->destroyList();
    }
    if (dst_ghost)
      dst_ghost->destroyList();
    Pp = Pp->next;
  }
  // Phase B: Inter-patch buffer exchange segments
  MyList<Parallel::gridseg> *dst_buffer = build_buffer_gsl(PatL);
  for (int node = 0; node < cpusize; node++)
  {
    MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatL, node, 5, Symmetry);
    MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
    build_gstl(src_owned, dst_buffer, &tsrc, &tdst);
    // Same attach logic as Phase A, so both phases share one list per rank.
    if (tsrc)
    {
      if (combined_src[node])
        combined_src[node]->catList(tsrc);
      else
        combined_src[node] = tsrc;
    }
    if (tdst)
    {
      if (combined_dst[node])
        combined_dst[node]->catList(tdst);
      else
        combined_dst[node] = tdst;
    }
    if (src_owned)
      src_owned->destroyList();
  }
  if (dst_buffer)
    dst_buffer->destroyList();
  // Phase C: Single transfer
  transfer(combined_src, combined_dst, VarList, VarList, Symmetry);
  // Phase D: Cleanup
  for (int node = 0; node < cpusize; node++)
  {
    if (combined_src[node])
      combined_src[node]->destroyList();
    if (combined_dst[node])
      combined_dst[node]->destroyList();
  }
  delete[] combined_src;
  delete[] combined_dst;
 }
 // SyncCache constructor: creates an empty cache.
 // Both flags (valid, lengths_valid) start false, both counters (cpusize,
 // max_reqs) start at 0, and every pointer member starts null; no memory is
 // allocated here.
 Parallel::SyncCache::SyncCache()
    : valid(false), cpusize(0), combined_src(0), combined_dst(0),
      send_lengths(0), recv_lengths(0), send_bufs(0), recv_bufs(0),
      send_buf_caps(0), recv_buf_caps(0), reqs(0), stats(0), max_reqs(0),
      lengths_valid(false)
 {
 }
 // SyncCache invalidate: free grid segment lists but keep buffers.
 // Does nothing when the cache is not marked valid. Otherwise, for each of
 // the cpusize entries, it destroys the combined source/destination lists,
 // nulls their pointers and zeroes the recorded send/receive lengths, then
 // clears both the valid and lengths_valid flags. The send/receive buffers,
 // their capacities and the request/status arrays are left untouched.
 void Parallel::SyncCache::invalidate()
 {
  if (!valid)
    return;
  for (int i = 0; i < cpusize; i++)
  {
    if (combined_src[i])
      combined_src[i]->destroyList();
    if (combined_dst[i])
      combined_dst[i]->destroyList();
    combined_src[i] = combined_dst[i] = 0;
    send_lengths[i] = recv_lengths[i] = 0;
  }
  valid = false;
  lengths_valid = false;
 }
 // SyncCache destroy: free everything.
 // Calls invalidate() first to release the segment lists, then deletes the
 // per-rank arrays, each per-rank send/receive buffer, the buffer pointer
 // arrays and the request/status arrays. Finally all pointer members are
 // reset to null and cpusize/max_reqs to 0.
 void Parallel::SyncCache::destroy()
 {
  invalidate();
  if (combined_src) delete[] combined_src;
  if (combined_dst) delete[] combined_dst;
  if (send_lengths) delete[] send_lengths;
  if (recv_lengths) delete[] recv_lengths;
  if (send_buf_caps) delete[] send_buf_caps;
  if (recv_buf_caps) delete[] recv_buf_caps;
  // Per-rank buffers must be released before the pointer arrays that hold
  // them, and while cpusize still has its original value.
  for (int i = 0; i < cpusize; i++)
  {
    if (send_bufs && send_bufs[i]) delete[] send_bufs[i];
    if (recv_bufs && recv_bufs[i]) delete[] recv_bufs[i];
  }
  if (send_bufs) delete[] send_bufs;
  if (recv_bufs) delete[] recv_bufs;
  if (reqs) delete[] reqs;
  if (stats) delete[] stats;
  // Null every pointer so a repeated destroy() finds nothing left to free.
  combined_src = combined_dst = 0;
  send_lengths = recv_lengths = 0;
  send_buf_caps = recv_buf_caps = 0;
  send_bufs = recv_bufs = 0;
  reqs = 0; stats = 0;
  cpusize = 0; max_reqs = 0;
 }
 // transfer_cached: reuse pre-allocated buffers from SyncCache
 //
 // Parameters:
 //   src, dst           - per-rank grid segment lists, indexed by rank.
 //   VarList1, VarList2 - variable lists forwarded to data_packer().
 //   Symmetry           - forwarded to data_packer().
 //   cache              - supplies the per-rank length, buffer, capacity,
 //                        request and status arrays. This routine indexes them
 //                        without allocating them, so they must already exist
 //                        and be large enough for the current communicator.
 //
 // A per-rank buffer is reallocated only when the required length exceeds its
 // recorded capacity; otherwise the existing buffer is reused.
 void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
                               MyList<var> *VarList1, MyList<var> *VarList2,
                               int Symmetry, SyncCache &cache)
 {
  int myrank;
  // Note: this overwrites cache.cpusize with the size of MPI_COMM_WORLD.
  MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize);
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  int cpusize = cache.cpusize;
  int req_no = 0; // number of non-blocking requests posted so far
  int node;
  for (node = 0; node < cpusize; node++)
  {
    if (node == myrank)
    {
      // Local data: calling data_packer() with a null buffer yields the length;
      // the data is then packed straight into this rank's receive buffer, so
      // no MPI message is posted for it.
      int length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      cache.recv_lengths[node] = length;
      if (length > 0)
      {
        if (length > cache.recv_buf_caps[node])
        {
          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
          cache.recv_bufs[node] = new double[length];
          cache.recv_buf_caps[node] = length;
        }
        data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      }
    }
    else
    {
      // send
      int slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      cache.send_lengths[node] = slength;
      if (slength > 0)
      {
        if (slength > cache.send_buf_caps[node])
        {
          if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
          cache.send_bufs[node] = new double[slength];
          cache.send_buf_caps[node] = slength;
        }
        data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
        MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
      }
      // recv
      // The expected length comes from the UNPACK direction with a null buffer.
      int rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
      cache.recv_lengths[node] = rlength;
      if (rlength > 0)
      {
        if (rlength > cache.recv_buf_caps[node])
        {
          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
          cache.recv_bufs[node] = new double[rlength];
          cache.recv_buf_caps[node] = rlength;
        }
        MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
      }
    }
  }
  // Complete every posted send/receive before any receive buffer is read.
  MPI_Waitall(req_no, cache.reqs, cache.stats);
  // Unpack whatever arrived, including the locally packed data for myrank.
  for (node = 0; node < cpusize; node++)
    if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0)
      data_packer(cache.recv_bufs[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
 }
 // Sync_cached: synchronize the patches in PatL using grid segment lists that
 // are stored in `cache` and only rebuilt while cache.valid is false.
 //   PatL     - list of patches to synchronize
 //   VarList  - variables to exchange; passed as both source and target list
 //   Symmetry - forwarded to build_owned_gsl and transfer_cached
 //   cache    - per-rank segment lists, lengths, buffers and request arrays
 // The actual data movement is delegated to transfer_cached.
 void Parallel::Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache)
 {
  // Splices `extra` onto the list headed by `head`; an empty head adopts it.
  struct SegList
  {
    static void append(MyList<Parallel::gridseg> *&head, MyList<Parallel::gridseg> *extra)
    {
      if (!extra)
        return;
      if (head)
        head->catList(extra);
      else
        head = extra;
    }
  };
  if (!cache.valid)
  {
    int nprocs;
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    cache.cpusize = nprocs;
    // First use of this cache: create the per-rank bookkeeping arrays.
    if (!cache.combined_src)
    {
      cache.combined_src = new MyList<Parallel::gridseg> *[nprocs];
      cache.combined_dst = new MyList<Parallel::gridseg> *[nprocs];
      cache.send_lengths = new int[nprocs];
      cache.recv_lengths = new int[nprocs];
      cache.send_bufs = new double *[nprocs];
      cache.recv_bufs = new double *[nprocs];
      cache.send_buf_caps = new int[nprocs];
      cache.recv_buf_caps = new int[nprocs];
      for (int rank = 0; rank < nprocs; ++rank)
      {
        cache.send_bufs[rank] = 0;
        cache.recv_bufs[rank] = 0;
        cache.send_buf_caps[rank] = 0;
        cache.recv_buf_caps[rank] = 0;
      }
      // At most one send and one receive request per rank.
      cache.max_reqs = 2 * nprocs;
      cache.reqs = new MPI_Request[cache.max_reqs];
      cache.stats = new MPI_Status[cache.max_reqs];
    }
    // Every rank starts from empty segment lists and zero message lengths.
    // NOTE(review): list heads left over from an earlier build are overwritten
    // here without being destroyed - presumably invalidate() frees them; confirm.
    for (int rank = 0; rank < nprocs; ++rank)
    {
      cache.combined_src[rank] = 0;
      cache.combined_dst[rank] = 0;
      cache.send_lengths[rank] = 0;
      cache.recv_lengths[rank] = 0;
    }
    // Intra-patch part: pair each rank's owned segments of a patch with that
    // patch's ghost segments.
    for (MyList<Patch> *cur = PatL; cur; cur = cur->next)
    {
      Patch *patch = cur->data;
      MyList<Parallel::gridseg> *ghost_segs = build_ghost_gsl(patch);
      for (int rank = 0; rank < nprocs; ++rank)
      {
        MyList<Parallel::gridseg> *owned = build_owned_gsl0(patch, rank);
        MyList<Parallel::gridseg> *part_src = 0;
        MyList<Parallel::gridseg> *part_dst = 0;
        build_gstl(owned, ghost_segs, &part_src, &part_dst);
        SegList::append(cache.combined_src[rank], part_src);
        SegList::append(cache.combined_dst[rank], part_dst);
        if (owned)
          owned->destroyList();
      }
      if (ghost_segs)
        ghost_segs->destroyList();
    }
    // Inter-patch part: pair each rank's owned segments over the whole patch
    // list with the buffer segments of the patch list.
    MyList<Parallel::gridseg> *buffer_segs = build_buffer_gsl(PatL);
    for (int rank = 0; rank < nprocs; ++rank)
    {
      MyList<Parallel::gridseg> *owned = build_owned_gsl(PatL, rank, 5, Symmetry);
      MyList<Parallel::gridseg> *part_src = 0;
      MyList<Parallel::gridseg> *part_dst = 0;
      build_gstl(owned, buffer_segs, &part_src, &part_dst);
      SegList::append(cache.combined_src[rank], part_src);
      SegList::append(cache.combined_dst[rank], part_dst);
      if (owned)
        owned->destroyList();
    }
    if (buffer_segs)
      buffer_segs->destroyList();
    cache.valid = true;
  }
  // Exchange the data with the cached lists; buffers inside `cache` are reused.
  transfer_cached(cache.combined_src, cache.combined_dst, VarList, VarList, Symmetry, cache);
 }
 // Sync_start: first half of a split synchronization. Builds the segment-list
 // cache if needed, packs the outgoing data and posts MPI_Isend/MPI_Irecv,
 // then returns without waiting. Sync_finish completes the exchange.
 //   PatL     - list of patches to synchronize
 //   VarList  - variables to exchange; used as both source and target list
 //   Symmetry - forwarded to build_owned_gsl and data_packer
 //   cache    - segment lists, message lengths, buffers and request array
 //   state    - receives the number of posted requests and is marked active
 // NOTE(review): requests are stored in cache.reqs, so a second Sync_start on
 // the same cache before Sync_finish would overwrite pending requests - callers
 // presumably pair the two calls; confirm.
 void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
                          SyncCache &cache, AsyncSyncState &state)
 {
  // Splices `extra` onto the list headed by `head`; an empty head adopts it.
  struct SegList
  {
    static void append(MyList<Parallel::gridseg> *&head, MyList<Parallel::gridseg> *extra)
    {
      if (!extra)
        return;
      if (head)
        head->catList(extra);
      else
        head = extra;
    }
  };
  // Ensure cache is built (same construction as Sync_cached)
  if (!cache.valid)
  {
    int cpusize;
    MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
    cache.cpusize = cpusize;
    if (!cache.combined_src)
    {
      cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
      cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
      cache.send_lengths = new int[cpusize];
      cache.recv_lengths = new int[cpusize];
      cache.send_bufs = new double *[cpusize];
      cache.recv_bufs = new double *[cpusize];
      cache.send_buf_caps = new int[cpusize];
      cache.recv_buf_caps = new int[cpusize];
      for (int i = 0; i < cpusize; i++)
      {
        cache.send_bufs[i] = cache.recv_bufs[i] = 0;
        cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
      }
      // At most one send and one receive request per rank.
      cache.max_reqs = 2 * cpusize;
      cache.reqs = new MPI_Request[cache.max_reqs];
      cache.stats = new MPI_Status[cache.max_reqs];
    }
    for (int node = 0; node < cpusize; node++)
    {
      cache.combined_src[node] = cache.combined_dst[node] = 0;
      cache.send_lengths[node] = cache.recv_lengths[node] = 0;
    }
    // The stored lengths were just zeroed and the segment lists are about to
    // change, so the lengths must be recomputed below. Without this reset a
    // stale lengths_valid == true would make the pack loop reuse the zeroed
    // lengths and post no messages at all.
    cache.lengths_valid = false;
    // Intra-patch segments: each rank's owned part of a patch vs. its ghosts.
    MyList<Patch> *Pp = PatL;
    while (Pp)
    {
      Patch *Pat = Pp->data;
      MyList<Parallel::gridseg> *dst_ghost = build_ghost_gsl(Pat);
      for (int node = 0; node < cpusize; node++)
      {
        MyList<Parallel::gridseg> *src_owned = build_owned_gsl0(Pat, node);
        MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
        build_gstl(src_owned, dst_ghost, &tsrc, &tdst);
        SegList::append(cache.combined_src[node], tsrc);
        SegList::append(cache.combined_dst[node], tdst);
        if (src_owned) src_owned->destroyList();
      }
      if (dst_ghost) dst_ghost->destroyList();
      Pp = Pp->next;
    }
    // Inter-patch segments: each rank's owned part of the patch list vs. the
    // buffer segments of the patch list.
    MyList<Parallel::gridseg> *dst_buffer = build_buffer_gsl(PatL);
    for (int node = 0; node < cpusize; node++)
    {
      MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatL, node, 5, Symmetry);
      MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
      build_gstl(src_owned, dst_buffer, &tsrc, &tdst);
      SegList::append(cache.combined_src[node], tsrc);
      SegList::append(cache.combined_dst[node], tdst);
      if (src_owned) src_owned->destroyList();
    }
    if (dst_buffer) dst_buffer->destroyList();
    cache.valid = true;
  }
  // Now pack and post async MPI operations
  int myrank;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  int cpusize = cache.cpusize;
  state.req_no = 0;
  state.active = true;
  MyList<Parallel::gridseg> **src = cache.combined_src;
  MyList<Parallel::gridseg> **dst = cache.combined_dst;
  for (int node = 0; node < cpusize; node++)
  {
    if (node == myrank)
    {
      // Local data: packed straight into this rank's receive buffer, no MPI.
      int length;
      if (!cache.lengths_valid) {
        // A null buffer makes data_packer return the length only.
        length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
        cache.recv_lengths[node] = length;
      } else {
        length = cache.recv_lengths[node];
      }
      if (length > 0)
      {
        // Grow the buffer only when the current capacity is too small.
        if (length > cache.recv_buf_caps[node])
        {
          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
          cache.recv_bufs[node] = new double[length];
          cache.recv_buf_caps[node] = length;
        }
        data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
      }
    }
    else
    {
      // Outgoing message to `node` (tag 2).
      int slength;
      if (!cache.lengths_valid) {
        slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
        cache.send_lengths[node] = slength;
      } else {
        slength = cache.send_lengths[node];
      }
      if (slength > 0)
      {
        if (slength > cache.send_buf_caps[node])
        {
          if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
          cache.send_bufs[node] = new double[slength];
          cache.send_buf_caps[node] = slength;
        }
        data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
        MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
      }
      // Incoming message from `node` (tag 2); unpacked later by Sync_finish.
      int rlength;
      if (!cache.lengths_valid) {
        rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList, VarList, Symmetry);
        cache.recv_lengths[node] = rlength;
      } else {
        rlength = cache.recv_lengths[node];
      }
      if (rlength > 0)
      {
        if (rlength > cache.recv_buf_caps[node])
        {
          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
          cache.recv_bufs[node] = new double[rlength];
          cache.recv_buf_caps[node] = rlength;
        }
        MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
      }
    }
  }
  // Lengths for the current segment lists are now stored; later calls reuse them.
  cache.lengths_valid = true;
 }
 // Sync_finish: second half of a split synchronization started by Sync_start.
 // Waits for the requests recorded in `state`, unpacks every non-empty receive
 // buffer held in `cache`, and marks `state` inactive.
 //   cache    - segment lists, receive buffers/lengths and request arrays
 //   state    - number of posted requests and the active flag
 //   VarList  - variables to unpack; used as both source and target list
 //   Symmetry - forwarded to data_packer
 // Does nothing when `state` is not active.
 void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state,
                           MyList<var> *VarList, int Symmetry)
 {
  if (state.active)
  {
    MPI_Waitall(state.req_no, cache.reqs, cache.stats);
    const int nprocs = cache.cpusize;
    MyList<Parallel::gridseg> **seg_src = cache.combined_src;
    MyList<Parallel::gridseg> **seg_dst = cache.combined_dst;
    for (int rank = 0; rank < nprocs; ++rank)
    {
      double *incoming = cache.recv_bufs[rank];
      // Skip ranks that have no buffer or delivered nothing.
      if (!incoming || cache.recv_lengths[rank] <= 0)
        continue;
      data_packer(incoming, seg_src[rank], seg_dst[rank], rank, UNPACK, VarList, VarList, Symmetry);
    }
    state.active = false;
  }
 }
 // collect buffer grid segments or blocks for the periodic boundary condition of given patch
 // ---------------------------------------------------
 // |con |                                       |con |
@@ -5286,203 +4790,6 @@ void Parallel::OutBdLow2Himix(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
  delete[] transfer_src;
  delete[] transfer_dst;
 }
 // Restrict_cached: transfer data from the patches in PatfL to the patches in
 // PatcL, keeping the grid segment lists in `cache` between calls.
 //   PatcL    - patch list whose complete segments form the destination
 //   PatfL    - patch list whose owned segments form the source
 //   VarList1 - variable list passed to transfer_cached as the first list
 //   VarList2 - variable list passed to transfer_cached as the second list
 //   Symmetry - forwarded to build_owned_gsl and transfer_cached
 //   cache    - segment lists, buffers and request arrays; the lists are
 //              built here while cache.valid is false
 void Parallel::Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                               MyList<var> *VarList1, MyList<var> *VarList2,
                               int Symmetry, SyncCache &cache)
 {
  if (!cache.valid)
  {
    int nprocs;
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    cache.cpusize = nprocs;
    // First use of this cache: create the per-rank bookkeeping arrays.
    if (!cache.combined_src)
    {
      cache.combined_src = new MyList<Parallel::gridseg> *[nprocs];
      cache.combined_dst = new MyList<Parallel::gridseg> *[nprocs];
      cache.send_lengths = new int[nprocs];
      cache.recv_lengths = new int[nprocs];
      cache.send_bufs = new double *[nprocs];
      cache.recv_bufs = new double *[nprocs];
      cache.send_buf_caps = new int[nprocs];
      cache.recv_buf_caps = new int[nprocs];
      for (int rank = 0; rank < nprocs; ++rank)
      {
        cache.send_bufs[rank] = 0;
        cache.recv_bufs[rank] = 0;
        cache.send_buf_caps[rank] = 0;
        cache.recv_buf_caps[rank] = 0;
      }
      // At most one send and one receive request per rank.
      cache.max_reqs = 2 * nprocs;
      cache.reqs = new MPI_Request[cache.max_reqs];
      cache.stats = new MPI_Status[cache.max_reqs];
    }
    // Destination: the complete segments of PatcL.
    MyList<Parallel::gridseg> *target_segs = build_complete_gsl(PatcL);
    for (int rank = 0; rank < nprocs; ++rank)
    {
      // Source: the segments of PatfL owned by `rank`.
      // NOTE(review): the meaning of the literal 2 is not visible here - confirm
      // against build_owned_gsl.
      MyList<Parallel::gridseg> *owned = build_owned_gsl(PatfL, rank, 2, Symmetry);
      build_gstl(owned, target_segs, &cache.combined_src[rank], &cache.combined_dst[rank]);
      if (owned)
        owned->destroyList();
    }
    if (target_segs)
      target_segs->destroyList();
    cache.valid = true;
  }
  // Move the data with the cached lists; buffers inside `cache` are reused.
  transfer_cached(cache.combined_src, cache.combined_dst, VarList1, VarList2, Symmetry, cache);
 }
 // OutBdLow2Hi_cached: transfer data from the patches in PatcL into the buffer
 // segments of the patches in PatfL, keeping the grid segment lists in `cache`
 // between calls.
 //   PatcL    - patch list whose owned segments form the source
 //   PatfL    - patch list whose buffer segments form the destination
 //   VarList1 - variable list passed to transfer_cached as the first list
 //   VarList2 - variable list passed to transfer_cached as the second list
 //   Symmetry - forwarded to build_owned_gsl and transfer_cached
 //   cache    - segment lists, buffers and request arrays; the lists are
 //              built here while cache.valid is false
 void Parallel::OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                                  MyList<var> *VarList1, MyList<var> *VarList2,
                                  int Symmetry, SyncCache &cache)
 {
  if (!cache.valid)
  {
    int nprocs;
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    cache.cpusize = nprocs;
    // First use of this cache: create the per-rank bookkeeping arrays.
    if (!cache.combined_src)
    {
      cache.combined_src = new MyList<Parallel::gridseg> *[nprocs];
      cache.combined_dst = new MyList<Parallel::gridseg> *[nprocs];
      cache.send_lengths = new int[nprocs];
      cache.recv_lengths = new int[nprocs];
      cache.send_bufs = new double *[nprocs];
      cache.recv_bufs = new double *[nprocs];
      cache.send_buf_caps = new int[nprocs];
      cache.recv_buf_caps = new int[nprocs];
      for (int rank = 0; rank < nprocs; ++rank)
      {
        cache.send_bufs[rank] = 0;
        cache.recv_bufs[rank] = 0;
        cache.send_buf_caps[rank] = 0;
        cache.recv_buf_caps[rank] = 0;
      }
      // At most one send and one receive request per rank.
      cache.max_reqs = 2 * nprocs;
      cache.reqs = new MPI_Request[cache.max_reqs];
      cache.stats = new MPI_Status[cache.max_reqs];
    }
    // Destination: the buffer segments of PatfL.
    MyList<Parallel::gridseg> *target_segs = build_buffer_gsl(PatfL);
    for (int rank = 0; rank < nprocs; ++rank)
    {
      // Source: the segments of PatcL owned by `rank`.
      // NOTE(review): the meaning of the literal 4 is not visible here - confirm
      // against build_owned_gsl.
      MyList<Parallel::gridseg> *owned = build_owned_gsl(PatcL, rank, 4, Symmetry);
      build_gstl(owned, target_segs, &cache.combined_src[rank], &cache.combined_dst[rank]);
      if (owned)
        owned->destroyList();
    }
    if (target_segs)
      target_segs->destroyList();
    cache.valid = true;
  }
  // Move the data with the cached lists; buffers inside `cache` are reused.
  transfer_cached(cache.combined_src, cache.combined_dst, VarList1, VarList2, Symmetry, cache);
 }
 // OutBdLow2Himix_cached: builds the same cached segment lists as
 // OutBdLow2Hi_cached (owned segments of PatcL -> buffer segments of PatfL) but
 // performs the exchange inline with data_packermix for packing and unpacking.
 //   PatcL    - patch list whose owned segments form the source
 //   PatfL    - patch list whose buffer segments form the destination
 //   VarList1 - variable list passed to data_packermix as the first list
 //   VarList2 - variable list passed to data_packermix as the second list
 //   Symmetry - forwarded to build_owned_gsl and data_packermix
 //   cache    - segment lists, message lengths, buffers and request arrays
 // Blocks in MPI_Waitall until every posted send and receive has completed.
 void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                                     MyList<var> *VarList1, MyList<var> *VarList2,
                                     int Symmetry, SyncCache &cache)
 {
  if (!cache.valid)
  {
    int nprocs;
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    cache.cpusize = nprocs;
    // First use of this cache: create the per-rank bookkeeping arrays.
    if (!cache.combined_src)
    {
      cache.combined_src = new MyList<Parallel::gridseg> *[nprocs];
      cache.combined_dst = new MyList<Parallel::gridseg> *[nprocs];
      cache.send_lengths = new int[nprocs];
      cache.recv_lengths = new int[nprocs];
      cache.send_bufs = new double *[nprocs];
      cache.recv_bufs = new double *[nprocs];
      cache.send_buf_caps = new int[nprocs];
      cache.recv_buf_caps = new int[nprocs];
      for (int rank = 0; rank < nprocs; ++rank)
      {
        cache.send_bufs[rank] = 0;
        cache.recv_bufs[rank] = 0;
        cache.send_buf_caps[rank] = 0;
        cache.recv_buf_caps[rank] = 0;
      }
      // At most one send and one receive request per rank.
      cache.max_reqs = 2 * nprocs;
      cache.reqs = new MPI_Request[cache.max_reqs];
      cache.stats = new MPI_Status[cache.max_reqs];
    }
    // Destination: the buffer segments of PatfL.
    MyList<Parallel::gridseg> *target_segs = build_buffer_gsl(PatfL);
    for (int rank = 0; rank < nprocs; ++rank)
    {
      // Source: the segments of PatcL owned by `rank`.
      MyList<Parallel::gridseg> *owned = build_owned_gsl(PatcL, rank, 4, Symmetry);
      build_gstl(owned, target_segs, &cache.combined_src[rank], &cache.combined_dst[rank]);
      if (owned)
        owned->destroyList();
    }
    if (target_segs)
      target_segs->destroyList();
    cache.valid = true;
  }
  // Inline exchange: identical in shape to a cached transfer, but every
  // pack/unpack goes through data_packermix.
  int myrank;
  MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize);
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  const int nprocs = cache.cpusize;
  MyList<Parallel::gridseg> **seg_src = cache.combined_src;
  MyList<Parallel::gridseg> **seg_dst = cache.combined_dst;
  int posted = 0;
  for (int peer = 0; peer < nprocs; ++peer)
  {
    if (peer != myrank)
    {
      // Outgoing message to `peer`; a null buffer yields the length only.
      const int out_len = data_packermix(0, seg_src[myrank], seg_dst[myrank], peer, PACK, VarList1, VarList2, Symmetry);
      cache.send_lengths[peer] = out_len;
      if (out_len > 0)
      {
        // Grow the send buffer only when its capacity is too small.
        if (cache.send_buf_caps[peer] < out_len)
        {
          delete[] cache.send_bufs[peer];
          cache.send_bufs[peer] = new double[out_len];
          cache.send_buf_caps[peer] = out_len;
        }
        data_packermix(cache.send_bufs[peer], seg_src[myrank], seg_dst[myrank], peer, PACK, VarList1, VarList2, Symmetry);
        MPI_Isend((void *)cache.send_bufs[peer], out_len, MPI_DOUBLE, peer, 1, MPI_COMM_WORLD, cache.reqs + posted);
        ++posted;
      }
      // Incoming message from `peer`; unpacked after MPI_Waitall.
      const int in_len = data_packermix(0, seg_src[peer], seg_dst[peer], peer, UNPACK, VarList1, VarList2, Symmetry);
      cache.recv_lengths[peer] = in_len;
      if (in_len > 0)
      {
        if (cache.recv_buf_caps[peer] < in_len)
        {
          delete[] cache.recv_bufs[peer];
          cache.recv_bufs[peer] = new double[in_len];
          cache.recv_buf_caps[peer] = in_len;
        }
        MPI_Irecv((void *)cache.recv_bufs[peer], in_len, MPI_DOUBLE, peer, 1, MPI_COMM_WORLD, cache.reqs + posted);
        ++posted;
      }
      continue;
    }
    // Local data: packed straight into this rank's receive buffer, no MPI.
    const int self_len = data_packermix(0, seg_src[myrank], seg_dst[myrank], peer, PACK, VarList1, VarList2, Symmetry);
    cache.recv_lengths[peer] = self_len;
    if (self_len > 0)
    {
      if (cache.recv_buf_caps[peer] < self_len)
      {
        delete[] cache.recv_bufs[peer];
        cache.recv_bufs[peer] = new double[self_len];
        cache.recv_buf_caps[peer] = self_len;
      }
      data_packermix(cache.recv_bufs[peer], seg_src[myrank], seg_dst[myrank], peer, PACK, VarList1, VarList2, Symmetry);
    }
  }
  MPI_Waitall(posted, cache.reqs, cache.stats);
  // Unpack every non-empty receive buffer, including the local one.
  for (int peer = 0; peer < nprocs; ++peer)
  {
    if (!cache.recv_bufs[peer] || cache.recv_lengths[peer] <= 0)
      continue;
    data_packermix(cache.recv_bufs[peer], seg_src[peer], seg_dst[peer], peer, UNPACK, VarList1, VarList2, Symmetry);
  }
 }
 // collect all buffer grid segments or blocks for given patch
 MyList<Parallel::gridseg> *Parallel::build_buffer_gsl(Patch *Pat)
 {
--- a/AMSS_NCKU_source/Parallel.h
+++ b/AMSS_NCKU_source/Parallel.h
@@ -81,43 +81,6 @@ namespace Parallel
                   int Symmetry);
  void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry);
  void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
  void Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
  // SyncCache: state kept between calls of the *_cached / Sync_start routines
  // so that grid segment lists, message buffers and MPI request arrays are
  // not rebuilt on every synchronization. All per-rank arrays are allocated
  // with cpusize entries and indexed by MPI rank.
  struct SyncCache {
    // True once the segment lists have been built; the cached routines
    // rebuild the lists while this is false and then set it to true.
    bool valid;
    // Size of MPI_COMM_WORLD recorded when the lists were built.
    int cpusize;
    // Per-rank heads of the source / destination segment lists handed to
    // data_packer (or data_packermix).
    MyList<gridseg> **combined_src;
    MyList<gridseg> **combined_dst;
    // Per-rank message lengths as returned by the packer; used as the element
    // count (MPI_DOUBLE) of the matching MPI_Isend / MPI_Irecv.
    int *send_lengths;
    int *recv_lengths;
    // Per-rank send / receive buffers; reallocated only when a required
    // length exceeds the recorded capacity.
    double **send_bufs;
    double **recv_bufs;
    // Capacity (in doubles) of the corresponding send_bufs / recv_bufs entry.
    int *send_buf_caps;
    int *recv_buf_caps;
    // Request and status arrays, each with max_reqs entries.
    MPI_Request *reqs;
    MPI_Status *stats;
    // Set to 2 * cpusize by the routines that allocate reqs / stats.
    int max_reqs;
    // Set to true by Sync_start after it has stored the message lengths;
    // while true, Sync_start reuses send_lengths / recv_lengths instead of
    // recomputing them.
    bool lengths_valid;
    // Defined elsewhere. NOTE(review): presumably the constructor nulls all
    // pointers and clears both flags - the cached routines rely on
    // combined_src being null before the first allocation; confirm.
    SyncCache();
    // Defined elsewhere. NOTE(review): presumably marks the cache for rebuild
    // and releases the old segment lists; confirm.
    void invalidate();
    // Defined elsewhere. NOTE(review): presumably frees all owned arrays and
    // buffers; confirm.
    void destroy();
  };
  void Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache);
  void transfer_cached(MyList<gridseg> **src, MyList<gridseg> **dst,
                       MyList<var> *VarList1, MyList<var> *VarList2,
                       int Symmetry, SyncCache &cache);
  // AsyncSyncState: bookkeeping for one split synchronization. Sync_start
  // fills it in and Sync_finish consumes it.
  struct AsyncSyncState
  {
    int req_no;  // number of MPI requests posted by Sync_start
    bool active; // true between Sync_start and the matching Sync_finish
    // A fresh state has no posted requests and is inactive.
    AsyncSyncState()
    {
      req_no = 0;
      active = false;
    }
  };
  void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
                  SyncCache &cache, AsyncSyncState &state);
  void Sync_finish(SyncCache &cache, AsyncSyncState &state,
                   MyList<var> *VarList, int Symmetry);
  void OutBdLow2Hi(Patch *Patc, Patch *Patf,
                   MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
                   int Symmetry);
@@ -130,15 +93,6 @@ namespace Parallel
  void OutBdLow2Himix(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                      MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
                      int Symmetry);
  void Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                       MyList<var> *VarList1, MyList<var> *VarList2,
                       int Symmetry, SyncCache &cache);
  void OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                          MyList<var> *VarList1, MyList<var> *VarList2,
                          int Symmetry, SyncCache &cache);
  void OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                             MyList<var> *VarList1, MyList<var> *VarList2,
                             int Symmetry, SyncCache &cache);
  void Prolong(Patch *Patc, Patch *Patf,
               MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
               int Symmetry);
--- a/AMSS_NCKU_source/TwoPunctures.C
+++ b/AMSS_NCKU_source/TwoPunctures.C
--- a/AMSS_NCKU_source/TwoPunctures.h
+++ b/AMSS_NCKU_source/TwoPunctures.h
@@ -1,8 +1,7 @@
 #ifndef TWO_PUNCTURES_H
 #define TWO_PUNCTURES_H
 #include <omp.h>
 #define StencilSize 19
 #define N_PlaneRelax 1
 #define NRELAX 200
@@ -33,7 +32,7 @@ private:
       int npoints_A, npoints_B, npoints_phi;
       double target_M_plus, target_M_minus;
-
+       
       double admMass;
       double adm_tol;
@@ -43,18 +42,6 @@ private:
       int ntotal;
       // ===== Precomputed spectral derivative matrices =====
       double *D1_A, *D2_A;
       double *D1_B, *D2_B;
       double *DF1_phi, *DF2_phi;
       // ===== Pre-allocated workspace for LineRelax (per-thread) =====
       int max_threads;
       double **ws_diag_be, **ws_e_be, **ws_f_be, **ws_b_be, **ws_x_be;
       double **ws_l_be, **ws_u_be, **ws_d_be, **ws_y_be;
       double **ws_diag_al, **ws_e_al, **ws_f_al, **ws_b_al, **ws_x_al;
       double **ws_l_al, **ws_u_al, **ws_d_al, **ws_y_al;
       struct parameters
       {
              int nvar, n1, n2, n3;
@@ -71,28 +58,6 @@ public:
                    int Newtonmaxit);
       ~TwoPunctures();
       // 02/07: New/modified methods
       void allocate_workspace();
       void free_workspace();
       void precompute_derivative_matrices();
       void build_cheb_deriv_matrices(int n, double *D1, double *D2);
       void build_fourier_deriv_matrices(int N, double *DF1, double *DF2);
       void Derivatives_AB3_MatMul(int nvar, int n1, int n2, int n3, derivs v);
       void ThomasAlgorithm_ws(int N, double *b, double *a, double *c, double *x, double *q,
                                double *l, double *u_ws, double *d, double *y);
       void LineRelax_be_omp(double *dv,
                         int const i, int const k, int const nvar,
                         int const n1, int const n2, int const n3,
                         double const *rhs, int const *ncols, int **cols,
                         double **JFD, int tid);
       void LineRelax_al_omp(double *dv,
                         int const j, int const k, int const nvar,
                         int const n1, int const n2, int const n3,
                         double const *rhs, int const *ncols,
                         int **cols, double **JFD, int tid);
       void relax_omp(double *dv, int const nvar, int const n1, int const n2, int const n3,
                  double const *rhs, int const *ncols, int **cols, double **JFD);
       void Solve();
       void set_initial_guess(derivs v);
       int index(int i, int j, int k, int l, int a, int b, int c, int d);
@@ -151,11 +116,23 @@ public:
       double BY_KKofxyz(double x, double y, double z);
       void SetMatrix_JFD(int nvar, int n1, int n2, int n3, derivs u, int *ncols, int **cols, double **Matrix);
       void J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, double *Jdv, derivs u);
       void relax(double *dv, int const nvar, int const n1, int const n2, int const n3,
                  double const *rhs, int const *ncols, int **cols, double **JFD);
       void LineRelax_be(double *dv,
                         int const i, int const k, int const nvar,
                         int const n1, int const n2, int const n3,
                         double const *rhs, int const *ncols, int **cols,
                         double **JFD);
       void JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2,
                         int n3, derivs dv, derivs u, double *values);
       void LinEquations(double A, double B, double X, double R,
                         double x, double r, double phi,
                         double y, double z, derivs dU, derivs U, double *values);
       void LineRelax_al(double *dv,
                         int const j, int const k, int const nvar,
                         int const n1, int const n2, int const n3,
                         double const *rhs, int const *ncols,
                         int **cols, double **JFD);
       void ThomasAlgorithm(int N, double *b, double *a, double *c, double *x, double *q);
       void Save(char *fname);
       // provided by Vasileios Paschalidis (vpaschal@illinois.edu)
@@ -164,4 +141,4 @@ public:
       void SpecCoef(parameters par, int ivar, double *v, double *cf);
 };
-#endif /* TWO_PUNCTURES_H */
+#endif /* TWO_PUNCTURES_H */
--- a/AMSS_NCKU_source/Z4c_class.C
+++ b/AMSS_NCKU_source/Z4c_class.C
@@ -321,7 +321,22 @@ void Z4c_class::Step(int lev, int YN)
    }
    Pp = Pp->next;
  }
-  // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
+  // check error information
  {
    int erh = ERROR;
    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  }
  if (ERROR)
  {
    Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime 
                              << ", lev = " << lev << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
 #ifdef WithShell
  // evolve Shell Patches
@@ -339,9 +354,9 @@ void Z4c_class::Step(int lev, int YN)
        {
 #if (AGM == 0)
          f_enforce_ga(cg->shape,
-                       cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
+                       cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                       cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
-                       cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
+                       cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
                       cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
 #endif
@@ -453,16 +468,24 @@ void Z4c_class::Step(int lev, int YN)
      sPp = sPp->next;
    }
  }
-  // Non-blocking error reduction overlapped with Sync to hide Allreduce latency
+  // check error information
  MPI_Request err_req_pre;
  {
    int erh = ERROR;
-    MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_pre);
+    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  }
  if (ERROR)
  {
    SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
 #endif
-  Parallel::AsyncSyncState async_pre;
+  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
  Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
 #ifdef WithShell
  if (lev == 0)
@@ -475,30 +498,12 @@ void Z4c_class::Step(int lev, int YN)
    {
      prev_clock = curr_clock;
      curr_clock = clock();
-      cout << " Shell stuff synchronization used "
+      cout << " Shell stuff synchronization used " 
-           << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
+           << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) 
           << " seconds! " << endl;
    }
  }
 #endif
  Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
 #ifdef WithShell
  // Complete non-blocking error reduction and check
  MPI_Wait(&err_req_pre, MPI_STATUS_IGNORE);
  if (ERROR)
  {
    Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
    SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
                              << ", lev = " << lev << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
 #endif
  // for black hole position
  if (BH_num > 0 && lev == GH->levels - 1)
@@ -688,7 +693,23 @@ void Z4c_class::Step(int lev, int YN)
      Pp = Pp->next;
    }
-    // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
+    // check error information
    {
      int erh = ERROR;
      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    }
    if (ERROR)
    {
      Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count 
                                << " variables at t = " << PhysTime 
                                << ", lev = " << lev << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
 #ifdef WithShell
    // evolve Shell Patches
@@ -829,16 +850,25 @@ void Z4c_class::Step(int lev, int YN)
        sPp = sPp->next;
      }
    }
-    // Non-blocking error reduction overlapped with Sync to hide Allreduce latency
+    // check error information
    MPI_Request err_req_cor;
    {
      int erh = ERROR;
-      MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
+      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    }
    if (ERROR)
    {
      SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count 
                                << " variables at t = " << PhysTime << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
 #endif
-    Parallel::AsyncSyncState async_cor;
+    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
    Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
 #ifdef WithShell
    if (lev == 0)
@@ -851,30 +881,11 @@ void Z4c_class::Step(int lev, int YN)
      {
        prev_clock = curr_clock;
        curr_clock = clock();
-        cout << " Shell stuff synchronization used "
+        cout << " Shell stuff synchronization used " 
-             << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
+             << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) 
             << " seconds! " << endl;
      }
    }
 #endif
    Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
 #ifdef WithShell
    // Complete non-blocking error reduction and check
    MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
    if (ERROR)
    {
      Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
      SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
                                << " variables at t = " << PhysTime
                                << ", lev = " << lev << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
 #endif
    // for black hole position
    if (BH_num > 0 && lev == GH->levels - 1)
@@ -1241,7 +1252,22 @@ void Z4c_class::Step(int lev, int YN)
 	 }
  }
 #endif
-  // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
+  // check error information
  {
    int erh = ERROR;
    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  }
  if (ERROR)
  {
    Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime 
                              << ", lev = " << lev << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
  // evolve Shell Patches
  if (lev == 0)
@@ -1516,15 +1542,23 @@ void Z4c_class::Step(int lev, int YN)
  }
 #endif
  }
-  // Non-blocking error reduction overlapped with Sync to hide Allreduce latency
+  // check error information
  MPI_Request err_req_pre;
  {
    int erh = ERROR;
-    MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_pre);
+    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  }
  if (ERROR)
  {
    SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
-  Parallel::AsyncSyncState async_pre;
+  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
  Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
  if (lev == 0)
  {
@@ -1536,8 +1570,8 @@ void Z4c_class::Step(int lev, int YN)
    {
      prev_clock = curr_clock;
      curr_clock = clock();
-      cout << " Shell stuff synchronization used "
+      cout << " Shell stuff synchronization used " 
-           << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
+           << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) 
           << " seconds! " << endl;
    }
@@ -1586,22 +1620,6 @@ void Z4c_class::Step(int lev, int YN)
  }
 #endif
  }
  Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
  // Complete non-blocking error reduction and check
  MPI_Wait(&err_req_pre, MPI_STATUS_IGNORE);
  if (ERROR)
  {
    Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
    SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
                              << ", lev = " << lev << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
  // for black hole position
  if (BH_num > 0 && lev == GH->levels - 1)
@@ -1823,7 +1841,23 @@ void Z4c_class::Step(int lev, int YN)
      Pp = Pp->next;
    }
-    // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
+    // check error information
    {
      int erh = ERROR;
      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    }
    if (ERROR)
    {
      Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count 
                                << " variables at t = " << PhysTime 
                                << ", lev = " << lev << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
    // evolve Shell Patches
    if (lev == 0)
@@ -2069,15 +2103,24 @@ void Z4c_class::Step(int lev, int YN)
        sPp = sPp->next;
      }
    }
-    // Non-blocking error reduction overlapped with Sync to hide Allreduce latency
+    // check error information
    MPI_Request err_req_cor;
    {
      int erh = ERROR;
-      MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
+      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    }
    if (ERROR)
    {
      SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count 
                                << " variables at t = " << PhysTime << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
-    Parallel::AsyncSyncState async_cor;
+    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
    Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
    if (lev == 0)
    {
@@ -2089,8 +2132,8 @@ void Z4c_class::Step(int lev, int YN)
      {
        prev_clock = curr_clock;
        curr_clock = clock();
-        cout << " Shell stuff synchronization used "
+        cout << " Shell stuff synchronization used " 
-             << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
+             << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) 
             << " seconds! " << endl;
      }
    }
@@ -2127,23 +2170,6 @@ void Z4c_class::Step(int lev, int YN)
    }
 // end smooth
 #endif
    Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
    // Complete non-blocking error reduction and check
    MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
    if (ERROR)
    {
      Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
      SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
                                << " variables at t = " << PhysTime
                                << ", lev = " << lev << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
    // for black hole position
    if (BH_num > 0 && lev == GH->levels - 1)
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -730,12 +730,6 @@ void bssn_class::Initialize()
    PhysTime = StartTime;
    Setup_Black_Hole_position();
  }
  // Initialize sync caches (per-level, for predictor and corrector)
  sync_cache_pre = new Parallel::SyncCache[GH->levels];
  sync_cache_cor = new Parallel::SyncCache[GH->levels];
  sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
  sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
 }
 //================================================================================================
@@ -987,32 +981,6 @@ bssn_class::~bssn_class()
  delete Azzz;
 #endif
  // Destroy sync caches before GH
  if (sync_cache_pre)
  {
    for (int i = 0; i < GH->levels; i++)
      sync_cache_pre[i].destroy();
    delete[] sync_cache_pre;
  }
  if (sync_cache_cor)
  {
    for (int i = 0; i < GH->levels; i++)
      sync_cache_cor[i].destroy();
    delete[] sync_cache_cor;
  }
  if (sync_cache_rp_coarse)
  {
    for (int i = 0; i < GH->levels; i++)
      sync_cache_rp_coarse[i].destroy();
    delete[] sync_cache_rp_coarse;
  }
  if (sync_cache_rp_fine)
  {
    for (int i = 0; i < GH->levels; i++)
      sync_cache_rp_fine[i].destroy();
    delete[] sync_cache_rp_fine;
  }
  delete GH;
 #ifdef WithShell
  delete SH;
@@ -2213,7 +2181,6 @@ void bssn_class::Evolve(int Steps)
    GH->Regrid(Symmetry, BH_num, Porgbr, Porg0,
               SynchList_cor, OldStateList, StateList, SynchList_pre,
               fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor);
    for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
 #endif
 #if (REGLEV == 0 && (PSTR == 1 || PSTR == 2))
@@ -2429,7 +2396,6 @@ void bssn_class::RecursiveStep(int lev)
  GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
                      SynchList_cor, OldStateList, StateList, SynchList_pre,
                      fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
  for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
 #endif
 }
@@ -2608,7 +2574,6 @@ void bssn_class::ParallelStep()
  GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
                      SynchList_cor, OldStateList, StateList, SynchList_pre,
                      fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
  for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
 #endif
 }
@@ -2775,7 +2740,6 @@ void bssn_class::ParallelStep()
        GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
                            SynchList_cor, OldStateList, StateList, SynchList_pre,
                            fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor);
        for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
        //               a_stream.clear();
        //               a_stream.str("");
@@ -2790,7 +2754,6 @@ void bssn_class::ParallelStep()
      GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
                          SynchList_cor, OldStateList, StateList, SynchList_pre,
                          fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
      for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
      //               a_stream.clear();
      //               a_stream.str("");
@@ -2809,7 +2772,6 @@ void bssn_class::ParallelStep()
          GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
                              SynchList_cor, OldStateList, StateList, SynchList_pre,
                              fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor);
          for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
          //               a_stream.clear();
          //               a_stream.str("");
@@ -2825,7 +2787,6 @@ void bssn_class::ParallelStep()
          GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
                              SynchList_cor, OldStateList, StateList, SynchList_pre,
                              fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor);
          for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
          //               a_stream.clear();
          //               a_stream.str("");
@@ -3197,7 +3158,21 @@ void bssn_class::Step(int lev, int YN)
    }
    Pp = Pp->next;
  }
-  // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
+  // check error information
  {
    int erh = ERROR;
    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  }
  if (ERROR)
  {
    Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime << ", lev = " << lev << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
 #ifdef WithShell
  // evolve Shell Patches
@@ -3215,9 +3190,9 @@ void bssn_class::Step(int lev, int YN)
        {
 #if (AGM == 0)
          f_enforce_ga(cg->shape,
-                       cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
+                       cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                       cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
-                       cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
+                       cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
                       cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
 #endif
@@ -3341,16 +3316,25 @@ void bssn_class::Step(int lev, int YN)
 #endif
  }
-  // Non-blocking error reduction overlapped with Sync to hide Allreduce latency
+  // check error information
  MPI_Request err_req;
  {
    int erh = ERROR;
-    MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req);
+    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  }
  if (ERROR)
  {
    SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
 #endif
-  Parallel::AsyncSyncState async_pre;
+  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
  Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
 #ifdef WithShell
  if (lev == 0)
@@ -3363,29 +3347,12 @@ void bssn_class::Step(int lev, int YN)
    {
      prev_clock = curr_clock;
      curr_clock = clock();
-      cout << " Shell stuff synchronization used "
+      cout << " Shell stuff synchronization used " 
-           << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
+           << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) 
           << " seconds! " << endl;
    }
  }
 #endif
  Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
 #ifdef WithShell
  // Complete non-blocking error reduction and check
  MPI_Wait(&err_req, MPI_STATUS_IGNORE);
  if (ERROR)
  {
    Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
    SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime << ", lev = " << lev << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
 #endif
 #if (MAPBH == 0)
  // for black hole position
@@ -3561,7 +3528,24 @@ void bssn_class::Step(int lev, int YN)
      Pp = Pp->next;
    }
-    // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
+    // check error information
    {
      int erh = ERROR;
      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    }
    if (ERROR)
    {
      Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count 
                                << " variables at t = " << PhysTime 
                                << ", lev = " << lev << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
 #ifdef WithShell
    // evolve Shell Patches
@@ -3579,9 +3563,9 @@ void bssn_class::Step(int lev, int YN)
          {
 #if (AGM == 0)
            f_enforce_ga(cg->shape,
-                         cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
+                         cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
                         cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
-                         cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
+                         cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], 
                         cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
 #elif (AGM == 1)
            if (iter_count == 3)
@@ -3701,16 +3685,26 @@ void bssn_class::Step(int lev, int YN)
        sPp = sPp->next;
      }
    }
-    // Non-blocking error reduction overlapped with Sync to hide Allreduce latency
+    // check error information
    MPI_Request err_req_cor;
    {
      int erh = ERROR;
-      MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
+      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    }
    if (ERROR)
    {
      SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" 
                                << iter_count << " variables at t = " 
                                << PhysTime << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
 #endif
-    Parallel::AsyncSyncState async_cor;
+    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
    Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
 #ifdef WithShell
    if (lev == 0)
@@ -3723,31 +3717,12 @@ void bssn_class::Step(int lev, int YN)
      {
        prev_clock = curr_clock;
        curr_clock = clock();
-        cout << " Shell stuff synchronization used "
+        cout << " Shell stuff synchronization used " 
-             << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
+             << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) 
             << " seconds! " << endl;
      }
    }
 #endif
    Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
 #ifdef WithShell
    // Complete non-blocking error reduction and check
    MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
    if (ERROR)
    {
      Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
      SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
                                << " variables at t = " << PhysTime
                                << ", lev = " << lev << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
 #endif
 #if (MAPBH == 0)
    // for black hole position
@@ -4059,7 +4034,22 @@ void bssn_class::Step(int lev, int YN)
    }
    Pp = Pp->next;
  }
-  // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
+  // check error information
  {
    int erh = ERROR;
    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  }
  if (ERROR)
  {
    Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime 
                              << ", lev = " << lev << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
 #ifdef WithShell
  // evolve Shell Patches
@@ -4077,15 +4067,15 @@ void bssn_class::Step(int lev, int YN)
        {
 #if (AGM == 0)
          f_enforce_ga(cg->shape,
-                       cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
+                       cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                       cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
-                       cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
+                       cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
                       cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
 #endif
          if (f_compute_rhs_bssn_ss(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
-                                    cg->fgfs[fngfs + ShellPatch::gx],
+                                    cg->fgfs[fngfs + ShellPatch::gx], 
-                                    cg->fgfs[fngfs + ShellPatch::gy],
+                                    cg->fgfs[fngfs + ShellPatch::gy], 
                                    cg->fgfs[fngfs + ShellPatch::gz],
                                    cg->fgfs[fngfs + ShellPatch::drhodx], 
                                    cg->fgfs[fngfs + ShellPatch::drhody], 
@@ -4200,16 +4190,25 @@ void bssn_class::Step(int lev, int YN)
  }
 #endif
  }
-  // Non-blocking error reduction overlapped with Sync to hide Allreduce latency
+  // check error information
  MPI_Request err_req;
  {
    int erh = ERROR;
-    MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req);
+    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  }
  if (ERROR)
  {
    SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " 
                              << PhysTime << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
 #endif
-  Parallel::AsyncSyncState async_pre;
+  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
  Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
 #ifdef WithShell
  if (lev == 0)
@@ -4222,27 +4221,9 @@ void bssn_class::Step(int lev, int YN)
    {
      prev_clock = curr_clock;
      curr_clock = clock();
-      cout << " Shell stuff synchronization used "
+      cout << " Shell stuff synchronization used " 
-           << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
+      << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) 
-           << " seconds! " << endl;
+      << " seconds! " << endl;
    }
  }
 #endif
  Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
 #ifdef WithShell
  // Complete non-blocking error reduction and check
  MPI_Wait(&err_req, MPI_STATUS_IGNORE);
  if (ERROR)
  {
    Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
    SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
                              << ", lev = " << lev << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
 #endif
@@ -4405,7 +4386,23 @@ void bssn_class::Step(int lev, int YN)
      Pp = Pp->next;
    }
-    // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
+    // check error information
    {
      int erh = ERROR;
      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    }
    if (ERROR)
    {
      Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count 
                                << " variables at t = " << PhysTime 
                                << ", lev = " << lev << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
 #ifdef WithShell
    // evolve Shell Patches
@@ -4423,9 +4420,9 @@ void bssn_class::Step(int lev, int YN)
          {
 #if (AGM == 0)
            f_enforce_ga(cg->shape,
-                         cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
+                         cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
                         cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
-                         cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
+                         cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], 
                         cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
 #elif (AGM == 1)
            if (iter_count == 3)
@@ -4545,16 +4542,25 @@ void bssn_class::Step(int lev, int YN)
        sPp = sPp->next;
      }
    }
-    // Non-blocking error reduction overlapped with Sync to hide Allreduce latency
+    // check error information
    MPI_Request err_req_cor;
    {
      int erh = ERROR;
-      MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
+      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    }
    if (ERROR)
    {
      SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count 
                                << " variables at t = " << PhysTime << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
 #endif
-    Parallel::AsyncSyncState async_cor;
+    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
    Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
 #ifdef WithShell
    if (lev == 0)
@@ -4567,30 +4573,11 @@ void bssn_class::Step(int lev, int YN)
      {
        prev_clock = curr_clock;
        curr_clock = clock();
-        cout << " Shell stuff synchronization used "
+        cout << " Shell stuff synchronization used " 
-             << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
+             << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) 
             << " seconds! " << endl;
      }
    }
 #endif
    Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
 #ifdef WithShell
    // Complete non-blocking error reduction and check
    MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
    if (ERROR)
    {
      Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
      SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
                                << " variables at t = " << PhysTime
                                << ", lev = " << lev << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
 #endif
    // for black hole position
    if (BH_num > 0 && lev == GH->levels - 1)
@@ -4956,19 +4943,11 @@ void bssn_class::Step(int lev, int YN)
  //   misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Predictor rhs calculation");
-  // Non-blocking error reduction overlapped with Sync to hide Allreduce latency
+  // check error information
  MPI_Request err_req;
  {
    int erh = ERROR;
-    MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev], &err_req);
+    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev]);
  }
  //   misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor sync");
  Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
  // Complete non-blocking error reduction and check
  MPI_Wait(&err_req, MPI_STATUS_IGNORE);
  if (ERROR)
  {
    Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
@@ -4980,6 +4959,10 @@ void bssn_class::Step(int lev, int YN)
    }
  }
  //   misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor sync");
  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
 #if (MAPBH == 0)
  // for black hole position
  if (BH_num > 0 && lev == GH->levels - 1)
@@ -5157,34 +5140,30 @@ void bssn_class::Step(int lev, int YN)
    //   misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector error check");
-    // Non-blocking error reduction overlapped with Sync to hide Allreduce latency
+    // check error information
    MPI_Request err_req_cor;
    {
      int erh = ERROR;
-      MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev], &err_req_cor);
+      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev]);
    }
    //    misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector sync");
    Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
    //    misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Corrector sync");
    // Complete non-blocking error reduction and check
    MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
    if (ERROR)
    {
      Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
-          ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
+          ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count 
-                                << " variables at t = " << PhysTime
+                                << " variables at t = " << PhysTime 
                                << ", lev = " << lev << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
    //    misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector sync");
    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
    //    misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Corrector sync");
 #if (MAPBH == 0)
    // for black hole position
    if (BH_num > 0 && lev == GH->levels - 1)
@@ -5468,11 +5447,21 @@ void bssn_class::SHStep()
 #if (PSTR == 1 || PSTR == 2)
 //   misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor's error check");
 #endif
-  // Non-blocking error reduction overlapped with Synch to hide Allreduce latency
+  // check error information
  MPI_Request err_req;
  {
    int erh = ERROR;
-    MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req);
+    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  }
  if (ERROR)
  {
    SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
  {
@@ -5484,25 +5473,12 @@ void bssn_class::SHStep()
    {
      prev_clock = curr_clock;
      curr_clock = clock();
-      cout << " Shell stuff synchronization used "
+      cout << " Shell stuff synchronization used " 
-           << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
+           << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) 
           << " seconds! " << endl;
    }
  }
  // Complete non-blocking error reduction and check
  MPI_Wait(&err_req, MPI_STATUS_IGNORE);
  if (ERROR)
  {
    SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
  // corrector
  for (iter_count = 1; iter_count < 4; iter_count++)
  {
@@ -5645,11 +5621,21 @@ void bssn_class::SHStep()
        sPp = sPp->next;
      }
    }
-    // Non-blocking error reduction overlapped with Synch to hide Allreduce latency
+    // check error information
    MPI_Request err_req_cor;
    {
      int erh = ERROR;
-      MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
+      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    }
    if (ERROR)
    {
      SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count 
                                << " variables at t = " << PhysTime << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
    {
@@ -5661,26 +5647,12 @@ void bssn_class::SHStep()
      {
        prev_clock = curr_clock;
        curr_clock = clock();
-        cout << " Shell stuff synchronization used "
+        cout << " Shell stuff synchronization used " 
-             << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
+             << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) 
             << " seconds! " << endl;
      }
    }
    // Complete non-blocking error reduction and check
    MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
    if (ERROR)
    {
      SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count
                                << " variables at t = " << PhysTime << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
    sPp = SH->PatL;
    while (sPp)
    {
@@ -5809,7 +5781,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 //       misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
 #endif
-      Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
+      Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
 #if (PSTR == 1 || PSTR == 2)
 //       a_stream.clear();
@@ -5819,11 +5791,21 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry);
@@ -5860,7 +5842,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 //       misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
 #endif
-      Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]);
+      Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry);
 #if (PSTR == 1 || PSTR == 2)
 //       a_stream.clear();
@@ -5870,11 +5852,21 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SL, SL, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SL, SL, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry);
@@ -5888,7 +5880,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif
    }
-    Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]);
+    Parallel::Sync(GH->PatL[lev], SL, Symmetry);
 #if (PSTR == 1 || PSTR == 2)
 //    a_stream.clear();
@@ -5946,14 +5938,24 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
 #endif
-      Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
+      Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry);
@@ -5968,21 +5970,31 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
 #endif
-      Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]);
+      Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry);
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SL, SL, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SL, SL, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry);
 #endif
    }
-    Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]);
+    Parallel::Sync(GH->PatL[lev], SL, Symmetry);
  }
 }
@@ -6033,14 +6045,24 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, GH->rsul[lev], Symmetry);
 #endif
-      Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
+      Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SynchList_cor,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6057,21 +6079,31 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, GH->rsul[lev], Symmetry);
 #endif
-      Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
+      Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry);
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],StateList,SynchList_cor,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry);
 #endif
    }
-    Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
+    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
  }
 }
@@ -6101,11 +6133,21 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
      }
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SynchList_cor,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6114,11 +6156,21 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
    else // no time refinement levels and for all same time levels
    {
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],StateList,SynchList_cor,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6134,10 +6186,10 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
 #else
      Parallel::Restrict_after(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry);
 #endif
-      Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
+      Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry);
    }
-    Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
+    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
  }
 }
 #undef MIXOUTB
--- a/AMSS_NCKU_source/bssn_class.h
+++ b/AMSS_NCKU_source/bssn_class.h
@@ -126,11 +126,6 @@ public:
       MyList<var> *OldStateList, *DumpList;
       MyList<var> *ConstraintList;
       Parallel::SyncCache *sync_cache_pre;  // per-level cache for predictor sync
       Parallel::SyncCache *sync_cache_cor;  // per-level cache for corrector sync
       Parallel::SyncCache *sync_cache_rp_coarse;  // RestrictProlong sync on PatL[lev-1]
       Parallel::SyncCache *sync_cache_rp_fine;    // RestrictProlong sync on PatL[lev]
       monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
       monitor *ConVMonitor;
       surface_integral *Waveshell;
--- a/AMSS_NCKU_source/bssn_rhs.f90
+++ b/AMSS_NCKU_source/bssn_rhs.f90
--- a/AMSS_NCKU_source/bssn_rhs_legacy.f90
+++ b/AMSS_NCKU_source/bssn_rhs_legacy.f90
--- a/AMSS_NCKU_source/bssn_rhs_opt.f90
+++ b/AMSS_NCKU_source/bssn_rhs_opt.f90
--- a/AMSS_NCKU_source/enforce_algebra.f90
+++ b/AMSS_NCKU_source/enforce_algebra.f90
@@ -18,61 +18,49 @@
  real*8, dimension(ex(1),ex(2),ex(3)), intent(inout) :: Ayy,Ayz,Azz
 !~~~~~~~> Local variable:
-
+  
-  integer :: i,j,k
+  real*8, dimension(ex(1),ex(2),ex(3)) :: trA,detg
-  real*8 :: lgxx,lgyy,lgzz,ldetg
+  real*8, dimension(ex(1),ex(2),ex(3)) :: gxx,gyy,gzz 
-  real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz
+  real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz,gupyy,gupyz,gupzz
  real*8 :: ltrA,lscale
  real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0
 !~~~~~~>
-  do k=1,ex(3)
+  gxx = dxx + ONE
-  do j=1,ex(2)
+  gyy = dyy + ONE
-  do i=1,ex(1)
+  gzz = dzz + ONE
-    lgxx = dxx(i,j,k) + ONE
+  detg =  gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - &
-    lgyy = dyy(i,j,k) + ONE
+          gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz
-    lgzz = dzz(i,j,k) + ONE
+  gupxx =   ( gyy * gzz - gyz * gyz ) / detg
  gupxy = - ( gxy * gzz - gyz * gxz ) / detg
  gupxz =   ( gxy * gyz - gyy * gxz ) / detg
  gupyy =   ( gxx * gzz - gxz * gxz ) / detg
  gupyz = - ( gxx * gyz - gxy * gxz ) / detg
  gupzz =   ( gxx * gyy - gxy * gxy ) / detg
-    ldetg =  lgxx * lgyy * lgzz &
+  trA =         gupxx * Axx + gupyy * Ayy + gupzz * Azz &
-           + gxy(i,j,k) * gyz(i,j,k) * gxz(i,j,k) &
+       + TWO * (gupxy * Axy + gupxz * Axz + gupyz * Ayz)
           + gxz(i,j,k) * gxy(i,j,k) * gyz(i,j,k) &
           - gxz(i,j,k) * lgyy * gxz(i,j,k) &
           - gxy(i,j,k) * gxy(i,j,k) * lgzz &
           - lgxx * gyz(i,j,k) * gyz(i,j,k)
-    lgupxx =   ( lgyy * lgzz - gyz(i,j,k) * gyz(i,j,k) ) / ldetg
+  Axx = Axx - F1o3 * gxx * trA
-    lgupxy = - ( gxy(i,j,k) * lgzz - gyz(i,j,k) * gxz(i,j,k) ) / ldetg
+  Axy = Axy - F1o3 * gxy * trA
-    lgupxz =   ( gxy(i,j,k) * gyz(i,j,k) - lgyy * gxz(i,j,k) ) / ldetg
+  Axz = Axz - F1o3 * gxz * trA
-    lgupyy =   ( lgxx * lgzz - gxz(i,j,k) * gxz(i,j,k) ) / ldetg
+  Ayy = Ayy - F1o3 * gyy * trA
-    lgupyz = - ( lgxx * gyz(i,j,k) - gxy(i,j,k) * gxz(i,j,k) ) / ldetg
+  Ayz = Ayz - F1o3 * gyz * trA
-    lgupzz =   ( lgxx * lgyy - gxy(i,j,k) * gxy(i,j,k) ) / ldetg
+  Azz = Azz - F1o3 * gzz * trA
-    ltrA =         lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) &
+  detg = ONE / ( detg ** F1o3 ) 
-                 + lgupzz * Azz(i,j,k) &
+  
-         + TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) &
+  gxx = gxx * detg
-                 + lgupyz * Ayz(i,j,k))
+  gxy = gxy * detg
  gxz = gxz * detg
  gyy = gyy * detg
  gyz = gyz * detg
  gzz = gzz * detg
-    Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA
+  dxx = gxx - ONE
-    Axy(i,j,k) = Axy(i,j,k) - F1o3 * gxy(i,j,k) * ltrA
+  dyy = gyy - ONE
-    Axz(i,j,k) = Axz(i,j,k) - F1o3 * gxz(i,j,k) * ltrA
+  dzz = gzz - ONE
    Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA
    Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * gyz(i,j,k) * ltrA
    Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA
    lscale = ONE / ( ldetg ** F1o3 )
    dxx(i,j,k) = lgxx * lscale - ONE
    gxy(i,j,k) = gxy(i,j,k) * lscale
    gxz(i,j,k) = gxz(i,j,k) * lscale
    dyy(i,j,k) = lgyy * lscale - ONE
    gyz(i,j,k) = gyz(i,j,k) * lscale
    dzz(i,j,k) = lgzz * lscale - ONE
  enddo
  enddo
  enddo
  return
@@ -94,71 +82,51 @@
  real*8, dimension(ex(1),ex(2),ex(3)), intent(inout) :: Ayy,Ayz,Azz
 !~~~~~~~> Local variable:
-
+  
-  integer :: i,j,k
+  real*8, dimension(ex(1),ex(2),ex(3)) :: trA
-  real*8 :: lgxx,lgyy,lgzz,lscale
+  real*8, dimension(ex(1),ex(2),ex(3)) :: gxx,gyy,gzz 
-  real*8 :: lgxy,lgxz,lgyz
+  real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz,gupyy,gupyz,gupzz
  real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz
  real*8 :: ltrA
  real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0
 !~~~~~~>
-  do k=1,ex(3)
+  gxx = dxx + ONE
-  do j=1,ex(2)
+  gyy = dyy + ONE
-  do i=1,ex(1)
+  gzz = dzz + ONE
 ! for g
  gupzz =  gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - &
           gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz
-! for g: normalize determinant first
+  gupzz = ONE / ( gupzz ** F1o3 ) 
-    lgxx = dxx(i,j,k) + ONE
+  
-    lgyy = dyy(i,j,k) + ONE
+  gxx = gxx * gupzz
-    lgzz = dzz(i,j,k) + ONE
+  gxy = gxy * gupzz
-    lgxy = gxy(i,j,k)
+  gxz = gxz * gupzz
-    lgxz = gxz(i,j,k)
+  gyy = gyy * gupzz
-    lgyz = gyz(i,j,k)
+  gyz = gyz * gupzz
  gzz = gzz * gupzz
-    lscale =  lgxx * lgyy * lgzz + lgxy * lgyz * lgxz &
+  dxx = gxx - ONE
-            + lgxz * lgxy * lgyz - lgxz * lgyy * lgxz &
+  dyy = gyy - ONE
-            - lgxy * lgxy * lgzz - lgxx * lgyz * lgyz
+  dzz = gzz - ONE
 ! for A  
-    lscale = ONE / ( lscale ** F1o3 )
+  gupxx =   ( gyy * gzz - gyz * gyz )
  gupxy = - ( gxy * gzz - gyz * gxz )
  gupxz =   ( gxy * gyz - gyy * gxz )
  gupyy =   ( gxx * gzz - gxz * gxz )
  gupyz = - ( gxx * gyz - gxy * gxz )
  gupzz =   ( gxx * gyy - gxy * gxy )
-    lgxx = lgxx * lscale
+  trA =         gupxx * Axx + gupyy * Ayy + gupzz * Azz &
-    lgxy = lgxy * lscale
+       + TWO * (gupxy * Axy + gupxz * Axz + gupyz * Ayz)
    lgxz = lgxz * lscale
    lgyy = lgyy * lscale
    lgyz = lgyz * lscale
    lgzz = lgzz * lscale
-    dxx(i,j,k) = lgxx - ONE
+  Axx = Axx - F1o3 * gxx * trA
-    gxy(i,j,k) = lgxy
+  Axy = Axy - F1o3 * gxy * trA
-    gxz(i,j,k) = lgxz
+  Axz = Axz - F1o3 * gxz * trA
-    dyy(i,j,k) = lgyy - ONE
+  Ayy = Ayy - F1o3 * gyy * trA
-    gyz(i,j,k) = lgyz
+  Ayz = Ayz - F1o3 * gyz * trA
-    dzz(i,j,k) = lgzz - ONE
+  Azz = Azz - F1o3 * gzz * trA
 ! for A: trace-free using normalized metric (det=1, no division needed)
    lgupxx =   ( lgyy * lgzz - lgyz * lgyz )
    lgupxy = - ( lgxy * lgzz - lgyz * lgxz )
    lgupxz =   ( lgxy * lgyz - lgyy * lgxz )
    lgupyy =   ( lgxx * lgzz - lgxz * lgxz )
    lgupyz = - ( lgxx * lgyz - lgxy * lgxz )
    lgupzz =   ( lgxx * lgyy - lgxy * lgxy )
    ltrA =         lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) &
                 + lgupzz * Azz(i,j,k) &
         + TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) &
                 + lgupyz * Ayz(i,j,k))
    Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA
    Axy(i,j,k) = Axy(i,j,k) - F1o3 * lgxy * ltrA
    Axz(i,j,k) = Axz(i,j,k) - F1o3 * lgxz * ltrA
    Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA
    Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * lgyz * ltrA
    Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA
  enddo
  enddo
  enddo
  return
--- a/AMSS_NCKU_source/fmisc.f90
+++ b/AMSS_NCKU_source/fmisc.f90
@@ -324,6 +324,7 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
  integer::i
  funcc = 0.d0
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -349,6 +350,7 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
  integer::i
  funcc = 0.d0
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -377,6 +379,7 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA)
  integer::i
  funcc = 0.d0
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -883,6 +886,7 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
  integer::i
  funcc = 0.d0
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
@@ -908,6 +912,7 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
  integer::i
  funcc = 0.d0
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
@@ -936,6 +941,7 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA)
  integer::i
  funcc = 0.d0
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
@@ -1112,65 +1118,64 @@ end subroutine d2dump
 ! Lagrangian polynomial interpolation
 !------------------------------------------------------------------------------
-  subroutine polint(xa, ya, x, y, dy, ordn)
+  subroutine polint(xa,ya,x,y,dy,ordn)
  implicit none
-  integer, intent(in) :: ordn
+!~~~~~~> Input Parameter:
-  real*8, dimension(ordn), intent(in) :: xa, ya
+  integer,intent(in) :: ordn
  real*8, dimension(ordn), intent(in) :: xa,ya
  real*8, intent(in) :: x
-  real*8, intent(out) :: y, dy
+  real*8, intent(out) :: y,dy
-  integer :: i, m, ns, n_m
+!~~~~~~> Other parameter:
  real*8, dimension(ordn) :: c, d, ho
  real*8 :: dif, dift, hp, h, den_val
-  c = ya
+  integer :: m,n,ns
-  d = ya
+  real*8, dimension(ordn) :: c,d,den,ho
-  ho = xa - x
+  real*8 :: dif,dift
-  ns = 1
+!~~~~~~>
  dif = abs(x - xa(1))
-  do i = 2, ordn
+  n=ordn
-    dift = abs(x - xa(i))
+  m=ordn
-    if (dift < dif) then
+
-      ns = i
+  c=ya
-      dif = dift
+  d=ya
-    end if
+  ho=xa-x
  ns=1
  dif=abs(x-xa(1))
  do m=1,n
   dift=abs(x-xa(m))
   if(dift < dif) then
    ns=m
    dif=dift
   end if
  end do
-  y = ya(ns)
+  y=ya(ns)
-  ns = ns - 1
+  ns=ns-1
-
+  do m=1,n-1
-  do m = 1, ordn - 1
+    den(1:n-m)=ho(1:n-m)-ho(1+m:n)
-    n_m = ordn - m
+    if (any(den(1:n-m) == 0.0))then
-    do i = 1, n_m
+      write(*,*) 'failure in polint for point',x
-      hp = ho(i)
+      write(*,*) 'with input points: ',xa
-      h  = ho(i+m)
+      stop
-      den_val = hp - h
+    endif
-
+    den(1:n-m)=(c(2:n-m+1)-d(1:n-m))/den(1:n-m)
-      if (den_val == 0.0d0) then
+    d(1:n-m)=ho(1+m:n)*den(1:n-m)
-        write(*,*) 'failure in polint for point',x
+    c(1:n-m)=ho(1:n-m)*den(1:n-m)
-        write(*,*) 'with input points: ',xa
+    if (2*ns < n-m) then
-        stop
+      dy=c(ns+1)
      end if
      den_val = (c(i+1) - d(i)) / den_val
      d(i) = h * den_val
      c(i) = hp * den_val
    end do
    if (2 * ns < n_m) then
      dy = c(ns + 1)
    else
-      dy = d(ns)
+      dy=d(ns)
-      ns = ns - 1
+      ns=ns-1
    end if
-    y = y + dy
+    y=y+dy
  end do
  return
  end subroutine polint
 !------------------------------------------------------------------------------
 !
@@ -1178,37 +1183,35 @@ end subroutine d2dump
 !
 !------------------------------------------------------------------------------
  subroutine polin2(x1a,x2a,ya,x1,x2,y,dy,ordn)
  implicit none
 !~~~~~~> Input parameters:
  integer,intent(in) :: ordn
  real*8, dimension(1:ordn), intent(in) :: x1a,x2a
  real*8, dimension(1:ordn,1:ordn), intent(in) :: ya
  real*8, intent(in) :: x1,x2
  real*8, intent(out) :: y,dy
-#ifdef POLINT_LEGACY_ORDER
+!~~~~~~> Other parameters:
  integer  :: i,m
  real*8, dimension(ordn) :: ymtmp
  real*8, dimension(ordn) :: yntmp
  m=size(x1a)
  do i=1,m
    yntmp=ya(i,:)
    call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
  end do
  call polint(x1a,ymtmp,x1,y,dy,ordn)
 #else
  integer  :: j
  real*8, dimension(ordn) :: ymtmp
  real*8 :: dy_temp
  do j=1,ordn
    call polint(x1a, ya(:,j), x1, ymtmp(j), dy_temp, ordn)
  end do
-  call polint(x2a, ymtmp, x2, y, dy, ordn)
+
-#endif
+  call polint(x1a,ymtmp,x1,y,dy,ordn)
  return
  end subroutine polin2
 !------------------------------------------------------------------------------
 !
@@ -1216,15 +1219,18 @@ end subroutine d2dump
 !
 !------------------------------------------------------------------------------
  subroutine polin3(x1a,x2a,x3a,ya,x1,x2,x3,y,dy,ordn)
  implicit none
 !~~~~~~> Input parameters:
  integer,intent(in) :: ordn
  real*8, dimension(1:ordn), intent(in) :: x1a,x2a,x3a
  real*8, dimension(1:ordn,1:ordn,1:ordn), intent(in) :: ya
  real*8, intent(in) :: x1,x2,x3
  real*8, intent(out) :: y,dy
-#ifdef POLINT_LEGACY_ORDER
+!~~~~~~> Other parameters:
  integer  :: i,j,m,n
  real*8, dimension(ordn,ordn) :: yatmp
  real*8, dimension(ordn) :: ymtmp
@@ -1233,33 +1239,24 @@ end subroutine d2dump
  m=size(x1a)
  n=size(x2a)
  do i=1,m
   do j=1,n
    yqtmp=ya(i,j,:)
    call polint(x3a,yqtmp,x3,yatmp(i,j),dy,ordn)
   end do
    yntmp=yatmp(i,:)
    call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
  end do
  call polint(x1a,ymtmp,x1,y,dy,ordn)
 #else
  integer  :: j, k
  real*8, dimension(ordn,ordn) :: yatmp
  real*8, dimension(ordn) :: ymtmp
  real*8 :: dy_temp
  do k=1,ordn
    do j=1,ordn
      call polint(x1a, ya(:,j,k), x1, yatmp(j,k), dy_temp, ordn)
    end do
  end do
-  do k=1,ordn
+
-    call polint(x2a, yatmp(:,k), x2, ymtmp(k), dy_temp, ordn)
+  call polint(x1a,ymtmp,x1,y,dy,ordn)
  end do
  call polint(x3a, ymtmp, x3, y, dy, ordn)
 #endif
  return
  end subroutine polin3
 !--------------------------------------------------------------------------------------
 ! calculate L2norm
--- a/AMSS_NCKU_source/lopsidediff.f90
+++ b/AMSS_NCKU_source/lopsidediff.f90
@@ -487,201 +487,6 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
  end subroutine lopsided
 !-----------------------------------------------------------------------------
 ! Combined advection (lopsided) + Kreiss-Oliger dissipation (kodis)
 ! Shares the symmetry_bd buffer fh, eliminating one full-grid copy per call.
 ! Mathematically identical to calling lopsided then kodis separately.
 !-----------------------------------------------------------------------------
 subroutine lopsided_kodis(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA,eps)
 ! Purpose:
 !   Accumulates two contributions into f_rhs in a single pass over one
 !   ghost-extended copy (fh) of the field f:
 !     1. an advection term Sfx*df/dx + Sfy*df/dy + Sfz*df/dz, where the
 !        finite-difference stencil is chosen per point from the sign of the
 !        advecting coefficient and the distance to the index bounds;
 !     2. if eps > 0, a Kreiss-Oliger dissipation term built from a 7-point
 !        stencil in each direction.
 !
 ! Arguments:
 !   ex        - number of grid points in x, y, z
 !   X,Y,Z     - coordinate arrays; only the first two entries of each are
 !               used, to obtain the spacing (assumes uniform spacing - TODO confirm)
 !   f         - field being differentiated (read only)
 !   f_rhs     - right-hand side, updated in place (terms are added to it)
 !   Sfx,Sfy,Sfz - advecting coefficients in x, y, z
 !               (presumably the shift-vector components; verify against caller)
 !   Symmetry  - 0 = NO_SYMM, 1 = EQ_SYMM, 2 = OCTANT (see parameters below)
 !   SoA       - passed through unchanged to symmetry_bd
 !   eps       - dissipation coefficient; the dissipation loop is skipped
 !               entirely unless eps > 0
  implicit none
 !~~~~~~> Input parameters:
  integer, intent(in)  :: ex(1:3),Symmetry
  real*8,  intent(in)  :: X(1:ex(1)),Y(1:ex(2)),Z(1:ex(3))
  real*8,dimension(ex(1),ex(2),ex(3)),intent(in)   :: f,Sfx,Sfy,Sfz
  real*8,dimension(ex(1),ex(2),ex(3)),intent(inout):: f_rhs
  real*8,dimension(3),intent(in) ::SoA
  real*8,intent(in) :: eps
 !~~~~~~> local variables:
 ! note index -2,-1,0, so we have 3 extra points
  real*8,dimension(-2:ex(1),-2:ex(2),-2:ex(3))   :: fh
  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
  real*8 :: dX,dY,dZ
  real*8 :: d12dx,d12dy,d12dz,d2dx,d2dy,d2dz
  real*8,  parameter :: ZEO=0.d0,ONE=1.d0, F3=3.d0
  real*8,  parameter :: TWO=2.d0,F6=6.0d0,F18=1.8d1
  real*8,  parameter :: F12=1.2d1, F10=1.d1,EIT=8.d0
  integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
 ! kodis parameters
  real*8, parameter :: SIX=6.d0,FIT=1.5d1,TWT=2.d1
  real*8, parameter :: cof=6.4d1   ! 2^6
 ! Grid spacing is taken from the first two coordinates in each direction.
  dX = X(2)-X(1)
  dY = Y(2)-Y(1)
  dZ = Z(2)-Z(1)
 ! Common prefactors 1/(12*h) used by every advection stencil below.
  d12dx = ONE/F12/dX
  d12dy = ONE/F12/dY
  d12dz = ONE/F12/dZ
 ! NOTE(review): d2dx, d2dy, d2dz are assigned here but not referenced
 ! anywhere else in this routine.
  d2dx = ONE/TWO/dX
  d2dy = ONE/TWO/dY
  d2dz = ONE/TWO/dZ
 ! Index bounds used to decide which stencil fits at a given point.
  imax = ex(1)
  jmax = ex(2)
  kmax = ex(3)
  imin = 1
  jmin = 1
  kmin = 1
 ! When a symmetry is active and the first coordinate lies within one grid
 ! spacing of zero, the lower bound is moved to -2 so stencils may reach into
 ! the extra fh entries at indices -2,-1,0. Z is extended for any symmetry
 ! (Symmetry > NO_SYMM); X and Y only for Symmetry > EQ_SYMM.
  if(Symmetry > NO_SYMM .and. dabs(Z(1)) < dZ) kmin = -2
  if(Symmetry > EQ_SYMM .and. dabs(X(1)) < dX) imin = -2
  if(Symmetry > EQ_SYMM .and. dabs(Y(1)) < dY) jmin = -2
 ! Single symmetry_bd call shared by both advection and dissipation
  call symmetry_bd(3,ex,f,fh,SoA)
 ! ---- Advection (lopsided) loop ----
 ! upper bound set ex-1 only for efficiency, 
 ! the loop body will set ex 0 also
 ! NOTE(review): with these bounds, points having i=ex(1), j=ex(2) or k=ex(3)
 ! are never visited, so they receive no advection contribution here.
 !
 ! In each direction the branch structure is the same:
 !   coefficient > 0 : try the stencil biased toward +index (needs index+3),
 !                     else the centered 5-point stencil (needs index+2),
 !                     else the stencil biased toward -index;
 !   coefficient < 0 : the mirror image, tested against the lower bound;
 !   coefficient = 0 : nothing is added for that direction.
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
 ! x direction   
    if(Sfx(i,j,k) > ZEO)then
      if(i+3 <= imax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfx(i,j,k)*d12dx*(-F3*fh(i-1,j,k)-F10*fh(i,j,k)+F18*fh(i+1,j,k) &
                                    -F6*fh(i+2,j,k)+    fh(i+3,j,k))
     elseif(i+2 <= imax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfx(i,j,k)*d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
     elseif(i+1 <= imax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfx(i,j,k)*d12dx*(-F3*fh(i+1,j,k)-F10*fh(i,j,k)+F18*fh(i-1,j,k) &
                                    -F6*fh(i-2,j,k)+    fh(i-3,j,k))
     endif
   elseif(Sfx(i,j,k) < ZEO)then
      if(i-3 >= imin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfx(i,j,k)*d12dx*(-F3*fh(i+1,j,k)-F10*fh(i,j,k)+F18*fh(i-1,j,k) &
                                    -F6*fh(i-2,j,k)+    fh(i-3,j,k))
     elseif(i-2 >= imin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfx(i,j,k)*d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
     elseif(i-1 >= imin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfx(i,j,k)*d12dx*(-F3*fh(i-1,j,k)-F10*fh(i,j,k)+F18*fh(i+1,j,k) &
                                    -F6*fh(i+2,j,k)+    fh(i+3,j,k))
     endif
   endif
 ! y direction   
    if(Sfy(i,j,k) > ZEO)then
      if(j+3 <= jmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfy(i,j,k)*d12dy*(-F3*fh(i,j-1,k)-F10*fh(i,j,k)+F18*fh(i,j+1,k) &
                                    -F6*fh(i,j+2,k)+    fh(i,j+3,k))
     elseif(j+2 <= jmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfy(i,j,k)*d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
     elseif(j+1 <= jmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfy(i,j,k)*d12dy*(-F3*fh(i,j+1,k)-F10*fh(i,j,k)+F18*fh(i,j-1,k) &
                                    -F6*fh(i,j-2,k)+    fh(i,j-3,k))
     endif
   elseif(Sfy(i,j,k) < ZEO)then
      if(j-3 >= jmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfy(i,j,k)*d12dy*(-F3*fh(i,j+1,k)-F10*fh(i,j,k)+F18*fh(i,j-1,k) &
                                    -F6*fh(i,j-2,k)+    fh(i,j-3,k))
     elseif(j-2 >= jmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfy(i,j,k)*d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
     elseif(j-1 >= jmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfy(i,j,k)*d12dy*(-F3*fh(i,j-1,k)-F10*fh(i,j,k)+F18*fh(i,j+1,k) &
                                    -F6*fh(i,j+2,k)+    fh(i,j+3,k))
     endif
   endif
 ! z direction   
    if(Sfz(i,j,k) > ZEO)then
      if(k+3 <= kmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k-1)-F10*fh(i,j,k)+F18*fh(i,j,k+1) &
                                    -F6*fh(i,j,k+2)+    fh(i,j,k+3))
     elseif(k+2 <= kmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfz(i,j,k)*d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
     elseif(k+1 <= kmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k+1)-F10*fh(i,j,k)+F18*fh(i,j,k-1) &
                                    -F6*fh(i,j,k-2)+    fh(i,j,k-3))
     endif
   elseif(Sfz(i,j,k) < ZEO)then
      if(k-3 >= kmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k+1)-F10*fh(i,j,k)+F18*fh(i,j,k-1) &
                                    -F6*fh(i,j,k-2)+    fh(i,j,k-3))
     elseif(k-2 >= kmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfz(i,j,k)*d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
     elseif(k-1 >= kmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k-1)-F10*fh(i,j,k)+F18*fh(i,j,k+1) &
                                    -F6*fh(i,j,k+2)+    fh(i,j,k+3))
     endif
   endif
  enddo
  enddo
  enddo
 ! ---- Dissipation (kodis) loop ----
 ! Runs over the full index range 1..ex, but a point is updated only when the
 ! 7-point stencil (offsets -3..+3) fits inside [min,max] in all three
 ! directions at once; other points are left untouched by this loop.
 ! The added term is eps/cof times the sum over directions of
 !   (f(-3)+f(+3)) - 6*(f(-2)+f(+2)) + 15*(f(-1)+f(+1)) - 20*f(0),
 ! each divided by the grid spacing of that direction.
  if(eps > ZEO) then
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
  if(i-3 >= imin .and. i+3 <= imax .and. &
     j-3 >= jmin .and. j+3 <= jmax .and. &
     k-3 >= kmin .and. k+3 <= kmax) then
   f_rhs(i,j,k)       = f_rhs(i,j,k) + eps/cof *( (     &
                              (fh(i-3,j,k)+fh(i+3,j,k)) - &
                          SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + &
                          FIT*(fh(i-1,j,k)+fh(i+1,j,k)) - &
                          TWT* fh(i,j,k)            )/dX + &
                                                  (     &
                              (fh(i,j-3,k)+fh(i,j+3,k)) - &
                          SIX*(fh(i,j-2,k)+fh(i,j+2,k)) + &
                          FIT*(fh(i,j-1,k)+fh(i,j+1,k)) - &
                          TWT* fh(i,j,k)            )/dY + &
                                                  (     &
                              (fh(i,j,k-3)+fh(i,j,k+3)) - &
                          SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + &
                          FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - &
                          TWT* fh(i,j,k)            )/dZ )
  endif
  enddo
  enddo
  enddo
  endif
  return
  end subroutine lopsided_kodis
 #elif (ghost_width == 4)
 ! sixth order code
 ! Compute advection terms in right hand sides of field equations
--- a/AMSS_NCKU_source/makefile
+++ b/AMSS_NCKU_source/makefile
@@ -16,12 +16,6 @@ include makefile.inc
 .cu.o:
 	$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
 TwoPunctures.o: TwoPunctures.C
 	${CXX} $(CXXAPPFLAGS) -qopenmp -c $< -o $@
 TwoPunctureABE.o: TwoPunctureABE.C
 	${CXX} $(CXXAPPFLAGS) -qopenmp -c $< -o $@
 # Input files
 C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
           cgh.o bssn_class.o surface_integral.o ShellPatch.o\
@@ -40,7 +34,7 @@ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o
 F90FILES = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
 	   prolongrestrict_cell.o prolongrestrict_vertex.o\
-	   rungekutta4_rout.o bssn_rhs.o diff_new.o kodiss.o kodiss_sh.o\
+	   rungekutta4_rout.o bssn_rhs_opt.o bssn_rhs.o bssn_rhs_legacy.o diff_new.o kodiss.o kodiss_sh.o\
 	   lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
 	   shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
           getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\
@@ -102,7 +96,7 @@ ABEGPU: $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
 	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
 TwoPunctureABE: $(TwoPunctureFILES)
-	$(CLINKER) $(CXXAPPFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)
+	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(TwoPunctureFILES) $(LDLIBS)
 clean:
 	rm *.o ABE ABEGPU TwoPunctureABE make.log -f
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -7,24 +7,25 @@
 filein  = -I/usr/include/ -I${MKLROOT}/include
 ## Using sequential MKL (OpenMP disabled for better single-threaded performance)
-## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
+LDLIBS  = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -lifcore -limf -lmpi \
-LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl
+          -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core \
          -lpthread -lm -ldl
-## Aggressive optimization flags + PGO Phase 2 (profile-guided optimization)
+## Aggressive optimization flags:
-## -fprofile-instr-use: use collected profile data to guide optimization decisions
+## -O3: Maximum optimization
-##   (branch prediction, basic block layout, inlining, loop unrolling)
+## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible)
-PROFDATA     = ../../pgo_profile/default.profdata
+## -fp-model fast=2: Aggressive floating-point optimizations
-CXXAPPFLAGS  = -O3 -xHost -fp-model fast=2 -fma -ipo \
+## -fma: Enable fused multiply-add instructions
-               -fprofile-instr-use=$(PROFDATA) \
+## Note: OpenMP has been disabled (-qopenmp removed) due to performance issues
 CXXAPPFLAGS  = -O3 -xHost -fp-model fast=2 -fma \
               -Dfortran3 -Dnewc -I${MKLROOT}/include
-f90appflags  = -O3 -xHost -fp-model fast=2 -fma -ipo \
+f90appflags  = -O3 -xHost -fp-model fast=2 -fma \
-               -fprofile-instr-use=$(PROFDATA) \
+               -fpp -I${MKLROOT}/include
               -align array64byte -fpp -I${MKLROOT}/include
 f90          = ifx
 f77          = ifx
 CXX          = icpx
 CC           = icx
-CLINKER      = mpiicpx
+CLINKER      = mpiicpx 
 Cu = nvcc
 CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
--- a/AMSS_NCKU_source/surface_integral.C
+++ b/AMSS_NCKU_source/surface_integral.C
@@ -220,9 +220,16 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
    pox[2][n] = rex * nz_g[n];
  }
  double *shellf;
  shellf = new double[n_tot * InList];
  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry);
  int mp, Lp, Nmin, Nmax;
  mp = n_tot / cpusize;
  Lp = n_tot - cpusize * mp;
  if (Lp > myrank)
  {
    Nmin = myrank * mp + myrank;
@@ -234,11 +241,6 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
    Nmax = Nmin + mp - 1;
  }
  double *shellf;
  shellf = new double[n_tot * InList];
  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);
  //|~~~~~> Integrate the dot product of Dphi with the surface normal.
  double *RP_out, *IP_out;
@@ -361,17 +363,8 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
  }
  //|------+  Communicate and sum the results from each processor.
-  {
+  MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    double *RPIP_out = new double[2 * NN];
+  MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    double *RPIP = new double[2 * NN];
    memcpy(RPIP_out, RP_out, NN * sizeof(double));
    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    memcpy(RP, RPIP, NN * sizeof(double));
    memcpy(IP, RPIP + NN, NN * sizeof(double));
    delete[] RPIP_out;
    delete[] RPIP;
  }
  //|------= Free memory.
@@ -563,17 +556,8 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
  }
  //|------+  Communicate and sum the results from each processor.
-  {
+  MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, Comm_here);
-    double *RPIP_out = new double[2 * NN];
+  MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, Comm_here);
    double *RPIP = new double[2 * NN];
    memcpy(RPIP_out, RP_out, NN * sizeof(double));
    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, Comm_here);
    memcpy(RP, RPIP, NN * sizeof(double));
    memcpy(IP, RPIP + NN, NN * sizeof(double));
    delete[] RPIP_out;
    delete[] RPIP;
  }
  //|------= Free memory.
@@ -751,17 +735,8 @@ void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH, var *Rpsi4
  }
  //|------+  Communicate and sum the results from each processor.
-  {
+  MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    double *RPIP_out = new double[2 * NN];
+  MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    double *RPIP = new double[2 * NN];
    memcpy(RPIP_out, RP_out, NN * sizeof(double));
    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    memcpy(RP, RPIP, NN * sizeof(double));
    memcpy(IP, RPIP + NN, NN * sizeof(double));
    delete[] RPIP_out;
    delete[] RPIP;
  }
  //|------= Free memory.
@@ -1009,17 +984,8 @@ void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH,
  }
  //|------+  Communicate and sum the results from each processor.
-  {
+  MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    double *RPIP_out = new double[2 * NN];
+  MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    double *RPIP = new double[2 * NN];
    memcpy(RPIP_out, RP_out, NN * sizeof(double));
    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    memcpy(RP, RPIP, NN * sizeof(double));
    memcpy(IP, RPIP + NN, NN * sizeof(double));
    delete[] RPIP_out;
    delete[] RPIP;
  }
  //|------= Free memory.
@@ -1453,17 +1419,8 @@ void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH,
  }
  //|------+  Communicate and sum the results from each processor.
-  {
+  MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    double *RPIP_out = new double[2 * NN];
+  MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    double *RPIP = new double[2 * NN];
    memcpy(RPIP_out, RP_out, NN * sizeof(double));
    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    memcpy(RP, RPIP, NN * sizeof(double));
    memcpy(IP, RPIP + NN, NN * sizeof(double));
    delete[] RPIP_out;
    delete[] RPIP;
  }
  //|------= Free memory.
@@ -1897,17 +1854,8 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH,
  }
  //|------+  Communicate and sum the results from each processor.
-  {
+  MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    double *RPIP_out = new double[2 * NN];
+  MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    double *RPIP = new double[2 * NN];
    memcpy(RPIP_out, RP_out, NN * sizeof(double));
    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    memcpy(RP, RPIP, NN * sizeof(double));
    memcpy(IP, RPIP + NN, NN * sizeof(double));
    delete[] RPIP_out;
    delete[] RPIP;
  }
  //|------= Free memory.
@@ -2092,17 +2040,8 @@ void surface_integral::surf_Wave(double rex, int lev, NullShellPatch2 *GH, var *
  }
  //|------+  Communicate and sum the results from each processor.
-  {
+  MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    double *RPIP_out = new double[2 * NN];
+  MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    double *RPIP = new double[2 * NN];
    memcpy(RPIP_out, RP_out, NN * sizeof(double));
    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    memcpy(RP, RPIP, NN * sizeof(double));
    memcpy(IP, RPIP + NN, NN * sizeof(double));
    delete[] RPIP_out;
    delete[] RPIP;
  }
  //|------= Free memory.
@@ -2287,17 +2226,8 @@ void surface_integral::surf_Wave(double rex, int lev, NullShellPatch *GH, var *R
  }
  //|------+  Communicate and sum the results from each processor.
-  {
+  MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    double *RPIP_out = new double[2 * NN];
+  MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    double *RPIP = new double[2 * NN];
    memcpy(RPIP_out, RP_out, NN * sizeof(double));
    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    memcpy(RP, RPIP, NN * sizeof(double));
    memcpy(IP, RPIP + NN, NN * sizeof(double));
    delete[] RPIP_out;
    delete[] RPIP;
  }
  //|------= Free memory.
@@ -2384,9 +2314,25 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
    pox[2][n] = rex * nz_g[n];
  }
  double *shellf;
  shellf = new double[n_tot * InList];
  // we have assumed there is only one box on this level,
  // so we do not need loop boxes
  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry);
  double Mass_out = 0;
  double ang_outx, ang_outy, ang_outz;
  double p_outx, p_outy, p_outz;
  ang_outx = ang_outy = ang_outz = 0.0;
  p_outx = p_outy = p_outz = 0.0;
  const double f1o8 = 0.125;
  int mp, Lp, Nmin, Nmax;
  mp = n_tot / cpusize;
  Lp = n_tot - cpusize * mp;
  if (Lp > myrank)
  {
    Nmin = myrank * mp + myrank;
@@ -2398,20 +2344,6 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
    Nmax = Nmin + mp - 1;
  }
  double *shellf;
  shellf = new double[n_tot * InList];
  // we have assumed there is only one box on this level,
  // so we do not need loop boxes
  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);
  double Mass_out = 0;
  double ang_outx, ang_outy, ang_outz;
  double p_outx, p_outy, p_outz;
  ang_outx = ang_outy = ang_outz = 0.0;
  p_outx = p_outy = p_outz = 0.0;
  const double f1o8 = 0.125;
  double Chi, Psi;
  double Gxx, Gxy, Gxz, Gyy, Gyz, Gzz;
  double gupxx, gupxy, gupxz, gupyy, gupyz, gupzz;
@@ -2532,13 +2464,15 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
    }
  }
-  {
+  MPI_Allreduce(&Mass_out, &mass, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    double scalar_out[7] = {Mass_out, ang_outx, ang_outy, ang_outz, p_outx, p_outy, p_outz};
+
-    double scalar_in[7];
+  MPI_Allreduce(&ang_outx, &sx, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    MPI_Allreduce(scalar_out, scalar_in, 7, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(&ang_outy, &sy, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    mass = scalar_in[0]; sx = scalar_in[1]; sy = scalar_in[2]; sz = scalar_in[3];
+  MPI_Allreduce(&ang_outz, &sz, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    px = scalar_in[4]; py = scalar_in[5]; pz = scalar_in[6];
+
-  }
+  MPI_Allreduce(&p_outx, &px, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  MPI_Allreduce(&p_outy, &py, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  MPI_Allreduce(&p_outz, &pz, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
 #ifdef GaussInt
  mass = mass * rex * rex * dphi * factor;
@@ -2801,13 +2735,15 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
    }
  }
-  {
+  MPI_Allreduce(&Mass_out, &mass, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
-    double scalar_out[7] = {Mass_out, ang_outx, ang_outy, ang_outz, p_outx, p_outy, p_outz};
+
-    double scalar_in[7];
+  MPI_Allreduce(&ang_outx, &sx, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
-    MPI_Allreduce(scalar_out, scalar_in, 7, MPI_DOUBLE, MPI_SUM, Comm_here);
+  MPI_Allreduce(&ang_outy, &sy, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
-    mass = scalar_in[0]; sx = scalar_in[1]; sy = scalar_in[2]; sz = scalar_in[3];
+  MPI_Allreduce(&ang_outz, &sz, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
-    px = scalar_in[4]; py = scalar_in[5]; pz = scalar_in[6];
+
-  }
+  MPI_Allreduce(&p_outx, &px, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
  MPI_Allreduce(&p_outy, &py, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
  MPI_Allreduce(&p_outz, &pz, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
 #ifdef GaussInt
  mass = mass * rex * rex * dphi * factor;
@@ -3084,13 +3020,15 @@ void surface_integral::surf_MassPAng(double rex, int lev, ShellPatch *GH, var *c
    }
  }
-  {
+  MPI_Allreduce(&Mass_out, &mass, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    double scalar_out[7] = {Mass_out, ang_outx, ang_outy, ang_outz, p_outx, p_outy, p_outz};
+
-    double scalar_in[7];
+  MPI_Allreduce(&ang_outx, &sx, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    MPI_Allreduce(scalar_out, scalar_in, 7, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(&ang_outy, &sy, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    mass = scalar_in[0]; sx = scalar_in[1]; sy = scalar_in[2]; sz = scalar_in[3];
+  MPI_Allreduce(&ang_outz, &sz, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    px = scalar_in[4]; py = scalar_in[5]; pz = scalar_in[6];
+
-  }
+  MPI_Allreduce(&p_outx, &px, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  MPI_Allreduce(&p_outy, &py, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  MPI_Allreduce(&p_outz, &pz, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
 #ifdef GaussInt
  mass = mass * rex * rex * dphi * factor;
@@ -3669,17 +3607,8 @@ void surface_integral::surf_Wave(double rex, cgh *GH, ShellPatch *SH,
  }
  //|------+  Communicate and sum the results from each processor.
-  {
+  MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    double *RPIP_out = new double[2 * NN];
+  MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    double *RPIP = new double[2 * NN];
    memcpy(RPIP_out, RP_out, NN * sizeof(double));
    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    memcpy(RP, RPIP, NN * sizeof(double));
    memcpy(IP, RPIP + NN, NN * sizeof(double));
    delete[] RPIP_out;
    delete[] RPIP;
  }
  //|------= Free memory.
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -10,18 +10,17 @@
 import AMSS_NCKU_Input as input_data
 import subprocess
-import time
+
 ## CPU core binding configuration using taskset
 ## taskset ensures all child processes inherit the CPU affinity mask
 ## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111)
 ## Format: taskset -c 4-55,60-111 ensures processes only run on these cores
-#NUMACTL_CPU_BIND = "taskset -c 0-111"
+NUMACTL_CPU_BIND = "taskset -c 4-55,60-111"
 NUMACTL_CPU_BIND = "taskset -c 16-47,64-95"
 ## Build parallelism configuration
 ## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores
 ## Set make -j to utilize available cores for faster builds
-BUILD_JOBS = 96
+BUILD_JOBS = 104
 ##################################################################
@@ -118,7 +117,6 @@ def run_ABE():
    if (input_data.GPU_Calculation == "no"):
        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
        #mpi_command         = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
        mpi_command_outfile = "ABE_out.log"
    elif (input_data.GPU_Calculation == "yes"):
        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
@@ -154,14 +152,13 @@ def run_ABE():
 ## Run the AMSS-NCKU TwoPuncture program TwoPunctureABE
 def run_TwoPunctureABE():
-    tp_time1=time.time()
+
    print(                                                          )
    print( " Running the AMSS-NCKU executable file TwoPunctureABE " ) 
    print(                                                          )
    ## Define the command to run
-    #TwoPuncture_command         = NUMACTL_CPU_BIND + " ./TwoPunctureABE"
+    TwoPuncture_command         = NUMACTL_CPU_BIND + " ./TwoPunctureABE"
    TwoPuncture_command         = " ./TwoPunctureABE"
    TwoPuncture_command_outfile = "TwoPunctureABE_out.log"
    ## Execute the command with subprocess.Popen and stream output
@@ -182,9 +179,7 @@ def run_TwoPunctureABE():
    print(                                               )
    print( " The TwoPunctureABE simulation is finished " ) 
    print(                                               )
-    tp_time2=time.time()
+    
    et=tp_time2-tp_time1
    print(f"Used time: {et}")
    return
 ##################################################################
--- a/parallel_plot_helper.py
+++ b/parallel_plot_helper.py
@@ -1,29 +0,0 @@
 import multiprocessing
 def run_plot_task(task):
    """Execute a single plotting task.
    Parameters
    ----------
    task : tuple
        A tuple of (function, args_tuple) where function is a callable
        plotting function and args_tuple contains its arguments.
    """
    func, args = task
    return func(*args)
 def run_plot_tasks_parallel(plot_tasks):
    """Execute a list of independent plotting tasks in parallel.
    Uses the 'fork' context to create worker processes so that the main
    script is NOT re-imported/re-executed in child processes.
    Parameters
    ----------
    plot_tasks : list of tuples
        Each element is (function, args_tuple).
    """
    ctx = multiprocessing.get_context('fork')
    with ctx.Pool() as pool:
        pool.map(run_plot_task, plot_tasks)
--- a/pgo_profile/PGO_Profile_Analysis.md
+++ b/pgo_profile/PGO_Profile_Analysis.md
@@ -1,97 +0,0 @@
 # AMSS-NCKU PGO Profile Analysis Report
 ## 1. Profiling Environment
 | Item | Value |
 |------|-------|
 | Compiler | Intel oneAPI DPC++/C++ 2025.3.0 (icpx/ifx) |
 | Instrumentation Flag | `-fprofile-instr-generate` |
 | Optimization Level (instrumented) | `-O2 -xHost -fma` |
 | MPI Processes | 1 (single process to avoid MPI+instrumentation deadlock) |
 | Profile File | `default_9725750769337483397_0.profraw` (327 KB) |
 | Merged Profile | `default.profdata` (394 KB) |
 | llvm-profdata | `/home/intel/oneapi/compiler/2025.3/bin/compiler/llvm-profdata` |
 ## 2. Reduced Simulation Parameters (for profiling run)
 | Parameter | Production Value | Profiling Value |
 |-----------|-----------------|-----------------|
 | MPI_processes | 64 | 1 |
 | grid_level | 9 | 4 |
 | static_grid_level | 5 | 3 |
 | static_grid_number | 96 | 24 |
 | moving_grid_number | 48 | 16 |
 | largest_box_xyz_max | 320^3 | 160^3 |
 | Final_Evolution_Time | 1000.0 | 10.0 |
 | Evolution_Step_Number | 10,000,000 | 1,000 |
 | Detector_Number | 12 | 2 |
 ## 3. Profile Summary
 | Metric | Value |
 |--------|-------|
 | Total instrumented functions | 1,392 |
 | Functions with non-zero counts | 117 (8.4%) |
 | Functions with zero counts | 1,275 (91.6%) |
 | Maximum function entry count | 386,459,248 |
 | Maximum internal block count | 370,477,680 |
 | Total block count | 4,198,023,118 |
 ## 4. Top 20 Hotspot Functions
 | Rank | Total Count | Max Block Count | Function | Category |
 |------|------------|-----------------|----------|----------|
 | 1 | 1,241,601,732 | 370,477,680 | `polint_` | Interpolation |
 | 2 | 755,994,435 | 230,156,640 | `prolong3_` | Grid prolongation |
 | 3 | 667,964,095 | 3,697,792 | `compute_rhs_bssn_` | BSSN RHS evolution |
 | 4 | 539,736,051 | 386,459,248 | `symmetry_bd_` | Symmetry boundary |
 | 5 | 277,310,808 | 53,170,728 | `lopsided_` | Lopsided FD stencil |
 | 6 | 155,534,488 | 94,535,040 | `decide3d_` | 3D grid decision |
 | 7 | 119,267,712 | 19,266,048 | `rungekutta4_rout_` | RK4 time integrator |
 | 8 | 91,574,616 | 48,824,160 | `kodis_` | Kreiss-Oliger dissipation |
 | 9 | 67,555,389 | 43,243,680 | `fderivs_` | Finite differences |
 | 10 | 55,296,000 | 42,246,144 | `misc::fact(int)` | Factorial utility |
 | 11 | 43,191,071 | 27,663,328 | `fdderivs_` | 2nd-order FD derivatives |
 | 12 | 36,233,965 | 22,429,440 | `restrict3_` | Grid restriction |
 | 13 | 24,698,512 | 17,231,520 | `polin3_` | Polynomial interpolation |
 | 14 | 22,962,942 | 20,968,768 | `copy_` | Data copy |
 | 15 | 20,135,696 | 17,259,168 | `Ansorg::barycentric(...)` | Spectral interpolation |
 | 16 | 14,650,224 | 7,224,768 | `Ansorg::barycentric_omega(...)` | Spectral weights |
 | 17 | 13,242,296 | 2,871,920 | `global_interp_` | Global interpolation |
 | 18 | 12,672,000 | 7,734,528 | `sommerfeld_rout_` | Sommerfeld boundary |
 | 19 | 6,872,832 | 1,880,064 | `sommerfeld_routbam_` | Sommerfeld boundary (BAM) |
 | 20 | 5,709,900 | 2,809,632 | `l2normhelper_` | L2 norm computation |
 ## 5. Hotspot Category Breakdown
 Top 20 functions account for ~99% of total execution counts:
 | Category | Functions | Combined Count | Share |
 |----------|-----------|---------------|-------|
 | Interpolation / Prolongation / Restriction | polint_, prolong3_, restrict3_, polin3_, global_interp_, Ansorg::* | ~2,093M | ~50% |
 | BSSN RHS + FD stencils | compute_rhs_bssn_, lopsided_, fderivs_, fdderivs_ | ~1,056M | ~25% |
 | Boundary conditions | symmetry_bd_, sommerfeld_rout_, sommerfeld_routbam_ | ~559M | ~13% |
 | Time integration | rungekutta4_rout_ | ~119M | ~3% |
 | Dissipation | kodis_ | ~92M | ~2% |
 | Utilities | misc::fact, decide3d_, copy_, l2normhelper_ | ~256M | ~6% |
 ## 6. Conclusions
 1. **Profile data is valid**: 1,392 functions instrumented, 117 exercised with ~4.2 billion total counts.
 2. **Hotspot concentration is high**: Top 5 functions alone account for ~83% of all counts (the top 4 alone for ~76%), which is ideal for PGO — the compiler has strong branch/layout optimization targets.
 3. **Fortran numerical kernels dominate**: `polint_`, `prolong3_`, `compute_rhs_bssn_`, `symmetry_bd_`, `lopsided_` are all Fortran routines in the inner evolution loop. PGO will optimize their branch prediction and basic block layout.
 4. **91.6% of functions have zero counts**: These are code paths for unused features (GPU, BSSN-EScalar, BSSN-EM, Z4C, etc.). PGO will deprioritize them, improving instruction cache utilization.
 5. **Profile is representative**: Despite the reduced grid size, the code path coverage matches production — the same kernels (RHS, prolongation, restriction, boundary) are exercised. PGO branch probabilities from this profile will transfer well to full-scale runs.
 ## 7. PGO Phase 2 Usage
 To apply the profile, use the following flags in `makefile.inc`:
 ```makefile
 CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
              -fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \
              -Dfortran3 -Dnewc -I${MKLROOT}/include
 f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
              -fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \
              -align array64byte -fpp -I${MKLROOT}/include
 ```
--- a/pgo_profile/default.profdata
+++ b/pgo_profile/default.profdata
--- a/pgo_profile/default.profdata.backup
+++ b/pgo_profile/default.profdata.backup
--- a/pgo_profile/default_15874826282416242821_0_58277.profraw
+++ b/pgo_profile/default_15874826282416242821_0_58277.profraw
--- a/pgo_profile/default_9725750769337483397_0.profraw
+++ b/pgo_profile/default_9725750769337483397_0.profraw
--- a/plot_GW_strain_amplitude_xiaoqu.py
+++ b/plot_GW_strain_amplitude_xiaoqu.py
@@ -11,8 +11,6 @@
 import numpy                               ## numpy for array operations
 import scipy                               ## scipy for interpolation and signal processing
 import math
 import matplotlib
 matplotlib.use('Agg')                      ## use non-interactive backend for multiprocessing safety
 import matplotlib.pyplot    as     plt     ## matplotlib for plotting
 import os                                  ## os for system/file operations
--- a/plot_binary_data.py
+++ b/plot_binary_data.py
@@ -8,23 +8,16 @@
 ##
 #################################################
 ## Restrict OpenMP to one thread per process so that running
 ## many workers in parallel does not create an O(workers * BLAS_threads)
 ## thread explosion.  The variable MUST be set before numpy/scipy
 ## are imported, because the BLAS library reads them only at load time.
 import os
 os.environ.setdefault("OMP_NUM_THREADS",        "1")
 import numpy
 import scipy
 import matplotlib
 matplotlib.use('Agg')                      ## use non-interactive backend for multiprocessing safety
 import matplotlib.pyplot    as     plt
 from   matplotlib.colors    import LogNorm
 from   mpl_toolkits.mplot3d import Axes3D
 ## import torch
 import AMSS_NCKU_Input      as input_data
 import os
 #########################################################################################
@@ -199,19 +192,3 @@ def get_data_xy( Rmin, Rmax, n, data0, time, figure_title, figure_outdir ):
 ####################################################################################
 ####################################################################################
 ## Allow this module to be run as a standalone script so that each
 ## binary-data plot can be executed in a fresh subprocess whose BLAS
 ## environment variables (set above) take effect before numpy loads.
 ##
 ## Usage:  python3 plot_binary_data.py <filename> <binary_outdir> <figure_outdir>
 ####################################################################################
 if __name__ == '__main__':
    import sys
    if len(sys.argv) != 4:
        print(f"Usage: {sys.argv[0]} <filename> <binary_outdir> <figure_outdir>")
        sys.exit(1)
    plot_binary_data(sys.argv[1], sys.argv[2], sys.argv[3])
--- a/plot_xiaoqu.py
+++ b/plot_xiaoqu.py
@@ -8,8 +8,6 @@
 #################################################
 import numpy                               ## numpy for array operations
 import matplotlib
 matplotlib.use('Agg')                      ## use non-interactive backend for multiprocessing safety
 import matplotlib.pyplot    as     plt     ## matplotlib for plotting
 from   mpl_toolkits.mplot3d import Axes3D  ## needed for 3D plots
 import glob
@@ -17,9 +15,6 @@ import os                                  ## operating system utilities
 import plot_binary_data
 import AMSS_NCKU_Input as input_data
 import subprocess
 import sys
 import multiprocessing
 # plt.rcParams['text.usetex'] = True  ## enable LaTeX fonts in plots
@@ -55,40 +50,10 @@ def generate_binary_data_plot( binary_outdir, figure_outdir ):
        file_list.append(x)
        print(x)
-    ## Plot each file in parallel using subprocesses.
+    ## Plot each file in the list
    ## Each subprocess is a fresh Python process where the BLAS thread-count
    ## environment variables (set at the top of plot_binary_data.py) take
    ## effect before numpy is imported.  This avoids the thread explosion
    ## that occurs when multiprocessing.Pool with 'fork' context inherits
    ## already-initialized multi-threaded BLAS from the parent.
    script = os.path.join( os.path.dirname(__file__), "plot_binary_data.py" )
    max_workers = min( multiprocessing.cpu_count(), len(file_list) ) if file_list else 0
    running = []
    failed  = []
    for filename in file_list:
        print(filename)
-        proc = subprocess.Popen(
+        plot_binary_data.plot_binary_data(filename, binary_outdir, figure_outdir)
            [sys.executable, script, filename, binary_outdir, figure_outdir],
        )
        running.append( (proc, filename) )
        ## Keep at most max_workers subprocesses active at a time
        if len(running) >= max_workers:
            p, fn = running.pop(0)
            p.wait()
            if p.returncode != 0:
                failed.append(fn)
    ## Wait for all remaining subprocesses to finish
    for p, fn in running:
        p.wait()
        if p.returncode != 0:
            failed.append(fn)
    if failed:
        print( " WARNING: the following binary data plots failed:" )
        for fn in failed:
            print( "   ", fn )
    print(                        )
    print( " Binary Data Plot Has been Finished " )
Author	SHA1	Message	Date
CGH0S7	ed89bc029b	Fix potential division by zero in reta_val calculation and enable NaN checks Added a safety check for the denominator in the reta_val calculation to prevent division by zero when chi approaches zero (e.g., at far-field boundaries). Also enabled DEBUG_NAN_CHECK macro to catch invalid inputs early. Initialized output arrays to zero to prevent uninitialized memory access.	2026-01-19 20:29:48 +08:00
CGH0S7	19274e93d1	Fix boundary handling in bssn_rhs_opt.f90 to prevent NaNs Refactored calc_derivs and calc_dderivs to include correct boundary handling logic matching the legacy code. Implemented fallback to 2nd order derivatives when near boundaries where 4th order stencils cannot be used. Added logic to initialize output arrays to zero to avoid uninitialized memory access.	2026-01-19 20:03:22 +08:00
CGH0S7	ae1a474cca	Fix compilation errors and complete logic in BSSN RHS optimization	2026-01-19 19:22:52 +08:00
CGH0S7	cbb8fb3a87	patched last commit	2026-01-19 17:14:28 +08:00
CGH0S7	4472d89a9f	Optimize bssn_rhs calculation with cache blocking and vectorization - Implemented cache blocking (BLK=8) in bssn_rhs_opt.f90 to improve L1/L2 cache hit rate. - Introduced bssn_rhs_opt.f90 module with vectorized derivative and physics kernels. - Renamed original implementation to bssn_rhs_legacy.f90 for fallback. - Updated bssn_rhs.f90 to act as a dispatcher, using the optimized path for ghost_width=3. - Updated makefile to include new source files. - Added DEBUG_NAN_CHECK macro to optionally disable NaN checks in production.	2026-01-19 16:39:24 +08:00