Compare commits

yx-mpi ... chb-replac

30 Commits

| Author | SHA1 | Date |
|---|---|---|
|  | f7ada421cf |  |
|  | fb9f153662 |  |
|  | f5a63f1e42 |  |
|  | 284ab80baf |  |
|  | 09b937c022 |  |
|  | 8a9c775705 |  |
|  | d942122043 |  |
|  | a5c713a7e0 |  |
|  | 9e6b25163a |  |
|  | efc8bf29ea |  |
|  | ccf6adaf75 |  |
|  | e2bc472845 |  |
|  | 82339f5282 |  |
|  | 94f38c57f9 |  |
|  | 85d1e8de87 |  |
|  | 5b7e05cd32 |  |
|  | 85afe00fc5 |  |
|  | 5c1790277b |  |
|  | e09ae438a2 |  |
|  | d06d5b4db8 |  |
|  | 50e2a845f8 |  |
|  | 738498cb28 |  |
|  | 42b9cf1ad9 |  |
|  | e9d321fd00 |  |
|  | ed1d86ade9 |  |
|  | 471baa5065 |  |
|  | 4bb6c03013 |  |
|  | b8e41b2b39 |  |
|  | 133e4f13a2 |  |
|  | 914c4f4791 |  |
@@ -8,6 +8,14 @@
##
##################################################################

## Guard against re-execution by multiprocessing child processes.
## Without this, using 'spawn' or 'forkserver' context would cause every
## worker to re-run the entire script, spawning exponentially more
## workers (fork bomb).
if __name__ != '__main__':
    import sys as _sys
    _sys.exit(0)


##################################################################

@@ -424,26 +432,31 @@ print(

import plot_xiaoqu
import plot_GW_strain_amplitude_xiaoqu
from parallel_plot_helper import run_plot_tasks_parallel

plot_tasks = []

## Plot black hole trajectory
plot_xiaoqu.generate_puncture_orbit_plot( binary_results_directory, figure_directory )
plot_xiaoqu.generate_puncture_orbit_plot3D( binary_results_directory, figure_directory )
plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot, (binary_results_directory, figure_directory) ) )
plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot3D, (binary_results_directory, figure_directory) ) )

## Plot black hole separation vs. time
plot_xiaoqu.generate_puncture_distence_plot( binary_results_directory, figure_directory )
plot_tasks.append( ( plot_xiaoqu.generate_puncture_distence_plot, (binary_results_directory, figure_directory) ) )

## Plot gravitational waveforms (psi4 and strain amplitude)
for i in range(input_data.Detector_Number):
    plot_xiaoqu.generate_gravitational_wave_psi4_plot( binary_results_directory, figure_directory, i )
    plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot( binary_results_directory, figure_directory, i )
    plot_tasks.append( ( plot_xiaoqu.generate_gravitational_wave_psi4_plot, (binary_results_directory, figure_directory, i) ) )
    plot_tasks.append( ( plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot, (binary_results_directory, figure_directory, i) ) )

## Plot ADM mass evolution
for i in range(input_data.Detector_Number):
    plot_xiaoqu.generate_ADMmass_plot( binary_results_directory, figure_directory, i )
    plot_tasks.append( ( plot_xiaoqu.generate_ADMmass_plot, (binary_results_directory, figure_directory, i) ) )

## Plot Hamiltonian constraint violation over time
for i in range(input_data.grid_level):
    plot_xiaoqu.generate_constraint_check_plot( binary_results_directory, figure_directory, i )
    plot_tasks.append( ( plot_xiaoqu.generate_constraint_check_plot, (binary_results_directory, figure_directory, i) ) )

run_plot_tasks_parallel(plot_tasks)

## Plot stored binary data
plot_xiaoqu.generate_binary_data_plot( binary_results_directory, figure_directory )
@@ -341,8 +341,9 @@ void Patch::Interp_Points(MyList<var> *VarList,
                          double *Shellf, int Symmetry)
{
  // NOTE: we do not Synchronize variables here, make sure of that before calling this routine
  int myrank;
  int myrank, nprocs;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  int ordn = 2 * ghost_width;
  MyList<var> *varl;
@@ -354,24 +355,18 @@ void Patch::Interp_Points(MyList<var> *VarList,
    varl = varl->next;
  }

  double *shellf;
  shellf = new double[NN * num_var];
  memset(shellf, 0, sizeof(double) * NN * num_var);
  memset(Shellf, 0, sizeof(double) * NN * num_var);

  // we use weight to monitor code, later some day we can move it for optimization
  int *weight;
  weight = new int[NN];
  memset(weight, 0, sizeof(int) * NN);

  double *DH, *llb, *uub;
  DH = new double[dim];
  // owner_rank[j] records which MPI rank owns point j
  // All ranks traverse the same block list so they all agree on ownership
  int *owner_rank;
  owner_rank = new int[NN];
  for (int j = 0; j < NN; j++)
    owner_rank[j] = -1;

  double DH[dim], llb[dim], uub[dim];
  for (int i = 0; i < dim; i++)
  {
    DH[i] = getdX(i);
  }
  llb = new double[dim];
  uub = new double[dim];

  for (int j = 0; j < NN; j++) // run along points
  {
@@ -403,12 +398,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
      bool flag = true;
      for (int i = 0; i < dim; i++)
      {
        // NOTE: our dividing structure is (exclude ghost)
        // -1 0
        // 1 2
        // so (0,1) does not belong to any part for vertex structure
        // here we put (0,0.5) to left part and (0.5,1) to right part
        // BUT for cell structure the bbox is (-1.5,0.5) and (0.5,2.5), there is no missing region at all
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
@@ -433,6 +422,7 @@ void Patch::Interp_Points(MyList<var> *VarList,
      if (flag)
      {
        notfind = false;
        owner_rank[j] = BP->rank;
        if (myrank == BP->rank)
        {
          //---> interpolation
@@ -440,14 +430,11 @@ void Patch::Interp_Points(MyList<var> *VarList,
          int k = 0;
          while (varl) // run along variables
          {
            // shellf[j*num_var+k] = Parallel::global_interp(dim,BP->shape,BP->X,BP->fgfs[varl->data->sgfn],
            //                                               pox,ordn,varl->data->SoA,Symmetry);
            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[j * num_var + k],
            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
            varl = varl->next;
            k++;
          }
          weight[j] = 1;
        }
      }
      if (Bp == ble)
@@ -456,61 +443,125 @@ void Patch::Interp_Points(MyList<var> *VarList,
    }
  }

  MPI_Allreduce(shellf, Shellf, NN * num_var, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  int *Weight;
  Weight = new int[NN];
  MPI_Allreduce(weight, Weight, NN, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

  // misc::tillherecheck("print me");

  for (int i = 0; i < NN; i++)
  // Replace MPI_Allreduce with per-owner MPI_Bcast:
  // Group consecutive points by owner rank and broadcast each group.
  // Since each point's data is non-zero only on the owner rank,
  // Bcast from owner is equivalent to Allreduce(MPI_SUM) but much cheaper.
  {
    if (Weight[i] > 1)
    int j = 0;
    while (j < NN)
    {
      int cur_owner = owner_rank[j];
      if (cur_owner < 0)
      {
        if (myrank == 0)
          cout << "WARNING: Patch::Interp_Points meets multiple weight" << endl;
        for (int j = 0; j < num_var; j++)
          Shellf[j + i * num_var] = Shellf[j + i * num_var] / Weight[i];
      }
      else if (Weight[i] == 0 && myrank == 0)
      {
        cout << "ERROR: Patch::Interp_Points fails to find point (";
        for (int j = 0; j < dim; j++)
        for (int d = 0; d < dim; d++)
        {
          cout << XX[j][i];
          if (j < dim - 1)
          cout << XX[d][j];
          if (d < dim - 1)
            cout << ",";
          else
            cout << ")";
        }
        cout << " on Patch (";
        for (int j = 0; j < dim; j++)
        for (int d = 0; d < dim; d++)
        {
          cout << bbox[j] << "+" << lli[j] * getdX(j);
          if (j < dim - 1)
          cout << bbox[d] << "+" << lli[d] * DH[d];
          if (d < dim - 1)
            cout << ",";
          else
            cout << ")--";
        }
        cout << "(";
        for (int j = 0; j < dim; j++)
        for (int d = 0; d < dim; d++)
        {
          cout << bbox[dim + j] << "-" << uui[j] * getdX(j);
          if (j < dim - 1)
          cout << bbox[dim + d] << "-" << uui[d] * DH[d];
          if (d < dim - 1)
            cout << ",";
          else
            cout << ")" << endl;
        }
#if 0
        checkBlock();
#else
        cout << "splited domains:" << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
        j++;
        continue;
      }
      // Find contiguous run of points with the same owner
      int jstart = j;
      while (j < NN && owner_rank[j] == cur_owner)
        j++;
      int count = (j - jstart) * num_var;
      MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner, MPI_COMM_WORLD);
    }
  }

  delete[] owner_rank;
}
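The comment in the hunk above claims that broadcasting each point's values from its owning rank reproduces the `MPI_Allreduce(MPI_SUM)` result, because non-owners contribute only zeros. Below is a minimal standalone sketch checking that equivalence; it is not repository code, and the owner layout and fill values are invented for illustration.

```cpp
// Standalone sketch: per-owner MPI_Bcast over contiguous runs of points
// reproduces MPI_Allreduce(MPI_SUM) when only the owner holds non-zero data.
#include <mpi.h>
#include <cmath>
#include <cstdio>
#include <vector>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int myrank, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    const int NN = 64;                 // number of interpolation points (illustrative)
    std::vector<int> owner(NN);        // like owner_rank[]: identical on every rank
    for (int j = 0; j < NN; j++)
        owner[j] = (j * nprocs) / NN;  // contiguous blocks of same-owner points

    // Each rank fills only the points it owns; all other entries stay zero.
    std::vector<double> local(NN, 0.0);
    for (int j = 0; j < NN; j++)
        if (owner[j] == myrank)
            local[j] = 100.0 * myrank + j;   // stand-in for f_global_interp output

    // Method A: the original all-to-all reduction.
    std::vector<double> viaAllreduce(NN);
    MPI_Allreduce(local.data(), viaAllreduce.data(), NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

    // Method B: one MPI_Bcast per contiguous run of same-owner points.
    std::vector<double> viaBcast(local);
    int j = 0;
    while (j < NN)
    {
        int cur = owner[j], jstart = j;
        while (j < NN && owner[j] == cur)
            j++;
        MPI_Bcast(viaBcast.data() + jstart, j - jstart, MPI_DOUBLE, cur, MPI_COMM_WORLD);
    }

    for (int i = 0; i < NN; i++)
        if (std::fabs(viaAllreduce[i] - viaBcast[i]) > 1e-14)
            MPI_Abort(MPI_COMM_WORLD, 1);
    if (myrank == 0)
        printf("per-owner Bcast matches Allreduce(SUM)\n");
    MPI_Finalize();
    return 0;
}
```

The saving is that the reduction tree no longer moves `nprocs` mostly-zero contributions per point; each broadcast touches only the run of points it actually carries.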
void Patch::Interp_Points(MyList<var> *VarList,
                          int NN, double **XX,
                          double *Shellf, int Symmetry,
                          int Nmin_consumer, int Nmax_consumer)
{
  // Targeted point-to-point overload: each owner sends each point only to
  // the one rank that needs it for integration (consumer), reducing
  // communication volume by ~nprocs times compared to the Bcast version.
  int myrank, nprocs;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  int ordn = 2 * ghost_width;
  MyList<var> *varl;
  int num_var = 0;
  varl = VarList;
  while (varl)
  {
    num_var++;
    varl = varl->next;
  }

  memset(Shellf, 0, sizeof(double) * NN * num_var);

  // owner_rank[j] records which MPI rank owns point j
  int *owner_rank;
  owner_rank = new int[NN];
  for (int j = 0; j < NN; j++)
    owner_rank[j] = -1;

  double DH[dim], llb[dim], uub[dim];
  for (int i = 0; i < dim; i++)
    DH[i] = getdX(i);

  // --- Interpolation phase (identical to original) ---
  for (int j = 0; j < NN; j++)
  {
    double pox[dim];
    for (int i = 0; i < dim; i++)
    {
      pox[i] = XX[i][j];
      if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
      {
        cout << "Patch::Interp_Points: point (";
        for (int k = 0; k < dim; k++)
        {
          cout << XX[k][j];
          if (k < dim - 1)
            cout << ",";
          else
            cout << ") is out of current Patch." << endl;
        }
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }

    MyList<Block> *Bp = blb;
    while (Bp)
    bool notfind = true;
    while (notfind && Bp)
    {
      Block *BP = Bp->data;

      bool flag = true;
      for (int i = 0; i < dim; i++)
      {
#ifdef Vertex
@@ -527,32 +578,192 @@ void Patch::Interp_Points(MyList<var> *VarList,
#error Not define Vertex nor Cell
#endif
#endif
      }
      cout << "(";
      for (int j = 0; j < dim; j++)
        if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
        {
          cout << llb[j] << ":" << uub[j];
          if (j < dim - 1)
            cout << ",";
          else
            cout << ")" << endl;
          flag = false;
          break;
        }
      }

      if (flag)
      {
        notfind = false;
        owner_rank[j] = BP->rank;
        if (myrank == BP->rank)
        {
          varl = VarList;
          int k = 0;
          while (varl)
          {
            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
            varl = varl->next;
            k++;
          }
        }
      }
      if (Bp == ble)
        break;
      Bp = Bp->next;
    }
  }
#endif

  // --- Error check for unfound points ---
  for (int j = 0; j < NN; j++)
  {
    if (owner_rank[j] < 0 && myrank == 0)
    {
      cout << "ERROR: Patch::Interp_Points fails to find point (";
      for (int d = 0; d < dim; d++)
      {
        cout << XX[d][j];
        if (d < dim - 1)
          cout << ",";
        else
          cout << ")";
      }
      cout << " on Patch (";
      for (int d = 0; d < dim; d++)
      {
        cout << bbox[d] << "+" << lli[d] * DH[d];
        if (d < dim - 1)
          cout << ",";
        else
          cout << ")--";
      }
      cout << "(";
      for (int d = 0; d < dim; d++)
      {
        cout << bbox[dim + d] << "-" << uui[d] * DH[d];
        if (d < dim - 1)
          cout << ",";
        else
          cout << ")" << endl;
      }
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }

  delete[] shellf;
  delete[] weight;
  delete[] Weight;
  delete[] DH;
  delete[] llb;
  delete[] uub;
  // --- Targeted point-to-point communication phase ---
  // Compute consumer_rank[j] using the same deterministic formula as surface_integral
  int *consumer_rank = new int[NN];
  {
    int mp = NN / nprocs;
    int Lp = NN - nprocs * mp;
    for (int j = 0; j < NN; j++)
    {
      if (j < Lp * (mp + 1))
        consumer_rank[j] = j / (mp + 1);
      else
        consumer_rank[j] = Lp + (j - Lp * (mp + 1)) / mp;
    }
  }

  // Count sends and recvs per rank
  int *send_count = new int[nprocs];
  int *recv_count = new int[nprocs];
  memset(send_count, 0, sizeof(int) * nprocs);
  memset(recv_count, 0, sizeof(int) * nprocs);

  for (int j = 0; j < NN; j++)
  {
    int own = owner_rank[j];
    int con = consumer_rank[j];
    if (own == con)
      continue; // local — no communication needed
    if (own == myrank)
      send_count[con]++;
    if (con == myrank)
      recv_count[own]++;
  }

  // Build send buffers: for each destination rank, pack (index, data) pairs
  // Each entry: 1 int (point index j) + num_var doubles
  int total_send = 0, total_recv = 0;
  int *send_offset = new int[nprocs];
  int *recv_offset = new int[nprocs];
  for (int r = 0; r < nprocs; r++)
  {
    send_offset[r] = total_send;
    total_send += send_count[r];
    recv_offset[r] = total_recv;
    total_recv += recv_count[r];
  }

  // Pack send buffers: each message contains (j, data[0..num_var-1]) per point
  int stride = 1 + num_var; // 1 double for index + num_var doubles for data
  double *sendbuf = new double[total_send * stride];
  double *recvbuf = new double[total_recv * stride];

  // Temporary counters for packing
  int *pack_pos = new int[nprocs];
  memset(pack_pos, 0, sizeof(int) * nprocs);

  for (int j = 0; j < NN; j++)
  {
    int own = owner_rank[j];
    int con = consumer_rank[j];
    if (own != myrank || con == myrank)
      continue;
    int pos = (send_offset[con] + pack_pos[con]) * stride;
    sendbuf[pos] = (double)j; // point index
    for (int v = 0; v < num_var; v++)
      sendbuf[pos + 1 + v] = Shellf[j * num_var + v];
    pack_pos[con]++;
  }

  // Post non-blocking recvs and sends
  int n_req = 0;
  for (int r = 0; r < nprocs; r++)
  {
    if (recv_count[r] > 0) n_req++;
    if (send_count[r] > 0) n_req++;
  }

  MPI_Request *reqs = new MPI_Request[n_req];
  int req_idx = 0;

  for (int r = 0; r < nprocs; r++)
  {
    if (recv_count[r] > 0)
    {
      MPI_Irecv(recvbuf + recv_offset[r] * stride,
                recv_count[r] * stride, MPI_DOUBLE,
                r, 0, MPI_COMM_WORLD, &reqs[req_idx++]);
    }
  }
  for (int r = 0; r < nprocs; r++)
  {
    if (send_count[r] > 0)
    {
      MPI_Isend(sendbuf + send_offset[r] * stride,
                send_count[r] * stride, MPI_DOUBLE,
                r, 0, MPI_COMM_WORLD, &reqs[req_idx++]);
    }
  }

  if (n_req > 0)
    MPI_Waitall(n_req, reqs, MPI_STATUSES_IGNORE);

  // Unpack recv buffers into Shellf
  for (int i = 0; i < total_recv; i++)
  {
    int pos = i * stride;
    int j = (int)recvbuf[pos];
    for (int v = 0; v < num_var; v++)
      Shellf[j * num_var + v] = recvbuf[pos + 1 + v];
  }

  delete[] reqs;
  delete[] sendbuf;
  delete[] recvbuf;
  delete[] pack_pos;
  delete[] send_offset;
  delete[] recv_offset;
  delete[] send_count;
  delete[] recv_count;
  delete[] consumer_rank;
  delete[] owner_rank;
}
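The consumer mapping in the overload above is a plain block distribution: the first `Lp` ranks take `mp + 1` consecutive points, the remaining ranks take `mp`. A small self-contained check of that formula (function and variable names are local to this sketch, not from the repository):

```cpp
// Standalone check of the deterministic block partition used for consumer_rank.
#include <cassert>
#include <cstdio>

int consumer_of(int j, int NN, int nprocs)
{
    int mp = NN / nprocs;            // base number of points per rank
    int Lp = NN - nprocs * mp;       // first Lp ranks take one extra point
    if (j < Lp * (mp + 1))
        return j / (mp + 1);
    return Lp + (j - Lp * (mp + 1)) / mp;
}

int main()
{
    const int NN = 103, nprocs = 8;  // deliberately non-divisible
    int counts[8] = {0};
    for (int j = 0; j < NN; j++)
    {
        int c = consumer_of(j, NN, nprocs);
        assert(c >= 0 && c < nprocs);
        if (j > 0)                   // monotone: consecutive points map to consecutive ranks
            assert(c >= consumer_of(j - 1, NN, nprocs));
        counts[c]++;
    }
    for (int r = 0; r < nprocs; r++) // each rank receives NN/nprocs or NN/nprocs + 1 points
        printf("rank %d consumes %d points\n", r, counts[r]);
    return 0;
}
```

Because every rank evaluates the same formula, owners and consumers agree on the communication pattern without any extra handshake.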
void Patch::Interp_Points(MyList<var> *VarList,
                          int NN, double **XX,
@@ -573,24 +784,22 @@ void Patch::Interp_Points(MyList<var> *VarList,
    varl = varl->next;
  }

  double *shellf;
  shellf = new double[NN * num_var];
  memset(shellf, 0, sizeof(double) * NN * num_var);
  memset(Shellf, 0, sizeof(double) * NN * num_var);

  // we use weight to monitor code, later some day we can move it for optimization
  int *weight;
  weight = new int[NN];
  memset(weight, 0, sizeof(int) * NN);
  // owner_rank[j] stores the global rank that owns point j
  int *owner_rank;
  owner_rank = new int[NN];
  for (int j = 0; j < NN; j++)
    owner_rank[j] = -1;

  double *DH, *llb, *uub;
  DH = new double[dim];
  // Build global-to-local rank translation for Comm_here
  MPI_Group world_group, local_group;
  MPI_Comm_group(MPI_COMM_WORLD, &world_group);
  MPI_Comm_group(Comm_here, &local_group);

  double DH[dim], llb[dim], uub[dim];
  for (int i = 0; i < dim; i++)
  {
    DH[i] = getdX(i);
  }
  llb = new double[dim];
  uub = new double[dim];

  for (int j = 0; j < NN; j++) // run along points
  {
@@ -622,12 +831,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
      bool flag = true;
      for (int i = 0; i < dim; i++)
      {
        // NOTE: our dividing structure is (exclude ghost)
        // -1 0
        // 1 2
        // so (0,1) does not belong to any part for vertex structure
        // here we put (0,0.5) to left part and (0.5,1) to right part
        // BUT for cell structure the bbox is (-1.5,0.5) and (0.5,2.5), there is no missing region at all
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
@@ -652,6 +855,7 @@ void Patch::Interp_Points(MyList<var> *VarList,
      if (flag)
      {
        notfind = false;
        owner_rank[j] = BP->rank;
        if (myrank == BP->rank)
        {
          //---> interpolation
@@ -659,14 +863,11 @@ void Patch::Interp_Points(MyList<var> *VarList,
          int k = 0;
          while (varl) // run along variables
          {
            // shellf[j*num_var+k] = Parallel::global_interp(dim,BP->shape,BP->X,BP->fgfs[varl->data->sgfn],
            //                                               pox,ordn,varl->data->SoA,Symmetry);
            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[j * num_var + k],
            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
            varl = varl->next;
            k++;
          }
          weight[j] = 1;
        }
      }
      if (Bp == ble)
@@ -675,97 +876,35 @@ void Patch::Interp_Points(MyList<var> *VarList,
    }
  }

  MPI_Allreduce(shellf, Shellf, NN * num_var, MPI_DOUBLE, MPI_SUM, Comm_here);
  int *Weight;
  Weight = new int[NN];
  MPI_Allreduce(weight, Weight, NN, MPI_INT, MPI_SUM, Comm_here);
  // Collect unique global owner ranks and translate to local ranks in Comm_here
  // Then broadcast each owner's points via MPI_Bcast on Comm_here
  {
    int j = 0;
    while (j < NN)
    {
      int cur_owner_global = owner_rank[j];
      if (cur_owner_global < 0)
      {
        // Point not found — skip (error check disabled for sub-communicator levels)
        j++;
        continue;
      }
      // Translate global rank to local rank in Comm_here
      int cur_owner_local;
      MPI_Group_translate_ranks(world_group, 1, &cur_owner_global, local_group, &cur_owner_local);

  // misc::tillherecheck("print me");
  // if(lmyrank == 0) cout<<"myrank = "<<myrank<<"print me"<<endl;

  for (int i = 0; i < NN; i++)
  {
    if (Weight[i] > 1)
    {
      if (lmyrank == 0)
        cout << "WARNING: Patch::Interp_Points meets multiple weight" << endl;
      for (int j = 0; j < num_var; j++)
        Shellf[j + i * num_var] = Shellf[j + i * num_var] / Weight[i];
      // Find contiguous run of points with the same owner
      int jstart = j;
      while (j < NN && owner_rank[j] == cur_owner_global)
        j++;
      int count = (j - jstart) * num_var;
      MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner_local, Comm_here);
    }
#if 0 // for not involved levels, this may fail
    else if(Weight[i] == 0 && lmyrank == 0)
    {
      cout<<"ERROR: Patch::Interp_Points fails to find point (";
      for(int j=0;j<dim;j++)
      {
        cout<<XX[j][i];
        if(j<dim-1) cout<<",";
        else cout<<")";
      }
      cout<<" on Patch (";
      for(int j=0;j<dim;j++)
      {
        cout<<bbox[j]<<"+"<<lli[j]*getdX(j);
        if(j<dim-1) cout<<",";
        else cout<<")--";
      }
      cout<<"(";
      for(int j=0;j<dim;j++)
      {
        cout<<bbox[dim+j]<<"-"<<uui[j]*getdX(j);
        if(j<dim-1) cout<<",";
        else cout<<")"<<endl;
      }
#if 0
      checkBlock();
#else
      cout<<"splited domains:"<<endl;
      {
        MyList<Block> *Bp=blb;
        while(Bp)
        {
          Block *BP=Bp->data;

          for(int i=0;i<dim;i++)
          {
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
            llb[i] = (feq(BP->bbox[i] ,bbox[i] ,DH[i]/2)) ? BP->bbox[i]+lli[i]*DH[i] : BP->bbox[i] +(ghost_width-0.5)*DH[i];
            uub[i] = (feq(BP->bbox[dim+i],bbox[dim+i],DH[i]/2)) ? BP->bbox[dim+i]-uui[i]*DH[i] : BP->bbox[dim+i]-(ghost_width-0.5)*DH[i];
#else
#ifdef Cell
            llb[i] = (feq(BP->bbox[i] ,bbox[i] ,DH[i]/2)) ? BP->bbox[i]+lli[i]*DH[i] : BP->bbox[i] +ghost_width*DH[i];
            uub[i] = (feq(BP->bbox[dim+i],bbox[dim+i],DH[i]/2)) ? BP->bbox[dim+i]-uui[i]*DH[i] : BP->bbox[dim+i]-ghost_width*DH[i];
#else
#error Not define Vertex nor Cell
#endif
#endif
          }
          cout<<"(";
          for(int j=0;j<dim;j++)
          {
            cout<<llb[j]<<":"<<uub[j];
            if(j<dim-1) cout<<",";
            else cout<<")"<<endl;
          }
          if(Bp == ble) break;
          Bp=Bp->next;
        }
      }
#endif
      MPI_Abort(MPI_COMM_WORLD,1);
    }
#endif
  }

  delete[] shellf;
  delete[] weight;
  delete[] Weight;
  delete[] DH;
  delete[] llb;
  delete[] uub;
  MPI_Group_free(&world_group);
  MPI_Group_free(&local_group);
  delete[] owner_rank;
}
void Patch::checkBlock()
{

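The `Comm_here` variant above must convert an owner's `MPI_COMM_WORLD` rank into the matching rank inside the sub-communicator before it can use it as a `MPI_Bcast` root. A minimal standalone sketch of that translation follows; the even/odd split is invented purely for illustration.

```cpp
// Standalone sketch: translating a MPI_COMM_WORLD rank into the corresponding
// rank inside a sub-communicator via MPI_Group_translate_ranks.
#include <mpi.h>
#include <cstdio>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int wrank;
    MPI_Comm_rank(MPI_COMM_WORLD, &wrank);

    // Example sub-communicator: ranks with the same parity.
    MPI_Comm sub;
    MPI_Comm_split(MPI_COMM_WORLD, wrank % 2, wrank, &sub);

    MPI_Group world_group, sub_group;
    MPI_Comm_group(MPI_COMM_WORLD, &world_group);
    MPI_Comm_group(sub, &sub_group);

    // Translate this process's own world rank; a rank outside the group
    // would come back as MPI_UNDEFINED, which cannot happen for our own rank.
    int local;
    MPI_Group_translate_ranks(world_group, 1, &wrank, sub_group, &local);
    printf("world rank %d -> local rank %d in its parity communicator\n", wrank, local);

    MPI_Group_free(&world_group);
    MPI_Group_free(&sub_group);
    MPI_Comm_free(&sub);
    MPI_Finalize();
    return 0;
}
```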
@@ -39,6 +39,10 @@ public:

  bool Find_Point(double *XX);

  void Interp_Points(MyList<var> *VarList,
                     int NN, double **XX,
                     double *Shellf, int Symmetry,
                     int Nmin_consumer, int Nmax_consumer);
  void Interp_Points(MyList<var> *VarList,
                     int NN, double **XX,
                     double *Shellf, int Symmetry, MPI_Comm Comm_here);

@@ -3756,6 +3756,502 @@ void Parallel::Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
  delete[] transfer_src;
  delete[] transfer_dst;
}
// Merged Sync: collect all intra-patch and inter-patch grid segment lists,
// then issue a single transfer() call instead of N+1 separate ones.
void Parallel::Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
{
  int cpusize;
  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);

  MyList<Parallel::gridseg> **combined_src = new MyList<Parallel::gridseg> *[cpusize];
  MyList<Parallel::gridseg> **combined_dst = new MyList<Parallel::gridseg> *[cpusize];
  for (int node = 0; node < cpusize; node++)
    combined_src[node] = combined_dst[node] = 0;

  // Phase A: Intra-patch ghost exchange segments
  MyList<Patch> *Pp = PatL;
  while (Pp)
  {
    Patch *Pat = Pp->data;
    MyList<Parallel::gridseg> *dst_ghost = build_ghost_gsl(Pat);

    for (int node = 0; node < cpusize; node++)
    {
      MyList<Parallel::gridseg> *src_owned = build_owned_gsl0(Pat, node);
      MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
      build_gstl(src_owned, dst_ghost, &tsrc, &tdst);

      if (tsrc)
      {
        if (combined_src[node])
          combined_src[node]->catList(tsrc);
        else
          combined_src[node] = tsrc;
      }
      if (tdst)
      {
        if (combined_dst[node])
          combined_dst[node]->catList(tdst);
        else
          combined_dst[node] = tdst;
      }

      if (src_owned)
        src_owned->destroyList();
    }

    if (dst_ghost)
      dst_ghost->destroyList();

    Pp = Pp->next;
  }

  // Phase B: Inter-patch buffer exchange segments
  MyList<Parallel::gridseg> *dst_buffer = build_buffer_gsl(PatL);
  for (int node = 0; node < cpusize; node++)
  {
    MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatL, node, 5, Symmetry);
    MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
    build_gstl(src_owned, dst_buffer, &tsrc, &tdst);

    if (tsrc)
    {
      if (combined_src[node])
        combined_src[node]->catList(tsrc);
      else
        combined_src[node] = tsrc;
    }
    if (tdst)
    {
      if (combined_dst[node])
        combined_dst[node]->catList(tdst);
      else
        combined_dst[node] = tdst;
    }

    if (src_owned)
      src_owned->destroyList();
  }
  if (dst_buffer)
    dst_buffer->destroyList();

  // Phase C: Single transfer
  transfer(combined_src, combined_dst, VarList, VarList, Symmetry);

  // Phase D: Cleanup
  for (int node = 0; node < cpusize; node++)
  {
    if (combined_src[node])
      combined_src[node]->destroyList();
    if (combined_dst[node])
      combined_dst[node]->destroyList();
  }
  delete[] combined_src;
  delete[] combined_dst;
}
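Sync_merged trades many small messages for one larger message per destination rank. Below is a standalone sketch of the underlying packing idea, reduced to paired ranks and two hypothetical segments; none of the repository types (`gridseg`, `MyList`) are used, and the payload layout is invented for illustration. Run it with an even number of ranks.

```cpp
// Standalone sketch: concatenate all segments bound for the same destination
// and exchange one message per neighbor instead of one message per segment.
#include <mpi.h>
#include <cstdio>
#include <vector>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int myrank, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    if (nprocs % 2)                      // pair ranks as r ^ 1: needs even size
        MPI_Abort(MPI_COMM_WORLD, 1);

    int peer = myrank ^ 1;
    // Two logical "segments" (stand-ins for ghost and buffer regions).
    std::vector<double> segA(5, 1.0 * myrank), segB(3, 10.0 * myrank);

    // Pack both segments into one contiguous buffer: one message, not two.
    std::vector<double> sendbuf;
    sendbuf.insert(sendbuf.end(), segA.begin(), segA.end());
    sendbuf.insert(sendbuf.end(), segB.begin(), segB.end());
    std::vector<double> recvbuf(sendbuf.size());

    MPI_Sendrecv(sendbuf.data(), (int)sendbuf.size(), MPI_DOUBLE, peer, 0,
                 recvbuf.data(), (int)recvbuf.size(), MPI_DOUBLE, peer, 0,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);

    // Unpack using the same fixed offsets that were used when packing.
    printf("rank %d got segA[0]=%g segB[0]=%g from rank %d\n",
           myrank, recvbuf[0], recvbuf[5], peer);
    MPI_Finalize();
    return 0;
}
```

The win is per-message latency: with N patches plus one inter-patch phase, the merged version pays one latency per neighbor instead of N+1.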
// SyncCache constructor
Parallel::SyncCache::SyncCache()
    : valid(false), cpusize(0), combined_src(0), combined_dst(0),
      send_lengths(0), recv_lengths(0), send_bufs(0), recv_bufs(0),
      send_buf_caps(0), recv_buf_caps(0), reqs(0), stats(0), max_reqs(0),
      lengths_valid(false)
{
}
// SyncCache invalidate: free grid segment lists but keep buffers
void Parallel::SyncCache::invalidate()
{
  if (!valid)
    return;
  for (int i = 0; i < cpusize; i++)
  {
    if (combined_src[i])
      combined_src[i]->destroyList();
    if (combined_dst[i])
      combined_dst[i]->destroyList();
    combined_src[i] = combined_dst[i] = 0;
    send_lengths[i] = recv_lengths[i] = 0;
  }
  valid = false;
  lengths_valid = false;
}
// SyncCache destroy: free everything
void Parallel::SyncCache::destroy()
{
  invalidate();
  if (combined_src) delete[] combined_src;
  if (combined_dst) delete[] combined_dst;
  if (send_lengths) delete[] send_lengths;
  if (recv_lengths) delete[] recv_lengths;
  if (send_buf_caps) delete[] send_buf_caps;
  if (recv_buf_caps) delete[] recv_buf_caps;
  for (int i = 0; i < cpusize; i++)
  {
    if (send_bufs && send_bufs[i]) delete[] send_bufs[i];
    if (recv_bufs && recv_bufs[i]) delete[] recv_bufs[i];
  }
  if (send_bufs) delete[] send_bufs;
  if (recv_bufs) delete[] recv_bufs;
  if (reqs) delete[] reqs;
  if (stats) delete[] stats;
  combined_src = combined_dst = 0;
  send_lengths = recv_lengths = 0;
  send_buf_caps = recv_buf_caps = 0;
  send_bufs = recv_bufs = 0;
  reqs = 0; stats = 0;
  cpusize = 0; max_reqs = 0;
}
// transfer_cached: reuse pre-allocated buffers from SyncCache
void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
                               MyList<var> *VarList1, MyList<var> *VarList2,
                               int Symmetry, SyncCache &cache)
{
  int myrank;
  MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize);
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  int cpusize = cache.cpusize;

  int req_no = 0;
  int node;

  for (node = 0; node < cpusize; node++)
  {
    if (node == myrank)
    {
      int length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      cache.recv_lengths[node] = length;
      if (length > 0)
      {
        if (length > cache.recv_buf_caps[node])
        {
          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
          cache.recv_bufs[node] = new double[length];
          cache.recv_buf_caps[node] = length;
        }
        data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      }
    }
    else
    {
      // send
      int slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      cache.send_lengths[node] = slength;
      if (slength > 0)
      {
        if (slength > cache.send_buf_caps[node])
        {
          if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
          cache.send_bufs[node] = new double[slength];
          cache.send_buf_caps[node] = slength;
        }
        data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
        MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
      }
      // recv
      int rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
      cache.recv_lengths[node] = rlength;
      if (rlength > 0)
      {
        if (rlength > cache.recv_buf_caps[node])
        {
          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
          cache.recv_bufs[node] = new double[rlength];
          cache.recv_buf_caps[node] = rlength;
        }
        MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
      }
    }
  }

  MPI_Waitall(req_no, cache.reqs, cache.stats);

  for (node = 0; node < cpusize; node++)
    if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0)
      data_packer(cache.recv_bufs[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
}
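transfer_cached reallocates a message buffer only when the required length exceeds the recorded capacity, so repeated calls with stable message sizes allocate nothing after the first pass. A minimal sketch of that grow-only pattern in isolation (the `GrowBuf` type is invented for this sketch):

```cpp
// Standalone sketch of the grow-only buffer reuse: reallocate only when a
// request exceeds the recorded capacity, so the steady state is allocation-free.
#include <cstdio>

struct GrowBuf
{
    double *data = nullptr;
    int cap = 0;
    double *ensure(int n)          // mirrors the "if (length > cap) reallocate" pattern
    {
        if (n > cap)
        {
            delete[] data;         // safe on nullptr
            data = new double[n];
            cap = n;
        }
        return data;
    }
    ~GrowBuf() { delete[] data; }
};

int main()
{
    GrowBuf buf;
    buf.ensure(100);   // allocates
    buf.ensure(80);    // reuses: 80 <= current capacity
    buf.ensure(150);   // grows exactly once more
    printf("final capacity: %d\n", buf.cap);
    return 0;
}
```

In an evolution loop where message sizes are fixed after the first step, this turns per-step `new`/`delete` traffic into a one-time cost.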
// Sync_cached: build grid segment lists on first call, reuse on subsequent calls
void Parallel::Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache)
{
  if (!cache.valid)
  {
    int cpusize;
    MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
    cache.cpusize = cpusize;

    // Allocate cache arrays if needed
    if (!cache.combined_src)
    {
      cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
      cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
      cache.send_lengths = new int[cpusize];
      cache.recv_lengths = new int[cpusize];
      cache.send_bufs = new double *[cpusize];
      cache.recv_bufs = new double *[cpusize];
      cache.send_buf_caps = new int[cpusize];
      cache.recv_buf_caps = new int[cpusize];
      for (int i = 0; i < cpusize; i++)
      {
        cache.send_bufs[i] = cache.recv_bufs[i] = 0;
        cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
      }
      cache.max_reqs = 2 * cpusize;
      cache.reqs = new MPI_Request[cache.max_reqs];
      cache.stats = new MPI_Status[cache.max_reqs];
    }

    for (int node = 0; node < cpusize; node++)
    {
      cache.combined_src[node] = cache.combined_dst[node] = 0;
      cache.send_lengths[node] = cache.recv_lengths[node] = 0;
    }

    // Build intra-patch segments (same as Sync_merged Phase A)
    MyList<Patch> *Pp = PatL;
    while (Pp)
    {
      Patch *Pat = Pp->data;
      MyList<Parallel::gridseg> *dst_ghost = build_ghost_gsl(Pat);
      for (int node = 0; node < cpusize; node++)
      {
        MyList<Parallel::gridseg> *src_owned = build_owned_gsl0(Pat, node);
        MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
        build_gstl(src_owned, dst_ghost, &tsrc, &tdst);
        if (tsrc)
        {
          if (cache.combined_src[node])
            cache.combined_src[node]->catList(tsrc);
          else
            cache.combined_src[node] = tsrc;
        }
        if (tdst)
        {
          if (cache.combined_dst[node])
            cache.combined_dst[node]->catList(tdst);
          else
            cache.combined_dst[node] = tdst;
        }
        if (src_owned) src_owned->destroyList();
      }
      if (dst_ghost) dst_ghost->destroyList();
      Pp = Pp->next;
    }

    // Build inter-patch segments (same as Sync_merged Phase B)
    MyList<Parallel::gridseg> *dst_buffer = build_buffer_gsl(PatL);
    for (int node = 0; node < cpusize; node++)
    {
      MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatL, node, 5, Symmetry);
      MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
      build_gstl(src_owned, dst_buffer, &tsrc, &tdst);
      if (tsrc)
      {
        if (cache.combined_src[node])
          cache.combined_src[node]->catList(tsrc);
        else
          cache.combined_src[node] = tsrc;
      }
      if (tdst)
      {
        if (cache.combined_dst[node])
          cache.combined_dst[node]->catList(tdst);
        else
          cache.combined_dst[node] = tdst;
      }
      if (src_owned) src_owned->destroyList();
    }
    if (dst_buffer) dst_buffer->destroyList();

    cache.valid = true;
  }

  // Use cached lists with buffer-reusing transfer
  transfer_cached(cache.combined_src, cache.combined_dst, VarList, VarList, Symmetry, cache);
}
// Sync_start: pack and post MPI_Isend/Irecv, return immediately
void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
                          SyncCache &cache, AsyncSyncState &state)
{
  // Ensure cache is built
  if (!cache.valid)
  {
    // Build cache (same logic as Sync_cached)
    int cpusize;
    MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
    cache.cpusize = cpusize;

    if (!cache.combined_src)
    {
      cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
      cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
      cache.send_lengths = new int[cpusize];
      cache.recv_lengths = new int[cpusize];
      cache.send_bufs = new double *[cpusize];
      cache.recv_bufs = new double *[cpusize];
      cache.send_buf_caps = new int[cpusize];
      cache.recv_buf_caps = new int[cpusize];
      for (int i = 0; i < cpusize; i++)
      {
        cache.send_bufs[i] = cache.recv_bufs[i] = 0;
        cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
      }
      cache.max_reqs = 2 * cpusize;
      cache.reqs = new MPI_Request[cache.max_reqs];
      cache.stats = new MPI_Status[cache.max_reqs];
    }

    for (int node = 0; node < cpusize; node++)
    {
      cache.combined_src[node] = cache.combined_dst[node] = 0;
      cache.send_lengths[node] = cache.recv_lengths[node] = 0;
    }

    MyList<Patch> *Pp = PatL;
    while (Pp)
    {
      Patch *Pat = Pp->data;
      MyList<Parallel::gridseg> *dst_ghost = build_ghost_gsl(Pat);
      for (int node = 0; node < cpusize; node++)
      {
        MyList<Parallel::gridseg> *src_owned = build_owned_gsl0(Pat, node);
        MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
        build_gstl(src_owned, dst_ghost, &tsrc, &tdst);
        if (tsrc)
        {
          if (cache.combined_src[node])
            cache.combined_src[node]->catList(tsrc);
          else
            cache.combined_src[node] = tsrc;
        }
        if (tdst)
        {
          if (cache.combined_dst[node])
            cache.combined_dst[node]->catList(tdst);
          else
            cache.combined_dst[node] = tdst;
        }
        if (src_owned) src_owned->destroyList();
      }
      if (dst_ghost) dst_ghost->destroyList();
      Pp = Pp->next;
    }

    MyList<Parallel::gridseg> *dst_buffer = build_buffer_gsl(PatL);
    for (int node = 0; node < cpusize; node++)
    {
      MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatL, node, 5, Symmetry);
      MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
      build_gstl(src_owned, dst_buffer, &tsrc, &tdst);
      if (tsrc)
      {
        if (cache.combined_src[node])
          cache.combined_src[node]->catList(tsrc);
        else
          cache.combined_src[node] = tsrc;
      }
      if (tdst)
      {
        if (cache.combined_dst[node])
          cache.combined_dst[node]->catList(tdst);
        else
          cache.combined_dst[node] = tdst;
      }
      if (src_owned) src_owned->destroyList();
    }
    if (dst_buffer) dst_buffer->destroyList();
    cache.valid = true;
  }

  // Now pack and post async MPI operations
  int myrank;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  int cpusize = cache.cpusize;
  state.req_no = 0;
  state.active = true;

  MyList<Parallel::gridseg> **src = cache.combined_src;
  MyList<Parallel::gridseg> **dst = cache.combined_dst;

  for (int node = 0; node < cpusize; node++)
  {
    if (node == myrank)
    {
      int length;
      if (!cache.lengths_valid) {
        length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
        cache.recv_lengths[node] = length;
      } else {
        length = cache.recv_lengths[node];
      }
      if (length > 0)
      {
        if (length > cache.recv_buf_caps[node])
        {
          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
          cache.recv_bufs[node] = new double[length];
          cache.recv_buf_caps[node] = length;
        }
        data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
      }
    }
    else
    {
      int slength;
      if (!cache.lengths_valid) {
        slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
        cache.send_lengths[node] = slength;
      } else {
        slength = cache.send_lengths[node];
      }
      if (slength > 0)
      {
        if (slength > cache.send_buf_caps[node])
        {
          if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
          cache.send_bufs[node] = new double[slength];
          cache.send_buf_caps[node] = slength;
        }
        data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
        MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
      }
      int rlength;
      if (!cache.lengths_valid) {
        rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList, VarList, Symmetry);
        cache.recv_lengths[node] = rlength;
      } else {
        rlength = cache.recv_lengths[node];
      }
      if (rlength > 0)
      {
        if (rlength > cache.recv_buf_caps[node])
        {
          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
          cache.recv_bufs[node] = new double[rlength];
          cache.recv_buf_caps[node] = rlength;
        }
        MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
      }
    }
  }
  cache.lengths_valid = true;
}
// Sync_finish: wait for async MPI operations and unpack
void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state,
                           MyList<var> *VarList, int Symmetry)
{
  if (!state.active)
    return;

  MPI_Waitall(state.req_no, cache.reqs, cache.stats);

  int cpusize = cache.cpusize;
  MyList<Parallel::gridseg> **src = cache.combined_src;
  MyList<Parallel::gridseg> **dst = cache.combined_dst;

  for (int node = 0; node < cpusize; node++)
    if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0)
      data_packer(cache.recv_bufs[node], src[node], dst[node], node, UNPACK, VarList, VarList, Symmetry);

  state.active = false;
}
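Sync_start/Sync_finish split one synchronization into a post phase and a wait phase so useful work can run in between. Below is a standalone sketch of the same split-phase idea as a halo exchange on a 1-D ring; the ring layout and payload are invented for illustration and do not use the repository's cache or packer machinery.

```cpp
// Standalone sketch: post nonblocking halo exchange ("Sync_start"), overlap it
// with interior work, then wait and use the ghost data ("Sync_finish").
#include <mpi.h>
#include <cstdio>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int myrank, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    int left = (myrank + nprocs - 1) % nprocs;
    int right = (myrank + 1) % nprocs;
    double send = (double)myrank, ghost = -1.0;

    // "Sync_start": post the exchange and return immediately.
    MPI_Request reqs[2];
    MPI_Irecv(&ghost, 1, MPI_DOUBLE, left, 0, MPI_COMM_WORLD, &reqs[0]);
    MPI_Isend(&send, 1, MPI_DOUBLE, right, 0, MPI_COMM_WORLD, &reqs[1]);

    // Interior work that touches no ghost zones overlaps the transfer.
    double interior = 0.0;
    for (int i = 0; i < 1000000; i++)
        interior += 1e-6;

    // "Sync_finish": wait, then do the ghost-dependent part.
    MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
    printf("rank %d: interior=%g ghost from left=%g\n", myrank, interior, ghost);
    MPI_Finalize();
    return 0;
}
```

The overlap only pays off when the interior work between start and finish is comparable to the message latency, which is exactly the situation in a stencil update over a large level.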
// collect buffer grid segments or blocks for the periodic boundary condition of given patch
// ---------------------------------------------------
// |con |    |con |
@@ -4790,6 +5286,203 @@ void Parallel::OutBdLow2Himix(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
  delete[] transfer_src;
  delete[] transfer_dst;
}

// Restrict_cached: cache grid segment lists, reuse buffers via transfer_cached
void Parallel::Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                               MyList<var> *VarList1, MyList<var> *VarList2,
                               int Symmetry, SyncCache &cache)
{
  if (!cache.valid)
  {
    int cpusize;
    MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
    cache.cpusize = cpusize;

    if (!cache.combined_src)
    {
      cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
      cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
      cache.send_lengths = new int[cpusize];
      cache.recv_lengths = new int[cpusize];
      cache.send_bufs = new double *[cpusize];
      cache.recv_bufs = new double *[cpusize];
      cache.send_buf_caps = new int[cpusize];
      cache.recv_buf_caps = new int[cpusize];
      for (int i = 0; i < cpusize; i++)
      {
        cache.send_bufs[i] = cache.recv_bufs[i] = 0;
        cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
      }
      cache.max_reqs = 2 * cpusize;
      cache.reqs = new MPI_Request[cache.max_reqs];
      cache.stats = new MPI_Status[cache.max_reqs];
    }

    MyList<Parallel::gridseg> *dst = build_complete_gsl(PatcL);
    for (int node = 0; node < cpusize; node++)
    {
      MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatfL, node, 2, Symmetry);
      build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]);
      if (src_owned) src_owned->destroyList();
    }
    if (dst) dst->destroyList();

    cache.valid = true;
  }

  transfer_cached(cache.combined_src, cache.combined_dst, VarList1, VarList2, Symmetry, cache);
}

// OutBdLow2Hi_cached: cache grid segment lists, reuse buffers via transfer_cached
void Parallel::OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                                  MyList<var> *VarList1, MyList<var> *VarList2,
                                  int Symmetry, SyncCache &cache)
{
  if (!cache.valid)
  {
    int cpusize;
    MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
    cache.cpusize = cpusize;

    if (!cache.combined_src)
    {
      cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
      cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
      cache.send_lengths = new int[cpusize];
      cache.recv_lengths = new int[cpusize];
      cache.send_bufs = new double *[cpusize];
      cache.recv_bufs = new double *[cpusize];
      cache.send_buf_caps = new int[cpusize];
      cache.recv_buf_caps = new int[cpusize];
      for (int i = 0; i < cpusize; i++)
      {
        cache.send_bufs[i] = cache.recv_bufs[i] = 0;
        cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
      }
      cache.max_reqs = 2 * cpusize;
      cache.reqs = new MPI_Request[cache.max_reqs];
      cache.stats = new MPI_Status[cache.max_reqs];
    }

    MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
    for (int node = 0; node < cpusize; node++)
    {
      MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatcL, node, 4, Symmetry);
      build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]);
      if (src_owned) src_owned->destroyList();
    }
    if (dst) dst->destroyList();

    cache.valid = true;
  }

  transfer_cached(cache.combined_src, cache.combined_dst, VarList1, VarList2, Symmetry, cache);
}

// OutBdLow2Himix_cached: same as OutBdLow2Hi_cached but uses transfermix for unpacking
void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                                     MyList<var> *VarList1, MyList<var> *VarList2,
                                     int Symmetry, SyncCache &cache)
{
  if (!cache.valid)
  {
    int cpusize;
    MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
    cache.cpusize = cpusize;

    if (!cache.combined_src)
    {
      cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
      cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
      cache.send_lengths = new int[cpusize];
      cache.recv_lengths = new int[cpusize];
      cache.send_bufs = new double *[cpusize];
      cache.recv_bufs = new double *[cpusize];
      cache.send_buf_caps = new int[cpusize];
      cache.recv_buf_caps = new int[cpusize];
      for (int i = 0; i < cpusize; i++)
      {
        cache.send_bufs[i] = cache.recv_bufs[i] = 0;
        cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
      }
      cache.max_reqs = 2 * cpusize;
      cache.reqs = new MPI_Request[cache.max_reqs];
      cache.stats = new MPI_Status[cache.max_reqs];
    }

    MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
    for (int node = 0; node < cpusize; node++)
    {
      MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatcL, node, 4, Symmetry);
      build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]);
      if (src_owned) src_owned->destroyList();
    }
    if (dst) dst->destroyList();

    cache.valid = true;
  }

  // Use transfermix instead of transfer for mix-mode interpolation
  int myrank;
  MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize);
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  int cpusize = cache.cpusize;

  int req_no = 0;
  for (int node = 0; node < cpusize; node++)
  {
    if (node == myrank)
    {
      int length = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      cache.recv_lengths[node] = length;
      if (length > 0)
      {
        if (length > cache.recv_buf_caps[node])
        {
          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
          cache.recv_bufs[node] = new double[length];
          cache.recv_buf_caps[node] = length;
        }
        data_packermix(cache.recv_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      }
    }
    else
    {
      int slength = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      cache.send_lengths[node] = slength;
      if (slength > 0)
      {
        if (slength > cache.send_buf_caps[node])
        {
          if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
          cache.send_bufs[node] = new double[slength];
          cache.send_buf_caps[node] = slength;
        }
        data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
        MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
      }
      int rlength = data_packermix(0, cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
      cache.recv_lengths[node] = rlength;
      if (rlength > 0)
      {
        if (rlength > cache.recv_buf_caps[node])
        {
          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
          cache.recv_bufs[node] = new double[rlength];
          cache.recv_buf_caps[node] = rlength;
        }
        MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
      }
    }
  }

  MPI_Waitall(req_no, cache.reqs, cache.stats);

  for (int node = 0; node < cpusize; node++)
    if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0)
      data_packermix(cache.recv_bufs[node], cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
}

// collect all buffer grid segments or blocks for given patch
MyList<Parallel::gridseg> *Parallel::build_buffer_gsl(Patch *Pat)
{

@@ -81,6 +81,43 @@ namespace Parallel
           int Symmetry);
  void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry);
  void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
  void Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);

  struct SyncCache {
    bool valid;
    int cpusize;
    MyList<gridseg> **combined_src;
    MyList<gridseg> **combined_dst;
    int *send_lengths;
    int *recv_lengths;
    double **send_bufs;
    double **recv_bufs;
    int *send_buf_caps;
    int *recv_buf_caps;
    MPI_Request *reqs;
    MPI_Status *stats;
    int max_reqs;
    bool lengths_valid;
    SyncCache();
    void invalidate();
    void destroy();
  };

  void Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache);
  void transfer_cached(MyList<gridseg> **src, MyList<gridseg> **dst,
                       MyList<var> *VarList1, MyList<var> *VarList2,
                       int Symmetry, SyncCache &cache);

  struct AsyncSyncState {
    int req_no;
    bool active;
    AsyncSyncState() : req_no(0), active(false) {}
  };

  void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
                  SyncCache &cache, AsyncSyncState &state);
  void Sync_finish(SyncCache &cache, AsyncSyncState &state,
                   MyList<var> *VarList, int Symmetry);
  void OutBdLow2Hi(Patch *Patc, Patch *Patf,
                   MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
                   int Symmetry);
@@ -93,6 +130,15 @@ namespace Parallel
  void OutBdLow2Himix(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                      MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
                      int Symmetry);
  void Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                       MyList<var> *VarList1, MyList<var> *VarList2,
                       int Symmetry, SyncCache &cache);
  void OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                          MyList<var> *VarList1, MyList<var> *VarList2,
                          int Symmetry, SyncCache &cache);
  void OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                             MyList<var> *VarList1, MyList<var> *VarList2,
                             int Symmetry, SyncCache &cache);
  void Prolong(Patch *Patc, Patch *Patf,
               MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
               int Symmetry);

@@ -1359,7 +1359,7 @@ void TwoPunctures::F_of_v(int nvar, int n1, int n2, int n3, derivs v, double *F,
    debugfile = fopen("res.dat", "w");
    assert(debugfile);
  }
#pragma omp parallel for collapse(3) schedule(static) \
#pragma omp parallel for collapse(3) schedule(dynamic,1) \
    private(i, j, k, ivar, indx, al, be, A, B, X, R, x, r, phi, y, z, Am1, \
            psi, psi2, psi4, psi7, r_plus, r_minus)
  for (i = 0; i < n1; i++)
@@ -1829,7 +1829,7 @@ void TwoPunctures::J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, doubl

  Derivatives_AB3_MatMul(nvar, n1, n2, n3, dv);

#pragma omp parallel for schedule(static) \
#pragma omp parallel for schedule(dynamic,1) \
    private(j, k, ivar, indx, al, be, A, B, X, R, x, r, phi, y, z, Am1)
  for (i = 0; i < n1; i++)
  {
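Both hunks switch `schedule(static)` to `schedule(dynamic,1)`. A standalone sketch of the clause on a deliberately imbalanced loop follows (the synthetic workload is invented for illustration; compile with `-fopenmp`):

```cpp
// Standalone sketch: with strongly varying per-iteration cost,
// schedule(dynamic, 1) hands out iterations one at a time and balances load
// better than schedule(static)'s fixed equal-sized chunks.
#include <omp.h>
#include <cstdio>

int main()
{
    const int n = 64;
    double sum = 0.0;
    #pragma omp parallel for schedule(dynamic, 1) reduction(+ : sum)
    for (int i = 0; i < n; i++)
    {
        // Artificially imbalanced work: late iterations cost far more.
        double w = 0.0;
        for (long k = 0; k < 100000L * (i + 1); k++)
            w += 1e-9;
        sum += w;
    }
    printf("sum = %g (max threads: %d)\n", sum, omp_get_max_threads());
    return 0;
}
```

Dynamic scheduling adds a small per-iteration dispatch cost, so it pays off only when iteration costs actually vary, as they appear to in the spectral loops above.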
@@ -2111,10 +2111,19 @@ void TwoPunctures::JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2,
  double sin_be, sin_be_i1, sin_be_i2, sin_be_i3, cos_be;
  double dV0, dV1, dV2, dV3, dV11, dV12, dV13, dV22, dV23, dV33,
      ha, ga, ga2, hb, gb, gb2, hp, gp, gp2, gagb, gagp, gbgp;
  derivs dU, U;

  allocate_derivs(&dU, nvar);
  allocate_derivs(&U, nvar);
  // Stack-allocated derivs (nvar=1) — no malloc/free!
  double dU_d0[1], dU_d1[1], dU_d2[1], dU_d3[1];
  double dU_d11[1], dU_d12[1], dU_d13[1], dU_d22[1], dU_d23[1], dU_d33[1];
  double U_d0[1], U_d1[1], U_d2[1], U_d3[1];
  double U_d11[1], U_d12[1], U_d13[1], U_d22[1], U_d23[1], U_d33[1];
  derivs dU, U;
  dU.d0=dU_d0; dU.d1=dU_d1; dU.d2=dU_d2; dU.d3=dU_d3;
  dU.d11=dU_d11; dU.d12=dU_d12; dU.d13=dU_d13;
  dU.d22=dU_d22; dU.d23=dU_d23; dU.d33=dU_d33;
  U.d0=U_d0; U.d1=U_d1; U.d2=U_d2; U.d3=U_d3;
  U.d11=U_d11; U.d12=U_d12; U.d13=U_d13;
  U.d22=U_d22; U.d23=U_d23; U.d33=U_d33;

  if (k < 0)
    k = k + n3;
@@ -2182,12 +2191,9 @@ void TwoPunctures::JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2,
  dV11 = ga2 * (dv.d0[ipcc] + dv.d0[imcc] - 2 * dv.d0[iccc]);
  dV22 = gb2 * (dv.d0[icpc] + dv.d0[icmc] - 2 * dv.d0[iccc]);
  dV33 = gp2 * (dv.d0[iccp] + dv.d0[iccm] - 2 * dv.d0[iccc]);
  dV12 =
      0.25 * gagb * (dv.d0[ippc] - dv.d0[ipmc] + dv.d0[immc] - dv.d0[impc]);
  dV13 =
      0.25 * gagp * (dv.d0[ipcp] - dv.d0[imcp] + dv.d0[imcm] - dv.d0[ipcm]);
  dV23 =
      0.25 * gbgp * (dv.d0[icpp] - dv.d0[icpm] + dv.d0[icmm] - dv.d0[icmp]);
  dV12 = 0.25 * gagb * (dv.d0[ippc] - dv.d0[ipmc] + dv.d0[immc] - dv.d0[impc]);
  dV13 = 0.25 * gagp * (dv.d0[ipcp] - dv.d0[imcp] + dv.d0[imcm] - dv.d0[ipcm]);
  dV23 = 0.25 * gbgp * (dv.d0[icpp] - dv.d0[icpm] + dv.d0[icmm] - dv.d0[icmp]);
  /* Derivatives of (dv) w.r.t. (A,B,phi):*/
  dV11 = sin_al_i3 * (sin_al * dV11 - cos_al * dV1);
  dV12 = sin_al_i1 * sin_be_i1 * dV12;
@@ -2230,11 +2236,12 @@ void TwoPunctures::JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2,
  /* (dU, dU_x, dU_y, dU_z, dU_xx, dU_xy, dU_xz, dU_yy, dU_yz, dU_zz)*/
  rx3_To_xyz(nvar, x, r, phi, &y, &z, dU);
  LinEquations(A, B, X, R, x, r, phi, y, z, dU, U, values);
  for (ivar = 0; ivar < nvar; ivar++)
    values[ivar] *= FAC;

  free_derivs(&dU, nvar);
  free_derivs(&U, nvar);
  double FAC_val = sin_al * sin_be * sin_al * sin_be * sin_al * sin_be;
  for (ivar = 0; ivar < nvar; ivar++)
    values[ivar] *= FAC_val;

  // No free_derivs needed — everything is on the stack
}
#undef FAC
/*-----------------------------------------------------------*/
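The hunk above replaces `allocate_derivs`/`free_derivs` with stack arrays wired into the struct's pointer members. A reduced standalone sketch of the same pattern follows; this `derivs` is a simplified stand-in with only four members, not the TwoPunctures definition.

```cpp
// Standalone sketch: when the element count is known to be 1, point a struct's
// pointer members at fixed-size stack arrays instead of heap-allocating them
// in a hot per-point routine.
#include <cstdio>

struct derivs
{
    double *d0, *d1, *d2, *d3;   // values and first derivatives (simplified)
};

static double use(const derivs &U)
{
    return U.d0[0] + U.d1[0] + U.d2[0] + U.d3[0];
}

int main()
{
    // Instead of: allocate_derivs(&U, 1); ... free_derivs(&U, 1);
    double d0[1] = {1}, d1[1] = {2}, d2[1] = {3}, d3[1] = {4};
    derivs U;
    U.d0 = d0; U.d1 = d1; U.d2 = d2; U.d3 = d3;   // no malloc/free per call
    printf("%g\n", use(U));
    return 0;
}
```

This is only safe when the wired struct never outlives the stack frame and the compile-time size really bounds `nvar`, which the hunk's `nvar=1` comment asserts.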
@@ -485,25 +485,7 @@ void Z4c_class::Step(int lev, int YN)
}
#endif

Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);

#ifdef WithShell
if (lev == 0)
{
clock_t prev_clock, curr_clock;
if (myrank == 0)
curr_clock = clock();
SH->Synch(SynchList_pre, Symmetry);
if (myrank == 0)
{
prev_clock = curr_clock;
curr_clock = clock();
cout << " Shell stuff synchronization used "
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
<< " seconds! " << endl;
}
}
#endif
// CA-RK4: skip post-prediction sync (redundant; ghost cells computable locally)

// for black hole position
if (BH_num > 0 && lev == GH->levels - 1)
@@ -868,6 +850,8 @@ void Z4c_class::Step(int lev, int YN)
}
#endif

// CA-RK4: only sync after last corrector (iter_count == 3); stages 1 & 2 are redundant
if (iter_count == 3) {
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);

#ifdef WithShell
@@ -887,6 +871,7 @@ void Z4c_class::Step(int lev, int YN)
}
}
#endif
} // end CA-RK4 guard
// for black hole position
if (BH_num > 0 && lev == GH->levels - 1)
{
@@ -1558,7 +1543,7 @@ void Z4c_class::Step(int lev, int YN)
}
}

Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
// CA-RK4: skip post-prediction MPI ghost sync (redundant; ghost cells computable locally)

if (lev == 0)
{
@@ -2120,6 +2105,8 @@ void Z4c_class::Step(int lev, int YN)
}
}

// CA-RK4: only MPI sync after last corrector (iter_count == 3); stages 1 & 2 are redundant
if (iter_count == 3)
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);

if (lev == 0)

@@ -1221,25 +1221,7 @@ void bssnEM_class::Step(int lev, int YN)
}
#endif

Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);

#ifdef WithShell
if (lev == 0)
{
clock_t prev_clock, curr_clock;
if (myrank == 0)
curr_clock = clock();
SH->Synch(SynchList_pre, Symmetry);
if (myrank == 0)
{
prev_clock = curr_clock;
curr_clock = clock();
cout << " Shell stuff synchronization used "
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
<< " seconds! " << endl;
}
}
#endif
// CA-RK4: skip post-prediction sync (redundant; ghost cells computable locally)

// for black hole position
if (BH_num > 0 && lev == GH->levels - 1)
@@ -1683,6 +1665,8 @@ void bssnEM_class::Step(int lev, int YN)
}
#endif

// CA-RK4: only sync after last corrector (iter_count == 3); stages 1 & 2 are redundant
if (iter_count == 3) {
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);

#ifdef WithShell
@@ -1702,6 +1686,7 @@ void bssnEM_class::Step(int lev, int YN)
}
}
#endif
} // end CA-RK4 guard
// for black hole position
if (BH_num > 0 && lev == GH->levels - 1)
{
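A minimal sketch of the CA-RK4 guard applied in both Z4c_class::Step and bssnEM_class::Step above: inside the RK4 corrector loop the ghost-zone exchange runs only after the final stage, on the stated assumption that intermediate-stage ghost values are recomputable locally. The helper names (corrector_stage, sync_ghost_zones) are illustrative stubs, not the project's API.

    #include <cstdio>

    // Illustrative stubs; the real code computes RHS terms and performs
    // the MPI ghost-zone exchange here.
    static void corrector_stage(int stage) { std::printf("stage %d\n", stage); }
    static void sync_ghost_zones()         { std::printf("sync\n"); }

    int main()
    {
        // Communication-avoiding RK4: only the final corrector result is
        // exchanged; stages 1 and 2 skip the sync entirely.
        for (int iter_count = 1; iter_count < 4; ++iter_count) {
            corrector_stage(iter_count);
            if (iter_count == 3)
                sync_ghost_zones();
        }
        return 0;
    }

Correctness rests entirely on the "ghost cells computable locally" claim in the comments: the ghost width must be large enough that stage-1 and stage-2 ghost values never feed interior points before the final sync.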
@@ -730,6 +730,12 @@ void bssn_class::Initialize()
PhysTime = StartTime;
Setup_Black_Hole_position();
}

// Initialize sync caches (per-level, for predictor and corrector)
sync_cache_pre = new Parallel::SyncCache[GH->levels];
sync_cache_cor = new Parallel::SyncCache[GH->levels];
sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
}

//================================================================================================
@@ -981,6 +987,32 @@ bssn_class::~bssn_class()
delete Azzz;
#endif

// Destroy sync caches before GH
if (sync_cache_pre)
{
for (int i = 0; i < GH->levels; i++)
sync_cache_pre[i].destroy();
delete[] sync_cache_pre;
}
if (sync_cache_cor)
{
for (int i = 0; i < GH->levels; i++)
sync_cache_cor[i].destroy();
delete[] sync_cache_cor;
}
if (sync_cache_rp_coarse)
{
for (int i = 0; i < GH->levels; i++)
sync_cache_rp_coarse[i].destroy();
delete[] sync_cache_rp_coarse;
}
if (sync_cache_rp_fine)
{
for (int i = 0; i < GH->levels; i++)
sync_cache_rp_fine[i].destroy();
delete[] sync_cache_rp_fine;
}

delete GH;
#ifdef WithShell
delete SH;
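The four destructor blocks above repeat the same destroy-then-delete[] shape. A compact equivalent, offered only as a sketch (the SyncCache stand-in below is assumed, not the project's class), would be a small helper; whether the factoring is worth it is a style call:

    #include <cstddef>

    // Stand-in for Parallel::SyncCache (interface assumed from the diff).
    struct SyncCacheSketch { void destroy() {} };

    // One helper instead of four copy-pasted destructor blocks.
    static void destroy_caches(SyncCacheSketch *&caches, int levels)
    {
        if (!caches) return;
        for (int i = 0; i < levels; i++)
            caches[i].destroy();   // release cached MPI resources first
        delete[] caches;
        caches = nullptr;          // guards against double destruction
    }

    int main()
    {
        SyncCacheSketch *pre = new SyncCacheSketch[4];
        destroy_caches(pre, 4);    // frees
        destroy_caches(pre, 4);    // safe no-op: pre is now null
        return 0;
    }

Nulling the pointer after delete[] also hardens the destructor against accidental double frees, which the diff's version does not do.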
@@ -2181,6 +2213,7 @@ void bssn_class::Evolve(int Steps)
GH->Regrid(Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
#endif

#if (REGLEV == 0 && (PSTR == 1 || PSTR == 2))
@@ -2393,9 +2426,10 @@ void bssn_class::RecursiveStep(int lev)
#endif

#if (REGLEV == 0)
GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
#endif
}

@@ -2571,9 +2605,10 @@ void bssn_class::ParallelStep()
delete[] tporg;
delete[] tporgo;
#if (REGLEV == 0)
GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
if (GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
#endif
}

@@ -2737,9 +2772,10 @@ void bssn_class::ParallelStep()
if (lev + 1 >= GH->movls)
{
// GH->Regrid_Onelevel_aux(lev,Symmetry,BH_num,Porgbr,Porg0,
GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
if (GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor);
fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor))
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }

// a_stream.clear();
// a_stream.str("");
@@ -2751,9 +2787,10 @@ void bssn_class::ParallelStep()
// for this level
if (YN == 1)
{
GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }

// a_stream.clear();
// a_stream.str("");
@@ -2769,9 +2806,10 @@ void bssn_class::ParallelStep()
if (YN == 1)
{
// GH->Regrid_Onelevel_aux(lev-2,Symmetry,BH_num,Porgbr,Porg0,
GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor);
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }

// a_stream.clear();
// a_stream.str("");
@@ -2784,9 +2822,10 @@ void bssn_class::ParallelStep()
if (i % 4 == 3)
{
// GH->Regrid_Onelevel_aux(lev-2,Symmetry,BH_num,Porgbr,Porg0,
GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor);
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }

// a_stream.clear();
// a_stream.str("");
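The recurring pattern in the hunks above: every Regrid_Onelevel() call site now tests the (new) bool return and, only when the grid actually changed, invalidates all four per-level sync caches. A runnable sketch of the call-site shape, with stand-in types where the diff's own classes would appear:

    #include <cstdio>

    struct CacheSketch { bool valid = true; void invalidate() { valid = false; } };

    // Stand-in for GH->Regrid_Onelevel(): returns true when the grid changed.
    static bool regrid_onelevel_sketch(int lev) { return lev == 0; }

    int main()
    {
        const int levels = 3;
        CacheSketch pre[levels], cor[levels], rp_coarse[levels], rp_fine[levels];

        // Cached sync schedules describe the current grid layout, so any
        // successful regrid must invalidate them on every level.
        if (regrid_onelevel_sketch(0))
            for (int il = 0; il < levels; il++) {
                pre[il].invalidate();
                cor[il].invalidate();
                rp_coarse[il].invalidate();
                rp_fine[il].invalidate();
            }
        std::printf("pre[0] valid: %d\n", (int)pre[0].valid);  // prints 0
        return 0;
    }

Note that in the Evolve() hunk the invalidation after the full GH->Regrid(...) is unconditional, presumably because that routine does not report whether anything moved.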
@@ -3158,21 +3197,7 @@ void bssn_class::Step(int lev, int YN)
}
Pp = Pp->next;
}
// check error information
{
int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime << ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
// NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls

#ifdef WithShell
// evolve Shell Patches
@@ -3316,44 +3341,32 @@ void bssn_class::Step(int lev, int YN)
#endif
}

// check error information
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency
MPI_Request err_req;
{
int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req);
}
#endif

// CA-RK4: skip post-prediction sync (redundant; ghost cells computable locally)

#ifdef WithShell
// Complete non-blocking error reduction and check
MPI_Wait(&err_req, MPI_STATUS_IGNORE);
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime << ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#endif

Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);

#ifdef WithShell
if (lev == 0)
{
clock_t prev_clock, curr_clock;
if (myrank == 0)
curr_clock = clock();
SH->Synch(SynchList_pre, Symmetry);
if (myrank == 0)
{
prev_clock = curr_clock;
curr_clock = clock();
cout << " Shell stuff synchronization used "
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
<< " seconds! " << endl;
}
}
#endif

#if (MAPBH == 0)
// for black hole position
if (BH_num > 0 && lev == GH->levels - 1)
@@ -3528,24 +3541,7 @@ void bssn_class::Step(int lev, int YN)
Pp = Pp->next;
}

// check error information
{
int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}

if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
// NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls

#ifdef WithShell
// evolve Shell Patches
@@ -3685,26 +3681,18 @@ void bssn_class::Step(int lev, int YN)
sPp = sPp->next;
}
}
// check error information
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency
MPI_Request err_req_cor;
{
int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#"
<< iter_count << " variables at t = "
<< PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
}
#endif

Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
// CA-RK4: only sync after last corrector (iter_count == 3); stages 1 & 2 are redundant
if (iter_count == 3) {
Parallel::AsyncSyncState async_cor;
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);

#ifdef WithShell
if (lev == 0)
@@ -3723,6 +3711,26 @@ void bssn_class::Step(int lev, int YN)
}
}
#endif
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
} // end CA-RK4 guard

#ifdef WithShell
// Complete non-blocking error reduction and check
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#endif

#if (MAPBH == 0)
// for black hole position
@@ -4034,22 +4042,7 @@ void bssn_class::Step(int lev, int YN)
}
Pp = Pp->next;
}
// check error information
{
int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
// NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls

#ifdef WithShell
// evolve Shell Patches
@@ -4190,25 +4183,16 @@ void bssn_class::Step(int lev, int YN)
}
#endif
}
// check error information
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency
MPI_Request err_req;
{
int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = "
<< PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req);
}
#endif

Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
Parallel::AsyncSyncState async_pre;
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);

#ifdef WithShell
if (lev == 0)
@@ -4227,6 +4211,24 @@ void bssn_class::Step(int lev, int YN)
}
}
#endif
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);

#ifdef WithShell
// Complete non-blocking error reduction and check
MPI_Wait(&err_req, MPI_STATUS_IGNORE);
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#endif

// for black hole position
if (BH_num > 0 && lev == GH->levels - 1)
@@ -4386,23 +4388,7 @@ void bssn_class::Step(int lev, int YN)
Pp = Pp->next;
}

// check error information
{
int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
// NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls

#ifdef WithShell
// evolve Shell Patches
@@ -4542,25 +4528,16 @@ void bssn_class::Step(int lev, int YN)
sPp = sPp->next;
}
}
// check error information
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency
MPI_Request err_req_cor;
{
int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
}
#endif

Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
Parallel::AsyncSyncState async_cor;
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);

#ifdef WithShell
if (lev == 0)
@@ -4578,6 +4555,25 @@ void bssn_class::Step(int lev, int YN)
<< " seconds! " << endl;
}
}
#endif
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);

#ifdef WithShell
// Complete non-blocking error reduction and check
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#endif
// for black hole position
if (BH_num > 0 && lev == GH->levels - 1)
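A distilled sketch of the overlap idiom used throughout these hunks: post the NaN-flag reduction with MPI_Iallreduce (MPI-3), do the ghost-zone exchange while it is in flight, then complete the reduction before acting on the result. The exchange stub is illustrative. One caution worth flagging: in the diff the send buffer erh is block-scoped and goes out of scope before the matching MPI_Wait; MPI requires the send buffer of a non-blocking operation to stay valid until completion, so hoisting it (as below) is the safer shape.

    #include <mpi.h>

    /* Stand-in for the work that overlaps the reduction (ghost exchange). */
    static void exchange_ghost_zones(void) {}

    int main(int argc, char **argv)
    {
        MPI_Init(&argc, &argv);

        int ERROR = 0;               /* per-rank NaN flag (clean here) */
        int erh = ERROR;             /* send buffer: must stay alive
                                        until MPI_Wait completes */
        MPI_Request err_req;
        MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM,
                       MPI_COMM_WORLD, &err_req);

        exchange_ghost_zones();      /* useful work hides the latency */

        MPI_Wait(&err_req, MPI_STATUS_IGNORE);
        if (ERROR)
            MPI_Abort(MPI_COMM_WORLD, 1);

        MPI_Finalize();
        return 0;
    }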
@@ -4943,11 +4939,19 @@ void bssn_class::Step(int lev, int YN)

// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Predictor rhs calculation");

// check error information
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency
MPI_Request err_req;
{
int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev]);
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev], &err_req);
}

// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor sync");

Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);

// Complete non-blocking error reduction and check
MPI_Wait(&err_req, MPI_STATUS_IGNORE);
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
@@ -4959,10 +4963,6 @@ void bssn_class::Step(int lev, int YN)
}
}

// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor sync");

Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);

#if (MAPBH == 0)
// for black hole position
if (BH_num > 0 && lev == GH->levels - 1)
@@ -5140,11 +5140,21 @@ void bssn_class::Step(int lev, int YN)

// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector error check");

// check error information
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency
MPI_Request err_req_cor;
{
int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev]);
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev], &err_req_cor);
}

// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector sync");

Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);

// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Corrector sync");

// Complete non-blocking error reduction and check
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
@@ -5158,12 +5168,6 @@ void bssn_class::Step(int lev, int YN)
}
}

// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector sync");

Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);

// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Corrector sync");

#if (MAPBH == 0)
// for black hole position
if (BH_num > 0 && lev == GH->levels - 1)
@@ -5447,21 +5451,11 @@ void bssn_class::SHStep()
#if (PSTR == 1 || PSTR == 2)
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor's error check");
#endif
// check error information
// Non-blocking error reduction overlapped with Synch to hide Allreduce latency
MPI_Request err_req;
{
int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}

if (ERROR)
{
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req);
}

{
@@ -5479,6 +5473,19 @@ void bssn_class::SHStep()
}
}

// Complete non-blocking error reduction and check
MPI_Wait(&err_req, MPI_STATUS_IGNORE);
if (ERROR)
{
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}

// corrector
for (iter_count = 1; iter_count < 4; iter_count++)
{
@@ -5621,21 +5628,11 @@ void bssn_class::SHStep()
sPp = sPp->next;
}
}
// check error information
// Non-blocking error reduction overlapped with Synch to hide Allreduce latency
MPI_Request err_req_cor;
{
int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
}

{
@@ -5653,6 +5650,20 @@ void bssn_class::SHStep()
}
}

// Complete non-blocking error reduction and check
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
if (ERROR)
{
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}

sPp = SH->PatL;
while (sPp)
{
@@ -5781,7 +5792,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
// misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
#endif

Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);

#if (PSTR == 1 || PSTR == 2)
// a_stream.clear();
@@ -5791,21 +5802,11 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif

#if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
#elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
#endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry);
@@ -5842,7 +5843,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
// misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
#endif

Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry);
Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]);

#if (PSTR == 1 || PSTR == 2)
// a_stream.clear();
@@ -5852,21 +5853,11 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif

#if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SL, SL, Symmetry);
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
#elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SL, SL, Symmetry);
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
#endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry);
@@ -5880,7 +5871,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif
}

Parallel::Sync(GH->PatL[lev], SL, Symmetry);
Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]);

#if (PSTR == 1 || PSTR == 2)
// a_stream.clear();
@@ -5938,24 +5929,14 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
#endif

Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);

#if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
#elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
#endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry);
@@ -5970,31 +5951,21 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
#endif

Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry);
Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]);

#if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SL, SL, Symmetry);
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
#elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SL, SL, Symmetry);
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
#endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry);
#endif
}

Parallel::Sync(GH->PatL[lev], SL, Symmetry);
Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]);
}
}

@@ -6045,24 +6016,14 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, GH->rsul[lev], Symmetry);
#endif

Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);

#if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
#elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
#endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SynchList_cor,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6079,31 +6040,21 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, GH->rsul[lev], Symmetry);
#endif

Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry);
Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);

#if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
#elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
#endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],StateList,SynchList_cor,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry);
#endif
}

Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
}
}

@@ -6133,21 +6084,11 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
}

#if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
#elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
#endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SynchList_cor,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6156,21 +6097,11 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
else // no time refinement levels and for all same time levels
{
#if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
#elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
#endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],StateList,SynchList_cor,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6186,10 +6117,10 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
#else
Parallel::Restrict_after(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry);
#endif
Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry);
Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
}

Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
}
}
#undef MIXOUTB
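Besides swapping Sync for Sync_cached, the hunks above also flatten the nested per-patch loops: OutBdLow2Hi and OutBdLow2Himix now take the two patch lists directly instead of being called once per (coarse, fine) pair. The presumed rationale, which the diff itself does not state, is message batching; a sketch of the shape change with stand-in types:

    #include <cstdio>

    struct Patch { int id; Patch *next; };

    // Old shape: one call, hence one exchange, per (coarse, fine) pair.
    static void per_pair(Patch *coarse, Patch *fine)
    {
        for (Patch *c = coarse; c; c = c->next)
            for (Patch *f = fine; f; f = f->next)
                std::printf("pair %d-%d\n", c->id, f->id);
    }

    // New shape: both lists go in at once, so the routine can gather every
    // pairwise contribution and complete them in one communication phase
    // (assumed rationale; the diff only shows the signature change).
    static void batched(Patch *coarse, Patch *fine)
    {
        (void)coarse; (void)fine;
    }

    int main()
    {
        Patch f2 = {2, nullptr}, f1 = {1, &f2};
        Patch c1 = {10, nullptr};
        per_pair(&c1, &f1);
        batched(&c1, &f1);
        return 0;
    }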
@@ -126,6 +126,11 @@ public:
MyList<var> *OldStateList, *DumpList;
MyList<var> *ConstraintList;

Parallel::SyncCache *sync_cache_pre; // per-level cache for predictor sync
Parallel::SyncCache *sync_cache_cor; // per-level cache for corrector sync
Parallel::SyncCache *sync_cache_rp_coarse; // RestrictProlong sync on PatL[lev-1]
Parallel::SyncCache *sync_cache_rp_fine; // RestrictProlong sync on PatL[lev]

monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
monitor *ConVMonitor;
surface_integral *Waveshell;
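The interface of Parallel::SyncCache is never shown in these diffs; from its call sites (Sync_cached, Sync_start/Sync_finish, invalidate, destroy) a plausible shape can be inferred. The sketch below is an assumption for orientation only; the real class most likely caches a communication schedule or persistent MPI requests.

    #include <vector>

    // Assumed SyncCache shape, inferred from call sites in the diffs.
    struct SyncCacheSketch {
        bool valid = false;
        std::vector<int> schedule;     // e.g. neighbour ranks, buffer sizes

        void build()      { valid = true; }   // compute schedule once
        void invalidate() { valid = false; }  // after any regrid
        void destroy()    { schedule.clear(); valid = false; }
    };

    // Usage shape: rebuild lazily, then reuse on every subsequent sync.
    static void sync_cached_sketch(SyncCacheSketch &c)
    {
        if (!c.valid) c.build();       // first call, or first after regrid
        // ... post sends/receives according to c.schedule ...
    }

    int main()
    {
        SyncCacheSketch c;
        sync_cached_sketch(c);         // builds
        sync_cached_sketch(c);         // reuses
        return 0;
    }

This explains why every regrid path must call invalidate(): a stale schedule would exchange ghost zones for a grid layout that no longer exists.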
@@ -945,103 +945,60 @@
SSA(2)=SYM
SSA(3)=ANTI

!!!!!!!!!advection term part
!!!!!!!!!advection term + Kreiss-Oliger dissipation (merged for cache efficiency)
! lopsided_kodis shares the symmetry_bd buffer between advection and
! dissipation, eliminating redundant full-grid copies. For metric variables
! gxx/gyy/gzz (=dxx/dyy/dzz+1): kodis stencil coefficients sum to zero,
! so the constant offset has no effect on dissipation.

call lopsided(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS)
call lopsided(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA)
call lopsided(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA)
call lopsided(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided_kodis(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
call lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps)
call lopsided_kodis(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps)
call lopsided_kodis(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps)

call lopsided(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS)
call lopsided(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA)
call lopsided(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA)
call lopsided(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
call lopsided_kodis(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA,eps)
call lopsided_kodis(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA,eps)
call lopsided_kodis(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS,eps)

call lopsided(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided_kodis(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS,eps)

call lopsided(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS)
call lopsided(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS)
call lopsided(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA)
!!
call lopsided_kodis(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS,eps)
call lopsided_kodis(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS,eps)
call lopsided_kodis(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA,eps)

#if 1
!! bam does not apply dissipation on gauge variables
call lopsided_kodis(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS,eps)
#if (GAUGE == 0 || GAUGE == 1 || GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7)
call lopsided_kodis(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS,eps)
call lopsided_kodis(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS,eps)
call lopsided_kodis(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA,eps)
#endif
#if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
call lopsided_kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS,eps)
call lopsided_kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS,eps)
call lopsided_kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA,eps)
#endif
#else
! No dissipation on gauge variables (advection only)
call lopsided(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS)

#if (GAUGE == 0 || GAUGE == 1 || GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7)
call lopsided(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS)
call lopsided(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS)
call lopsided(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA)
#endif

#if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
call lopsided(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS)
call lopsided(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS)
call lopsided(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA)
#endif

if(eps>0)then
! usual Kreiss-Oliger dissipation
call kodis(ex,X,Y,Z,chi,chi_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,trK,trK_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,dxx,gxx_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,gxy,gxy_rhs,AAS,Symmetry,eps)
call kodis(ex,X,Y,Z,gxz,gxz_rhs,ASA,Symmetry,eps)
call kodis(ex,X,Y,Z,dyy,gyy_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,gyz,gyz_rhs,SAA,Symmetry,eps)
call kodis(ex,X,Y,Z,dzz,gzz_rhs,SSS,Symmetry,eps)
#if 0
#define i 42
#define j 40
#define k 40
if(Lev == 1)then
write(*,*) X(i),Y(j),Z(k)
write(*,*) "before",Axx_rhs(i,j,k)
endif
#undef i
#undef j
#undef k
!!stop
#endif
call kodis(ex,X,Y,Z,Axx,Axx_rhs,SSS,Symmetry,eps)
#if 0
#define i 42
#define j 40
#define k 40
if(Lev == 1)then
write(*,*) X(i),Y(j),Z(k)
write(*,*) "after",Axx_rhs(i,j,k)
endif
#undef i
#undef j
#undef k
!!stop
#endif
call kodis(ex,X,Y,Z,Axy,Axy_rhs,AAS,Symmetry,eps)
call kodis(ex,X,Y,Z,Axz,Axz_rhs,ASA,Symmetry,eps)
call kodis(ex,X,Y,Z,Ayy,Ayy_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,Ayz,Ayz_rhs,SAA,Symmetry,eps)
call kodis(ex,X,Y,Z,Azz,Azz_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,Gamx,Gamx_rhs,ASS,Symmetry,eps)
call kodis(ex,X,Y,Z,Gamy,Gamy_rhs,SAS,Symmetry,eps)
call kodis(ex,X,Y,Z,Gamz,Gamz_rhs,SSA,Symmetry,eps)

#if 1
!! bam does not apply dissipation on gauge variables
call kodis(ex,X,Y,Z,Lap,Lap_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,betax,betax_rhs,ASS,Symmetry,eps)
call kodis(ex,X,Y,Z,betay,betay_rhs,SAS,Symmetry,eps)
call kodis(ex,X,Y,Z,betaz,betaz_rhs,SSA,Symmetry,eps)
#if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
call kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,ASS,Symmetry,eps)
call kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,SAS,Symmetry,eps)
call kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,SSA,Symmetry,eps)
#endif
#endif

endif

if(co == 0)then
! ham_Res = trR + 2/3 * K^2 - A_ij * A^ij - 16 * PI * rho
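The merged-dissipation comment above argues that kodis can be applied to gxx/gyy/gzz directly (rather than dxx/dyy/dzz as in the old code) because a dissipation stencil's coefficients sum to zero, so a constant offset like gxx = dxx + 1 contributes nothing. That zero-sum property holds for alternating-sign binomial difference stencils of any order, since (1 - 1)^(2r) = 0. A tiny self-contained check, using a generic 5-point stencil with coefficients (1, -4, 6, -4, 1); the project's kodis may use a different order and normalization, but the argument carries over unchanged:

    #include <cstdio>

    // 5-point dissipation-type stencil; coefficients sum to zero.
    static double ko5(const double *f, int i)
    {
        return f[i-2] - 4.0*f[i-1] + 6.0*f[i] - 4.0*f[i+1] + f[i+2];
    }

    int main()
    {
        double f[7], g[7];
        for (int i = 0; i < 7; i++) {
            f[i] = (double)i * i * i * i;  // any smooth sample works
            g[i] = f[i] + 1.0;             // constant offset: gxx = dxx + 1
        }
        std::printf("%g %g\n", ko5(f, 3), ko5(g, 3));  // prints: 24 24
        return 0;
    }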
AMSS_NCKU_source/bssn_rhs_c.C (new file, 1155 lines)
File diff suppressed because it is too large
@@ -1301,13 +1301,13 @@ bool cgh::Interp_One_Point(MyList<var> *VarList,
}

void cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
bool cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList, bool BB,
monitor *ErrorMonitor)
{
if (lev < movls)
return;
return false;

#if (0)
// #if (PSTR == 1 || PSTR == 2)
@@ -1396,7 +1396,7 @@ void cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, do
for (bhi = 0; bhi < BH_num; bhi++)
delete[] tmpPorg[bhi];
delete[] tmpPorg;
return;
return false;
}
// x direction
rr = (Porg0[bhi][0] - handle[lev][grd][0]) / dX;
@@ -1500,6 +1500,7 @@ void cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, do
for (int bhi = 0; bhi < BH_num; bhi++)
delete[] tmpPorg[bhi];
delete[] tmpPorg;
return tot_flag;
}

@@ -74,7 +74,7 @@ public:
MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList,
int Symmetry, bool BB);
void Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
bool Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList, bool BB,
monitor *ErrorMonitor);
@@ -69,6 +69,8 @@
fy = ZEO
fz = ZEO

!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
!DIR$ UNROLL PARTIAL(4)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -371,6 +373,8 @@
fxz = ZEO
fyz = ZEO

!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
!DIR$ UNROLL PARTIAL(4)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
AMSS_NCKU_source/fdderivs_c.C (new file, 268 lines)
@@ -0,0 +1,268 @@
#include "tool.h"
void fdderivs(const int ex[3],
const double *f,
double *fxx, double *fxy, double *fxz,
double *fyy, double *fyz, double *fzz,
const double *X, const double *Y, const double *Z,
double SYM1, double SYM2, double SYM3,
int Symmetry, int onoff)
{
(void)onoff;

const int NO_SYMM = 0, EQ_SYMM = 1;
const double ZEO = 0.0, ONE = 1.0, TWO = 2.0;
const double F1o4 = 2.5e-1; // 1/4
const double F8 = 8.0;
const double F16 = 16.0;
const double F30 = 30.0;
const double F1o12 = ONE / 12.0;
const double F1o144 = ONE / 144.0;

const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];

const double dX = X[1] - X[0];
const double dY = Y[1] - Y[0];
const double dZ = Z[1] - Z[0];

const int imaxF = ex1;
const int jmaxF = ex2;
const int kmaxF = ex3;

int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;

const double SoA[3] = { SYM1, SYM2, SYM3 };

/* fh: (ex1+2)*(ex2+2)*(ex3+2) because ord=2 */
const size_t nx = (size_t)ex1 + 2;
const size_t ny = (size_t)ex2 + 2;
const size_t nz = (size_t)ex3 + 2;
const size_t fh_size = nx * ny * nz;

static double *fh = NULL;
static size_t cap = 0;

if (fh_size > cap) {
free(fh);
fh = (double*)aligned_alloc(64, fh_size * sizeof(double));
cap = fh_size;
}
// double *fh = (double*)malloc(fh_size * sizeof(double));
if (!fh) return;

symmetry_bd(2, ex, f, fh, SoA);

/* Coefficients: follow the original Fortran expressions */
const double Sdxdx = ONE / (dX * dX);
const double Sdydy = ONE / (dY * dY);
const double Sdzdz = ONE / (dZ * dZ);

const double Fdxdx = F1o12 / (dX * dX);
const double Fdydy = F1o12 / (dY * dY);
const double Fdzdz = F1o12 / (dZ * dZ);

const double Sdxdy = F1o4 / (dX * dY);
const double Sdxdz = F1o4 / (dX * dZ);
const double Sdydz = F1o4 / (dY * dZ);

const double Fdxdy = F1o144 / (dX * dY);
const double Fdxdz = F1o144 / (dX * dZ);
const double Fdydz = F1o144 / (dY * dZ);

/* Zero the outputs: fxx, fyy, fzz, fxy, fxz, fyz = 0 */
const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3;
for (size_t p = 0; p < all; ++p) {
fxx[p] = ZEO; fyy[p] = ZEO; fzz[p] = ZEO;
fxy[p] = ZEO; fxz[p] = ZEO; fyz[p] = ZEO;
}

/*
 * Fortran:
 * do k=1,ex3-1
 * do j=1,ex2-1
 * do i=1,ex1-1
 */

for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
const int kF = k0 + 1;
for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
const int jF = j0 + 1;
for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
const int iF = i0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);

/* High-order branch: i±2, j±2, k±2 all within range */
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
{
fxx[p] = Fdxdx * (
-fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] +
F16 * fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] -
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
fh[idx_fh_F_ord2(iF + 2, jF, kF, ex)] +
F16 * fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]
);

fyy[p] = Fdydy * (
-fh[idx_fh_F_ord2(iF, jF - 2, kF, ex)] +
F16 * fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] -
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
fh[idx_fh_F_ord2(iF, jF + 2, kF, ex)] +
F16 * fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]
);

fzz[p] = Fdzdz * (
-fh[idx_fh_F_ord2(iF, jF, kF - 2, ex)] +
F16 * fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] -
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)] +
F16 * fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]
);

/* fxy, high-order: mirrors the Fortran bracketing exactly */
{
const double t_jm2 =
( fh[idx_fh_F_ord2(iF - 2, jF - 2, kF, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF - 2, kF, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF - 2, kF, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF - 2, kF, ex)] );

const double t_jm1 =
( fh[idx_fh_F_ord2(iF - 2, jF - 1, kF, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF - 1, kF, ex)] );

const double t_jp1 =
( fh[idx_fh_F_ord2(iF - 2, jF + 1, kF, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF + 1, kF, ex)] );

const double t_jp2 =
( fh[idx_fh_F_ord2(iF - 2, jF + 2, kF, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF + 2, kF, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF + 2, kF, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF + 2, kF, ex)] );

fxy[p] = Fdxdy * ( t_jm2 - F8 * t_jm1 + F8 * t_jp1 - t_jp2 );
}

/* fxz, high-order */
{
const double t_km2 =
( fh[idx_fh_F_ord2(iF - 2, jF, kF - 2, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 2, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 2, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF, kF - 2, ex)] );

const double t_km1 =
( fh[idx_fh_F_ord2(iF - 2, jF, kF - 1, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF, kF - 1, ex)] );

const double t_kp1 =
( fh[idx_fh_F_ord2(iF - 2, jF, kF + 1, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF, kF + 1, ex)] );

const double t_kp2 =
( fh[idx_fh_F_ord2(iF - 2, jF, kF + 2, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 2, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 2, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF, kF + 2, ex)] );

fxz[p] = Fdxdz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
}

/* fyz, high-order */
{
const double t_km2 =
( fh[idx_fh_F_ord2(iF, jF - 2, kF - 2, ex)]
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 2, ex)]
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 2, ex)]
- fh[idx_fh_F_ord2(iF, jF + 2, kF - 2, ex)] );

const double t_km1 =
( fh[idx_fh_F_ord2(iF, jF - 2, kF - 1, ex)]
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)]
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)]
- fh[idx_fh_F_ord2(iF, jF + 2, kF - 1, ex)] );

const double t_kp1 =
( fh[idx_fh_F_ord2(iF, jF - 2, kF + 1, ex)]
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)]
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
- fh[idx_fh_F_ord2(iF, jF + 2, kF + 1, ex)] );

const double t_kp2 =
( fh[idx_fh_F_ord2(iF, jF - 2, kF + 2, ex)]
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 2, ex)]
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 2, ex)]
- fh[idx_fh_F_ord2(iF, jF + 2, kF + 2, ex)] );

fyz[p] = Fdydz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
}
}
/* Second-order branch: i±1, j±1, k±1 within range */
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
{
fxx[p] = Sdxdx * (
fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] -
TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]
);

fyy[p] = Sdydy * (
fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] -
TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]
);

fzz[p] = Sdzdz * (
fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] -
TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]
);

fxy[p] = Sdxdy * (
fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)] -
fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)] -
fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)] +
fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
);

fxz[p] = Sdxdz * (
fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)] -
fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)] -
fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)] +
fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
);

fyz[p] = Sdydz * (
fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)] -
fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)] -
fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] +
fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
);
}else{
fxx[p] = 0.0;
fyy[p] = 0.0;
fzz[p] = 0.0;
fxy[p] = 0.0;
fxz[p] = 0.0;
fyz[p] = 0.0;
}
}
}
}

// free(fh);
}
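Both fdderivs above and fderivs below reuse a static grow-only scratch buffer in place of a per-call malloc/free. A distilled sketch of that pattern, with two cautions rather than fixes: the static state makes the function non-reentrant (no threading across calls), and C11's aligned_alloc formally wants the requested size to be a multiple of the alignment, which the files above do not guarantee (glibc tolerates it); the sketch rounds up.

    #include <cstdlib>

    // Grow-only scratch buffer: reallocate only when a larger grid appears,
    // the same shape as the static fh/cap pair in fdderivs/fderivs.
    static double *scratch(std::size_t n)
    {
        static double *buf = nullptr;
        static std::size_t cap = 0;        // capacity in doubles
        if (n > cap) {
            std::free(buf);
            std::size_t bytes = ((n * sizeof(double) + 63) / 64) * 64;
            buf = static_cast<double *>(std::aligned_alloc(64, bytes));
            cap = buf ? n : 0;
        }
        return buf;                        // may be null on failure
    }

    int main()
    {
        double *a = scratch(100);          // allocates
        double *b = scratch(50);           // reuses, no allocation
        return (a && a == b) ? 0 : 1;
    }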
AMSS_NCKU_source/fderivs_c.C (new file, 150 lines)
@@ -0,0 +1,150 @@
#include "tool.h"

/*
 * C version of fderivs
 *
 * Fortran:
 * subroutine fderivs(ex,f,fx,fy,fz,X,Y,Z,SYM1,SYM2,SYM3,symmetry,onoff)
 *
 * Conventions:
 * f, fx, fy, fz: ex1*ex2*ex3, laid out according to idx_ex
 * X: ex1, Y: ex2, Z: ex3
 */
void fderivs(const int ex[3],
const double *f,
double *fx, double *fy, double *fz,
const double *X, const double *Y, const double *Z,
double SYM1, double SYM2, double SYM3,
int Symmetry, int onoff)
{
(void)onoff; // not used in the Fortran original either

const double ZEO = 0.0, ONE = 1.0;
const double TWO = 2.0, EIT = 8.0;
const double F12 = 12.0;

const int NO_SYMM = 0, EQ_SYMM = 1; // OCTANT=2 is not used directly in this routine

const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];

// dX = X(2)-X(1) -> C: X[1]-X[0]
const double dX = X[1] - X[0];
const double dY = Y[1] - Y[0];
const double dZ = Z[1] - Z[0];

// Fortran 1-based bounds
const int imaxF = ex1;
const int jmaxF = ex2;
const int kmaxF = ex3;

int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;

// SoA(1:3) = SYM1,SYM2,SYM3
const double SoA[3] = { SYM1, SYM2, SYM3 };

// fh: (ex1+2)*(ex2+2)*(ex3+2) because ord=2
const size_t nx = (size_t)ex1 + 2;
const size_t ny = (size_t)ex2 + 2;
const size_t nz = (size_t)ex3 + 2;
const size_t fh_size = nx * ny * nz;
static double *fh = NULL;
static size_t cap = 0;

if (fh_size > cap) {
free(fh);
fh = (double*)aligned_alloc(64, fh_size * sizeof(double));
cap = fh_size;
}
// double *fh = (double*)malloc(fh_size * sizeof(double));
if (!fh) return;

// call symmetry_bd(2,ex,f,fh,SoA)
symmetry_bd(2, ex, f, fh, SoA);

const double d12dx = ONE / F12 / dX;
const double d12dy = ONE / F12 / dY;
const double d12dz = ONE / F12 / dZ;

const double d2dx = ONE / TWO / dX;
const double d2dy = ONE / TWO / dY;
const double d2dz = ONE / TWO / dZ;

// fx = fy = fz = 0
const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3;
for (size_t p = 0; p < all; ++p) {
fx[p] = ZEO;
fy[p] = ZEO;
fz[p] = ZEO;
}

/*
 * Fortran loops:
 * do k=1,ex3-1
 * do j=1,ex2-1
 * do i=1,ex1-1
 *
 * C: k0=0..ex3-2, j0=0..ex2-2, i0=0..ex1-2
 */
for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
const int kF = k0 + 1;
for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
const int jF = j0 + 1;
for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
const int iF = i0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);

// if(i+2 <= imax .and. i-2 >= imin ... ) (all indices are Fortran 1-based)
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
{
fx[p] = d12dx * (
fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] -
EIT * fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] +
EIT * fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)] -
fh[idx_fh_F_ord2(iF + 2, jF, kF, ex)]
);

fy[p] = d12dy * (
fh[idx_fh_F_ord2(iF, jF - 2, kF, ex)] -
EIT * fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] +
EIT * fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)] -
fh[idx_fh_F_ord2(iF, jF + 2, kF, ex)]
);

fz[p] = d12dz * (
fh[idx_fh_F_ord2(iF, jF, kF - 2, ex)] -
EIT * fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] +
EIT * fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)] -
fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)]
);
}
// elseif(i+1 <= imax .and. i-1 >= imin ...)
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
{
fx[p] = d2dx * (
-fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] +
fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]
);
fy[p] = d2dy * (
|
||||
-fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] +
|
||||
fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]
|
||||
);
|
||||
|
||||
fz[p] = d2dz * (
|
||||
-fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] +
|
||||
fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// free(fh);
|
||||
}
|
||||
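For reference, the two branches of fderivs implement the standard centered first-derivative stencils, fourth order in the interior with a second-order fallback one point from the boundary, matching the d12dx = 1/(12 dX) and d2dx = 1/(2 dX) prefactors above:

\[
f'_i \approx \frac{f_{i-2}-8f_{i-1}+8f_{i+1}-f_{i+2}}{12\,\Delta x}+O(\Delta x^4),
\qquad
f'_i \approx \frac{f_{i+1}-f_{i-1}}{2\,\Delta x}+O(\Delta x^2).
\]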
@@ -883,13 +883,17 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)

integer::i

!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
funcc(1:extc(1),1:extc(2),1:extc(3)) = func
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
do i=0,ord-1
  funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
enddo
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
do i=0,ord-1
  funcc(:,-i,1:extc(3)) = funcc(:,i+1,1:extc(3))*SoA(2)
enddo
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
do i=0,ord-1
  funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
enddo
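The three ghost-fill loops above encode a parity boundary condition across each symmetry plane. Schematically (a sketch of the intended mirror relation; the exact point pairing depends on the cell- versus vertex-centered choice made in macrodef.fh):

\[
f_{-i} = s\, f_{i+1}, \qquad s=\mathrm{SoA}\in\{+1,-1\}, \qquad i=0,\dots,\mathrm{ord}-1 .
\]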
@@ -1112,6 +1116,7 @@ end subroutine d2dump

! Lagrangian polynomial interpolation
!------------------------------------------------------------------------------

!DIR$ ATTRIBUTES FORCEINLINE :: polint
subroutine polint(xa, ya, x, y, dy, ordn)
implicit none

@@ -65,6 +65,8 @@ real*8,intent(in) :: eps

! dx^4

! note the sign (-1)**(r-1), now r=2
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
!DIR$ UNROLL PARTIAL(4)
do k=1,ex(3)
  do j=1,ex(2)
    do i=1,ex(1)

AMSS_NCKU_source/kodiss_c.C (new file, 109 lines)
@@ -0,0 +1,109 @@
#include "tool.h"

/*
 * C version of kodis
 *
 * Fortran signature:
 *   subroutine kodis(ex,X,Y,Z,f,f_rhs,SoA,Symmetry,eps)
 *
 * Conventions:
 *   X: ex1, Y: ex2, Z: ex3
 *   f, f_rhs: ex1*ex2*ex3, laid out per idx_ex
 *   SoA[3]
 *   eps: double
 */
void kodis(const int ex[3],
           const double *X, const double *Y, const double *Z,
           const double *f, double *f_rhs,
           const double SoA[3],
           int Symmetry, double eps)
{
    const double ONE = 1.0, SIX = 6.0, FIT = 15.0, TWT = 20.0;
    const double cof = 64.0; // 2^6
    const int NO_SYMM = 0, OCTANT = 2;

    const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];

    // Fortran: dX = X(2)-X(1) -> C: X[1]-X[0]
    const double dX = X[1] - X[0];
    const double dY = Y[1] - Y[0];
    const double dZ = Z[1] - Z[0];
    (void)ONE; // ONE is only a parameter in the original Fortran; it is not necessarily needed here

    // Fortran: imax=ex(1) etc. are the 1-based upper bounds
    const int imaxF = ex1;
    const int jmaxF = ex2;
    const int kmaxF = ex3;

    // Fortran: imin=jmin=kmin=1, becoming -2 in some symmetry cases
    int iminF = 1, jminF = 1, kminF = 1;

    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
    if (Symmetry == OCTANT && fabs(X[0]) < dX) iminF = -2;
    if (Symmetry == OCTANT && fabs(Y[0]) < dY) jminF = -2;

    // allocate fh: size (ex1+3)*(ex2+3)*(ex3+3), matching ord=3
    const size_t nx = (size_t)ex1 + 3;
    const size_t ny = (size_t)ex2 + 3;
    const size_t nz = (size_t)ex3 + 3;
    const size_t fh_size = nx * ny * nz;

    double *fh = (double*)malloc(fh_size * sizeof(double));
    if (!fh) return;

    // Fortran: call symmetry_bd(3,ex,f,fh,SoA)
    symmetry_bd(3, ex, f, fh, SoA);

    /*
     * Fortran loops:
     *   do k=1,ex3
     *     do j=1,ex2
     *       do i=1,ex1
     *
     * C: k0=0..ex3-1, j0=0..ex2-1, i0=0..ex1-1
     * with the Fortran index defined as iF=i0+1, ...
     */
    for (int k0 = 0; k0 < ex3; ++k0) {
        const int kF = k0 + 1;
        for (int j0 = 0; j0 < ex2; ++j0) {
            const int jF = j0 + 1;
            for (int i0 = 0; i0 < ex1; ++i0) {
                const int iF = i0 + 1;

                // Fortran if condition:
                //   i-3 >= imin .and. i+3 <= imax etc. (all Fortran indices)
                if ((iF - 3) >= iminF && (iF + 3) <= imaxF &&
                    (jF - 3) >= jminF && (jF + 3) <= jmaxF &&
                    (kF - 3) >= kminF && (kF + 3) <= kmaxF)
                {
                    const size_t p = idx_ex(i0, j0, k0, ex);

                    // one 7-point combination of the same shape per direction
                    // (in fact the symmetric 6th-order dissipation/filter kernel)
                    const double Dx_term =
                        (       (fh[idx_fh_F(iF - 3, jF, kF, ex)] + fh[idx_fh_F(iF + 3, jF, kF, ex)]) -
                          SIX * (fh[idx_fh_F(iF - 2, jF, kF, ex)] + fh[idx_fh_F(iF + 2, jF, kF, ex)]) +
                          FIT * (fh[idx_fh_F(iF - 1, jF, kF, ex)] + fh[idx_fh_F(iF + 1, jF, kF, ex)]) -
                          TWT *  fh[idx_fh_F(iF    , jF, kF, ex)] ) / dX;

                    const double Dy_term =
                        (       (fh[idx_fh_F(iF, jF - 3, kF, ex)] + fh[idx_fh_F(iF, jF + 3, kF, ex)]) -
                          SIX * (fh[idx_fh_F(iF, jF - 2, kF, ex)] + fh[idx_fh_F(iF, jF + 2, kF, ex)]) +
                          FIT * (fh[idx_fh_F(iF, jF - 1, kF, ex)] + fh[idx_fh_F(iF, jF + 1, kF, ex)]) -
                          TWT *  fh[idx_fh_F(iF, jF    , kF, ex)] ) / dY;

                    const double Dz_term =
                        (       (fh[idx_fh_F(iF, jF, kF - 3, ex)] + fh[idx_fh_F(iF, jF, kF + 3, ex)]) -
                          SIX * (fh[idx_fh_F(iF, jF, kF - 2, ex)] + fh[idx_fh_F(iF, jF, kF + 2, ex)]) +
                          FIT * (fh[idx_fh_F(iF, jF, kF - 1, ex)] + fh[idx_fh_F(iF, jF, kF + 1, ex)]) -
                          TWT *  fh[idx_fh_F(iF, jF, kF    , ex)] ) / dZ;

                    // Fortran:
                    //   f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof*(Dx_term + Dy_term + Dz_term)
                    f_rhs[p] += (eps / cof) * (Dx_term + Dy_term + Dz_term);
                }
            }
        }
    }

    free(fh);
}
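The (1, -6, 15, -20, 15, -6, 1) weights are the alternating binomial coefficients of a sixth difference, i.e. the standard Kreiss-Oliger dissipation operator for this ghost width (r = 3, hence cof = 2^6). Per direction the update is

\[
f^{\mathrm{rhs}}_i \mathrel{+}= \frac{\epsilon}{2^{6}\,\Delta x}
\sum_{m=-3}^{3} (-1)^{m+1}\binom{6}{m+3}\, f_{i+m},
\]

which damps the grid-frequency mode \(f_i=(-1)^i\) at rate \(\epsilon/\Delta x\) while leaving smooth modes essentially untouched.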
AMSS_NCKU_source/lopsided_c.C (new file, 255 lines)
@@ -0,0 +1,255 @@
#include "tool.h"
/*
 * You must provide a C version of symmetry_bd (or a C-callable Fortran interface).
 * Fortran: call symmetry_bd(3,ex,f,fh,SoA)
 *
 * Conventions:
 *   nghost = 3
 *   ex[3] = {ex1,ex2,ex3}
 *   f  = original grid (ex1*ex2*ex3)
 *   fh = extended grid ((ex1+3)*(ex2+3)*(ex3+3)), corresponding to Fortran's (-2:ex1, ...)
 *   SoA[3] = input parameters
 */
void lopsided(const int ex[3],
              const double *X, const double *Y, const double *Z,
              const double *f, double *f_rhs,
              const double *Sfx, const double *Sfy, const double *Sfz,
              int Symmetry, const double SoA[3])
{
    const double ZEO = 0.0, ONE = 1.0, F3 = 3.0;
    const double TWO = 2.0, F6 = 6.0, F18 = 18.0;
    const double F12 = 12.0, F10 = 10.0, EIT = 8.0;

    const int NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2;
    (void)OCTANT; // defined but unused, just like in the Fortran version

    const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];

    // matches Fortran: dX = X(2)-X(1) (Fortran 1-based)
    // C: X[1]-X[0]
    const double dX = X[1] - X[0];
    const double dY = Y[1] - Y[0];
    const double dZ = Z[1] - Z[0];

    const double d12dx = ONE / F12 / dX;
    const double d12dy = ONE / F12 / dY;
    const double d12dz = ONE / F12 / dZ;

    // the Fortran version computes d2dx/d2dy/d2dz but never uses them in this
    // subroutine (computed here too, for consistency)
    const double d2dx = ONE / TWO / dX;
    const double d2dy = ONE / TWO / dY;
    const double d2dz = ONE / TWO / dZ;
    (void)d2dx; (void)d2dy; (void)d2dz;

    // Fortran:
    //   imax = ex(1); jmax = ex(2); kmax = ex(3)
    const int imaxF = ex1;
    const int jmaxF = ex2;
    const int kmaxF = ex3;

    // Fortran:
    //   imin=jmin=kmin=1; set to -2 when the symmetry conditions hold
    int iminF = 1, jminF = 1, kminF = 1;
    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2;
    if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2;

    // allocate fh: size (ex1+3)*(ex2+3)*(ex3+3)
    const size_t nx = (size_t)ex1 + 3;
    const size_t ny = (size_t)ex2 + 3;
    const size_t nz = (size_t)ex3 + 3;
    const size_t fh_size = nx * ny * nz;

    double *fh = (double*)malloc(fh_size * sizeof(double));
    if (!fh) return; // out of memory: just return (could also abort or report an error)

    // Fortran: call symmetry_bd(3,ex,f,fh,SoA)
    symmetry_bd(3, ex, f, fh, SoA);

    /*
     * Fortran main loop:
     *   do k=1,ex(3)-1
     *     do j=1,ex(2)-1
     *       do i=1,ex(1)-1
     *
     * converted to C 0-based:
     *   k0 = 0..ex3-2, j0 = 0..ex2-2, i0 = 0..ex1-2
     *
     * and when accessing fh, i/j/k keep their Fortran index values:
     *   iF=i0+1, jF=j0+1, kF=k0+1
     */
    for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
        const int kF = k0 + 1;
        for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
            const int jF = j0 + 1;
            for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
                const int iF = i0 + 1;

                const size_t p = idx_ex(i0, j0, k0, ex);

                // ---------------- x direction ----------------
                const double sfx = Sfx[p];
                if (sfx > ZEO) {
                    // Fortran: if(i+3 <= imax)
                    // iF+3 <= ex1 <=> i0+4 <= ex1 <=> i0 <= ex1-4
                    if (i0 <= ex1 - 4) {
                        f_rhs[p] += sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF + 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF + 3, jF, kF, ex)]);
                    }
                    // elseif(i+2 <= imax) <=> i0 <= ex1-3
                    else if (i0 <= ex1 - 3) {
                        f_rhs[p] += sfx * d12dx *
                            (      fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             -EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             +EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -      fh[idx_fh_F(iF + 2, jF, kF, ex)]);
                    }
                    // elseif(i+1 <= imax) <=> i0 <= ex1-2 (always true inside the loop)
                    else if (i0 <= ex1 - 2) {
                        f_rhs[p] -= sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF - 3, jF, kF, ex)]);
                    }
                } else if (sfx < ZEO) {
                    // Fortran: if(i-3 >= imin)
                    // (iF-3) >= iminF <=> (i0-2) >= iminF
                    if ((i0 - 2) >= iminF) {
                        f_rhs[p] -= sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF - 3, jF, kF, ex)]);
                    }
                    // elseif(i-2 >= imin) <=> (i0-1) >= iminF
                    else if ((i0 - 1) >= iminF) {
                        f_rhs[p] += sfx * d12dx *
                            (      fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             -EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             +EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -      fh[idx_fh_F(iF + 2, jF, kF, ex)]);
                    }
                    // elseif(i-1 >= imin) <=> i0 >= iminF
                    else if (i0 >= iminF) {
                        f_rhs[p] += sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF + 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF + 3, jF, kF, ex)]);
                    }
                }

                // ---------------- y direction ----------------
                const double sfy = Sfy[p];
                if (sfy > ZEO) {
                    // jF+3 <= ex2 <=> j0+4 <= ex2 <=> j0 <= ex2-4
                    if (j0 <= ex2 - 4) {
                        f_rhs[p] += sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF + 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF + 3, kF, ex)]);
                    } else if (j0 <= ex2 - 3) {
                        f_rhs[p] += sfy * d12dy *
                            (      fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             -EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             +EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -      fh[idx_fh_F(iF, jF + 2, kF, ex)]);
                    } else if (j0 <= ex2 - 2) {
                        f_rhs[p] -= sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF - 3, kF, ex)]);
                    }
                } else if (sfy < ZEO) {
                    if ((j0 - 2) >= jminF) {
                        f_rhs[p] -= sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF - 3, kF, ex)]);
                    } else if ((j0 - 1) >= jminF) {
                        f_rhs[p] += sfy * d12dy *
                            (      fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             -EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             +EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -      fh[idx_fh_F(iF, jF + 2, kF, ex)]);
                    } else if (j0 >= jminF) {
                        f_rhs[p] += sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF + 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF + 3, kF, ex)]);
                    }
                }

                // ---------------- z direction ----------------
                const double sfz = Sfz[p];
                if (sfz > ZEO) {
                    if (k0 <= ex3 - 4) {
                        f_rhs[p] += sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF + 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF + 3, ex)]);
                    } else if (k0 <= ex3 - 3) {
                        f_rhs[p] += sfz * d12dz *
                            (      fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             -EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             +EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -      fh[idx_fh_F(iF, jF, kF + 2, ex)]);
                    } else if (k0 <= ex3 - 2) {
                        f_rhs[p] -= sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF - 3, ex)]);
                    }
                } else if (sfz < ZEO) {
                    if ((k0 - 2) >= kminF) {
                        f_rhs[p] -= sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF - 3, ex)]);
                    } else if ((k0 - 1) >= kminF) {
                        f_rhs[p] += sfz * d12dz *
                            (      fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             -EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             +EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -      fh[idx_fh_F(iF, jF, kF + 2, ex)]);
                    } else if (k0 >= kminF) {
                        f_rhs[p] += sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF + 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF + 3, ex)]);
                    }
                }
            }
        }
    }
    free(fh);
}

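The first branch in each direction is the fourth-order, one-point-upwinded ("lopsided") first-derivative stencil, used when three downwind neighbors are available; for positive advection speed it reads

\[
f'_i \approx \frac{-3f_{i-1}-10f_i+18f_{i+1}-6f_{i+2}+f_{i+3}}{12\,\Delta x}+O(\Delta x^4),
\]

with the mirrored form for negative speed and the centered (1, -8, 8, -1)/12 stencil as the near-boundary fallback.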
@@ -487,6 +487,201 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)

end subroutine lopsided

!-----------------------------------------------------------------------------
! Combined advection (lopsided) + Kreiss-Oliger dissipation (kodis)
! Shares the symmetry_bd buffer fh, eliminating one full-grid copy per call.
! Mathematically identical to calling lopsided then kodis separately.
!-----------------------------------------------------------------------------
subroutine lopsided_kodis(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA,eps)
implicit none

!~~~~~~> Input parameters:

integer, intent(in) :: ex(1:3),Symmetry
real*8, intent(in) :: X(1:ex(1)),Y(1:ex(2)),Z(1:ex(3))
real*8,dimension(ex(1),ex(2),ex(3)),intent(in) :: f,Sfx,Sfy,Sfz

real*8,dimension(ex(1),ex(2),ex(3)),intent(inout):: f_rhs
real*8,dimension(3),intent(in) ::SoA
real*8,intent(in) :: eps

!~~~~~~> local variables:
! note index -2,-1,0, so we have 3 extra points
real*8,dimension(-2:ex(1),-2:ex(2),-2:ex(3)) :: fh
integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
real*8 :: dX,dY,dZ
real*8 :: d12dx,d12dy,d12dz,d2dx,d2dy,d2dz
real*8, parameter :: ZEO=0.d0,ONE=1.d0, F3=3.d0
real*8, parameter :: TWO=2.d0,F6=6.0d0,F18=1.8d1
real*8, parameter :: F12=1.2d1, F10=1.d1,EIT=8.d0
integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
! kodis parameters
real*8, parameter :: SIX=6.d0,FIT=1.5d1,TWT=2.d1
real*8, parameter :: cof=6.4d1 ! 2^6

dX = X(2)-X(1)
dY = Y(2)-Y(1)
dZ = Z(2)-Z(1)

d12dx = ONE/F12/dX
d12dy = ONE/F12/dY
d12dz = ONE/F12/dZ

d2dx = ONE/TWO/dX
d2dy = ONE/TWO/dY
d2dz = ONE/TWO/dZ

imax = ex(1)
jmax = ex(2)
kmax = ex(3)

imin = 1
jmin = 1
kmin = 1
if(Symmetry > NO_SYMM .and. dabs(Z(1)) < dZ) kmin = -2
if(Symmetry > EQ_SYMM .and. dabs(X(1)) < dX) imin = -2
if(Symmetry > EQ_SYMM .and. dabs(Y(1)) < dY) jmin = -2

! Single symmetry_bd call shared by both advection and dissipation
call symmetry_bd(3,ex,f,fh,SoA)

! ---- Advection (lopsided) loop ----
! upper bound set to ex-1 only for efficiency,
! the loop body will set ex 0 also
do k=1,ex(3)-1
 do j=1,ex(2)-1
  do i=1,ex(1)-1
   ! x direction
   if(Sfx(i,j,k) > ZEO)then
     if(i+3 <= imax)then
       f_rhs(i,j,k)=f_rhs(i,j,k)+ &
          Sfx(i,j,k)*d12dx*(-F3*fh(i-1,j,k)-F10*fh(i,j,k)+F18*fh(i+1,j,k) &
                            -F6*fh(i+2,j,k)+ fh(i+3,j,k))
     elseif(i+2 <= imax)then
       f_rhs(i,j,k)=f_rhs(i,j,k)+ &
          Sfx(i,j,k)*d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))

     elseif(i+1 <= imax)then
       f_rhs(i,j,k)=f_rhs(i,j,k)- &
          Sfx(i,j,k)*d12dx*(-F3*fh(i+1,j,k)-F10*fh(i,j,k)+F18*fh(i-1,j,k) &
                            -F6*fh(i-2,j,k)+ fh(i-3,j,k))
     endif
   elseif(Sfx(i,j,k) < ZEO)then
     if(i-3 >= imin)then
       f_rhs(i,j,k)=f_rhs(i,j,k)- &
          Sfx(i,j,k)*d12dx*(-F3*fh(i+1,j,k)-F10*fh(i,j,k)+F18*fh(i-1,j,k) &
                            -F6*fh(i-2,j,k)+ fh(i-3,j,k))
     elseif(i-2 >= imin)then
       f_rhs(i,j,k)=f_rhs(i,j,k)+ &
          Sfx(i,j,k)*d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))

     elseif(i-1 >= imin)then
       f_rhs(i,j,k)=f_rhs(i,j,k)+ &
          Sfx(i,j,k)*d12dx*(-F3*fh(i-1,j,k)-F10*fh(i,j,k)+F18*fh(i+1,j,k) &
                            -F6*fh(i+2,j,k)+ fh(i+3,j,k))
     endif
   endif

   ! y direction
   if(Sfy(i,j,k) > ZEO)then
     if(j+3 <= jmax)then
       f_rhs(i,j,k)=f_rhs(i,j,k)+ &
          Sfy(i,j,k)*d12dy*(-F3*fh(i,j-1,k)-F10*fh(i,j,k)+F18*fh(i,j+1,k) &
                            -F6*fh(i,j+2,k)+ fh(i,j+3,k))
     elseif(j+2 <= jmax)then
       f_rhs(i,j,k)=f_rhs(i,j,k)+ &
          Sfy(i,j,k)*d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))

     elseif(j+1 <= jmax)then
       f_rhs(i,j,k)=f_rhs(i,j,k)- &
          Sfy(i,j,k)*d12dy*(-F3*fh(i,j+1,k)-F10*fh(i,j,k)+F18*fh(i,j-1,k) &
                            -F6*fh(i,j-2,k)+ fh(i,j-3,k))
     endif
   elseif(Sfy(i,j,k) < ZEO)then
     if(j-3 >= jmin)then
       f_rhs(i,j,k)=f_rhs(i,j,k)- &
          Sfy(i,j,k)*d12dy*(-F3*fh(i,j+1,k)-F10*fh(i,j,k)+F18*fh(i,j-1,k) &
                            -F6*fh(i,j-2,k)+ fh(i,j-3,k))
     elseif(j-2 >= jmin)then
       f_rhs(i,j,k)=f_rhs(i,j,k)+ &
          Sfy(i,j,k)*d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))

     elseif(j-1 >= jmin)then
       f_rhs(i,j,k)=f_rhs(i,j,k)+ &
          Sfy(i,j,k)*d12dy*(-F3*fh(i,j-1,k)-F10*fh(i,j,k)+F18*fh(i,j+1,k) &
                            -F6*fh(i,j+2,k)+ fh(i,j+3,k))
     endif
   endif

   ! z direction
   if(Sfz(i,j,k) > ZEO)then
     if(k+3 <= kmax)then
       f_rhs(i,j,k)=f_rhs(i,j,k)+ &
          Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k-1)-F10*fh(i,j,k)+F18*fh(i,j,k+1) &
                            -F6*fh(i,j,k+2)+ fh(i,j,k+3))
     elseif(k+2 <= kmax)then
       f_rhs(i,j,k)=f_rhs(i,j,k)+ &
          Sfz(i,j,k)*d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))

     elseif(k+1 <= kmax)then
       f_rhs(i,j,k)=f_rhs(i,j,k)- &
          Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k+1)-F10*fh(i,j,k)+F18*fh(i,j,k-1) &
                            -F6*fh(i,j,k-2)+ fh(i,j,k-3))
     endif
   elseif(Sfz(i,j,k) < ZEO)then
     if(k-3 >= kmin)then
       f_rhs(i,j,k)=f_rhs(i,j,k)- &
          Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k+1)-F10*fh(i,j,k)+F18*fh(i,j,k-1) &
                            -F6*fh(i,j,k-2)+ fh(i,j,k-3))
     elseif(k-2 >= kmin)then
       f_rhs(i,j,k)=f_rhs(i,j,k)+ &
          Sfz(i,j,k)*d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))

     elseif(k-1 >= kmin)then
       f_rhs(i,j,k)=f_rhs(i,j,k)+ &
          Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k-1)-F10*fh(i,j,k)+F18*fh(i,j,k+1) &
                            -F6*fh(i,j,k+2)+ fh(i,j,k+3))
     endif
   endif
  enddo
 enddo
enddo

! ---- Dissipation (kodis) loop ----
if(eps > ZEO) then
 do k=1,ex(3)
  do j=1,ex(2)
   do i=1,ex(1)

    if(i-3 >= imin .and. i+3 <= imax .and. &
       j-3 >= jmin .and. j+3 <= jmax .and. &
       k-3 >= kmin .and. k+3 <= kmax) then
      f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof *( ( &
             (fh(i-3,j,k)+fh(i+3,j,k)) - &
         SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + &
         FIT*(fh(i-1,j,k)+fh(i+1,j,k)) - &
         TWT* fh(i,j,k) )/dX + &
         ( &
             (fh(i,j-3,k)+fh(i,j+3,k)) - &
         SIX*(fh(i,j-2,k)+fh(i,j+2,k)) + &
         FIT*(fh(i,j-1,k)+fh(i,j+1,k)) - &
         TWT* fh(i,j,k) )/dY + &
         ( &
             (fh(i,j,k-3)+fh(i,j,k+3)) - &
         SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + &
         FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - &
         TWT* fh(i,j,k) )/dZ )
    endif

   enddo
  enddo
 enddo
endif

return

end subroutine lopsided_kodis
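For C++ callers, a hypothetical extern declaration for the fused kernel might look as follows; the trailing-underscore symbol name and the all-arguments-by-reference convention are assumptions about the ifx ABI, not taken from the source, so verify them against the actual object file before use:

// Hypothetical C++ binding sketch for lopsided_kodis; name mangling and
// pass-by-reference convention are assumed, not confirmed by this diff.
extern "C" void lopsided_kodis_(const int *ex,
                                const double *X, const double *Y, const double *Z,
                                const double *f, double *f_rhs,
                                const double *Sfx, const double *Sfy, const double *Sfz,
                                const int *Symmetry, const double *SoA,
                                const double *eps);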

#elif (ghost_width == 4)
! sixth order code
! Compute advection terms in right hand sides of field equations

@@ -1,83 +1,77 @@

#if 0
note here
v:r; u: phi; w: theta
tetradtype 0
  v^a = (x,y,z)
  orthonormal order: v,u,w
  m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of PRD 75, 124018(2007)
tetradtype 1
  orthonormal order: w,u,v
  m = (theta + i phi)/sqrt(2) following Sperhake, Eq.(3.2) of PRD 85, 124062(2012)
tetradtype 2
  v_a = (x,y,z)
  orthonormal order: v,u,w
  m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of PRD 75, 124018(2007)
#endif
#define tetradtype 2

#if 0
note here
Cell center or Vertex center
#endif
#define Cell

#if 0
note here
2nd order: 2
4th order: 3
6th order: 4
8th order: 5
#endif
#define ghost_width 3

#if 0
note here
use shell or not
#endif
#define WithShell

#if 0
note here
use constraint preserving boundary condition or not
only affects Z4c
#endif
#define CPBC

#if 0
note here
Gauge condition type
0: B^i gauge
1: David's puncture gauge
2: MB B^i gauge
3: RIT B^i gauge
4: MB beta gauge (beta gauge does not mean Eq.(3) of PRD 84, 124006)
5: RIT beta gauge (beta gauge does not mean Eq.(3) of PRD 84, 124006)
6: MGB1 B^i gauge
7: MGB2 B^i gauge
#endif
#define GAUGE 2
#define GAUGE 0

#if 0
buffer points for CPBC boundary
#endif
#define CPBC_ghost_width (ghost_width)

#if 0
using BSSN variable for constraint violation and psi4 calculation: 0
using ADM variable for constraint violation and psi4 calculation: 1
#endif
#define ABV 0

#if 0
Type of Potential and Scalar Distribution in F(R) Scalar-Tensor Theory
1: Case C of 1112.3928, V=0
2: shell with a2^2*phi0/(1+a2^2), f(R) = R+a2*R^2 induced V
3: ground state of Schrodinger-Newton system, f(R) = R+a2*R^2 induced V
4: a2 = oo and phi(r) = phi0 * 0.5 * ( tanh((r+r0)/sigma) - tanh((r-r0)/sigma) )
5: shell with phi(r) = phi0*Exp(-(r-r0)**2/sigma), V = 0
#endif
#define EScalar_CC 2

#if 0

define tetradtype
v:r; u: phi; w: theta
tetradtype 0
  v^a = (x,y,z)
  orthonormal order: v,u,w
  m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of PRD 75, 124018(2007)
tetradtype 1
  orthonormal order: w,u,v
  m = (theta + i phi)/sqrt(2) following Sperhake, Eq.(3.2) of PRD 85, 124062(2012)
tetradtype 2
  v_a = (x,y,z)
  orthonormal order: v,u,w
  m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of PRD 75, 124018(2007)

define Cell or Vertex
Cell center or Vertex center

define ghost_width
2nd order: 2
4th order: 3
6th order: 4
8th order: 5

define WithShell
use shell or not

define CPBC
use constraint preserving boundary condition or not
only affects Z4c
CPBC only supports WithShell

define GAUGE
0: B^i gauge
1: David puncture gauge
2: MB B^i gauge
3: RIT B^i gauge
4: MB beta gauge (beta gauge does not mean Eq.(3) of PRD 84, 124006)
5: RIT beta gauge (beta gauge does not mean Eq.(3) of PRD 84, 124006)
6: MGB1 B^i gauge
7: MGB2 B^i gauge

define CPBC_ghost_width (ghost_width)
buffer points for CPBC boundary

define ABV
0: using BSSN variable for constraint violation and psi4 calculation
1: using ADM variable for constraint violation and psi4 calculation

define EScalar_CC
Type of Potential and Scalar Distribution in F(R) Scalar-Tensor Theory
1: Case C of 1112.3928, V=0
2: shell with phi(r) = phi0 * a2^2/(1+a2^2), f(R) = R+a2*R^2 induced V
3: ground state of Schrodinger-Newton system, f(R) = R+a2*R^2 induced V
4: a2 = +oo and phi(r) = phi0 * 0.5 * ( tanh((r+r0)/sigma) - tanh((r-r0)/sigma) )
5: shell with phi(r) = phi0 * Exp(-(r-r0)**2/sigma), V = 0

#endif

@@ -6,95 +6,127 @@

// application parameters

/// ****
// sommerfeld boundary type
// 0: bam, 1: shibata
#define SommerType 0

/// ****
// for Using Gauss-Legendre quadrature in theta direction
#define GaussInt

/// ****
#define ABEtype 0

//#define With_AHF
#define Psi4type 0

//#define Point_Psi4

#define RPS 1

#define AGM 0

#define RPB 0

#define MAPBH 1

#define PSTR 0

#define REGLEV 0

//#define USE_GPU

//#define CHECKDETAIL

//#define FAKECHECK

//
// define SommerType
// sommerfeld boundary type
// 0: bam
// 1: shibata
//
// define GaussInt
// for Using Gauss-Legendre quadrature in theta direction
//
// define ABEtype
// 0: BSSN vacuum
// 1: coupled to scalar field
// 2: Z4c vacuum
// 3: coupled to Maxwell field
//
#define ABEtype 2

/// ****
// define With_AHF
// using Apparent Horizon Finder
//#define With_AHF

/// ****
//
// define Psi4type
// Psi4 calculation method
// 0: EB method
// 1: 4-D method
//
#define Psi4type 0

/// ****
// define Point_Psi4
// for Using point psi4 or not
//#define Point_Psi4

/// ****
//
// define RPS
// RestrictProlong in Step (0) or after Step (1)
#define RPS 1

/// ****
//
// define AGM
// Enforce algebra constraint
// for every RK4 sub step: 0
// only when iter_count == 3: 1
// after routine Step: 2
#define AGM 0

/// ****
//
// define RPB
// Restrict Prolong using BAM style 1 or old style 0
#define RPB 0

/// ****
//
// define MAPBH
// 1: move Analysis out of 4 sub steps and treat PBH with Euler method
#define MAPBH 1

/// ****
// parallel structure, 0: level by level, 1: considering all levels, 2: as 1 but reverse the CPU order, 3: Frank's scheme
#define PSTR 0

/// ****
//
// define PSTR
// parallel structure
// 0: level by level
// 1: considering all levels
// 2: as 1 but reverse the CPU order
// 3: Frank's scheme
//
// define REGLEV
// regrid for every level or for all levels at a time
// 0: for every level; 1: for all
#define REGLEV 0

/// ****
// 0: for every level;
// 1: for all
//
// define USE_GPU
// use gpu or not
//#define USE_GPU

/// ****
//
// define CHECKDETAIL
// use checkpoint for every process
//#define CHECKDETAIL

/// ****
//
// define FAKECHECK
// use FakeCheckPrepare to write CheckPoint
//#define FAKECHECK
//

////================================================================
// some basic parameters for numerical calculation
////================================================================

#define dim 3

//#define Cell or Vertex in "microdef.fh"
//#define Cell or Vertex in "macrodef.fh"

// ******
// buffer point number for mesh refinement interface
#define buffer_width 6

// ******
// buffer point number shell-box interface, on shell
#define SC_width buffer_width
// buffer point number shell-box interface, on box

#define CS_width (2*buffer_width)

//
// define Cell or Vertex in "macrodef.fh"
//
// define buffer_width
// buffer point number for mesh refinement interface
//
// define SC_width buffer_width
// buffer point number shell-box interface, on shell
//
// define CS_width
// buffer point number shell-box interface, on box
//

#if(buffer_width < ghost_width)
#error we always assume buffer_width>ghost_width
# error we always assume buffer_width>ghost_width
#endif

#define PACK 1
@@ -110,3 +142,4 @@
#define TINY 1e-10

#endif /* MICRODEF_H */

@@ -2,6 +2,27 @@

include makefile.inc

## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
##   make                     -> opt (PGO-guided, maximum performance)
##   make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data)
PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata

ifeq ($(PGO_MODE),instrument)
## Phase 1 instrumentation: omit -ipo and -fp-model fast=2 for a faster build and numerical stability
CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate \
              -Dfortran3 -Dnewc -I${MKLROOT}/include
f90appflags = -O3 -xHost -fma -fprofile-instr-generate \
              -align array64byte -fpp -I${MKLROOT}/include
else
## opt (default): maximum performance with PGO profile data
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
              -fprofile-instr-use=$(PROFDATA) \
              -Dfortran3 -Dnewc -I${MKLROOT}/include
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
              -fprofile-instr-use=$(PROFDATA) \
              -align array64byte -fpp -I${MKLROOT}/include
endif
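## A hedged sketch of the intended two-phase cycle (the binary name, run
## command, and profile-merge step below are assumptions, not taken from this
## repository; LLVM-based icpx/ifx emit .profraw files that must be merged
## before -fprofile-instr-use can consume them):
##   make clean && make PGO_MODE=instrument    # Phase 1: instrumented build
##   mpirun -np 4 ./ABE input.par              # representative run, writes default.profraw
##   llvm-profdata merge -output=$(PROFDATA) default.profraw
##   make clean && make                        # Phase 2: rebuild using the profile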

.SUFFIXES: .o .f90 .C .for .cu

.f90.o:
@@ -16,7 +37,36 @@ include makefile.inc
.cu.o:
	$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)

# C rewrite of BSSN RHS kernel and helpers
bssn_rhs_c.o: bssn_rhs_c.C
	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@

fderivs_c.o: fderivs_c.C
	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@

fdderivs_c.o: fdderivs_c.C
	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@

kodiss_c.o: kodiss_c.C
	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@

lopsided_c.o: lopsided_c.C
	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@

## TwoPunctureABE uses fixed optimal flags, independent of CXXAPPFLAGS (which may be PGO-instrumented)
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo -Dfortran3 -Dnewc -I${MKLROOT}/include

TwoPunctures.o: TwoPunctures.C
	${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@

TwoPunctureABE.o: TwoPunctureABE.C
	${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@

# Input files

# C rewrite files
CFILES = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o

C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
	cgh.o bssn_class.o surface_integral.o ShellPatch.o\
	bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
@@ -34,7 +84,7 @@ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o

F90FILES = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
	prolongrestrict_cell.o prolongrestrict_vertex.o\
	rungekutta4_rout.o bssn_rhs.o diff_new.o kodiss.o kodiss_sh.o\
	rungekutta4_rout.o diff_new.o kodiss.o kodiss_sh.o\
	lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
	shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
	getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\
@@ -57,7 +107,7 @@ TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o
CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o

# file dependences
$(C++FILES) $(C++FILESGPU) $(F90FILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh
$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh

$(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
	misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\
@@ -80,7 +130,7 @@ $(C++FILES_GPU): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h

$(AHFDOBJS): cctk.h cctk_Config.h cctk_Types.h cctk_Constants.h myglobal.h

$(C++FILES) $(C++FILES_GPU) $(AHFDOBJS) $(CUDAFILES): macrodef.h
$(C++FILES) $(C++FILES_GPU) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.h

TwoPunctureFILES: TwoPunctures.h

@@ -89,14 +139,14 @@ $(CUDAFILES): bssn_gpu.h gpu_mem.h gpu_rhsSS_mem.h
misc.o : zbesh.o

# projects
ABE: $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS)
ABE: $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS)

ABEGPU: $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)

TwoPunctureABE: $(TwoPunctureFILES)
	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(TwoPunctureFILES) $(LDLIBS)
	$(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)

clean:
	rm *.o ABE ABEGPU TwoPunctureABE make.log -f

@@ -8,17 +8,12 @@ filein = -I/usr/include/ -I${MKLROOT}/include

## Using sequential MKL (OpenMP disabled for better single-threaded performance)
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5

## Aggressive optimization flags:
## -O3: Maximum optimization
## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible)
## -fp-model fast=2: Aggressive floating-point optimizations
## -fma: Enable fused multiply-add instructions
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo -qopenmp \
              -Dfortran3 -Dnewc -I${MKLROOT}/include
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo -qopenmp \
              -align array64byte -fpp -I${MKLROOT}/include
## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
##   opt        : (default) maximum performance with PGO profile-guided optimization
##   instrument : PGO Phase 1 instrumentation to collect fresh profile data
PGO_MODE ?= opt
f90 = ifx
f77 = ifx
CXX = icpx

AMSS_NCKU_source/share_func.h (new file, 146 lines)
@@ -0,0 +1,146 @@
#ifndef SHARE_FUNC_H
#define SHARE_FUNC_H

#include <stdlib.h>
#include <stddef.h>
#include <math.h>
#include <stdio.h>
/* main grid: 0-based -> 1D */
static inline size_t idx_ex(int i0, int j0, int k0, const int ex[3]) {
    const int ex1 = ex[0], ex2 = ex[1];
    return (size_t)i0 + (size_t)j0 * (size_t)ex1 + (size_t)k0 * (size_t)ex1 * (size_t)ex2;
}

/*
 * fh corresponds to Fortran: fh(-1:ex1, -1:ex2, -1:ex3)
 * ord=2 => shift=1
 * iF/jF/kF are Fortran indices (may be -1, 0, 1..ex)
 */
static inline size_t idx_fh_F_ord2(int iF, int jF, int kF, const int ex[3]) {
    const int shift = 1;
    const int nx = ex[0] + 2; // ex1 + ord
    const int ny = ex[1] + 2;

    const int ii = iF + shift; // 0..ex1+1
    const int jj = jF + shift; // 0..ex2+1
    const int kk = kF + shift; // 0..ex3+1

    return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny;
}

/*
 * fh corresponds to Fortran: fh(-2:ex1, -2:ex2, -2:ex3)
 * ord=3 => shift=2
 * iF/jF/kF are Fortran indices (may be negative)
 */
static inline size_t idx_fh_F(int iF, int jF, int kF, const int ex[3]) {
    const int shift = 2; // ord=3 -> -2..ex
    const int nx = ex[0] + 3; // ex1 + ord
    const int ny = ex[1] + 3;

    const int ii = iF + shift; // 0..ex1+2
    const int jj = jF + shift; // 0..ex2+2
    const int kk = kF + shift; // 0..ex3+2

    return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny;
}

/*
 * func:  (1..extc1, 1..extc2, 1..extc3) 1-based in Fortran
 * funcc: (-ord+1..extc1, -ord+1..extc2, -ord+1..extc3) in Fortran
 *
 * In C we treat:
 *   func as 0-based: i0=0..extc1-1, j0=0..extc2-1, k0=0..extc3-1
 *   funcc stored as a 1-D array with shifted indices:
 *     iF in [-ord+1..extc1] -> ii = iF + (ord-1) in [0..extc1+ord-1]
 *   total length nx = extc1 + ord
 *   likewise ny = extc2 + ord, nz = extc3 + ord
 */

static inline size_t idx_func0(int i0, int j0, int k0, const int extc[3]) {
    const int nx = extc[0], ny = extc[1];
    return (size_t)i0 + (size_t)j0 * (size_t)nx + (size_t)k0 * (size_t)nx * (size_t)ny;
}

static inline size_t idx_funcc_F(int iF, int jF, int kF, int ord, const int extc[3]) {
    const int shift = ord - 1;     // iF = -shift .. extc1
    const int nx = extc[0] + ord;  // [-shift..extc1] has extc1+ord entries
    const int ny = extc[1] + ord;

    const int ii = iF + shift; // 0..extc1+shift
    const int jj = jF + shift; // 0..extc2+shift
    const int kk = kF + shift; // 0..extc3+shift

    return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny;
}

/*
 * Equivalent to the Fortran:
 *   funcc(1:extc1,1:extc2,1:extc3)=func
 *   do i=0,ord-1
 *     funcc(-i,1:extc2,1:extc3) = funcc(i+1,1:extc2,1:extc3)*SoA(1)
 *   enddo
 *   do i=0,ord-1
 *     funcc(:,-i,1:extc3) = funcc(:,i+1,1:extc3)*SoA(2)
 *   enddo
 *   do i=0,ord-1
 *     funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
 *   enddo
 */
static inline void symmetry_bd(int ord,
                               const int extc[3],
                               const double *func,
                               double *funcc,
                               const double SoA[3])
{
    const int extc1 = extc[0], extc2 = extc[1], extc3 = extc[2];

    // 1) funcc(1:extc1,1:extc2,1:extc3) = func
    //    Fortran (iF=1..extc1) corresponds to C func(i0=0..extc1-1)
    for (int k0 = 0; k0 < extc3; ++k0) {
        for (int j0 = 0; j0 < extc2; ++j0) {
            for (int i0 = 0; i0 < extc1; ++i0) {
                const int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1;
                funcc[idx_funcc_F(iF, jF, kF, ord, extc)] = func[idx_func0(i0, j0, k0, extc)];
            }
        }
    }

    // 2) do i=0..ord-1: funcc(-i, 1:extc2, 1:extc3) = funcc(i+1, ...)*SoA(1)
    for (int ii = 0; ii <= ord - 1; ++ii) {
        const int iF_dst = -ii;    // 0, -1, -2, ...
        const int iF_src = ii + 1; // 1, 2, 3, ...
        for (int kF = 1; kF <= extc3; ++kF) {
            for (int jF = 1; jF <= extc2; ++jF) {
                funcc[idx_funcc_F(iF_dst, jF, kF, ord, extc)] =
                    funcc[idx_funcc_F(iF_src, jF, kF, ord, extc)] * SoA[0];
            }
        }
    }

    // 3) do i=0..ord-1: funcc(:,-i, 1:extc3) = funcc(:, i+1, 1:extc3)*SoA(2)
    //    note that Fortran's ":" here covers the full iF range (-ord+1..extc1)
    for (int jj = 0; jj <= ord - 1; ++jj) {
        const int jF_dst = -jj;
        const int jF_src = jj + 1;
        for (int kF = 1; kF <= extc3; ++kF) {
            for (int iF = -ord + 1; iF <= extc1; ++iF) {
                funcc[idx_funcc_F(iF, jF_dst, kF, ord, extc)] =
                    funcc[idx_funcc_F(iF, jF_src, kF, ord, extc)] * SoA[1];
            }
        }
    }

    // 4) do i=0..ord-1: funcc(:,:,-i) = funcc(:,:, i+1)*SoA(3)
    for (int kk = 0; kk <= ord - 1; ++kk) {
        const int kF_dst = -kk;
        const int kF_src = kk + 1;
        for (int jF = -ord + 1; jF <= extc2; ++jF) {
            for (int iF = -ord + 1; iF <= extc1; ++iF) {
                funcc[idx_funcc_F(iF, jF, kF_dst, ord, extc)] =
                    funcc[idx_funcc_F(iF, jF, kF_src, ord, extc)] * SoA[2];
            }
        }
    }
}
#endif
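A minimal sanity check of the shifted-index mapping (hypothetical test code, not part of the repository): the first ghost corner of the ord=3 layout should map to offset 0, the last Fortran corner to the last array slot, and idx_funcc_F with ord=3 should agree with idx_fh_F.

#include <assert.h>
#include "share_func.h"

int main(void) {
    const int ex[3] = {8, 6, 4};
    const size_t nx = ex[0] + 3, ny = ex[1] + 3, nz = ex[2] + 3; // ord=3 layout

    // Fortran fh(-2:ex1,-2:ex2,-2:ex3): corners of the extended box
    assert(idx_fh_F(-2, -2, -2, ex) == 0);
    assert(idx_fh_F(ex[0], ex[1], ex[2], ex) == nx * ny * nz - 1);

    // interior point (1,1,1) in Fortran is the first point copied from func
    assert(idx_funcc_F(1, 1, 1, 3, ex) == idx_fh_F(1, 1, 1, ex));
    return 0;
}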
@@ -220,16 +220,9 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
        pox[2][n] = rex * nz_g[n];
    }

    double *shellf;
    shellf = new double[n_tot * InList];

    GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry);

    int mp, Lp, Nmin, Nmax;

    mp = n_tot / cpusize;
    Lp = n_tot - cpusize * mp;

    if (Lp > myrank)
    {
        Nmin = myrank * mp + myrank;
@@ -241,6 +234,11 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
        Nmax = Nmin + mp - 1;
    }

    double *shellf;
    shellf = new double[n_tot * InList];

    GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);

    //|~~~~~> Integrate the dot product of Dphi with the surface normal.

    double *RP_out, *IP_out;
@@ -363,8 +361,17 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
    }
    //|------+ Communicate and sum the results from each processor.

    MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    {
        double *RPIP_out = new double[2 * NN];
        double *RPIP = new double[2 * NN];
        memcpy(RPIP_out, RP_out, NN * sizeof(double));
        memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
        MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
        memcpy(RP, RPIP, NN * sizeof(double));
        memcpy(IP, RPIP + NN, NN * sizeof(double));
        delete[] RPIP_out;
        delete[] RPIP;
    }

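Packing the real and imaginary parts into one buffer replaces two NN-element collectives with a single 2NN-element one. Under the usual latency-bandwidth cost model (a textbook sketch, not a measurement from this code), the data volume is unchanged and one latency term is saved per call site:

\[
T(n)\approx \alpha\log_2 p+\beta n
\quad\Longrightarrow\quad
2\,T(NN)-T(2NN)=\alpha\log_2 p .
\]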
||||
//|------= Free memory.
|
||||
|
||||
@@ -556,8 +563,17 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
|
||||
}
|
||||
//|------+ Communicate and sum the results from each processor.
|
||||
|
||||
MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, Comm_here);
|
||||
MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, Comm_here);
|
||||
{
|
||||
double *RPIP_out = new double[2 * NN];
|
||||
double *RPIP = new double[2 * NN];
|
||||
memcpy(RPIP_out, RP_out, NN * sizeof(double));
|
||||
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
|
||||
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, Comm_here);
|
||||
memcpy(RP, RPIP, NN * sizeof(double));
|
||||
memcpy(IP, RPIP + NN, NN * sizeof(double));
|
||||
delete[] RPIP_out;
|
||||
delete[] RPIP;
|
||||
}
|
||||
|
||||
//|------= Free memory.
|
||||
|
||||
@@ -735,8 +751,17 @@ void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH, var *Rpsi4
|
||||
}
|
||||
//|------+ Communicate and sum the results from each processor.
|
||||
|
||||
MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
{
|
||||
double *RPIP_out = new double[2 * NN];
|
||||
double *RPIP = new double[2 * NN];
|
||||
memcpy(RPIP_out, RP_out, NN * sizeof(double));
|
||||
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
|
||||
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
memcpy(RP, RPIP, NN * sizeof(double));
|
||||
memcpy(IP, RPIP + NN, NN * sizeof(double));
|
||||
delete[] RPIP_out;
|
||||
delete[] RPIP;
|
||||
}
|
||||
|
||||
//|------= Free memory.
|
||||
|
||||
@@ -984,8 +1009,17 @@ void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH,
|
||||
}
|
||||
//|------+ Communicate and sum the results from each processor.
|
||||
|
||||
MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
{
|
||||
double *RPIP_out = new double[2 * NN];
|
||||
double *RPIP = new double[2 * NN];
|
||||
memcpy(RPIP_out, RP_out, NN * sizeof(double));
|
||||
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
|
||||
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
memcpy(RP, RPIP, NN * sizeof(double));
|
||||
memcpy(IP, RPIP + NN, NN * sizeof(double));
|
||||
delete[] RPIP_out;
|
||||
delete[] RPIP;
|
||||
}
|
||||
|
||||
//|------= Free memory.
|
||||
|
||||
@@ -1419,8 +1453,17 @@ void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH,
|
||||
}
|
||||
//|------+ Communicate and sum the results from each processor.
|
||||
|
||||
MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
{
|
||||
double *RPIP_out = new double[2 * NN];
|
||||
double *RPIP = new double[2 * NN];
|
||||
memcpy(RPIP_out, RP_out, NN * sizeof(double));
|
||||
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
|
||||
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
memcpy(RP, RPIP, NN * sizeof(double));
|
||||
memcpy(IP, RPIP + NN, NN * sizeof(double));
|
||||
delete[] RPIP_out;
|
||||
delete[] RPIP;
|
||||
}
|
||||
|
||||
//|------= Free memory.
|
||||
|
||||
@@ -1854,8 +1897,17 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH,
|
||||
}
|
||||
//|------+ Communicate and sum the results from each processor.
|
||||
|
||||
MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
{
|
||||
double *RPIP_out = new double[2 * NN];
|
||||
double *RPIP = new double[2 * NN];
|
||||
memcpy(RPIP_out, RP_out, NN * sizeof(double));
|
||||
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
|
||||
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
memcpy(RP, RPIP, NN * sizeof(double));
|
||||
memcpy(IP, RPIP + NN, NN * sizeof(double));
|
||||
delete[] RPIP_out;
|
||||
delete[] RPIP;
|
||||
}
|
||||
|
||||
//|------= Free memory.
|
||||
|
||||
@@ -2040,8 +2092,17 @@ void surface_integral::surf_Wave(double rex, int lev, NullShellPatch2 *GH, var *
|
||||
}
|
||||
//|------+ Communicate and sum the results from each processor.
|
||||
|
||||
MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
{
|
||||
double *RPIP_out = new double[2 * NN];
|
||||
double *RPIP = new double[2 * NN];
|
||||
memcpy(RPIP_out, RP_out, NN * sizeof(double));
|
||||
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
|
||||
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
memcpy(RP, RPIP, NN * sizeof(double));
|
||||
memcpy(IP, RPIP + NN, NN * sizeof(double));
|
||||
delete[] RPIP_out;
|
||||
delete[] RPIP;
|
||||
}
|
||||
|
||||
//|------= Free memory.
|
||||
|
||||
@@ -2226,8 +2287,17 @@ void surface_integral::surf_Wave(double rex, int lev, NullShellPatch *GH, var *R
|
||||
}
|
||||
//|------+ Communicate and sum the results from each processor.
|
||||
|
||||
MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
{
|
||||
double *RPIP_out = new double[2 * NN];
|
||||
double *RPIP = new double[2 * NN];
|
||||
memcpy(RPIP_out, RP_out, NN * sizeof(double));
|
||||
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
|
||||
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
memcpy(RP, RPIP, NN * sizeof(double));
|
||||
memcpy(IP, RPIP + NN, NN * sizeof(double));
|
||||
delete[] RPIP_out;
|
||||
delete[] RPIP;
|
||||
}
|
||||
|
||||
//|------= Free memory.
|
||||
|
||||
@@ -2314,25 +2384,9 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
|
||||
pox[2][n] = rex * nz_g[n];
|
||||
}
|
||||
|
||||
double *shellf;
|
||||
shellf = new double[n_tot * InList];
|
||||
|
||||
// we have assumed there is only one box on this level,
|
||||
// so we do not need loop boxes
|
||||
GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry);
|
||||
|
||||
double Mass_out = 0;
|
||||
double ang_outx, ang_outy, ang_outz;
|
||||
double p_outx, p_outy, p_outz;
|
||||
ang_outx = ang_outy = ang_outz = 0.0;
|
||||
p_outx = p_outy = p_outz = 0.0;
|
||||
const double f1o8 = 0.125;
|
||||
|
||||
int mp, Lp, Nmin, Nmax;
|
||||
|
||||
mp = n_tot / cpusize;
|
||||
Lp = n_tot - cpusize * mp;
|
||||
|
||||
if (Lp > myrank)
|
||||
{
|
||||
Nmin = myrank * mp + myrank;
|
||||
@@ -2344,6 +2398,20 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
|
||||
Nmax = Nmin + mp - 1;
|
||||
}
|
||||
|
||||
double *shellf;
|
||||
shellf = new double[n_tot * InList];
|
||||
|
||||
// we have assumed there is only one box on this level,
|
||||
// so we do not need loop boxes
|
||||
GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);
|
||||
|
||||
double Mass_out = 0;
|
||||
double ang_outx, ang_outy, ang_outz;
|
||||
double p_outx, p_outy, p_outz;
|
||||
ang_outx = ang_outy = ang_outz = 0.0;
|
||||
p_outx = p_outy = p_outz = 0.0;
|
||||
const double f1o8 = 0.125;
|
||||
|
||||
double Chi, Psi;
|
||||
double Gxx, Gxy, Gxz, Gyy, Gyz, Gzz;
|
||||
double gupxx, gupxy, gupxz, gupyy, gupyz, gupzz;
|
||||
@@ -2464,15 +2532,13 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
|
||||
}
|
||||
}
|
||||
|
||||
MPI_Allreduce(&Mass_out, &mass, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
|
||||
MPI_Allreduce(&ang_outx, &sx, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
MPI_Allreduce(&ang_outy, &sy, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
MPI_Allreduce(&ang_outz, &sz, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
|
||||
MPI_Allreduce(&p_outx, &px, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
MPI_Allreduce(&p_outy, &py, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
MPI_Allreduce(&p_outz, &pz, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
{
|
||||
double scalar_out[7] = {Mass_out, ang_outx, ang_outy, ang_outz, p_outx, p_outy, p_outz};
|
||||
double scalar_in[7];
|
||||
MPI_Allreduce(scalar_out, scalar_in, 7, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
mass = scalar_in[0]; sx = scalar_in[1]; sy = scalar_in[2]; sz = scalar_in[3];
|
||||
px = scalar_in[4]; py = scalar_in[5]; pz = scalar_in[6];
|
||||
}
|
||||
|
||||
#ifdef GaussInt
|
||||
mass = mass * rex * rex * dphi * factor;
|
||||
@@ -2735,15 +2801,13 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
}
}

MPI_Allreduce(&Mass_out, &mass, 1, MPI_DOUBLE, MPI_SUM, Comm_here);

MPI_Allreduce(&ang_outx, &sx, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
MPI_Allreduce(&ang_outy, &sy, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
MPI_Allreduce(&ang_outz, &sz, 1, MPI_DOUBLE, MPI_SUM, Comm_here);

MPI_Allreduce(&p_outx, &px, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
MPI_Allreduce(&p_outy, &py, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
MPI_Allreduce(&p_outz, &pz, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
{
    double scalar_out[7] = {Mass_out, ang_outx, ang_outy, ang_outz, p_outx, p_outy, p_outz};
    double scalar_in[7];
    MPI_Allreduce(scalar_out, scalar_in, 7, MPI_DOUBLE, MPI_SUM, Comm_here);
    mass = scalar_in[0]; sx = scalar_in[1]; sy = scalar_in[2]; sz = scalar_in[3];
    px = scalar_in[4]; py = scalar_in[5]; pz = scalar_in[6];
}

#ifdef GaussInt
mass = mass * rex * rex * dphi * factor;
@@ -3020,15 +3084,13 @@ void surface_integral::surf_MassPAng(double rex, int lev, ShellPatch *GH, var *c
}
}

MPI_Allreduce(&Mass_out, &mass, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

MPI_Allreduce(&ang_outx, &sx, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
MPI_Allreduce(&ang_outy, &sy, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
MPI_Allreduce(&ang_outz, &sz, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

MPI_Allreduce(&p_outx, &px, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
MPI_Allreduce(&p_outy, &py, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
MPI_Allreduce(&p_outz, &pz, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
{
    double scalar_out[7] = {Mass_out, ang_outx, ang_outy, ang_outz, p_outx, p_outy, p_outz};
    double scalar_in[7];
    MPI_Allreduce(scalar_out, scalar_in, 7, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    mass = scalar_in[0]; sx = scalar_in[1]; sy = scalar_in[2]; sz = scalar_in[3];
    px = scalar_in[4]; py = scalar_in[5]; pz = scalar_in[6];
}

#ifdef GaussInt
mass = mass * rex * rex * dphi * factor;
@@ -3607,8 +3669,17 @@ void surface_integral::surf_Wave(double rex, cgh *GH, ShellPatch *SH,
}
//|------+ Communicate and sum the results from each processor.

MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
{
    double *RPIP_out = new double[2 * NN];
    double *RPIP = new double[2 * NN];
    memcpy(RPIP_out, RP_out, NN * sizeof(double));
    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    memcpy(RP, RPIP, NN * sizeof(double));
    memcpy(IP, RPIP + NN, NN * sizeof(double));
    delete[] RPIP_out;
    delete[] RPIP;
}

//|------= Free memory.

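The hunks above all apply the same optimization: several small MPI_Allreduce calls (seven scalars in surf_MassPAng, two arrays in surf_Wave) are fused into a single collective over one packed buffer, so the per-collective latency is paid once rather than once per payload. A minimal Python sketch of the pattern, using mpi4py and numpy purely for illustration (both are assumptions here; the commit's actual code is the C++ above):

```python
# Minimal sketch of the reduction-batching pattern, assuming mpi4py/numpy.
# Run with e.g.: mpirun -np 4 python3 batch_allreduce_sketch.py
import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
NN = 8                                  # example payload size
RP_out = np.full(NN, comm.rank, float)  # local "real part" contribution
IP_out = np.full(NN, 1.0)               # local "imaginary part" contribution

# Pack both payloads into one contiguous buffer ...
send = np.concatenate([RP_out, IP_out])
recv = np.empty_like(send)

# ... so one collective replaces two; latency is paid once, not twice.
comm.Allreduce(send, recv, op=MPI.SUM)

RP, IP = recv[:NN], recv[NN:]           # unpack the summed results
```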
27
AMSS_NCKU_source/tool.h
Normal file
@@ -0,0 +1,27 @@
#include "share_func.h"

void fdderivs(const int ex[3],
              const double *f,
              double *fxx, double *fxy, double *fxz,
              double *fyy, double *fyz, double *fzz,
              const double *X, const double *Y, const double *Z,
              double SYM1, double SYM2, double SYM3,
              int Symmetry, int onoff);

void fderivs(const int ex[3],
             const double *f,
             double *fx, double *fy, double *fz,
             const double *X, const double *Y, const double *Z,
             double SYM1, double SYM2, double SYM3,
             int Symmetry, int onoff);

void kodis(const int ex[3],
           const double *X, const double *Y, const double *Z,
           const double *f, double *f_rhs,
           const double SoA[3],
           int Symmetry, double eps);

void lopsided(const int ex[3],
              const double *X, const double *Y, const double *Z,
              const double *f, double *f_rhs,
              const double *Sfx, const double *Sfy, const double *Sfz,
              int Symmetry, const double SoA[3]);
@@ -11,16 +11,46 @@
import AMSS_NCKU_Input as input_data
import subprocess
import time

## CPU core binding configuration using taskset
## taskset ensures all child processes inherit the CPU affinity mask
## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111)
## Format: taskset -c 4-55,60-111 ensures processes only run on these cores
NUMACTL_CPU_BIND = "taskset -c 0-111"

## Build parallelism configuration
## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores
## Set make -j to utilize available cores for faster builds
BUILD_JOBS = 104

def get_last_n_cores_per_socket(n=32):
    """
    Read CPU topology via lscpu and return a taskset -c string
    selecting the last `n` cores of each NUMA node (socket).

    Example: 2 sockets x 56 cores each, n=32 -> node0: 24-55, node1: 80-111
    -> "taskset -c 24-55,80-111"
    """
    result = subprocess.run(["lscpu", "--parse=NODE,CPU"], capture_output=True, text=True)

    # Build a dict: node_id -> sorted list of CPU ids
    node_cpus = {}
    for line in result.stdout.splitlines():
        if line.startswith("#") or not line.strip():
            continue
        parts = line.split(",")
        if len(parts) < 2:
            continue
        node_id, cpu_id = int(parts[0]), int(parts[1])
        node_cpus.setdefault(node_id, []).append(cpu_id)

    segments = []
    for node_id in sorted(node_cpus):
        cpus = sorted(node_cpus[node_id])
        selected = cpus[-n:]  # last n cores of this socket
        segments.append(f"{selected[0]}-{selected[-1]}")

    cpu_str = ",".join(segments)
    total = len(segments) * n
    print(f"    CPU binding: taskset -c {cpu_str} ({total} cores, last {n} per socket)")
    return f"taskset -c {cpu_str}"


## CPU core binding: dynamically select the last 32 cores of each socket (64 cores total)
NUMACTL_CPU_BIND = get_last_n_cores_per_socket(n=32)

## Build parallelism: match the number of bound cores
BUILD_JOBS = 64


##################################################################
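For context, a sketch of how such a binding prefix is typically consumed downstream; the exact build invocation is not shown in this diff, so the command below is an assumption for illustration only:

```python
# Hypothetical sketch: prepend the taskset prefix to the build command so
# make and every compiler child it spawns inherit the affinity mask.
# The exact invocation is an assumption; it is not part of this diff.
import shlex
import subprocess

NUMACTL_CPU_BIND = "taskset -c 24-55,80-111"   # e.g. output of the helper above
BUILD_JOBS = 64

cmd = shlex.split(NUMACTL_CPU_BIND) + ["make", f"-j{BUILD_JOBS}"]
subprocess.run(cmd, check=True)                # children inherit the CPU mask
```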
@@ -117,6 +147,7 @@ def run_ABE():

    if (input_data.GPU_Calculation == "no"):
        mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
        #mpi_command = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
        mpi_command_outfile = "ABE_out.log"
    elif (input_data.GPU_Calculation == "yes"):
        mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"

@@ -158,7 +189,8 @@ def run_TwoPunctureABE():
    print( )

    ## Define the command to run
    TwoPuncture_command = NUMACTL_CPU_BIND + " ./TwoPunctureABE"
    #TwoPuncture_command = NUMACTL_CPU_BIND + " ./TwoPunctureABE"
    TwoPuncture_command = " ./TwoPunctureABE"
    TwoPuncture_command_outfile = "TwoPunctureABE_out.log"

    ## Execute the command with subprocess.Popen and stream output
29
parallel_plot_helper.py
Normal file
@@ -0,0 +1,29 @@
import multiprocessing


def run_plot_task(task):
    """Execute a single plotting task.

    Parameters
    ----------
    task : tuple
        A tuple of (function, args_tuple) where function is a callable
        plotting function and args_tuple contains its arguments.
    """
    func, args = task
    return func(*args)


def run_plot_tasks_parallel(plot_tasks):
    """Execute a list of independent plotting tasks in parallel.

    Uses the 'fork' context to create worker processes so that the main
    script is NOT re-imported/re-executed in child processes.

    Parameters
    ----------
    plot_tasks : list of tuples
        Each element is (function, args_tuple).
    """
    ctx = multiprocessing.get_context('fork')
    with ctx.Pool() as pool:
        pool.map(run_plot_task, plot_tasks)
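The 'fork' choice matters here: with 'spawn' or 'forkserver', each worker re-imports the main module, which is exactly the re-execution the driver script guards against. A small self-contained demonstration of the distinction (a sketch for illustration; note that 'fork' is only available on POSIX systems):

```python
# Standalone demonstration of why the 'fork' start method is used here.
# With 'spawn', each worker re-imports __main__, so unguarded top-level
# code would run once per worker; 'fork' clones the running process instead.
import multiprocessing

def square(i):
    return i * i

if __name__ == '__main__':
    ctx = multiprocessing.get_context('fork')   # no re-import of __main__
    with ctx.Pool(processes=4) as pool:
        print(pool.map(square, range(8)))       # [0, 1, 4, 9, 16, 25, 36, 49]
```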
97
pgo_profile/PGO_Profile_Analysis.md
Normal file
@@ -0,0 +1,97 @@
# AMSS-NCKU PGO Profile Analysis Report

## 1. Profiling Environment

| Item | Value |
|------|-------|
| Compiler | Intel oneAPI DPC++/C++ 2025.3.0 (icpx/ifx) |
| Instrumentation Flag | `-fprofile-instr-generate` |
| Optimization Level (instrumented) | `-O2 -xHost -fma` |
| MPI Processes | 1 (single process to avoid MPI+instrumentation deadlock) |
| Profile File | `default_9725750769337483397_0.profraw` (327 KB) |
| Merged Profile | `default.profdata` (394 KB) |
| llvm-profdata | `/home/intel/oneapi/compiler/2025.3/bin/compiler/llvm-profdata` |

## 2. Reduced Simulation Parameters (for profiling run)

| Parameter | Production Value | Profiling Value |
|-----------|-----------------|-----------------|
| MPI_processes | 64 | 1 |
| grid_level | 9 | 4 |
| static_grid_level | 5 | 3 |
| static_grid_number | 96 | 24 |
| moving_grid_number | 48 | 16 |
| largest_box_xyz_max | 320^3 | 160^3 |
| Final_Evolution_Time | 1000.0 | 10.0 |
| Evolution_Step_Number | 10,000,000 | 1,000 |
| Detector_Number | 12 | 2 |

## 3. Profile Summary

| Metric | Value |
|--------|-------|
| Total instrumented functions | 1,392 |
| Functions with non-zero counts | 117 (8.4%) |
| Functions with zero counts | 1,275 (91.6%) |
| Maximum function entry count | 386,459,248 |
| Maximum internal block count | 370,477,680 |
| Total block count | 4,198,023,118 |

## 4. Top 20 Hotspot Functions

| Rank | Total Count | Max Block Count | Function | Category |
|------|------------|-----------------|----------|----------|
| 1 | 1,241,601,732 | 370,477,680 | `polint_` | Interpolation |
| 2 | 755,994,435 | 230,156,640 | `prolong3_` | Grid prolongation |
| 3 | 667,964,095 | 3,697,792 | `compute_rhs_bssn_` | BSSN RHS evolution |
| 4 | 539,736,051 | 386,459,248 | `symmetry_bd_` | Symmetry boundary |
| 5 | 277,310,808 | 53,170,728 | `lopsided_` | Lopsided FD stencil |
| 6 | 155,534,488 | 94,535,040 | `decide3d_` | 3D grid decision |
| 7 | 119,267,712 | 19,266,048 | `rungekutta4_rout_` | RK4 time integrator |
| 8 | 91,574,616 | 48,824,160 | `kodis_` | Kreiss-Oliger dissipation |
| 9 | 67,555,389 | 43,243,680 | `fderivs_` | Finite differences |
| 10 | 55,296,000 | 42,246,144 | `misc::fact(int)` | Factorial utility |
| 11 | 43,191,071 | 27,663,328 | `fdderivs_` | 2nd-order FD derivatives |
| 12 | 36,233,965 | 22,429,440 | `restrict3_` | Grid restriction |
| 13 | 24,698,512 | 17,231,520 | `polin3_` | Polynomial interpolation |
| 14 | 22,962,942 | 20,968,768 | `copy_` | Data copy |
| 15 | 20,135,696 | 17,259,168 | `Ansorg::barycentric(...)` | Spectral interpolation |
| 16 | 14,650,224 | 7,224,768 | `Ansorg::barycentric_omega(...)` | Spectral weights |
| 17 | 13,242,296 | 2,871,920 | `global_interp_` | Global interpolation |
| 18 | 12,672,000 | 7,734,528 | `sommerfeld_rout_` | Sommerfeld boundary |
| 19 | 6,872,832 | 1,880,064 | `sommerfeld_routbam_` | Sommerfeld boundary (BAM) |
| 20 | 5,709,900 | 2,809,632 | `l2normhelper_` | L2 norm computation |

## 5. Hotspot Category Breakdown

The top 20 functions account for ~98% of total execution counts:

| Category | Functions | Combined Count | Share |
|----------|-----------|---------------|-------|
| Interpolation / Prolongation / Restriction | polint_, prolong3_, restrict3_, polin3_, global_interp_, Ansorg::* | ~2,093M | ~50% |
| BSSN RHS + FD stencils | compute_rhs_bssn_, lopsided_, fderivs_, fdderivs_ | ~1,056M | ~25% |
| Boundary conditions | symmetry_bd_, sommerfeld_rout_, sommerfeld_routbam_ | ~559M | ~13% |
| Time integration | rungekutta4_rout_ | ~119M | ~3% |
| Dissipation | kodis_ | ~92M | ~2% |
| Utilities | misc::fact, decide3d_, copy_, l2normhelper_ | ~256M | ~6% |

## 6. Conclusions
1. **Profile data is valid**: 1,392 functions were instrumented, and 117 were exercised with ~4.2 billion total counts.
2. **Hotspot concentration is high**: the top 5 functions alone account for ~83% of all counts (3.48B of the 4.20B total block counts), which is ideal for PGO: the compiler has strong branch/layout optimization targets.
3. **Fortran numerical kernels dominate**: `polint_`, `prolong3_`, `compute_rhs_bssn_`, `symmetry_bd_`, and `lopsided_` are all Fortran routines in the inner evolution loop. PGO will optimize their branch prediction and basic-block layout.
4. **91.6% of functions have zero counts**: these are code paths for unused features (GPU, BSSN-EScalar, BSSN-EM, Z4C, etc.). PGO will deprioritize them, improving instruction-cache utilization.
5. **The profile is representative**: despite the reduced grid size, the code-path coverage matches production, since the same kernels (RHS, prolongation, restriction, boundary) are exercised; PGO branch probabilities from this profile should transfer well to full-scale runs.
## 7. PGO Phase 2 Usage

To apply the profile, use the following flags in `makefile.inc`:

```makefile
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
              -fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \
              -Dfortran3 -Dnewc -I${MKLROOT}/include
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
              -fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \
              -align array64byte -fpp -I${MKLROOT}/include
```
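The merge step that turns the raw profiles into `default.profdata` is not spelled out above. A sketch of it, wrapped in Python for consistency with the repo's driver scripts, with paths taken from the tables in sections 1 and 7 (adjust them to your installation):

```python
# Sketch: merge raw LLVM profiles into the .profdata consumed by
# -fprofile-instr-use. Paths mirror the tables above; adjust as needed.
import glob
import subprocess

LLVM_PROFDATA = "/home/intel/oneapi/compiler/2025.3/bin/compiler/llvm-profdata"
PROFILE_DIR = "/home/amss/AMSS-NCKU/pgo_profile"

raw_profiles = glob.glob(f"{PROFILE_DIR}/*.profraw")
subprocess.run(
    [LLVM_PROFDATA, "merge", "-o", f"{PROFILE_DIR}/default.profdata", *raw_profiles],
    check=True,
)
```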
BIN
pgo_profile/default.profdata
Normal file
Binary file not shown.

BIN
pgo_profile/default.profdata.backup
Normal file
Binary file not shown.

BIN
pgo_profile/default.profdata.backup2
Normal file
Binary file not shown.

BIN
pgo_profile/default.profdatabackup3
Normal file
Binary file not shown.

BIN
pgo_profile/default_15874826282416242821_0_58277.profraw
Normal file
Binary file not shown.

BIN
pgo_profile/default_9725750769337483397_0.profraw
Normal file
Binary file not shown.

BIN
pgo_profile/default_9725923726611433605_0.profraw
Normal file
Binary file not shown.
@@ -11,6 +11,8 @@
import numpy   ## numpy for array operations
import scipy   ## scipy for interpolation and signal processing
import math
import matplotlib
matplotlib.use('Agg')   ## use non-interactive backend for multiprocessing safety
import matplotlib.pyplot as plt   ## matplotlib for plotting
import os   ## os for system/file operations

@@ -8,16 +8,23 @@
##
#################################################

## Restrict OpenMP to one thread per process so that running
## many workers in parallel does not create an O(workers * BLAS_threads)
## thread explosion. This variable MUST be set before numpy/scipy
## are imported, because the BLAS library reads it only at load time.
import os
os.environ.setdefault("OMP_NUM_THREADS", "1")

import numpy
import scipy
import matplotlib
matplotlib.use('Agg')   ## use non-interactive backend for multiprocessing safety
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from mpl_toolkits.mplot3d import Axes3D
## import torch
import AMSS_NCKU_Input as input_data

import os


#########################################################################################
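To confirm the cap actually took effect once numpy has loaded, a small verification sketch; threadpoolctl is an extra dependency assumed here for illustration, not something this patch uses:

```python
# Verification sketch (assumes the optional threadpoolctl package).
# The env var must be set before numpy is imported, as above.
import os
os.environ.setdefault("OMP_NUM_THREADS", "1")

import numpy                                # BLAS loads and reads the env var here
from threadpoolctl import threadpool_info

for pool in threadpool_info():
    # Expect num_threads == 1 for the OpenMP/BLAS pools backing numpy.
    print(pool["internal_api"], pool["num_threads"])
```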
@@ -192,3 +199,19 @@ def get_data_xy( Rmin, Rmax, n, data0, time, figure_title, figure_outdir ):

####################################################################################


####################################################################################
## Allow this module to be run as a standalone script so that each
## binary-data plot can be executed in a fresh subprocess whose BLAS
## environment variables (set above) take effect before numpy loads.
##
## Usage: python3 plot_binary_data.py <filename> <binary_outdir> <figure_outdir>
####################################################################################

if __name__ == '__main__':
    import sys
    if len(sys.argv) != 4:
        print(f"Usage: {sys.argv[0]} <filename> <binary_outdir> <figure_outdir>")
        sys.exit(1)
    plot_binary_data(sys.argv[1], sys.argv[2], sys.argv[3])

@@ -8,6 +8,8 @@
#################################################

import numpy   ## numpy for array operations
import matplotlib
matplotlib.use('Agg')   ## use non-interactive backend for multiprocessing safety
import matplotlib.pyplot as plt   ## matplotlib for plotting
from mpl_toolkits.mplot3d import Axes3D   ## needed for 3D plots
import glob
@@ -15,6 +17,9 @@ import os ## operating system utilities

import plot_binary_data
import AMSS_NCKU_Input as input_data
import subprocess
import sys
import multiprocessing

# plt.rcParams['text.usetex'] = True ## enable LaTeX fonts in plots

@@ -50,10 +55,40 @@ def generate_binary_data_plot( binary_outdir, figure_outdir ):
            file_list.append(x)
            print(x)

    ## Plot each file in the list
    ## Plot each file in parallel using subprocesses.
    ## Each subprocess is a fresh Python process where the BLAS thread-count
    ## environment variables (set at the top of plot_binary_data.py) take
    ## effect before numpy is imported. This avoids the thread explosion
    ## that occurs when a multiprocessing.Pool with the 'fork' context
    ## inherits an already-initialized multi-threaded BLAS from the parent.
    script = os.path.join( os.path.dirname(__file__), "plot_binary_data.py" )
    max_workers = min( multiprocessing.cpu_count(), len(file_list) ) if file_list else 0

    running = []
    failed = []
    for filename in file_list:
        print(filename)
        plot_binary_data.plot_binary_data(filename, binary_outdir, figure_outdir)
        proc = subprocess.Popen(
            [sys.executable, script, filename, binary_outdir, figure_outdir],
        )
        running.append( (proc, filename) )
        ## Keep at most max_workers subprocesses active at a time
        if len(running) >= max_workers:
            p, fn = running.pop(0)
            p.wait()
            if p.returncode != 0:
                failed.append(fn)

    ## Wait for all remaining subprocesses to finish
    for p, fn in running:
        p.wait()
        if p.returncode != 0:
            failed.append(fn)

    if failed:
        print( " WARNING: the following binary data plots failed:" )
        for fn in failed:
            print( " ", fn )

    print( )
    print( " Binary Data Plot Has been Finished " )
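The bounded fan-out above is a hand-rolled sliding window over subprocess handles. An equivalent expressed with concurrent.futures makes the throttling explicit; this is a sketch for comparison, not the patch's code:

```python
# Equivalent sketch of the bounded subprocess fan-out: a thread pool that
# merely supervises the worker subprocesses (not the patch's code).
import subprocess
import sys
from concurrent.futures import ThreadPoolExecutor

def render_one(script, filename, binary_outdir, figure_outdir):
    proc = subprocess.run([sys.executable, script, filename, binary_outdir, figure_outdir])
    return filename, proc.returncode

def render_all(script, file_list, binary_outdir, figure_outdir, max_workers):
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        results = pool.map(lambda f: render_one(script, f, binary_outdir, figure_outdir),
                           file_list)
        return [fn for fn, rc in results if rc != 0]   # filenames that failed
```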
Block a user