2026-03-02 21:36:26 +08:00
8 changed files with 954 additions and 531 deletions
--- a/AMSS_NCKU_source/MPatch.C
+++ b/AMSS_NCKU_source/MPatch.C
@@ -7,6 +7,7 @@
 #include <string>
 #include <cmath>
 #include <new>
 #include <vector>
 using namespace std;
 #include "misc.h"
@@ -17,6 +18,168 @@ using namespace std;
 #include "interp_lb_profile.h"
 #endif
 namespace
 {
 struct InterpBlockView
 {
  Block *bp;
  double llb[dim];
  double uub[dim];
 };
 struct BlockBinIndex
 {
  int bins[dim];
  double lo[dim];
  double inv[dim];
  vector<InterpBlockView> views;
  vector<vector<int>> bin_to_blocks;
  bool valid;
  BlockBinIndex() : valid(false)
  {
    for (int i = 0; i < dim; i++)
    {
      bins[i] = 1;
      lo[i] = 0.0;
      inv[i] = 0.0;
    }
  }
 };
 inline int clamp_int(int v, int lo, int hi)
 {
  return (v < lo) ? lo : ((v > hi) ? hi : v);
 }
 inline int coord_to_bin(double x, double lo, double inv, int nb)
 {
  if (nb <= 1 || inv <= 0.0)
    return 0;
  int b = int(floor((x - lo) * inv));
  return clamp_int(b, 0, nb - 1);
 }
 inline int bin_loc(const BlockBinIndex &index, int b0, int b1, int b2)
 {
  return b0 + index.bins[0] * (b1 + index.bins[1] * b2);
 }
 inline bool point_in_block_view(const InterpBlockView &view, const double *pox, const double *DH)
 {
  for (int i = 0; i < dim; i++)
  {
    if (pox[i] - view.llb[i] < -DH[i] / 2 || pox[i] - view.uub[i] > DH[i] / 2)
      return false;
  }
  return true;
 }
 void build_block_bin_index(Patch *patch, const double *DH, BlockBinIndex &index)
 {
  index = BlockBinIndex();
  MyList<Block> *Bp = patch->blb;
  while (Bp)
  {
    Block *BP = Bp->data;
    InterpBlockView view;
    view.bp = BP;
    for (int i = 0; i < dim; i++)
    {
 #ifdef Vertex
 #ifdef Cell
 #error Both Cell and Vertex are defined
 #endif
      view.llb[i] = (feq(BP->bbox[i], patch->bbox[i], DH[i] / 2)) ? BP->bbox[i] + patch->lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
      view.uub[i] = (feq(BP->bbox[dim + i], patch->bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - patch->uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
 #else
 #ifdef Cell
      view.llb[i] = (feq(BP->bbox[i], patch->bbox[i], DH[i] / 2)) ? BP->bbox[i] + patch->lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
      view.uub[i] = (feq(BP->bbox[dim + i], patch->bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - patch->uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
 #else
 #error Not define Vertex nor Cell
 #endif
 #endif
    }
    index.views.push_back(view);
    if (Bp == patch->ble)
      break;
    Bp = Bp->next;
  }
  const int nblocks = int(index.views.size());
  if (nblocks <= 0)
    return;
  int bins_1d = int(ceil(pow(double(nblocks), 1.0 / 3.0)));
  bins_1d = clamp_int(bins_1d, 1, 32);
  for (int i = 0; i < dim; i++)
  {
    index.bins[i] = bins_1d;
    index.lo[i] = patch->bbox[i] + patch->lli[i] * DH[i];
    const double hi = patch->bbox[dim + i] - patch->uui[i] * DH[i];
    if (hi > index.lo[i] && bins_1d > 1)
      index.inv[i] = bins_1d / (hi - index.lo[i]);
    else
      index.inv[i] = 0.0;
  }
  index.bin_to_blocks.resize(index.bins[0] * index.bins[1] * index.bins[2]);
  for (int bi = 0; bi < nblocks; bi++)
  {
    const InterpBlockView &view = index.views[bi];
    int bmin[dim], bmax[dim];
    for (int d = 0; d < dim; d++)
    {
      const double low = view.llb[d] - DH[d] / 2;
      const double up = view.uub[d] + DH[d] / 2;
      bmin[d] = coord_to_bin(low, index.lo[d], index.inv[d], index.bins[d]);
      bmax[d] = coord_to_bin(up, index.lo[d], index.inv[d], index.bins[d]);
      if (bmax[d] < bmin[d])
      {
        int t = bmin[d];
        bmin[d] = bmax[d];
        bmax[d] = t;
      }
    }
    for (int bz = bmin[2]; bz <= bmax[2]; bz++)
      for (int by = bmin[1]; by <= bmax[1]; by++)
        for (int bx = bmin[0]; bx <= bmax[0]; bx++)
          index.bin_to_blocks[bin_loc(index, bx, by, bz)].push_back(bi);
  }
  index.valid = true;
 }
 int find_block_index_for_point(const BlockBinIndex &index, const double *pox, const double *DH)
 {
  if (!index.valid)
    return -1;
  const int bx = coord_to_bin(pox[0], index.lo[0], index.inv[0], index.bins[0]);
  const int by = coord_to_bin(pox[1], index.lo[1], index.inv[1], index.bins[1]);
  const int bz = coord_to_bin(pox[2], index.lo[2], index.inv[2], index.bins[2]);
  const vector<int> &cand = index.bin_to_blocks[bin_loc(index, bx, by, bz)];
  for (size_t ci = 0; ci < cand.size(); ci++)
  {
    const int bi = cand[ci];
    if (point_in_block_view(index.views[bi], pox, DH))
      return bi;
  }
  // Fallback to full scan for numerical edge cases around bin boundaries.
  for (size_t bi = 0; bi < index.views.size(); bi++)
    if (point_in_block_view(index.views[bi], pox, DH))
      return int(bi);
  return -1;
 }
 } // namespace
 Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
 {
@@ -367,9 +530,11 @@ void Patch::Interp_Points(MyList<var> *VarList,
  for (int j = 0; j < NN; j++)
    owner_rank[j] = -1;
-  double DH[dim], llb[dim], uub[dim];
+  double DH[dim];
  for (int i = 0; i < dim; i++)
    DH[i] = getdX(i);
  BlockBinIndex block_index;
  build_block_bin_index(this, DH, block_index);
  for (int j = 0; j < NN; j++) // run along points
  {
@@ -392,57 +557,24 @@ void Patch::Interp_Points(MyList<var> *VarList,
      }
    }
-    MyList<Block> *Bp = blb;
+    const int block_i = find_block_index_for_point(block_index, pox, DH);
-    bool notfind = true;
+    if (block_i >= 0)
    while (notfind && Bp) // run along Blocks
    {
-      Block *BP = Bp->data;
+      Block *BP = block_index.views[block_i].bp;
-
+      owner_rank[j] = BP->rank;
-      bool flag = true;
+      if (myrank == BP->rank)
      for (int i = 0; i < dim; i++)
      {
-#ifdef Vertex
+        //---> interpolation
-#ifdef Cell
+        varl = VarList;
-#error Both Cell and Vertex are defined
+        int k = 0;
-#endif
+        while (varl) // run along variables
        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
 #else
 #ifdef Cell
        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
 #else
 #error Not define Vertex nor Cell
 #endif
 #endif
        if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
        {
-          flag = false;
+          f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
-          break;
+                          pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
          varl = varl->next;
          k++;
        }
      }
      if (flag)
      {
        notfind = false;
        owner_rank[j] = BP->rank;
        if (myrank == BP->rank)
        {
          //---> interpolation
          varl = VarList;
          int k = 0;
          while (varl) // run along variables
          {
            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
            varl = varl->next;
            k++;
          }
        }
      }
      if (Bp == ble)
        break;
      Bp = Bp->next;
    }
  }
@@ -535,9 +667,11 @@ void Patch::Interp_Points(MyList<var> *VarList,
  for (int j = 0; j < NN; j++)
    owner_rank[j] = -1;
-  double DH[dim], llb[dim], uub[dim];
+  double DH[dim];
  for (int i = 0; i < dim; i++)
    DH[i] = getdX(i);
  BlockBinIndex block_index;
  build_block_bin_index(this, DH, block_index);
  // --- Interpolation phase (identical to original) ---
  for (int j = 0; j < NN; j++)
@@ -561,56 +695,23 @@ void Patch::Interp_Points(MyList<var> *VarList,
      }
    }
-    MyList<Block> *Bp = blb;
+    const int block_i = find_block_index_for_point(block_index, pox, DH);
-    bool notfind = true;
+    if (block_i >= 0)
    while (notfind && Bp)
    {
-      Block *BP = Bp->data;
+      Block *BP = block_index.views[block_i].bp;
-
+      owner_rank[j] = BP->rank;
-      bool flag = true;
+      if (myrank == BP->rank)
      for (int i = 0; i < dim; i++)
      {
-#ifdef Vertex
+        varl = VarList;
-#ifdef Cell
+        int k = 0;
-#error Both Cell and Vertex are defined
+        while (varl)
 #endif
        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
 #else
 #ifdef Cell
        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
 #else
 #error Not define Vertex nor Cell
 #endif
 #endif
        if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
        {
-          flag = false;
+          f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
-          break;
+                          pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
          varl = varl->next;
          k++;
        }
      }
      if (flag)
      {
        notfind = false;
        owner_rank[j] = BP->rank;
        if (myrank == BP->rank)
        {
          varl = VarList;
          int k = 0;
          while (varl)
          {
            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
            varl = varl->next;
            k++;
          }
        }
      }
      if (Bp == ble)
        break;
      Bp = Bp->next;
    }
  }
@@ -833,9 +934,11 @@ void Patch::Interp_Points(MyList<var> *VarList,
  MPI_Comm_group(MPI_COMM_WORLD, &world_group);
  MPI_Comm_group(Comm_here, &local_group);
-  double DH[dim], llb[dim], uub[dim];
+  double DH[dim];
  for (int i = 0; i < dim; i++)
    DH[i] = getdX(i);
  BlockBinIndex block_index;
  build_block_bin_index(this, DH, block_index);
  for (int j = 0; j < NN; j++) // run along points
  {
@@ -858,57 +961,24 @@ void Patch::Interp_Points(MyList<var> *VarList,
      }
    }
-    MyList<Block> *Bp = blb;
+    const int block_i = find_block_index_for_point(block_index, pox, DH);
-    bool notfind = true;
+    if (block_i >= 0)
    while (notfind && Bp) // run along Blocks
    {
-      Block *BP = Bp->data;
+      Block *BP = block_index.views[block_i].bp;
-
+      owner_rank[j] = BP->rank;
-      bool flag = true;
+      if (myrank == BP->rank)
      for (int i = 0; i < dim; i++)
      {
-#ifdef Vertex
+        //---> interpolation
-#ifdef Cell
+        varl = VarList;
-#error Both Cell and Vertex are defined
+        int k = 0;
-#endif
+        while (varl) // run along variables
        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
 #else
 #ifdef Cell
        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
 #else
 #error Not define Vertex nor Cell
 #endif
 #endif
        if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
        {
-          flag = false;
+          f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
-          break;
+                          pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
          varl = varl->next;
          k++;
        }
      }
      if (flag)
      {
        notfind = false;
        owner_rank[j] = BP->rank;
        if (myrank == BP->rank)
        {
          //---> interpolation
          varl = VarList;
          int k = 0;
          while (varl) // run along variables
          {
            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
            varl = varl->next;
            k++;
          }
        }
      }
      if (Bp == ble)
        break;
      Bp = Bp->next;
    }
  }
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -3893,66 +3893,105 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
  int node;
-  MPI_Request *reqs;
+  MPI_Request *reqs = new MPI_Request[2 * cpusize];
-  MPI_Status *stats;
+  MPI_Status *stats = new MPI_Status[2 * cpusize];
-  reqs = new MPI_Request[2 * cpusize];
+  int *req_node = new int[2 * cpusize];
-  stats = new MPI_Status[2 * cpusize];
+  int *req_is_recv = new int[2 * cpusize];
  int *completed = new int[2 * cpusize];
  int req_no = 0;
  int pending_recv = 0;
-  double **send_data, **rec_data;
+  double **send_data = new double *[cpusize];
-  send_data = new double *[cpusize];
+  double **rec_data = new double *[cpusize];
-  rec_data = new double *[cpusize];
+  int *send_lengths = new int[cpusize];
-  int length;
+  int *recv_lengths = new int[cpusize];
  for (node = 0; node < cpusize; node++)
  {
    send_data[node] = rec_data[node] = 0;
-    if (node == myrank)
+    send_lengths[node] = recv_lengths[node] = 0;
  }
  // Post receives first so peers can progress rendezvous early.
  for (node = 0; node < cpusize; node++)
  {
    if (node == myrank) continue;
    recv_lengths[node] = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
    if (recv_lengths[node] > 0)
    {
-      if (length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry))
+      rec_data[node] = new double[recv_lengths[node]];
      if (!rec_data[node])
      {
-        rec_data[node] = new double[length];
+        cout << "out of memory when new in short transfer, place 1" << endl;
-        if (!rec_data[node])
+        MPI_Abort(MPI_COMM_WORLD, 1);
        {
          cout << "out of memory when new in short transfer, place 1" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        data_packer(rec_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      }
      MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
      req_node[req_no] = node;
      req_is_recv[req_no] = 1;
      req_no++;
      pending_recv++;
    }
-    else
+  }
  // Local transfer on this rank.
  recv_lengths[myrank] = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
  if (recv_lengths[myrank] > 0)
  {
    rec_data[myrank] = new double[recv_lengths[myrank]];
    if (!rec_data[myrank])
    {
-      // send from this cpu to cpu#node
+      cout << "out of memory when new in short transfer, place 2" << endl;
-      if (length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry))
+      MPI_Abort(MPI_COMM_WORLD, 1);
    }
    data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
  }
  // Pack and post sends.
  for (node = 0; node < cpusize; node++)
  {
    if (node == myrank) continue;
    send_lengths[node] = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
    if (send_lengths[node] > 0)
    {
      send_data[node] = new double[send_lengths[node]];
      if (!send_data[node])
      {
-        send_data[node] = new double[length];
+        cout << "out of memory when new in short transfer, place 3" << endl;
-        if (!send_data[node])
+        MPI_Abort(MPI_COMM_WORLD, 1);
        {
          cout << "out of memory when new in short transfer, place 2" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        data_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
        MPI_Isend((void *)send_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
      }
-      // receive from cpu#node to this cpu
+      data_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
-      if (length = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry))
+      MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
      req_node[req_no] = node;
      req_is_recv[req_no] = 0;
      req_no++;
    }
  }
  // Unpack as soon as receive completes to reduce pure wait time.
  while (pending_recv > 0)
  {
    int outcount = 0;
    MPI_Waitsome(req_no, reqs, &outcount, completed, stats);
    if (outcount == MPI_UNDEFINED) break;
    for (int i = 0; i < outcount; i++)
    {
      int idx = completed[i];
      if (idx >= 0 && req_is_recv[idx])
      {
-        rec_data[node] = new double[length];
+        int recv_node = req_node[idx];
-        if (!rec_data[node])
+        data_packer(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry);
-        {
+        pending_recv--;
          cout << "out of memory when new in short transfer, place 3" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        MPI_Irecv((void *)rec_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
      }
    }
  }
  // wait for all requests to complete
  MPI_Waitall(req_no, reqs, stats);
-  for (node = 0; node < cpusize; node++)
+  if (req_no > 0) MPI_Waitall(req_no, reqs, stats);
-    if (rec_data[node])
+
-      data_packer(rec_data[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
+  if (rec_data[myrank])
    data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
  for (node = 0; node < cpusize; node++)
  {
@@ -3964,8 +4003,13 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
  delete[] reqs;
  delete[] stats;
  delete[] req_node;
  delete[] req_is_recv;
  delete[] completed;
  delete[] send_data;
  delete[] rec_data;
  delete[] send_lengths;
  delete[] recv_lengths;
 }
 //
 void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
@@ -3978,66 +4022,105 @@ void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gri
  int node;
-  MPI_Request *reqs;
+  MPI_Request *reqs = new MPI_Request[2 * cpusize];
-  MPI_Status *stats;
+  MPI_Status *stats = new MPI_Status[2 * cpusize];
-  reqs = new MPI_Request[2 * cpusize];
+  int *req_node = new int[2 * cpusize];
-  stats = new MPI_Status[2 * cpusize];
+  int *req_is_recv = new int[2 * cpusize];
  int *completed = new int[2 * cpusize];
  int req_no = 0;
  int pending_recv = 0;
-  double **send_data, **rec_data;
+  double **send_data = new double *[cpusize];
-  send_data = new double *[cpusize];
+  double **rec_data = new double *[cpusize];
-  rec_data = new double *[cpusize];
+  int *send_lengths = new int[cpusize];
-  int length;
+  int *recv_lengths = new int[cpusize];
  for (node = 0; node < cpusize; node++)
  {
    send_data[node] = rec_data[node] = 0;
-    if (node == myrank)
+    send_lengths[node] = recv_lengths[node] = 0;
  }
  // Post receives first so peers can progress rendezvous early.
  for (node = 0; node < cpusize; node++)
  {
    if (node == myrank) continue;
    recv_lengths[node] = data_packermix(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
    if (recv_lengths[node] > 0)
    {
-      if (length = data_packermix(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry))
+      rec_data[node] = new double[recv_lengths[node]];
      if (!rec_data[node])
      {
-        rec_data[node] = new double[length];
+        cout << "out of memory when new in short transfer, place 1" << endl;
-        if (!rec_data[node])
+        MPI_Abort(MPI_COMM_WORLD, 1);
        {
          cout << "out of memory when new in short transfer, place 1" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        data_packermix(rec_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      }
      MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
      req_node[req_no] = node;
      req_is_recv[req_no] = 1;
      req_no++;
      pending_recv++;
    }
-    else
+  }
  // Local transfer on this rank.
  recv_lengths[myrank] = data_packermix(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
  if (recv_lengths[myrank] > 0)
  {
    rec_data[myrank] = new double[recv_lengths[myrank]];
    if (!rec_data[myrank])
    {
-      // send from this cpu to cpu#node
+      cout << "out of memory when new in short transfer, place 2" << endl;
-      if (length = data_packermix(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry))
+      MPI_Abort(MPI_COMM_WORLD, 1);
    }
    data_packermix(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
  }
  // Pack and post sends.
  for (node = 0; node < cpusize; node++)
  {
    if (node == myrank) continue;
    send_lengths[node] = data_packermix(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
    if (send_lengths[node] > 0)
    {
      send_data[node] = new double[send_lengths[node]];
      if (!send_data[node])
      {
-        send_data[node] = new double[length];
+        cout << "out of memory when new in short transfer, place 3" << endl;
-        if (!send_data[node])
+        MPI_Abort(MPI_COMM_WORLD, 1);
        {
          cout << "out of memory when new in short transfer, place 2" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        data_packermix(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
        MPI_Isend((void *)send_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
      }
-      // receive from cpu#node to this cpu
+      data_packermix(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
-      if (length = data_packermix(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry))
+      MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
      req_node[req_no] = node;
      req_is_recv[req_no] = 0;
      req_no++;
    }
  }
  // Unpack as soon as receive completes to reduce pure wait time.
  while (pending_recv > 0)
  {
    int outcount = 0;
    MPI_Waitsome(req_no, reqs, &outcount, completed, stats);
    if (outcount == MPI_UNDEFINED) break;
    for (int i = 0; i < outcount; i++)
    {
      int idx = completed[i];
      if (idx >= 0 && req_is_recv[idx])
      {
-        rec_data[node] = new double[length];
+        int recv_node = req_node[idx];
-        if (!rec_data[node])
+        data_packermix(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry);
-        {
+        pending_recv--;
          cout << "out of memory when new in short transfer, place 3" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        MPI_Irecv((void *)rec_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
      }
    }
  }
  // wait for all requests to complete
  MPI_Waitall(req_no, reqs, stats);
-  for (node = 0; node < cpusize; node++)
+  if (req_no > 0) MPI_Waitall(req_no, reqs, stats);
-    if (rec_data[node])
+
-      data_packermix(rec_data[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
+  if (rec_data[myrank])
    data_packermix(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
  for (node = 0; node < cpusize; node++)
  {
@@ -4049,8 +4132,13 @@ void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gri
  delete[] reqs;
  delete[] stats;
  delete[] req_node;
  delete[] req_is_recv;
  delete[] completed;
  delete[] send_data;
  delete[] rec_data;
  delete[] send_lengths;
  delete[] recv_lengths;
 }
 void Parallel::Sync(Patch *Pat, MyList<var> *VarList, int Symmetry)
 {
@@ -4232,7 +4320,7 @@ Parallel::SyncCache::SyncCache()
    : valid(false), cpusize(0), combined_src(0), combined_dst(0),
      send_lengths(0), recv_lengths(0), send_bufs(0), recv_bufs(0),
      send_buf_caps(0), recv_buf_caps(0), reqs(0), stats(0), max_reqs(0),
-      lengths_valid(false)
+      lengths_valid(false), tc_req_node(0), tc_req_is_recv(0), tc_completed(0)
 {
 }
 // SyncCache invalidate: free grid segment lists but keep buffers
@@ -4271,11 +4359,15 @@ void Parallel::SyncCache::destroy()
  if (recv_bufs) delete[] recv_bufs;
  if (reqs) delete[] reqs;
  if (stats) delete[] stats;
  if (tc_req_node) delete[] tc_req_node;
  if (tc_req_is_recv) delete[] tc_req_is_recv;
  if (tc_completed) delete[] tc_completed;
  combined_src = combined_dst = 0;
  send_lengths = recv_lengths = 0;
  send_buf_caps = recv_buf_caps = 0;
  send_bufs = recv_bufs = 0;
  reqs = 0; stats = 0;
  tc_req_node = 0; tc_req_is_recv = 0; tc_completed = 0;
  cpusize = 0; max_reqs = 0;
 }
 // transfer_cached: reuse pre-allocated buffers from SyncCache
@@ -4289,64 +4381,96 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
  int cpusize = cache.cpusize;
  int req_no = 0;
  int pending_recv = 0;
  int node;
  int *req_node = cache.tc_req_node;
  int *req_is_recv = cache.tc_req_is_recv;
  int *completed = cache.tc_completed;
  // Post receives first so peers can progress rendezvous early.
  for (node = 0; node < cpusize; node++)
  {
-    if (node == myrank)
+    if (node == myrank) continue;
    int rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
    cache.recv_lengths[node] = rlength;
    if (rlength > 0)
    {
-      int length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
+      if (rlength > cache.recv_buf_caps[node])
      cache.recv_lengths[node] = length;
      if (length > 0)
      {
-        if (length > cache.recv_buf_caps[node])
+        if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
-        {
+        cache.recv_bufs[node] = new double[rlength];
-          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
+        cache.recv_buf_caps[node] = rlength;
          cache.recv_bufs[node] = new double[length];
          cache.recv_buf_caps[node] = length;
        }
        data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      }
      MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
      req_node[req_no] = node;
      req_is_recv[req_no] = 1;
      req_no++;
      pending_recv++;
    }
-    else
+  }
  // Local transfer on this rank.
  int self_len = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
  cache.recv_lengths[myrank] = self_len;
  if (self_len > 0)
  {
    if (self_len > cache.recv_buf_caps[myrank])
    {
-      // send
+      if (cache.recv_bufs[myrank]) delete[] cache.recv_bufs[myrank];
-      int slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
+      cache.recv_bufs[myrank] = new double[self_len];
-      cache.send_lengths[node] = slength;
+      cache.recv_buf_caps[myrank] = self_len;
-      if (slength > 0)
+    }
    data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
  }
  // Pack and post sends.
  for (node = 0; node < cpusize; node++)
  {
    if (node == myrank) continue;
    int slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
    cache.send_lengths[node] = slength;
    if (slength > 0)
    {
      if (slength > cache.send_buf_caps[node])
      {
-        if (slength > cache.send_buf_caps[node])
+        if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
-        {
+        cache.send_bufs[node] = new double[slength];
-          if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
+        cache.send_buf_caps[node] = slength;
          cache.send_bufs[node] = new double[slength];
          cache.send_buf_caps[node] = slength;
        }
        data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
        MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
      }
-      // recv
+      data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
-      int rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
+      MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
-      cache.recv_lengths[node] = rlength;
+      req_node[req_no] = node;
-      if (rlength > 0)
+      req_is_recv[req_no] = 0;
      req_no++;
    }
  }
  // Unpack as soon as receive completes to reduce pure wait time.
  while (pending_recv > 0)
  {
    int outcount = 0;
    MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats);
    if (outcount == MPI_UNDEFINED) break;
    for (int i = 0; i < outcount; i++)
    {
      int idx = completed[i];
      if (idx >= 0 && req_is_recv[idx])
      {
-        if (rlength > cache.recv_buf_caps[node])
+        int recv_node_i = req_node[idx];
-        {
+        data_packer(cache.recv_bufs[recv_node_i], src[recv_node_i], dst[recv_node_i], recv_node_i, UNPACK, VarList1, VarList2, Symmetry);
-          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
+        pending_recv--;
          cache.recv_bufs[node] = new double[rlength];
          cache.recv_buf_caps[node] = rlength;
        }
        MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
      }
    }
  }
-  MPI_Waitall(req_no, cache.reqs, cache.stats);
+  if (req_no > 0) MPI_Waitall(req_no, cache.reqs, cache.stats);
-  for (node = 0; node < cpusize; node++)
+  if (self_len > 0)
-    if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0)
+    data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
      data_packer(cache.recv_bufs[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
 }
 // Sync_cached: build grid segment lists on first call, reuse on subsequent calls
 void Parallel::Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache)
 {
  if (!cache.valid)
@@ -4374,6 +4498,9 @@ void Parallel::Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmet
      cache.max_reqs = 2 * cpusize;
      cache.reqs = new MPI_Request[cache.max_reqs];
      cache.stats = new MPI_Status[cache.max_reqs];
      cache.tc_req_node = new int[cache.max_reqs];
      cache.tc_req_is_recv = new int[cache.max_reqs];
      cache.tc_completed = new int[cache.max_reqs];
    }
    for (int node = 0; node < cpusize; node++)
@@ -4474,6 +4601,9 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
      cache.max_reqs = 2 * cpusize;
      cache.reqs = new MPI_Request[cache.max_reqs];
      cache.stats = new MPI_Status[cache.max_reqs];
      cache.tc_req_node = new int[cache.max_reqs];
      cache.tc_req_is_recv = new int[cache.max_reqs];
      cache.tc_completed = new int[cache.max_reqs];
    }
    for (int node = 0; node < cpusize; node++)
@@ -4544,6 +4674,11 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
  int cpusize = cache.cpusize;
  state.req_no = 0;
  state.active = true;
  state.pending_recv = 0;
  // Allocate tracking arrays
  delete[] state.req_node; delete[] state.req_is_recv;
  state.req_node = new int[cache.max_reqs];
  state.req_is_recv = new int[cache.max_reqs];
  MyList<Parallel::gridseg> **src = cache.combined_src;
  MyList<Parallel::gridseg> **dst = cache.combined_dst;
@@ -4588,6 +4723,8 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
          cache.send_buf_caps[node] = slength;
        }
        data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
        state.req_node[state.req_no] = node;
        state.req_is_recv[state.req_no] = 0;
        MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
      }
      int rlength;
@@ -4605,29 +4742,60 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
          cache.recv_bufs[node] = new double[rlength];
          cache.recv_buf_caps[node] = rlength;
        }
        state.req_node[state.req_no] = node;
        state.req_is_recv[state.req_no] = 1;
        state.pending_recv++;
        MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
      }
    }
  }
  cache.lengths_valid = true;
 }
-// Sync_finish: wait for async MPI operations and unpack
+// Sync_finish: progressive unpack as receives complete, then wait for sends
 void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state,
                           MyList<var> *VarList, int Symmetry)
 {
  if (!state.active)
    return;
-  MPI_Waitall(state.req_no, cache.reqs, cache.stats);
+  int myrank;
-
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  int cpusize = cache.cpusize;
  MyList<Parallel::gridseg> **src = cache.combined_src;
  MyList<Parallel::gridseg> **dst = cache.combined_dst;
-  for (int node = 0; node < cpusize; node++)
+  // Unpack local data first (no MPI needed)
-    if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0)
+  if (cache.recv_bufs[myrank] && cache.recv_lengths[myrank] > 0)
-      data_packer(cache.recv_bufs[node], src[node], dst[node], node, UNPACK, VarList, VarList, Symmetry);
+    data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList, VarList, Symmetry);
  // Progressive unpack of remote receives
  if (state.pending_recv > 0 && state.req_no > 0)
  {
    int pending = state.pending_recv;
    int *completed = new int[cache.max_reqs];
    while (pending > 0)
    {
      int outcount = 0;
      MPI_Waitsome(state.req_no, cache.reqs, &outcount, completed, cache.stats);
      if (outcount == MPI_UNDEFINED) break;
      for (int i = 0; i < outcount; i++)
      {
        int idx = completed[i];
        if (idx >= 0 && state.req_is_recv[idx])
        {
          int recv_node = state.req_node[idx];
          data_packer(cache.recv_bufs[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList, VarList, Symmetry);
          pending--;
        }
      }
    }
    delete[] completed;
  }
  // Wait for remaining sends
  if (state.req_no > 0) MPI_Waitall(state.req_no, cache.reqs, cache.stats);
  delete[] state.req_node; state.req_node = 0;
  delete[] state.req_is_recv; state.req_is_recv = 0;
  state.active = false;
 }
 // collect buffer grid segments or blocks for the periodic boundary condition of given patch
@@ -5694,6 +5862,9 @@ void Parallel::Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
      cache.max_reqs = 2 * cpusize;
      cache.reqs = new MPI_Request[cache.max_reqs];
      cache.stats = new MPI_Status[cache.max_reqs];
      cache.tc_req_node = new int[cache.max_reqs];
      cache.tc_req_is_recv = new int[cache.max_reqs];
      cache.tc_completed = new int[cache.max_reqs];
    }
    MyList<Parallel::gridseg> *dst = build_complete_gsl(PatcL);
@@ -5740,6 +5911,9 @@ void Parallel::OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
      cache.max_reqs = 2 * cpusize;
      cache.reqs = new MPI_Request[cache.max_reqs];
      cache.stats = new MPI_Status[cache.max_reqs];
      cache.tc_req_node = new int[cache.max_reqs];
      cache.tc_req_is_recv = new int[cache.max_reqs];
      cache.tc_completed = new int[cache.max_reqs];
    }
    MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
@@ -5786,6 +5960,9 @@ void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
      cache.max_reqs = 2 * cpusize;
      cache.reqs = new MPI_Request[cache.max_reqs];
      cache.stats = new MPI_Status[cache.max_reqs];
      cache.tc_req_node = new int[cache.max_reqs];
      cache.tc_req_is_recv = new int[cache.max_reqs];
      cache.tc_completed = new int[cache.max_reqs];
    }
    MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
@@ -5807,58 +5984,98 @@ void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
  int cpusize = cache.cpusize;
  int req_no = 0;
  int pending_recv = 0;
  int *req_node = new int[cache.max_reqs];
  int *req_is_recv = new int[cache.max_reqs];
  int *completed = new int[cache.max_reqs];
  // Post receives first so peers can progress rendezvous early.
  for (int node = 0; node < cpusize; node++)
  {
-    if (node == myrank)
+    if (node == myrank) continue;
    int rlength = data_packermix(0, cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
    cache.recv_lengths[node] = rlength;
    if (rlength > 0)
    {
-      int length = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
+      if (rlength > cache.recv_buf_caps[node])
      cache.recv_lengths[node] = length;
      if (length > 0)
      {
-        if (length > cache.recv_buf_caps[node])
+        if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
-        {
+        cache.recv_bufs[node] = new double[rlength];
-          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
+        cache.recv_buf_caps[node] = rlength;
          cache.recv_bufs[node] = new double[length];
          cache.recv_buf_caps[node] = length;
        }
        data_packermix(cache.recv_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      }
      MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
      req_node[req_no] = node;
      req_is_recv[req_no] = 1;
      req_no++;
      pending_recv++;
    }
-    else
+  }
  // Local transfer on this rank.
  int self_len = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
  cache.recv_lengths[myrank] = self_len;
  if (self_len > 0)
  {
    if (self_len > cache.recv_buf_caps[myrank])
    {
-      int slength = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
+      if (cache.recv_bufs[myrank]) delete[] cache.recv_bufs[myrank];
-      cache.send_lengths[node] = slength;
+      cache.recv_bufs[myrank] = new double[self_len];
-      if (slength > 0)
+      cache.recv_buf_caps[myrank] = self_len;
    }
    data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
  }
  // Pack and post sends.
  for (int node = 0; node < cpusize; node++)
  {
    if (node == myrank) continue;
    int slength = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
    cache.send_lengths[node] = slength;
    if (slength > 0)
    {
      if (slength > cache.send_buf_caps[node])
      {
-        if (slength > cache.send_buf_caps[node])
+        if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
-        {
+        cache.send_bufs[node] = new double[slength];
-          if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
+        cache.send_buf_caps[node] = slength;
          cache.send_bufs[node] = new double[slength];
          cache.send_buf_caps[node] = slength;
        }
        data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
        MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
      }
-      int rlength = data_packermix(0, cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
+      data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
-      cache.recv_lengths[node] = rlength;
+      MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
-      if (rlength > 0)
+      req_node[req_no] = node;
      req_is_recv[req_no] = 0;
      req_no++;
    }
  }
  // Unpack as soon as receive completes to reduce pure wait time.
  while (pending_recv > 0)
  {
    int outcount = 0;
    MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats);
    if (outcount == MPI_UNDEFINED) break;
    for (int i = 0; i < outcount; i++)
    {
      int idx = completed[i];
      if (idx >= 0 && req_is_recv[idx])
      {
-        if (rlength > cache.recv_buf_caps[node])
+        int recv_node_i = req_node[idx];
-        {
+        data_packermix(cache.recv_bufs[recv_node_i], cache.combined_src[recv_node_i], cache.combined_dst[recv_node_i], recv_node_i, UNPACK, VarList1, VarList2, Symmetry);
-          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
+        pending_recv--;
          cache.recv_bufs[node] = new double[rlength];
          cache.recv_buf_caps[node] = rlength;
        }
        MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
      }
    }
  }
-  MPI_Waitall(req_no, cache.reqs, cache.stats);
+  if (req_no > 0) MPI_Waitall(req_no, cache.reqs, cache.stats);
-  for (int node = 0; node < cpusize; node++)
+  if (self_len > 0)
-    if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0)
+    data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
-      data_packermix(cache.recv_bufs[node], cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
+
  delete[] req_node;
  delete[] req_is_recv;
  delete[] completed;
 }
 // collect all buffer grid segments or blocks for given patch
--- a/AMSS_NCKU_source/Parallel.h
+++ b/AMSS_NCKU_source/Parallel.h
@@ -108,6 +108,9 @@ namespace Parallel
    MPI_Status *stats;
    int max_reqs;
    bool lengths_valid;
    int *tc_req_node;
    int *tc_req_is_recv;
    int *tc_completed;
    SyncCache();
    void invalidate();
    void destroy();
@@ -121,7 +124,10 @@ namespace Parallel
  struct AsyncSyncState {
    int req_no;
    bool active;
-    AsyncSyncState() : req_no(0), active(false) {}
+    int *req_node;
    int *req_is_recv;
    int pending_recv;
    AsyncSyncState() : req_no(0), active(false), req_node(0), req_is_recv(0), pending_recv(0) {}
  };
  void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -736,6 +736,8 @@ void bssn_class::Initialize()
  sync_cache_cor = new Parallel::SyncCache[GH->levels];
  sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
  sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
  sync_cache_restrict = new Parallel::SyncCache[GH->levels];
  sync_cache_outbd = new Parallel::SyncCache[GH->levels];
 }
 //================================================================================================
@@ -2213,7 +2215,7 @@ void bssn_class::Evolve(int Steps)
    GH->Regrid(Symmetry, BH_num, Porgbr, Porg0,
               SynchList_cor, OldStateList, StateList, SynchList_pre,
               fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor);
-    for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
+    for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
 #endif
 #if (REGLEV == 0 && (PSTR == 1 || PSTR == 2))
@@ -2429,7 +2431,7 @@ void bssn_class::RecursiveStep(int lev)
  if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
                      SynchList_cor, OldStateList, StateList, SynchList_pre,
                      fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
-  for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
+  for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
 #endif
 }
@@ -2608,7 +2610,7 @@ void bssn_class::ParallelStep()
  if (GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
                      SynchList_cor, OldStateList, StateList, SynchList_pre,
                      fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
-  for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
+  for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
 #endif
 }
@@ -2775,7 +2777,7 @@ void bssn_class::ParallelStep()
        if (GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
                            SynchList_cor, OldStateList, StateList, SynchList_pre,
                            fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor))
-        for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
+        for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
        //               a_stream.clear();
        //               a_stream.str("");
@@ -2790,7 +2792,7 @@ void bssn_class::ParallelStep()
      if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
                          SynchList_cor, OldStateList, StateList, SynchList_pre,
                          fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
-      for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
+      for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
      //               a_stream.clear();
      //               a_stream.str("");
@@ -2809,7 +2811,7 @@ void bssn_class::ParallelStep()
          if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
                              SynchList_cor, OldStateList, StateList, SynchList_pre,
                              fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
-          for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
+          for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
          //               a_stream.clear();
          //               a_stream.str("");
@@ -2825,7 +2827,7 @@ void bssn_class::ParallelStep()
          if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
                              SynchList_cor, OldStateList, StateList, SynchList_pre,
                              fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
-          for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
+          for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
          //               a_stream.clear();
          //               a_stream.str("");
@@ -5796,7 +5798,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif
 #if (RPB == 0)
-      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry);
+      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
 #elif (RPB == 1)
      //       Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SynchList_pre,Symmetry);
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
@@ -5820,7 +5822,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
+      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry, sync_cache_outbd[lev]);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
 #endif
@@ -5847,7 +5849,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif
 #if (RPB == 0)
-      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
+      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_restrict[lev]);
 #elif (RPB == 1)
      //       Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
@@ -5871,7 +5873,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
+      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_outbd[lev]);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
 #endif
@@ -5940,7 +5942,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
      }
 #if (RPB == 0)
-      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry);
+      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
 #elif (RPB == 1)
      //       Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SynchList_pre,Symmetry);
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
@@ -5950,7 +5952,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
+      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry, sync_cache_outbd[lev]);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
 #endif
@@ -5962,7 +5964,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
    else // no time refinement levels and for all same time levels
    {
 #if (RPB == 0)
-      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
+      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_restrict[lev]);
 #elif (RPB == 1)
      //       Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
@@ -5972,7 +5974,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
+      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_outbd[lev]);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
 #endif
@@ -6027,7 +6029,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
      }
 #if (RPB == 0)
-      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry);
+      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
 #elif (RPB == 1)
      //       Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_cor,SynchList_pre,Symmetry);
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, GH->rsul[lev], Symmetry);
@@ -6037,7 +6039,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
+      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
 #endif
@@ -6051,7 +6053,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
      if (myrank == 0)
        cout << "===: " << GH->Lt[lev - 1] << "," << GH->Lt[lev] + dT_lev << endl;
 #if (RPB == 0)
-      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry);
+      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry, sync_cache_restrict[lev]);
 #elif (RPB == 1)
      //       Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_cor,StateList,Symmetry);
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, GH->rsul[lev], Symmetry);
@@ -6061,7 +6063,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
+      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
 #endif
@@ -6102,7 +6104,7 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
+      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
 #endif
@@ -6115,7 +6117,7 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
    {
 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
+      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
 #endif
--- a/AMSS_NCKU_source/bssn_class.h
+++ b/AMSS_NCKU_source/bssn_class.h
@@ -130,6 +130,8 @@ public:
       Parallel::SyncCache *sync_cache_cor;  // per-level cache for corrector sync
       Parallel::SyncCache *sync_cache_rp_coarse;  // RestrictProlong sync on PatL[lev-1]
       Parallel::SyncCache *sync_cache_rp_fine;    // RestrictProlong sync on PatL[lev]
       Parallel::SyncCache *sync_cache_restrict;   // cached Restrict in RestrictProlong
       Parallel::SyncCache *sync_cache_outbd;      // cached OutBdLow2Hi in RestrictProlong
       monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
       monitor *ConVMonitor;
--- a/AMSS_NCKU_source/diff_newwb.f90
+++ b/AMSS_NCKU_source/diff_newwb.f90
@@ -1513,6 +1513,7 @@
  real*8,dimension(-1:ex(1),-1:ex(2),-1:ex(3))   :: fh
  real*8, dimension(3) :: SoA
  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
  integer :: i_core_min,i_core_max,j_core_min,j_core_max,k_core_min,k_core_max
  real*8  :: Sdxdx,Sdydy,Sdzdz,Fdxdx,Fdydy,Fdzdz
  real*8  :: Sdxdy,Sdxdz,Sdydz,Fdxdy,Fdxdz,Fdydz
  integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
@@ -1565,9 +1566,47 @@
  fxz = ZEO
  fyz = ZEO
  i_core_min = max(1, imin+2)
  i_core_max = min(ex(1), imax-2)
  j_core_min = max(1, jmin+2)
  j_core_max = min(ex(2), jmax-2)
  k_core_min = max(1, kmin+2)
  k_core_max = min(ex(3), kmax-2)
  if(i_core_min <= i_core_max .and. j_core_min <= j_core_max .and. k_core_min <= k_core_max)then
   do k=k_core_min,k_core_max
   do j=j_core_min,j_core_max
   do i=i_core_min,i_core_max
 ! interior points always use 4th-order stencils without branch checks
      fxx(i,j,k) = Fdxdx*(-fh(i-2,j,k)+F16*fh(i-1,j,k)-F30*fh(i,j,k) &
                          -fh(i+2,j,k)+F16*fh(i+1,j,k)              )
      fyy(i,j,k) = Fdydy*(-fh(i,j-2,k)+F16*fh(i,j-1,k)-F30*fh(i,j,k) &
                          -fh(i,j+2,k)+F16*fh(i,j+1,k)              )
      fzz(i,j,k) = Fdzdz*(-fh(i,j,k-2)+F16*fh(i,j,k-1)-F30*fh(i,j,k) &
                          -fh(i,j,k+2)+F16*fh(i,j,k+1)              )
      fxy(i,j,k) = Fdxdy*(     (fh(i-2,j-2,k)-F8*fh(i-1,j-2,k)+F8*fh(i+1,j-2,k)-fh(i+2,j-2,k))  &
                          -F8 *(fh(i-2,j-1,k)-F8*fh(i-1,j-1,k)+F8*fh(i+1,j-1,k)-fh(i+2,j-1,k))  &
                          +F8 *(fh(i-2,j+1,k)-F8*fh(i-1,j+1,k)+F8*fh(i+1,j+1,k)-fh(i+2,j+1,k))  &
                          -    (fh(i-2,j+2,k)-F8*fh(i-1,j+2,k)+F8*fh(i+1,j+2,k)-fh(i+2,j+2,k)))
      fxz(i,j,k) = Fdxdz*(     (fh(i-2,j,k-2)-F8*fh(i-1,j,k-2)+F8*fh(i+1,j,k-2)-fh(i+2,j,k-2))  &
                          -F8 *(fh(i-2,j,k-1)-F8*fh(i-1,j,k-1)+F8*fh(i+1,j,k-1)-fh(i+2,j,k-1))  &
                          +F8 *(fh(i-2,j,k+1)-F8*fh(i-1,j,k+1)+F8*fh(i+1,j,k+1)-fh(i+2,j,k+1))  &
                          -    (fh(i-2,j,k+2)-F8*fh(i-1,j,k+2)+F8*fh(i+1,j,k+2)-fh(i+2,j,k+2)))
      fyz(i,j,k) = Fdydz*(     (fh(i,j-2,k-2)-F8*fh(i,j-1,k-2)+F8*fh(i,j+1,k-2)-fh(i,j+2,k-2))  &
                          -F8 *(fh(i,j-2,k-1)-F8*fh(i,j-1,k-1)+F8*fh(i,j+1,k-1)-fh(i,j+2,k-1))  &
                          +F8 *(fh(i,j-2,k+1)-F8*fh(i,j-1,k+1)+F8*fh(i,j+1,k+1)-fh(i,j+2,k+1))  &
                          -    (fh(i,j-2,k+2)-F8*fh(i,j-1,k+2)+F8*fh(i,j+1,k+2)-fh(i,j+2,k+2)))
   enddo
   enddo
   enddo
  endif
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
      if(i>=i_core_min .and. i<=i_core_max .and. &
         j>=j_core_min .and. j<=j_core_max .and. &
         k>=k_core_min .and. k<=k_core_max) cycle
 !~~~~~~ fxx
        if(i+2 <= imax .and. i-2 >= imin)then
 !
--- a/AMSS_NCKU_source/fdderivs_c.C
+++ b/AMSS_NCKU_source/fdderivs_c.C
@@ -141,12 +141,26 @@ void fdderivs(const int ex[3],
    const int j4_hi = ex2 - 3;
    const int k4_hi = ex3 - 3;
    /*
     * Strategy A:
     * Avoid redundant work in overlap of 2nd/4th-order regions.
     * Only compute 2nd-order on shell points that are NOT overwritten by
     * the 4th-order pass.
     */
    const int has4 = (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi);
    if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) {
        for (int k0 = k2_lo; k0 <= k2_hi; ++k0) {
            const int kF = k0 + 1;
            for (int j0 = j2_lo; j0 <= j2_hi; ++j0) {
                const int jF = j0 + 1;
                for (int i0 = i2_lo; i0 <= i2_hi; ++i0) {
                    if (has4 &&
                        i0 >= i4_lo && i0 <= i4_hi &&
                        j0 >= j4_lo && j0 <= j4_hi &&
                        k0 >= k4_lo && k0 <= k4_hi) {
                        continue;
                    }
                    const int iF = i0 + 1;
                    const size_t p = idx_ex(i0, j0, k0, ex);
@@ -193,7 +207,7 @@ void fdderivs(const int ex[3],
        }
    }
-    if (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi) {
+    if (has4) {
        for (int k0 = k4_lo; k0 <= k4_hi; ++k0) {
            const int kF = k0 + 1;
            for (int j0 = j4_lo; j0 <= j4_hi; ++j0) {
--- a/AMSS_NCKU_source/prolongrestrict_cell.f90
+++ b/AMSS_NCKU_source/prolongrestrict_cell.f90
@@ -1955,7 +1955,14 @@
  integer::maxcx,maxcy,maxcz
  real*8,dimension(3) :: CD,FD
-  
+  real*8 :: tmp_yz(extc(1), 6)      ! 存储整条 X 线上 6 个 Y 轴偏置的 Z 向插值结果
  real*8 :: tmp_xyz_line(-2:extc(1))   ! 包含 X 向 6 点模板访问所需下界
  real*8 :: v1, v2, v3, v4, v5, v6
  integer :: ic, jc, kc, ix_offset,ix,iy,iz,jc_min,jc_max,ic_min,ic_max,kc_min,kc_max
  integer :: i_lo, i_hi, j_lo, j_hi, k_lo, k_hi
  logical :: need_full_symmetry
  real*8 :: res_line
  real*8 :: tmp_z_slab(-2:extc(1), -2:extc(2))  ! 包含 Y/X 向模板访问所需下界
  if(wei.ne.3)then
     write(*,*)"prolongrestrict.f90::prolong3: this routine only surport 3 dimension"
     write(*,*)"dim = ",wei
@@ -2058,130 +2065,112 @@
     endif
  enddo
-  maxcx = maxval(cix(imino:imaxo))
+  ic_min = minval(cix(imino:imaxo))
-  maxcy = maxval(ciy(jmino:jmaxo))
+  ic_max = maxval(cix(imino:imaxo))
-  maxcz = maxval(ciz(kmino:kmaxo))
+  jc_min = minval(ciy(jmino:jmaxo))
  jc_max = maxval(ciy(jmino:jmaxo))
  kc_min = minval(ciz(kmino:kmaxo))
  kc_max = maxval(ciz(kmino:kmaxo))
  maxcx = ic_max
  maxcy = jc_max
  maxcz = kc_max
  if(maxcx+3 > extc(1) .or. maxcy+3 > extc(2) .or. maxcz+3 > extc(3))then
     write(*,*)"error in prolong"
     return
  endif
-  call symmetry_bd(3,extc,func,funcc,SoA)
+  i_lo = ic_min - 2
  i_hi = ic_max + 3
  j_lo = jc_min - 2
  j_hi = jc_max + 3
  k_lo = kc_min - 2
  k_hi = kc_max + 3
  need_full_symmetry = (i_lo < 1) .or. (j_lo < 1) .or. (k_lo < 1)
  if(need_full_symmetry)then
     call symmetry_bd(3,extc,func,funcc,SoA)
  else
     funcc(i_lo:i_hi,j_lo:j_hi,k_lo:k_hi) = func(i_lo:i_hi,j_lo:j_hi,k_lo:k_hi)
  endif
     ! 对每个 k（pz, kc 固定）预计算 Z 向插值的 2D 切片
 do k = kmino, kmaxo
    pz = piz(k); kc = ciz(k)
    ! --- Pass 1: Z 方向，只算一次 ---
    do iy = jc_min-2, jc_max+3   ! 仅需的 iy 范围（对应 jc-2:jc+3）
        do ii = ic_min-2, ic_max+3  ! 仅需的 ii 范围（对应 cix-2:cix+3）
            tmp_z_slab(ii, iy) = sum(WC(:,pz) * funcc(ii, iy, kc-2:kc+3))
        end do
    end do
    do j = jmino, jmaxo
        py = piy(j); jc = ciy(j)
        ! --- Pass 2: Y 方向 ---
        do ii = ic_min-2, ic_max+3
            tmp_xyz_line(ii) = sum(WC(:,py) * tmp_z_slab(ii, jc-2:jc+3))
        end do
        ! --- Pass 3: X 方向 ---
        do i = imino, imaxo
            funf(i,j,k) = sum(WC(:,pix(i)) * tmp_xyz_line(cix(i)-2:cix(i)+3))
        end do
    end do
 end do
 !~~~~~~> prolongation start...
  do k = kmino,kmaxo
   do j = jmino,jmaxo
    do i = imino,imaxo
       cxI(1) = cix(i)
       cxI(2) = ciy(j)
       cxI(3) = ciz(k)
       px = pix(i)
       py = piy(j)
       pz = piz(k)
 #if 0
-       if(ii/2*2==ii)then
+ do k = kmino, kmaxo
-         if(jj/2*2==jj)then
+     pz = piz(k)
-           if(kk/2*2==kk)then
+     kc = ciz(k)
             tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
                   C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
             tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
             funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
           else
             tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
                   C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
             tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
             funf(i,j,k)=  C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
           endif
         else
           if(kk/2*2==kk)then
             tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
                   C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
             tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
             funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
           else
             tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
                   C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
             tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
             funf(i,j,k)=  C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
           endif
         endif
       else
         if(jj/2*2==jj)then
           if(kk/2*2==kk)then               
             tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
                   C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
             tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
             funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
           else
             tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
                   C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
             tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
             funf(i,j,k)=  C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
           endif
         else
           if(kk/2*2==kk)then
             tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
                   C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
             tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
             funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
           else
             tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
                   C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
             tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
             funf(i,j,k)=  C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
           endif
         endif
       endif
 #else 
       tmp2= WC(1,pz)*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
             WC(2,pz)*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
             WC(3,pz)*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
             WC(4,pz)*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
             WC(5,pz)*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
             WC(6,pz)*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
-       tmp1= WC(1,py)*tmp2(:,1)+WC(2,py)*tmp2(:,2)+WC(3,py)*tmp2(:,3)+&
+     do j = jmino, jmaxo
-             WC(4,py)*tmp2(:,4)+WC(5,py)*tmp2(:,5)+WC(6,py)*tmp2(:,6)
+        py = piy(j)
        jc = ciy(j)
-       funf(i,j,k)= WC(1,px)*tmp1(1)+WC(2,px)*tmp1(2)+WC(3,px)*tmp1(3)+&
+! --- 步骤 1 & 2 融合：分段处理 X 轴，提升 Cache 命中率 ---
-                    WC(4,px)*tmp1(4)+WC(5,px)*tmp1(5)+WC(6,px)*tmp1(6)
+        ! 我们将 ii 循环逻辑重组，减少对 funcc 的跨行重复访问
        do ii = 1, extc(1)
           ! 1. 先做 Z 方向的 6 条线插值（针对当前的 ii 和当前的 6 个 iy）
           ! 我们直接在这里把 Y 方向的加权也做了，省去 tmp_yz 数组
           ! 这样 funcc 的数据读进来后立即完成所有维度的贡献，不再写回内存
           res_line = 0.0d0
           do jj = 1, 6
              iy = jc - 3 + jj
              ! 这一行代码是核心：一次性完成 Z 插值并加上 Y 的权重
              ! 编译器会把 WC(jj, py) 存在寄存器里
              res_line = res_line + WC(jj, py) * ( &
                         WC(1, pz) * funcc(ii, iy, kc-2) + &
                         WC(2, pz) * funcc(ii, iy, kc-1) + &
                         WC(3, pz) * funcc(ii, iy, kc  ) + &
                         WC(4, pz) * funcc(ii, iy, kc+1) + &
                         WC(5, pz) * funcc(ii, iy, kc+2) + &
                         WC(6, pz) * funcc(ii, iy, kc+3) )
           end do
           tmp_xyz_line(ii) = res_line
        end do
        ! 3. 【降维：X 向】最后在最内层只处理 X 方向的 6 点加权
        ! 此时每个点的计算量从原来的 200+ 次乘法降到了仅 6 次
        do i = imino, imaxo
           px = pix(i)
           ic = cix(i)
           ! 直接从预计算好的 line 中读取连续的 6 个点
           ! ic-2 到 ic+3 对应原始 6 点算子
           funf(i,j,k) = WC(1,px)*tmp_xyz_line(ic-2) + &
                         WC(2,px)*tmp_xyz_line(ic-1) + &
                         WC(3,px)*tmp_xyz_line(ic  ) + &
                         WC(4,px)*tmp_xyz_line(ic+1) + &
                         WC(5,px)*tmp_xyz_line(ic+2) + &
                         WC(6,px)*tmp_xyz_line(ic+3)
        end do
     end do
  end do
 #endif
    enddo
   enddo
  enddo
  return
  end subroutine prolong3
@@ -2381,6 +2370,13 @@
  real*8,dimension(3) :: CD,FD
  real*8 :: tmp_xz_plane(-1:extf(1), 6)
  real*8 :: tmp_x_line(-1:extf(1))
  integer :: fi, fj, fk, ii, jj, kk
  integer :: fi_min, fi_max, ii_lo, ii_hi
  integer :: fj_min, fj_max, fk_min, fk_max, jj_lo, jj_hi, kk_lo, kk_hi
  logical :: need_full_symmetry
  if(wei.ne.3)then
     write(*,*)"prolongrestrict.f90::restrict3: this routine only surport 3 dimension"
     write(*,*)"dim = ",wei
@@ -2459,9 +2455,86 @@
          stop
  endif
-  call symmetry_bd(2,extf,funf,funff,SoA)
+  ! 仅计算 X 向最终写回所需的窗口：
  ! func(i,j,k) 只访问 tmp_x_line(fi-2:fi+3)
  fi_min = 2*(imino + lbc(1) - 1) - 1 - lbf(1) + 1
  fi_max = 2*(imaxo + lbc(1) - 1) - 1 - lbf(1) + 1
  fj_min = 2*(jmino + lbc(2) - 1) - 1 - lbf(2) + 1
  fj_max = 2*(jmaxo + lbc(2) - 1) - 1 - lbf(2) + 1
  fk_min = 2*(kmino + lbc(3) - 1) - 1 - lbf(3) + 1
  fk_max = 2*(kmaxo + lbc(3) - 1) - 1 - lbf(3) + 1
  ii_lo = fi_min - 2
  ii_hi = fi_max + 3
  jj_lo = fj_min - 2
  jj_hi = fj_max + 3
  kk_lo = fk_min - 2
  kk_hi = fk_max + 3
  if(ii_lo < -1 .or. ii_hi > extf(1) .or. &
     jj_lo < -1 .or. jj_hi > extf(2) .or. &
     kk_lo < -1 .or. kk_hi > extf(3))then
      write(*,*)"restrict3: invalid stencil window"
      write(*,*)"ii=",ii_lo,ii_hi," jj=",jj_lo,jj_hi," kk=",kk_lo,kk_hi
      write(*,*)"extf=",extf
      stop
  endif
  need_full_symmetry = (ii_lo < 1) .or. (jj_lo < 1) .or. (kk_lo < 1)
  if(need_full_symmetry)then
      call symmetry_bd(2,extf,funf,funff,SoA)
  else
      funff(ii_lo:ii_hi,jj_lo:jj_hi,kk_lo:kk_hi) = funf(ii_lo:ii_hi,jj_lo:jj_hi,kk_lo:kk_hi)
  endif
 !~~~~~~> restriction start...
 do k = kmino, kmaxo
    fk = 2*(k + lbc(3) - 1) - 1 - lbf(3) + 1
    do j = jmino, jmaxo
        fj = 2*(j + lbc(2) - 1) - 1 - lbf(2) + 1
        ! 优化点 1: 显式展开 Z 方向计算，减少循环开销
        ! 确保 ii 循环是最内层且连续访问
        !DIR$ VECTOR ALWAYS
        do ii = ii_lo, ii_hi
            ! 预计算当前 j 对应的 6 行在 Z 方向的压缩结果
            ! 这里直接硬编码 jj 的偏移，彻底消除一层循环
            tmp_xz_plane(ii, 1) = C1*(funff(ii,fj-2,fk-2)+funff(ii,fj-2,fk+3)) + &
                                  C2*(funff(ii,fj-2,fk-1)+funff(ii,fj-2,fk+2)) + &
                                  C3*(funff(ii,fj-2,fk  )+funff(ii,fj-2,fk+1))
            tmp_xz_plane(ii, 2) = C1*(funff(ii,fj-1,fk-2)+funff(ii,fj-1,fk+3)) + &
                                  C2*(funff(ii,fj-1,fk-1)+funff(ii,fj-1,fk+2)) + &
                                  C3*(funff(ii,fj-1,fk  )+funff(ii,fj-1,fk+1))
            tmp_xz_plane(ii, 3) = C1*(funff(ii,fj  ,fk-2)+funff(ii,fj  ,fk+3)) + &
                                  C2*(funff(ii,fj  ,fk-1)+funff(ii,fj  ,fk+2)) + &
                                  C3*(funff(ii,fj  ,fk  )+funff(ii,fj  ,fk+1))
            tmp_xz_plane(ii, 4) = C1*(funff(ii,fj+1,fk-2)+funff(ii,fj+1,fk+3)) + &
                                  C2*(funff(ii,fj+1,fk-1)+funff(ii,fj+1,fk+2)) + &
                                  C3*(funff(ii,fj+1,fk  )+funff(ii,fj+1,fk+1))
            tmp_xz_plane(ii, 5) = C1*(funff(ii,fj+2,fk-2)+funff(ii,fj+2,fk+3)) + &
                                  C2*(funff(ii,fj+2,fk-1)+funff(ii,fj+2,fk+2)) + &
                                  C3*(funff(ii,fj+2,fk  )+funff(ii,fj+2,fk+1))
            tmp_xz_plane(ii, 6) = C1*(funff(ii,fj+3,fk-2)+funff(ii,fj+3,fk+3)) + &
                                  C2*(funff(ii,fj+3,fk-1)+funff(ii,fj+3,fk+2)) + &
                                  C3*(funff(ii,fj+3,fk  )+funff(ii,fj+3,fk+1))
        end do
        ! 优化点 2: 同样向量化 Y 方向压缩
        !DIR$ VECTOR ALWAYS
        do ii = ii_lo, ii_hi
            tmp_x_line(ii) = C1*(tmp_xz_plane(ii, 1) + tmp_xz_plane(ii, 6)) + &
                            C2*(tmp_xz_plane(ii, 2) + tmp_xz_plane(ii, 5)) + &
                            C3*(tmp_xz_plane(ii, 3) + tmp_xz_plane(ii, 4))
        end do
        ! 优化点 3: 最终写入，利用已经缓存在 tmp_x_line 的数据
        do i = imino, imaxo
            fi = 2*(i + lbc(1) - 1) - 1 - lbf(1) + 1
            func(i, j, k) = C1*(tmp_x_line(fi-2) + tmp_x_line(fi+3)) + &
                            C2*(tmp_x_line(fi-1) + tmp_x_line(fi+2)) + &
                            C3*(tmp_x_line(fi  ) + tmp_x_line(fi+1))
        end do
    end do
 end do
 #if 0
  do k = kmino,kmaxo
   do j = jmino,jmaxo
    do i = imino,imaxo
@@ -2485,7 +2558,7 @@
    enddo
   enddo
  enddo
-  
+#endif
  return
  end subroutine restrict3