From f147f79ffa9902e69f2ef89735522f340dc11d4f Mon Sep 17 00:00:00 2001
From: jaunatisblue <jaunatisblue@gmail>
Date: Thu, 26 Feb 2026 09:40:46 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9block=E5=88=92=E5=88=86?=
 =?UTF-8?q?=EF=BC=8C=E5=AF=B9=E8=B4=9F=E8=BD=BD=E9=AB=98=E7=9A=84rank?=
 =?UTF-8?q?=E6=89=80=E5=9C=A8block=E8=BF=9B=E8=A1=8C=E5=88=92=E5=88=86?=
 =?UTF-8?q?=EF=BC=8C=E6=B7=BB=E5=8A=A0=E5=88=B0=E7=A9=BArank=EF=BC=8C?=
 =?UTF-8?q?=E7=A9=BArank=E6=98=AF=E5=B9=B3=E7=A7=BB=E5=BE=97=E5=88=B0?=
 =?UTF-8?q?=E7=9A=84?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 AMSS_NCKU_source/MPatch.C           |  3492 ++++---
 AMSS_NCKU_source/NullShellPatch.h   |     1 +
 AMSS_NCKU_source/Parallel.C         | 13612 ++++++++++++++------------
 AMSS_NCKU_source/Parallel.h         |   448 +-
 AMSS_NCKU_source/cgh.C              |  3546 +++----
 AMSS_NCKU_source/cgh.h              |   199 +-
 AMSS_NCKU_source/surface_integral.C |  7501 +++++++-------
 7 files changed, 14791 insertions(+), 14008 deletions(-)
diff --git a/AMSS_NCKU_source/MPatch.C b/AMSS_NCKU_source/MPatch.C
index e712a74..b3dc6bd 100644
--- a/AMSS_NCKU_source/MPatch.C
+++ b/AMSS_NCKU_source/MPatch.C
@@ -1,1762 +1,1732 @@
-
-#include <iostream>
-#include <iomanip>
-#include <fstream>
-#include <cstdlib>
-#include <cstdio>
-#include <string>
-#include <cmath>
-#include <new>
-using namespace std;
-
-#include "misc.h"
-#include "MPatch.h"
-#include "Parallel.h"
-#include "fmisc.h"
-
-Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
-{
-
-  int hbuffer_width = buffer_width;
-  if (lev == 0)
-    hbuffer_width = CS_width; // specific for shell-box coulping
-
-  if (DIM != dim)
-  {
-    cout << "dimension is not consistent in Patch construction" << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-  for (int i = 0; i < dim; i++)
-  {
-    shape[i] = shapei[i];
-    bbox[i] = bboxi[i];
-    bbox[dim + i] = bboxi[dim + i];
-    lli[i] = uui[i] = 0;
-    if (buflog)
-    {
-      double DH;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-      DH = (bbox[dim + i] - bbox[i]) / (shape[i] - 1);
-#else
-#ifdef Cell
-      DH = (bbox[dim + i] - bbox[i]) / shape[i];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      uui[i] = hbuffer_width;
-      bbox[dim + i] = bbox[dim + i] + uui[i] * DH;
-      shape[i] = shape[i] + uui[i];
-    }
-  }
-
-  if (buflog)
-  {
-    if (DIM != 3)
-    {
-      cout << "Symmetry in Patch construction only support 3 yet but dim = " << DIM << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-    double tmpb, DH;
-    if (Symmetry > 0)
-    {
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-      DH = (bbox[5] - bbox[2]) / (shape[2] - 1);
-#else
-#ifdef Cell
-      DH = (bbox[5] - bbox[2]) / shape[2];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      tmpb = Mymax(0, bbox[2] - hbuffer_width * DH);
-      lli[2] = int((bbox[2] - tmpb) / DH + 0.4);
-      bbox[2] = bbox[2] - lli[2] * DH;
-      shape[2] = shape[2] + lli[2];
-      if (lli[2] < hbuffer_width)
-      {
-        if (feq(bbox[2], 0, DH / 2))
-          lli[2] = 0;
-        else
-        {
-          cout << "Code mistake for lli[2] = " << lli[2] << ", bbox[2] = " << bbox[2] << endl;
-          MPI_Abort(MPI_COMM_WORLD, 1);
-        }
-      }
-      if (Symmetry > 1)
-      {
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        DH = (bbox[3] - bbox[0]) / (shape[0] - 1);
-#else
-#ifdef Cell
-        DH = (bbox[3] - bbox[0]) / shape[0];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-        tmpb = Mymax(0, bbox[0] - hbuffer_width * DH);
-        lli[0] = int((bbox[0] - tmpb) / DH + 0.4);
-        bbox[0] = bbox[0] - lli[0] * DH;
-        shape[0] = shape[0] + lli[0];
-        if (lli[0] < hbuffer_width)
-        {
-          if (feq(bbox[0], 0, DH / 2))
-            lli[0] = 0;
-          else
-          {
-            cout << "Code mistake for lli[0] = " << lli[0] << ", bbox[0] = " << bbox[0] << endl;
-            MPI_Abort(MPI_COMM_WORLD, 1);
-          }
-        }
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        DH = (bbox[4] - bbox[1]) / (shape[1] - 1);
-#else
-#ifdef Cell
-        DH = (bbox[4] - bbox[1]) / shape[1];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-        tmpb = Mymax(0, bbox[1] - hbuffer_width * DH);
-        lli[1] = int((bbox[1] - tmpb) / DH + 0.4);
-        bbox[1] = bbox[1] - lli[1] * DH;
-        shape[1] = shape[1] + lli[1];
-        if (lli[1] < hbuffer_width)
-        {
-          if (feq(bbox[1], 0, DH / 2))
-            lli[1] = 0;
-          else
-          {
-            cout << "Code mistake for lli[1] = " << lli[1] << ", bbox[1] = " << bbox[1] << endl;
-            MPI_Abort(MPI_COMM_WORLD, 1);
-          }
-        }
-      }
-      else
-      {
-        for (int i = 0; i < 2; i++)
-        {
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-          DH = (bbox[dim + i] - bbox[i]) / (shape[i] - 1);
-#else
-#ifdef Cell
-          DH = (bbox[dim + i] - bbox[i]) / shape[i];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-          lli[i] = hbuffer_width;
-          bbox[i] = bbox[i] - lli[i] * DH;
-          shape[i] = shape[i] + lli[i];
-        }
-      }
-    }
-    else
-    {
-      for (int i = 0; i < dim; i++)
-      {
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        DH = (bbox[dim + i] - bbox[i]) / (shape[i] - 1);
-#else
-#ifdef Cell
-        DH = (bbox[dim + i] - bbox[i]) / shape[i];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-        lli[i] = hbuffer_width;
-        bbox[i] = bbox[i] - lli[i] * DH;
-        shape[i] = shape[i] + lli[i];
-      }
-    }
-  }
-
-  blb = ble = 0;
-}
-Patch::~Patch()
-{
-}
-// buflog 1: with buffer points; 0 without
-void Patch::checkPatch(bool buflog)
-{
-  int myrank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-  if (myrank == 0)
-  {
-    if (buflog)
-    {
-      cout << " belong to level " << lev << endl;
-      cout << " shape: [";
-      for (int i = 0; i < dim; i++)
-      {
-        cout << shape[i];
-        if (i < dim - 1)
-          cout << ",";
-        else
-          cout << "]";
-      }
-      cout << " resolution: [";
-      for (int i = 0; i < dim; i++)
-      {
-        cout << getdX(i);
-        if (i < dim - 1)
-          cout << ",";
-        else
-          cout << "]" << endl;
-      }
-      cout << " range:" << "(";
-      for (int i = 0; i < dim; i++)
-      {
-        cout << bbox[i] << ":" << bbox[dim + i];
-        if (i < dim - 1)
-          cout << ",";
-        else
-          cout << ")" << endl;
-      }
-    }
-    else
-    {
-      cout << " belong to level " << lev << endl;
-      cout << " shape: [";
-      for (int i = 0; i < dim; i++)
-      {
-        cout << shape[i] - lli[i] - uui[i];
-        if (i < dim - 1)
-          cout << ",";
-        else
-          cout << "]";
-      }
-      cout << " resolution: [";
-      for (int i = 0; i < dim; i++)
-      {
-        cout << getdX(i);
-        if (i < dim - 1)
-          cout << ",";
-        else
-          cout << "]" << endl;
-      }
-      cout << " range:" << "(";
-      for (int i = 0; i < dim; i++)
-      {
-        cout << bbox[i] + lli[i] * getdX(i) << ":" << bbox[dim + i] - uui[i] * getdX(i);
-        if (i < dim - 1)
-          cout << ",";
-        else
-          cout << ")" << endl;
-      }
-    }
-  }
-}
-void Patch::checkPatch(bool buflog, const int out_rank)
-{
-  int myrank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-  if (myrank == out_rank)
-  {
-    cout << " out_rank = " << out_rank << endl;
-    if (buflog)
-    {
-      cout << " belong to level " << lev << endl;
-      cout << " shape: [";
-      for (int i = 0; i < dim; i++)
-      {
-        cout << shape[i];
-        if (i < dim - 1)
-          cout << ",";
-        else
-          cout << "]";
-      }
-      cout << " resolution: [";
-      for (int i = 0; i < dim; i++)
-      {
-        cout << getdX(i);
-        if (i < dim - 1)
-          cout << ",";
-        else
-          cout << "]" << endl;
-      }
-      cout << " range:" << "(";
-      for (int i = 0; i < dim; i++)
-      {
-        cout << bbox[i] << ":" << bbox[dim + i];
-        if (i < dim - 1)
-          cout << ",";
-        else
-          cout << ")" << endl;
-      }
-    }
-    else
-    {
-      cout << " belong to level " << lev << endl;
-      cout << " shape: [";
-      for (int i = 0; i < dim; i++)
-      {
-        cout << shape[i] - lli[i] - uui[i];
-        if (i < dim - 1)
-          cout << ",";
-        else
-          cout << "]";
-      }
-      cout << " resolution: [";
-      for (int i = 0; i < dim; i++)
-      {
-        cout << getdX(i);
-        if (i < dim - 1)
-          cout << ",";
-        else
-          cout << "]" << endl;
-      }
-      cout << " range:" << "(";
-      for (int i = 0; i < dim; i++)
-      {
-        cout << bbox[i] + lli[i] * getdX(i) << ":" << bbox[dim + i] - uui[i] * getdX(i);
-        if (i < dim - 1)
-          cout << ",";
-        else
-          cout << ")" << endl;
-      }
-    }
-  }
-}
-void Patch::Interp_Points(MyList<var> *VarList,
-                          int NN, double **XX,
-                          double *Shellf, int Symmetry)
-{
-  // NOTE: we do not Synchnize variables here, make sure of that before calling this routine
-  double t_calc_end, t_calc_total = 0;
-  double t_calc_start = MPI_Wtime();
-  int myrank, nprocs;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
-
-  int ordn = 2 * ghost_width;
-  MyList<var> *varl;
-  int num_var = 0;
-  varl = VarList;
-  while (varl)
-  {
-    num_var++;
-    varl = varl->next;
-  }
-
-  memset(Shellf, 0, sizeof(double) * NN * num_var);
-
-  // owner_rank[j] records which MPI rank owns point j
-  // All ranks traverse the same block list so they all agree on ownership
-  int *owner_rank;
-  owner_rank = new int[NN];
-  for (int j = 0; j < NN; j++)
-    owner_rank[j] = -1;
-
-  double DH[dim], llb[dim], uub[dim];
-  for (int i = 0; i < dim; i++)
-    DH[i] = getdX(i);
-
-  for (int j = 0; j < NN; j++) // run along points
-  {
-    double pox[dim];
-    for (int i = 0; i < dim; i++)
-    {
-      pox[i] = XX[i][j];
-      if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
-      {
-        cout << "Patch::Interp_Points: point (";
-        for (int k = 0; k < dim; k++)
-        {
-          cout << XX[k][j];
-          if (k < dim - 1)
-            cout << ",";
-          else
-            cout << ") is out of current Patch." << endl;
-        }
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-    }
-
-    MyList<Block> *Bp = blb;
-    bool notfind = true;
-    while (notfind && Bp) // run along Blocks
-    {
-      Block *BP = Bp->data;
-
-      bool flag = true;
-      for (int i = 0; i < dim; i++)
-      {
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
-        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
-#else
-#ifdef Cell
-        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
-        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-        if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
-        {
-          flag = false;
-          break;
-        }
-      }
-
-      if (flag)
-      {
-        notfind = false;
-        owner_rank[j] = BP->rank;
-        if (myrank == BP->rank)
-        {
-          //---> interpolation
-          varl = VarList;
-          int k = 0;
-          while (varl) // run along variables
-          {
-            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
-                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
-            varl = varl->next;
-            k++;
-          }
-        }
-      }
-      if (Bp == ble)
-        break;
-      Bp = Bp->next;
-    }
-  }
-        t_calc_end = MPI_Wtime();
-      t_calc_total = t_calc_end - t_calc_start;
-  // Replace MPI_Allreduce with per-owner MPI_Bcast:
-  // Group consecutive points by owner rank and broadcast each group.
-  // Since each point's data is non-zero only on the owner rank,
-  // Bcast from owner is equivalent to Allreduce(MPI_SUM) but much cheaper.
-  {
-    int j = 0;
-    while (j < NN)
-    {
-      int cur_owner = owner_rank[j];
-      if (cur_owner < 0)
-      {
-        if (myrank == 0)
-        {
-          cout << "ERROR: Patch::Interp_Points fails to find point (";
-          for (int d = 0; d < dim; d++)
-          {
-            cout << XX[d][j];
-            if (d < dim - 1)
-              cout << ",";
-            else
-              cout << ")";
-          }
-          cout << " on Patch (";
-          for (int d = 0; d < dim; d++)
-          {
-            cout << bbox[d] << "+" << lli[d] * DH[d];
-            if (d < dim - 1)
-              cout << ",";
-            else
-              cout << ")--";
-          }
-          cout << "(";
-          for (int d = 0; d < dim; d++)
-          {
-            cout << bbox[dim + d] << "-" << uui[d] * DH[d];
-            if (d < dim - 1)
-              cout << ",";
-            else
-              cout << ")" << endl;
-          }
-          MPI_Abort(MPI_COMM_WORLD, 1);
-        }
-        j++;
-        continue;
-      }
-      // Find contiguous run of points with the same owner
-      int jstart = j;
-      while (j < NN && owner_rank[j] == cur_owner)
-        j++;
-      int count = (j - jstart) * num_var;
-      MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner, MPI_COMM_WORLD);
-    }
-  }
-
-  delete[] owner_rank;
-
-
-  
-  // 4. 汇总并输出真正干活最慢的 Top 10
-  struct RankStats {
-    int rank;
-    double calc_time; // 净计算时间
-    double comm_time; // 等待时间
-  };
-
-  // 创建当前进程的统计数据
-  RankStats local_stat;
-  local_stat.rank = myrank;
-  local_stat.calc_time = t_calc_total;
-  local_stat.comm_time = 0; // 此函数中未跟踪通信时间
-
-  // 为所有进程的统计数据分配内存
-  RankStats *all_stats = nullptr;
-  if (myrank == 0) {
-    all_stats = new RankStats[nprocs];
-  }
-
-  // 使用MPI_Gather收集所有进程的数据到rank 0
-  MPI_Gather(&local_stat, sizeof(RankStats), MPI_BYTE,
-             all_stats, sizeof(RankStats), MPI_BYTE,
-             0, MPI_COMM_WORLD);
-
-  if (myrank == 0) {
-    // 按 calc_time（净计算时间）排序
-    std::sort(all_stats, all_stats + nprocs, [](const RankStats& a, const RankStats& b) {
-        return a.calc_time > b.calc_time;
-    });
-
-    printf("\n--- Top 10 Ranks by ACTIVE COMPUTATION (CPU Time) ---\n");
-    int display_count = (nprocs < 10) ? nprocs : 10;
-    for (int i = 0; i < display_count; i++) {
-        printf("Rank [%4d]: Calc %.6f s\n", 
-                all_stats[i].rank, all_stats[i].calc_time);
-    }
-    
-    // 清理分配的内存
-    delete[] all_stats;
-  }
-}
-void Patch::Interp_Points(MyList<var> *VarList,
-                          int NN, double **XX,
-                          double *Shellf, int Symmetry,
-                          int Nmin_consumer, int Nmax_consumer)
-{
-  // Targeted point-to-point overload: each owner sends each point only to
-  // the one rank that needs it for integration (consumer), reducing
-  // communication volume by ~nprocs times compared to the Bcast version.
-  double t_calc_end, t_calc_total = 0;
-  double t_calc_start = MPI_Wtime();
-  int myrank, nprocs;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
-
-  int ordn = 2 * ghost_width;
-  MyList<var> *varl;
-  int num_var = 0;
-  varl = VarList;
-  while (varl)
-  {
-    num_var++;
-    varl = varl->next;
-  }
-
-  memset(Shellf, 0, sizeof(double) * NN * num_var);
-
-  // owner_rank[j] records which MPI rank owns point j
-  int *owner_rank;
-  owner_rank = new int[NN];
-  for (int j = 0; j < NN; j++)
-    owner_rank[j] = -1;
-
-  double DH[dim], llb[dim], uub[dim];
-  for (int i = 0; i < dim; i++)
-    DH[i] = getdX(i);
-
-  // --- Interpolation phase (identical to original) ---
-  for (int j = 0; j < NN; j++)
-  {
-    double pox[dim];
-    for (int i = 0; i < dim; i++)
-    {
-      pox[i] = XX[i][j];
-      if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
-      {
-        cout << "Patch::Interp_Points: point (";
-        for (int k = 0; k < dim; k++)
-        {
-          cout << XX[k][j];
-          if (k < dim - 1)
-            cout << ",";
-          else
-            cout << ") is out of current Patch." << endl;
-        }
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-    }
-
-    MyList<Block> *Bp = blb;
-    bool notfind = true;
-    while (notfind && Bp)
-    {
-      Block *BP = Bp->data;
-
-      bool flag = true;
-      for (int i = 0; i < dim; i++)
-      {
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
-        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
-#else
-#ifdef Cell
-        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
-        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-        if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
-        {
-          flag = false;
-          break;
-        }
-      }
-
-      if (flag)
-      {
-        notfind = false;
-        owner_rank[j] = BP->rank;
-        if (myrank == BP->rank)
-        {
-          varl = VarList;
-          int k = 0;
-          while (varl)
-          {
-            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
-                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
-            varl = varl->next;
-            k++;
-          }
-        }
-      }
-      if (Bp == ble)
-        break;
-      Bp = Bp->next;
-    }
-  }
-      t_calc_end = MPI_Wtime();
-      t_calc_total = t_calc_end - t_calc_start;
-  // --- Error check for unfound points ---
-  for (int j = 0; j < NN; j++)
-  {
-    if (owner_rank[j] < 0 && myrank == 0)
-    {
-      cout << "ERROR: Patch::Interp_Points fails to find point (";
-      for (int d = 0; d < dim; d++)
-      {
-        cout << XX[d][j];
-        if (d < dim - 1)
-          cout << ",";
-        else
-          cout << ")";
-      }
-      cout << " on Patch (";
-      for (int d = 0; d < dim; d++)
-      {
-        cout << bbox[d] << "+" << lli[d] * DH[d];
-        if (d < dim - 1)
-          cout << ",";
-        else
-          cout << ")--";
-      }
-      cout << "(";
-      for (int d = 0; d < dim; d++)
-      {
-        cout << bbox[dim + d] << "-" << uui[d] * DH[d];
-        if (d < dim - 1)
-          cout << ",";
-        else
-          cout << ")" << endl;
-      }
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-  }
-
-  // --- Targeted point-to-point communication phase ---
-  // Compute consumer_rank[j] using the same deterministic formula as surface_integral
-  int *consumer_rank = new int[NN];
-  {
-    int mp = NN / nprocs;
-    int Lp = NN - nprocs * mp;
-    for (int j = 0; j < NN; j++)
-    {
-      if (j < Lp * (mp + 1))
-        consumer_rank[j] = j / (mp + 1);
-      else
-        consumer_rank[j] = Lp + (j - Lp * (mp + 1)) / mp;
-    }
-  }
-
-  // Count sends and recvs per rank
-  int *send_count = new int[nprocs];
-  int *recv_count = new int[nprocs];
-  memset(send_count, 0, sizeof(int) * nprocs);
-  memset(recv_count, 0, sizeof(int) * nprocs);
-
-  for (int j = 0; j < NN; j++)
-  {
-    int own = owner_rank[j];
-    int con = consumer_rank[j];
-    if (own == con)
-      continue; // local — no communication needed
-    if (own == myrank)
-      send_count[con]++;
-    if (con == myrank)
-      recv_count[own]++;
-  }
-
-  // Build send buffers: for each destination rank, pack (index, data) pairs
-  // Each entry: 1 int (point index j) + num_var doubles
-  int total_send = 0, total_recv = 0;
-  int *send_offset = new int[nprocs];
-  int *recv_offset = new int[nprocs];
-  for (int r = 0; r < nprocs; r++)
-  {
-    send_offset[r] = total_send;
-    total_send += send_count[r];
-    recv_offset[r] = total_recv;
-    total_recv += recv_count[r];
-  }
-
-  // Pack send buffers: each message contains (j, data[0..num_var-1]) per point
-  int stride = 1 + num_var; // 1 double for index + num_var doubles for data
-  double *sendbuf = new double[total_send * stride];
-  double *recvbuf = new double[total_recv * stride];
-
-  // Temporary counters for packing
-  int *pack_pos = new int[nprocs];
-  memset(pack_pos, 0, sizeof(int) * nprocs);
-
-  for (int j = 0; j < NN; j++)
-  {
-    int own = owner_rank[j];
-    int con = consumer_rank[j];
-    if (own != myrank || con == myrank)
-      continue;
-    int pos = (send_offset[con] + pack_pos[con]) * stride;
-    sendbuf[pos] = (double)j; // point index
-    for (int v = 0; v < num_var; v++)
-      sendbuf[pos + 1 + v] = Shellf[j * num_var + v];
-    pack_pos[con]++;
-  }
-
-  // Post non-blocking recvs and sends
-  int n_req = 0;
-  for (int r = 0; r < nprocs; r++)
-  {
-    if (recv_count[r] > 0) n_req++;
-    if (send_count[r] > 0) n_req++;
-  }
-
-  MPI_Request *reqs = new MPI_Request[n_req];
-  int req_idx = 0;
-
-  for (int r = 0; r < nprocs; r++)
-  {
-    if (recv_count[r] > 0)
-    {
-      MPI_Irecv(recvbuf + recv_offset[r] * stride,
-                recv_count[r] * stride, MPI_DOUBLE,
-                r, 0, MPI_COMM_WORLD, &reqs[req_idx++]);
-    }
-  }
-  for (int r = 0; r < nprocs; r++)
-  {
-    if (send_count[r] > 0)
-    {
-      MPI_Isend(sendbuf + send_offset[r] * stride,
-                send_count[r] * stride, MPI_DOUBLE,
-                r, 0, MPI_COMM_WORLD, &reqs[req_idx++]);
-    }
-  }
-
-  if (n_req > 0)
-    MPI_Waitall(n_req, reqs, MPI_STATUSES_IGNORE);
-
-  // Unpack recv buffers into Shellf
-  for (int i = 0; i < total_recv; i++)
-  {
-    int pos = i * stride;
-    int j = (int)recvbuf[pos];
-    for (int v = 0; v < num_var; v++)
-      Shellf[j * num_var + v] = recvbuf[pos + 1 + v];
-  }
-
-  delete[] reqs;
-  delete[] sendbuf;
-  delete[] recvbuf;
-  delete[] pack_pos;
-  delete[] send_offset;
-  delete[] recv_offset;
-  delete[] send_count;
-  delete[] recv_count;
-  delete[] consumer_rank;
-  delete[] owner_rank;
-
-  // 4. 汇总并输出真正干活最慢的 Top 10
-  struct RankStats {
-    int rank;
-    double calc_time; // 净计算时间
-    double comm_time; // 等待时间
-  };
-
-  // 创建当前进程的统计数据
-  RankStats local_stat;
-  local_stat.rank = myrank;
-  local_stat.calc_time = t_calc_total;
-  local_stat.comm_time = 0; // 此函数中未跟踪通信时间
-
-  // 为所有进程的统计数据分配内存
-  RankStats *all_stats = nullptr;
-  if (myrank == 0) {
-    all_stats = new RankStats[nprocs];
-  }
-
-  // 使用MPI_Gather收集所有进程的数据到rank 0
-  MPI_Gather(&local_stat, sizeof(RankStats), MPI_BYTE,
-             all_stats, sizeof(RankStats), MPI_BYTE,
-             0, MPI_COMM_WORLD);
-  
-  if (myrank == 0) {
-    // 按 calc_time（净计算时间）排序
-    std::sort(all_stats, all_stats + nprocs, [](const RankStats& a, const RankStats& b) {
-        return a.calc_time > b.calc_time;
-    });
-/*
-    printf("\n--- Top 10 Ranks by ACTIVE COMPUTATION (CPU Time) ---\n");
-    int display_count = (nprocs < 10) ? nprocs : 10;
-    for (int i = 0; i < display_count; i++) {
-        printf("Rank [%4d]: Calc %.6f s\n", 
-                all_stats[i].rank, all_stats[i].calc_time);
-    }*/
-    
-    // 清理分配的内存
-    delete[] all_stats;
-  }
-
-}
-void Patch::Interp_Points(MyList<var> *VarList,
-                          int NN, double **XX,
-                          double *Shellf, int Symmetry, MPI_Comm Comm_here)
-{
-  // NOTE: we do not Synchnize variables here, make sure of that before calling this routine
-  int myrank, lmyrank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-  MPI_Comm_rank(Comm_here, &lmyrank);
-
-  int ordn = 2 * ghost_width;
-  MyList<var> *varl;
-  int num_var = 0;
-  varl = VarList;
-  while (varl)
-  {
-    num_var++;
-    varl = varl->next;
-  }
-
-  memset(Shellf, 0, sizeof(double) * NN * num_var);
-
-  // owner_rank[j] stores the global rank that owns point j
-  int *owner_rank;
-  owner_rank = new int[NN];
-  for (int j = 0; j < NN; j++)
-    owner_rank[j] = -1;
-
-  // Build global-to-local rank translation for Comm_here
-  MPI_Group world_group, local_group;
-  MPI_Comm_group(MPI_COMM_WORLD, &world_group);
-  MPI_Comm_group(Comm_here, &local_group);
-
-  double DH[dim], llb[dim], uub[dim];
-  for (int i = 0; i < dim; i++)
-    DH[i] = getdX(i);
-
-  for (int j = 0; j < NN; j++) // run along points
-  {
-    double pox[dim];
-    for (int i = 0; i < dim; i++)
-    {
-      pox[i] = XX[i][j];
-      if (lmyrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
-      {
-        cout << "Patch::Interp_Points: point (";
-        for (int k = 0; k < dim; k++)
-        {
-          cout << XX[k][j];
-          if (k < dim - 1)
-            cout << ",";
-          else
-            cout << ") is out of current Patch." << endl;
-        }
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-    }
-
-    MyList<Block> *Bp = blb;
-    bool notfind = true;
-    while (notfind && Bp) // run along Blocks
-    {
-      Block *BP = Bp->data;
-
-      bool flag = true;
-      for (int i = 0; i < dim; i++)
-      {
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
-        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
-#else
-#ifdef Cell
-        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
-        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-        if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
-        {
-          flag = false;
-          break;
-        }
-      }
-
-      if (flag)
-      {
-        notfind = false;
-        owner_rank[j] = BP->rank;
-        if (myrank == BP->rank)
-        {
-          //---> interpolation
-          varl = VarList;
-          int k = 0;
-          while (varl) // run along variables
-          {
-            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
-                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
-            varl = varl->next;
-            k++;
-          }
-        }
-      }
-      if (Bp == ble)
-        break;
-      Bp = Bp->next;
-    }
-  }
-
-  // Collect unique global owner ranks and translate to local ranks in Comm_here
-  // Then broadcast each owner's points via MPI_Bcast on Comm_here
-  {
-    int j = 0;
-    while (j < NN)
-    {
-      int cur_owner_global = owner_rank[j];
-      if (cur_owner_global < 0)
-      {
-        // Point not found — skip (error check disabled for sub-communicator levels)
-        j++;
-        continue;
-      }
-      // Translate global rank to local rank in Comm_here
-      int cur_owner_local;
-      MPI_Group_translate_ranks(world_group, 1, &cur_owner_global, local_group, &cur_owner_local);
-
-      // Find contiguous run of points with the same owner
-      int jstart = j;
-      while (j < NN && owner_rank[j] == cur_owner_global)
-        j++;
-      int count = (j - jstart) * num_var;
-      MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner_local, Comm_here);
-    }
-  }
-
-  MPI_Group_free(&world_group);
-  MPI_Group_free(&local_group);
-  delete[] owner_rank;
-}
-void Patch::checkBlock()
-{
-  int myrank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-  if (myrank == 0)
-  {
-    MyList<Block> *BP = blb;
-    while (BP)
-    {
-      BP->data->checkBlock();
-      if (BP == ble)
-        break;
-      BP = BP->next;
-    }
-  }
-}
-double Patch::getdX(int dir)
-{
-  if (dir < 0 || dir >= dim)
-  {
-    cout << "Patch::getdX: error input dir = " << dir << ", this Patch has direction (0," << dim - 1 << ")" << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-  double h;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-  if (shape[dir] == 1)
-  {
-    cout << "Patch::getdX: for direction " << dir << ", this Patch has only one point. Can not determine dX for vertex center grid." << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-  h = (bbox[dim + dir] - bbox[dir]) / (shape[dir] - 1);
-#else
-#ifdef Cell
-  h = (bbox[dim + dir] - bbox[dir]) / shape[dir];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-  return h;
-}
-bool Patch::Interp_ONE_Point(MyList<var> *VarList, double *XX,
-                             double *Shellf, int Symmetry)
-{
-  // NOTE: we do not Synchnize variables here, make sure of that before calling this routine
-  int myrank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
-  int ordn = 2 * ghost_width;
-  MyList<var> *varl;
-  int num_var = 0;
-  varl = VarList;
-  while (varl)
-  {
-    num_var++;
-    varl = varl->next;
-  }
-
-  double *shellf;
-  shellf = new double[num_var];
-  memset(shellf, 0, sizeof(double) * num_var);
-
-  double *DH, *llb, *uub;
-  DH = new double[dim];
-
-  for (int i = 0; i < dim; i++)
-  {
-    DH[i] = getdX(i);
-  }
-  llb = new double[dim];
-  uub = new double[dim];
-
-  double pox[dim];
-  for (int i = 0; i < dim; i++)
-  {
-    pox[i] = XX[i];
-    // has excluded the buffer points
-    if (XX[i] < bbox[i] + lli[i] * DH[i] - DH[i] / 100 || XX[i] > bbox[dim + i] - uui[i] * DH[i] + DH[i] / 100)
-    {
-      delete[] shellf;
-      delete[] DH;
-      delete[] llb;
-      delete[] uub;
-      return false; // out of current patch,
-                    // remember to delete the allocated arrays before return!!!
-    }
-  }
-
-  MyList<Block> *Bp = blb;
-  bool notfind = true;
-  while (notfind && Bp) // run along Blocks
-  {
-    Block *BP = Bp->data;
-
-    bool flag = true;
-    for (int i = 0; i < dim; i++)
-    {
-// NOTE: our dividing structure is (exclude ghost)
-// -1 0
-//       1  2
-// so (0,1) does not belong to any part for vertex structure
-// here we put (0,0.5) to left part and (0.5,1) to right part
-// BUT for cell structure the bbox is (-1.5,0.5) and (0.5,2.5), there is no missing region at all
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-      llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
-      uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
-#else
-#ifdef Cell
-      llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
-      uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      if (XX[i] - llb[i] < -DH[i] / 2 || XX[i] - uub[i] > DH[i] / 2)
-      {
-        flag = false;
-        break;
-      }
-    }
-
-    if (flag)
-    {
-      notfind = false;
-      if (myrank == BP->rank)
-      {
-// test old code
-#if 0
-#define floorint(a) ((a) < 0 ? int(a) - 1 : int(a))
-//---> interpolation
-                int ixl,iyl,izl,ixu,iyu,izu;
-	    	double Delx,Dely,Delz;
-
-		ixl = 1+floorint((pox[0]-BP->X[0][0])/DH[0]);
-	   	iyl = 1+floorint((pox[1]-BP->X[1][0])/DH[1]);
-	   	izl = 1+floorint((pox[2]-BP->X[2][0])/DH[2]);
-
-		int nn=ordn/2;
-
-		ixl = ixl-nn;
-		iyl = iyl-nn;
-		izl = izl-nn;
-	   
-		int tmi;
-		tmi = (Symmetry==2)?-1:0;
-		if(ixl<tmi) ixl=tmi;
-	   	if(iyl<tmi) iyl=tmi;
-		tmi = (Symmetry>0)?-1:0;
-	   	if(izl<tmi) izl=tmi;
-      
-	   	if(ixl+ordn>BP->shape[0]) ixl=BP->shape[0]-ordn;
-	   	if(iyl+ordn>BP->shape[1]) iyl=BP->shape[1]-ordn;
-	   	if(izl+ordn>BP->shape[2]) izl=BP->shape[2]-ordn;
-// support cell center
-		if(ixl>=0) Delx = ( pox[0] - BP->X[0][ixl] )/ DH[0];
-		else       Delx = ( pox[0] + BP->X[0][0] )/ DH[0];
-                if(iyl>=0) Dely = ( pox[1] - BP->X[1][iyl] )/ DH[1];
-		else       Dely = ( pox[1] + BP->X[1][0] )/ DH[1];
-                if(izl>=0) Delz = ( pox[2] - BP->X[2][izl] )/ DH[2];
-		else       Delz = ( pox[2] + BP->X[2][0] )/ DH[2];
-//change to fortran index
-                ixl++;
-	   	iyl++;
-	   	izl++;
-	   	ixu = ixl + ordn - 1;
-	   	iyu = iyl + ordn - 1;
-	   	izu = izl + ordn - 1;
-	    	varl=VarList;
-		int j=0;
-	    	while(varl)
-		{
-                 f_interp_2(BP->shape,BP->fgfs[varl->data->sgfn],shellf[j],ixl,ixu,iyl,iyu,izl,izu,Delx,Dely,Delz,
-                                     ordn,varl->data->SoA,Symmetry);
-		 varl=varl->next;
-		 j++;
-		} //varl
-#else
-        //---> interpolation
-        varl = VarList;
-        int k = 0;
-        while (varl) // run along variables
-        {
-          //              shellf[j*num_var+k] = Parallel::global_interp(dim,BP->shape,BP->X,BP->fgfs[varl->data->sgfn],
-          //	  		                                    pox,ordn,varl->data->SoA,Symmetry);
-          f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[k],
-                          pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
-          varl = varl->next;
-          k++;
-        }
-#endif
-      }
-    }
-    if (Bp == ble)
-      break;
-    Bp = Bp->next;
-  }
-
-  if (notfind && myrank == 0)
-  {
-    cout << "ERROR: Patch::Interp_Points fails to find point (";
-    for (int j = 0; j < dim; j++)
-    {
-      cout << XX[j];
-      if (j < dim - 1)
-        cout << ",";
-      else
-        cout << ")";
-    }
-    cout << " on Patch (";
-    for (int j = 0; j < dim; j++)
-    {
-      cout << bbox[j] << "+" << lli[j] * getdX(j);
-      if (j < dim - 1)
-        cout << ",";
-      else
-        cout << ")--";
-    }
-    cout << "(";
-    for (int j = 0; j < dim; j++)
-    {
-      cout << bbox[dim + j] << "-" << uui[j] * getdX(j);
-      if (j < dim - 1)
-        cout << ",";
-      else
-        cout << ")" << endl;
-    }
-#if 0
-       checkBlock();
-#else
-    cout << "splited domains:" << endl;
-    {
-      MyList<Block> *Bp = blb;
-      while (Bp)
-      {
-        Block *BP = Bp->data;
-
-        for (int i = 0; i < dim; i++)
-        {
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-          llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
-          uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
-#else
-#ifdef Cell
-          llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
-          uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-        }
-        cout << "(";
-        for (int j = 0; j < dim; j++)
-        {
-          cout << llb[j] << ":" << uub[j];
-          if (j < dim - 1)
-            cout << ",";
-          else
-            cout << ")" << endl;
-        }
-        if (Bp == ble)
-          break;
-        Bp = Bp->next;
-      }
-    }
-#endif
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-
-  MPI_Allreduce(shellf, Shellf, num_var, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-
-  delete[] shellf;
-  delete[] DH;
-  delete[] llb;
-  delete[] uub;
-
-  return true;
-}
-bool Patch::Interp_ONE_Point(MyList<var> *VarList, double *XX,
-                             double *Shellf, int Symmetry, MPI_Comm Comm_here)
-{
-  // NOTE: we do not Synchnize variables here, make sure of that before calling this routine
-  int myrank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
-  int ordn = 2 * ghost_width;
-  MyList<var> *varl;
-  int num_var = 0;
-  varl = VarList;
-  while (varl)
-  {
-    num_var++;
-    varl = varl->next;
-  }
-
-  double *shellf;
-  shellf = new double[num_var];
-  memset(shellf, 0, sizeof(double) * num_var);
-
-  double *DH, *llb, *uub;
-  DH = new double[dim];
-
-  for (int i = 0; i < dim; i++)
-  {
-    DH[i] = getdX(i);
-  }
-  llb = new double[dim];
-  uub = new double[dim];
-
-  double pox[dim];
-  for (int i = 0; i < dim; i++)
-  {
-    pox[i] = XX[i];
-    // has excluded the buffer points
-    if (XX[i] < bbox[i] + lli[i] * DH[i] - DH[i] / 100 || XX[i] > bbox[dim + i] - uui[i] * DH[i] + DH[i] / 100)
-    {
-      delete[] shellf;
-      delete[] DH;
-      delete[] llb;
-      delete[] uub;
-      return false; // out of current patch,
-                    // remember to delete the allocated arrays before return!!!
-    }
-  }
-
-  MyList<Block> *Bp = blb;
-  bool notfind = true;
-  while (notfind && Bp) // run along Blocks
-  {
-    Block *BP = Bp->data;
-
-    bool flag = true;
-    for (int i = 0; i < dim; i++)
-    {
-// NOTE: our dividing structure is (exclude ghost)
-// -1 0
-//       1  2
-// so (0,1) does not belong to any part for vertex structure
-// here we put (0,0.5) to left part and (0.5,1) to right part
-// BUT for cell structure the bbox is (-1.5,0.5) and (0.5,2.5), there is no missing region at all
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-      llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
-      uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
-#else
-#ifdef Cell
-      llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
-      uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      if (XX[i] - llb[i] < -DH[i] / 2 || XX[i] - uub[i] > DH[i] / 2)
-      {
-        flag = false;
-        break;
-      }
-    }
-
-    if (flag)
-    {
-      notfind = false;
-      if (myrank == BP->rank)
-      {
-// test old code
-#if 0
-#define floorint(a) ((a) < 0 ? int(a) - 1 : int(a))
-//---> interpolation
-                int ixl,iyl,izl,ixu,iyu,izu;
-	    	double Delx,Dely,Delz;
-
-		ixl = 1+floorint((pox[0]-BP->X[0][0])/DH[0]);
-	   	iyl = 1+floorint((pox[1]-BP->X[1][0])/DH[1]);
-	   	izl = 1+floorint((pox[2]-BP->X[2][0])/DH[2]);
-
-		int nn=ordn/2;
-
-		ixl = ixl-nn;
-		iyl = iyl-nn;
-		izl = izl-nn;
-	   
-		int tmi;
-		tmi = (Symmetry==2)?-1:0;
-		if(ixl<tmi) ixl=tmi;
-	   	if(iyl<tmi) iyl=tmi;
-		tmi = (Symmetry>0)?-1:0;
-	   	if(izl<tmi) izl=tmi;
-      
-	   	if(ixl+ordn>BP->shape[0]) ixl=BP->shape[0]-ordn;
-	   	if(iyl+ordn>BP->shape[1]) iyl=BP->shape[1]-ordn;
-	   	if(izl+ordn>BP->shape[2]) izl=BP->shape[2]-ordn;
-// support cell center
-		if(ixl>=0) Delx = ( pox[0] - BP->X[0][ixl] )/ DH[0];
-		else       Delx = ( pox[0] + BP->X[0][0] )/ DH[0];
-                if(iyl>=0) Dely = ( pox[1] - BP->X[1][iyl] )/ DH[1];
-		else       Dely = ( pox[1] + BP->X[1][0] )/ DH[1];
-                if(izl>=0) Delz = ( pox[2] - BP->X[2][izl] )/ DH[2];
-		else       Delz = ( pox[2] + BP->X[2][0] )/ DH[2];
-//change to fortran index
-                ixl++;
-	   	iyl++;
-	   	izl++;
-	   	ixu = ixl + ordn - 1;
-	   	iyu = iyl + ordn - 1;
-	   	izu = izl + ordn - 1;
-	    	varl=VarList;
-		int j=0;
-	    	while(varl)
-		{
-                 f_interp_2(BP->shape,BP->fgfs[varl->data->sgfn],shellf[j],ixl,ixu,iyl,iyu,izl,izu,Delx,Dely,Delz,
-                                     ordn,varl->data->SoA,Symmetry);
-		 varl=varl->next;
-		 j++;
-		} //varl
-#else
-        //---> interpolation
-        varl = VarList;
-        int k = 0;
-        while (varl) // run along variables
-        {
-          //              shellf[j*num_var+k] = Parallel::global_interp(dim,BP->shape,BP->X,BP->fgfs[varl->data->sgfn],
-          //	  		                                    pox,ordn,varl->data->SoA,Symmetry);
-          f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[k],
-                          pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
-          varl = varl->next;
-          k++;
-        }
-#endif
-      }
-    }
-    if (Bp == ble)
-      break;
-    Bp = Bp->next;
-  }
-
-  if (notfind && myrank == 0)
-  {
-    cout << "ERROR: Patch::Interp_Points fails to find point (";
-    for (int j = 0; j < dim; j++)
-    {
-      cout << XX[j];
-      if (j < dim - 1)
-        cout << ",";
-      else
-        cout << ")";
-    }
-    cout << " on Patch (";
-    for (int j = 0; j < dim; j++)
-    {
-      cout << bbox[j] << "+" << lli[j] * getdX(j);
-      if (j < dim - 1)
-        cout << ",";
-      else
-        cout << ")--";
-    }
-    cout << "(";
-    for (int j = 0; j < dim; j++)
-    {
-      cout << bbox[dim + j] << "-" << uui[j] * getdX(j);
-      if (j < dim - 1)
-        cout << ",";
-      else
-        cout << ")" << endl;
-    }
-#if 0
-       checkBlock();
-#else
-    cout << "splited domains:" << endl;
-    {
-      MyList<Block> *Bp = blb;
-      while (Bp)
-      {
-        Block *BP = Bp->data;
-
-        for (int i = 0; i < dim; i++)
-        {
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-          llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
-          uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
-#else
-#ifdef Cell
-          llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
-          uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-        }
-        cout << "(";
-        for (int j = 0; j < dim; j++)
-        {
-          cout << llb[j] << ":" << uub[j];
-          if (j < dim - 1)
-            cout << ",";
-          else
-            cout << ")" << endl;
-        }
-        if (Bp == ble)
-          break;
-        Bp = Bp->next;
-      }
-    }
-#endif
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-
-  MPI_Allreduce(shellf, Shellf, num_var, MPI_DOUBLE, MPI_SUM, Comm_here);
-
-  delete[] shellf;
-  delete[] DH;
-  delete[] llb;
-  delete[] uub;
-
-  return true;
-}
-// find maximum of abstract value, XX store position for maximum, Shellf store maximum themselvs
-void Patch::Find_Maximum(MyList<var> *VarList, double *XX,
-                         double *Shellf)
-{
-  int myrank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
-  MyList<var> *varl;
-  int num_var = 0;
-  varl = VarList;
-  while (varl)
-  {
-    num_var++;
-    varl = varl->next;
-  }
-
-  double *shellf, *xx;
-  shellf = new double[num_var];
-  xx = new double[dim * num_var];
-  memset(shellf, 0, sizeof(double) * num_var);
-  memset(xx, 0, sizeof(double) * dim * num_var);
-
-  double *DH;
-  int *llb, *uub;
-  DH = new double[dim];
-
-  for (int i = 0; i < dim; i++)
-  {
-    DH[i] = getdX(i);
-  }
-
-  llb = new int[dim];
-  uub = new int[dim];
-
-  MyList<Block> *Bp = blb;
-  while (Bp) // run along Blocks
-  {
-    Block *BP = Bp->data;
-
-    if (myrank == BP->rank)
-    {
-
-      for (int i = 0; i < dim; i++)
-      {
-        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? lli[i] : ghost_width;
-        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? uui[i] : ghost_width;
-      }
-
-      varl = VarList;
-      int k = 0;
-      double tmp, tmpx[dim];
-      while (varl) // run along variables
-      {
-        f_find_maximum(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], tmp, tmpx, llb, uub);
-        if (tmp > shellf[k])
-        {
-          shellf[k] = tmp;
-          for (int i = 0; i < dim; i++)
-            xx[dim * k + i] = tmpx[i];
-        }
-        varl = varl->next;
-        k++;
-      }
-    }
-
-    if (Bp == ble)
-      break;
-    Bp = Bp->next;
-  }
-
-  struct mloc
-  {
-    double val;
-    int rank;
-  };
-
-  mloc *IN, *OUT;
-  IN = new mloc[num_var];
-  OUT = new mloc[num_var];
-  for (int i = 0; i < num_var; i++)
-  {
-    IN[i].val = shellf[i];
-    IN[i].rank = myrank;
-  }
-
-  MPI_Allreduce(IN, OUT, num_var, MPI_DOUBLE_INT, MPI_MAXLOC, MPI_COMM_WORLD);
-
-  for (int i = 0; i < num_var; i++)
-  {
-    Shellf[i] = OUT[i].val;
-    if (myrank != OUT[i].rank)
-      for (int k = 0; k < 3; k++)
-        xx[3 * i + k] = 0;
-  }
-
-  MPI_Allreduce(xx, XX, dim * num_var, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-
-  delete[] IN;
-  delete[] OUT;
-  delete[] shellf;
-  delete[] xx;
-  delete[] DH;
-  delete[] llb;
-  delete[] uub;
-}
-void Patch::Find_Maximum(MyList<var> *VarList, double *XX,
-                         double *Shellf, MPI_Comm Comm_here)
-{
-  int myrank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
-  MyList<var> *varl;
-  int num_var = 0;
-  varl = VarList;
-  while (varl)
-  {
-    num_var++;
-    varl = varl->next;
-  }
-
-  double *shellf, *xx;
-  shellf = new double[num_var];
-  xx = new double[dim * num_var];
-  memset(shellf, 0, sizeof(double) * num_var);
-  memset(xx, 0, sizeof(double) * dim * num_var);
-
-  double *DH;
-  int *llb, *uub;
-  DH = new double[dim];
-
-  for (int i = 0; i < dim; i++)
-  {
-    DH[i] = getdX(i);
-  }
-
-  llb = new int[dim];
-  uub = new int[dim];
-
-  MyList<Block> *Bp = blb;
-  while (Bp) // run along Blocks
-  {
-    Block *BP = Bp->data;
-
-    if (myrank == BP->rank)
-    {
-
-      for (int i = 0; i < dim; i++)
-      {
-        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? lli[i] : ghost_width;
-        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? uui[i] : ghost_width;
-      }
-
-      varl = VarList;
-      int k = 0;
-      double tmp, tmpx[dim];
-      while (varl) // run along variables
-      {
-        f_find_maximum(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], tmp, tmpx, llb, uub);
-        if (tmp > shellf[k])
-        {
-          shellf[k] = tmp;
-          for (int i = 0; i < dim; i++)
-            xx[dim * k + i] = tmpx[i];
-        }
-        varl = varl->next;
-        k++;
-      }
-    }
-
-    if (Bp == ble)
-      break;
-    Bp = Bp->next;
-  }
-
-  struct mloc
-  {
-    double val;
-    int rank;
-  };
-
-  mloc *IN, *OUT;
-  IN = new mloc[num_var];
-  OUT = new mloc[num_var];
-  for (int i = 0; i < num_var; i++)
-  {
-    IN[i].val = shellf[i];
-    IN[i].rank = myrank;
-  }
-
-  MPI_Allreduce(IN, OUT, num_var, MPI_DOUBLE_INT, MPI_MAXLOC, Comm_here);
-
-  for (int i = 0; i < num_var; i++)
-  {
-    Shellf[i] = OUT[i].val;
-    if (myrank != OUT[i].rank)
-      for (int k = 0; k < 3; k++)
-        xx[3 * i + k] = 0;
-  }
-
-  MPI_Allreduce(xx, XX, dim * num_var, MPI_DOUBLE, MPI_SUM, Comm_here);
-
-  delete[] IN;
-  delete[] OUT;
-  delete[] shellf;
-  delete[] xx;
-  delete[] DH;
-  delete[] llb;
-  delete[] uub;
-}
-// if the given point locates in the present Patch return true
-// otherwise return false
-bool Patch::Find_Point(double *XX)
-{
-  double *DH;
-  DH = new double[dim];
-
-  for (int i = 0; i < dim; i++)
-  {
-    DH[i] = getdX(i);
-  }
-
-  for (int i = 0; i < dim; i++)
-  {
-    // has excluded the buffer points
-    if (XX[i] < bbox[i] + lli[i] * DH[i] - DH[i] / 100 || XX[i] > bbox[dim + i] - uui[i] * DH[i] + DH[i] / 100)
-    {
-      delete[] DH;
-      return false; // out of current patch,
-                    // remember to delete the allocated arrays before return!!!
-    }
-  }
-
-  delete[] DH;
-
-  return true;
+
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <cstdlib>
+#include <cstdio>
+#include <string>
+#include <cmath>
+#include <new>
+using namespace std;
+
+#include "misc.h"
+#include "MPatch.h"
+#include "Parallel.h"
+#include "fmisc.h"
+
+Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
+{
+  // Copy the caller's shape/bbox into this Patch; when buflog is set, widen the box by buffer points whose counts are kept in lli (lower side) and uui (upper side).
+  int hbuffer_width = buffer_width; // default buffer width, replaced for level 0 just below
+  if (lev == 0)
+    hbuffer_width = CS_width; // specific for shell-box coupling
+
+  if (DIM != dim) // DIM has to agree with dim, otherwise abort
+  {
+    cout << "dimension is not consistent in Patch construction" << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+  for (int i = 0; i < dim; i++)
+  {
+    shape[i] = shapei[i];
+    bbox[i] = bboxi[i];
+    bbox[dim + i] = bboxi[dim + i];
+    lli[i] = uui[i] = 0; // no buffer points unless buflog adds them below
+    if (buflog)
+    {
+      double DH; // grid spacing in direction i, taken before the upper bound is moved
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+      DH = (bbox[dim + i] - bbox[i]) / (shape[i] - 1);
+#else
+#ifdef Cell
+      DH = (bbox[dim + i] - bbox[i]) / shape[i];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      uui[i] = hbuffer_width; // the upper side always receives the full buffer width
+      bbox[dim + i] = bbox[dim + i] + uui[i] * DH; // move the upper bound outward by uui[i] spacings
+      shape[i] = shape[i] + uui[i]; // and count the added points
+    }
+  }
+
+  if (buflog) // lower-side buffers: their width depends on Symmetry
+  {
+    if (DIM != 3)
+    {
+      cout << "Symmetry in Patch construction only support 3 yet but dim = " << DIM << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+    double tmpb, DH;
+    if (Symmetry > 0) // z direction: the lower bound is limited by 0
+    {
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+      DH = (bbox[5] - bbox[2]) / (shape[2] - 1);
+#else
+#ifdef Cell
+      DH = (bbox[5] - bbox[2]) / shape[2];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      tmpb = Mymax(0, bbox[2] - hbuffer_width * DH); // desired lower z bound; Mymax with 0 presumably keeps it from going below 0
+      lli[2] = int((bbox[2] - tmpb) / DH + 0.4); // spacings between old and new lower bound; the + 0.4 presumably guards the int truncation against round-off
+      bbox[2] = bbox[2] - lli[2] * DH;
+      shape[2] = shape[2] + lli[2];
+      if (lli[2] < hbuffer_width) // a short buffer is only accepted when the lower bound lands on 0
+      {
+        if (feq(bbox[2], 0, DH / 2))
+          lli[2] = 0; // bound is at 0 (feq called with tolerance DH / 2): record no lower buffer points; bbox/shape keep the extension made above
+        else
+        {
+          cout << "Code mistake for lli[2] = " << lli[2] << ", bbox[2] = " << bbox[2] << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+      }
+      if (Symmetry > 1) // x and y lower bounds get the same 0-limited treatment as z
+      {
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        DH = (bbox[3] - bbox[0]) / (shape[0] - 1);
+#else
+#ifdef Cell
+        DH = (bbox[3] - bbox[0]) / shape[0];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+        tmpb = Mymax(0, bbox[0] - hbuffer_width * DH);
+        lli[0] = int((bbox[0] - tmpb) / DH + 0.4);
+        bbox[0] = bbox[0] - lli[0] * DH;
+        shape[0] = shape[0] + lli[0];
+        if (lli[0] < hbuffer_width)
+        {
+          if (feq(bbox[0], 0, DH / 2))
+            lli[0] = 0;
+          else
+          {
+            cout << "Code mistake for lli[0] = " << lli[0] << ", bbox[0] = " << bbox[0] << endl;
+            MPI_Abort(MPI_COMM_WORLD, 1);
+          }
+        }
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        DH = (bbox[4] - bbox[1]) / (shape[1] - 1);
+#else
+#ifdef Cell
+        DH = (bbox[4] - bbox[1]) / shape[1];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+        tmpb = Mymax(0, bbox[1] - hbuffer_width * DH);
+        lli[1] = int((bbox[1] - tmpb) / DH + 0.4);
+        bbox[1] = bbox[1] - lli[1] * DH;
+        shape[1] = shape[1] + lli[1];
+        if (lli[1] < hbuffer_width)
+        {
+          if (feq(bbox[1], 0, DH / 2))
+            lli[1] = 0;
+          else
+          {
+            cout << "Code mistake for lli[1] = " << lli[1] << ", bbox[1] = " << bbox[1] << endl;
+            MPI_Abort(MPI_COMM_WORLD, 1);
+          }
+        }
+      }
+      else // Symmetry == 1: x and y get a full-width lower buffer
+      {
+        for (int i = 0; i < 2; i++)
+        {
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+          DH = (bbox[dim + i] - bbox[i]) / (shape[i] - 1);
+#else
+#ifdef Cell
+          DH = (bbox[dim + i] - bbox[i]) / shape[i];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+          lli[i] = hbuffer_width;
+          bbox[i] = bbox[i] - lli[i] * DH;
+          shape[i] = shape[i] + lli[i];
+        }
+      }
+    }
+    else // Symmetry <= 0: every direction gets a full-width lower buffer
+    {
+      for (int i = 0; i < dim; i++)
+      {
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        DH = (bbox[dim + i] - bbox[i]) / (shape[i] - 1);
+#else
+#ifdef Cell
+        DH = (bbox[dim + i] - bbox[i]) / shape[i];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+        lli[i] = hbuffer_width;
+        bbox[i] = bbox[i] - lli[i] * DH;
+        shape[i] = shape[i] + lli[i];
+      }
+    }
+  }
+
+  blb = ble = 0; // the Block list pointers start out null
+}
+Patch::~Patch()
+{ // empty body: the destructor itself releases nothing
+}
+// Print this Patch's level, shape, resolution and range on rank 0. buflog 1: with buffer points; 0: without (shape and range reduced by lli/uui)
+void Patch::checkPatch(bool buflog)
+{
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  if (myrank == 0) // only rank 0 prints
+  {
+    if (buflog) // report the stored shape and bbox as they are
+    {
+      cout << " belong to level " << lev << endl;
+      cout << " shape: [";
+      for (int i = 0; i < dim; i++)
+      {
+        cout << shape[i];
+        if (i < dim - 1)
+          cout << ",";
+        else
+          cout << "]";
+      }
+      cout << " resolution: [";
+      for (int i = 0; i < dim; i++)
+      {
+        cout << getdX(i);
+        if (i < dim - 1)
+          cout << ",";
+        else
+          cout << "]" << endl;
+      }
+      cout << " range:" << "(";
+      for (int i = 0; i < dim; i++)
+      {
+        cout << bbox[i] << ":" << bbox[dim + i];
+        if (i < dim - 1)
+          cout << ",";
+        else
+          cout << ")" << endl;
+      }
+    }
+    else // report with the lli/uui buffer points taken off both ends
+    {
+      cout << " belong to level " << lev << endl;
+      cout << " shape: [";
+      for (int i = 0; i < dim; i++)
+      {
+        cout << shape[i] - lli[i] - uui[i]; // point count without lower/upper buffer points
+        if (i < dim - 1)
+          cout << ",";
+        else
+          cout << "]";
+      }
+      cout << " resolution: [";
+      for (int i = 0; i < dim; i++)
+      {
+        cout << getdX(i);
+        if (i < dim - 1)
+          cout << ",";
+        else
+          cout << "]" << endl;
+      }
+      cout << " range:" << "(";
+      for (int i = 0; i < dim; i++)
+      {
+        cout << bbox[i] + lli[i] * getdX(i) << ":" << bbox[dim + i] - uui[i] * getdX(i); // bounds moved inward by the buffer widths
+        if (i < dim - 1)
+          cout << ",";
+        else
+          cout << ")" << endl;
+      }
+    }
+  }
+}
+void Patch::checkPatch(bool buflog, const int out_rank)
+{ // Print level, shape, resolution and range of this Patch from rank out_rank only, preceded by an " out_rank = " line; buflog selects with (true) or without (false) buffer points.
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  if (myrank == out_rank) // every other rank prints nothing
+  {
+    cout << " out_rank = " << out_rank << endl;
+    if (buflog) // report the stored shape and bbox as they are
+    {
+      cout << " belong to level " << lev << endl;
+      cout << " shape: [";
+      for (int i = 0; i < dim; i++)
+      {
+        cout << shape[i];
+        if (i < dim - 1)
+          cout << ",";
+        else
+          cout << "]";
+      }
+      cout << " resolution: [";
+      for (int i = 0; i < dim; i++)
+      {
+        cout << getdX(i);
+        if (i < dim - 1)
+          cout << ",";
+        else
+          cout << "]" << endl;
+      }
+      cout << " range:" << "(";
+      for (int i = 0; i < dim; i++)
+      {
+        cout << bbox[i] << ":" << bbox[dim + i];
+        if (i < dim - 1)
+          cout << ",";
+        else
+          cout << ")" << endl;
+      }
+    }
+    else // report with the lli/uui buffer points taken off both ends
+    {
+      cout << " belong to level " << lev << endl;
+      cout << " shape: [";
+      for (int i = 0; i < dim; i++)
+      {
+        cout << shape[i] - lli[i] - uui[i]; // point count without lower/upper buffer points
+        if (i < dim - 1)
+          cout << ",";
+        else
+          cout << "]";
+      }
+      cout << " resolution: [";
+      for (int i = 0; i < dim; i++)
+      {
+        cout << getdX(i);
+        if (i < dim - 1)
+          cout << ",";
+        else
+          cout << "]" << endl;
+      }
+      cout << " range:" << "(";
+      for (int i = 0; i < dim; i++)
+      {
+        cout << bbox[i] + lli[i] * getdX(i) << ":" << bbox[dim + i] - uui[i] * getdX(i); // bounds moved inward by the buffer widths
+        if (i < dim - 1)
+          cout << ",";
+        else
+          cout << ")" << endl;
+      }
+    }
+  }
+}
+void Patch::Interp_Points(MyList<var> *VarList,
+                          int NN, double **XX,
+                          double *Shellf, int Symmetry)
+{ // Interpolate every variable of VarList at the NN points XX[dir][j]; results are laid out as Shellf[j * num_var + k] (point j, variable k) and broadcast over MPI_COMM_WORLD.
+  // NOTE: variables are not synchronized here; make sure that has been done before calling this routine
+  int myrank, nprocs; // nprocs is queried but not used further in this function
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+  int ordn = 2 * ghost_width; // order argument handed to f_global_interp
+  MyList<var> *varl;
+  int num_var = 0;
+  varl = VarList;
+  while (varl) // count the variables in the list
+  {
+    num_var++;
+    varl = varl->next;
+  }
+
+  memset(Shellf, 0, sizeof(double) * NN * num_var); // start from zero; only the owning rank fills a point's entries
+
+  // owner_rank[j] records which MPI rank owns point j
+  // All ranks traverse the same block list so they all agree on ownership
+  int *owner_rank;
+  owner_rank = new int[NN];
+  for (int j = 0; j < NN; j++)
+    owner_rank[j] = -1; // -1 means no owning Block has been found
+
+  double DH[dim], llb[dim], uub[dim]; // grid spacing and the per-Block lower/upper search bounds
+  for (int i = 0; i < dim; i++)
+    DH[i] = getdX(i);
+
+  for (int j = 0; j < NN; j++) // run along points
+  {
+    double pox[dim];
+    for (int i = 0; i < dim; i++)
+    {
+      pox[i] = XX[i][j];
+      if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
+      { // rank 0 alone checks that the point is inside the Patch once the lli/uui buffer points are excluded
+        cout << "Patch::Interp_Points: point (";
+        for (int k = 0; k < dim; k++)
+        {
+          cout << XX[k][j];
+          if (k < dim - 1)
+            cout << ",";
+          else
+            cout << ") is out of current Patch." << endl;
+        }
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+    }
+
+    MyList<Block> *Bp = blb;
+    bool notfind = true;
+    while (notfind && Bp) // run along Blocks
+    {
+      Block *BP = Bp->data;
+
+      bool flag = true;
+      for (int i = 0; i < dim; i++)
+      {
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
+        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
+#else
+#ifdef Cell
+        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
+        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+        if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
+        { // outside [llb - DH / 2, uub + DH / 2] in direction i: not this Block
+          flag = false;
+          break;
+        }
+      }
+
+      if (flag)
+      {
+        notfind = false;
+        owner_rank[j] = BP->rank; // recorded on every rank, not only on the owner
+        if (myrank == BP->rank)
+        {
+          //---> interpolation
+          varl = VarList;
+          int k = 0;
+          while (varl) // run along variables
+          {
+            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
+                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
+            varl = varl->next;
+            k++;
+          }
+        }
+      }
+      if (Bp == ble) // stop once the Block ble has been visited
+        break;
+      Bp = Bp->next;
+    }
+  }
+  // Replace MPI_Allreduce with per-owner MPI_Bcast:
+  // Group consecutive points by owner rank and broadcast each group.
+  // Since each point's data is non-zero only on the owner rank,
+  // Bcast from owner is equivalent to Allreduce(MPI_SUM) but much cheaper.
+  {
+    int j = 0;
+    while (j < NN)
+    {
+      int cur_owner = owner_rank[j];
+      if (cur_owner < 0) // no Block claimed this point
+      {
+        if (myrank == 0) // rank 0 reports the point and the Patch extent, then aborts the run
+        {
+          cout << "ERROR: Patch::Interp_Points fails to find point (";
+          for (int d = 0; d < dim; d++)
+          {
+            cout << XX[d][j];
+            if (d < dim - 1)
+              cout << ",";
+            else
+              cout << ")";
+          }
+          cout << " on Patch (";
+          for (int d = 0; d < dim; d++)
+          {
+            cout << bbox[d] << "+" << lli[d] * DH[d];
+            if (d < dim - 1)
+              cout << ",";
+            else
+              cout << ")--";
+          }
+          cout << "(";
+          for (int d = 0; d < dim; d++)
+          {
+            cout << bbox[dim + d] << "-" << uui[d] * DH[d];
+            if (d < dim - 1)
+              cout << ",";
+            else
+              cout << ")" << endl;
+          }
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+        j++; // ranks other than 0 step over the unowned point
+        continue;
+      }
+      // Find contiguous run of points with the same owner
+      int jstart = j;
+      while (j < NN && owner_rank[j] == cur_owner)
+        j++;
+      int count = (j - jstart) * num_var; // doubles to send: num_var values for each point of the run
+      MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner, MPI_COMM_WORLD);
+    }
+  }
+
+  delete[] owner_rank;
+}
+void Patch::Interp_Points(MyList<var> *VarList,
+                          int NN, double **XX,
+                          double *Shellf, int Symmetry,
+                          int Nmin_consumer, int Nmax_consumer)
+{
+  // Targeted point-to-point overload: the rank owning the Block that contains a
+  // point interpolates it and sends it only to that point's consumer rank, so on
+  // return a rank holds values only for points it owns or consumes (rest stay 0).
+  /*
+  double t_calc_end, t_calc_total = 0;
+  double t_calc_start = MPI_Wtime();*/
+  int myrank, nprocs;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+  // NOTE: Nmin_consumer and Nmax_consumer are not read anywhere in this body.
+  int ordn = 2 * ghost_width;
+  MyList<var> *varl;
+  int num_var = 0;
+  varl = VarList;
+  while (varl)
+  {
+    num_var++;
+    varl = varl->next;
+  }
+  // Shellf is point-major: Shellf[j * num_var + k] is variable k at point j.
+  memset(Shellf, 0, sizeof(double) * NN * num_var);
+
+  // owner_rank[j] records which MPI rank owns point j
+  int *owner_rank;
+  owner_rank = new int[NN];
+  for (int j = 0; j < NN; j++)
+    owner_rank[j] = -1;
+
+  double DH[dim], llb[dim], uub[dim];
+  for (int i = 0; i < dim; i++)
+    DH[i] = getdX(i);
+
+  // --- Interpolation phase: find the Block holding each point, interpolate on its rank ---
+  for (int j = 0; j < NN; j++)
+  {
+    double pox[dim];
+    for (int i = 0; i < dim; i++)
+    {
+      pox[i] = XX[i][j];
+      if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
+      {
+        cout << "Patch::Interp_Points: point (";
+        for (int k = 0; k < dim; k++)
+        {
+          cout << XX[k][j];
+          if (k < dim - 1)
+            cout << ",";
+          else
+            cout << ") is out of current Patch." << endl;
+        }
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+    }
+
+    MyList<Block> *Bp = blb;
+    bool notfind = true;
+    while (notfind && Bp)
+    {
+      Block *BP = Bp->data;
+
+      bool flag = true;
+      for (int i = 0; i < dim; i++)
+      {
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
+        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
+#else
+#ifdef Cell
+        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
+        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+        if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
+        {
+          flag = false;
+          break;
+        }
+      }
+
+      if (flag)
+      {
+        notfind = false;
+        owner_rank[j] = BP->rank;
+        if (myrank == BP->rank)
+        {
+          varl = VarList;
+          int k = 0;
+          while (varl)
+          {
+            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
+                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
+            varl = varl->next;
+            k++;
+          }
+        }
+      }
+      if (Bp == ble)
+        break;
+      Bp = Bp->next;
+    }
+  }
+  /*
+      t_calc_end = MPI_Wtime();
+      t_calc_total = t_calc_end - t_calc_start;*/
+  // --- Error check for unfound points (rank 0 reports and aborts the job) ---
+  for (int j = 0; j < NN; j++)
+  {
+    if (owner_rank[j] < 0 && myrank == 0)
+    {
+      cout << "ERROR: Patch::Interp_Points fails to find point (";
+      for (int d = 0; d < dim; d++)
+      {
+        cout << XX[d][j];
+        if (d < dim - 1)
+          cout << ",";
+        else
+          cout << ")";
+      }
+      cout << " on Patch (";
+      for (int d = 0; d < dim; d++)
+      {
+        cout << bbox[d] << "+" << lli[d] * DH[d];
+        if (d < dim - 1)
+          cout << ",";
+        else
+          cout << ")--";
+      }
+      cout << "(";
+      for (int d = 0; d < dim; d++)
+      {
+        cout << bbox[dim + d] << "-" << uui[d] * DH[d];
+        if (d < dim - 1)
+          cout << ",";
+        else
+          cout << ")" << endl;
+      }
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+  }
+
+  // --- Targeted point-to-point communication phase ---
+  // consumer_rank[j]: the first Lp = NN % nprocs ranks take mp + 1 consecutive points, the rest take mp
+  int *consumer_rank = new int[NN];
+  {
+    int mp = NN / nprocs;
+    int Lp = NN - nprocs * mp;
+    for (int j = 0; j < NN; j++)
+    {
+      if (j < Lp * (mp + 1))
+        consumer_rank[j] = j / (mp + 1);
+      else
+        consumer_rank[j] = Lp + (j - Lp * (mp + 1)) / mp;
+    }
+  }
+
+  // Count sends and recvs per rank
+  int *send_count = new int[nprocs];
+  int *recv_count = new int[nprocs];
+  memset(send_count, 0, sizeof(int) * nprocs);
+  memset(recv_count, 0, sizeof(int) * nprocs);
+
+  for (int j = 0; j < NN; j++)
+  {
+    int own = owner_rank[j];
+    int con = consumer_rank[j];
+    if (own < 0 || own == con)
+      continue; // unfound point (never index the counters with -1) or local — no communication needed
+    if (own == myrank)
+      send_count[con]++;
+    if (con == myrank)
+      recv_count[own]++;
+  }
+
+  // Build send buffers: for each destination rank, pack (index, data) pairs
+  // Each entry: the point index j stored as a double, followed by num_var doubles
+  int total_send = 0, total_recv = 0;
+  int *send_offset = new int[nprocs];
+  int *recv_offset = new int[nprocs];
+  for (int r = 0; r < nprocs; r++)
+  {
+    send_offset[r] = total_send;
+    total_send += send_count[r];
+    recv_offset[r] = total_recv;
+    total_recv += recv_count[r];
+  }
+
+  // Pack send buffers: each message contains (j, data[0..num_var-1]) per point
+  int stride = 1 + num_var; // 1 double for index + num_var doubles for data
+  double *sendbuf = new double[total_send * stride];
+  double *recvbuf = new double[total_recv * stride];
+
+  // Temporary counters for packing
+  int *pack_pos = new int[nprocs];
+  memset(pack_pos, 0, sizeof(int) * nprocs);
+
+  for (int j = 0; j < NN; j++)
+  {
+    int own = owner_rank[j];
+    int con = consumer_rank[j];
+    if (own != myrank || con == myrank)
+      continue;
+    int pos = (send_offset[con] + pack_pos[con]) * stride;
+    sendbuf[pos] = (double)j; // point index
+    for (int v = 0; v < num_var; v++)
+      sendbuf[pos + 1 + v] = Shellf[j * num_var + v];
+    pack_pos[con]++;
+  }
+
+  // Post non-blocking recvs and sends
+  int n_req = 0;
+  for (int r = 0; r < nprocs; r++)
+  {
+    if (recv_count[r] > 0) n_req++;
+    if (send_count[r] > 0) n_req++;
+  }
+
+  MPI_Request *reqs = new MPI_Request[n_req];
+  int req_idx = 0;
+
+  for (int r = 0; r < nprocs; r++)
+  {
+    if (recv_count[r] > 0)
+    {
+      MPI_Irecv(recvbuf + recv_offset[r] * stride,
+                recv_count[r] * stride, MPI_DOUBLE,
+                r, 0, MPI_COMM_WORLD, &reqs[req_idx++]);
+    }
+  }
+  for (int r = 0; r < nprocs; r++)
+  {
+    if (send_count[r] > 0)
+    {
+      MPI_Isend(sendbuf + send_offset[r] * stride,
+                send_count[r] * stride, MPI_DOUBLE,
+                r, 0, MPI_COMM_WORLD, &reqs[req_idx++]);
+    }
+  }
+
+  if (n_req > 0)
+    MPI_Waitall(n_req, reqs, MPI_STATUSES_IGNORE);
+
+  // Unpack recv buffers into Shellf
+  for (int i = 0; i < total_recv; i++)
+  {
+    int pos = i * stride;
+    int j = (int)recvbuf[pos];
+    for (int v = 0; v < num_var; v++)
+      Shellf[j * num_var + v] = recvbuf[pos + 1 + v];
+  }
+
+  delete[] reqs;
+  delete[] sendbuf;
+  delete[] recvbuf;
+  delete[] pack_pos;
+  delete[] send_offset;
+  delete[] recv_offset;
+  delete[] send_count;
+  delete[] recv_count;
+  delete[] consumer_rank;
+  delete[] owner_rank;
+  /*
+  // 4. Gather and print the Top 4 ranks that are really the slowest at doing work
+  struct RankStats {
+    int rank;
+    double calc_time; // net computation time
+  };
+
+  // Build the statistics record of the current process
+  RankStats local_stat;
+  local_stat.rank = myrank;
+  local_stat.calc_time = t_calc_total;
+
+  // Allocate storage for the statistics of all processes
+  RankStats *all_stats = nullptr;
+  if (myrank == 0) {
+    all_stats = new RankStats[nprocs];
+  }
+
+  // Use MPI_Gather to collect the data of all processes on rank 0
+  MPI_Gather(&local_stat, sizeof(RankStats), MPI_BYTE,
+             all_stats, sizeof(RankStats), MPI_BYTE,
+             0, MPI_COMM_WORLD);
+
+  // Prepare the info of the first 4 ranks for output (all ranks take part so they agree after the broadcast)
+  int top10_ranks[10] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
+  double top10_times[10] = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
+  int num_top10 = 0;
+
+  if (myrank == 0) {
+    // Sort by calc_time (net computation time)
+    std::sort(all_stats, all_stats + nprocs, [](const RankStats& a, const RankStats& b) {
+        return a.calc_time > b.calc_time;
+    });
+
+    // Take the first 4
+    num_top10 = (nprocs < 10) ? nprocs : 10;
+    for (int i = 0; i < num_top10; i++) {
+      top10_ranks[i] = all_stats[i].rank;
+      top10_times[i] = all_stats[i].calc_time;
+    }
+
+    printf("\n--- Top %d Ranks by ACTIVE COMPUTATION (CPU Time) ---\n", num_top10);
+    for (int i = 0; i < num_top10; i++) {
+      printf("Rank [%4d]: Calc %.6f s\n", top10_ranks[i], top10_times[i]);
+    }
+
+    // Release the allocated memory
+    delete[] all_stats;
+  }
+
+  // Broadcast the info of the first 4 ranks to all processes
+  MPI_Bcast(&num_top10, 1, MPI_INT, 0, MPI_COMM_WORLD);
+  if (num_top10 > 0) {
+    MPI_Bcast(top10_ranks, 10, MPI_INT, 0, MPI_COMM_WORLD);
+    MPI_Bcast(top10_times, 10, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+  }
+*/
+}
+void Patch::Interp_Points(MyList<var> *VarList,
+                          int NN, double **XX,
+                          double *Shellf, int Symmetry, MPI_Comm Comm_here)
+{
+  // NOTE: variables are not synchronized here; make sure that was done before calling this routine
+  int myrank, lmyrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  MPI_Comm_rank(Comm_here, &lmyrank);
+  // myrank (MPI_COMM_WORLD) is what gets compared with Block::rank; lmyrank (Comm_here) only gates the out-of-Patch report.
+  int ordn = 2 * ghost_width;
+  MyList<var> *varl;
+  int num_var = 0;
+  varl = VarList;
+  while (varl)
+  {
+    num_var++;
+    varl = varl->next;
+  }
+  // Shellf is point-major: Shellf[j * num_var + k] is variable k at point j.
+  memset(Shellf, 0, sizeof(double) * NN * num_var);
+
+  // owner_rank[j] stores the global rank that owns point j
+  int *owner_rank;
+  owner_rank = new int[NN];
+  for (int j = 0; j < NN; j++)
+    owner_rank[j] = -1;
+
+  // Build global-to-local rank translation for Comm_here
+  MPI_Group world_group, local_group;
+  MPI_Comm_group(MPI_COMM_WORLD, &world_group);
+  MPI_Comm_group(Comm_here, &local_group);
+  // NOTE(review): an owner that is not a member of Comm_here would translate to MPI_UNDEFINED below — confirm callers rule that out.
+  double DH[dim], llb[dim], uub[dim];
+  for (int i = 0; i < dim; i++)
+    DH[i] = getdX(i);
+
+  for (int j = 0; j < NN; j++) // run along points
+  {
+    double pox[dim];
+    for (int i = 0; i < dim; i++)
+    {
+      pox[i] = XX[i][j];
+      if (lmyrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
+      {
+        cout << "Patch::Interp_Points: point (";
+        for (int k = 0; k < dim; k++)
+        {
+          cout << XX[k][j];
+          if (k < dim - 1)
+            cout << ",";
+          else
+            cout << ") is out of current Patch." << endl;
+        }
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+    }
+    // Search this Patch's Blocks (blb through ble) for the first one whose trimmed box contains point j.
+    MyList<Block> *Bp = blb;
+    bool notfind = true;
+    while (notfind && Bp) // run along Blocks
+    {
+      Block *BP = Bp->data;
+
+      bool flag = true;
+      for (int i = 0; i < dim; i++)
+      {
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
+        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
+#else
+#ifdef Cell
+        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
+        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+        if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
+        {
+          flag = false;
+          break;
+        }
+      }
+      // flag still true: the point lies within half a grid spacing of [llb, uub] in every direction.
+      if (flag)
+      {
+        notfind = false;
+        owner_rank[j] = BP->rank;
+        if (myrank == BP->rank)
+        {
+          //---> interpolation
+          varl = VarList;
+          int k = 0;
+          while (varl) // run along variables
+          {
+            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
+                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
+            varl = varl->next;
+            k++;
+          }
+        }
+      }
+      if (Bp == ble)
+        break;
+      Bp = Bp->next;
+    }
+  }
+  // Only the owning rank holds non-zero values for a point at this stage; the loop below distributes them.
+  // Walk the points in runs of equal global owner rank, translate that owner to its local rank in Comm_here,
+  // then broadcast each run from its owner via MPI_Bcast on Comm_here
+  {
+    int j = 0;
+    while (j < NN)
+    {
+      int cur_owner_global = owner_rank[j];
+      if (cur_owner_global < 0)
+      {
+        // Point not found — skipped, so its Shellf entries keep the zeros from the memset (error check disabled for sub-communicator levels)
+        j++;
+        continue;
+      }
+      // Translate global rank to local rank in Comm_here
+      int cur_owner_local;
+      MPI_Group_translate_ranks(world_group, 1, &cur_owner_global, local_group, &cur_owner_local);
+
+      // Find contiguous run of points with the same owner
+      int jstart = j;
+      while (j < NN && owner_rank[j] == cur_owner_global)
+        j++;
+      int count = (j - jstart) * num_var;
+      MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner_local, Comm_here);
+    }
+  }
+  // Release the group handles obtained from MPI_Comm_group above.
+  MPI_Group_free(&world_group);
+  MPI_Group_free(&local_group);
+  delete[] owner_rank;
+}
+void Patch::checkBlock()
+{
+  // Only world rank 0 runs the per-Block check; every other rank returns at once.
+  int world_rank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
+  if (world_rank != 0)
+    return;
+  // Visit this Patch's Blocks from blb through ble (inclusive).
+  for (MyList<Block> *node = blb; node; node = node->next)
+  {
+    node->data->checkBlock();
+    // ble marks the last Block to visit; do not follow next beyond it
+    if (node == ble)
+      break;
+  }
+}
+double Patch::getdX(int dir)
+{
+  // Grid spacing of this Patch along direction dir; invalid input aborts the MPI job.
+  const bool dir_ok = (dir >= 0) && (dir < dim);
+  if (!dir_ok)
+  {
+    cout << "Patch::getdX: error input dir = " << dir << ", this Patch has direction (0," << dim - 1 << ")" << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+  const double extent = bbox[dim + dir] - bbox[dir];
+#if defined(Vertex) && defined(Cell)
+#error Both Cell and Vertex are defined
+#elif defined(Vertex)
+  // Vertex build: shape[dir] points span shape[dir] - 1 intervals, so one point is not enough
+  if (shape[dir] == 1)
+  {
+    cout << "Patch::getdX: for direction " << dir << ", this Patch has only one point. Can not determine dX for vertex center grid." << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+  return extent / (shape[dir] - 1);
+#elif defined(Cell)
+  // Cell build: the extent is divided by shape[dir] itself
+  return extent / shape[dir];
+#else
+#error Not define Vertex nor Cell
+#endif
+}
+bool Patch::Interp_ONE_Point(MyList<var> *VarList, double *XX,
+                             double *Shellf, int Symmetry)
+{
+  // NOTE: variables are not synchronized here; make sure that was done before calling this routine
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  // Interpolates every variable of VarList at the single point XX; Shellf[k] receives variable k.
+  int ordn = 2 * ghost_width;
+  MyList<var> *varl;
+  int num_var = 0;
+  varl = VarList;
+  while (varl)
+  {
+    num_var++;
+    varl = varl->next;
+  }
+  // shellf is this rank's contribution: zero unless this rank owns the Block containing XX.
+  double *shellf;
+  shellf = new double[num_var];
+  memset(shellf, 0, sizeof(double) * num_var);
+
+  double *DH, *llb, *uub;
+  DH = new double[dim];
+
+  for (int i = 0; i < dim; i++)
+  {
+    DH[i] = getdX(i);
+  }
+  llb = new double[dim];
+  uub = new double[dim];
+  // Returns false, before any MPI call is made, when XX is outside the Patch interior (DH/100 tolerance).
+  double pox[dim];
+  for (int i = 0; i < dim; i++)
+  {
+    pox[i] = XX[i];
+    // has excluded the buffer points
+    if (XX[i] < bbox[i] + lli[i] * DH[i] - DH[i] / 100 || XX[i] > bbox[dim + i] - uui[i] * DH[i] + DH[i] / 100)
+    {
+      delete[] shellf;
+      delete[] DH;
+      delete[] llb;
+      delete[] uub;
+      return false; // out of current patch,
+                    // remember to delete the allocated arrays before return!!!
+    }
+  }
+  // Search this Patch's Blocks (blb through ble) for the first one whose trimmed box contains XX.
+  MyList<Block> *Bp = blb;
+  bool notfind = true;
+  while (notfind && Bp) // run along Blocks
+  {
+    Block *BP = Bp->data;
+
+    bool flag = true;
+    for (int i = 0; i < dim; i++)
+    {
+// NOTE: our dividing structure is (exclude ghost)
+// -1 0
+//       1  2
+// so (0,1) does not belong to any part for vertex structure
+// here we put (0,0.5) to left part and (0.5,1) to right part
+// BUT for cell structure the bbox is (-1.5,0.5) and (0.5,2.5), there is no missing region at all
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+      llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
+      uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
+#else
+#ifdef Cell
+      llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
+      uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      if (XX[i] - llb[i] < -DH[i] / 2 || XX[i] - uub[i] > DH[i] / 2)
+      {
+        flag = false;
+        break;
+      }
+    }
+    // flag still true: XX lies within half a grid spacing of [llb, uub] in every direction.
+    if (flag)
+    {
+      notfind = false;
+      if (myrank == BP->rank)
+      {
+// test old code
+#if 0
+#define floorint(a) ((a) < 0 ? int(a) - 1 : int(a))
+//---> interpolation
+                int ixl,iyl,izl,ixu,iyu,izu;
+	    	double Delx,Dely,Delz;
+
+		ixl = 1+floorint((pox[0]-BP->X[0][0])/DH[0]);
+	   	iyl = 1+floorint((pox[1]-BP->X[1][0])/DH[1]);
+	   	izl = 1+floorint((pox[2]-BP->X[2][0])/DH[2]);
+
+		int nn=ordn/2;
+
+		ixl = ixl-nn;
+		iyl = iyl-nn;
+		izl = izl-nn;
+	   
+		int tmi;
+		tmi = (Symmetry==2)?-1:0;
+		if(ixl<tmi) ixl=tmi;
+	   	if(iyl<tmi) iyl=tmi;
+		tmi = (Symmetry>0)?-1:0;
+	   	if(izl<tmi) izl=tmi;
+      
+	   	if(ixl+ordn>BP->shape[0]) ixl=BP->shape[0]-ordn;
+	   	if(iyl+ordn>BP->shape[1]) iyl=BP->shape[1]-ordn;
+	   	if(izl+ordn>BP->shape[2]) izl=BP->shape[2]-ordn;
+// support cell center
+		if(ixl>=0) Delx = ( pox[0] - BP->X[0][ixl] )/ DH[0];
+		else       Delx = ( pox[0] + BP->X[0][0] )/ DH[0];
+                if(iyl>=0) Dely = ( pox[1] - BP->X[1][iyl] )/ DH[1];
+		else       Dely = ( pox[1] + BP->X[1][0] )/ DH[1];
+                if(izl>=0) Delz = ( pox[2] - BP->X[2][izl] )/ DH[2];
+		else       Delz = ( pox[2] + BP->X[2][0] )/ DH[2];
+//change to fortran index
+                ixl++;
+	   	iyl++;
+	   	izl++;
+	   	ixu = ixl + ordn - 1;
+	   	iyu = iyl + ordn - 1;
+	   	izu = izl + ordn - 1;
+	    	varl=VarList;
+		int j=0;
+	    	while(varl)
+		{
+                 f_interp_2(BP->shape,BP->fgfs[varl->data->sgfn],shellf[j],ixl,ixu,iyl,iyu,izl,izu,Delx,Dely,Delz,
+                                     ordn,varl->data->SoA,Symmetry);
+		 varl=varl->next;
+		 j++;
+		} //varl
+#else
+        //---> interpolation
+        varl = VarList;
+        int k = 0;
+        while (varl) // run along variables
+        {
+          //              shellf[j*num_var+k] = Parallel::global_interp(dim,BP->shape,BP->X,BP->fgfs[varl->data->sgfn],
+          //	  		                                    pox,ordn,varl->data->SoA,Symmetry);
+          f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[k],
+                          pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
+          varl = varl->next;
+          k++;
+        }
+#endif
+      }
+    }
+    if (Bp == ble)
+      break;
+    Bp = Bp->next;
+  }
+  // No Block matched: rank 0 prints the point, the Patch limits and every Block's limits, then aborts the job.
+  if (notfind && myrank == 0)
+  {
+    cout << "ERROR: Patch::Interp_ONE_Point fails to find point (";
+    for (int j = 0; j < dim; j++)
+    {
+      cout << XX[j];
+      if (j < dim - 1)
+        cout << ",";
+      else
+        cout << ")";
+    }
+    cout << " on Patch (";
+    for (int j = 0; j < dim; j++)
+    {
+      cout << bbox[j] << "+" << lli[j] * getdX(j);
+      if (j < dim - 1)
+        cout << ",";
+      else
+        cout << ")--";
+    }
+    cout << "(";
+    for (int j = 0; j < dim; j++)
+    {
+      cout << bbox[dim + j] << "-" << uui[j] * getdX(j);
+      if (j < dim - 1)
+        cout << ",";
+      else
+        cout << ")" << endl;
+    }
+#if 0
+       checkBlock();
+#else
+    cout << "splited domains:" << endl;
+    {
+      MyList<Block> *Bp = blb;
+      while (Bp)
+      {
+        Block *BP = Bp->data;
+
+        for (int i = 0; i < dim; i++)
+        {
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+          llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
+          uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
+#else
+#ifdef Cell
+          llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
+          uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+        }
+        cout << "(";
+        for (int j = 0; j < dim; j++)
+        {
+          cout << llb[j] << ":" << uub[j];
+          if (j < dim - 1)
+            cout << ",";
+          else
+            cout << ")" << endl;
+        }
+        if (Bp == ble)
+          break;
+        Bp = Bp->next;
+      }
+    }
+#endif
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+  // Ranks that did not interpolate contribute zeros, so the sum hands the owner's values to every rank.
+  MPI_Allreduce(shellf, Shellf, num_var, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+
+  delete[] shellf;
+  delete[] DH;
+  delete[] llb;
+  delete[] uub;
+
+  return true;
+}
+bool Patch::Interp_ONE_Point(MyList<var> *VarList, double *XX,
+                             double *Shellf, int Symmetry, MPI_Comm Comm_here)
+{
+  // NOTE: variables are not synchronized here; make sure that was done before calling this routine
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  // Interpolates every variable of VarList at the single point XX; the result is combined over Comm_here.
+  int ordn = 2 * ghost_width;
+  MyList<var> *varl;
+  int num_var = 0;
+  varl = VarList;
+  while (varl)
+  {
+    num_var++;
+    varl = varl->next;
+  }
+  // shellf is this rank's contribution: zero unless this rank owns the Block containing XX.
+  double *shellf;
+  shellf = new double[num_var];
+  memset(shellf, 0, sizeof(double) * num_var);
+
+  double *DH, *llb, *uub;
+  DH = new double[dim];
+
+  for (int i = 0; i < dim; i++)
+  {
+    DH[i] = getdX(i);
+  }
+  llb = new double[dim];
+  uub = new double[dim];
+  // Returns false, before any MPI call is made, when XX is outside the Patch interior (DH/100 tolerance).
+  double pox[dim];
+  for (int i = 0; i < dim; i++)
+  {
+    pox[i] = XX[i];
+    // has excluded the buffer points
+    if (XX[i] < bbox[i] + lli[i] * DH[i] - DH[i] / 100 || XX[i] > bbox[dim + i] - uui[i] * DH[i] + DH[i] / 100)
+    {
+      delete[] shellf;
+      delete[] DH;
+      delete[] llb;
+      delete[] uub;
+      return false; // out of current patch,
+                    // remember to delete the allocated arrays before return!!!
+    }
+  }
+  // Search this Patch's Blocks (blb through ble) for the first one whose trimmed box contains XX.
+  MyList<Block> *Bp = blb;
+  bool notfind = true;
+  while (notfind && Bp) // run along Blocks
+  {
+    Block *BP = Bp->data;
+
+    bool flag = true;
+    for (int i = 0; i < dim; i++)
+    {
+// NOTE: our dividing structure is (exclude ghost)
+// -1 0
+//       1  2
+// so (0,1) does not belong to any part for vertex structure
+// here we put (0,0.5) to left part and (0.5,1) to right part
+// BUT for cell structure the bbox is (-1.5,0.5) and (0.5,2.5), there is no missing region at all
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+      llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
+      uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
+#else
+#ifdef Cell
+      llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
+      uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      if (XX[i] - llb[i] < -DH[i] / 2 || XX[i] - uub[i] > DH[i] / 2)
+      {
+        flag = false;
+        break;
+      }
+    }
+    // flag still true: XX lies within half a grid spacing of [llb, uub] in every direction.
+    if (flag)
+    {
+      notfind = false;
+      if (myrank == BP->rank)
+      {
+// test old code
+#if 0
+#define floorint(a) ((a) < 0 ? int(a) - 1 : int(a))
+//---> interpolation
+                int ixl,iyl,izl,ixu,iyu,izu;
+	    	double Delx,Dely,Delz;
+
+		ixl = 1+floorint((pox[0]-BP->X[0][0])/DH[0]);
+	   	iyl = 1+floorint((pox[1]-BP->X[1][0])/DH[1]);
+	   	izl = 1+floorint((pox[2]-BP->X[2][0])/DH[2]);
+
+		int nn=ordn/2;
+
+		ixl = ixl-nn;
+		iyl = iyl-nn;
+		izl = izl-nn;
+	   
+		int tmi;
+		tmi = (Symmetry==2)?-1:0;
+		if(ixl<tmi) ixl=tmi;
+	   	if(iyl<tmi) iyl=tmi;
+		tmi = (Symmetry>0)?-1:0;
+	   	if(izl<tmi) izl=tmi;
+      
+	   	if(ixl+ordn>BP->shape[0]) ixl=BP->shape[0]-ordn;
+	   	if(iyl+ordn>BP->shape[1]) iyl=BP->shape[1]-ordn;
+	   	if(izl+ordn>BP->shape[2]) izl=BP->shape[2]-ordn;
+// support cell center
+		if(ixl>=0) Delx = ( pox[0] - BP->X[0][ixl] )/ DH[0];
+		else       Delx = ( pox[0] + BP->X[0][0] )/ DH[0];
+                if(iyl>=0) Dely = ( pox[1] - BP->X[1][iyl] )/ DH[1];
+		else       Dely = ( pox[1] + BP->X[1][0] )/ DH[1];
+                if(izl>=0) Delz = ( pox[2] - BP->X[2][izl] )/ DH[2];
+		else       Delz = ( pox[2] + BP->X[2][0] )/ DH[2];
+//change to fortran index
+                ixl++;
+	   	iyl++;
+	   	izl++;
+	   	ixu = ixl + ordn - 1;
+	   	iyu = iyl + ordn - 1;
+	   	izu = izl + ordn - 1;
+	    	varl=VarList;
+		int j=0;
+	    	while(varl)
+		{
+                 f_interp_2(BP->shape,BP->fgfs[varl->data->sgfn],shellf[j],ixl,ixu,iyl,iyu,izl,izu,Delx,Dely,Delz,
+                                     ordn,varl->data->SoA,Symmetry);
+		 varl=varl->next;
+		 j++;
+		} //varl
+#else
+        //---> interpolation
+        varl = VarList;
+        int k = 0;
+        while (varl) // run along variables
+        {
+          //              shellf[j*num_var+k] = Parallel::global_interp(dim,BP->shape,BP->X,BP->fgfs[varl->data->sgfn],
+          //	  		                                    pox,ordn,varl->data->SoA,Symmetry);
+          f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[k],
+                          pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
+          varl = varl->next;
+          k++;
+        }
+#endif
+      }
+    }
+    if (Bp == ble)
+      break;
+    Bp = Bp->next;
+  }
+  // NOTE(review): the report/abort is driven by MPI_COMM_WORLD rank 0 although the reduction runs on Comm_here — confirm that rank always takes part.
+  if (notfind && myrank == 0)
+  {
+    cout << "ERROR: Patch::Interp_ONE_Point fails to find point (";
+    for (int j = 0; j < dim; j++)
+    {
+      cout << XX[j];
+      if (j < dim - 1)
+        cout << ",";
+      else
+        cout << ")";
+    }
+    cout << " on Patch (";
+    for (int j = 0; j < dim; j++)
+    {
+      cout << bbox[j] << "+" << lli[j] * getdX(j);
+      if (j < dim - 1)
+        cout << ",";
+      else
+        cout << ")--";
+    }
+    cout << "(";
+    for (int j = 0; j < dim; j++)
+    {
+      cout << bbox[dim + j] << "-" << uui[j] * getdX(j);
+      if (j < dim - 1)
+        cout << ",";
+      else
+        cout << ")" << endl;
+    }
+#if 0
+       checkBlock();
+#else
+    cout << "splited domains:" << endl;
+    {
+      MyList<Block> *Bp = blb;
+      while (Bp)
+      {
+        Block *BP = Bp->data;
+
+        for (int i = 0; i < dim; i++)
+        {
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+          llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
+          uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
+#else
+#ifdef Cell
+          llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
+          uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+        }
+        cout << "(";
+        for (int j = 0; j < dim; j++)
+        {
+          cout << llb[j] << ":" << uub[j];
+          if (j < dim - 1)
+            cout << ",";
+          else
+            cout << ")" << endl;
+        }
+        if (Bp == ble)
+          break;
+        Bp = Bp->next;
+      }
+    }
+#endif
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+  // Ranks that did not interpolate contribute zeros, so the sum hands the owner's values to every rank of Comm_here.
+  MPI_Allreduce(shellf, Shellf, num_var, MPI_DOUBLE, MPI_SUM, Comm_here);
+
+  delete[] shellf;
+  delete[] DH;
+  delete[] llb;
+  delete[] uub;
+
+  return true;
+}
+// Find the maximum of each variable over this Patch: Shellf[k] receives the maximum, XX[dim * k + i] its position
+void Patch::Find_Maximum(MyList<var> *VarList, double *XX,
+                         double *Shellf)
+{
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  // Count the variables; per-variable results are indexed by their position k in VarList.
+  MyList<var> *varl;
+  int num_var = 0;
+  varl = VarList;
+  while (varl)
+  {
+    num_var++;
+    varl = varl->next;
+  }
+  // Local (this rank only) maxima and their positions, both starting from zero.
+  double *shellf, *xx;
+  shellf = new double[num_var];
+  xx = new double[dim * num_var];
+  memset(shellf, 0, sizeof(double) * num_var);
+  memset(xx, 0, sizeof(double) * dim * num_var);
+
+  double *DH;
+  int *llb, *uub;
+  DH = new double[dim];
+
+  for (int i = 0; i < dim; i++)
+  {
+    DH[i] = getdX(i);
+  }
+
+  llb = new int[dim];
+  uub = new int[dim];
+
+  MyList<Block> *Bp = blb;
+  while (Bp) // run along Blocks
+  {
+    Block *BP = Bp->data;
+
+    if (myrank == BP->rank)
+    {
+      // Points skipped at each end: lli/uui on a face shared with the Patch, ghost_width on an interior face.
+      for (int i = 0; i < dim; i++)
+      {
+        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? lli[i] : ghost_width;
+        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? uui[i] : ghost_width;
+      }
+
+      varl = VarList;
+      int k = 0;
+      double tmp, tmpx[dim];
+      while (varl) // run along variables
+      {
+        f_find_maximum(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], tmp, tmpx, llb, uub);
+        if (tmp > shellf[k])
+        {
+          shellf[k] = tmp;
+          for (int i = 0; i < dim; i++)
+            xx[dim * k + i] = tmpx[i];
+        }
+        varl = varl->next;
+        k++;
+      }
+    }
+
+    if (Bp == ble)
+      break;
+    Bp = Bp->next;
+  }
+  // (value, rank) pair in the layout MPI_DOUBLE_INT expects for MPI_MAXLOC.
+  struct mloc
+  {
+    double val;
+    int rank;
+  };
+
+  mloc *IN, *OUT;
+  IN = new mloc[num_var];
+  OUT = new mloc[num_var];
+  for (int i = 0; i < num_var; i++)
+  {
+    IN[i].val = shellf[i];
+    IN[i].rank = myrank;
+  }
+
+  MPI_Allreduce(IN, OUT, num_var, MPI_DOUBLE_INT, MPI_MAXLOC, MPI_COMM_WORLD);
+  // Every rank except the winner zeroes its position, so the sum below reproduces the winner's coordinates.
+  for (int i = 0; i < num_var; i++)
+  {
+    Shellf[i] = OUT[i].val;
+    if (myrank != OUT[i].rank)
+      for (int k = 0; k < dim; k++)
+        xx[dim * i + k] = 0; // stride is dim, matching how xx is allocated and filled above
+  }
+
+  MPI_Allreduce(xx, XX, dim * num_var, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+
+  delete[] IN;
+  delete[] OUT;
+  delete[] shellf;
+  delete[] xx;
+  delete[] DH;
+  delete[] llb;
+  delete[] uub;
+}
+void Patch::Find_Maximum(MyList<var> *VarList, double *XX,
+                         double *Shellf, MPI_Comm Comm_here)
+{
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  // Maximum of each variable over this Patch, reduced over Comm_here: Shellf[k] is the value, XX[dim * k + i] its position.
+  MyList<var> *varl;
+  int num_var = 0;
+  varl = VarList;
+  while (varl)
+  {
+    num_var++;
+    varl = varl->next;
+  }
+  // Local (this rank only) maxima and their positions, both starting from zero.
+  double *shellf, *xx;
+  shellf = new double[num_var];
+  xx = new double[dim * num_var];
+  memset(shellf, 0, sizeof(double) * num_var);
+  memset(xx, 0, sizeof(double) * dim * num_var);
+
+  double *DH;
+  int *llb, *uub;
+  DH = new double[dim];
+
+  for (int i = 0; i < dim; i++)
+  {
+    DH[i] = getdX(i);
+  }
+
+  llb = new int[dim];
+  uub = new int[dim];
+
+  MyList<Block> *Bp = blb;
+  while (Bp) // run along Blocks
+  {
+    Block *BP = Bp->data;
+
+    if (myrank == BP->rank)
+    {
+      // Points skipped at each end: lli/uui on a face shared with the Patch, ghost_width on an interior face.
+      for (int i = 0; i < dim; i++)
+      {
+        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? lli[i] : ghost_width;
+        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? uui[i] : ghost_width;
+      }
+
+      varl = VarList;
+      int k = 0;
+      double tmp, tmpx[dim];
+      while (varl) // run along variables
+      {
+        f_find_maximum(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], tmp, tmpx, llb, uub);
+        if (tmp > shellf[k])
+        {
+          shellf[k] = tmp;
+          for (int i = 0; i < dim; i++)
+            xx[dim * k + i] = tmpx[i];
+        }
+        varl = varl->next;
+        k++;
+      }
+    }
+
+    if (Bp == ble)
+      break;
+    Bp = Bp->next;
+  }
+  // (value, rank) pair in the layout MPI_DOUBLE_INT expects; the rank stored is the MPI_COMM_WORLD rank.
+  struct mloc
+  {
+    double val;
+    int rank;
+  };
+
+  mloc *IN, *OUT;
+  IN = new mloc[num_var];
+  OUT = new mloc[num_var];
+  for (int i = 0; i < num_var; i++)
+  {
+    IN[i].val = shellf[i];
+    IN[i].rank = myrank;
+  }
+
+  MPI_Allreduce(IN, OUT, num_var, MPI_DOUBLE_INT, MPI_MAXLOC, Comm_here);
+  // Every rank except the winner zeroes its position, so the sum below reproduces the winner's coordinates.
+  for (int i = 0; i < num_var; i++)
+  {
+    Shellf[i] = OUT[i].val;
+    if (myrank != OUT[i].rank)
+      for (int k = 0; k < dim; k++)
+        xx[dim * i + k] = 0; // stride is dim, matching how xx is allocated and filled above
+  }
+
+  MPI_Allreduce(xx, XX, dim * num_var, MPI_DOUBLE, MPI_SUM, Comm_here);
+
+  delete[] IN;
+  delete[] OUT;
+  delete[] shellf;
+  delete[] xx;
+  delete[] DH;
+  delete[] llb;
+  delete[] uub;
+}
+// Return true when the given point lies inside the present Patch, with lli/uui
+// grid spacings trimmed off the lower/upper faces and a DH/100 tolerance; else false.
+bool Patch::Find_Point(double *XX)
+{
+  // All spacings are still evaluated up front, so getdX's own validity checks
+  // run for every direction regardless of where the point lies.
+  // A fixed-size automatic array (as Interp_Points uses) replaces the former
+  // new[]/delete[] pair, so no exit path has to remember to free anything.
+  double DH[dim];
+  for (int i = 0; i < dim; i++)
+  {
+    DH[i] = getdX(i);
+  }
+
+  for (int i = 0; i < dim; i++)
+  {
+    // has excluded the buffer points
+    const double lower = bbox[i] + lli[i] * DH[i] - DH[i] / 100;
+    const double upper = bbox[dim + i] - uui[i] * DH[i] + DH[i] / 100;
+    if (XX[i] < lower || XX[i] > upper)
+    {
+      return false; // out of current patch
+    }
+  }
+
+  return true;
 }
\ No newline at end of file
diff --git a/AMSS_NCKU_source/NullShellPatch.h b/AMSS_NCKU_source/NullShellPatch.h
index 26ff030..bad52b3 100644
--- a/AMSS_NCKU_source/NullShellPatch.h
+++ b/AMSS_NCKU_source/NullShellPatch.h
@@ -24,6 +24,7 @@ using namespace std;
 #endif
 
 #include <mpi.h>
+#include <memory.h>
 #include "MyList.h"
 #include "Block.h"
 #include "Parallel.h"
diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C
index a9fb3cd..20d70f2 100644
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -1,6484 +1,7128 @@
-
-#include "Parallel.h"
-#include "fmisc.h"
-#include "prolongrestrict.h"
-#include "misc.h"
-#include "parameters.h"
-
-int Parallel::partition1(int &nx, int split_size, int min_width, int cpusize, int shape) // special for 1 diemnsion
-{
-  nx = Mymax(1, shape / min_width);
-  nx = Mymin(cpusize, nx);
-
-  return nx;
-}
-int Parallel::partition2(int *nxy, int split_size, int *min_width, int cpusize, int *shape) // special for 2 diemnsions
-{
-#define SEARCH_SIZE 5
-  int i, j, nx, ny;
-  int maxnx, maxny;
-  int mnx, mny;
-  int dn, hmin_width, cmin_width;
-  int cnx, cny;
-  double fx, fy;
-  int block_size;
-  int n;
-
-  block_size = shape[0] * shape[1];
-  n = Mymax(1, (block_size + split_size / 2) / split_size);
-
-  maxnx = Mymax(1, shape[0] / min_width[0]);
-  maxnx = Mymin(cpusize, maxnx);
-  maxny = Mymax(1, shape[1] / min_width[1]);
-  maxny = Mymin(cpusize, maxny);
-  fx = (double)shape[0] / (shape[0] + shape[1]);
-  fy = (double)shape[1] / (shape[0] + shape[1]);
-  nx = mnx = Mymax(1, Mymin(maxnx, (int)(sqrt(double(n)) * fx / fy)));
-  ny = mny = Mymax(1, Mymin(maxny, (int)(sqrt(double(n)) * fy / fx)));
-  dn = abs(n - nx * ny);
-  hmin_width = Mymin(shape[0] / nx, shape[1] / ny);
-  for (cny = Mymax(1, mny - SEARCH_SIZE); cny <= (Mymin(mny + SEARCH_SIZE, maxny)); cny++)
-    for (cnx = Mymax(1, mnx - SEARCH_SIZE); cnx <= (Mymin(mnx + SEARCH_SIZE, maxnx)); cnx++)
-    {
-      cmin_width = Mymin(shape[0] / cnx, shape[1] / cny);
-      if (dn > abs(n - cnx * cny) || (dn == abs(n - cnx * cny) && cmin_width > hmin_width))
-      {
-        dn = abs(n - cnx * cny);
-        nx = cnx;
-        ny = cny;
-        hmin_width = cmin_width;
-      }
-    }
-
-  nxy[0] = nx;
-  nxy[1] = ny;
-
-  return nx * ny;
-#undef SEARCH_SIZE
-}
-int Parallel::partition3(int *nxyz, int split_size, int *min_width, int cpusize, int *shape) // special for 3 diemnsions
-#if 1                                                                                        // algrithsm from Pretorius
-{
-//	cout<<split_size<<endl<<min_width[0]<<endl<<min_width[1]<<endl<<min_width[2]<<endl
-//            <<shape[0]<<endl<<shape[1]<<endl<<shape[2]<<endl<<cpusize<<endl;
-#define SEARCH_SIZE 5
-  int i, j, k, nx, ny, nz;
-  int maxnx, maxny, maxnz;
-  int mnx, mny, mnz;
-  int dn, hmin_width, cmin_width;
-  int cnx, cny, cnz;
-  double fx, fy, fz, max_fxfy, max_fxfz, max_fyfz;
-  int block_size;
-  int n;
-
-  block_size = shape[0] * shape[1] * shape[2];
-  n = Mymax(1, (block_size + split_size / 2) / split_size);
-
-  maxnx = Mymax(1, shape[0] / min_width[0]);
-  maxnx = Mymin(cpusize, maxnx);
-  maxny = Mymax(1, shape[1] / min_width[1]);
-  maxny = Mymin(cpusize, maxny);
-  maxnz = Mymax(1, shape[2] / min_width[2]);
-  maxnz = Mymin(cpusize, maxnz);
-  fx = (double)shape[0] / (shape[0] + shape[1] + shape[2]);
-  fy = (double)shape[1] / (shape[0] + shape[1] + shape[2]);
-  fz = (double)shape[2] / (shape[0] + shape[1] + shape[2]);
-  max_fxfy = Mymax(fx, fy);
-  max_fxfz = Mymax(fx, fz);
-  max_fyfz = Mymax(fy, fz);
-  nx = mnx = Mymax(1, Mymin(maxnx, (int)(pow(n, 1.0 / 3.0) * fx / max_fyfz)));
-  ny = mny = Mymax(1, Mymin(maxny, (int)(pow(n, 1.0 / 3.0) * fy / max_fxfz)));
-  nz = mnz = Mymax(1, Mymin(maxnz, (int)(pow(n, 1.0 / 3.0) * fz / max_fxfy)));
-  dn = abs(n - nx * ny * nz);
-  hmin_width = Mymin(shape[2] / nz, shape[1] / ny);
-  hmin_width = Mymin(hmin_width, shape[0] / nx);
-  for (cnz = Mymax(1, mnz - SEARCH_SIZE); cnz <= (Mymin(mnz + SEARCH_SIZE, maxnz)); cnz++)
-    for (cny = Mymax(1, mny - SEARCH_SIZE); cny <= (Mymin(mny + SEARCH_SIZE, maxny)); cny++)
-      for (cnx = Mymax(1, mnx - SEARCH_SIZE); cnx <= (Mymin(mnx + SEARCH_SIZE, maxnx)); cnx++)
-      {
-        cmin_width = Mymin(shape[2] / cnz, shape[1] / cny);
-        cmin_width = Mymin(cmin_width, shape[0] / cnx);
-        if (dn > abs(n - cnx * cny * cnz) || (dn == abs(n - cnx * cny * cnz) && cmin_width > hmin_width))
-        {
-          dn = abs(n - cnx * cny * cnz);
-          nx = cnx;
-          ny = cny;
-          nz = cnz;
-          hmin_width = cmin_width;
-        }
-      }
-
-  nxyz[0] = nx;
-  nxyz[1] = ny;
-  nxyz[2] = nz;
-
-  return nx * ny * nz;
-#undef SEARCH_SIZE
-}
-#elif 1 // Zhihui's idea one on 2013-09-25
-{
-  int nx, ny, nz;
-  int hmin_width;
-  hmin_width = Mymin(min_width[0], min_width[1]);
-  hmin_width = Mymin(hmin_width, min_width[2]);
-  nx = shape[0] / hmin_width;
-  if (nx * hmin_width < shape[0])
-    nx++;
-  ny = shape[1] / hmin_width;
-  if (ny * hmin_width < shape[1])
-    ny++;
-  nz = shape[2] / hmin_width;
-  if (nz * hmin_width < shape[2])
-    nz++;
-  while (nx * ny * nz > cpusize)
-  {
-    hmin_width++;
-    nx = shape[0] / hmin_width;
-    if (nx * hmin_width < shape[0])
-      nx++;
-    ny = shape[1] / hmin_width;
-    if (ny * hmin_width < shape[1])
-      ny++;
-    nz = shape[2] / hmin_width;
-    if (nz * hmin_width < shape[2])
-      nz++;
-  }
-
-  nxyz[0] = nx;
-  nxyz[1] = ny;
-  nxyz[2] = nz;
-
-  return nx * ny * nz;
-}
-#elif 1 // Zhihui's idea two on 2013-09-25
-{
-  int nx, ny, nz;
-  const int hmin_width = 8; // for example we use 8
-  nx = shape[0] / hmin_width;
-  if (nx * hmin_width < shape[0])
-    nx++;
-  ny = shape[1] / hmin_width;
-  if (ny * hmin_width < shape[1])
-    ny++;
-  nz = shape[2] / hmin_width;
-  if (nz * hmin_width < shape[2])
-    nz++;
-
-  nxyz[0] = nx;
-  nxyz[1] = ny;
-  nxyz[2] = nz;
-
-  return nx * ny * nz;
-}
-#endif
-// distribute the data to cprocessors
-#if (PSTR == 0)
-MyList<Block> *Parallel::distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
-                                    bool periodic, int nodes)
-{
-#ifdef USE_GPU_DIVIDE
-  double cpu_part, gpu_part;
-  map<string, double>::iterator iter;
-  iter = parameters::dou_par.find("cpu part");
-  if (iter != parameters::dou_par.end())
-  {
-    cpu_part = iter->second;
-  }
-  else
-  {
-    int myrank;
-    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-    // read parameter from file
-    const int LEN = 256;
-    char pline[LEN];
-    string str, sgrp, skey, sval;
-    int sind;
-    char pname[50];
-    {
-      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
-      if (iter != parameters::str_par.end())
-      {
-        strcpy(pname, (iter->second).c_str());
-      }
-      else
-      {
-        cout << "Error inputpar" << endl;
-        exit(0);
-      }
-    }
-    ifstream inf(pname, ifstream::in);
-    if (!inf.good() && myrank == 0)
-    {
-      cout << "Can not open parameter file " << pname << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-
-    for (int i = 1; inf.good(); i++)
-    {
-      inf.getline(pline, LEN);
-      str = pline;
-
-      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
-      if (status == -1)
-      {
-        cout << "error reading parameter file " << pname << " in line " << i << endl;
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-      else if (status == 0)
-        continue;
-
-      if (sgrp == "ABE")
-      {
-        if (skey == "cpu part")
-          cpu_part = atof(sval.c_str());
-      }
-    }
-    inf.close();
-
-    parameters::dou_par.insert(map<string, double>::value_type("cpu part", cpu_part));
-  }
-  iter = parameters::dou_par.find("gpu part");
-  if (iter != parameters::dou_par.end())
-  {
-    gpu_part = iter->second;
-  }
-  else
-  {
-    int myrank;
-    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-    // read parameter from file
-    const int LEN = 256;
-    char pline[LEN];
-    string str, sgrp, skey, sval;
-    int sind;
-    char pname[50];
-    {
-      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
-      if (iter != parameters::str_par.end())
-      {
-        strcpy(pname, (iter->second).c_str());
-      }
-      else
-      {
-        cout << "Error inputpar" << endl;
-        exit(0);
-      }
-    }
-    ifstream inf(pname, ifstream::in);
-    if (!inf.good() && myrank == 0)
-    {
-      cout << "Can not open parameter file " << pname << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-
-    for (int i = 1; inf.good(); i++)
-    {
-      inf.getline(pline, LEN);
-      str = pline;
-
-      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
-      if (status == -1)
-      {
-        cout << "error reading parameter file " << pname << " in line " << i << endl;
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-      else if (status == 0)
-        continue;
-
-      if (sgrp == "ABE")
-      {
-        if (skey == "gpu part")
-          gpu_part = atof(sval.c_str());
-      }
-    }
-    inf.close();
-
-    parameters::dou_par.insert(map<string, double>::value_type("gpu part", gpu_part));
-  }
-
-  if (nodes == 0)
-    nodes = cpusize / 2;
-#else
-  if (nodes == 0)
-    nodes = cpusize;
-#endif
-
-  if (dim != 3)
-  {
-    cout << "distrivute: now we only support 3-dimension" << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-
-  MyList<Block> *BlL = 0;
-
-  int split_size, min_size, block_size = 0;
-
-  int min_width = 2 * Mymax(ghost_width, buffer_width);
-  int nxyz[dim], mmin_width[dim], min_shape[dim];
-
-  MyList<Patch> *PLi = PatchLIST;
-  for (int i = 0; i < dim; i++)
-    min_shape[i] = PLi->data->shape[i];
-  int lev = PLi->data->lev;
-  PLi = PLi->next;
-  while (PLi)
-  {
-    Patch *PP = PLi->data;
-    for (int i = 0; i < dim; i++)
-      min_shape[i] = Mymin(min_shape[i], PP->shape[i]);
-    if (lev != PLi->data->lev)
-      cout << "Parallel::distribute CAUSTION: meet Patches for different level: " << lev << " and " << PLi->data->lev << endl;
-    PLi = PLi->next;
-  }
-
-  for (int i = 0; i < dim; i++)
-    mmin_width[i] = Mymin(min_width, min_shape[i]);
-
-  min_size = mmin_width[0];
-  for (int i = 1; i < dim; i++)
-    min_size = min_size * mmin_width[i];
-
-  PLi = PatchLIST;
-  while (PLi)
-  {
-    Patch *PP = PLi->data;
-    //    PP->checkPatch(true);
-    int bs = PP->shape[0];
-    for (int i = 1; i < dim; i++)
-      bs = bs * PP->shape[i];
-    block_size = block_size + bs;
-    PLi = PLi->next;
-  }
-  split_size = Mymax(min_size, block_size / nodes);
-  split_size = Mymax(1, split_size);
-
-  int n_rank = 0;
-  PLi = PatchLIST;
-  int reacpu = 0;
-  while (PLi)
-  {
-    Patch *PP = PLi->data;
-
-    reacpu += partition3(nxyz, split_size, mmin_width, nodes, PP->shape);
-
-    Block *ng0, *ng;
-    int shape_here[dim], ibbox_here[2 * dim];
-    double bbox_here[2 * dim], dd;
-
-    // ibbox : 0,...N-1
-    for (int i = 0; i < nxyz[0]; i++)
-      for (int j = 0; j < nxyz[1]; j++)
-        for (int k = 0; k < nxyz[2]; k++)
-        {
-          ibbox_here[0] = (PP->shape[0] * i) / nxyz[0];
-          ibbox_here[3] = (PP->shape[0] * (i + 1)) / nxyz[0] - 1;
-          ibbox_here[1] = (PP->shape[1] * j) / nxyz[1];
-          ibbox_here[4] = (PP->shape[1] * (j + 1)) / nxyz[1] - 1;
-          ibbox_here[2] = (PP->shape[2] * k) / nxyz[2];
-          ibbox_here[5] = (PP->shape[2] * (k + 1)) / nxyz[2] - 1;
-
-          if (periodic)
-          {
-            ibbox_here[0] = ibbox_here[0] - ghost_width;
-            ibbox_here[3] = ibbox_here[3] + ghost_width;
-            ibbox_here[1] = ibbox_here[1] - ghost_width;
-            ibbox_here[4] = ibbox_here[4] + ghost_width;
-            ibbox_here[2] = ibbox_here[2] - ghost_width;
-            ibbox_here[5] = ibbox_here[5] + ghost_width;
-          }
-          else
-          {
-            ibbox_here[0] = Mymax(0, ibbox_here[0] - ghost_width);
-            ibbox_here[3] = Mymin(PP->shape[0] - 1, ibbox_here[3] + ghost_width);
-            ibbox_here[1] = Mymax(0, ibbox_here[1] - ghost_width);
-            ibbox_here[4] = Mymin(PP->shape[1] - 1, ibbox_here[4] + ghost_width);
-            ibbox_here[2] = Mymax(0, ibbox_here[2] - ghost_width);
-            ibbox_here[5] = Mymin(PP->shape[2] - 1, ibbox_here[5] + ghost_width);
-          }
-
-          shape_here[0] = ibbox_here[3] - ibbox_here[0] + 1;
-          shape_here[1] = ibbox_here[4] - ibbox_here[1] + 1;
-          shape_here[2] = ibbox_here[5] - ibbox_here[2] + 1;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-          // 0--4, 5--10
-          dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1);
-          bbox_here[0] = PP->bbox[0] + ibbox_here[0] * dd;
-          bbox_here[3] = PP->bbox[0] + ibbox_here[3] * dd;
-
-          dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1);
-          bbox_here[1] = PP->bbox[1] + ibbox_here[1] * dd;
-          bbox_here[4] = PP->bbox[1] + ibbox_here[4] * dd;
-
-          dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1);
-          bbox_here[2] = PP->bbox[2] + ibbox_here[2] * dd;
-          bbox_here[5] = PP->bbox[2] + ibbox_here[5] * dd;
-#else
-#ifdef Cell
-          // 0--5, 5--10
-          dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0];
-          bbox_here[0] = PP->bbox[0] + (ibbox_here[0]) * dd;
-          bbox_here[3] = PP->bbox[0] + (ibbox_here[3] + 1) * dd;
-
-          dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1];
-          bbox_here[1] = PP->bbox[1] + (ibbox_here[1]) * dd;
-          bbox_here[4] = PP->bbox[1] + (ibbox_here[4] + 1) * dd;
-
-          dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2];
-          bbox_here[2] = PP->bbox[2] + (ibbox_here[2]) * dd;
-          bbox_here[5] = PP->bbox[2] + (ibbox_here[5] + 1) * dd;
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-
-#ifdef USE_GPU_DIVIDE
-          {
-            const int pices = 2;
-            double picef[pices];
-            picef[0] = cpu_part;
-            picef[1] = gpu_part;
-            int shape_res[dim * pices];
-            double bbox_res[2 * dim * pices];
-            misc::dividBlock(dim, shape_here, bbox_here, pices, picef, shape_res, bbox_res, min_width);
-            ng = ng0 = new Block(dim, shape_res, bbox_res, n_rank++, ingfsi, fngfsi, PP->lev, 0); // delete through KillBlocks
-
-            //	       if(n_rank==cpusize) {n_rank=0; cerr<<"place one!!"<<endl;}
-
-            //	       ng->checkBlock();
-            if (BlL)
-              BlL->insert(ng);
-            else
-              BlL = new MyList<Block>(ng); // delete through KillBlocks
-
-            for (int i = 1; i < pices; i++)
-            {
-              ng = new Block(dim, shape_res + i * dim, bbox_res + i * 2 * dim, n_rank++, ingfsi, fngfsi, PP->lev, i); // delete through KillBlocks
-              //	        if(n_rank==cpusize) {n_rank=0; cerr<<"place two!! "<<i<<endl;}
-              //	        ng->checkBlock();
-              BlL->insert(ng);
-            }
-          }
-#else
-          ng = ng0 = new Block(dim, shape_here, bbox_here, n_rank++, ingfsi, fngfsi, PP->lev); // delete through KillBlocks
-          //	    ng->checkBlock();
-          if (BlL)
-            BlL->insert(ng);
-          else
-            BlL = new MyList<Block>(ng); // delete through KillBlocks
-#endif
-          if (n_rank == cpusize)
-            n_rank = 0;
-
-          // set PP->blb
-          if (i == 0 && j == 0 && k == 0)
-          {
-            MyList<Block> *Bp = BlL;
-            while (Bp->data != ng0)
-              Bp = Bp->next; // ng0 is the first of the pices list
-            PP->blb = Bp;
-          }
-        }
-    // set PP->ble
-    {
-      MyList<Block> *Bp = BlL;
-      while (Bp->data != ng)
-        Bp = Bp->next; // ng is the last of the pices list
-      PP->ble = Bp;
-    }
-    PLi = PLi->next;
-  }
-  if (reacpu < nodes * 2 / 3)
-  {
-    int myrank;
-    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-    if (myrank == 0)
-      cout << "Parallel::distribute CAUSTION: level#" << lev << " uses essencially " << reacpu << " processors vs " << nodes << " nodes run, your scientific computation scale is not as large as you estimate." << endl;
-  }
-
-  return BlL;
-}
-#elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
-MyList<Block> *Parallel::distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
-                                    bool periodic, int start_rank, int end_rank, int nodes)
-{
-#ifdef USE_GPU_DIVIDE
-  double cpu_part, gpu_part;
-  map<string, double>::iterator iter;
-  iter = parameters::dou_par.find("cpu part");
-  if (iter != parameters::dou_par.end())
-  {
-    cpu_part = iter->second;
-  }
-  else
-  {
-    int myrank;
-    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-    // read parameter from file
-    const int LEN = 256;
-    char pline[LEN];
-    string str, sgrp, skey, sval;
-    int sind;
-    char pname[50];
-    {
-      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
-      if (iter != parameters::str_par.end())
-      {
-        strcpy(pname, (iter->second).c_str());
-      }
-      else
-      {
-        cout << "Error inputpar" << endl;
-        exit(0);
-      }
-    }
-    ifstream inf(pname, ifstream::in);
-    if (!inf.good() && myrank == 0)
-    {
-      cout << "Can not open parameter file " << pname << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-
-    for (int i = 1; inf.good(); i++)
-    {
-      inf.getline(pline, LEN);
-      str = pline;
-
-      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
-      if (status == -1)
-      {
-        cout << "error reading parameter file " << pname << " in line " << i << endl;
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-      else if (status == 0)
-        continue;
-
-      if (sgrp == "ABE")
-      {
-        if (skey == "cpu part")
-          cpu_part = atof(sval.c_str());
-      }
-    }
-    inf.close();
-
-    parameters::dou_par.insert(map<string, double>::value_type("cpu part", cpu_part));
-  }
-  iter = parameters::dou_par.find("gpu part");
-  if (iter != parameters::dou_par.end())
-  {
-    gpu_part = iter->second;
-  }
-  else
-  {
-    int myrank;
-    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-    // read parameter from file
-    const int LEN = 256;
-    char pline[LEN];
-    string str, sgrp, skey, sval;
-    int sind;
-    char pname[50];
-    {
-      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
-      if (iter != parameters::str_par.end())
-      {
-        strcpy(pname, (iter->second).c_str());
-      }
-      else
-      {
-        cout << "Error inputpar" << endl;
-        exit(0);
-      }
-    }
-    ifstream inf(pname, ifstream::in);
-    if (!inf.good() && myrank == 0)
-    {
-      cout << "Can not open parameter file " << pname << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-
-    for (int i = 1; inf.good(); i++)
-    {
-      inf.getline(pline, LEN);
-      str = pline;
-
-      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
-      if (status == -1)
-      {
-        cout << "error reading parameter file " << pname << " in line " << i << endl;
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-      else if (status == 0)
-        continue;
-
-      if (sgrp == "ABE")
-      {
-        if (skey == "gpu part")
-          gpu_part = atof(sval.c_str());
-      }
-    }
-    inf.close();
-
-    parameters::dou_par.insert(map<string, double>::value_type("gpu part", gpu_part));
-  }
-
-  if (nodes == 0)
-    nodes = cpusize / 2;
-#else
-  if (nodes == 0)
-    nodes = cpusize;
-#endif
-
-  if (dim != 3)
-  {
-    cout << "distrivute: now we only support 3-dimension" << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-
-  MyList<Block> *BlL = 0;
-
-  int split_size, min_size, block_size = 0;
-
-  int min_width = 2 * Mymax(ghost_width, buffer_width);
-  int nxyz[dim], mmin_width[dim], min_shape[dim];
-
-  MyList<Patch> *PLi = PatchLIST;
-  for (int i = 0; i < dim; i++)
-    min_shape[i] = PLi->data->shape[i];
-  int lev = PLi->data->lev;
-  PLi = PLi->next;
-  while (PLi)
-  {
-    Patch *PP = PLi->data;
-    for (int i = 0; i < dim; i++)
-      min_shape[i] = Mymin(min_shape[i], PP->shape[i]);
-    if (lev != PLi->data->lev)
-      cout << "Parallel::distribute CAUSTION: meet Patches for different level: " << lev << " and " << PLi->data->lev << endl;
-    PLi = PLi->next;
-  }
-
-  for (int i = 0; i < dim; i++)
-    mmin_width[i] = Mymin(min_width, min_shape[i]);
-
-  min_size = mmin_width[0];
-  for (int i = 1; i < dim; i++)
-    min_size = min_size * mmin_width[i];
-
-  PLi = PatchLIST;
-  while (PLi)
-  {
-    Patch *PP = PLi->data;
-    //    PP->checkPatch(true);
-    int bs = PP->shape[0];
-    for (int i = 1; i < dim; i++)
-      bs = bs * PP->shape[i];
-    block_size = block_size + bs;
-    PLi = PLi->next;
-  }
-  split_size = Mymax(min_size, block_size / cpusize);
-  split_size = Mymax(1, split_size);
-
-  int n_rank = start_rank;
-  PLi = PatchLIST;
-  int reacpu = 0;
-  while (PLi)
-  {
-    Patch *PP = PLi->data;
-
-    reacpu += partition3(nxyz, split_size, mmin_width, cpusize, PP->shape);
-
-    Block *ng, *ng0;
-    int shape_here[dim], ibbox_here[2 * dim];
-    double bbox_here[2 * dim], dd;
-
-    // ibbox : 0,...N-1
-    for (int i = 0; i < nxyz[0]; i++)
-      for (int j = 0; j < nxyz[1]; j++)
-        for (int k = 0; k < nxyz[2]; k++)
-        {
-          ibbox_here[0] = (PP->shape[0] * i) / nxyz[0];
-          ibbox_here[3] = (PP->shape[0] * (i + 1)) / nxyz[0] - 1;
-          ibbox_here[1] = (PP->shape[1] * j) / nxyz[1];
-          ibbox_here[4] = (PP->shape[1] * (j + 1)) / nxyz[1] - 1;
-          ibbox_here[2] = (PP->shape[2] * k) / nxyz[2];
-          ibbox_here[5] = (PP->shape[2] * (k + 1)) / nxyz[2] - 1;
-
-          if (periodic)
-          {
-            ibbox_here[0] = ibbox_here[0] - ghost_width;
-            ibbox_here[3] = ibbox_here[3] + ghost_width;
-            ibbox_here[1] = ibbox_here[1] - ghost_width;
-            ibbox_here[4] = ibbox_here[4] + ghost_width;
-            ibbox_here[2] = ibbox_here[2] - ghost_width;
-            ibbox_here[5] = ibbox_here[5] + ghost_width;
-          }
-          else
-          {
-            ibbox_here[0] = Mymax(0, ibbox_here[0] - ghost_width);
-            ibbox_here[3] = Mymin(PP->shape[0] - 1, ibbox_here[3] + ghost_width);
-            ibbox_here[1] = Mymax(0, ibbox_here[1] - ghost_width);
-            ibbox_here[4] = Mymin(PP->shape[1] - 1, ibbox_here[4] + ghost_width);
-            ibbox_here[2] = Mymax(0, ibbox_here[2] - ghost_width);
-            ibbox_here[5] = Mymin(PP->shape[2] - 1, ibbox_here[5] + ghost_width);
-          }
-
-          shape_here[0] = ibbox_here[3] - ibbox_here[0] + 1;
-          shape_here[1] = ibbox_here[4] - ibbox_here[1] + 1;
-          shape_here[2] = ibbox_here[5] - ibbox_here[2] + 1;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-          // 0--4, 5--10
-          dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1);
-          bbox_here[0] = PP->bbox[0] + ibbox_here[0] * dd;
-          bbox_here[3] = PP->bbox[0] + ibbox_here[3] * dd;
-
-          dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1);
-          bbox_here[1] = PP->bbox[1] + ibbox_here[1] * dd;
-          bbox_here[4] = PP->bbox[1] + ibbox_here[4] * dd;
-
-          dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1);
-          bbox_here[2] = PP->bbox[2] + ibbox_here[2] * dd;
-          bbox_here[5] = PP->bbox[2] + ibbox_here[5] * dd;
-#else
-#ifdef Cell
-          // 0--5, 5--10
-          dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0];
-          bbox_here[0] = PP->bbox[0] + (ibbox_here[0]) * dd;
-          bbox_here[3] = PP->bbox[0] + (ibbox_here[3] + 1) * dd;
-
-          dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1];
-          bbox_here[1] = PP->bbox[1] + (ibbox_here[1]) * dd;
-          bbox_here[4] = PP->bbox[1] + (ibbox_here[4] + 1) * dd;
-
-          dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2];
-          bbox_here[2] = PP->bbox[2] + (ibbox_here[2]) * dd;
-          bbox_here[5] = PP->bbox[2] + (ibbox_here[5] + 1) * dd;
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-
-#ifdef USE_GPU_DIVIDE
-          {
-            const int pices = 2;
-            double picef[pices];
-            picef[0] = cpu_part;
-            picef[1] = gpu_part;
-            int shape_res[dim * pices];
-            double bbox_res[2 * dim * pices];
-            misc::dividBlock(dim, shape_here, bbox_here, pices, picef, shape_res, bbox_res, min_width);
-            ng = ng0 = new Block(dim, shape_res, bbox_res, n_rank++, ingfsi, fngfsi, PP->lev, 0); // delete through KillBlocks
-            //	       ng->checkBlock();
-            if (BlL)
-              BlL->insert(ng);
-            else
-              BlL = new MyList<Block>(ng); // delete through KillBlocks
-
-            for (int i = 1; i < pices; i++)
-            {
-              ng = new Block(dim, shape_res + i * dim, bbox_res + i * 2 * dim, n_rank++, ingfsi, fngfsi, PP->lev, i); // delete through KillBlocks
-              //	        ng->checkBlock();
-              BlL->insert(ng);
-            }
-          }
-#else
-          ng = ng0 = new Block(dim, shape_here, bbox_here, n_rank++, ingfsi, fngfsi, PP->lev); // delete through KillBlocks
-          //	    ng->checkBlock();
-          if (BlL)
-            BlL->insert(ng);
-          else
-            BlL = new MyList<Block>(ng); // delete through KillBlocks
-#endif
-
-          if (n_rank == end_rank + 1)
-            n_rank = start_rank;
-
-          // set PP->blb
-          if (i == 0 && j == 0 && k == 0)
-          {
-            MyList<Block> *Bp = BlL;
-            while (Bp->data != ng0)
-              Bp = Bp->next; // ng0 is the first of the pices list
-            PP->blb = Bp;
-          }
-        }
-    // set PP->ble
-    {
-      MyList<Block> *Bp = BlL;
-      while (Bp->data != ng)
-        Bp = Bp->next; // ng is the last of the pices list
-      PP->ble = Bp;
-    }
-    PLi = PLi->next;
-  }
-  if (reacpu < nodes * 2 / 3)
-  {
-    int myrank;
-    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-    if (myrank == start_rank)
-      cout << "Parallel::distribute CAUSTION: level#" << lev << " uses essencially " << reacpu << " processors vs " << nodes << " nodes run, your scientific computation scale is not as large as you estimate." << endl;
-  }
-
-  return BlL;
-}
-#endif
-void Parallel::setfunction(MyList<Block> *BlL, var *vn, double func(double x, double y, double z))
-{
-  while (BlL)
-  {
-    if (BlL->data->X[0])
-    {
-      int nn = BlL->data->shape[0] * BlL->data->shape[1] * BlL->data->shape[2];
-      double *p = BlL->data->fgfs[vn->sgfn];
-      for (int i = 0; i < nn; i++)
-      {
-        int ind[3];
-        getarrayindex(3, BlL->data->shape, ind, i);
-        p[i] = func(BlL->data->X[0][ind[0]], BlL->data->X[1][ind[1]], BlL->data->X[2][ind[2]]);
-      }
-    }
-    BlL = BlL->next;
-  }
-}
-// set function only for cpu rank
-void Parallel::setfunction(int rank, MyList<Block> *BlL, var *vn, double func(double x, double y, double z))
-{
-  while (BlL)
-  {
-    if (BlL->data->X[0] && BlL->data->rank == rank)
-    {
-      int nn = BlL->data->shape[0] * BlL->data->shape[1] * BlL->data->shape[2];
-      double *p = BlL->data->fgfs[vn->sgfn];
-      for (int i = 0; i < nn; i++)
-      {
-        int ind[3];
-        getarrayindex(3, BlL->data->shape, ind, i);
-        p[i] = func(BlL->data->X[0][ind[0]], BlL->data->X[1][ind[1]], BlL->data->X[2][ind[2]]);
-      }
-    }
-    BlL = BlL->next;
-  }
-}
-void Parallel::getarrayindex(int DIM, int *shape, int *index, int n)
-{
-  // we assume index has already memory space
-  int *mu;
-  mu = new int[DIM];
-  mu[0] = 1;
-  for (int i = 1; i < DIM; i++)
-    mu[i] = mu[i - 1] * shape[i - 1];
-  for (int i = DIM - 1; i >= 0; i--)
-  {
-    index[i] = n / mu[i];
-    n = n - index[i] * mu[i];
-  }
-
-  delete[] mu;
-}
-int Parallel::getarraylocation(int DIM, int *shape, int *index)
-{
-  int n, mu;
-  mu = shape[0];
-  n = index[0];
-  for (int i = 1; i < DIM; i++)
-  {
-    n = n + index[i] * mu;
-    mu = mu * shape[i];
-  }
-
-  return n;
-}
-void Parallel::copy(int DIM, double *llbout, double *uubout, int *Dshape, double *DD, double *llbin, double *uubin,
-                    int *shape, double *datain, double *llb, double *uub)
-{
-  // for 3 dimensional case, based on simple test, I found this is half slower than f90 code
-  int *illi, *iuui;
-  int *illo, *iuuo;
-  int *indi, *indo;
-  illi = new int[DIM];
-  iuui = new int[DIM];
-  illo = new int[DIM];
-  iuuo = new int[DIM];
-  indi = new int[DIM];
-  indo = new int[DIM];
-
-  int ial = 1;
-  for (int i = 0; i < DIM; i++)
-  {
-    double ho, hi;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-    ho = (uubout[i] - llbout[i]) / (Dshape[i] - 1);
-    hi = (uubin[i] - llbin[i]) / (shape[i] - 1);
-#else
-#ifdef Cell
-    ho = (uubout[i] - llbout[i]) / Dshape[i];
-    hi = (uubin[i] - llbin[i]) / shape[i];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-    illo[i] = int((llb[i] - llbout[i]) / ho);
-    iuuo[i] = Dshape[i] - 1 - int((uubout[i] - uub[i]) / ho);
-    illi[i] = int((llb[i] - llbin[i]) / hi);
-    iuui[i] = shape[i] - 1 - int((uubin[i] - uub[i]) / hi);
-
-    if (illo[i] > iuuo[i] || illi[i] > iuui[i] || illo[i] < 0 || illi[i] < 0 ||
-        iuui[i] >= shape[i] || iuuo[i] >= Dshape[i])
-    {
-      cout << "Parallel copy: in direction " << i << ":" << endl;
-      cout << "llb = " << llb[i] << ", uub = " << uub[i] << endl;
-      cout << " in data : il = " << illi[i] << ", iu = " << iuui[i] << endl;
-      cout << "bbox = (" << llbin[i] << "," << uubin[i] << ")" << endl;
-      cout << "shape = " << shape[i] << endl;
-      cout << "out data : il = " << illo[i] << ", iu = " << iuuo[i] << endl;
-      cout << "bbox = (" << llbout[i] << "," << uubout[i] << ")" << endl;
-      cout << "shape = " << Dshape[i] << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-    int ihi = iuui[i] - illi[i] + 1, iho = iuuo[i] - illo[i] + 1;
-    if (!(feq(ho, hi, ho / 2)) || ihi != iho)
-    {
-      cout << "Parallel copy: in direction " << i << ":" << endl;
-      cout << "Parallel copy: not the same grid structure." << endl;
-      cout << "hi = " << hi << ", bbox = (" << llbin[i] << "," << uubin[i] << "), shape = " << shape[i] << endl;
-      cout << "ho = " << ho << ", bbox = (" << llbout[i] << "," << uubout[i] << "), shape = " << Dshape[i] << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-    ial = ial * ihi;
-  }
-
-  for (int i = 0; i < DIM; i++)
-  {
-    indi[i] = illi[i];
-    indo[i] = illo[i];
-  }
-  /*
-  //check start index
-     for(int i=0;i<DIM;i++)
-     {
-       cout << "Parallel copy: in direction " <<i<<":"<< endl;
-       cout<<"start : indi = "<<indi[i]<<", indo = "<<indo[i]<<endl;
-     }
-  */
-  int NNi = 1, NNo = 1;
-  for (int i = 0; i < DIM; i++)
-  {
-    NNi = NNi * shape[i];
-    NNo = NNo * Dshape[i];
-  }
-  for (int i = 0; i < ial; i++)
-  {
-    int ni, no;
-    ni = getarraylocation(DIM, shape, indi);
-    no = getarraylocation(DIM, Dshape, indo);
-    if (no < 0 || no > NNo)
-    {
-      cout << "Parallel copy: no = " << no << " is out of array range (0," << NNo << ")." << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-    if (ni < 0 || ni > NNi)
-    {
-      cout << "Parallel copy: ni = " << ni << " is out of array range (0," << NNi << ")." << endl;
-      cout << "shape = (";
-      for (int j = 0; j < DIM; j++)
-      {
-        cout << shape[j];
-        if (j < DIM - 1)
-          cout << ",";
-        else
-          cout << ")" << endl;
-      }
-      cout << "ind = (";
-      for (int j = 0; j < DIM; j++)
-      {
-        cout << indi[j];
-        if (j < DIM - 1)
-          cout << ",";
-        else
-          cout << ")" << endl;
-      }
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-    DD[no] = datain[ni];
-
-    indi[0]++;
-    for (int j = 1; j < DIM; j++)
-    {
-      if (indi[j - 1] == iuui[j - 1] + 1)
-      {
-        indi[j - 1] = illi[j - 1];
-        indi[j]++;
-      } // carry 1 to next digital
-      else
-        break;
-    }
-    indo[0]++;
-    for (int j = 1; j < DIM; j++)
-    {
-      if (indo[j - 1] == iuuo[j - 1] + 1)
-      {
-        indo[j - 1] = illo[j - 1];
-        indo[j]++;
-      }
-      else
-        break;
-    }
-  }
-  /*
-  //check final index
-     for(int i=0;i<DIM;i++)
-     {
-       cout << "Parallel copy: in direction " <<i<<":"<< endl;
-       cout<<"final : indi = "<<indi[i]<<", indo = "<<indo[i]<<endl;
-     }
-  */
-  delete[] illi;
-  delete[] iuui;
-  delete[] illo;
-  delete[] iuuo;
-  delete[] indi;
-  delete[] indo;
-}
-void Parallel::writefile(double time, int nx, int ny, int nz, double xmin, double xmax, double ymin, double ymax,
-                         double zmin, double zmax, char *filename, double *data_out)
-{
-  ofstream outfile;
-  outfile.open(filename, ios::out | ios::trunc);
-  if (!outfile)
-  {
-    cout << "Can't open " << filename << " for output." << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-  outfile.write((char *)&time, sizeof(double));
-  outfile.write((char *)&nx, sizeof(int));
-  outfile.write((char *)&ny, sizeof(int));
-  outfile.write((char *)&nz, sizeof(int));
-  outfile.write((char *)&xmin, sizeof(double));
-  outfile.write((char *)&xmax, sizeof(double));
-  outfile.write((char *)&ymin, sizeof(double));
-  outfile.write((char *)&ymax, sizeof(double));
-  outfile.write((char *)&zmin, sizeof(double));
-  outfile.write((char *)&zmax, sizeof(double));
-  outfile.write((char *)data_out, nx * ny * nz * sizeof(double));
-  outfile.close();
-}
-void Parallel::writefile(double time, int nx, int ny, double xmin, double xmax, double ymin, double ymax,
-                         char *filename, double *datain)
-{
-  int i, j;
-  double *X, *Y;
-  X = new double[nx];
-  Y = new double[ny];
-  double dd;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-  dd = (xmax - xmin) / (nx - 1);
-  for (i = 0; i < nx; i++)
-    X[i] = xmin + i * dd;
-  dd = (ymax - ymin) / (ny - 1);
-  for (j = 0; j < ny; j++)
-    Y[j] = ymin + j * dd;
-#else
-#ifdef Cell
-  dd = (xmax - xmin) / nx;
-  for (i = 0; i < nx; i++)
-    X[i] = xmin + (i + 0.5) * dd;
-  dd = (ymax - ymin) / ny;
-  for (j = 0; j < ny; j++)
-    Y[j] = ymin + (j + 0.5) * dd;
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-  ofstream outfile;
-  outfile.open(filename, ios::out | ios::trunc);
-  if (!outfile)
-  {
-    cout << "Can't open " << filename << " for output." << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-  outfile << "# t = " << time << endl;
-  for (j = 0; j < ny; j++)
-  {
-    for (i = 0; i < nx; i++)
-    {
-      int ind1 = i + j * nx;
-      outfile << setw(10) << setprecision(10) << X[i] << " "
-              << setw(10) << setprecision(10) << Y[j] << " "
-              << setw(16) << setprecision(15) << datain[ind1]
-              << endl;
-    }
-    outfile << "\n"; /* blanck line for gnuplot */
-  }
-  outfile.close();
-
-  delete[] X;
-  delete[] Y;
-}
-void Parallel::Dump_CPU_Data(MyList<Block> *BlL, MyList<var> *DumpList, char *tag, double time, double dT)
-{
-  int myrank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-  // round at 4 and 5
-  int ncount = int(time / dT + 0.5);
-
-  MyList<Block> *Bp;
-  while (DumpList)
-  {
-    Bp = BlL;
-    int Bi = 0;
-    while (Bp)
-    {
-      Block *BP = Bp->data;
-      var *VP = DumpList->data;
-      if (BP->rank == myrank)
-      {
-
-        string out_dir;
-        map<string, string>::iterator iter;
-        iter = parameters::str_par.find("output dir");
-        if (iter != parameters::str_par.end())
-        {
-          out_dir = iter->second;
-        }
-        else
-        {
-          // read parameter from file
-          const int LEN = 256;
-          char pline[LEN];
-          string str, sgrp, skey, sval;
-          int sind;
-          char pname[50];
-          {
-            map<string, string>::iterator iter = parameters::str_par.find("inputpar");
-            if (iter != parameters::str_par.end())
-            {
-              strcpy(pname, (iter->second).c_str());
-            }
-            else
-            {
-              cout << "Error inputpar" << endl;
-              exit(0);
-            }
-          }
-          ifstream inf(pname, ifstream::in);
-          if (!inf.good())
-          {
-            cout << "Can not open parameter file " << pname << endl;
-            MPI_Abort(MPI_COMM_WORLD, 1);
-          }
-
-          for (int i = 1; inf.good(); i++)
-          {
-            inf.getline(pline, LEN);
-            str = pline;
-
-            int status = misc::parse_parts(str, sgrp, skey, sval, sind);
-            if (status == -1)
-            {
-              cout << "error reading parameter file " << pname << " in line " << i << endl;
-              MPI_Abort(MPI_COMM_WORLD, 1);
-            }
-            else if (status == 0)
-              continue;
-
-            if (sgrp == "ABE")
-            {
-              if (skey == "output dir")
-                out_dir = sval;
-            }
-          }
-          inf.close();
-
-          parameters::str_par.insert(map<string, string>::value_type("output dir", out_dir));
-        }
-
-        char filename[100];
-        if (tag)
-          sprintf(filename, "%s/%s_Lev%02d-%02d_%02d_%s_%05d.bin", out_dir.c_str(), tag, BP->lev, Bi, myrank, VP->name, ncount);
-        else
-          sprintf(filename, "%s/Lev%02d-%02d_%02d_%s_%05d.bin", out_dir.c_str(), BP->lev, Bi, myrank, VP->name, ncount);
-        writefile(time, BP->shape[0], BP->shape[1], BP->shape[2], BP->bbox[0], BP->bbox[3], BP->bbox[1], BP->bbox[4],
-                  BP->bbox[2], BP->bbox[5], filename, BP->fgfs[VP->sgfn]);
-        cout << "end of dump " << VP->name << " at time " << time << ", on node " << myrank << endl;
-      }
-      Bp = Bp->next;
-      Bi++;
-    }
-    DumpList = DumpList->next;
-  }
-}
-// Now we dump the data including buffer points
-void Parallel::Dump_Data(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT, int grd)
-{
-  int myrank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-  //   round at 4 and 5
-  int ncount = int(time / dT + 0.5);
-
-  MPI_Status sta;
-  int DIM = 3;
-  double llb[3], uub[3];
-  double DX, DY, DZ;
-
-  double *databuffer = 0;
-  if (myrank == 0)
-  {
-    databuffer = (double *)malloc(sizeof(double) * PP->shape[0] * PP->shape[1] * PP->shape[2]);
-    if (!databuffer)
-    {
-      cout << "Parallel::Dump_Data: out of memory when dumping data." << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-  }
-
-  while (DumpList)
-  {
-    var *VP = DumpList->data;
-
-    MyList<Block> *Bp = PP->blb;
-    while (Bp)
-    {
-      Block *BP = Bp->data;
-      if (BP->rank == 0 && myrank == 0)
-      {
-        DX = BP->getdX(0);
-        DY = BP->getdX(1);
-        DZ = BP->getdX(2);
-        llb[0] = (feq(BP->bbox[0], PP->bbox[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX;
-        llb[1] = (feq(BP->bbox[1], PP->bbox[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY;
-        llb[2] = (feq(BP->bbox[2], PP->bbox[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ;
-        uub[0] = (feq(BP->bbox[3], PP->bbox[3], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX;
-        uub[1] = (feq(BP->bbox[4], PP->bbox[4], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY;
-        uub[2] = (feq(BP->bbox[5], PP->bbox[5], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ;
-        f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, BP->fgfs[VP->sgfn], llb, uub);
-      }
-      else
-      {
-        int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]);
-        if (myrank == 0)
-        {
-          double *bufferhere = (double *)malloc(sizeof(double) * nnn);
-          if (!bufferhere)
-          {
-            cout << "on node#" << myrank << ", out of memory when dumping data." << endl;
-            MPI_Abort(MPI_COMM_WORLD, 1);
-          }
-          MPI_Recv(bufferhere, nnn, MPI_DOUBLE, BP->rank, 0, MPI_COMM_WORLD, &sta);
-          DX = BP->getdX(0);
-          DY = BP->getdX(1);
-          DZ = BP->getdX(2);
-          llb[0] = (feq(BP->bbox[0], PP->bbox[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX;
-          llb[1] = (feq(BP->bbox[1], PP->bbox[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY;
-          llb[2] = (feq(BP->bbox[2], PP->bbox[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ;
-          uub[0] = (feq(BP->bbox[3], PP->bbox[3], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX;
-          uub[1] = (feq(BP->bbox[4], PP->bbox[4], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY;
-          uub[2] = (feq(BP->bbox[5], PP->bbox[5], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ;
-          f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, bufferhere, llb, uub);
-          free(bufferhere);
-        }
-        else if (myrank == BP->rank)
-        {
-          MPI_Send(BP->fgfs[VP->sgfn], nnn, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
-        }
-      }
-      if (Bp == PP->ble)
-        break;
-      Bp = Bp->next;
-    }
-    if (myrank == 0)
-    {
-
-      string out_dir;
-      map<string, string>::iterator iter;
-      iter = parameters::str_par.find("output dir");
-      if (iter != parameters::str_par.end())
-      {
-        out_dir = iter->second;
-      }
-      else
-      {
-        // read parameter from file
-        const int LEN = 256;
-        char pline[LEN];
-        string str, sgrp, skey, sval;
-        int sind;
-        char pname[50];
-        {
-          map<string, string>::iterator iter = parameters::str_par.find("inputpar");
-          if (iter != parameters::str_par.end())
-          {
-            strcpy(pname, (iter->second).c_str());
-          }
-          else
-          {
-            cout << "Error inputpar" << endl;
-            exit(0);
-          }
-        }
-        ifstream inf(pname, ifstream::in);
-        if (!inf.good())
-        {
-          cout << "Can not open parameter file " << pname << endl;
-          MPI_Abort(MPI_COMM_WORLD, 1);
-        }
-
-        for (int i = 1; inf.good(); i++)
-        {
-          inf.getline(pline, LEN);
-          str = pline;
-
-          int status = misc::parse_parts(str, sgrp, skey, sval, sind);
-          if (status == -1)
-          {
-            cout << "error reading parameter file " << pname << " in line " << i << endl;
-            MPI_Abort(MPI_COMM_WORLD, 1);
-          }
-          else if (status == 0)
-            continue;
-
-          if (sgrp == "ABE")
-          {
-            if (skey == "output dir")
-              out_dir = sval;
-          }
-        }
-        inf.close();
-
-        parameters::str_par.insert(map<string, string>::value_type("output dir", out_dir));
-      }
-
-      char filename[100];
-      if (tag)
-        sprintf(filename, "%s/%s_Lev%02d-%02d_%s_%05d.bin", out_dir.c_str(), tag, PP->lev, grd, VP->name, ncount);
-      else
-        sprintf(filename, "%s/Lev%02d-%02d_%s_%05d.bin", out_dir.c_str(), PP->lev, grd, VP->name, ncount);
-
-      writefile(time, PP->shape[0], PP->shape[1], PP->shape[2], PP->bbox[0], PP->bbox[3], PP->bbox[1], PP->bbox[4],
-                PP->bbox[2], PP->bbox[5], filename, databuffer);
-    }
-    DumpList = DumpList->next;
-  }
-
-  if (myrank == 0)
-    free(databuffer);
-}
-void Parallel::Dump_Data(MyList<Patch> *PL, MyList<var> *DumpList, char *tag, double time, double dT)
-{
-  MyList<Patch> *Pp;
-  Pp = PL;
-  int grd = 0;
-  while (Pp)
-  {
-    Patch *PP = Pp->data;
-    Dump_Data(PP, DumpList, tag, time, dT, grd);
-    grd++;
-    Pp = Pp->next;
-  }
-}
-// collect the data including buffer points
-double *Parallel::Collect_Data(Patch *PP, var *VP)
-{
-  int myrank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
-  MPI_Status sta;
-  int DIM = 3;
-  double llb[3], uub[3];
-  double DX, DY, DZ;
-
-  double *databuffer = 0;
-  if (myrank == 0)
-  {
-    databuffer = (double *)malloc(sizeof(double) * PP->shape[0] * PP->shape[1] * PP->shape[2]);
-    if (!databuffer)
-    {
-      cout << "Parallel::Collect_Data: out of memory when dumping data." << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-  }
-
-  MyList<Block> *Bp = PP->blb;
-  while (Bp)
-  {
-    Block *BP = Bp->data;
-    if (BP->rank == 0 && myrank == 0)
-    {
-      DX = BP->getdX(0);
-      DY = BP->getdX(1);
-      DZ = BP->getdX(2);
-      llb[0] = (feq(BP->bbox[0], PP->bbox[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX;
-      llb[1] = (feq(BP->bbox[1], PP->bbox[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY;
-      llb[2] = (feq(BP->bbox[2], PP->bbox[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ;
-      uub[0] = (feq(BP->bbox[3], PP->bbox[3], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX;
-      uub[1] = (feq(BP->bbox[4], PP->bbox[4], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY;
-      uub[2] = (feq(BP->bbox[5], PP->bbox[5], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ;
-      f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, BP->fgfs[VP->sgfn], llb, uub);
-    }
-    else
-    {
-      int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]);
-      if (myrank == 0)
-      {
-        double *bufferhere = (double *)malloc(sizeof(double) * nnn);
-        if (!bufferhere)
-        {
-          cout << "on node#" << myrank << ", out of memory when dumping data." << endl;
-          MPI_Abort(MPI_COMM_WORLD, 1);
-        }
-        MPI_Recv(bufferhere, nnn, MPI_DOUBLE, BP->rank, 0, MPI_COMM_WORLD, &sta);
-        DX = BP->getdX(0);
-        DY = BP->getdX(1);
-        DZ = BP->getdX(2);
-        llb[0] = (feq(BP->bbox[0], PP->bbox[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX;
-        llb[1] = (feq(BP->bbox[1], PP->bbox[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY;
-        llb[2] = (feq(BP->bbox[2], PP->bbox[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ;
-        uub[0] = (feq(BP->bbox[3], PP->bbox[3], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX;
-        uub[1] = (feq(BP->bbox[4], PP->bbox[4], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY;
-        uub[2] = (feq(BP->bbox[5], PP->bbox[5], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ;
-        f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, bufferhere, llb, uub);
-        free(bufferhere);
-      }
-      else if (myrank == BP->rank)
-      {
-        MPI_Send(BP->fgfs[VP->sgfn], nnn, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
-      }
-    }
-    if (Bp == PP->ble)
-      break;
-    Bp = Bp->next;
-  }
-
-  return databuffer;
-}
-// Now we dump the data including buffer points
-// dump z = 0 plane
-void Parallel::d2Dump_Data(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT, int grd)
-{
-  int myrank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-  //   round at 4 and 5
-  int ncount = int(time / dT + 0.5);
-
-  MPI_Status sta;
-  int DIM = 3;
-  double llb[3], uub[3];
-  double DX, DY, DZ;
-
-  double *databuffer = 0, *databuffer2 = 0;
-  if (myrank == 0)
-  {
-    databuffer = (double *)malloc(sizeof(double) * PP->shape[0] * PP->shape[1] * PP->shape[2]);
-    databuffer2 = (double *)malloc(sizeof(double) * PP->shape[0] * PP->shape[1]);
-    if (!databuffer || !databuffer2)
-    {
-      cout << "Parallel::d2Dump_Data: out of memory when dumping data." << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-  }
-
-  while (DumpList)
-  {
-    var *VP = DumpList->data;
-
-    MyList<Block> *Bp = PP->blb;
-    while (Bp)
-    {
-      Block *BP = Bp->data;
-      if (BP->rank == 0 && myrank == 0)
-      {
-        DX = BP->getdX(0);
-        DY = BP->getdX(1);
-        DZ = BP->getdX(2);
-        llb[0] = (feq(BP->bbox[0], PP->bbox[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX;
-        llb[1] = (feq(BP->bbox[1], PP->bbox[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY;
-        llb[2] = (feq(BP->bbox[2], PP->bbox[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ;
-        uub[0] = (feq(BP->bbox[3], PP->bbox[3], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX;
-        uub[1] = (feq(BP->bbox[4], PP->bbox[4], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY;
-        uub[2] = (feq(BP->bbox[5], PP->bbox[5], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ;
-        f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, BP->fgfs[VP->sgfn], llb, uub);
-      }
-      else
-      {
-        int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]);
-        if (myrank == 0)
-        {
-          double *bufferhere = (double *)malloc(sizeof(double) * nnn);
-          if (!bufferhere)
-          {
-            cout << "on node#" << myrank << ", out of memory when dumping data." << endl;
-            MPI_Abort(MPI_COMM_WORLD, 1);
-          }
-          MPI_Recv(bufferhere, nnn, MPI_DOUBLE, BP->rank, 0, MPI_COMM_WORLD, &sta);
-          DX = BP->getdX(0);
-          DY = BP->getdX(1);
-          DZ = BP->getdX(2);
-          llb[0] = (feq(BP->bbox[0], PP->bbox[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX;
-          llb[1] = (feq(BP->bbox[1], PP->bbox[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY;
-          llb[2] = (feq(BP->bbox[2], PP->bbox[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ;
-          uub[0] = (feq(BP->bbox[3], PP->bbox[3], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX;
-          uub[1] = (feq(BP->bbox[4], PP->bbox[4], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY;
-          uub[2] = (feq(BP->bbox[5], PP->bbox[5], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ;
-          f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, bufferhere, llb, uub);
-          free(bufferhere);
-        }
-        else if (myrank == BP->rank)
-        {
-          MPI_Send(BP->fgfs[VP->sgfn], nnn, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
-        }
-      }
-      if (Bp == PP->ble)
-        break;
-      Bp = Bp->next;
-    }
-    if (myrank == 0)
-    {
-
-      string out_dir;
-      map<string, string>::iterator iter;
-      iter = parameters::str_par.find("output dir");
-      if (iter != parameters::str_par.end())
-      {
-        out_dir = iter->second;
-      }
-      else
-      {
-        // read parameter from file
-        const int LEN = 256;
-        char pline[LEN];
-        string str, sgrp, skey, sval;
-        int sind;
-        char pname[50];
-        {
-          map<string, string>::iterator iter = parameters::str_par.find("inputpar");
-          if (iter != parameters::str_par.end())
-          {
-            strcpy(pname, (iter->second).c_str());
-          }
-          else
-          {
-            cout << "Error inputpar" << endl;
-            exit(0);
-          }
-        }
-        ifstream inf(pname, ifstream::in);
-        if (!inf.good())
-        {
-          cout << "Can not open parameter file " << pname << endl;
-          MPI_Abort(MPI_COMM_WORLD, 1);
-        }
-
-        for (int i = 1; inf.good(); i++)
-        {
-          inf.getline(pline, LEN);
-          str = pline;
-
-          int status = misc::parse_parts(str, sgrp, skey, sval, sind);
-          if (status == -1)
-          {
-            cout << "error reading parameter file " << pname << " in line " << i << endl;
-            MPI_Abort(MPI_COMM_WORLD, 1);
-          }
-          else if (status == 0)
-            continue;
-
-          if (sgrp == "ABE")
-          {
-            if (skey == "output dir")
-              out_dir = sval;
-          }
-        }
-        inf.close();
-
-        parameters::str_par.insert(map<string, string>::value_type("output dir", out_dir));
-      }
-
-      char filename[100];
-      if (tag)
-        sprintf(filename, "%s/%s_2d_Lev%02d-%02d_%s_%05d.dat", out_dir.c_str(), tag, PP->lev, grd, VP->name, ncount);
-      else
-        sprintf(filename, "%s/2d_Lev%02d-%02d_%s_%05d.dat", out_dir.c_str(), PP->lev, grd, VP->name, ncount);
-
-      int gord = ghost_width;
-      f_d2dump(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, databuffer2, gord, VP->SoA);
-      writefile(time, PP->shape[0], PP->shape[1], PP->bbox[0], PP->bbox[3], PP->bbox[1], PP->bbox[4],
-                filename, databuffer2);
-    }
-    DumpList = DumpList->next;
-  }
-
-  if (myrank == 0)
-  {
-    free(databuffer);
-    free(databuffer2);
-  }
-}
-void Parallel::d2Dump_Data(MyList<Patch> *PL, MyList<var> *DumpList, char *tag, double time, double dT)
-{
-  MyList<Patch> *Pp;
-  Pp = PL;
-  int grd = 0;
-  while (Pp)
-  {
-    Patch *PP = Pp->data;
-    d2Dump_Data(PP, DumpList, tag, time, dT, grd);
-    grd++;
-    Pp = Pp->next;
-  }
-}
-// Now we dump the data including buffer points and ghost points of the given patch
-void Parallel::Dump_Data0(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT)
-{
-  int myrank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-  //   round at 4 and 5
-  int ncount = int(time / dT + 0.5);
-
-  MPI_Status sta;
-  int DIM = 3;
-  double llb[3], uub[3], tllb[3], tuub[3];
-  int tshape[3];
-  double DX, DY, DZ;
-
-  for (int i = 0; i < 3; i++)
-  {
-    double DX = PP->blb->data->getdX(i);
-    tshape[i] = PP->shape[i] + 2 * ghost_width;
-    tllb[i] = PP->bbox[i] - ghost_width * DX;
-    tuub[i] = PP->bbox[i + dim] + ghost_width * DX;
-  }
-
-  int NN = tshape[0] * tshape[1] * tshape[2];
-  double *databuffer = 0;
-  if (myrank == 0)
-  {
-    databuffer = (double *)malloc(sizeof(double) * NN);
-    if (!databuffer)
-    {
-      cout << "on node# " << myrank << ", out of memory when dumping data." << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-  }
-
-  while (DumpList)
-  {
-    var *VP = DumpList->data;
-    MyList<Block> *Bp = PP->blb;
-    while (Bp)
-    {
-      Block *BP = Bp->data;
-      if (BP->rank == 0 && myrank == 0)
-      {
-        DX = BP->getdX(0);
-        DY = BP->getdX(1);
-        DZ = BP->getdX(2);
-        llb[0] = (feq(BP->bbox[0], tllb[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX;
-        llb[1] = (feq(BP->bbox[1], tllb[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY;
-        llb[2] = (feq(BP->bbox[2], tllb[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ;
-        uub[0] = (feq(BP->bbox[3], tuub[0], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX;
-        uub[1] = (feq(BP->bbox[4], tuub[1], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY;
-        uub[2] = (feq(BP->bbox[5], tuub[2], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ;
-        f_copy(DIM, tllb, tuub, tshape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, BP->fgfs[VP->sgfn], llb, uub);
-      }
-      else
-      {
-        if (myrank == 0)
-        {
-          int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]);
-          double *bufferhere = (double *)malloc(sizeof(double) * nnn);
-          if (!bufferhere)
-          {
-            cout << "on node#" << myrank << ", out of memory when dumping data." << endl;
-            MPI_Abort(MPI_COMM_WORLD, 1);
-          }
-          MPI_Recv(bufferhere, nnn, MPI_DOUBLE, BP->rank, 0, MPI_COMM_WORLD, &sta);
-          DX = BP->getdX(0);
-          DY = BP->getdX(1);
-          DZ = BP->getdX(2);
-          llb[0] = (feq(BP->bbox[0], tllb[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX;
-          llb[1] = (feq(BP->bbox[1], tllb[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY;
-          llb[2] = (feq(BP->bbox[2], tllb[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ;
-          uub[0] = (feq(BP->bbox[3], tuub[0], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX;
-          uub[1] = (feq(BP->bbox[4], tuub[1], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY;
-          uub[2] = (feq(BP->bbox[5], tuub[2], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ;
-          f_copy(DIM, tllb, tuub, tshape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, bufferhere, llb, uub);
-          free(bufferhere);
-        }
-        else if (myrank == BP->rank)
-        {
-          int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]);
-          MPI_Send(BP->fgfs[VP->sgfn], nnn, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
-        }
-      }
-      if (Bp == PP->ble)
-        break;
-      Bp = Bp->next;
-    }
-    if (myrank == 0)
-    {
-
-      string out_dir;
-      map<string, string>::iterator iter;
-      iter = parameters::str_par.find("output dir");
-      if (iter != parameters::str_par.end())
-      {
-        out_dir = iter->second;
-      }
-      else
-      {
-        // read parameter from file
-        const int LEN = 256;
-        char pline[LEN];
-        string str, sgrp, skey, sval;
-        int sind;
-        char pname[50];
-        {
-          map<string, string>::iterator iter = parameters::str_par.find("inputpar");
-          if (iter != parameters::str_par.end())
-          {
-            strcpy(pname, (iter->second).c_str());
-          }
-          else
-          {
-            cout << "Error inputpar" << endl;
-            exit(0);
-          }
-        }
-        ifstream inf(pname, ifstream::in);
-        if (!inf.good())
-        {
-          cout << "Can not open parameter file " << pname << endl;
-          MPI_Abort(MPI_COMM_WORLD, 1);
-        }
-
-        for (int i = 1; inf.good(); i++)
-        {
-          inf.getline(pline, LEN);
-          str = pline;
-
-          int status = misc::parse_parts(str, sgrp, skey, sval, sind);
-          if (status == -1)
-          {
-            cout << "error reading parameter file " << pname << " in line " << i << endl;
-            MPI_Abort(MPI_COMM_WORLD, 1);
-          }
-          else if (status == 0)
-            continue;
-
-          if (sgrp == "ABE")
-          {
-            if (skey == "output dir")
-              out_dir = sval;
-          }
-        }
-        inf.close();
-
-        parameters::str_par.insert(map<string, string>::value_type("output dir", out_dir));
-      }
-
-      char filename[100];
-      if (tag)
-        sprintf(filename, "%s/%s_Lev%02d_%s_%05d.bin", out_dir.c_str(), tag, PP->lev, VP->name, ncount);
-      else
-        sprintf(filename, "%s/Lev%02d_%s_%05d.bin", out_dir.c_str(), PP->lev, VP->name, ncount);
-
-      writefile(time, tshape[0], tshape[1], tshape[2], tllb[0], tuub[0], tllb[1], tuub[2],
-                tllb[2], tuub[2], filename, databuffer);
-    }
-    DumpList = DumpList->next;
-  }
-
-  if (myrank == 0)
-    free(databuffer);
-}
-// Map point is much easier than maping data itself
-// But the main problem is about the points near the boundary
-// worst case is -ghost -ghost+1 .... 0 * ......
-double Parallel::global_interp(int DIM, int *ext, double **CoX, double *datain,
-                               double *poXb, int ordn, double *SoA, int Symmetry)
-{
-  if (DIM != 3)
-  {
-    cout << "Parallel::global_interp does not suport DIM = " << DIM << " for Symmetry." << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-  double resu;
-  double poX[3];
-  double asgn = 1;
-
-  for (int i = 0; i < 3; i++)
-    poX[i] = poXb[i];
-
-  switch (Symmetry)
-  {
-  case 2:
-    for (int i = 0; i < 3; i++)
-      if (poX[i] < 0)
-      {
-        poX[i] = -poX[i];
-        asgn = asgn * SoA[i];
-      }
-    break;
-  case 1:
-    if (poX[2] < 0)
-    {
-      poX[2] = -poX[2];
-      asgn = asgn * SoA[2];
-    }
-  }
-
-  int extb[3];
-
-  for (int i = 0; i < 3; i++)
-    extb[i] = ext[i];
-
-  switch (Symmetry)
-  {
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-  case 2:
-    if (poX[0] < (ghost_width - 1) * (CoX[0][1] - CoX[0][0]))
-      extb[0] = extb[0] + ghost_width - 1;
-    if (poX[1] < (ghost_width - 1) * (CoX[1][1] - CoX[1][0]))
-      extb[1] = extb[1] + ghost_width - 1;
-  case 1:
-    if (poX[2] < (ghost_width - 1) * (CoX[2][1] - CoX[2][0]))
-      extb[2] = extb[2] + ghost_width - 1;
-#else
-#ifdef Cell
-  case 2:
-    if (poX[0] < (ghost_width - 0.5) * (CoX[0][1] - CoX[0][0]))
-      extb[0] = extb[0] + ghost_width;
-    if (poX[1] < (ghost_width - 0.5) * (CoX[1][1] - CoX[1][0]))
-      extb[1] = extb[1] + ghost_width;
-  case 1:
-    if (poX[2] < (ghost_width - 0.5) * (CoX[2][1] - CoX[2][0]))
-      extb[2] = extb[2] + ghost_width;
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-  }
-
-  if (extb[0] > ext[0] || extb[1] > ext[1] || extb[2] > ext[2])
-  {
-    double *CoXb[3];
-    int Nb = extb[0] * extb[1] * extb[2];
-    double *datab;
-    datab = new double[Nb];
-    for (int i = 0; i < 3; i++)
-    {
-      CoXb[i] = new double[extb[i]];
-      double DH = CoX[i][1] - CoX[i][0];
-      if (extb[i] > ext[i])
-      {
-        if (CoX[i][0] > DH)
-        {
-          cout << "lower boundary[" << i << "] = " << CoX[i][0] << ", but SYmmetry = " << Symmetry << endl;
-          MPI_Abort(MPI_COMM_WORLD, 1);
-        }
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        for (int j = 0; j < ghost_width - 1; j++)
-          CoXb[i][j] = -CoX[i][ghost_width - 1 - j];
-        for (int j = ghost_width - 1; j < extb[i]; j++)
-          CoXb[i][j] = CoX[i][j - ghost_width + 1];
-#else
-#ifdef Cell
-        for (int j = 0; j < ghost_width; j++)
-          CoXb[i][j] = -CoX[i][ghost_width - 1 - j];
-        for (int j = ghost_width; j < extb[i]; j++)
-          CoXb[i][j] = CoX[i][j - ghost_width];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      }
-      else
-      {
-        for (int j = 0; j < extb[i]; j++)
-          CoXb[i][j] = CoX[i][j];
-      }
-    }
-
-    for (int i = 0; i < Nb; i++)
-    {
-      int ind[3], indb[3];
-      getarrayindex(3, extb, indb, i);
-      double sgn = 1;
-      for (int j = 0; j < 3; j++)
-      {
-        if (extb[j] > ext[j])
-        {
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-          if (indb[j] < ghost_width - 1)
-          {
-            ind[j] = ghost_width - 1 - indb[j];
-            sgn = sgn * SoA[j];
-          }
-          else
-          {
-            ind[j] = 1 + indb[j] - ghost_width;
-          }
-#else
-#ifdef Cell
-          if (indb[j] < ghost_width)
-          {
-            ind[j] = ghost_width - 1 - indb[j];
-            sgn = sgn * SoA[j];
-          }
-          else
-          {
-            ind[j] = indb[j] - ghost_width;
-          }
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-        }
-        else
-          ind[j] = indb[j];
-      }
-      int lon = getarraylocation(3, ext, ind);
-      datab[i] = datain[lon] * sgn;
-    }
-
-    resu = global_interp(DIM, extb, CoXb, datab, poX, ordn);
-
-    for (int i = 0; i < 3; i++)
-      delete[] CoXb[i];
-    delete[] datab;
-  }
-  else
-  {
-    resu = global_interp(DIM, ext, CoX, datain, poX, ordn);
-  }
-
-  return resu * asgn;
-}
-double Parallel::global_interp(int DIM, int *ext, double **CoX, double *datain,
-                               double *poX, int ordn)
-{
-  if (ordn > 2 * ghost_width)
-  {
-    cout << "Parallel::global_interp can not handle ordn = " << ordn << " > 2*ghost_width = " << 2 * ghost_width << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-
-  double *bbox, *datainbbox;
-  bbox = new double[2 * DIM];
-  datainbbox = new double[2 * DIM];
-
-  int *NN, *ind, *shape;
-  NN = new int[DIM];
-  ind = new int[DIM];
-  shape = new int[DIM];
-
-  for (int i = 0; i < DIM; i++)
-  {
-    ind[i] = int((poX[i] - CoX[i][0]) / (CoX[i][1] - CoX[i][0])) - ordn / 2 + 1;
-    // poX may exactly locate on the boundary (exclude ghost)
-    if (ind[i] == -1 && feq(poX[i], CoX[i][0], (CoX[i][1] - CoX[i][0]) / 2))
-      ind[i] = 0;
-    /*
-         if(ind[i] < 0)
-         {
-           cout<<"Parallel::global_interp error ind["<<i<<"] = "<<ind[i]<<endl;
-           cout<<"pox = "<<poX[i]<<", CoX[0] = "<<CoX[i][0]<<endl;
-           MPI_Abort(MPI_COMM_WORLD,1);
-         }
-    */
-    if (ind[i] == ext[i] - ordn + 1 && feq(poX[i], CoX[i][ext[i] - ordn / 2], (CoX[i][1] - CoX[i][0]) / 2))
-      ind[i] = ext[i] - ordn - 1;
-    /*
-         if(ind[i]+ordn-1 > ext[i]-1)
-         {
-           cout<<"Parallel::global_interp error ind["<<i<<"] = "<<ind[i]<<" + ordn ("<<ordn<<") > ext = "<<ext[i]<<endl;
-           cout<<"pox = "<<poX[i]<<", CoX[ind] = "<<CoX[i][ind[i]]<<", CoX = ("<<CoX[i][0]<<","<<CoX[i][ext[i]-1]<<")"<<endl;
-           MPI_Abort(MPI_COMM_WORLD,1);
-         }
-    */
-    bbox[i] = CoX[i][ind[i]];
-    bbox[DIM + i] = CoX[i][ind[i] + ordn - 1];
-    datainbbox[i] = CoX[i][0];
-    datainbbox[DIM + i] = CoX[i][ext[i] - 1];
-    shape[i] = ordn;
-  }
-
-  NN[DIM - 1] = ordn;
-  for (int i = DIM - 2; i >= 0; i--)
-    NN[i] = NN[i + 1] * ordn;
-
-  double *xpts, *funcvals;
-  xpts = new double[ordn];
-  funcvals = new double[ordn];
-  double *DDd, *DDd1, rr;
-
-  DDd = new double[NN[0]];
-
-  copy(DIM, bbox, bbox + DIM, shape, DDd, datainbbox, datainbbox + DIM, ext, datain, bbox, bbox + DIM);
-
-  for (int i = 0; i < DIM; i++)
-  {
-    for (int j = ind[i]; j < ind[i] + ordn; j++)
-    {
-      xpts[j - ind[i]] = CoX[i][j];
-    }
-
-    if (i < DIM - 1)
-    {
-      DDd1 = new double[NN[i + 1]];
-      for (int j = 0; j < NN[i + 1]; j++)
-      {
-        for (int k = 0; k < ordn; k++)
-          funcvals[k] = DDd[k + j * ordn];
-        DDd1[j] = Lagrangian_Int(poX[i], ordn, xpts, funcvals);
-      }
-      delete[] DDd;
-      DDd = DDd1;
-    }
-    else
-    {
-      for (int j = 0; j < ordn; j++)
-        funcvals[j] = DDd[j];
-      rr = Lagrangian_Int(poX[i], ordn, xpts, funcvals);
-      delete[] DDd1; // since DDd and DDd1 now point to the same stuff, we need delete after above int
-    }
-  }
-
-  delete[] NN;
-  delete[] ind;
-  delete[] xpts;
-  delete[] funcvals;
-  delete[] bbox;
-  delete[] datainbbox;
-  delete[] shape;
-
-  return rr;
-}
-double Parallel::Lagrangian_Int(double x, int npts, double *xpts, double *funcvals)
-{
-  double sum = 0;
-  for (int i = 0; i < npts; i++)
-  {
-    sum = sum + funcvals[i] * LagrangePoly(x, i, npts, xpts);
-  }
-  return sum;
-}
-double Parallel::LagrangePoly(double x, int pt, int npts, double *xpts)
-{
-  double h = 1;
-  int i;
-
-  for (i = 0; i < pt; i++)
-    h = h * (x - xpts[i]) / (xpts[pt] - xpts[i]);
-
-  for (i = pt + 1; i < npts; i++)
-    h = h * (x - xpts[i]) / (xpts[pt] - xpts[i]);
-
-  return h;
-}
-// collect all grid segments or blocks including ghost and buffer for given patch
-MyList<Parallel::gridseg> *Parallel::build_complete_gsl(Patch *Pat)
-{
-  MyList<Parallel::gridseg> *cgsl = 0, *gs;
-  MyList<Block> *BP = Pat->blb;
-  while (BP)
-  {
-    if (!cgsl)
-    {
-      cgsl = gs = new MyList<Parallel::gridseg>; // delete through destroyList();
-      gs->data = new Parallel::gridseg;
-    }
-    else
-    {
-      gs->next = new MyList<Parallel::gridseg>;
-      gs = gs->next;
-      gs->data = new Parallel::gridseg;
-    }
-
-    for (int i = 0; i < dim; i++)
-    {
-      gs->data->llb[i] = BP->data->bbox[i];
-      gs->data->uub[i] = BP->data->bbox[dim + i];
-      gs->data->shape[i] = BP->data->shape[i];
-    }
-    gs->data->Bg = BP->data;
-    gs->next = 0;
-
-    if (BP == Pat->ble)
-      break;
-    BP = BP->next;
-  }
-
-  return cgsl;
-}
-// collect all grid segments or blocks including ghost and buffer for given patch list
-MyList<Parallel::gridseg> *Parallel::build_complete_gsl(MyList<Patch> *PatL)
-{
-  MyList<Parallel::gridseg> *cgsl = 0, *gs;
-  while (PatL)
-  {
-    if (!cgsl)
-    {
-      cgsl = build_complete_gsl(PatL->data);
-      gs = cgsl;
-      while (gs->next)
-        gs = gs->next;
-    }
-    else
-    {
-      gs->next = build_complete_gsl(PatL->data);
-      gs = gs->next;
-      while (gs->next)
-        gs = gs->next;
-    }
-    PatL = PatL->next;
-  }
-
-  return cgsl;
-}
-// cellect the information of Patch list
-MyList<Parallel::gridseg> *Parallel::build_complete_gsl_virtual(MyList<Patch> *PatL)
-{
-  MyList<Parallel::gridseg> *cgsl = 0, *gs;
-  while (PatL)
-  {
-    if (cgsl)
-    {
-      gs->next = new MyList<Parallel::gridseg>;
-      gs = gs->next;
-      gs->data = new Parallel::gridseg;
-    }
-    else
-    {
-      cgsl = gs = new MyList<Parallel::gridseg>;
-      gs->data = new Parallel::gridseg;
-    }
-
-    for (int i = 0; i < dim; i++)
-    {
-      gs->data->llb[i] = PatL->data->bbox[i];
-      gs->data->uub[i] = PatL->data->bbox[dim + i];
-      gs->data->shape[i] = PatL->data->shape[i];
-    }
-    gs->data->Bg = 0;
-    gs->next = 0;
-
-    PatL = PatL->next;
-  }
-
-  return cgsl;
-}
-// cellect the information of Patch list without buffer points
-MyList<Parallel::gridseg> *Parallel::build_complete_gsl_virtual2(MyList<Patch> *PatL) // - buffer
-{
-  MyList<Parallel::gridseg> *cgsl = 0, *gs;
-  while (PatL)
-  {
-    if (cgsl)
-    {
-      gs->next = new MyList<Parallel::gridseg>;
-      gs = gs->next;
-      gs->data = new Parallel::gridseg;
-    }
-    else
-    {
-      cgsl = gs = new MyList<Parallel::gridseg>;
-      gs->data = new Parallel::gridseg;
-    }
-
-    for (int i = 0; i < dim; i++)
-    {
-      double DH = PatL->data->getdX(i);
-      gs->data->llb[i] = PatL->data->bbox[i] + PatL->data->lli[i] * DH;
-      gs->data->uub[i] = PatL->data->bbox[dim + i] - PatL->data->uui[i] * DH;
-      gs->data->shape[i] = PatL->data->shape[i] - PatL->data->lli[i] - PatL->data->uui[i];
-    }
-    gs->data->Bg = 0;
-    gs->next = 0;
-
-    PatL = PatL->next;
-  }
-
-  return cgsl;
-}
-// collect all grid segments or blocks without ghost for given patch, without extension
-MyList<Parallel::gridseg> *Parallel::build_bulk_gsl(Patch *Pat)
-{
-  MyList<Parallel::gridseg> *cgsl = 0, *gs;
-  MyList<Block> *BP = Pat->blb;
-  while (BP)
-  {
-    Block *bp = BP->data;
-    if (!cgsl)
-    {
-      cgsl = gs = new MyList<Parallel::gridseg>;
-      gs->data = new Parallel::gridseg;
-    }
-    else
-    {
-      gs->next = new MyList<Parallel::gridseg>;
-      gs = gs->next;
-      gs->data = new Parallel::gridseg;
-    }
-
-    for (int i = 0; i < dim; i++)
-    {
-      double DH = bp->getdX(i);
-      gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] : bp->bbox[dim + i] - ghost_width * DH;
-      gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] : bp->bbox[i] + ghost_width * DH;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-      gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
-#else
-#ifdef Cell
-      gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-    }
-    gs->data->Bg = BP->data;
-    gs->next = 0;
-
-    if (BP == Pat->ble)
-      break;
-    BP = BP->next;
-  }
-
-  return cgsl;
-}
-// bulk part for given Block within given patch, without extension
-MyList<Parallel::gridseg> *Parallel::build_bulk_gsl(Block *bp, Patch *Pat)
-{
-  MyList<Parallel::gridseg> *gs = 0;
-
-  gs = new MyList<Parallel::gridseg>;
-  gs->data = new Parallel::gridseg;
-
-  for (int i = 0; i < dim; i++)
-  {
-    double DH = bp->getdX(i);
-    gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] : bp->bbox[dim + i] - ghost_width * DH;
-    gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] : bp->bbox[i] + ghost_width * DH;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-    gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
-#else
-#ifdef Cell
-    gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-  }
-  gs->data->Bg = bp;
-  gs->next = 0;
-
-  return gs;
-}
-MyList<Parallel::gridseg> *Parallel::clone_gsl(MyList<Parallel::gridseg> *p, bool first_only)
-{
-  MyList<Parallel::gridseg> *np = 0, *q = 0, *pq = 0;
-
-  while (p)
-  {
-    q = new MyList<Parallel::gridseg>;
-    q->data = new Parallel::gridseg;
-    q->data->Bg = p->data->Bg;
-    for (int i = 0; i < dim; i++)
-    {
-      q->data->llb[i] = p->data->llb[i];
-      q->data->uub[i] = p->data->uub[i];
-      q->data->shape[i] = p->data->shape[i];
-    }
-    if (pq)
-      pq->next = q;
-    else
-      np = q;
-    if (first_only)
-    {
-      np->next = 0;
-      return np;
-    }
-    pq = q;
-    p = p->next;
-  }
-  return np;
-}
-MyList<Parallel::gridseg> *Parallel::gs_subtract(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B)
-{
-  if (!A)
-    return 0;
-  if (!B)
-    return clone_gsl(A, true);
-
-  double cut_plane[2 * dim], DH[dim];
-
-  for (int i = 0; i < dim; i++)
-  {
-    DH[i] = A->data->Bg->getdX(i);
-    if (B->data->Bg && !feq(DH[i], B->data->Bg->getdX(i), DH[i] / 2))
-    {
-      cout << "Parallel::gs_subtract meets different grid segment " << DH[i] << " vs " << B->data->Bg->getdX(i) << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-  }
-
-  MyList<Parallel::gridseg> *C = 0, *q;
-  for (int i = 0; i < dim; i++)
-  {
-    if (B->data->llb[i] > A->data->uub[i] || B->data->uub[i] < A->data->llb[i])
-      return clone_gsl(A, true);
-    cut_plane[i] = A->data->llb[i];
-    cut_plane[i + dim] = A->data->uub[i];
-  }
-
-  for (int i = 0; i < dim; i++)
-  {
-    cut_plane[i] = Mymax(A->data->llb[i], B->data->llb[i]);
-    if (cut_plane[i] - A->data->llb[i] > DH[i] / 2)
-    {
-      q = clone_gsl(A, true);
-      // prolong the list from head
-      if (C)
-        q->next = C;
-      C = q;
-      for (int j = 0; j < dim; j++)
-      {
-        if (i == j)
-        {
-          C->data->llb[i] = A->data->llb[i];
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-          C->data->uub[i] = Mymax(C->data->llb[i], cut_plane[i] - DH[i]);
-#else
-#ifdef Cell
-          C->data->uub[i] = Mymax(C->data->llb[i], cut_plane[i]);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-        }
-        else
-        {
-          C->data->llb[j] = cut_plane[j];
-          C->data->uub[j] = cut_plane[j + dim];
-        }
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4) + 1;
-#else
-#ifdef Cell
-        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      }
-    }
-
-    cut_plane[i + dim] = Mymin(A->data->uub[i], B->data->uub[i]);
-    if (A->data->uub[i] - cut_plane[i + dim] > DH[i] / 2)
-    {
-      q = clone_gsl(A, true);
-      if (C)
-        q->next = C;
-      C = q;
-      for (int j = 0; j < dim; j++)
-      {
-        if (i == j)
-        {
-          C->data->uub[i] = A->data->uub[i];
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-          C->data->llb[i] = Mymin(C->data->uub[i], cut_plane[i + dim] + DH[i]);
-#else
-#ifdef Cell
-          C->data->llb[i] = Mymin(C->data->uub[i], cut_plane[i + dim]);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-        }
-        else
-        {
-          C->data->llb[j] = cut_plane[j];
-          C->data->uub[j] = cut_plane[j + dim];
-        }
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4) + 1;
-#else
-#ifdef Cell
-        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      }
-    }
-  }
-  return C;
-}
-// stupid method
-/*
-MyList<Parallel::gridseg> *Parallel::gsl_subtract(MyList<Parallel::gridseg> *A,MyList<Parallel::gridseg> *B) //A subtract B but with A's information
-{
-// always make return and A, B distinct
-  if(!A) return 0;
-
-  if(!B) return clone_gsl(A,0);
-
-  MyList<Parallel::gridseg> *C=0,*C0,*C1,*Cc,*CC0,*gs;
-
-  while(A)
-  {
-     C0=gs_subtract(A,B);  // note C0 becomes a list after subtraction
-     C1=B->next;
-     while(C1)
-     {
-  CC0=C0;
-  Cc=0;
-  while(CC0)
-  {
-    gs=gs_subtract(CC0,C1);
-    if(Cc) Cc->catList(gs);
-    else   Cc=gs;
-    CC0=CC0->next;
-  }
-  if(C0) C0->destroyList();
-  C0=Cc;
-  C1=C1->next;
-     }
-     if(C) C->catList(C0);
-     else  C=C0;
-     A=A->next;
-  }
-
-  return C;
-}
-*/
-// more clever method
-MyList<Parallel::gridseg> *Parallel::gsl_subtract(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B) // A subtract B but with A's information
-{
-  // always make return and A, B distinct
-  if (!A)
-    return 0;
-
-  MyList<Parallel::gridseg> *C = 0, *C0, *C1;
-
-  C = clone_gsl(A, 0);
-
-  while (B)
-  {
-    C0 = 0;
-    C1 = C;
-    while (C1)
-    {
-      if (C0)
-        C0->catList(gs_subtract(C1, B));
-      else
-        C0 = gs_subtract(C1, B);
-      C1 = C1->next;
-    }
-    if (C)
-      C->destroyList();
-    else
-    {
-      if (C0)
-        C0->destroyList();
-      return 0;
-    }
-
-    C = C0;
-    B = B->next;
-  }
-
-  return C;
-}
-MyList<Parallel::gridseg> *Parallel::gs_and(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B)
-{
-  if (!A || !B)
-    return 0;
-
-  double llb[dim], uub[dim];
-  bool flag = false;
-  for (int i = 0; i < dim; i++)
-  {
-    llb[i] = Mymax(A->data->llb[i], B->data->llb[i]);
-    uub[i] = Mymin(A->data->uub[i], B->data->uub[i]);
-    if (llb[i] > uub[i])
-    {
-      flag = true;
-      break;
-    }
-  }
-  if (flag)
-    return 0;
-
-  MyList<Parallel::gridseg> *C;
-  C = clone_gsl(A, true);
-  for (int i = 0; i < dim; i++)
-  {
-    C->data->llb[i] = llb[i];
-    C->data->uub[i] = uub[i];
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-    C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / C->data->Bg->getdX(i) + 0.4) + 1;
-#else
-#ifdef Cell
-    C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / C->data->Bg->getdX(i) + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-  }
-
-  return C;
-}
-// overlap of A_i and (union of all j of B_j)
-MyList<Parallel::gridseg> *Parallel::gsl_and(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B) // A and B but with A's information
-{
-  MyList<Parallel::gridseg> *C = 0, *C1;
-
-  while (A)
-  {
-    C1 = B;
-    while (C1)
-    {
-      if (C)
-        C->catList(gs_and(A, C1));
-      else
-        C = gs_and(A, C1);
-      C1 = C1->next;
-    }
-    A = A->next;
-  }
-  return C;
-}
-// collect all ghost grid segments or blocks for given patch
-MyList<Parallel::gridseg> *Parallel::build_ghost_gsl(Patch *Pat)
-{
-  MyList<Parallel::gridseg> *cgsl = 0, *gs, *gsb;
-  MyList<Block> *BP = Pat->blb;
-  while (BP)
-  {
-    gs = new MyList<Parallel::gridseg>;
-    gs->data = new Parallel::gridseg;
-
-    for (int i = 0; i < dim; i++)
-    {
-      gs->data->llb[i] = BP->data->bbox[i];
-      gs->data->uub[i] = BP->data->bbox[dim + i];
-      gs->data->shape[i] = BP->data->shape[i];
-    }
-    gs->data->Bg = BP->data;
-    gs->next = 0;
-
-    gsb = build_bulk_gsl(BP->data, Pat);
-
-    if (!cgsl)
-      cgsl = gs_subtract(gs, gsb);
-    else
-      cgsl->catList(gs_subtract(gs, gsb));
-
-    gsb->destroyList();
-    gs->destroyList();
-
-    if (BP == Pat->ble)
-      break;
-    BP = BP->next;
-  }
-
-  return cgsl;
-}
-// collect all ghost grid segments or blocks for given patch list
-MyList<Parallel::gridseg> *Parallel::build_ghost_gsl(MyList<Patch> *PatL)
-{
-  MyList<Parallel::gridseg> *cgsl = 0, *gs;
-  while (PatL)
-  {
-    if (!cgsl)
-    {
-      cgsl = build_ghost_gsl(PatL->data);
-      gs = cgsl;
-      while (gs->next)
-        gs = gs->next;
-    }
-    else
-    {
-      gs->next = build_ghost_gsl(PatL->data);
-      gs = gs->next;
-      while (gs->next)
-        gs = gs->next;
-    }
-    PatL = PatL->next;
-  }
-
-  return cgsl;
-}
-// collect all grid segments or blocks without ghost for given patch
-// special for Sync usage, so we do not need consider missing points
-MyList<Parallel::gridseg> *Parallel::build_owned_gsl0(Patch *Pat, int rank_in)
-{
-  MyList<Parallel::gridseg> *cgsl = 0, *gs;
-  MyList<Block> *BP = Pat->blb;
-  while (BP)
-  {
-    Block *bp = BP->data;
-    if (bp->rank == rank_in)
-    {
-      if (!cgsl)
-      {
-        cgsl = gs = new MyList<Parallel::gridseg>;
-        gs->data = new Parallel::gridseg;
-      }
-      else
-      {
-        gs->next = new MyList<Parallel::gridseg>;
-        gs = gs->next;
-        gs->data = new Parallel::gridseg;
-      }
-
-      for (int i = 0; i < dim; i++)
-      {
-        double DH = bp->getdX(i);
-        gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] : bp->bbox[dim + i] - ghost_width * DH;
-        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] : bp->bbox[i] + ghost_width * DH;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
-#else
-#ifdef Cell
-        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      }
-      gs->data->Bg = BP->data;
-      gs->next = 0;
-    }
-
-    if (BP == Pat->ble)
-      break;
-    BP = BP->next;
-  }
-
-  return cgsl;
-}
-// collect all grid segments or blocks without ghost for given patch
-MyList<Parallel::gridseg> *Parallel::build_owned_gsl1(Patch *Pat, int rank_in)
-{
-  MyList<Parallel::gridseg> *cgsl = 0, *gs;
-  MyList<Block> *BP = Pat->blb;
-  while (BP)
-  {
-    Block *bp = BP->data;
-    if (bp->rank == rank_in)
-    {
-      if (!cgsl)
-      {
-        cgsl = gs = new MyList<Parallel::gridseg>;
-        gs->data = new Parallel::gridseg;
-      }
-      else
-      {
-        gs->next = new MyList<Parallel::gridseg>;
-        gs = gs->next;
-        gs->data = new Parallel::gridseg;
-      }
-
-      for (int i = 0; i < dim; i++)
-      {
-        double DH = bp->getdX(i);
-        gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] : bp->bbox[dim + i] - ghost_width * DH;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        // NOTE: our dividing structure is (exclude ghost)
-        // -1 0
-        //       1  2
-        // so (0,1) does not belong to any part for vertex structure, we always put it to right part, this is consistent to
-        // the fortran routine where we always take floor to get index
-        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] : bp->bbox[i] + (ghost_width - 1) * DH;
-        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
-#else
-#ifdef Cell
-        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] : bp->bbox[i] + ghost_width * DH;
-        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      }
-      gs->data->Bg = BP->data;
-      gs->next = 0;
-    }
-
-    if (BP == Pat->ble)
-      break;
-    BP = BP->next;
-  }
-
-  return cgsl;
-}
-// collect all grid segments or blocks without ghost nor buffer for given patch
-MyList<Parallel::gridseg> *Parallel::build_owned_gsl2(Patch *Pat, int rank_in)
-{
-  MyList<Parallel::gridseg> *cgsl = 0, *gs;
-  MyList<Block> *BP = Pat->blb;
-  while (BP)
-  {
-    Block *bp = BP->data;
-    if (bp->rank == rank_in)
-    {
-      if (!cgsl)
-      {
-        cgsl = gs = new MyList<Parallel::gridseg>;
-        gs->data = new Parallel::gridseg;
-      }
-      else
-      {
-        gs->next = new MyList<Parallel::gridseg>;
-        gs = gs->next;
-        gs->data = new Parallel::gridseg;
-      }
-
-      for (int i = 0; i < dim; i++)
-      {
-        double DH = bp->getdX(i);
-        gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] - Pat->uui[i] * DH : bp->bbox[dim + i] - ghost_width * DH;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        // NOTE: our dividing structure is (exclude ghost)
-        // -1 0
-        //       1  2
-        // so (0,1) does not belong to any part for vertex structure, we always put it to right part, this is consistent to
-        // the fortran routine where we always take floor to get index
-        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] + Pat->lli[i] * DH : bp->bbox[i] + (ghost_width - 1) * DH;
-        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
-#else
-#ifdef Cell
-        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] + Pat->lli[i] * DH : bp->bbox[i] + ghost_width * DH;
-        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      }
-      gs->data->Bg = BP->data;
-      gs->next = 0;
-    }
-
-    if (BP == Pat->ble)
-      break;
-    BP = BP->next;
-  }
-
-  return cgsl;
-}
-// collect all grid segments or blocks without ghost for given patch, and delete the ghost_width for interpolation consideration on the patch boundary
-MyList<Parallel::gridseg> *Parallel::build_owned_gsl3(Patch *Pat, int rank_in, int Symmetry)
-{
-  MyList<Parallel::gridseg> *cgsl = 0, *gs;
-  MyList<Block> *BP = Pat->blb;
-  while (BP)
-  {
-    Block *bp = BP->data;
-    if (bp->rank == rank_in)
-    {
-      if (!cgsl)
-      {
-        cgsl = gs = new MyList<Parallel::gridseg>;
-        gs->data = new Parallel::gridseg;
-      }
-      else
-      {
-        gs->next = new MyList<Parallel::gridseg>;
-        gs = gs->next;
-        gs->data = new Parallel::gridseg;
-      }
-
-      for (int i = 0; i < dim; i++)
-      {
-        double DH = bp->getdX(i);
-        gs->data->uub[i] = bp->bbox[dim + i] - ghost_width * DH;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        // NOTE: our dividing structure is (exclude ghost)
-        // -1 0
-        //       1  2
-        // so (0,1) does not belong to any part for vertex structure, we always put it to right part, this is consistent to
-        // the fortran routine where we always take floor to get index
-        gs->data->llb[i] = bp->bbox[i] + (ghost_width - 1) * DH;
-        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
-#else
-#ifdef Cell
-        gs->data->llb[i] = bp->bbox[i] + ghost_width * DH;
-        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      }
-      // Symmetry consideration
-      if (Symmetry > 0)
-      {
-        double DH = bp->getdX(2);
-        if (feq(bp->bbox[2], 0, DH / 2))
-        {
-          gs->data->llb[2] = bp->bbox[2];
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-          gs->data->shape[2] = int((gs->data->uub[2] - gs->data->llb[2]) / DH + 0.4) + 1;
-#else
-#ifdef Cell
-          gs->data->shape[2] = int((gs->data->uub[2] - gs->data->llb[2]) / DH + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-        }
-        if (Symmetry > 1)
-        {
-          for (int i = 0; i < 2; i++)
-          {
-            DH = bp->getdX(i);
-            if (feq(bp->bbox[i], 0, DH / 2))
-            {
-              gs->data->llb[i] = bp->bbox[i];
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-              gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
-#else
-#ifdef Cell
-              gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-            }
-          }
-        }
-      }
-
-      gs->data->Bg = BP->data;
-      gs->next = 0;
-    }
-
-    if (BP == Pat->ble)
-      break;
-    BP = BP->next;
-  }
-
-  return cgsl;
-}
-// collect all grid segments or blocks without ghost nor buffer for given patch,
-// and delete the ghost_width for interpolation consideration on the patch boundary
-MyList<Parallel::gridseg> *Parallel::build_owned_gsl4(Patch *Pat, int rank_in, int Symmetry)
-{
-  MyList<Parallel::gridseg> *cgsl = 0, *gs;
-  MyList<Block> *BP = Pat->blb;
-  while (BP)
-  {
-    Block *bp = BP->data;
-    if (bp->rank == rank_in)
-    {
-      if (!cgsl)
-      {
-        cgsl = gs = new MyList<Parallel::gridseg>;
-        gs->data = new Parallel::gridseg;
-      }
-      else
-      {
-        gs->next = new MyList<Parallel::gridseg>;
-        gs = gs->next;
-        gs->data = new Parallel::gridseg;
-      }
-
-      for (int i = 0; i < dim; i++)
-      {
-        double DH = bp->getdX(i);
-        gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] - Pat->uui[i] * DH : bp->bbox[dim + i];
-        gs->data->uub[i] -= ghost_width * DH;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        // NOTE: our dividing structure is (exclude ghost)
-        // -1 0
-        //       1  2
-        // so (0,1) does not belong to any part for vertex structure, we always put it to right part, this is consistent to
-        // the fortran routine where we always take floor to get index
-        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] + Pat->lli[i] * DH : bp->bbox[i];
-        gs->data->llb[i] += (ghost_width - 1) * DH;
-        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
-#else
-#ifdef Cell
-        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] + Pat->lli[i] * DH : bp->bbox[i];
-        gs->data->llb[i] += ghost_width * DH;
-        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      }
-      // Symmetry consideration
-      if (Symmetry > 0)
-      {
-        double DH = bp->getdX(2);
-        if (feq(bp->bbox[2], 0, DH / 2))
-        {
-          gs->data->llb[2] = bp->bbox[2];
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-          gs->data->shape[2] = int((gs->data->uub[2] - gs->data->llb[2]) / DH + 0.4) + 1;
-#else
-#ifdef Cell
-          gs->data->shape[2] = int((gs->data->uub[2] - gs->data->llb[2]) / DH + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-        }
-        if (Symmetry > 1)
-        {
-          for (int i = 0; i < 2; i++)
-          {
-            DH = bp->getdX(i);
-            if (feq(bp->bbox[i], 0, DH / 2))
-            {
-              gs->data->llb[i] = bp->bbox[i];
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-              gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
-#else
-#ifdef Cell
-              gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-            }
-          }
-        }
-      }
-
-      gs->data->Bg = BP->data;
-      gs->next = 0;
-    }
-
-    if (BP == Pat->ble)
-      break;
-    BP = BP->next;
-  }
-
-  return cgsl;
-}
-// collect all grid segments or blocks without ghost nor buffer for given patch, no extention
-MyList<Parallel::gridseg> *Parallel::build_owned_gsl5(Patch *Pat, int rank_in)
-{
-  MyList<Parallel::gridseg> *cgsl = 0, *gs;
-  MyList<Block> *BP = Pat->blb;
-  while (BP)
-  {
-    Block *bp = BP->data;
-    if (bp->rank == rank_in)
-    {
-      if (!cgsl)
-      {
-        cgsl = gs = new MyList<Parallel::gridseg>;
-        gs->data = new Parallel::gridseg;
-      }
-      else
-      {
-        gs->next = new MyList<Parallel::gridseg>;
-        gs = gs->next;
-        gs->data = new Parallel::gridseg;
-      }
-
-      for (int i = 0; i < dim; i++)
-      {
-        double DH = bp->getdX(i);
-        gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] - Pat->uui[i] * DH : bp->bbox[dim + i] - ghost_width * DH;
-        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] + Pat->lli[i] * DH : bp->bbox[i] + ghost_width * DH;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
-#else
-#ifdef Cell
-        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      }
-      gs->data->Bg = BP->data;
-      gs->next = 0;
-    }
-
-    if (BP == Pat->ble)
-      break;
-    BP = BP->next;
-  }
-
-  return cgsl;
-}
-// collect all grid segments or blocks without ghost for given patch list
-// stupid method
-/*
-MyList<Parallel::gridseg> *Parallel::build_owned_gsl(MyList<Patch> *PatL,int rank_in,int type,int Symmetry)
-{
-       MyList<Parallel::gridseg> *cgsl=0,*gs;
-       while(PatL)
-       {
-    if(!cgsl)
-    {
-            switch(type)
-      {
-         case 0:
-                  cgsl = build_owned_gsl0(PatL->data,rank_in);
-      break;
-         case 1:
-                  cgsl = build_owned_gsl1(PatL->data,rank_in);
-      break;
-         case 2:
-                  cgsl = build_owned_gsl2(PatL->data,rank_in);
-      break;
-         case 3:
-                  cgsl = build_owned_gsl3(PatL->data,rank_in,Symmetry);
-      break;
-         case 4:
-                  cgsl = build_owned_gsl4(PatL->data,rank_in,Symmetry);
-      break;
-         case 5:
-                  cgsl = build_owned_gsl5(PatL->data,rank_in);
-      break;
-               default:
-      cout<<"Parallel::build_owned_gsl : unknown type = "<<type<<endl;
-                  MPI_Abort(MPI_COMM_WORLD,1);
-      }
-       gs = cgsl;
-       while(gs && gs->next) gs = gs->next;
-    }
-    else
-    {
-       switch(type)
-      {
-         case 0:
-                  gs->next = build_owned_gsl0(PatL->data,rank_in);
-      break;
-         case 1:
-                  gs->next = build_owned_gsl1(PatL->data,rank_in);
-      break;
-         case 2:
-                  gs->next = build_owned_gsl2(PatL->data,rank_in);
-      break;
-         case 3:
-                  gs->next = build_owned_gsl3(PatL->data,rank_in,Symmetry);
-      break;
-         case 4:
-                  gs->next = build_owned_gsl4(PatL->data,rank_in,Symmetry);
-      break;
-         case 5:
-                  gs->next = build_owned_gsl5(PatL->data,rank_in);
-      break;
-               default:
-      cout<<"Parallel::build_owned_gsl : unknown type = "<<type<<endl;
-                  MPI_Abort(MPI_COMM_WORLD,1);
-      }
-       while(gs && gs->next) gs = gs->next;
-    }
-    PatL = PatL->next;
-       }
-
-       return cgsl;
-}
-*/
-// more clever method
-MyList<Parallel::gridseg> *Parallel::build_owned_gsl(MyList<Patch> *PatL, int rank_in, int type, int Symmetry)
-{
-  MyList<Parallel::gridseg> *cgsl = 0, *gs;
-  while (PatL)
-  {
-    switch (type)
-    {
-    case 0:
-      gs = build_owned_gsl0(PatL->data, rank_in);
-      break;
-    case 1:
-      gs = build_owned_gsl1(PatL->data, rank_in);
-      break;
-    case 2:
-      gs = build_owned_gsl2(PatL->data, rank_in);
-      break;
-    case 3:
-      gs = build_owned_gsl3(PatL->data, rank_in, Symmetry);
-      break;
-    case 4:
-      gs = build_owned_gsl4(PatL->data, rank_in, Symmetry);
-      break;
-    case 5:
-      gs = build_owned_gsl5(PatL->data, rank_in);
-      break;
-    default:
-      cout << "Parallel::build_owned_gsl : unknown type = " << type << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-    if (cgsl)
-      cgsl->catList(gs);
-    else
-      cgsl = gs;
-    PatL = PatL->next;
-  }
-
-  return cgsl;
-}
-// according to overlape to determine real grid segments
-void Parallel::build_gstl(MyList<Parallel::gridseg> *srci, MyList<Parallel::gridseg> *dsti,
-                          MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst)
-{
-  *out_src = *out_dst = 0;
-
-  if (!srci || !dsti)
-    return;
-
-  MyList<Parallel::gridseg> *s, *d;
-  MyList<Parallel::gridseg> *s2, *d2;
-
-  double llb[dim], uub[dim];
-
-  s = srci;
-  while (s)
-  {
-    Parallel::gridseg *sd = s->data;
-    d = dsti;
-    while (d)
-    {
-      Parallel::gridseg *dd = d->data;
-      bool flag = true;
-      for (int i = 0; i < dim; i++)
-      {
-        double SH = sd->Bg->getdX(i), DH = dd->Bg->getdX(i);
-        llb[i] = Mymax(sd->llb[i], dd->llb[i]);
-        uub[i] = Mymin(sd->uub[i], dd->uub[i]);
-        // make sure the region boundary is consistent to the grids
-        // here we only judge if the domain is empty, so do not need to adjust the align
-        double lb = llb[i], ub = uub[i];
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        // ---*---
-        // x-------x
-        //		if     (int(2*(sd->uub[i]-uub[i])/SH+0.4)%2 == 1) ub = uub[i]-SH/2;
-        //		else if(int(2*(dd->uub[i]-uub[i])/DH+0.4)%2 == 1) ub = uub[i]-DH/2;
-        //		if     (int(2*(llb[i]-sd->llb[i])/SH+0.4)%2 == 1) lb = llb[i]+SH/2;
-        //		else if(int(2*(llb[i]-dd->llb[i])/DH+0.4)%2 == 1) lb = llb[i]+DH/2;
-        if (lb > ub + Mymin(SH, DH) / 2)
-        {
-          flag = false;
-          break;
-        } // special for isolated point
-#else
-#ifdef Cell
-        // |------|
-        // |-------------|
-        //		if     (int(2*(sd->uub[i]-uub[i])/SH+0.4)%2 == 1) ub = uub[i]+SH/2;
-        //		else if(int(2*(dd->uub[i]-uub[i])/DH+0.4)%2 == 1) ub = uub[i]+DH/2;
-        //        |------|
-        // |-------------|
-        //		if     (int(2*(llb[i]-sd->llb[i])/SH+0.4)%2 == 1) lb = llb[i]-SH/2;
-        //		else if(int(2*(llb[i]-dd->llb[i])/DH+0.4)%2 == 1) lb = llb[i]-DH/2;
-        if (ub - lb < Mymin(SH, DH) / 2)
-        {
-          flag = false;
-          break;
-        } // even for isolated point, it has a cell belong to it
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      }
-
-      if (flag)
-      {
-        if (!(*out_src))
-        {
-          *out_src = s2 = new MyList<Parallel::gridseg>;
-          *out_dst = d2 = new MyList<Parallel::gridseg>;
-          s2->data = new Parallel::gridseg;
-          d2->data = new Parallel::gridseg;
-        }
-        else
-        {
-          s2->next = new MyList<Parallel::gridseg>;
-          s2 = s2->next;
-          d2->next = new MyList<Parallel::gridseg>;
-          d2 = d2->next;
-          s2->data = new Parallel::gridseg;
-          d2->data = new Parallel::gridseg;
-        }
-
-        for (int i = 0; i < dim; i++)
-        {
-          double SH = sd->Bg->getdX(i), DH = dd->Bg->getdX(i);
-          s2->data->llb[i] = d2->data->llb[i] = llb[i];
-          s2->data->uub[i] = d2->data->uub[i] = uub[i];
-// using float method to count point, we do not need following consideration (2012 nov 17)
-#if 1
-
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-          // old code distuinguish vertex and cell
-          //		   if     (int(2*(sd->uub[i]-uub[i])/SH+0.4)%2 == 1) s2->data->uub[i] = uub[i]-SH/2;
-          //		   else if(int(2*(dd->uub[i]-uub[i])/DH+0.4)%2 == 1) d2->data->uub[i] = uub[i]-DH/2;
-          //	           if     (int(2*(llb[i]-sd->llb[i])/SH+0.4)%2 == 1) s2->data->llb[i] = llb[i]+SH/2;
-          //		   else if(int(2*(llb[i]-dd->llb[i])/DH+0.4)%2 == 1) d2->data->llb[i] = llb[i]+DH/2;
-          // new code: here we concern much more about missing point, because overlaping domain has been gaureented above
-          if (int(2 * (sd->uub[i] - uub[i]) / SH + 0.4) % 2 == 1)
-            s2->data->uub[i] = uub[i] + SH / 2;
-          else if (int(2 * (dd->uub[i] - uub[i]) / DH + 0.4) % 2 == 1)
-            d2->data->uub[i] = uub[i] + DH / 2;
-          if (int(2 * (llb[i] - sd->llb[i]) / SH + 0.4) % 2 == 1)
-            s2->data->llb[i] = llb[i] - SH / 2;
-          else if (int(2 * (llb[i] - dd->llb[i]) / DH + 0.4) % 2 == 1)
-            d2->data->llb[i] = llb[i] - DH / 2;
-          s2->data->shape[i] = int((s2->data->uub[i] - s2->data->llb[i]) / SH + 0.4) + 1;
-          d2->data->shape[i] = int((d2->data->uub[i] - d2->data->llb[i]) / DH + 0.4) + 1;
-#else
-#ifdef Cell
-          if (int(2 * (sd->uub[i] - uub[i]) / SH + 0.4) % 2 == 1)
-            s2->data->uub[i] = uub[i] + SH / 2;
-          else if (int(2 * (dd->uub[i] - uub[i]) / DH + 0.4) % 2 == 1)
-            d2->data->uub[i] = uub[i] + DH / 2;
-          if (int(2 * (llb[i] - sd->llb[i]) / SH + 0.4) % 2 == 1)
-            s2->data->llb[i] = llb[i] - SH / 2;
-          else if (int(2 * (llb[i] - dd->llb[i]) / DH + 0.4) % 2 == 1)
-            d2->data->llb[i] = llb[i] - DH / 2;
-          s2->data->shape[i] = int((s2->data->uub[i] - s2->data->llb[i]) / SH + 0.4);
-          d2->data->shape[i] = int((d2->data->uub[i] - d2->data->llb[i]) / DH + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-
-#endif
-          s2->data->illb[i] = sd->illb[i];
-          d2->data->illb[i] = dd->illb[i];
-          s2->data->iuub[i] = sd->iuub[i];
-          d2->data->iuub[i] = dd->iuub[i];
-        }
-        s2->data->Bg = sd->Bg;
-        s2->next = 0;
-        d2->data->Bg = dd->Bg;
-        d2->next = 0;
-      }
-      d = d->next;
-    }
-    s = s->next;
-  }
-}
-//   PACK: prepare target data in 'data'
-// UNPACK: copy target data from 'data' to corresponding numerical grids
-int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<Parallel::gridseg> *dst, int rank_in, int dir,
-                          MyList<var> *VarLists /* source */, MyList<var> *VarListd /* target */, int Symmetry)
-{
-  int myrank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
-  int DIM = dim;
-
-  if (dir != PACK && dir != UNPACK)
-  {
-    cout << "error dir " << dir << " for data_packer " << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-
-  int size_out = 0;
-
-  if (!src || !dst)
-    return size_out;
-
-  MyList<var> *varls, *varld;
-
-  varls = VarLists;
-  varld = VarListd;
-  while (varls && varld)
-  {
-    varls = varls->next;
-    varld = varld->next;
-  }
-
-  if (varls || varld)
-  {
-    cout << "error in short data packer, var lists does not match." << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-
-  int type; /* 1 copy, 2 restrict, 3 prolong */
-  if (src->data->Bg->lev == dst->data->Bg->lev)
-    type = 1;
-  else if (src->data->Bg->lev > dst->data->Bg->lev)
-    type = 2;
-  else
-    type = 3;
-
-  while (src && dst)
-  {
-    if ((dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) ||
-        (dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank))
-    {
-      varls = VarLists;
-      varld = VarListd;
-      while (varls && varld)
-      {
-        if (data)
-        {
-          if (dir == PACK)
-            switch (type)
-            {
-              // attention must be paied to the difference between src's llb,uub and dst's llb,uub
-            case 1:
-              f_copy(DIM, dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
-                     src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
-                     dst->data->llb, dst->data->uub);
-              break;
-            case 2:
-              f_restrict3(DIM, dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
-                          src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
-                          dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
-              break;
-            case 3:
-              f_prolong3(DIM, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
-                         dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
-                         dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
-            }
-          if (dir == UNPACK) // from target data to corresponding grid
-            f_copy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn],
-                   dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
-                   dst->data->llb, dst->data->uub);
-        }
-        size_out += dst->data->shape[0] * dst->data->shape[1] * dst->data->shape[2];
-        varls = varls->next;
-        varld = varld->next;
-      }
-    }
-    dst = dst->next;
-    src = src->next;
-  }
-
-  return size_out;
-}
-int Parallel::data_packermix(double *data, MyList<Parallel::gridseg> *src, MyList<Parallel::gridseg> *dst, int rank_in, int dir,
-                             MyList<var> *VarLists /* source */, MyList<var> *VarListd /* target */, int Symmetry)
-{
-  int myrank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
-  int DIM = dim;
-
-  if (dir != PACK && dir != UNPACK)
-  {
-    cout << "Parallel::data_packermix: error dir " << dir << " for data_packermix." << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-
-  int size_out = 0;
-
-  if (!src || !dst)
-    return size_out;
-
-  MyList<var> *varls, *varld;
-
-  varls = VarLists;
-  varld = VarListd;
-  while (varls && varld)
-  {
-    varls = varls->next;
-    varld = varld->next;
-  }
-
-  if (varls || varld)
-  {
-    cout << "error in short data packer, var lists does not match." << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-
-  int type; /* 1 copy, 2 restrict, 3 prolong */
-  if (src->data->Bg->lev == dst->data->Bg->lev)
-    type = 1;
-  else if (src->data->Bg->lev > dst->data->Bg->lev)
-    type = 2;
-  else
-    type = 3;
-
-  if (type != 3)
-  {
-    cout << "Parallel::data_packermix: error type " << type << " for data_packermix." << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-
-  while (src && dst)
-  {
-    if ((dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) ||
-        (dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank))
-    {
-      varls = VarLists;
-      varld = VarListd;
-      while (varls && varld)
-      {
-        if (data)
-        {
-          if (dir == PACK)
-            f_prolongcopy3(DIM, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
-                           dst->data->llb, dst->data->uub, src->data->shape, data + size_out,
-                           src->data->llb, src->data->uub, varls->data->SoA, Symmetry);
-          if (dir == UNPACK) // from target data to corresponding grid
-            f_prolongmix3(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn],
-                          src->data->llb, src->data->uub, src->data->shape, data + size_out,
-                          dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry, dst->data->illb, dst->data->iuub);
-        }
-        // the symmetry problem should be dealt in prolongcopy3,
-        // so we always have ghost_width for both sides
-        size_out += (src->data->shape[0] + 2 * ghost_width) * (src->data->shape[1] + 2 * ghost_width) * (src->data->shape[2] + 2 * ghost_width);
-        varls = varls->next;
-        varld = varld->next;
-      }
-    }
-    dst = dst->next;
-    src = src->next;
-  }
-
-  return size_out;
-}
-//
-void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
-                        MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
-                        int Symmetry)
-{
-  int myrank, cpusize;
-  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
-  int node;
-
-  MPI_Request *reqs;
-  MPI_Status *stats;
-  reqs = new MPI_Request[2 * cpusize];
-  stats = new MPI_Status[2 * cpusize];
-  int req_no = 0;
-
-  double **send_data, **rec_data;
-  send_data = new double *[cpusize];
-  rec_data = new double *[cpusize];
-  int length;
-
-  for (node = 0; node < cpusize; node++)
-  {
-    send_data[node] = rec_data[node] = 0;
-    if (node == myrank)
-    {
-      if (length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry))
-      {
-        rec_data[node] = new double[length];
-        if (!rec_data[node])
-        {
-          cout << "out of memory when new in short transfer, place 1" << endl;
-          MPI_Abort(MPI_COMM_WORLD, 1);
-        }
-        data_packer(rec_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
-      }
-    }
-    else
-    {
-      // send from this cpu to cpu#node
-      if (length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry))
-      {
-        send_data[node] = new double[length];
-        if (!send_data[node])
-        {
-          cout << "out of memory when new in short transfer, place 2" << endl;
-          MPI_Abort(MPI_COMM_WORLD, 1);
-        }
-        data_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
-        MPI_Isend((void *)send_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
-      }
-      // receive from cpu#node to this cpu
-      if (length = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry))
-      {
-        rec_data[node] = new double[length];
-        if (!rec_data[node])
-        {
-          cout << "out of memory when new in short transfer, place 3" << endl;
-          MPI_Abort(MPI_COMM_WORLD, 1);
-        }
-        MPI_Irecv((void *)rec_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
-      }
-    }
-  }
-  // wait for all requests to complete
-  MPI_Waitall(req_no, reqs, stats);
-
-  for (node = 0; node < cpusize; node++)
-    if (rec_data[node])
-      data_packer(rec_data[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
-
-  for (node = 0; node < cpusize; node++)
-  {
-    if (send_data[node])
-      delete[] send_data[node];
-    if (rec_data[node])
-      delete[] rec_data[node];
-  }
-
-  delete[] reqs;
-  delete[] stats;
-  delete[] send_data;
-  delete[] rec_data;
-}
-//
-void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
-                           MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
-                           int Symmetry)
-{
-  int myrank, cpusize;
-  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
-  int node;
-
-  MPI_Request *reqs;
-  MPI_Status *stats;
-  reqs = new MPI_Request[2 * cpusize];
-  stats = new MPI_Status[2 * cpusize];
-  int req_no = 0;
-
-  double **send_data, **rec_data;
-  send_data = new double *[cpusize];
-  rec_data = new double *[cpusize];
-  int length;
-
-  for (node = 0; node < cpusize; node++)
-  {
-    send_data[node] = rec_data[node] = 0;
-    if (node == myrank)
-    {
-      if (length = data_packermix(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry))
-      {
-        rec_data[node] = new double[length];
-        if (!rec_data[node])
-        {
-          cout << "out of memory when new in short transfer, place 1" << endl;
-          MPI_Abort(MPI_COMM_WORLD, 1);
-        }
-        data_packermix(rec_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
-      }
-    }
-    else
-    {
-      // send from this cpu to cpu#node
-      if (length = data_packermix(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry))
-      {
-        send_data[node] = new double[length];
-        if (!send_data[node])
-        {
-          cout << "out of memory when new in short transfer, place 2" << endl;
-          MPI_Abort(MPI_COMM_WORLD, 1);
-        }
-        data_packermix(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
-        MPI_Isend((void *)send_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
-      }
-      // receive from cpu#node to this cpu
-      if (length = data_packermix(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry))
-      {
-        rec_data[node] = new double[length];
-        if (!rec_data[node])
-        {
-          cout << "out of memory when new in short transfer, place 3" << endl;
-          MPI_Abort(MPI_COMM_WORLD, 1);
-        }
-        MPI_Irecv((void *)rec_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
-      }
-    }
-  }
-  // wait for all requests to complete
-  MPI_Waitall(req_no, reqs, stats);
-
-  for (node = 0; node < cpusize; node++)
-    if (rec_data[node])
-      data_packermix(rec_data[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
-
-  for (node = 0; node < cpusize; node++)
-  {
-    if (send_data[node])
-      delete[] send_data[node];
-    if (rec_data[node])
-      delete[] rec_data[node];
-  }
-
-  delete[] reqs;
-  delete[] stats;
-  delete[] send_data;
-  delete[] rec_data;
-}
-void Parallel::Sync(Patch *Pat, MyList<var> *VarList, int Symmetry)
-{
-  int cpusize;
-  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-
-  MyList<Parallel::gridseg> *dst;
-  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
-  src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
-
-  dst = build_ghost_gsl(Pat); // ghost region only
-  for (int node = 0; node < cpusize; node++)
-  {
-    src[node] = build_owned_gsl0(Pat, node);                              // for the part without ghost points and do not extend
-    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer_src[node], data locate on cpu#node;
-                                                                          // but for transfer_dst[node] the data may locate on any node
-  }
-
-  transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
-
-  if (dst)
-    dst->destroyList();
-  for (int node = 0; node < cpusize; node++)
-  {
-    if (src[node])
-      src[node]->destroyList();
-    if (transfer_src[node])
-      transfer_src[node]->destroyList();
-    if (transfer_dst[node])
-      transfer_dst[node]->destroyList();
-  }
-
-  delete[] src;
-  delete[] transfer_src;
-  delete[] transfer_dst;
-}
-void Parallel::Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
-{
-  // Patch inner Synch
-  MyList<Patch> *Pp = PatL;
-  while (Pp)
-  {
-    Sync(Pp->data, VarList, Symmetry);
-    Pp = Pp->next;
-  }
-
-  // Patch inter Synch
-  int cpusize;
-  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-
-  MyList<Parallel::gridseg> *dst;
-  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
-  src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
-
-  dst = build_buffer_gsl(PatL); // buffer region only
-  for (int node = 0; node < cpusize; node++)
-  {
-    src[node] = build_owned_gsl(PatL, node, 5, Symmetry);                 // for the part without ghost nor buffer points and do not extend
-    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
-  }
-
-  transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
-
-  if (dst)
-    dst->destroyList();
-  for (int node = 0; node < cpusize; node++)
-  {
-    if (src[node])
-      src[node]->destroyList();
-    if (transfer_src[node])
-      transfer_src[node]->destroyList();
-    if (transfer_dst[node])
-      transfer_dst[node]->destroyList();
-  }
-
-  delete[] src;
-  delete[] transfer_src;
-  delete[] transfer_dst;
-}
-// Merged Sync: collect all intra-patch and inter-patch grid segment lists,
-// then issue a single transfer() call instead of N+1 separate ones.
-void Parallel::Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
-{
-  int cpusize;
-  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-
-  MyList<Parallel::gridseg> **combined_src = new MyList<Parallel::gridseg> *[cpusize];
-  MyList<Parallel::gridseg> **combined_dst = new MyList<Parallel::gridseg> *[cpusize];
-  for (int node = 0; node < cpusize; node++)
-    combined_src[node] = combined_dst[node] = 0;
-
-  // Phase A: Intra-patch ghost exchange segments
-  MyList<Patch> *Pp = PatL;
-  while (Pp)
-  {
-    Patch *Pat = Pp->data;
-    MyList<Parallel::gridseg> *dst_ghost = build_ghost_gsl(Pat);
-
-    for (int node = 0; node < cpusize; node++)
-    {
-      MyList<Parallel::gridseg> *src_owned = build_owned_gsl0(Pat, node);
-      MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
-      build_gstl(src_owned, dst_ghost, &tsrc, &tdst);
-
-      if (tsrc)
-      {
-        if (combined_src[node])
-          combined_src[node]->catList(tsrc);
-        else
-          combined_src[node] = tsrc;
-      }
-      if (tdst)
-      {
-        if (combined_dst[node])
-          combined_dst[node]->catList(tdst);
-        else
-          combined_dst[node] = tdst;
-      }
-
-      if (src_owned)
-        src_owned->destroyList();
-    }
-
-    if (dst_ghost)
-      dst_ghost->destroyList();
-
-    Pp = Pp->next;
-  }
-
-  // Phase B: Inter-patch buffer exchange segments
-  MyList<Parallel::gridseg> *dst_buffer = build_buffer_gsl(PatL);
-  for (int node = 0; node < cpusize; node++)
-  {
-    MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatL, node, 5, Symmetry);
-    MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
-    build_gstl(src_owned, dst_buffer, &tsrc, &tdst);
-
-    if (tsrc)
-    {
-      if (combined_src[node])
-        combined_src[node]->catList(tsrc);
-      else
-        combined_src[node] = tsrc;
-    }
-    if (tdst)
-    {
-      if (combined_dst[node])
-        combined_dst[node]->catList(tdst);
-      else
-        combined_dst[node] = tdst;
-    }
-
-    if (src_owned)
-      src_owned->destroyList();
-  }
-  if (dst_buffer)
-    dst_buffer->destroyList();
-
-  // Phase C: Single transfer
-  transfer(combined_src, combined_dst, VarList, VarList, Symmetry);
-
-  // Phase D: Cleanup
-  for (int node = 0; node < cpusize; node++)
-  {
-    if (combined_src[node])
-      combined_src[node]->destroyList();
-    if (combined_dst[node])
-      combined_dst[node]->destroyList();
-  }
-  delete[] combined_src;
-  delete[] combined_dst;
-}
-// SyncCache constructor
-Parallel::SyncCache::SyncCache()
-    : valid(false), cpusize(0), combined_src(0), combined_dst(0),
-      send_lengths(0), recv_lengths(0), send_bufs(0), recv_bufs(0),
-      send_buf_caps(0), recv_buf_caps(0), reqs(0), stats(0), max_reqs(0),
-      lengths_valid(false)
-{
-}
-// SyncCache invalidate: free grid segment lists but keep buffers
-void Parallel::SyncCache::invalidate()
-{
-  if (!valid)
-    return;
-  for (int i = 0; i < cpusize; i++)
-  {
-    if (combined_src[i])
-      combined_src[i]->destroyList();
-    if (combined_dst[i])
-      combined_dst[i]->destroyList();
-    combined_src[i] = combined_dst[i] = 0;
-    send_lengths[i] = recv_lengths[i] = 0;
-  }
-  valid = false;
-  lengths_valid = false;
-}
-// SyncCache destroy: free everything
-void Parallel::SyncCache::destroy()
-{
-  invalidate();
-  if (combined_src) delete[] combined_src;
-  if (combined_dst) delete[] combined_dst;
-  if (send_lengths) delete[] send_lengths;
-  if (recv_lengths) delete[] recv_lengths;
-  if (send_buf_caps) delete[] send_buf_caps;
-  if (recv_buf_caps) delete[] recv_buf_caps;
-  for (int i = 0; i < cpusize; i++)
-  {
-    if (send_bufs && send_bufs[i]) delete[] send_bufs[i];
-    if (recv_bufs && recv_bufs[i]) delete[] recv_bufs[i];
-  }
-  if (send_bufs) delete[] send_bufs;
-  if (recv_bufs) delete[] recv_bufs;
-  if (reqs) delete[] reqs;
-  if (stats) delete[] stats;
-  combined_src = combined_dst = 0;
-  send_lengths = recv_lengths = 0;
-  send_buf_caps = recv_buf_caps = 0;
-  send_bufs = recv_bufs = 0;
-  reqs = 0; stats = 0;
-  cpusize = 0; max_reqs = 0;
-}
-// transfer_cached: reuse pre-allocated buffers from SyncCache
-void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
-                               MyList<var> *VarList1, MyList<var> *VarList2,
-                               int Symmetry, SyncCache &cache)
-{
-  int myrank;
-  MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize);
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-  int cpusize = cache.cpusize;
-
-  int req_no = 0;
-  int node;
-
-  for (node = 0; node < cpusize; node++)
-  {
-    if (node == myrank)
-    {
-      int length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
-      cache.recv_lengths[node] = length;
-      if (length > 0)
-      {
-        if (length > cache.recv_buf_caps[node])
-        {
-          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
-          cache.recv_bufs[node] = new double[length];
-          cache.recv_buf_caps[node] = length;
-        }
-        data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
-      }
-    }
-    else
-    {
-      // send
-      int slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
-      cache.send_lengths[node] = slength;
-      if (slength > 0)
-      {
-        if (slength > cache.send_buf_caps[node])
-        {
-          if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
-          cache.send_bufs[node] = new double[slength];
-          cache.send_buf_caps[node] = slength;
-        }
-        data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
-        MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
-      }
-      // recv
-      int rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
-      cache.recv_lengths[node] = rlength;
-      if (rlength > 0)
-      {
-        if (rlength > cache.recv_buf_caps[node])
-        {
-          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
-          cache.recv_bufs[node] = new double[rlength];
-          cache.recv_buf_caps[node] = rlength;
-        }
-        MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
-      }
-    }
-  }
-
-  MPI_Waitall(req_no, cache.reqs, cache.stats);
-
-  for (node = 0; node < cpusize; node++)
-    if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0)
-      data_packer(cache.recv_bufs[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
-}
-// Sync_cached: build grid segment lists on first call, reuse on subsequent calls
-void Parallel::Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache)
-{
-  if (!cache.valid)
-  {
-    int cpusize;
-    MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-    cache.cpusize = cpusize;
-
-    // Allocate cache arrays if needed
-    if (!cache.combined_src)
-    {
-      cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
-      cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
-      cache.send_lengths = new int[cpusize];
-      cache.recv_lengths = new int[cpusize];
-      cache.send_bufs = new double *[cpusize];
-      cache.recv_bufs = new double *[cpusize];
-      cache.send_buf_caps = new int[cpusize];
-      cache.recv_buf_caps = new int[cpusize];
-      for (int i = 0; i < cpusize; i++)
-      {
-        cache.send_bufs[i] = cache.recv_bufs[i] = 0;
-        cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
-      }
-      cache.max_reqs = 2 * cpusize;
-      cache.reqs = new MPI_Request[cache.max_reqs];
-      cache.stats = new MPI_Status[cache.max_reqs];
-    }
-
-    for (int node = 0; node < cpusize; node++)
-    {
-      cache.combined_src[node] = cache.combined_dst[node] = 0;
-      cache.send_lengths[node] = cache.recv_lengths[node] = 0;
-    }
-
-    // Build intra-patch segments (same as Sync_merged Phase A)
-    MyList<Patch> *Pp = PatL;
-    while (Pp)
-    {
-      Patch *Pat = Pp->data;
-      MyList<Parallel::gridseg> *dst_ghost = build_ghost_gsl(Pat);
-      for (int node = 0; node < cpusize; node++)
-      {
-        MyList<Parallel::gridseg> *src_owned = build_owned_gsl0(Pat, node);
-        MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
-        build_gstl(src_owned, dst_ghost, &tsrc, &tdst);
-        if (tsrc)
-        {
-          if (cache.combined_src[node])
-            cache.combined_src[node]->catList(tsrc);
-          else
-            cache.combined_src[node] = tsrc;
-        }
-        if (tdst)
-        {
-          if (cache.combined_dst[node])
-            cache.combined_dst[node]->catList(tdst);
-          else
-            cache.combined_dst[node] = tdst;
-        }
-        if (src_owned) src_owned->destroyList();
-      }
-      if (dst_ghost) dst_ghost->destroyList();
-      Pp = Pp->next;
-    }
-
-    // Build inter-patch segments (same as Sync_merged Phase B)
-    MyList<Parallel::gridseg> *dst_buffer = build_buffer_gsl(PatL);
-    for (int node = 0; node < cpusize; node++)
-    {
-      MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatL, node, 5, Symmetry);
-      MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
-      build_gstl(src_owned, dst_buffer, &tsrc, &tdst);
-      if (tsrc)
-      {
-        if (cache.combined_src[node])
-          cache.combined_src[node]->catList(tsrc);
-        else
-          cache.combined_src[node] = tsrc;
-      }
-      if (tdst)
-      {
-        if (cache.combined_dst[node])
-          cache.combined_dst[node]->catList(tdst);
-        else
-          cache.combined_dst[node] = tdst;
-      }
-      if (src_owned) src_owned->destroyList();
-    }
-    if (dst_buffer) dst_buffer->destroyList();
-
-    cache.valid = true;
-  }
-
-  // Use cached lists with buffer-reusing transfer
-  transfer_cached(cache.combined_src, cache.combined_dst, VarList, VarList, Symmetry, cache);
-}
-// Sync_start: pack and post MPI_Isend/Irecv, return immediately
-void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
-                          SyncCache &cache, AsyncSyncState &state)
-{
-  // Ensure cache is built
-  if (!cache.valid)
-  {
-    // Build cache (same logic as Sync_cached)
-    int cpusize;
-    MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-    cache.cpusize = cpusize;
-
-    if (!cache.combined_src)
-    {
-      cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
-      cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
-      cache.send_lengths = new int[cpusize];
-      cache.recv_lengths = new int[cpusize];
-      cache.send_bufs = new double *[cpusize];
-      cache.recv_bufs = new double *[cpusize];
-      cache.send_buf_caps = new int[cpusize];
-      cache.recv_buf_caps = new int[cpusize];
-      for (int i = 0; i < cpusize; i++)
-      {
-        cache.send_bufs[i] = cache.recv_bufs[i] = 0;
-        cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
-      }
-      cache.max_reqs = 2 * cpusize;
-      cache.reqs = new MPI_Request[cache.max_reqs];
-      cache.stats = new MPI_Status[cache.max_reqs];
-    }
-
-    for (int node = 0; node < cpusize; node++)
-    {
-      cache.combined_src[node] = cache.combined_dst[node] = 0;
-      cache.send_lengths[node] = cache.recv_lengths[node] = 0;
-    }
-
-    MyList<Patch> *Pp = PatL;
-    while (Pp)
-    {
-      Patch *Pat = Pp->data;
-      MyList<Parallel::gridseg> *dst_ghost = build_ghost_gsl(Pat);
-      for (int node = 0; node < cpusize; node++)
-      {
-        MyList<Parallel::gridseg> *src_owned = build_owned_gsl0(Pat, node);
-        MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
-        build_gstl(src_owned, dst_ghost, &tsrc, &tdst);
-        if (tsrc)
-        {
-          if (cache.combined_src[node])
-            cache.combined_src[node]->catList(tsrc);
-          else
-            cache.combined_src[node] = tsrc;
-        }
-        if (tdst)
-        {
-          if (cache.combined_dst[node])
-            cache.combined_dst[node]->catList(tdst);
-          else
-            cache.combined_dst[node] = tdst;
-        }
-        if (src_owned) src_owned->destroyList();
-      }
-      if (dst_ghost) dst_ghost->destroyList();
-      Pp = Pp->next;
-    }
-
-    MyList<Parallel::gridseg> *dst_buffer = build_buffer_gsl(PatL);
-    for (int node = 0; node < cpusize; node++)
-    {
-      MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatL, node, 5, Symmetry);
-      MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
-      build_gstl(src_owned, dst_buffer, &tsrc, &tdst);
-      if (tsrc)
-      {
-        if (cache.combined_src[node])
-          cache.combined_src[node]->catList(tsrc);
-        else
-          cache.combined_src[node] = tsrc;
-      }
-      if (tdst)
-      {
-        if (cache.combined_dst[node])
-          cache.combined_dst[node]->catList(tdst);
-        else
-          cache.combined_dst[node] = tdst;
-      }
-      if (src_owned) src_owned->destroyList();
-    }
-    if (dst_buffer) dst_buffer->destroyList();
-    cache.valid = true;
-  }
-
-  // Now pack and post async MPI operations
-  int myrank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-  int cpusize = cache.cpusize;
-  state.req_no = 0;
-  state.active = true;
-
-  MyList<Parallel::gridseg> **src = cache.combined_src;
-  MyList<Parallel::gridseg> **dst = cache.combined_dst;
-
-  for (int node = 0; node < cpusize; node++)
-  {
-    if (node == myrank)
-    {
-      int length;
-      if (!cache.lengths_valid) {
-        length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
-        cache.recv_lengths[node] = length;
-      } else {
-        length = cache.recv_lengths[node];
-      }
-      if (length > 0)
-      {
-        if (length > cache.recv_buf_caps[node])
-        {
-          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
-          cache.recv_bufs[node] = new double[length];
-          cache.recv_buf_caps[node] = length;
-        }
-        data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
-      }
-    }
-    else
-    {
-      int slength;
-      if (!cache.lengths_valid) {
-        slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
-        cache.send_lengths[node] = slength;
-      } else {
-        slength = cache.send_lengths[node];
-      }
-      if (slength > 0)
-      {
-        if (slength > cache.send_buf_caps[node])
-        {
-          if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
-          cache.send_bufs[node] = new double[slength];
-          cache.send_buf_caps[node] = slength;
-        }
-        data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
-        MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
-      }
-      int rlength;
-      if (!cache.lengths_valid) {
-        rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList, VarList, Symmetry);
-        cache.recv_lengths[node] = rlength;
-      } else {
-        rlength = cache.recv_lengths[node];
-      }
-      if (rlength > 0)
-      {
-        if (rlength > cache.recv_buf_caps[node])
-        {
-          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
-          cache.recv_bufs[node] = new double[rlength];
-          cache.recv_buf_caps[node] = rlength;
-        }
-        MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
-      }
-    }
-  }
-  cache.lengths_valid = true;
-}
-// Sync_finish: wait for async MPI operations and unpack
-void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state,
-                           MyList<var> *VarList, int Symmetry)
-{
-  if (!state.active)
-    return;
-
-  MPI_Waitall(state.req_no, cache.reqs, cache.stats);
-
-  int cpusize = cache.cpusize;
-  MyList<Parallel::gridseg> **src = cache.combined_src;
-  MyList<Parallel::gridseg> **dst = cache.combined_dst;
-
-  for (int node = 0; node < cpusize; node++)
-    if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0)
-      data_packer(cache.recv_bufs[node], src[node], dst[node], node, UNPACK, VarList, VarList, Symmetry);
-
-  state.active = false;
-}
-// collect buffer grid segments or blocks for the periodic boundary condition of given patch
-// ---------------------------------------------------
-// |con |                                       |con |
-// |ner |                PhysBD                 |ner |
-// |-------------------------------------------------|
-// |    |                                       |    |
-// |Phy |                                       |Phy |
-// |sBD |                                       |BD  |
-// |    |                                       |    |
-// |    |                                       |    |
-// |    |                                       |    |
-// |-------------------------------------------------|
-// |con |               PhysBD                  |con |
-// |ner |                                       |ner |
-// ---------------------------------------------------
-// first order derivetive does not need conner information,
-// but second order derivative needs!
-/* the following code does not include conner part
-MyList<Parallel::gridseg> *Parallel::build_PhysBD_gsl(Patch *Pat)
-{
-       MyList<Parallel::gridseg> *cgsl,*gsc,*gsb=0,*p;
-       gsc = build_ghost_gsl(Pat);
-       for(int i=0;i<dim;i++)
-       {
-         double DH = gsc->data->Bg->getdX(i);
-// lower boundary
-         if(gsb)
-   {
-          p = new MyList<Parallel::gridseg>;
-          p->data = new Parallel::gridseg;
-          p->next=gsb;
-    gsb=p;
-   }
-   else
-   {
-          gsb = new MyList<Parallel::gridseg>;
-          gsb->data = new Parallel::gridseg;
-          gsb->next=0;
-   }
-         for(int j=0;j<dim;j++)
-   {
-           if(i == j)
-     {
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-             gsb->data->llb[i] = Pat->bbox[i]-ghost_width*DH;
-             gsb->data->uub[i] = Pat->bbox[i]-DH;
-#else
-#ifdef Cell
-             gsb->data->llb[i] = Pat->bbox[i]-ghost_width*DH;
-             gsb->data->uub[i] = Pat->bbox[i];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-             gsb->data->shape[i] = ghost_width;
-     }
-     else
-     {
-             gsb->data->llb[j] = Pat->bbox[j];
-             gsb->data->uub[j] = Pat->bbox[j+dim];
-             gsb->data->shape[j] = Pat->shape[j];
-     }
-   }
-   gsb->data->Bg = 0;  //vertual grid segment
-// upper boundary
-         p = new MyList<Parallel::gridseg>;
-         p->data = new Parallel::gridseg;
-         p->next=gsb;
-   gsb=p;
-         for(int j=0;j<dim;j++)
-   {
-           if(i == j)
-     {
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-             gsb->data->llb[i] = Pat->bbox[i+dim]+DH;
-             gsb->data->uub[i] = Pat->bbox[i+dim]+ghost_width*DH;
-#else
-#ifdef Cell
-             gsb->data->llb[i] = Pat->bbox[i+dim];
-             gsb->data->uub[i] = Pat->bbox[i+dim]+ghost_width*DH;
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-             gsb->data->shape[i] = ghost_width;
-     }
-     else
-     {
-             gsb->data->llb[j] = Pat->bbox[j];
-             gsb->data->uub[j] = Pat->bbox[j+dim];
-             gsb->data->shape[j] = Pat->shape[j];
-     }
-   }
-   gsb->data->Bg = 0;  //vertual grid segment
-       }
-
-       cgsl = gsl_and(gsc,gsb);
-
-       gsc->destroyList();
-       gsb->destroyList();
-
-       return cgsl;
-}
-*/
-// the following code includes conner part
-MyList<Parallel::gridseg> *Parallel::build_PhysBD_gsl(Patch *Pat)
-{
-  MyList<Parallel::gridseg> *cgsl, *gsc, *gsb = 0, *p;
-
-  gsc = build_complete_gsl(Pat);
-
-  gsb = new MyList<Parallel::gridseg>;
-  gsb->data = new Parallel::gridseg;
-  gsb->next = 0;
-  gsb->data->Bg = 0;
-
-  for (int j = 0; j < dim; j++)
-  {
-    gsb->data->llb[j] = Pat->bbox[j];
-    gsb->data->uub[j] = Pat->bbox[j + dim];
-    gsb->data->shape[j] = Pat->shape[j];
-  }
-
-  p = gsl_subtract(gsc, gsb);
-
-  gsc->destroyList();
-  gsb->destroyList();
-
-  cgsl = divide_gsl(p, Pat);
-
-  p->destroyList();
-
-  return cgsl;
-}
-MyList<Parallel::gridseg> *Parallel::divide_gsl(MyList<Parallel::gridseg> *p, Patch *Pat)
-{
-  MyList<Parallel::gridseg> *cgsl = 0;
-  while (p)
-  {
-    if (cgsl)
-      cgsl->catList(divide_gs(p, Pat));
-    else
-      cgsl = divide_gs(p, Pat);
-    p = p->next;
-  }
-
-  return cgsl;
-}
-// divide the gs into pices which locate either totally outside of the given Patch coordinate range
-// or totally inside it. It's usefull for periodic boundary condition
-MyList<Parallel::gridseg> *Parallel::divide_gs(MyList<Parallel::gridseg> *p, Patch *Pat)
-{
-  double DH[dim];
-  for (int i = 0; i < dim; i++)
-  {
-    DH[i] = p->data->Bg->getdX(i);
-  }
-
-  int num[dim];
-  double llb[3][dim], uub[3][dim];
-  for (int i = 0; i < dim; i++)
-  {
-    if (p->data->llb[i] < Pat->bbox[i] - DH[i] / 2)
-    {
-      if (p->data->uub[i] > Pat->bbox[i + dim] + DH[i] / 2)
-      {
-        num[i] = 3;
-        llb[0][i] = p->data->llb[i];
-        llb[1][i] = Pat->bbox[i];
-        uub[1][i] = Pat->bbox[i + dim];
-        uub[2][i] = p->data->uub[i];
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        uub[0][i] = Pat->bbox[i] - DH[i];
-        llb[2][i] = Pat->bbox[i + dim] + DH[i];
-#else
-#ifdef Cell
-        uub[0][i] = Pat->bbox[i];
-        llb[2][i] = Pat->bbox[i + dim];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      }
-      else if (p->data->uub[i] > Pat->bbox[i] + DH[i] / 2)
-      {
-        num[i] = 2;
-        llb[0][i] = p->data->llb[i];
-        llb[1][i] = Pat->bbox[i];
-        uub[1][i] = p->data->uub[i];
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        uub[0][i] = Pat->bbox[i] - DH[i];
-#else
-#ifdef Cell
-        uub[0][i] = Pat->bbox[i];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      }
-      else
-      {
-        num[i] = 1;
-        llb[0][i] = p->data->llb[i];
-        uub[0][i] = p->data->uub[i];
-      }
-    }
-    else if (p->data->llb[i] < Pat->bbox[i + dim] - DH[i] / 2)
-    {
-      if (p->data->uub[i] > Pat->bbox[i + dim] + DH[i] / 2)
-      {
-        num[i] = 2;
-        llb[0][i] = p->data->llb[i];
-        uub[0][i] = Pat->bbox[i + dim];
-        uub[1][i] = p->data->uub[i];
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        llb[1][i] = Pat->bbox[i + dim] + DH[i];
-#else
-#ifdef Cell
-        llb[1][i] = Pat->bbox[i + dim];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      }
-      else
-      {
-        num[i] = 1;
-        llb[0][i] = p->data->llb[i];
-        uub[0][i] = p->data->uub[i];
-      }
-    }
-    else
-    {
-      num[i] = 1;
-      llb[0][i] = p->data->llb[i];
-      uub[0][i] = p->data->uub[i];
-    }
-  }
-  MyList<Parallel::gridseg> *cgsl = 0, *gg;
-  int NN = 1;
-  for (int i = 0; i < dim; i++)
-    NN = NN * num[i];
-
-  for (int i = 0; i < NN; i++)
-  {
-    int ind[dim];
-    getarrayindex(dim, num, ind, i);
-    gg = clone_gsl(p, true);
-    for (int k = 0; k < dim; k++)
-    {
-      gg->data->llb[k] = llb[ind[k]][k];
-      gg->data->uub[k] = uub[ind[k]][k];
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-      gg->data->shape[k] = int((uub[ind[k]][k] - llb[ind[k]][k]) / DH[k] + 0.4) + 1;
-#else
-#ifdef Cell
-      gg->data->shape[k] = int((uub[ind[k]][k] - llb[ind[k]][k]) / DH[k] + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-    }
-
-    if (cgsl)
-      cgsl->catList(gg);
-    else
-      cgsl = gg;
-  }
-
-  return cgsl;
-}
-// after mod operation, according to overlape to determine real grid segments
-void Parallel::build_PhysBD_gstl(Patch *Pat, MyList<Parallel::gridseg> *srci, MyList<Parallel::gridseg> *dsti,
-                                 MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst)
-{
-  *out_src = *out_dst = 0;
-
-  if (!srci || !dsti)
-    return;
-
-  MyList<Parallel::gridseg> *s, *d;
-  MyList<Parallel::gridseg> *s2, *d2;
-
-  double llb[dim], uub[dim];
-
-  s = srci;
-  while (s)
-  {
-    Parallel::gridseg *sd = s->data;
-    d = dsti;
-    while (d)
-    {
-      Parallel::gridseg *dd = d->data;
-      bool flag = true;
-      for (int i = 0; i < dim; i++)
-      {
-        double SH = sd->Bg->getdX(i), DH = dd->Bg->getdX(i);
-        if (!feq(SH, DH, SH / 2))
-        {
-          cout << "Parallel::build_PhysBD_gstl meets different grid space SH = " << SH << ", DH = " << DH << endl;
-          MPI_Abort(MPI_COMM_WORLD, 1);
-        }
-        // we assume dst and src locate on the same Patch
-        if (dd->llb[i] < Pat->bbox[i])
-          llb[i] = Mymax(sd->llb[i], dd->llb[i] + Pat->bbox[dim + i] - Pat->bbox[i]);
-        else if (dd->llb[i] > Pat->bbox[i + dim])
-          llb[i] = Mymax(sd->llb[i], dd->llb[i] - Pat->bbox[dim + i] + Pat->bbox[i]);
-        else
-          llb[i] = Mymax(sd->llb[i], dd->llb[i]);
-
-        if (dd->uub[i] < Pat->bbox[i])
-          uub[i] = Mymin(sd->uub[i], dd->uub[i] + Pat->bbox[dim + i] - Pat->bbox[i]);
-        else if (dd->uub[i] > Pat->bbox[dim + i])
-          uub[i] = Mymin(sd->uub[i], dd->uub[i] - Pat->bbox[dim + i] + Pat->bbox[i]);
-        else
-          uub[i] = Mymin(sd->uub[i], dd->uub[i]);
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        if (llb[i] > uub[i] + SH / 2)
-        {
-          flag = false;
-          break;
-        } // special for isolated point
-#else
-#ifdef Cell
-        if (llb[i] > uub[i])
-        {
-          flag = false;
-          break;
-        }
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      }
-
-      if (flag)
-      {
-        if (!(*out_src))
-        {
-          *out_src = s2 = new MyList<Parallel::gridseg>;
-          *out_dst = d2 = new MyList<Parallel::gridseg>;
-          s2->data = new Parallel::gridseg;
-          d2->data = new Parallel::gridseg;
-        }
-        else
-        {
-          s2->next = new MyList<Parallel::gridseg>;
-          s2 = s2->next;
-          d2->next = new MyList<Parallel::gridseg>;
-          d2 = d2->next;
-          s2->data = new Parallel::gridseg;
-          d2->data = new Parallel::gridseg;
-        }
-
-        for (int i = 0; i < dim; i++)
-        {
-          double SH = sd->Bg->getdX(i), DH = dd->Bg->getdX(i);
-          s2->data->llb[i] = llb[i];
-          s2->data->uub[i] = uub[i];
-
-          if (dd->llb[i] < Pat->bbox[i])
-            d2->data->llb[i] = llb[i] - Pat->bbox[dim + i] + Pat->bbox[i];
-          else if (dd->llb[i] > Pat->bbox[i + dim])
-            d2->data->llb[i] = llb[i] + Pat->bbox[dim + i] - Pat->bbox[i];
-          else
-            d2->data->llb[i] = llb[i];
-
-          if (dd->uub[i] < Pat->bbox[i])
-            d2->data->uub[i] = uub[i] - Pat->bbox[dim + i] + Pat->bbox[i];
-          else if (dd->uub[i] > Pat->bbox[dim + i])
-            d2->data->uub[i] = uub[i] + Pat->bbox[dim + i] - Pat->bbox[i];
-          else
-            d2->data->uub[i] = uub[i];
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-          s2->data->shape[i] = int((s2->data->uub[i] - s2->data->llb[i]) / SH + 0.4) + 1;
-          d2->data->shape[i] = int((d2->data->uub[i] - d2->data->llb[i]) / DH + 0.4) + 1;
-#else
-#ifdef Cell
-          s2->data->shape[i] = int((s2->data->uub[i] - s2->data->llb[i]) / SH + 0.4);
-          d2->data->shape[i] = int((d2->data->uub[i] - d2->data->llb[i]) / DH + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-        }
-        s2->data->Bg = sd->Bg;
-        s2->next = 0;
-        d2->data->Bg = dd->Bg;
-        d2->next = 0;
-      }
-      d = d->next;
-    }
-    s = s->next;
-  }
-}
-void Parallel::PeriodicBD(Patch *Pat, MyList<var> *VarList, int Symmetry)
-{
-  int cpusize;
-  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-
-  MyList<Parallel::gridseg> *dst;
-  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
-  src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
-
-  dst = build_PhysBD_gsl(Pat);
-  for (int node = 0; node < cpusize; node++)
-  {
-    src[node] = build_owned_gsl0(Pat, node);                                          // for the part without ghost points and do not extend
-    build_PhysBD_gstl(Pat, src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
-  }
-
-  transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
-
-  if (dst)
-    dst->destroyList();
-  for (int node = 0; node < cpusize; node++)
-  {
-    if (src[node])
-      src[node]->destroyList();
-    if (transfer_src[node])
-      transfer_src[node]->destroyList();
-    if (transfer_dst[node])
-      transfer_dst[node]->destroyList();
-  }
-
-  delete[] src;
-  delete[] transfer_src;
-  delete[] transfer_dst;
-}
-double Parallel::L2Norm(Patch *Pat, var *vf)
-{
-  int myrank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
-  double tvf, dtvf = 0;
-  int BDW = ghost_width;
-
-  MyList<Block> *BP = Pat->blb;
-  while (BP)
-  {
-    Block *cg = BP->data;
-    if (myrank == cg->rank)
-    {
-      f_l2normhelper(cg->shape, cg->X[0], cg->X[1], cg->X[2],
-                     Pat->bbox[0], Pat->bbox[1], Pat->bbox[2],
-                     Pat->bbox[3], Pat->bbox[4], Pat->bbox[5],
-                     cg->fgfs[vf->sgfn], tvf, BDW);
-      dtvf += tvf;
-    }
-    if (BP == Pat->ble)
-      break;
-    BP = BP->next;
-  }
-
-  MPI_Allreduce(&dtvf, &tvf, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-
-  tvf = sqrt(tvf);
-
-  return tvf;
-}
-double Parallel::L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here)
-{
-  int myrank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
-  double tvf, dtvf = 0;
-  int BDW = ghost_width;
-
-  MyList<Block> *BP = Pat->blb;
-  while (BP)
-  {
-    Block *cg = BP->data;
-    if (myrank == cg->rank)
-    {
-      f_l2normhelper(cg->shape, cg->X[0], cg->X[1], cg->X[2],
-                     Pat->bbox[0], Pat->bbox[1], Pat->bbox[2],
-                     Pat->bbox[3], Pat->bbox[4], Pat->bbox[5],
-                     cg->fgfs[vf->sgfn], tvf, BDW);
-      dtvf += tvf;
-    }
-    if (BP == Pat->ble)
-      break;
-    BP = BP->next;
-  }
-
-  MPI_Allreduce(&dtvf, &tvf, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
-
-  tvf = sqrt(tvf);
-
-  return tvf;
-}
-void Parallel::checkgsl(MyList<Parallel::gridseg> *pp, bool first_only)
-{
-  int myrank = 0;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-  if (myrank == 0)
-  {
-    if (!pp)
-      cout << " Parallel::checkgsl meets empty gsl" << endl;
-    while (pp)
-    {
-      if (pp->data->Bg)
-        cout << " on node#" << pp->data->Bg->rank << endl;
-      else
-        cout << " virtual grid segment" << endl;
-      cout << " shape: (";
-      for (int i = 0; i < dim; i++)
-      {
-        if (i < dim - 1)
-          cout << pp->data->shape[i] << ",";
-        else
-          cout << pp->data->shape[i] << ")" << endl;
-      }
-      cout << " range: (";
-      for (int i = 0; i < dim; i++)
-      {
-        if (i < dim - 1)
-          cout << pp->data->llb[i] << ":" << pp->data->uub[i] << ",";
-        else
-          cout << pp->data->llb[i] << ":" << pp->data->uub[i] << ")" << endl;
-      }
-      if (first_only)
-        return;
-      pp = pp->next;
-    }
-  }
-}
-void Parallel::checkvarl(MyList<var> *pp, bool first_only)
-{
-  int myrank = 0;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-  if (myrank == 0)
-  {
-    while (pp)
-    {
-      cout << "name: " << pp->data->name << endl;
-      cout << "SoA = (" << pp->data->SoA[0] << "," << pp->data->SoA[1] << "," << pp->data->SoA[2] << ")" << endl;
-      cout << "sgfn = " << pp->data->sgfn << endl;
-      if (first_only)
-        return;
-      pp = pp->next;
-    }
-  }
-}
-void Parallel::prepare_inter_time_level(MyList<Patch> *PatL,
-                                        MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
-                                        MyList<var> *VarList3 /* target (t+a*dt) */, int tindex)
-{
-  while (PatL)
-  {
-    prepare_inter_time_level(PatL->data, VarList1, VarList2, VarList3, tindex);
-    PatL = PatL->next;
-  }
-}
-void Parallel::prepare_inter_time_level(Patch *Pat,
-                                        MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
-                                        MyList<var> *VarList3 /* target (t+a*dt) */, int tindex)
-{
-  int myrank = 0;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
-  MyList<var> *varl1;
-  MyList<var> *varl2;
-  MyList<var> *varl3;
-
-  MyList<Block> *BP = Pat->blb;
-  while (BP)
-  {
-    Block *cg = BP->data;
-    if (myrank == cg->rank)
-    {
-      varl1 = VarList1;
-      varl2 = VarList2;
-      varl3 = VarList3;
-      while (varl1)
-      {
-        if (tindex == 0)
-          f_average(cg->shape, cg->fgfs[varl1->data->sgfn], cg->fgfs[varl2->data->sgfn], cg->fgfs[varl3->data->sgfn]);
-        else if (tindex == 1)
-          f_average3(cg->shape, cg->fgfs[varl1->data->sgfn], cg->fgfs[varl2->data->sgfn], cg->fgfs[varl3->data->sgfn]);
-        else if (tindex == -1)
-          // just change data order to use average3
-          f_average3(cg->shape, cg->fgfs[varl2->data->sgfn], cg->fgfs[varl1->data->sgfn], cg->fgfs[varl3->data->sgfn]);
-        else
-        {
-          cout << "error tindex in Parallel::prepare_inter_time_level" << endl;
-          MPI_Abort(MPI_COMM_WORLD, 1);
-        }
-        varl1 = varl1->next;
-        varl2 = varl2->next;
-        varl3 = varl3->next;
-      }
-    }
-    if (BP == Pat->ble)
-      break;
-    BP = BP->next;
-  }
-}
-void Parallel::prepare_inter_time_level(MyList<Patch> *PatL,
-                                        MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
-                                        MyList<var> *VarList3 /* source (t-dt) */, MyList<var> *VarList4 /* target (t+a*dt) */, int tindex)
-{
-  while (PatL)
-  {
-    prepare_inter_time_level(PatL->data, VarList1, VarList2, VarList3, VarList4, tindex);
-    PatL = PatL->next;
-  }
-}
-void Parallel::prepare_inter_time_level(Patch *Pat,
-                                        MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
-                                        MyList<var> *VarList3 /* source (t-dt) */, MyList<var> *VarList4 /* target (t+a*dt) */, int tindex)
-{
-  int myrank = 0;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
-  MyList<var> *varl1;
-  MyList<var> *varl2;
-  MyList<var> *varl3;
-  MyList<var> *varl4;
-
-  MyList<Block> *BP = Pat->blb;
-  while (BP)
-  {
-    Block *cg = BP->data;
-    if (myrank == cg->rank)
-    {
-      varl1 = VarList1;
-      varl2 = VarList2;
-      varl3 = VarList3;
-      varl4 = VarList4;
-      while (varl1)
-      {
-        if (tindex == 0)
-          f_average2(cg->shape, cg->fgfs[varl1->data->sgfn], cg->fgfs[varl2->data->sgfn],
-                     cg->fgfs[varl3->data->sgfn], cg->fgfs[varl4->data->sgfn]);
-        else if (tindex == 1)
-          f_average2p(cg->shape, cg->fgfs[varl1->data->sgfn], cg->fgfs[varl2->data->sgfn],
-                      cg->fgfs[varl3->data->sgfn], cg->fgfs[varl4->data->sgfn]);
-        else if (tindex == -1)
-          f_average2m(cg->shape, cg->fgfs[varl1->data->sgfn], cg->fgfs[varl2->data->sgfn],
-                      cg->fgfs[varl3->data->sgfn], cg->fgfs[varl4->data->sgfn]);
-        else
-        {
-          cout << "error tindex in long cgh::prepare_inter_time_level" << endl;
-          MPI_Abort(MPI_COMM_WORLD, 1);
-        }
-        varl1 = varl1->next;
-        varl2 = varl2->next;
-        varl3 = varl3->next;
-        varl4 = varl4->next;
-      }
-    }
-    if (BP == Pat->ble)
-      break;
-    BP = BP->next;
-  }
-}
-void Parallel::Prolong(Patch *Patc, Patch *Patf,
-                       MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
-                       int Symmetry)
-{
-  if (Patc->lev >= Patf->lev)
-  {
-    cout << "Parallel::Prolong: meet requst of Prolong from lev#" << Patc->lev << " to lev#" << Patf->lev << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-  int cpusize;
-  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-
-  MyList<Parallel::gridseg> *dst;
-  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
-  src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
-
-  dst = build_complete_gsl(Patf); // including ghost
-  for (int node = 0; node < cpusize; node++)
-  {
-    src[node] = build_owned_gsl4(Patc, node, Symmetry);                   // - buffer - ghost - BD ghost
-    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
-  }
-
-  transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
-
-  if (dst)
-    dst->destroyList();
-  for (int node = 0; node < cpusize; node++)
-  {
-    if (src[node])
-      src[node]->destroyList();
-    if (transfer_src[node])
-      transfer_src[node]->destroyList();
-    if (transfer_dst[node])
-      transfer_dst[node]->destroyList();
-  }
-
-  delete[] src;
-  delete[] transfer_src;
-  delete[] transfer_dst;
-}
-void Parallel::Restrict(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
-                        MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
-                        int Symmetry)
-{
-  if (PatcL->data->lev >= PatfL->data->lev)
-  {
-    cout << "Parallel::Restrict: meet requst of Restrict from lev#" << PatfL->data->lev << " to lev#" << PatcL->data->lev << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-  int cpusize;
-  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-
-  MyList<Parallel::gridseg> *dst;
-  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
-  src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
-
-  dst = build_complete_gsl(PatcL); // including ghost
-  for (int node = 0; node < cpusize; node++)
-  {
-#if 0
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif    
-      src[node]=build_owned_gsl(PatfL,node,2,Symmetry);   // - buffer - ghost
-#else
-#ifdef Cell
-      src[node]=build_owned_gsl(PatfL,node,4,Symmetry); // - buffer - ghost - BD ghost
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-#else
-    // it seems bam always use this
-    src[node] = build_owned_gsl(PatfL, node, 2, Symmetry); // - buffer - ghost
-#endif
-    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
-  }
-
-  transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
-
-  if (dst)
-    dst->destroyList();
-  for (int node = 0; node < cpusize; node++)
-  {
-    if (src[node])
-      src[node]->destroyList();
-    if (transfer_src[node])
-      transfer_src[node]->destroyList();
-    if (transfer_dst[node])
-      transfer_dst[node]->destroyList();
-  }
-
-  delete[] src;
-  delete[] transfer_src;
-  delete[] transfer_dst;
-}
-void Parallel::Restrict_after(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
-                              MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
-                              int Symmetry)
-{
-  if (PatcL->data->lev >= PatfL->data->lev)
-  {
-    cout << "Parallel::Restrict: meet requst of Restrict from lev#" << PatfL->data->lev << " to lev#" << PatcL->data->lev << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-  int cpusize;
-  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-
-  MyList<Parallel::gridseg> *dst;
-  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
-  src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
-
-  dst = build_complete_gsl(PatcL); // including ghost
-  for (int node = 0; node < cpusize; node++)
-  {
-    src[node] = build_owned_gsl(PatfL, node, 3, Symmetry); // - ghost - BD ghost
-
-    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
-  }
-
-  transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
-
-  if (dst)
-    dst->destroyList();
-  for (int node = 0; node < cpusize; node++)
-  {
-    if (src[node])
-      src[node]->destroyList();
-    if (transfer_src[node])
-      transfer_src[node]->destroyList();
-    if (transfer_dst[node])
-      transfer_dst[node]->destroyList();
-  }
-
-  delete[] src;
-  delete[] transfer_src;
-  delete[] transfer_dst;
-}
-// for the same time level
-void Parallel::OutBdLow2Hi(Patch *Patc, Patch *Patf,
-                           MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
-                           int Symmetry)
-{
-  if (Patc->lev >= Patf->lev)
-  {
-    cout << "Parallel::OutBdLow2Hi: meet requst of Prolong from lev#" << Patc->lev << " to lev#" << Patf->lev << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-  int cpusize;
-  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-
-  MyList<Parallel::gridseg> *dst;
-  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
-  src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
-
-  dst = build_buffer_gsl(Patf); // buffer region only
-
-  for (int node = 0; node < cpusize; node++)
-  {
-    src[node] = build_owned_gsl4(Patc, node, Symmetry);                   // - buffer - ghost - BD ghost
-    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
-  }
-
-  transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
-
-  if (dst)
-    dst->destroyList();
-  for (int node = 0; node < cpusize; node++)
-  {
-    if (src[node])
-      src[node]->destroyList();
-    if (transfer_src[node])
-      transfer_src[node]->destroyList();
-    if (transfer_dst[node])
-      transfer_dst[node]->destroyList();
-  }
-
-  delete[] src;
-  delete[] transfer_src;
-  delete[] transfer_dst;
-}
-void Parallel::OutBdLow2Hi(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
-                           MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
-                           int Symmetry)
-{
-  MyList<Patch> *Pp, *Ppc;
-  Ppc = PatcL;
-  while (Ppc)
-  {
-    Pp = PatfL;
-    while (Pp)
-    {
-      if (Ppc->data->lev >= Pp->data->lev)
-      {
-        cout << "Parallel::OutBdLow2Hi(list): meet requst of Prolong from lev#" << Ppc->data->lev << " to lev#" << Pp->data->lev << endl;
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-      Pp = Pp->next;
-    }
-    Ppc = Ppc->next;
-  }
-
-  int cpusize;
-  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-
-  MyList<Parallel::gridseg> *dst;
-  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
-  src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
-
-  dst = build_buffer_gsl(PatfL); // buffer region only
-
-  for (int node = 0; node < cpusize; node++)
-  {
-    src[node] = build_owned_gsl(PatcL, node, 4, Symmetry);                // - buffer - ghost - BD ghost
-    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
-  }
-
-  transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
-
-  if (dst)
-    dst->destroyList();
-  for (int node = 0; node < cpusize; node++)
-  {
-    if (src[node])
-      src[node]->destroyList();
-    if (transfer_src[node])
-      transfer_src[node]->destroyList();
-    if (transfer_dst[node])
-      transfer_dst[node]->destroyList();
-  }
-
-  delete[] src;
-  delete[] transfer_src;
-  delete[] transfer_dst;
-}
-// for the same time level
-void Parallel::OutBdLow2Himix(Patch *Patc, Patch *Patf,
-                              MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
-                              int Symmetry)
-{
-  if (Patc->lev >= Patf->lev)
-  {
-    cout << "Parallel::OutBdLow2Himix: meet requst of Prolong from lev#" << Patc->lev << " to lev#" << Patf->lev << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-  int cpusize;
-  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-
-  MyList<Parallel::gridseg> *dst;
-  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
-  src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
-
-  dst = build_buffer_gsl(Patf); // buffer region only
-
-  for (int node = 0; node < cpusize; node++)
-  {
-    src[node] = build_owned_gsl4(Patc, node, Symmetry);                   // - buffer - ghost - BD ghost
-    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
-  }
-
-  transfermix(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
-
-  if (dst)
-    dst->destroyList();
-  for (int node = 0; node < cpusize; node++)
-  {
-    if (src[node])
-      src[node]->destroyList();
-    if (transfer_src[node])
-      transfer_src[node]->destroyList();
-    if (transfer_dst[node])
-      transfer_dst[node]->destroyList();
-  }
-
-  delete[] src;
-  delete[] transfer_src;
-  delete[] transfer_dst;
-
-  // do not need this, we have done after calling of this routine in ProlongRestrict or RestrictProlong
-  //    Sync(Patf,VarList2,Symmetry);  // fine level points may be not enough for interpolation
-}
-void Parallel::OutBdLow2Himix(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
-                              MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
-                              int Symmetry)
-{
-  MyList<Patch> *Pp, *Ppc;
-  Ppc = PatcL;
-  while (Ppc)
-  {
-    Pp = PatfL;
-    while (Pp)
-    {
-      if (Ppc->data->lev >= Pp->data->lev)
-      {
-        cout << "Parallel::OutBdLow2Himix(list): meet requst of Prolong from lev#" << Ppc->data->lev << " to lev#" << Pp->data->lev << endl;
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-      Pp = Pp->next;
-    }
-    Ppc = Ppc->next;
-  }
-
-  int cpusize;
-  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-
-  MyList<Parallel::gridseg> *dst;
-  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
-  src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
-
-  dst = build_buffer_gsl(PatfL); // buffer region only
-
-  for (int node = 0; node < cpusize; node++)
-  {
-    src[node] = build_owned_gsl(PatcL, node, 4, Symmetry);                // - buffer - ghost - BD ghost
-    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
-  }
-
-  transfermix(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
-
-  if (dst)
-    dst->destroyList();
-  for (int node = 0; node < cpusize; node++)
-  {
-    if (src[node])
-      src[node]->destroyList();
-    if (transfer_src[node])
-      transfer_src[node]->destroyList();
-    if (transfer_dst[node])
-      transfer_dst[node]->destroyList();
-  }
-
-  delete[] src;
-  delete[] transfer_src;
-  delete[] transfer_dst;
-}
-
-// Restrict_cached: cache grid segment lists, reuse buffers via transfer_cached
-void Parallel::Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
-                               MyList<var> *VarList1, MyList<var> *VarList2,
-                               int Symmetry, SyncCache &cache)
-{
-  if (!cache.valid)
-  {
-    int cpusize;
-    MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-    cache.cpusize = cpusize;
-
-    if (!cache.combined_src)
-    {
-      cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
-      cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
-      cache.send_lengths = new int[cpusize];
-      cache.recv_lengths = new int[cpusize];
-      cache.send_bufs = new double *[cpusize];
-      cache.recv_bufs = new double *[cpusize];
-      cache.send_buf_caps = new int[cpusize];
-      cache.recv_buf_caps = new int[cpusize];
-      for (int i = 0; i < cpusize; i++)
-      {
-        cache.send_bufs[i] = cache.recv_bufs[i] = 0;
-        cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
-      }
-      cache.max_reqs = 2 * cpusize;
-      cache.reqs = new MPI_Request[cache.max_reqs];
-      cache.stats = new MPI_Status[cache.max_reqs];
-    }
-
-    MyList<Parallel::gridseg> *dst = build_complete_gsl(PatcL);
-    for (int node = 0; node < cpusize; node++)
-    {
-      MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatfL, node, 2, Symmetry);
-      build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]);
-      if (src_owned) src_owned->destroyList();
-    }
-    if (dst) dst->destroyList();
-
-    cache.valid = true;
-  }
-
-  transfer_cached(cache.combined_src, cache.combined_dst, VarList1, VarList2, Symmetry, cache);
-}
-
-// OutBdLow2Hi_cached: cache grid segment lists, reuse buffers via transfer_cached
-void Parallel::OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
-                                  MyList<var> *VarList1, MyList<var> *VarList2,
-                                  int Symmetry, SyncCache &cache)
-{
-  if (!cache.valid)
-  {
-    int cpusize;
-    MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-    cache.cpusize = cpusize;
-
-    if (!cache.combined_src)
-    {
-      cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
-      cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
-      cache.send_lengths = new int[cpusize];
-      cache.recv_lengths = new int[cpusize];
-      cache.send_bufs = new double *[cpusize];
-      cache.recv_bufs = new double *[cpusize];
-      cache.send_buf_caps = new int[cpusize];
-      cache.recv_buf_caps = new int[cpusize];
-      for (int i = 0; i < cpusize; i++)
-      {
-        cache.send_bufs[i] = cache.recv_bufs[i] = 0;
-        cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
-      }
-      cache.max_reqs = 2 * cpusize;
-      cache.reqs = new MPI_Request[cache.max_reqs];
-      cache.stats = new MPI_Status[cache.max_reqs];
-    }
-
-    MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
-    for (int node = 0; node < cpusize; node++)
-    {
-      MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatcL, node, 4, Symmetry);
-      build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]);
-      if (src_owned) src_owned->destroyList();
-    }
-    if (dst) dst->destroyList();
-
-    cache.valid = true;
-  }
-
-  transfer_cached(cache.combined_src, cache.combined_dst, VarList1, VarList2, Symmetry, cache);
-}
-
-// OutBdLow2Himix_cached: same as OutBdLow2Hi_cached but uses transfermix for unpacking
-void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
-                                     MyList<var> *VarList1, MyList<var> *VarList2,
-                                     int Symmetry, SyncCache &cache)
-{
-  if (!cache.valid)
-  {
-    int cpusize;
-    MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-    cache.cpusize = cpusize;
-
-    if (!cache.combined_src)
-    {
-      cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
-      cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
-      cache.send_lengths = new int[cpusize];
-      cache.recv_lengths = new int[cpusize];
-      cache.send_bufs = new double *[cpusize];
-      cache.recv_bufs = new double *[cpusize];
-      cache.send_buf_caps = new int[cpusize];
-      cache.recv_buf_caps = new int[cpusize];
-      for (int i = 0; i < cpusize; i++)
-      {
-        cache.send_bufs[i] = cache.recv_bufs[i] = 0;
-        cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
-      }
-      cache.max_reqs = 2 * cpusize;
-      cache.reqs = new MPI_Request[cache.max_reqs];
-      cache.stats = new MPI_Status[cache.max_reqs];
-    }
-
-    MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
-    for (int node = 0; node < cpusize; node++)
-    {
-      MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatcL, node, 4, Symmetry);
-      build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]);
-      if (src_owned) src_owned->destroyList();
-    }
-    if (dst) dst->destroyList();
-
-    cache.valid = true;
-  }
-
-  // Use transfermix instead of transfer for mix-mode interpolation
-  int myrank;
-  MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize);
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-  int cpusize = cache.cpusize;
-
-  int req_no = 0;
-  for (int node = 0; node < cpusize; node++)
-  {
-    if (node == myrank)
-    {
-      int length = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
-      cache.recv_lengths[node] = length;
-      if (length > 0)
-      {
-        if (length > cache.recv_buf_caps[node])
-        {
-          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
-          cache.recv_bufs[node] = new double[length];
-          cache.recv_buf_caps[node] = length;
-        }
-        data_packermix(cache.recv_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
-      }
-    }
-    else
-    {
-      int slength = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
-      cache.send_lengths[node] = slength;
-      if (slength > 0)
-      {
-        if (slength > cache.send_buf_caps[node])
-        {
-          if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
-          cache.send_bufs[node] = new double[slength];
-          cache.send_buf_caps[node] = slength;
-        }
-        data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
-        MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
-      }
-      int rlength = data_packermix(0, cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
-      cache.recv_lengths[node] = rlength;
-      if (rlength > 0)
-      {
-        if (rlength > cache.recv_buf_caps[node])
-        {
-          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
-          cache.recv_bufs[node] = new double[rlength];
-          cache.recv_buf_caps[node] = rlength;
-        }
-        MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
-      }
-    }
-  }
-
-  MPI_Waitall(req_no, cache.reqs, cache.stats);
-
-  for (int node = 0; node < cpusize; node++)
-    if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0)
-      data_packermix(cache.recv_bufs[node], cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
-}
-
-// collect all buffer grid segments or blocks for given patch
-MyList<Parallel::gridseg> *Parallel::build_buffer_gsl(Patch *Pat)
-{
-  MyList<Parallel::gridseg> *cgsl, *gsc, *gsb;
-
-  gsc = build_complete_gsl(Pat); // including ghost
-
-  gsb = new MyList<Parallel::gridseg>;
-  gsb->data = new Parallel::gridseg;
-
-  for (int i = 0; i < dim; i++)
-  {
-    double DH = Pat->blb->data->getdX(i);
-    gsb->data->uub[i] = Pat->bbox[dim + i] - Pat->uui[i] * DH;
-    gsb->data->llb[i] = Pat->bbox[i] + Pat->lli[i] * DH;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-    gsb->data->shape[i] = int((gsb->data->uub[i] - gsb->data->llb[i]) / DH + 0.4) + 1;
-#else
-#ifdef Cell
-    gsb->data->shape[i] = int((gsb->data->uub[i] - gsb->data->llb[i]) / DH + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-  }
-  gsb->data->Bg = 0;
-  gsb->next = 0;
-
-  cgsl = gsl_subtract(gsc, gsb);
-
-  gsc->destroyList();
-  gsb->destroyList();
-
-  //  set illb and iuub
-  gsb = cgsl;
-  while (gsb)
-  {
-    for (int i = 0; i < dim; i++)
-    {
-      double DH = Pat->blb->data->getdX(i);
-      gsb->data->iuub[i] = Pat->bbox[dim + i] - Pat->uui[i] * DH;
-      gsb->data->illb[i] = Pat->bbox[i] + Pat->lli[i] * DH;
-    }
-    gsb = gsb->next;
-  }
-
-  return cgsl;
-}
-MyList<Parallel::gridseg> *Parallel::build_buffer_gsl(MyList<Patch> *PatL)
-{
-  MyList<Parallel::gridseg> *cgsl = 0, *gs;
-  while (PatL)
-  {
-    if (cgsl)
-    {
-      gs->next = build_buffer_gsl(PatL->data);
-      gs = gs->next;
-      if (gs)
-        while (gs->next)
-          gs = gs->next;
-    }
-    else
-    {
-      cgsl = build_buffer_gsl(PatL->data);
-      gs = cgsl;
-      if (gs)
-        while (gs->next)
-          gs = gs->next;
-    }
-    PatL = PatL->next;
-  }
-
-  return cgsl;
-}
-void Parallel::Prolongint(Patch *Patc, Patch *Patf,
-                          MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
-                          int Symmetry)
-{
-  if (Patc->lev >= Patf->lev)
-  {
-    cout << "Parallel::Prolong: meet requst of Prolong from lev#" << Patc->lev << " to lev#" << Patf->lev << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-
-  int myrank = 0;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
-  int num_var = 0;
-  MyList<var> *varl;
-  varl = VarList1;
-  while (varl)
-  {
-    num_var++;
-    varl = varl->next;
-  }
-
-  MyList<Block> *BP = Patf->blb;
-  while (BP)
-  {
-    int Npts;
-    if (myrank == BP->data->rank)
-      Npts = BP->data->shape[0] * BP->data->shape[1] * BP->data->shape[2];
-    MPI_Bcast(&Npts, 1, MPI_INT, BP->data->rank, MPI_COMM_WORLD);
-    double *pox[3];
-    for (int i = 0; i < 3; i++)
-      pox[i] = new double[Npts];
-    if (myrank == BP->data->rank)
-    {
-      for (int i = 0; i < Npts; i++)
-      {
-        int ind[3];
-        Parallel::getarrayindex(3, BP->data->shape, ind, i);
-        pox[0][i] = BP->data->X[0][ind[0]];
-        pox[1][i] = BP->data->X[1][ind[1]];
-        pox[2][i] = BP->data->X[2][ind[2]];
-      }
-    }
-    for (int i = 0; i < 3; i++)
-      MPI_Bcast(pox[i], Npts, MPI_DOUBLE, BP->data->rank, MPI_COMM_WORLD);
-    double *res;
-    res = new double[num_var * Npts];
-    Patc->Interp_Points(VarList1, Npts, pox, res, Symmetry); // because this operation is a global operation (for all processors)
-                                                             // we have to isolate it out of myrank==BP->data->rank
-    if (myrank == BP->data->rank)
-    {
-      for (int i = 0; i < Npts; i++)
-      {
-        varl = VarList2;
-        int j = 0;
-        while (varl)
-        {
-          (BP->data->fgfs[varl->data->sgfn])[i] = res[j + i * num_var];
-          j++;
-          varl = varl->next;
-        }
-      }
-    }
-    delete[] pox[0];
-    delete[] pox[1];
-    delete[] pox[2];
-    delete[] res;
-    BP = BP->next;
-  }
-}
-//
-void Parallel::merge_gsl(MyList<gridseg> *&A, const double ratio)
-{
-  if (!A)
-    return;
-
-  MyList<gridseg> *B, *C, *D = A;
-  bool flag = false;
-  while (D->next)
-  {
-    B = D->next;
-    while (B)
-    {
-      flag = merge_gs(D, B, C, ratio);
-      if (flag)
-        break;
-      B = B->next;
-    }
-    if (flag)
-      break;
-    D = D->next;
-  }
-
-  if (flag)
-  {
-    // delete D and B from A
-    MyList<gridseg> *E = A;
-    while (E->next)
-    {
-      MyList<gridseg> *tp = E->next;
-      if (D == tp || B == tp)
-      {
-        E->next = (tp->next) ? tp->next : 0;
-        delete tp->data;
-        delete tp;
-      }
-      if (E->next)
-        E = E->next;
-    }
-
-    if (D == A)
-    {
-      MyList<gridseg> *tp = A;
-      A = (A->next) ? A->next : 0;
-      delete tp->data;
-      delete tp;
-    }
-    // cat C to A
-    if (A)
-      A->catList(C);
-    else
-      A = C;
-
-    merge_gsl(A, ratio);
-  }
-}
-//
-bool Parallel::merge_gs(MyList<gridseg> *D, MyList<gridseg> *B, MyList<gridseg> *&C, const double ratio)
-{
-  if (!B || !D)
-    return false;
-
-  C = 0;
-  double llb[dim], uub[dim], DH[dim];
-  for (int i = 0; i < dim; i++)
-  {
-    double tdh;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-    DH[i] = (D->data->uub[i] - D->data->llb[i]) / (D->data->shape[i] - 1);
-    tdh = (B->data->uub[i] - B->data->llb[i]) / (B->data->shape[i] - 1);
-#else
-#ifdef Cell
-    DH[i] = (D->data->uub[i] - D->data->llb[i]) / D->data->shape[i];
-    tdh = (B->data->uub[i] - B->data->llb[i]) / B->data->shape[i];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-    if (!feq(DH[i], tdh, DH[i] / 2))
-    {
-      cout << "Parallel::merge_gs meets different grid segment " << DH[i] << " vs " << tdh << endl;
-      checkgsl(B, true);
-      checkgsl(D, true);
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-    llb[i] = Mymax(D->data->llb[i], B->data->llb[i]);
-    uub[i] = Mymin(D->data->uub[i], B->data->uub[i]);
-    //    if(uub[i]-llb[i] < DH[i]/2) return false;  //here this is valid for both vertex and cell
-
-    // use 0 instead of DH[i]/2, we consider contact case, 2012 Aug 8
-    if (uub[i] - llb[i] < 0)
-      return false; // here this is valid for both vertex and cell
-  }
-
-  // vb: volume of B
-  // vd: volume of D
-  // vo: volume of overlap
-  // vt: volume of smallest common box (virtual merged box)
-  double vd = 1, vb = 1, vt = 1, vo = 1;
-  for (int i = 0; i < dim; i++)
-  {
-    vt = vt * (Mymax(D->data->uub[i], B->data->uub[i]) - Mymin(D->data->llb[i], B->data->llb[i]));
-    vo = vo * (uub[i] - llb[i]);
-    vd = vd * (D->data->uub[i] - D->data->llb[i]);
-    vb = vb * (B->data->uub[i] - B->data->llb[i]);
-  }
-
-  // smller ratio, more possible to merge
-  if ((vd + vb - vo) / vt > ratio)
-  {
-    C = new MyList<gridseg>;
-    C->data = new gridseg;
-    for (int i = 0; i < dim; i++)
-    {
-      C->data->uub[i] = Mymax(D->data->uub[i], B->data->uub[i]);
-      C->data->llb[i] = Mymin(D->data->llb[i], B->data->llb[i]);
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-      C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / DH[i] + 0.4) + 1;
-#else
-#ifdef Cell
-      C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / DH[i] + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-    }
-    if (D->data->Bg == B->data->Bg)
-      C->data->Bg = D->data->Bg;
-    else
-      C->data->Bg = 0;
-
-    C->next = 0;
-
-    return true;
-  }
-  else
-  {
-    return false;
-  }
-}
-// Add ghost region to tangent plane
-// we assume the grids have the same resolution
-void Parallel::add_ghost_touch(MyList<gridseg> *&A)
-{
-  if (!A || !(A->next))
-    return;
-
-  double DH[dim];
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-  for (int i = 0; i < dim; i++)
-    DH[i] = (A->data->uub[i] - A->data->llb[i]) / (A->data->shape[i] - 1) / 2;
-#else
-#ifdef Cell
-  for (int i = 0; i < dim; i++)
-    DH[i] = (A->data->uub[i] - A->data->llb[i]) / A->data->shape[i] / 2;
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-
-  MyList<gridseg> *C1, *C2, *A1 = A, *A2, *dc;
-  dc = C1 = clone_gsl(A, false);
-  while (C1)
-  {
-    C2 = C1->next;
-    A2 = A1->next;
-    while (C2)
-    {
-      for (int i = 0; i < dim; i++)
-      {
-        if (feq(C1->data->llb[i], C2->data->uub[i], DH[i]))
-        {
-          // direction i touch, other directions overlap
-          bool flag = true;
-          for (int j = 0; j < i; j++)
-            if ((C1->data->llb[j] - C2->data->llb[j]) * (C1->data->uub[j] - C2->data->llb[j]) > 0 &&
-                (C2->data->llb[j] - C1->data->llb[j]) * (C2->data->uub[j] - C1->data->llb[j]) > 0)
-              flag = false;
-          for (int j = i + 1; j < dim; j++)
-            if ((C1->data->llb[j] - C2->data->llb[j]) * (C1->data->uub[j] - C2->data->llb[j]) > 0 &&
-                (C2->data->llb[j] - C1->data->llb[j]) * (C2->data->uub[j] - C1->data->llb[j]) > 0)
-              flag = false;
-
-          if (flag)
-          {
-            // only add one ghost region
-            if (feq(A1->data->llb[i], C1->data->llb[i], DH[i]))
-            {
-              A1->data->llb[i] -= ghost_width * 2 * DH[i];
-              A1->data->shape[i] += ghost_width;
-            }
-            if (feq(A2->data->uub[i], C2->data->uub[i], DH[i]))
-            {
-              A2->data->uub[i] += ghost_width * 2 * DH[i];
-              A2->data->shape[i] += ghost_width;
-            }
-          }
-        }
-        if (feq(C1->data->uub[i], C2->data->llb[i], DH[i]))
-        {
-          // direction i touch, other directions overlap
-          bool flag = true;
-          for (int j = 0; j < i; j++)
-            if ((C1->data->llb[j] - C2->data->llb[j]) * (C1->data->uub[j] - C2->data->llb[j]) > 0 &&
-                (C2->data->llb[j] - C1->data->llb[j]) * (C2->data->uub[j] - C1->data->llb[j]) > 0)
-              flag = false;
-          for (int j = i + 1; j < dim; j++)
-            if ((C1->data->llb[j] - C2->data->llb[j]) * (C1->data->uub[j] - C2->data->llb[j]) > 0 &&
-                (C2->data->llb[j] - C1->data->llb[j]) * (C2->data->uub[j] - C1->data->llb[j]) > 0)
-              flag = false;
-
-          if (flag)
-          {
-            // only add one ghost region
-            if (feq(A1->data->uub[i], C1->data->uub[i], DH[i]))
-            {
-              A1->data->uub[i] += ghost_width * 2 * DH[i];
-              A1->data->shape[i] += ghost_width;
-            }
-            if (feq(A2->data->llb[i], C2->data->llb[i], DH[i]))
-            {
-              A2->data->llb[i] -= ghost_width * 2 * DH[i];
-              A2->data->shape[i] += ghost_width;
-            }
-          }
-        }
-      }
-      C2 = C2->next;
-      A2 = A2->next;
-    }
-    C1 = C1->next;
-    A1 = A1->next;
-  }
-
-  if (dc)
-    dc->destroyList();
-}
-// According to overlap to cut the gsl into recular pices
-void Parallel::cut_gsl(MyList<gridseg> *&A)
-{
-  if (!A)
-    return;
-
-  MyList<gridseg> *B, *C, *D = A;
-  bool flag = false;
-  while (D->next)
-  {
-    B = D->next;
-    while (B)
-    {
-      flag = cut_gs(D, B, C);
-      if (flag)
-        break;
-      B = B->next;
-    }
-    if (flag)
-      break;
-    D = D->next;
-  }
-
-  if (flag)
-  {
-    // delete D and B from A
-    MyList<gridseg> *E = A;
-    while (E->next)
-    {
-      MyList<gridseg> *tp = E->next;
-      if (D == tp || B == tp)
-      {
-        E->next = (tp->next) ? tp->next : 0;
-        delete tp->data;
-        delete tp;
-      }
-      if (E->next)
-        E = E->next;
-    }
-
-    if (D == A)
-    {
-      MyList<gridseg> *tp = A;
-      A = (A->next) ? A->next : 0;
-      delete tp->data;
-      delete tp;
-    }
-    // cat C to A
-    if (A)
-      A->catList(C);
-    else
-      A = C;
-
-    cut_gsl(A);
-  }
-}
-// when D and B have overlap, cut them into C and return true
-// otherwise return false and C=0
-bool Parallel::cut_gs(MyList<gridseg> *D, MyList<gridseg> *B, MyList<gridseg> *&C)
-{
-  C = 0;
-  double llb[dim], uub[dim], DH[dim];
-  for (int i = 0; i < dim; i++)
-  {
-    double tdh;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-    DH[i] = (D->data->uub[i] - D->data->llb[i]) / (D->data->shape[i] - 1);
-    tdh = (B->data->uub[i] - B->data->llb[i]) / (B->data->shape[i] - 1);
-#else
-#ifdef Cell
-    DH[i] = (D->data->uub[i] - D->data->llb[i]) / D->data->shape[i];
-    tdh = (B->data->uub[i] - B->data->llb[i]) / B->data->shape[i];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-    if (!feq(DH[i], tdh, DH[i] / 2))
-    {
-      cout << "Parallel::cut_gs meets different grid segment " << DH[i] << " vs " << tdh << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-    llb[i] = Mymax(D->data->llb[i], B->data->llb[i]);
-    uub[i] = Mymin(D->data->uub[i], B->data->uub[i]);
-    // for efficiency we ask the width of the patch at least 2(buffer+ghost+BD ghost)
-    if (uub[i] - llb[i] < DH[i] * 2 * (buffer_width + 2 * ghost_width))
-      return false; // here this is valid for both vertex and cell
-  }
-
-  // this part code results in 5 patches generally
-
-  C = new MyList<gridseg>;
-  C->data = new gridseg;
-  for (int i = 0; i < dim; i++)
-  {
-    C->data->llb[i] = llb[i];
-    C->data->uub[i] = uub[i];
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-    C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / DH[i] + 0.4) + 1;
-#else
-#ifdef Cell
-    C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / DH[i] + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-  }
-  if (D->data->Bg == B->data->Bg)
-    C->data->Bg = D->data->Bg;
-  else
-    C->data->Bg = 0;
-
-  C->next = gs_subtract_virtual(D, C);
-
-  MyList<gridseg> *E = C;
-
-  while (E->next)
-    E = E->next;
-
-  E->next = gs_subtract_virtual(B, C);
-
-  // this part code results in 3 patches generally
-  /*
-       C = clone_gsl(D,true);
-       C->next = gs_subtract_virtual(B,C);
-  */
-
-  return true;
-}
-// note here it is different to real cut, we need leave the cutting edge for both vertex center and cell center
-MyList<Parallel::gridseg> *Parallel::gs_subtract_virtual(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B)
-{
-  if (!A)
-    return 0;
-  if (!B)
-    return clone_gsl(A, true);
-
-  double cut_plane[2 * dim], DH[dim];
-
-  for (int i = 0; i < dim; i++)
-  {
-    double tdh;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-    DH[i] = (A->data->uub[i] - A->data->llb[i]) / (A->data->shape[i] - 1);
-    tdh = (B->data->uub[i] - B->data->llb[i]) / (B->data->shape[i] - 1);
-#else
-#ifdef Cell
-    DH[i] = (A->data->uub[i] - A->data->llb[i]) / A->data->shape[i];
-    tdh = (B->data->uub[i] - B->data->llb[i]) / B->data->shape[i];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-    if (!feq(DH[i], tdh, DH[i] / 2))
-    {
-      cout << "Parallel::gs_subtract_virtual meets different grid segment " << DH[i] << " vs " << tdh << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-  }
-
-  MyList<Parallel::gridseg> *C = 0, *q;
-  for (int i = 0; i < dim; i++)
-  {
-    if (B->data->llb[i] > A->data->uub[i] || B->data->uub[i] < A->data->llb[i])
-      return clone_gsl(A, true);
-    cut_plane[i] = A->data->llb[i];
-    cut_plane[i + dim] = A->data->uub[i];
-  }
-
-  for (int i = 0; i < dim; i++)
-  {
-    cut_plane[i] = Mymax(A->data->llb[i], B->data->llb[i]);
-    if (cut_plane[i] > A->data->llb[i])
-    {
-      q = clone_gsl(A, true);
-      // prolong the list from head
-      if (C)
-        q->next = C;
-      C = q;
-      for (int j = 0; j < dim; j++)
-      {
-        if (i == j)
-        {
-          C->data->llb[i] = A->data->llb[i];
-          // **note here it is different to real cut, we need leave the cutting edge for both vertex center and cell center**
-          C->data->uub[i] = Mymax(C->data->llb[i], cut_plane[i]);
-        }
-        else
-        {
-          C->data->llb[j] = cut_plane[j];
-          C->data->uub[j] = cut_plane[j + dim];
-        }
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4) + 1;
-#else
-#ifdef Cell
-        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      }
-    }
-
-    cut_plane[i + dim] = Mymin(A->data->uub[i], B->data->uub[i]);
-    if (cut_plane[i + dim] < A->data->uub[i])
-    {
-      q = clone_gsl(A, true);
-      if (C)
-        q->next = C;
-      C = q;
-      for (int j = 0; j < dim; j++)
-      {
-        if (i == j)
-        {
-          C->data->uub[i] = A->data->uub[i];
-          // note here it is different to real cut, we need leave the cutting edge for both vertex center and cell center
-          C->data->llb[i] = Mymin(C->data->uub[i], cut_plane[i + dim]);
-        }
-        else
-        {
-          C->data->llb[j] = cut_plane[j];
-          C->data->uub[j] = cut_plane[j + dim];
-        }
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4) + 1;
-#else
-#ifdef Cell
-        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      }
-    }
-  }
-  return C;
-}
-// note the data structure
-// if CC is true
-// 1   -----------  1   ------  ^
-//                  0   ------  |  t
-// 0   -----------  old ------  |
-//
-// old -----------
-// if CC is false
-// 1   -----------  1   ------  ^
-// 0   -----------  0   ------  |  t
-// old -----------  old ------  |
-void Parallel::fill_level_data(MyList<Patch> *PatLd, MyList<Patch> *PatLs, MyList<Patch> *PatcL,
-                               MyList<var> *OldList, MyList<var> *StateList, MyList<var> *FutureList,
-                               MyList<var> *tmList, int Symmetry, bool BB, bool CC)
-{
-  if (PatLd->data->lev != PatLs->data->lev)
-  {
-    cout << "Parallel::fill_level_data: meet requst from lev#" << PatLs->data->lev << " to lev#" << PatLd->data->lev << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-  if (PatLd->data->lev <= PatcL->data->lev)
-  {
-    cout << "Parallel::fill_level_data: meet prolong requst from lev#" << PatcL->data->lev << " to lev#" << PatLd->data->lev << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-
-  int cpusize;
-  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-
-  MyList<var> *VarList = 0;
-  MyList<var> *p;
-  p = StateList;
-  while (p)
-  {
-    if (VarList)
-      VarList->insert(p->data);
-    else
-      VarList = new MyList<var>(p->data);
-    p = p->next;
-  }
-  p = FutureList;
-  while (p)
-  {
-    if (VarList)
-      VarList->insert(p->data);
-    else
-      VarList = new MyList<var>(p->data);
-    p = p->next;
-  }
-
-  MyList<Parallel::gridseg> *dst;
-  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
-  src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
-
-  dst = build_complete_gsl(PatLd); // including ghost
-  // copy part
-  for (int node = 0; node < cpusize; node++)
-  {
-    src[node] = build_owned_gsl(PatLs, node, 0, Symmetry);                // similar to Sync
-    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
-  }
-
-  transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
-
-  for (int node = 0; node < cpusize; node++)
-  {
-    if (src[node])
-      src[node]->destroyList();
-    if (transfer_src[node])
-      transfer_src[node]->destroyList();
-    if (transfer_dst[node])
-      transfer_dst[node]->destroyList();
-  }
-
-  MyList<Parallel::gridseg> *dsts, *dstd;
-  dsts = build_complete_gsl_virtual(PatLs);
-  dstd = dst;
-  dst = gsl_subtract(dstd, dsts);
-  if (dstd)
-    dstd->destroyList();
-  if (dsts)
-    dsts->destroyList();
-
-  if (dst)
-  {
-    // prolongation part
-    for (int node = 0; node < cpusize; node++)
-    {
-      src[node] = build_owned_gsl(PatcL, node, 4, Symmetry);                // - buffer - ghost - BD ghost
-      build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
-    }
-
-    if (CC)
-    {
-      // for FutureList
-      // restrict first~~~>
-      {
-        Restrict(PatcL, PatLs, FutureList, FutureList, Symmetry);
-        Sync(PatcL, FutureList, Symmetry);
-      }
-      //<~~~prolong then
-      transfer(transfer_src, transfer_dst, FutureList, FutureList, Symmetry);
-
-      // for StateList
-      // time interpolation part
-      if (BB)
-        prepare_inter_time_level(PatcL, FutureList, StateList, OldList,
-                                 tmList, 0); // use SynchList_pre as temporal storage space
-      else
-        prepare_inter_time_level(PatcL, FutureList, StateList,
-                                 tmList, 0); // use SynchList_pre as temporal storage space
-                                             // restrict first~~~>
-      {
-        Restrict(PatcL, PatLs, StateList, tmList, Symmetry);
-        Sync(PatcL, tmList, Symmetry);
-      }
-      //<~~~prolong then
-      transfer(transfer_src, transfer_dst, tmList, StateList, Symmetry);
-    }
-    else
-    {
-      // for both FutureList and StateList
-      // restrict first~~~>
-      {
-        Restrict(PatcL, PatLs, VarList, VarList, Symmetry);
-        Sync(PatcL, VarList, Symmetry);
-      }
-      //<~~~prolong then
-      transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
-    }
-
-    for (int node = 0; node < cpusize; node++)
-    {
-      if (src[node])
-        src[node]->destroyList();
-      if (transfer_src[node])
-        transfer_src[node]->destroyList();
-      if (transfer_dst[node])
-        transfer_dst[node]->destroyList();
-    }
-
-    dst->destroyList();
-  }
-
-  delete[] src;
-  delete[] transfer_src;
-  delete[] transfer_dst;
-
-  VarList->clearList();
-}
-void Parallel::KillBlocks(MyList<Patch> *PatchLIST)
-{
-  while (PatchLIST)
-  {
-    Patch *Pp = PatchLIST->data;
-    MyList<Block> *bg;
-    while (Pp->blb)
-    {
-      if (Pp->blb == Pp->ble)
-        break;
-      bg = (Pp->blb->next) ? Pp->blb->next : 0;
-      delete Pp->blb->data;
-      delete Pp->blb;
-      Pp->blb = bg;
-    }
-    if (Pp->ble)
-    {
-      delete Pp->ble->data;
-      delete Pp->ble;
-    }
-    Pp->blb = Pp->ble = 0;
-    PatchLIST = PatchLIST->next;
-  }
-}
-bool Parallel::PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
-                                     int NN, double **XX,
-                                     double *Shellf, int Symmetry)
-{
-  MyList<var> *varl;
-  int num_var = 0;
-  varl = VarList;
-  while (varl)
-  {
-    num_var++;
-    varl = varl->next;
-  }
-
-  double lld[dim], uud[dim];
-  double **pox;
-  pox = new double *[dim];
-  for (int j = 0; j < dim; j++)
-    pox[j] = new double[1];
-  for (int i = 0; i < NN; i++)
-  {
-    MyList<Patch> *PL = PatL;
-    while (PL)
-    {
-      bool flag = true;
-      for (int j = 0; j < dim; j++)
-      {
-        double h = PL->data->getdX(j);
-        lld[j] = PL->data->lli[j] * h;
-        uud[j] = PL->data->uui[j] * h;
-        if (XX[j][i] < PL->data->bbox[j] + lld[j] || XX[j][i] > PL->data->bbox[j + dim] - uud[j])
-        {
-          flag = false;
-          break;
-        }
-        pox[j][0] = XX[j][i];
-      }
-      if (flag)
-      {
-        PL->data->Interp_Points(VarList, 1, pox, Shellf + i * num_var, Symmetry);
-        break;
-      }
-      PL = PL->next;
-    }
-    if (!PL)
-    {
-      checkpatchlist(PatL, false);
-      return false;
-    }
-  }
-  for (int j = 0; j < dim; j++)
-    delete[] pox[j];
-  delete[] pox;
-
-  return true;
-}
-bool Parallel::PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
-                                     int NN, double **XX,
-                                     double *Shellf, int Symmetry, MPI_Comm Comm_here)
-{
-  MyList<var> *varl;
-  int num_var = 0;
-  varl = VarList;
-  while (varl)
-  {
-    num_var++;
-    varl = varl->next;
-  }
-
-  double lld[dim], uud[dim];
-  double **pox;
-  pox = new double *[dim];
-  for (int j = 0; j < dim; j++)
-    pox[j] = new double[1];
-  for (int i = 0; i < NN; i++)
-  {
-    MyList<Patch> *PL = PatL;
-    while (PL)
-    {
-      bool flag = true;
-      for (int j = 0; j < dim; j++)
-      {
-        double h = PL->data->getdX(j);
-        lld[j] = PL->data->lli[j] * h;
-        uud[j] = PL->data->uui[j] * h;
-        if (XX[j][i] < PL->data->bbox[j] + lld[j] || XX[j][i] > PL->data->bbox[j + dim] - uud[j])
-        {
-          flag = false;
-          break;
-        }
-        pox[j][0] = XX[j][i];
-      }
-      if (flag)
-      {
-        PL->data->Interp_Points(VarList, 1, pox, Shellf + i * num_var, Symmetry, Comm_here);
-        break;
-      }
-      PL = PL->next;
-    }
-    if (!PL)
-    {
-      checkpatchlist(PatL, false);
-      return false;
-    }
-  }
-  for (int j = 0; j < dim; j++)
-    delete[] pox[j];
-  delete[] pox;
-
-  return true;
-}
-void Parallel::aligncheck(double *bbox0, double *bboxl, int lev, double *DH0, int *shape)
-{
-  const double aligntiny = 0.1;
-  double DHl, rr;
-  int NN;
-  for (int i = 0; i < dim; i++)
-  {
-    DHl = DH0[i] * pow(0.5, lev);
-    rr = bboxl[i] - bbox0[i];
-    bboxl[i] = bbox0[i] + int(rr / DHl + 0.4) * DHl;
-    rr = bbox0[i + dim] - bboxl[i + dim];
-    bboxl[i + dim] = bbox0[i + dim] - int(rr / DHl + 0.4) * DHl;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-    NN = int((bboxl[i + dim] - bboxl[i]) / DHl + 0.4) + 1;
-#else
-#ifdef Cell
-    NN = int((bboxl[i + dim] - bboxl[i]) / DHl + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-    if (NN != shape[i])
-    {
-      int myrank;
-      MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-      if (myrank == 0)
-      {
-        cout << "Parallel::aligncheck want shape " << NN << " for lev#" << lev << ", but " << shape[i] << endl;
-        cout << "i = " << i << ", low = " << bboxl[i] << ", up = " << bboxl[i + dim] << endl;
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-    }
-  }
-}
-bool Parallel::point_locat_gsl(double *pox, MyList<Parallel::gridseg> *gsl)
-{
-  bool flag = false;
-  while (gsl)
-  {
-    for (int i = 0; i < dim; i++)
-    {
-      if (pox[i] > gsl->data->llb[i] && pox[i] < gsl->data->uub[i])
-        flag = true;
-      else
-      {
-        flag = false;
-        break;
-      }
-    }
-    if (flag)
-      break;
-    gsl = gsl->next;
-  }
-
-  return flag;
-}
-void Parallel::checkpatchlist(MyList<Patch> *PatL, bool buflog)
-{
-  MyList<Patch> *PL = PatL;
-  while (PL)
-  {
-    PL->data->checkPatch(buflog);
-    PL = PL->next;
-  }
-}
+
+#include "Parallel.h"
+#include "fmisc.h"
+#include "prolongrestrict.h"
+#include "misc.h"
+#include "parameters.h"
+#include <set>
+
+int Parallel::partition1(int &nx, int split_size, int min_width, int cpusize, int shape) // 1-D case: choose the number of pieces for `shape` points
+{
+  const int fit = shape / min_width;  // how many pieces of min_width points fit into shape; split_size is not consulted
+  nx = Mymin(cpusize, Mymax(1, fit)); // never fewer than one piece, never more than cpusize
+
+  return nx; // the count is handed back both through nx and as the return value
+}
+int Parallel::partition2(int *nxy, int split_size, int *min_width, int cpusize, int *shape) // 2-D case: choose an nxy[0] x nxy[1] split of a patch; returns the piece count
+{
+#define SEARCH_SIZE 5 // half-width of the window searched around the first guess
+  int nx, ny;                     // best split found so far
+  int maxnx, maxny;               // upper bound on the number of pieces per direction
+  int mnx, mny;                   // first guess, centre of the search window
+  int dn, hmin_width, cmin_width; // |n - piece count| of the best split; narrowest piece of the best / candidate split
+  int cnx, cny;                   // candidate split
+  double fx, fy;                  // share of the summed extents lying in each direction
+  long long block_size;           // 64-bit: the product of two int extents can exceed the int range
+  int n;                          // number of pieces wanted
+
+  block_size = (long long)shape[0] * shape[1];
+  n = Mymax(1, (int)((block_size + split_size / 2) / split_size)); // block_size / split_size rounded to nearest, at least 1
+
+  maxnx = Mymax(1, shape[0] / min_width[0]); // no piece may be narrower than min_width ...
+  maxnx = Mymin(cpusize, maxnx);             // ... and no direction gets more than cpusize pieces
+  maxny = Mymax(1, shape[1] / min_width[1]);
+  maxny = Mymin(cpusize, maxny);
+  fx = (double)shape[0] / (shape[0] + shape[1]);
+  fy = (double)shape[1] / (shape[0] + shape[1]);
+  nx = mnx = Mymax(1, Mymin(maxnx, (int)(sqrt(double(n)) * fx / fy))); // first guess: sqrt(n) scaled by the aspect ratio
+  ny = mny = Mymax(1, Mymin(maxny, (int)(sqrt(double(n)) * fy / fx)));
+  dn = abs(n - nx * ny);
+  hmin_width = Mymin(shape[0] / nx, shape[1] / ny);
+  for (cny = Mymax(1, mny - SEARCH_SIZE); cny <= (Mymin(mny + SEARCH_SIZE, maxny)); cny++)
+    for (cnx = Mymax(1, mnx - SEARCH_SIZE); cnx <= (Mymin(mnx + SEARCH_SIZE, maxnx)); cnx++)
+    {
+      cmin_width = Mymin(shape[0] / cnx, shape[1] / cny);
+      if (dn > abs(n - cnx * cny) || (dn == abs(n - cnx * cny) && cmin_width > hmin_width)) // closer to n wins; on a tie the wider narrowest piece wins
+      {
+        dn = abs(n - cnx * cny);
+        nx = cnx;
+        ny = cny;
+        hmin_width = cmin_width;
+      }
+    }
+
+  nxy[0] = nx;
+  nxy[1] = ny;
+
+  return nx * ny;
+#undef SEARCH_SIZE
+}
+int Parallel::partition3(int *nxyz, int split_size, int *min_width, int cpusize, int *shape) // 3-D case: choose an nxyz[0] x nxyz[1] x nxyz[2] split of a patch; returns the piece count
+#if 1                                                                                        // algorithm from Pretorius
+{
+// Searches a +-SEARCH_SIZE window around a first guess for the split whose piece count is closest to
+// n = block_size / split_size (rounded, at least 1); on a tie the split with the wider narrowest piece is kept.
+#define SEARCH_SIZE 5
+  int nx, ny, nz;                                  // best split found so far
+  int maxnx, maxny, maxnz;                         // upper bound on the number of pieces per direction
+  int mnx, mny, mnz;                               // first guess, centre of the search window
+  int dn, hmin_width, cmin_width;                  // |n - piece count| of the best split; narrowest piece of the best / candidate split
+  int cnx, cny, cnz;                               // candidate split
+  double fx, fy, fz, max_fxfy, max_fxfz, max_fyfz; // share of the summed extents per direction and pairwise maxima of those shares
+  long long block_size;                            // 64-bit: the product of three int extents can exceed the int range
+  int n;                                           // number of pieces wanted
+
+  block_size = (long long)shape[0] * shape[1] * shape[2];
+  n = Mymax(1, (int)((block_size + split_size / 2) / split_size)); // block_size / split_size rounded to nearest, at least 1
+  const double root3 = pow(n, 1.0 / 3.0);                          // cube root of n, shared by the three first guesses below
+  maxnx = Mymax(1, shape[0] / min_width[0]); // no piece may be narrower than min_width ...
+  maxnx = Mymin(cpusize, maxnx);             // ... and no direction gets more than cpusize pieces
+  maxny = Mymax(1, shape[1] / min_width[1]);
+  maxny = Mymin(cpusize, maxny);
+  maxnz = Mymax(1, shape[2] / min_width[2]);
+  maxnz = Mymin(cpusize, maxnz);
+  fx = (double)shape[0] / (shape[0] + shape[1] + shape[2]);
+  fy = (double)shape[1] / (shape[0] + shape[1] + shape[2]);
+  fz = (double)shape[2] / (shape[0] + shape[1] + shape[2]);
+  max_fxfy = Mymax(fx, fy);
+  max_fxfz = Mymax(fx, fz);
+  max_fyfz = Mymax(fy, fz);
+  nx = mnx = Mymax(1, Mymin(maxnx, (int)(root3 * fx / max_fyfz))); // first guess: cube root of n scaled by the relative extent
+  ny = mny = Mymax(1, Mymin(maxny, (int)(root3 * fy / max_fxfz)));
+  nz = mnz = Mymax(1, Mymin(maxnz, (int)(root3 * fz / max_fxfy)));
+  dn = abs(n - nx * ny * nz);
+  hmin_width = Mymin(shape[2] / nz, shape[1] / ny);
+  hmin_width = Mymin(hmin_width, shape[0] / nx);
+  for (cnz = Mymax(1, mnz - SEARCH_SIZE); cnz <= (Mymin(mnz + SEARCH_SIZE, maxnz)); cnz++)
+    for (cny = Mymax(1, mny - SEARCH_SIZE); cny <= (Mymin(mny + SEARCH_SIZE, maxny)); cny++)
+      for (cnx = Mymax(1, mnx - SEARCH_SIZE); cnx <= (Mymin(mnx + SEARCH_SIZE, maxnx)); cnx++)
+      {
+        cmin_width = Mymin(shape[2] / cnz, shape[1] / cny);
+        cmin_width = Mymin(cmin_width, shape[0] / cnx);
+        if (dn > abs(n - cnx * cny * cnz) || (dn == abs(n - cnx * cny * cnz) && cmin_width > hmin_width)) // closer to n wins; on a tie the wider narrowest piece wins
+        {
+          dn = abs(n - cnx * cny * cnz);
+          nx = cnx;
+          ny = cny;
+          nz = cnz;
+          hmin_width = cmin_width;
+        }
+      }
+
+  nxyz[0] = nx;
+  nxyz[1] = ny;
+  nxyz[2] = nz;
+
+  return nx * ny * nz;
+#undef SEARCH_SIZE
+}
+#elif 0 // Zhihui's idea one on 2013-09-25
+{
+  int nx, ny, nz;
+  int hmin_width;
+  hmin_width = Mymin(min_width[0], min_width[1]);
+  hmin_width = Mymin(hmin_width, min_width[2]);
+  nx = shape[0] / hmin_width;
+  if (nx * hmin_width < shape[0])
+    nx++;
+  ny = shape[1] / hmin_width;
+  if (ny * hmin_width < shape[1])
+    ny++;
+  nz = shape[2] / hmin_width;
+  if (nz * hmin_width < shape[2])
+    nz++;
+  while (nx * ny * nz > cpusize)
+  {
+    hmin_width++;
+    nx = shape[0] / hmin_width;
+    if (nx * hmin_width < shape[0])
+      nx++;
+    ny = shape[1] / hmin_width;
+    if (ny * hmin_width < shape[1])
+      ny++;
+    nz = shape[2] / hmin_width;
+    if (nz * hmin_width < shape[2])
+      nz++;
+  }
+
+  nxyz[0] = nx;
+  nxyz[1] = ny;
+  nxyz[2] = nz;
+
+  return nx * ny * nz;
+}
+#elif 0 // Zhihui's idea two on 2013-09-25
+{
+  int nx, ny, nz;
+  const int hmin_width = 8; // for example we use 8
+  nx = shape[0] / hmin_width;
+  if (nx * hmin_width < shape[0])
+    nx++;
+  ny = shape[1] / hmin_width;
+  if (ny * hmin_width < shape[1])
+    ny++;
+  nz = shape[2] / hmin_width;
+  if (nz * hmin_width < shape[2])
+    nz++;
+
+  nxyz[0] = nx;
+  nxyz[1] = ny;
+  nxyz[2] = nz;
+
+  return nx * ny * nz;
+}
+#endif
+// distribute the data to processors
+#if (PSTR == 0)
+MyList<Block> *Parallel::distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi, // cuts every patch of PatchLIST into blocks and returns the list of all blocks
+                                    bool periodic, int nodes)                                      // nodes == 0 selects a default derived from cpusize
+{
+#ifdef USE_GPU_DIVIDE // GPU build: obtain the "cpu part" / "gpu part" weights later handed to misc::dividBlock
+  double cpu_part, gpu_part;
+  map<string, double>::iterator iter;
+  iter = parameters::dou_par.find("cpu part"); // use the cached value when there is one
+  if (iter != parameters::dou_par.end())
+  {
+    cpu_part = iter->second;
+  }
+  else
+  {
+    int myrank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+    // read parameter from file
+    const int LEN = 256;
+    char pline[LEN];
+    string str, sgrp, skey, sval;
+    int sind;
+    char pname[50]; // NOTE(review): fixed 50-byte buffer filled by strcpy below; a longer "inputpar" value would overflow it
+    {
+      map<string, string>::iterator iter = parameters::str_par.find("inputpar"); // name of the parameter file
+      if (iter != parameters::str_par.end())
+      {
+        strcpy(pname, (iter->second).c_str());
+      }
+      else
+      {
+        cout << "Error inputpar" << endl;
+        exit(0); // NOTE(review): leaves with status 0 and without MPI_Abort, unlike the other error paths here
+      }
+    }
+    ifstream inf(pname, ifstream::in);
+    if (!inf.good() && myrank == 0) // only rank 0 reports the failure and aborts the run
+    {
+      cout << "Can not open parameter file " << pname << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    for (int i = 1; inf.good(); i++) // i is the 1-based line number used in the error message
+    {
+      inf.getline(pline, LEN);
+      str = pline;
+
+      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
+      if (status == -1)
+      {
+        cout << "error reading parameter file " << pname << " in line " << i << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+      else if (status == 0)
+        continue; // lines for which parse_parts returns 0 are skipped
+
+      if (sgrp == "ABE")
+      {
+        if (skey == "cpu part")
+          cpu_part = atof(sval.c_str());
+      }
+    }
+    inf.close();
+
+    parameters::dou_par.insert(map<string, double>::value_type("cpu part", cpu_part)); // cache it; NOTE(review): cpu_part is uninitialized if the file has no ABE "cpu part" entry
+  }
+  iter = parameters::dou_par.find("gpu part"); // same procedure again for "gpu part"
+  if (iter != parameters::dou_par.end())
+  {
+    gpu_part = iter->second;
+  }
+  else
+  {
+    int myrank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+    // read parameter from file
+    const int LEN = 256;
+    char pline[LEN];
+    string str, sgrp, skey, sval;
+    int sind;
+    char pname[50]; // NOTE(review): same fixed 50-byte buffer + strcpy as in the "cpu part" branch
+    {
+      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
+      if (iter != parameters::str_par.end())
+      {
+        strcpy(pname, (iter->second).c_str());
+      }
+      else
+      {
+        cout << "Error inputpar" << endl;
+        exit(0);
+      }
+    }
+    ifstream inf(pname, ifstream::in);
+    if (!inf.good() && myrank == 0)
+    {
+      cout << "Can not open parameter file " << pname << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    for (int i = 1; inf.good(); i++)
+    {
+      inf.getline(pline, LEN);
+      str = pline;
+
+      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
+      if (status == -1)
+      {
+        cout << "error reading parameter file " << pname << " in line " << i << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+      else if (status == 0)
+        continue;
+
+      if (sgrp == "ABE")
+      {
+        if (skey == "gpu part")
+          gpu_part = atof(sval.c_str());
+      }
+    }
+    inf.close();
+
+    parameters::dou_par.insert(map<string, double>::value_type("gpu part", gpu_part)); // cache it; NOTE(review): gpu_part is uninitialized if the file has no ABE "gpu part" entry
+  }
+
+  if (nodes == 0)
+    nodes = cpusize / 2; // GPU build default: half of cpusize (each piece below yields two blocks)
+#else
+  if (nodes == 0)
+    nodes = cpusize; // default: as many pieces as cpusize
+#endif
+
+  if (dim != 3)
+  {
+    cout << "distrivute: now we only support 3-dimension" << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+
+  MyList<Block> *BlL = 0; // head of the list of every block created here; this is the return value
+
+  int split_size, min_size, block_size = 0; // points per piece aimed for, smallest allowed piece, total points over all patches
+
+  int min_width = 2 * Mymax(ghost_width, buffer_width); // narrowest piece width aimed for, in grid points
+  int nxyz[dim], mmin_width[dim], min_shape[dim];
+
+  MyList<Patch> *PLi = PatchLIST; // NOTE(review): PatchLIST is dereferenced below without a null check
+  for (int i = 0; i < dim; i++)
+    min_shape[i] = PLi->data->shape[i];
+  int lev = PLi->data->lev;
+  PLi = PLi->next;
+  while (PLi) // smallest extent per direction over all patches; a level mismatch is only reported
+  {
+    Patch *PP = PLi->data;
+    for (int i = 0; i < dim; i++)
+      min_shape[i] = Mymin(min_shape[i], PP->shape[i]);
+    if (lev != PLi->data->lev)
+      cout << "Parallel::distribute CAUSTION: meet Patches for different level: " << lev << " and " << PLi->data->lev << endl;
+    PLi = PLi->next;
+  }
+
+  for (int i = 0; i < dim; i++)
+    mmin_width[i] = Mymin(min_width, min_shape[i]); // a patch thinner than min_width lowers the limit in that direction
+
+  min_size = mmin_width[0];
+  for (int i = 1; i < dim; i++)
+    min_size = min_size * mmin_width[i];
+
+  PLi = PatchLIST;
+  while (PLi) // total number of grid points over all patches
+  {
+    Patch *PP = PLi->data;
+    //    PP->checkPatch(true);
+    int bs = PP->shape[0];
+    for (int i = 1; i < dim; i++)
+      bs = bs * PP->shape[i];
+    block_size = block_size + bs;
+    PLi = PLi->next;
+  }
+  split_size = Mymax(min_size, block_size / nodes); // even share per node, but not below the smallest allowed piece
+  split_size = Mymax(1, split_size);                // partition3 divides by split_size, so keep it positive
+
+  int n_rank = 0; // counter handed to the Block constructor (presumably the owning rank); wraps to 0 at cpusize
+  PLi = PatchLIST;
+  int reacpu = 0; // sum over patches of the piece counts returned by partition3
+  while (PLi)
+  {
+    Patch *PP = PLi->data;
+
+    reacpu += partition3(nxyz, split_size, mmin_width, nodes, PP->shape); // nxyz receives the pieces per direction for this patch
+
+    Block *ng0, *ng; // first block created for the current piece / most recently created block
+    int shape_here[dim], ibbox_here[2 * dim];
+    double bbox_here[2 * dim], dd;
+
+    // ibbox : 0,...N-1
+    for (int i = 0; i < nxyz[0]; i++)
+      for (int j = 0; j < nxyz[1]; j++)
+        for (int k = 0; k < nxyz[2]; k++)
+        {
+          ibbox_here[0] = (PP->shape[0] * i) / nxyz[0]; // inclusive index range of piece (i, j, k): entries 0..2 lower, 3..5 upper
+          ibbox_here[3] = (PP->shape[0] * (i + 1)) / nxyz[0] - 1;
+          ibbox_here[1] = (PP->shape[1] * j) / nxyz[1];
+          ibbox_here[4] = (PP->shape[1] * (j + 1)) / nxyz[1] - 1;
+          ibbox_here[2] = (PP->shape[2] * k) / nxyz[2];
+          ibbox_here[5] = (PP->shape[2] * (k + 1)) / nxyz[2] - 1;
+
+          if (periodic) // grow the piece by ghost_width on every side, even beyond the patch
+          {
+            ibbox_here[0] = ibbox_here[0] - ghost_width;
+            ibbox_here[3] = ibbox_here[3] + ghost_width;
+            ibbox_here[1] = ibbox_here[1] - ghost_width;
+            ibbox_here[4] = ibbox_here[4] + ghost_width;
+            ibbox_here[2] = ibbox_here[2] - ghost_width;
+            ibbox_here[5] = ibbox_here[5] + ghost_width;
+          }
+          else // grow by ghost_width as well, but clip to the index range of the patch
+          {
+            ibbox_here[0] = Mymax(0, ibbox_here[0] - ghost_width);
+            ibbox_here[3] = Mymin(PP->shape[0] - 1, ibbox_here[3] + ghost_width);
+            ibbox_here[1] = Mymax(0, ibbox_here[1] - ghost_width);
+            ibbox_here[4] = Mymin(PP->shape[1] - 1, ibbox_here[4] + ghost_width);
+            ibbox_here[2] = Mymax(0, ibbox_here[2] - ghost_width);
+            ibbox_here[5] = Mymin(PP->shape[2] - 1, ibbox_here[5] + ghost_width);
+          }
+
+          shape_here[0] = ibbox_here[3] - ibbox_here[0] + 1;
+          shape_here[1] = ibbox_here[4] - ibbox_here[1] + 1;
+          shape_here[2] = ibbox_here[5] - ibbox_here[2] + 1;
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+          // 0--4, 5--10
+          dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1); // Vertex: shape points span shape - 1 intervals of width dd
+          bbox_here[0] = PP->bbox[0] + ibbox_here[0] * dd;
+          bbox_here[3] = PP->bbox[0] + ibbox_here[3] * dd;
+
+          dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1);
+          bbox_here[1] = PP->bbox[1] + ibbox_here[1] * dd;
+          bbox_here[4] = PP->bbox[1] + ibbox_here[4] * dd;
+
+          dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1);
+          bbox_here[2] = PP->bbox[2] + ibbox_here[2] * dd;
+          bbox_here[5] = PP->bbox[2] + ibbox_here[5] * dd;
+#else
+#ifdef Cell
+          // 0--5, 5--10
+          dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0]; // Cell: shape cells of width dd; the upper bound is the far face of the last cell
+          bbox_here[0] = PP->bbox[0] + (ibbox_here[0]) * dd;
+          bbox_here[3] = PP->bbox[0] + (ibbox_here[3] + 1) * dd;
+
+          dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1];
+          bbox_here[1] = PP->bbox[1] + (ibbox_here[1]) * dd;
+          bbox_here[4] = PP->bbox[1] + (ibbox_here[4] + 1) * dd;
+
+          dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2];
+          bbox_here[2] = PP->bbox[2] + (ibbox_here[2]) * dd;
+          bbox_here[5] = PP->bbox[2] + (ibbox_here[5] + 1) * dd;
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+
+#ifdef USE_GPU_DIVIDE
+          {
+            const int pices = 2; // each piece becomes two blocks, weighted by cpu_part and gpu_part
+            double picef[pices];
+            picef[0] = cpu_part;
+            picef[1] = gpu_part;
+            int shape_res[dim * pices];
+            double bbox_res[2 * dim * pices];
+            misc::dividBlock(dim, shape_here, bbox_here, pices, picef, shape_res, bbox_res, min_width); // presumably fills shape_res / bbox_res with one entry per part
+            ng = ng0 = new Block(dim, shape_res, bbox_res, n_rank++, ingfsi, fngfsi, PP->lev, 0); // delete through KillBlocks
+
+            //	       if(n_rank==cpusize) {n_rank=0; cerr<<"place one!!"<<endl;}
+
+            //	       ng->checkBlock();
+            if (BlL)
+              BlL->insert(ng);
+            else
+              BlL = new MyList<Block>(ng); // delete through KillBlocks
+
+            for (int i = 1; i < pices; i++) // this i shadows the outer piece index only inside this loop
+            {
+              ng = new Block(dim, shape_res + i * dim, bbox_res + i * 2 * dim, n_rank++, ingfsi, fngfsi, PP->lev, i); // delete through KillBlocks
+              //	        if(n_rank==cpusize) {n_rank=0; cerr<<"place two!! "<<i<<endl;}
+              //	        ng->checkBlock();
+              BlL->insert(ng);
+            }
+          }
+#else
+          ng = ng0 = new Block(dim, shape_here, bbox_here, n_rank++, ingfsi, fngfsi, PP->lev); // delete through KillBlocks
+          //	    ng->checkBlock();
+          if (BlL)
+            BlL->insert(ng);
+          else
+            BlL = new MyList<Block>(ng); // delete through KillBlocks
+#endif
+          if (n_rank == cpusize) // wrap the counter so that blocks keep cycling through 0 .. cpusize - 1
+            n_rank = 0;
+
+          // set PP->blb
+          if (i == 0 && j == 0 && k == 0) // first piece of this patch: remember the list node holding its first block
+          {
+            MyList<Block> *Bp = BlL;
+            while (Bp->data != ng0)
+              Bp = Bp->next; // ng0 is the first of the pices list
+            PP->blb = Bp;
+          }
+        }
+    // set PP->ble
+    {
+      MyList<Block> *Bp = BlL; // walk from the head to the node holding the last block created for this patch
+      while (Bp->data != ng)
+        Bp = Bp->next; // ng is the last of the pices list
+      PP->ble = Bp;
+    }
+    PLi = PLi->next;
+  }
+  if (reacpu < nodes * 2 / 3) // fewer pieces than two thirds of nodes: rank 0 prints a notice, nothing else changes
+  {
+    int myrank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+    if (myrank == 0)
+      cout << "Parallel::distribute CAUSTION: level#" << lev << " uses essencially " << reacpu << " processors vs " << nodes << " nodes run, your scientific computation scale is not as large as you estimate." << endl;
+  }
+
+  return BlL;
+}
+MyList<Block> *Parallel::distribute_hard(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
+                                    bool periodic, int nodes)
+{
+#ifdef USE_GPU_DIVIDE
+  double cpu_part, gpu_part;
+  map<string, double>::iterator iter;
+  iter = parameters::dou_par.find("cpu part");
+  if (iter != parameters::dou_par.end())
+  {
+    cpu_part = iter->second;
+  }
+  else
+  {
+    int myrank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+    // read parameter from file
+    const int LEN = 256;
+    char pline[LEN];
+    string str, sgrp, skey, sval;
+    int sind;
+    char pname[50];
+    {
+      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
+      if (iter != parameters::str_par.end())
+      {
+        strcpy(pname, (iter->second).c_str());
+      }
+      else
+      {
+        cout << "Error inputpar" << endl;
+        exit(0);
+      }
+    }
+    ifstream inf(pname, ifstream::in);
+    if (!inf.good() && myrank == 0)
+    {
+      cout << "Can not open parameter file " << pname << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    for (int i = 1; inf.good(); i++)
+    {
+      inf.getline(pline, LEN);
+      str = pline;
+
+      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
+      if (status == -1)
+      {
+        cout << "error reading parameter file " << pname << " in line " << i << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+      else if (status == 0)
+        continue;
+
+      if (sgrp == "ABE")
+      {
+        if (skey == "cpu part")
+          cpu_part = atof(sval.c_str());
+      }
+    }
+    inf.close();
+
+    parameters::dou_par.insert(map<string, double>::value_type("cpu part", cpu_part));
+  }
+  iter = parameters::dou_par.find("gpu part");
+  if (iter != parameters::dou_par.end())
+  {
+    gpu_part = iter->second;
+  }
+  else
+  {
+    int myrank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+    // read parameter from file
+    const int LEN = 256;
+    char pline[LEN];
+    string str, sgrp, skey, sval;
+    int sind;
+    char pname[50];
+    {
+      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
+      if (iter != parameters::str_par.end())
+      {
+        strcpy(pname, (iter->second).c_str());
+      }
+      else
+      {
+        cout << "Error inputpar" << endl;
+        exit(0);
+      }
+    }
+    ifstream inf(pname, ifstream::in);
+    if (!inf.good() && myrank == 0)
+    {
+      cout << "Can not open parameter file " << pname << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    for (int i = 1; inf.good(); i++)
+    {
+      inf.getline(pline, LEN);
+      str = pline;
+
+      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
+      if (status == -1)
+      {
+        cout << "error reading parameter file " << pname << " in line " << i << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+      else if (status == 0)
+        continue;
+
+      if (sgrp == "ABE")
+      {
+        if (skey == "gpu part")
+          gpu_part = atof(sval.c_str());
+      }
+    }
+    inf.close();
+
+    parameters::dou_par.insert(map<string, double>::value_type("gpu part", gpu_part));
+  }
+
+  if (nodes == 0)
+    nodes = cpusize / 2;
+#else
+  if (nodes == 0)
+    nodes = cpusize;
+#endif
+
+  if (dim != 3)
+  {
+    cout << "distrivute: now we only support 3-dimension" << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+
+  MyList<Block> *BlL = 0;
+  int split_size, min_size, block_size = 0;
+
+  int min_width = 2 * Mymax(ghost_width, buffer_width);
+  int nxyz[dim], mmin_width[dim], min_shape[dim];
+
+  MyList<Patch> *PLi = PatchLIST;
+  for (int i = 0; i < dim; i++)
+    min_shape[i] = PLi->data->shape[i];
+  int lev = PLi->data->lev;
+  PLi = PLi->next;
+  while (PLi)
+  {
+    Patch *PP = PLi->data;
+    for (int i = 0; i < dim; i++)
+      min_shape[i] = Mymin(min_shape[i], PP->shape[i]);
+    if (lev != PLi->data->lev)
+      cout << "Parallel::distribute CAUSTION: meet Patches for different level: " << lev << " and " << PLi->data->lev << endl;
+    PLi = PLi->next;
+  }
+
+  for (int i = 0; i < dim; i++)
+    mmin_width[i] = Mymin(min_width, min_shape[i]);
+
+  min_size = mmin_width[0];
+  for (int i = 1; i < dim; i++)
+    min_size = min_size * mmin_width[i];
+
+  PLi = PatchLIST;
+  while (PLi)
+  {
+    Patch *PP = PLi->data;
+    //    PP->checkPatch(true);
+    int bs = PP->shape[0];
+    for (int i = 1; i < dim; i++)
+      bs = bs * PP->shape[i];
+    block_size = block_size + bs;
+    PLi = PLi->next;
+  }
+  split_size = Mymax(min_size, block_size / nodes);
+  split_size = Mymax(1, split_size);
+
+  int n_rank = 0;
+  PLi = PatchLIST;
+  int reacpu = 0;
+  int current_block_id = 0;
+  while (PLi) {
+    Block *ng0, *ng;
+    bool first_block_in_patch = true; 
+    Patch *PP = PLi->data;
+    reacpu += partition3(nxyz, split_size, mmin_width, nodes, PP->shape);
+
+    for (int i = 0; i < nxyz[0]; i++)
+    for (int j = 0; j < nxyz[1]; j++)
+    for (int k = 0; k < nxyz[2]; k++)
+    {
+        // --- 1. 定义局部变量 ---
+        int ibbox_here[6], shape_here[3];
+        double bbox_here[6], dd;
+        Block *current_ng_start = nullptr; // 本次循环产生的第一个(或唯一一个)块
+
+        // --- 2. 核心逻辑分支 ---
+        if (current_block_id == 27 || current_block_id == 28 ||
+            current_block_id == 35 || current_block_id == 36)
+        {
+            // A. 计算原始索引 (不带 Ghost)
+            int ib0 = (PP->shape[0] * i) / nxyz[0];
+            int ib3 = (PP->shape[0] * (i + 1)) / nxyz[0] - 1;
+            int jb1 = (PP->shape[1] * j) / nxyz[1];
+            int jb4 = (PP->shape[1] * (j + 1)) / nxyz[1] - 1;
+            int kb2 = (PP->shape[2] * k) / nxyz[2];
+            int kb5 = (PP->shape[2] * (k + 1)) / nxyz[2] - 1;
+
+            int r_l, r_r;
+            if(current_block_id == 27)      { r_l = 26; r_r = 27; }
+            else if(current_block_id == 28) { r_l = 28; r_r = 29; }
+            else if(current_block_id == 35) { r_l = 34; r_r = 35; }
+            else                            { r_l = 36; r_r = 37; }
+            Block * split_first_block = nullptr;
+            Block * split_last_block = nullptr; 
+            // 拆分逻辑：该函数应更新类成员变量 split_first_block 和 split_last_block
+            splitHotspotBlock(BlL, dim, ib0, ib3, jb1, jb4, kb2, kb5, 
+                              PP, r_l, r_r, ingfsi, fngfsi, periodic,split_first_block,split_last_block);
+            
+            current_ng_start = split_first_block;
+            ng = split_last_block; 
+        }
+        else 
+        {
+            // B. 普通块逻辑 (含 Ghost 扩张)
+            ibbox_here[0] = (PP->shape[0] * i) / nxyz[0];
+            ibbox_here[3] = (PP->shape[0] * (i + 1)) / nxyz[0] - 1;
+            ibbox_here[1] = (PP->shape[1] * j) / nxyz[1];
+            ibbox_here[4] = (PP->shape[1] * (j + 1)) / nxyz[1] - 1;
+            ibbox_here[2] = (PP->shape[2] * k) / nxyz[2];
+            ibbox_here[5] = (PP->shape[2] * (k + 1)) / nxyz[2] - 1;
+
+            if (periodic) {
+                for(int d=0; d<3; d++) {
+                    ibbox_here[d] -= ghost_width;
+                    ibbox_here[d+3] += ghost_width;
+                }
+            } else {
+                ibbox_here[0] = Mymax(0, ibbox_here[0] - ghost_width);
+                ibbox_here[3] = Mymin(PP->shape[0] - 1, ibbox_here[3] + ghost_width);
+                ibbox_here[1] = Mymax(0, ibbox_here[1] - ghost_width);
+                ibbox_here[4] = Mymin(PP->shape[1] - 1, ibbox_here[4] + ghost_width);
+                ibbox_here[2] = Mymax(0, ibbox_here[2] - ghost_width);
+                ibbox_here[5] = Mymin(PP->shape[2] - 1, ibbox_here[5] + ghost_width);
+            }
+
+            for(int d=0; d<3; d++) shape_here[d] = ibbox_here[d+3] - ibbox_here[d] + 1;
+
+            // 物理坐标计算 (根据你的宏定义 Cell/Vertex)
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+          // 0--4, 5--10
+          dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1);
+          bbox_here[0] = PP->bbox[0] + ibbox_here[0] * dd;
+          bbox_here[3] = PP->bbox[0] + ibbox_here[3] * dd;
+
+          dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1);
+          bbox_here[1] = PP->bbox[1] + ibbox_here[1] * dd;
+          bbox_here[4] = PP->bbox[1] + ibbox_here[4] * dd;
+
+          dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1);
+          bbox_here[2] = PP->bbox[2] + ibbox_here[2] * dd;
+          bbox_here[5] = PP->bbox[2] + ibbox_here[5] * dd;
+#else
+#ifdef Cell
+          // 0--5, 5--10
+          dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0];
+          bbox_here[0] = PP->bbox[0] + (ibbox_here[0]) * dd;
+          bbox_here[3] = PP->bbox[0] + (ibbox_here[3] + 1) * dd;
+
+          dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1];
+          bbox_here[1] = PP->bbox[1] + (ibbox_here[1]) * dd;
+          bbox_here[4] = PP->bbox[1] + (ibbox_here[4] + 1) * dd;
+
+          dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2];
+          bbox_here[2] = PP->bbox[2] + (ibbox_here[2]) * dd;
+          bbox_here[5] = PP->bbox[2] + (ibbox_here[5] + 1) * dd;
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+            ng = createMappedBlock(BlL, dim, shape_here, bbox_here, current_block_id, ingfsi, fngfsi, PP->lev);
+            current_ng_start = ng;
+        }
+
+        // --- 3. 统一处理 Patch 起始 Block 指针 ---
+        if (first_block_in_patch) {
+            ng0 = current_ng_start;
+            
+            // 立即设置 PP->blb，避免后续循环覆盖 ng0
+            MyList<Block> *Bp_start = BlL;
+            while (Bp_start && Bp_start->data != ng0) Bp_start = Bp_start->next;
+            PP->blb = Bp_start;
+            
+            first_block_in_patch = false;
+        }
+        
+        current_block_id++;
+    }
+
+    // --- 4. 设置 Patch 结束 Block 指针 ---
+    MyList<Block> *Bp_end = BlL;
+    while (Bp_end && Bp_end->data != ng) Bp_end = Bp_end->next;
+    PP->ble = Bp_end;
+
+    PLi = PLi->next;
+    first_block_in_patch = true; 
+  }
+  if (reacpu < nodes * 2 / 3)
+  {
+    int myrank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+    if (myrank == 0)
+      cout << "Parallel::distribute CAUSTION: level#" << lev << " uses essencially " << reacpu << " processors vs " << nodes << " nodes run, your scientific computation scale is not as large as you estimate." << endl;
+  }
+
+  return BlL;
+}
+
+ /**
+ * @brief Split one hot-spot block in half along x and append both halves to BlL.
+ *
+ * The un-ghosted x index range [ib0_orig, ib3_orig] is cut at its midpoint; each
+ * half is then ghost-expanded, gets physical bounds following the Vertex/Cell
+ * convention selected at compile time, and is created on its own rank.
+ *
+ * @param BlL       block list to append to (created when it is null)
+ * @param _dim      unused; Block is still constructed with `dim`
+ * @param ib0_orig,ib3_orig  inclusive un-ghosted x index range to bisect
+ * @param jb1_orig,jb4_orig  inclusive un-ghosted y index range, shared by both halves
+ * @param kb2_orig,kb5_orig  inclusive un-ghosted z index range, shared by both halves
+ * @param PP        patch supplying shape, bbox and lev
+ * @param r_left,r_right     ranks for the lower-x and the upper-x half
+ * @param ingfsi,fngfsi      forwarded unchanged to the Block constructor
+ * @param periodic  when true, ghost zones are not clamped to the patch index range
+ * @param split_first_block,split_last_block  [out] lower-x half / upper-x half
+ * @return the lower-x half (the same pointer as split_first_block)
+ */
+Block* Parallel::splitHotspotBlock(MyList<Block>* &BlL, int _dim,
+                                 int ib0_orig, int ib3_orig,
+                                 int jb1_orig, int jb4_orig,
+                                 int kb2_orig, int kb5_orig,
+                                 Patch* PP, int r_left, int r_right,
+                                 int ingfsi, int fngfsi, bool periodic,
+                                 Block* &split_first_block, Block* &split_last_block)
+{
+    // Bisect the un-ghosted x range: left = [ib0, mid], right = [mid + 1, ib3].
+    const int mid = (ib0_orig + ib3_orig) / 2;
+    int indices_L[6] = {ib0_orig, jb1_orig, kb2_orig, mid, jb4_orig, kb5_orig};
+    int indices_R[6] = {mid + 1, jb1_orig, kb2_orig, ib3_orig, jb4_orig, kb5_orig};
+
+    // Builds one half from its raw bounds ([0..2] lower, [3..5] upper) and appends it to BlL.
+    auto createSubBlock = [&](int* ib_raw, int target_rank) {
+        int ib_final[6];
+        int sh_here[3];
+        double bb_here[6];
+        for (int d = 0; d < 3; d++) {
+            // Ghost expansion; a non-periodic patch clamps to [0, shape - 1].
+            ib_final[d] = ib_raw[d] - ghost_width;
+            ib_final[d + 3] = ib_raw[d + 3] + ghost_width;
+            if (!periodic) {
+                ib_final[d] = Mymax(0, ib_final[d]);
+                ib_final[d + 3] = Mymin(PP->shape[d] - 1, ib_final[d + 3]);
+            }
+            sh_here[d] = ib_final[d + 3] - ib_final[d] + 1;
+            // Physical bounds of the expanded index range in direction d.
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+            const double dd = (PP->bbox[d + 3] - PP->bbox[d]) / (PP->shape[d] - 1);
+            bb_here[d] = PP->bbox[d] + ib_final[d] * dd;
+            bb_here[d + 3] = PP->bbox[d] + ib_final[d + 3] * dd;
+#else
+#ifdef Cell
+            const double dd = (PP->bbox[d + 3] - PP->bbox[d]) / PP->shape[d];
+            bb_here[d] = PP->bbox[d] + ib_final[d] * dd;
+            bb_here[d + 3] = PP->bbox[d] + (ib_final[d + 3] + 1) * dd;
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+        }
+        Block* Bg = new Block(dim, sh_here, bb_here, target_rank, ingfsi, fngfsi, PP->lev);
+        if (BlL) BlL->insert(Bg);
+        else     BlL = new MyList<Block>(Bg);
+        return Bg;
+    };
+
+    split_first_block = createSubBlock(indices_L, r_left);
+    split_last_block  = createSubBlock(indices_R, r_right);
+    return split_first_block; // the function is declared to return Block*
+}
+
+
+/**
+ * @brief Allocate a Block on its (possibly redirected) rank and append it to BlL.
+ */
+  Block* Parallel::createMappedBlock(MyList<Block>* &BlL, int _dim, int* shape, double* bbox,
+                        int block_id, int ingfsi, int fngfsi, int lev)
+  {
+      // Four block ids are redirected to a neighbouring rank; every other id is its own rank.
+      int owner_rank;
+      switch (block_id) {
+          case 26: owner_rank = 25; break;
+          case 29: owner_rank = 30; break;
+          case 34: owner_rank = 33; break;
+          case 37: owner_rank = 38; break;
+          default: owner_rank = block_id; break;
+      }
+      Block* created = new Block(dim, shape, bbox, owner_rank, ingfsi, fngfsi, lev);
+      if (!BlL) BlL = new MyList<Block>(created); // first block starts the list
+      else      BlL->insert(created);
+      return created;
+  }
+
+
+
+
+#elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
+MyList<Block> *Parallel::distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
+                                    bool periodic, int start_rank, int end_rank, int nodes)
+{ // Cut each patch of PatchLIST into blocks, give them ranks start_rank..end_rank in turn, return the block list.
+#ifdef USE_GPU_DIVIDE
+  double cpu_part, gpu_part; // NOTE(review): left uninitialised when the key is in neither the map nor the file
+  map<string, double>::iterator iter;
+  iter = parameters::dou_par.find("cpu part");
+  if (iter != parameters::dou_par.end())
+  {
+    cpu_part = iter->second; // value already cached in parameters::dou_par
+  }
+  else
+  {
+    int myrank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+    // not cached: scan the parameter file named by "inputpar" for the ABE entry "cpu part"
+    const int LEN = 256;
+    char pline[LEN];
+    string str, sgrp, skey, sval;
+    int sind;
+    char pname[50]; // NOTE(review): filled by an unbounded strcpy below
+    {
+      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
+      if (iter != parameters::str_par.end())
+      {
+        strcpy(pname, (iter->second).c_str());
+      }
+      else
+      {
+        cout << "Error inputpar" << endl;
+        exit(0); // NOTE(review): exit status 0 on an error path
+      }
+    }
+    ifstream inf(pname, ifstream::in);
+    if (!inf.good() && myrank == 0) // only rank 0 prints and aborts
+    {
+      cout << "Can not open parameter file " << pname << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+    // i counts lines from 1 and is only used in the error message
+    for (int i = 1; inf.good(); i++)
+    {
+      inf.getline(pline, LEN);
+      str = pline;
+
+      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
+      if (status == -1)
+      {
+        cout << "error reading parameter file " << pname << " in line " << i << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+      else if (status == 0)
+        continue; // presumably a line without a usable entry — TODO confirm against misc::parse_parts
+
+      if (sgrp == "ABE")
+      {
+        if (skey == "cpu part")
+          cpu_part = atof(sval.c_str());
+      }
+    }
+    inf.close();
+    // cache the value so that later calls take the lookup branch
+    parameters::dou_par.insert(map<string, double>::value_type("cpu part", cpu_part));
+  }
+  iter = parameters::dou_par.find("gpu part");
+  if (iter != parameters::dou_par.end())
+  {
+    gpu_part = iter->second; // value already cached in parameters::dou_par
+  }
+  else
+  {
+    int myrank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+    // not cached: scan the parameter file named by "inputpar" for the ABE entry "gpu part"
+    const int LEN = 256;
+    char pline[LEN];
+    string str, sgrp, skey, sval;
+    int sind;
+    char pname[50]; // NOTE(review): filled by an unbounded strcpy below
+    {
+      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
+      if (iter != parameters::str_par.end())
+      {
+        strcpy(pname, (iter->second).c_str());
+      }
+      else
+      {
+        cout << "Error inputpar" << endl;
+        exit(0); // NOTE(review): exit status 0 on an error path
+      }
+    }
+    ifstream inf(pname, ifstream::in);
+    if (!inf.good() && myrank == 0) // only rank 0 prints and aborts
+    {
+      cout << "Can not open parameter file " << pname << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+    // i counts lines from 1 and is only used in the error message
+    for (int i = 1; inf.good(); i++)
+    {
+      inf.getline(pline, LEN);
+      str = pline;
+
+      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
+      if (status == -1)
+      {
+        cout << "error reading parameter file " << pname << " in line " << i << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+      else if (status == 0)
+        continue; // presumably a line without a usable entry — TODO confirm against misc::parse_parts
+
+      if (sgrp == "ABE")
+      {
+        if (skey == "gpu part")
+          gpu_part = atof(sval.c_str());
+      }
+    }
+    inf.close();
+    // cache the value so that later calls take the lookup branch
+    parameters::dou_par.insert(map<string, double>::value_type("gpu part", gpu_part));
+  }
+  // nodes == 0 means "use the default": half the ranks here, all ranks in the non-GPU build
+  if (nodes == 0)
+    nodes = cpusize / 2;
+#else
+  if (nodes == 0)
+    nodes = cpusize;
+#endif
+
+  if (dim != 3)
+  {
+    cout << "distrivute: now we only support 3-dimension" << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+
+  MyList<Block> *BlL = 0; // result list, grown one block at a time
+  // split_size: target points per block; min_size: lower limit for it; block_size: points summed over all patches
+  int split_size, min_size, block_size = 0;
+
+  int min_width = 2 * Mymax(ghost_width, buffer_width);
+  int nxyz[dim], mmin_width[dim], min_shape[dim];
+  // first pass: smallest extent per direction over all patches; print a warning when levels differ
+  MyList<Patch> *PLi = PatchLIST;
+  for (int i = 0; i < dim; i++)
+    min_shape[i] = PLi->data->shape[i];
+  int lev = PLi->data->lev;
+  PLi = PLi->next;
+  while (PLi)
+  {
+    Patch *PP = PLi->data;
+    for (int i = 0; i < dim; i++)
+      min_shape[i] = Mymin(min_shape[i], PP->shape[i]);
+    if (lev != PLi->data->lev)
+      cout << "Parallel::distribute CAUSTION: meet Patches for different level: " << lev << " and " << PLi->data->lev << endl;
+    PLi = PLi->next;
+  }
+  // the per-direction minimal width cannot exceed the smallest patch extent
+  for (int i = 0; i < dim; i++)
+    mmin_width[i] = Mymin(min_width, min_shape[i]);
+
+  min_size = mmin_width[0];
+  for (int i = 1; i < dim; i++)
+    min_size = min_size * mmin_width[i];
+  // second pass: total number of grid points over all patches
+  PLi = PatchLIST;
+  while (PLi)
+  {
+    Patch *PP = PLi->data;
+    //    PP->checkPatch(true);
+    int bs = PP->shape[0];
+    for (int i = 1; i < dim; i++)
+      bs = bs * PP->shape[i];
+    block_size = block_size + bs;
+    PLi = PLi->next;
+  }
+  split_size = Mymax(min_size, block_size / cpusize); // average points per rank, never below min_size
+  split_size = Mymax(1, split_size);
+
+  int n_rank = start_rank; // next rank to hand out
+  PLi = PatchLIST;
+  int reacpu = 0; // sum of the partition3 return values, compared with nodes at the end
+  while (PLi)
+  {
+    Patch *PP = PLi->data;
+    // partition3 sets nxyz, which is used below as the piece count per direction
+    reacpu += partition3(nxyz, split_size, mmin_width, cpusize, PP->shape);
+
+    Block *ng, *ng0;
+    int shape_here[dim], ibbox_here[2 * dim];
+    double bbox_here[2 * dim], dd;
+
+    // ibbox holds inclusive index bounds in 0,...,N-1: entries [0..2] lower, [3..5] upper
+    for (int i = 0; i < nxyz[0]; i++)
+      for (int j = 0; j < nxyz[1]; j++)
+        for (int k = 0; k < nxyz[2]; k++)
+        {
+          ibbox_here[0] = (PP->shape[0] * i) / nxyz[0];
+          ibbox_here[3] = (PP->shape[0] * (i + 1)) / nxyz[0] - 1;
+          ibbox_here[1] = (PP->shape[1] * j) / nxyz[1];
+          ibbox_here[4] = (PP->shape[1] * (j + 1)) / nxyz[1] - 1;
+          ibbox_here[2] = (PP->shape[2] * k) / nxyz[2];
+          ibbox_here[5] = (PP->shape[2] * (k + 1)) / nxyz[2] - 1;
+          // widen by ghost_width on every side; only the non-periodic case clamps to the patch
+          if (periodic)
+          {
+            ibbox_here[0] = ibbox_here[0] - ghost_width;
+            ibbox_here[3] = ibbox_here[3] + ghost_width;
+            ibbox_here[1] = ibbox_here[1] - ghost_width;
+            ibbox_here[4] = ibbox_here[4] + ghost_width;
+            ibbox_here[2] = ibbox_here[2] - ghost_width;
+            ibbox_here[5] = ibbox_here[5] + ghost_width;
+          }
+          else
+          {
+            ibbox_here[0] = Mymax(0, ibbox_here[0] - ghost_width);
+            ibbox_here[3] = Mymin(PP->shape[0] - 1, ibbox_here[3] + ghost_width);
+            ibbox_here[1] = Mymax(0, ibbox_here[1] - ghost_width);
+            ibbox_here[4] = Mymin(PP->shape[1] - 1, ibbox_here[4] + ghost_width);
+            ibbox_here[2] = Mymax(0, ibbox_here[2] - ghost_width);
+            ibbox_here[5] = Mymin(PP->shape[2] - 1, ibbox_here[5] + ghost_width);
+          }
+          // point counts of the widened block
+          shape_here[0] = ibbox_here[3] - ibbox_here[0] + 1;
+          shape_here[1] = ibbox_here[4] - ibbox_here[1] + 1;
+          shape_here[2] = ibbox_here[5] - ibbox_here[2] + 1;
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+          // 0--4, 5--10
+          dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1);
+          bbox_here[0] = PP->bbox[0] + ibbox_here[0] * dd;
+          bbox_here[3] = PP->bbox[0] + ibbox_here[3] * dd;
+
+          dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1);
+          bbox_here[1] = PP->bbox[1] + ibbox_here[1] * dd;
+          bbox_here[4] = PP->bbox[1] + ibbox_here[4] * dd;
+
+          dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1);
+          bbox_here[2] = PP->bbox[2] + ibbox_here[2] * dd;
+          bbox_here[5] = PP->bbox[2] + ibbox_here[5] * dd;
+#else
+#ifdef Cell
+          // 0--5, 5--10
+          dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0];
+          bbox_here[0] = PP->bbox[0] + (ibbox_here[0]) * dd;
+          bbox_here[3] = PP->bbox[0] + (ibbox_here[3] + 1) * dd;
+
+          dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1];
+          bbox_here[1] = PP->bbox[1] + (ibbox_here[1]) * dd;
+          bbox_here[4] = PP->bbox[1] + (ibbox_here[4] + 1) * dd;
+
+          dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2];
+          bbox_here[2] = PP->bbox[2] + (ibbox_here[2]) * dd;
+          bbox_here[5] = PP->bbox[2] + (ibbox_here[5] + 1) * dd;
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+
+#ifdef USE_GPU_DIVIDE
+          {
+            const int pices = 2; // two pieces per block, weighted by cpu_part and gpu_part
+            double picef[pices];
+            picef[0] = cpu_part;
+            picef[1] = gpu_part;
+            int shape_res[dim * pices];
+            double bbox_res[2 * dim * pices];
+            misc::dividBlock(dim, shape_here, bbox_here, pices, picef, shape_res, bbox_res, min_width);
+            ng = ng0 = new Block(dim, shape_res, bbox_res, n_rank++, ingfsi, fngfsi, PP->lev, 0); // delete through KillBlocks
+            // ng->checkBlock();
+            if (BlL)
+              BlL->insert(ng);
+            else
+              BlL = new MyList<Block>(ng); // delete through KillBlocks
+
+            for (int i = 1; i < pices; i++)
+            {
+              ng = new Block(dim, shape_res + i * dim, bbox_res + i * 2 * dim, n_rank++, ingfsi, fngfsi, PP->lev, i); // delete through KillBlocks
+              // ng->checkBlock();
+              BlL->insert(ng);
+            }
+          }
+#else
+          ng = ng0 = new Block(dim, shape_here, bbox_here, n_rank++, ingfsi, fngfsi, PP->lev); // delete through KillBlocks
+          // ng->checkBlock();
+          if (BlL)
+            BlL->insert(ng);
+          else
+            BlL = new MyList<Block>(ng); // delete through KillBlocks
+#endif
+          // wrap to start_rank after end_rank; NOTE(review): with USE_GPU_DIVIDE n_rank advances by 2 per block, so this equality can be stepped over — confirm
+          if (n_rank == end_rank + 1)
+            n_rank = start_rank;
+
+          // PP->blb: list node of the first block created for this patch
+          if (i == 0 && j == 0 && k == 0)
+          {
+            MyList<Block> *Bp = BlL;
+            while (Bp->data != ng0)
+              Bp = Bp->next; // ng0 is the first of the pices list
+            PP->blb = Bp;
+          }
+        }
+    // PP->ble: list node of the last block created for this patch
+    {
+      MyList<Block> *Bp = BlL;
+      while (Bp->data != ng)
+        Bp = Bp->next; // ng is the last of the pices list
+      PP->ble = Bp;
+    }
+    PLi = PLi->next;
+  }
+  if (reacpu < nodes * 2 / 3) // warn when the partition3 total is below two thirds of nodes
+  {
+    int myrank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+    if (myrank == start_rank) // printed by one rank only
+      cout << "Parallel::distribute CAUSTION: level#" << lev << " uses essencially " << reacpu << " processors vs " << nodes << " nodes run, your scientific computation scale is not as large as you estimate." << endl;
+  }
+
+  return BlL;
+}
+#endif
+void Parallel::setfunction(MyList<Block> *BlL, var *vn, double func(double x, double y, double z))
+{
+  // Set grid function vn->sgfn to func(x, y, z) at every point of every block in BlL.
+  for (MyList<Block> *node = BlL; node; node = node->next)
+  {
+    Block *blk = node->data;
+    if (!blk->X[0])
+      continue; // block without coordinate arrays: nothing to fill
+    const int nx = blk->shape[0], ny = blk->shape[1], nz = blk->shape[2];
+    double *dst = blk->fgfs[vn->sgfn];
+    // Flat offset has x fastest; the direct loops visit the points in ascending
+    // offset order without a helper call (and its bookkeeping) per point.
+    for (int iz = 0; iz < nz; iz++)
+      for (int iy = 0; iy < ny; iy++)
+        for (int ix = 0; ix < nx; ix++)
+          dst[ix + nx * (iy + ny * iz)] = func(blk->X[0][ix], blk->X[1][iy], blk->X[2][iz]);
+  }
+}
+// Set vn->sgfn to func(x, y, z), but only on the blocks whose rank equals `rank`.
+void Parallel::setfunction(int rank, MyList<Block> *BlL, var *vn, double func(double x, double y, double z))
+{
+  for (MyList<Block> *node = BlL; node; node = node->next)
+  {
+    Block *blk = node->data;
+    // Skip blocks without coordinate arrays and blocks that belong to another rank.
+    if (!blk->X[0] || blk->rank != rank)
+      continue;
+    const int nx = blk->shape[0], ny = blk->shape[1], nz = blk->shape[2];
+    double *dst = blk->fgfs[vn->sgfn];
+    // Flat offset has x fastest; the direct loops visit the points in ascending
+    // offset order without a helper call (and its bookkeeping) per point.
+    for (int iz = 0; iz < nz; iz++)
+      for (int iy = 0; iy < ny; iy++)
+        for (int ix = 0; ix < nx; ix++)
+          dst[ix + nx * (iy + ny * iz)] = func(blk->X[0][ix], blk->X[1][iy], blk->X[2][iz]);
+  }
+}
+void Parallel::getarrayindex(int DIM, int *shape, int *index, int n)
+{
+  // Decompose flat offset n into index[0..DIM-1] for extents shape[], dimension 0 fastest.
+  // index must already have room for DIM ints. No scratch array is allocated: the
+  // stride of the last dimension is built once and divided back down step by step.
+  int stride = 1;
+  for (int i = 0; i < DIM - 1; i++)
+    stride *= shape[i];
+  for (int i = DIM - 1; i >= 0; i--)
+  {
+    index[i] = n / stride;
+    n -= index[i] * stride;
+    if (i > 0)
+      stride /= shape[i - 1]; // exact: stride is a product that contains shape[i - 1]
+  }
+}
+int Parallel::getarraylocation(int DIM, int *shape, int *index)
+{
+  // Flat offset of index[] inside an array of extents shape[], dimension 0 fastest:
+  //   index[0] + index[1] * shape[0] + index[2] * shape[0] * shape[1] + ...
+  int offset = index[0];
+  int stride = shape[0]; // stride belonging to dimension 1
+  for (int d = 1; d < DIM; ++d)
+  {
+    offset += index[d] * stride;
+    stride *= shape[d]; // becomes the stride of dimension d + 1
+  }
+  return offset;
+}
+// Copy the points inside the box [llb, uub] from datain (extents shape, box
+// [llbin, uubin]) into DD (extents Dshape, box [llbout, uubout]), all DIM-dimensional.
+// Per direction the two index ranges must have equal length and the spacings must
+// pass feq(ho, hi, ho / 2); any violation prints the offending data and aborts MPI.
+void Parallel::copy(int DIM, double *llbout, double *uubout, int *Dshape, double *DD, double *llbin, double *uubin,
+                    int *shape, double *datain, double *llb, double *uub)
+{
+  // for 3 dimensional case, based on simple test, I found this is half slower than f90 code
+  int *illi = new int[DIM], *iuui = new int[DIM]; // inclusive index range of the box in datain
+  int *illo = new int[DIM], *iuuo = new int[DIM]; // inclusive index range of the box in DD
+  int *indi = new int[DIM], *indo = new int[DIM]; // running multi-indices into datain / DD
+
+  int ial = 1; // number of points to copy (product of the range lengths)
+  for (int i = 0; i < DIM; i++)
+  {
+    double ho, hi;
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+    ho = (uubout[i] - llbout[i]) / (Dshape[i] - 1);
+    hi = (uubin[i] - llbin[i]) / (shape[i] - 1);
+#else
+#ifdef Cell
+    ho = (uubout[i] - llbout[i]) / Dshape[i];
+    hi = (uubin[i] - llbin[i]) / shape[i];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+    illo[i] = int((llb[i] - llbout[i]) / ho);
+    iuuo[i] = Dshape[i] - 1 - int((uubout[i] - uub[i]) / ho);
+    illi[i] = int((llb[i] - llbin[i]) / hi);
+    iuui[i] = shape[i] - 1 - int((uubin[i] - uub[i]) / hi);
+
+    if (illo[i] > iuuo[i] || illi[i] > iuui[i] || illo[i] < 0 || illi[i] < 0 ||
+        iuui[i] >= shape[i] || iuuo[i] >= Dshape[i])
+    {
+      cout << "Parallel copy: in direction " << i << ":" << endl;
+      cout << "llb = " << llb[i] << ", uub = " << uub[i] << endl;
+      cout << " in data : il = " << illi[i] << ", iu = " << iuui[i] << endl;
+      cout << "bbox = (" << llbin[i] << "," << uubin[i] << ")" << endl;
+      cout << "shape = " << shape[i] << endl;
+      cout << "out data : il = " << illo[i] << ", iu = " << iuuo[i] << endl;
+      cout << "bbox = (" << llbout[i] << "," << uubout[i] << ")" << endl;
+      cout << "shape = " << Dshape[i] << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+    int ihi = iuui[i] - illi[i] + 1, iho = iuuo[i] - illo[i] + 1;
+    if (!(feq(ho, hi, ho / 2)) || ihi != iho)
+    {
+      cout << "Parallel copy: in direction " << i << ":" << endl;
+      cout << "Parallel copy: not the same grid structure." << endl;
+      cout << "hi = " << hi << ", bbox = (" << llbin[i] << "," << uubin[i] << "), shape = " << shape[i] << endl;
+      cout << "ho = " << ho << ", bbox = (" << llbout[i] << "," << uubout[i] << "), shape = " << Dshape[i] << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+    ial = ial * ihi;
+  }
+
+  for (int i = 0; i < DIM; i++)
+  {
+    indi[i] = illi[i];
+    indo[i] = illo[i];
+  }
+  /*
+  //check start index
+     for(int i=0;i<DIM;i++)
+     {
+       cout << "Parallel copy: in direction " <<i<<":"<< endl;
+       cout<<"start : indi = "<<indi[i]<<", indo = "<<indo[i]<<endl;
+     }
+  */
+  // total element counts of both arrays; valid flat offsets are 0 .. NN - 1
+  int NNi = 1, NNo = 1;
+  for (int i = 0; i < DIM; i++)
+  {
+    NNi = NNi * shape[i];
+    NNo = NNo * Dshape[i];
+  }
+  for (int i = 0; i < ial; i++)
+  {
+    int ni, no;
+    ni = getarraylocation(DIM, shape, indi);
+    no = getarraylocation(DIM, Dshape, indo);
+    if (no < 0 || no >= NNo) // offset NNo itself is one past the end
+    {
+      cout << "Parallel copy: no = " << no << " is out of array range (0," << NNo << ")." << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+    if (ni < 0 || ni >= NNi) // offset NNi itself is one past the end
+    {
+      cout << "Parallel copy: ni = " << ni << " is out of array range (0," << NNi << ")." << endl;
+      cout << "shape = (";
+      for (int j = 0; j < DIM; j++)
+      {
+        cout << shape[j];
+        if (j < DIM - 1)
+          cout << ",";
+        else
+          cout << ")" << endl;
+      }
+      cout << "ind = (";
+      for (int j = 0; j < DIM; j++)
+      {
+        cout << indi[j];
+        if (j < DIM - 1)
+          cout << ",";
+        else
+          cout << ")" << endl;
+      }
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+    DD[no] = datain[ni];
+
+    // advance both multi-indices like an odometer over the box, dimension 0 fastest
+    indi[0]++;
+    for (int j = 1; j < DIM; j++)
+    {
+      if (indi[j - 1] == iuui[j - 1] + 1)
+      {
+        indi[j - 1] = illi[j - 1];
+        indi[j]++;
+      } // carry 1 to next digital
+      else
+        break;
+    }
+    indo[0]++;
+    for (int j = 1; j < DIM; j++)
+    {
+      if (indo[j - 1] == iuuo[j - 1] + 1)
+      {
+        indo[j - 1] = illo[j - 1];
+        indo[j]++;
+      }
+      else
+        break;
+    }
+  }
+  /*
+  //check final index
+     for(int i=0;i<DIM;i++)
+     {
+       cout << "Parallel copy: in direction " <<i<<":"<< endl;
+       cout<<"final : indi = "<<indi[i]<<", indo = "<<indo[i]<<endl;
+     }
+  */
+  delete[] illi;
+  delete[] iuui;
+  delete[] illo;
+  delete[] iuuo;
+  delete[] indi;
+  delete[] indo;
+}
+void Parallel::writefile(double time, int nx, int ny, int nz, double xmin, double xmax, double ymin, double ymax,
+                         double zmin, double zmax, char *filename, double *data_out)
+{
+  // Raw record in native layout: time, nx, ny, nz, the six bounds, then nx*ny*nz doubles.
+  ofstream outfile(filename, ios::out | ios::trunc | ios::binary); // binary: bytes are written untranslated
+  if (!outfile)
+  {
+    cout << "Can't open " << filename << " for output." << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+  outfile.write((char *)&time, sizeof(double));
+  outfile.write((char *)&nx, sizeof(int));
+  outfile.write((char *)&ny, sizeof(int));
+  outfile.write((char *)&nz, sizeof(int));
+  outfile.write((char *)&xmin, sizeof(double));
+  outfile.write((char *)&xmax, sizeof(double));
+  outfile.write((char *)&ymin, sizeof(double));
+  outfile.write((char *)&ymax, sizeof(double));
+  outfile.write((char *)&zmin, sizeof(double));
+  outfile.write((char *)&zmax, sizeof(double));
+  outfile.write((char *)data_out, sizeof(double) * nx * ny * nz); // sizeof first: the product is not formed in int
+  outfile.close();
+}
+void Parallel::writefile(double time, int nx, int ny, double xmin, double xmax, double ymin, double ymax,
+                         char *filename, double *datain)
+{
+  // ASCII dump of an nx-by-ny slice: a "# t = <time>" header line, then one "x y value"
+  // row per point with x running fastest and an empty line after every row of constant y.
+  // The grid convention selects the spacing and where the first sample sits.
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+  const double xstep = (xmax - xmin) / (nx - 1);
+  const double ystep = (ymax - ymin) / (ny - 1);
+  const double shift = 0; // first sample lies on the lower bound
+#else
+#ifdef Cell
+  const double xstep = (xmax - xmin) / nx;
+  const double ystep = (ymax - ymin) / ny;
+  const double shift = 0.5; // first sample lies half a spacing above the lower bound
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+
+  // Coordinates of the sample points along each axis.
+  double *xs = new double[nx];
+  double *ys = new double[ny];
+  for (int ix = 0; ix < nx; ix++)
+    xs[ix] = xmin + (ix + shift) * xstep;
+  for (int iy = 0; iy < ny; iy++)
+    ys[iy] = ymin + (iy + shift) * ystep;
+
+  ofstream outfile(filename, ios::out | ios::trunc);
+  if (!outfile)
+  {
+    cout << "Can't open " << filename << " for output." << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+  outfile << "# t = " << time << endl;
+  const double *cell = datain; // walks datain in storage order, i.e. offset ix + iy * nx
+  for (int iy = 0; iy < ny; iy++)
+  {
+    for (int ix = 0; ix < nx; ix++, cell++)
+    {
+      outfile << setw(10) << setprecision(10) << xs[ix] << " "
+              << setw(10) << setprecision(10) << ys[iy] << " "
+              << setw(16) << setprecision(15) << *cell
+              << endl;
+    }
+    outfile << "\n"; /* blank line for gnuplot */
+  }
+  outfile.close();
+
+  delete[] xs;
+  delete[] ys;
+}
+// Write each variable of DumpList, for each block of BlL whose rank is the calling
+// rank, to its own binary file under the "output dir" parameter (3D writefile format).
+void Parallel::Dump_CPU_Data(MyList<Block> *BlL, MyList<var> *DumpList, char *tag, double time, double dT)
+{
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  // step counter used in the file name: time / dT plus 0.5, truncated to int
+  int ncount = int(time / dT + 0.5);
+
+  MyList<Block> *Bp;
+  while (DumpList)
+  {
+    Bp = BlL;
+    int Bi = 0; // position of the block within BlL, part of the file name
+    while (Bp)
+    {
+      Block *BP = Bp->data;
+      var *VP = DumpList->data;
+      if (BP->rank == myrank)
+      {
+        string out_dir;
+        map<string, string>::iterator iter = parameters::str_par.find("output dir");
+        if (iter != parameters::str_par.end())
+        {
+          out_dir = iter->second;
+        }
+        else
+        {
+          // not cached: scan the parameter file named by "inputpar" for the ABE entry "output dir"
+          const int LEN = 256;
+          char pline[LEN];
+          string str, sgrp, skey, sval;
+          int sind;
+          string pname; // a std::string holds a path of any length
+          {
+            map<string, string>::iterator iter = parameters::str_par.find("inputpar");
+            if (iter != parameters::str_par.end())
+            {
+              pname = iter->second;
+            }
+            else
+            {
+              cout << "Error inputpar" << endl;
+              exit(0);
+            }
+          }
+          ifstream inf(pname.c_str(), ifstream::in);
+          if (!inf.good())
+          {
+            cout << "Can not open parameter file " << pname << endl;
+            MPI_Abort(MPI_COMM_WORLD, 1);
+          }
+          // i counts lines from 1 and is only used in the error message
+          for (int i = 1; inf.good(); i++)
+          {
+            inf.getline(pline, LEN);
+            str = pline;
+
+            int status = misc::parse_parts(str, sgrp, skey, sval, sind);
+            if (status == -1)
+            {
+              cout << "error reading parameter file " << pname << " in line " << i << endl;
+              MPI_Abort(MPI_COMM_WORLD, 1);
+            }
+            else if (status == 0)
+              continue;
+
+            if (sgrp == "ABE")
+            {
+              if (skey == "output dir")
+                out_dir = sval;
+            }
+          }
+          inf.close();
+          // cache the value so that later lookups succeed without reading the file
+          parameters::str_par.insert(map<string, string>::value_type("output dir", out_dir));
+        }
+        // snprintf bounds the write: an over-long name is truncated, the buffer is never overrun
+        char filename[100];
+        if (tag)
+          snprintf(filename, sizeof(filename), "%s/%s_Lev%02d-%02d_%02d_%s_%05d.bin", out_dir.c_str(), tag, BP->lev, Bi, myrank, VP->name, ncount);
+        else
+          snprintf(filename, sizeof(filename), "%s/Lev%02d-%02d_%02d_%s_%05d.bin", out_dir.c_str(), BP->lev, Bi, myrank, VP->name, ncount);
+        writefile(time, BP->shape[0], BP->shape[1], BP->shape[2], BP->bbox[0], BP->bbox[3], BP->bbox[1], BP->bbox[4],
+                  BP->bbox[2], BP->bbox[5], filename, BP->fgfs[VP->sgfn]);
+        cout << "end of dump " << VP->name << " at time " << time << ", on node " << myrank << endl;
+      }
+      Bp = Bp->next;
+      Bi++;
+    }
+    DumpList = DumpList->next;
+  }
+}
+// Dump the data of one patch, including buffer points, into one file per variable.
+//
+// Rank 0 assembles the whole patch in a buffer of PP->shape: blocks it owns are
+// copied directly, blocks owned by other ranks are received with MPI_Recv from
+// the owner (which sends with MPI_Send). From each block only the region
+// [llb, uub] is copied: a block face that coincides with the patch face is kept,
+// any other face is moved inwards by ghost_width cells.
+//
+// Rank 0 then calls writefile() on
+//   <output dir>/[<tag>_]Lev<lev>-<grd>_<variable name>_<ncount>.bin
+// with ncount = int(time / dT + 0.5). The output directory is taken from
+// parameters::str_par["output dir"], or read from the "ABE" group of the
+// parameter file named by parameters::str_par["inputpar"] and then cached.
+void Parallel::Dump_Data(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT, int grd)
+{
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  //   round at 4 and 5
+  int ncount = int(time / dT + 0.5);
+
+  MPI_Status sta;
+  int DIM = 3;
+  double llb[3], uub[3];
+
+  double *databuffer = 0;
+  if (myrank == 0)
+  {
+    databuffer = (double *)malloc(sizeof(double) * PP->shape[0] * PP->shape[1] * PP->shape[2]);
+    if (!databuffer)
+    {
+      cout << "Parallel::Dump_Data: out of memory when dumping data." << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+  }
+
+  while (DumpList)
+  {
+    var *VP = DumpList->data;
+
+    MyList<Block> *Bp = PP->blb;
+    while (Bp)
+    {
+      Block *BP = Bp->data;
+      int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]);
+      if (myrank == 0)
+      {
+        // source of the block data: local storage, or a temporary filled by the owner
+        double *bufferhere = 0;
+        double *src = 0;
+        if (BP->rank == 0)
+        {
+          src = BP->fgfs[VP->sgfn];
+        }
+        else
+        {
+          bufferhere = (double *)malloc(sizeof(double) * nnn);
+          if (!bufferhere)
+          {
+            cout << "on node#" << myrank << ", out of memory when dumping data." << endl;
+            MPI_Abort(MPI_COMM_WORLD, 1);
+          }
+          MPI_Recv(bufferhere, nnn, MPI_DOUBLE, BP->rank, 0, MPI_COMM_WORLD, &sta);
+          src = bufferhere;
+        }
+
+        // keep faces shared with the patch, strip ghost_width cells from the others
+        for (int d = 0; d < 3; d++)
+        {
+          double DH = BP->getdX(d);
+          llb[d] = (feq(BP->bbox[d], PP->bbox[d], DH / 2)) ? BP->bbox[d] : BP->bbox[d] + ghost_width * DH;
+          uub[d] = (feq(BP->bbox[d + 3], PP->bbox[d + 3], DH / 2)) ? BP->bbox[d + 3] : BP->bbox[d + 3] - ghost_width * DH;
+        }
+        f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, src, llb, uub);
+
+        if (bufferhere)
+          free(bufferhere);
+      }
+      else if (myrank == BP->rank)
+      {
+        MPI_Send(BP->fgfs[VP->sgfn], nnn, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
+      }
+      if (Bp == PP->ble)
+        break;
+      Bp = Bp->next;
+    }
+    if (myrank == 0)
+    {
+      // resolve the output directory: cached parameter first, parameter file otherwise
+      string out_dir;
+      map<string, string>::iterator iter = parameters::str_par.find("output dir");
+      if (iter != parameters::str_par.end())
+      {
+        out_dir = iter->second;
+      }
+      else
+      {
+        // read parameter from file
+        const int LEN = 256;
+        char pline[LEN];
+        string str, sgrp, skey, sval;
+        int sind;
+        // a std::string avoids overflowing a fixed-size buffer on long paths
+        string pname;
+
+        map<string, string>::iterator piter = parameters::str_par.find("inputpar");
+        if (piter != parameters::str_par.end())
+        {
+          pname = piter->second;
+        }
+        else
+        {
+          cout << "Error inputpar" << endl;
+          // abort the whole MPI job with a failure status, like the other error paths here
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+
+        ifstream inf(pname.c_str(), ifstream::in);
+        if (!inf.good())
+        {
+          cout << "Can not open parameter file " << pname << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+
+        for (int i = 1; inf.good(); i++)
+        {
+          inf.getline(pline, LEN);
+          str = pline;
+
+          int status = misc::parse_parts(str, sgrp, skey, sval, sind);
+          if (status == -1)
+          {
+            cout << "error reading parameter file " << pname << " in line " << i << endl;
+            MPI_Abort(MPI_COMM_WORLD, 1);
+          }
+          else if (status == 0)
+            continue;
+
+          if (sgrp == "ABE" && skey == "output dir")
+            out_dir = sval;
+        }
+        inf.close();
+
+        parameters::str_par.insert(map<string, string>::value_type("output dir", out_dir));
+      }
+
+      // bounded formatting: a long output directory must not overrun the buffer
+      char filename[512];
+      int flen;
+      if (tag)
+        flen = snprintf(filename, sizeof(filename), "%s/%s_Lev%02d-%02d_%s_%05d.bin", out_dir.c_str(), tag, PP->lev, grd, VP->name, ncount);
+      else
+        flen = snprintf(filename, sizeof(filename), "%s/Lev%02d-%02d_%s_%05d.bin", out_dir.c_str(), PP->lev, grd, VP->name, ncount);
+      if (flen < 0 || flen >= int(sizeof(filename)))
+      {
+        cout << "Parallel::Dump_Data: output file name is too long." << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+
+      writefile(time, PP->shape[0], PP->shape[1], PP->shape[2], PP->bbox[0], PP->bbox[3], PP->bbox[1], PP->bbox[4],
+                PP->bbox[2], PP->bbox[5], filename, databuffer);
+    }
+    DumpList = DumpList->next;
+  }
+
+  if (myrank == 0)
+    free(databuffer);
+}
+// Dump every patch of the list PL; the position of a patch in the list
+// (starting at 0) is passed on as its grid number.
+void Parallel::Dump_Data(MyList<Patch> *PL, MyList<var> *DumpList, char *tag, double time, double dT)
+{
+  int grd = 0;
+  for (MyList<Patch> *node = PL; node; node = node->next, grd++)
+    Dump_Data(node->data, DumpList, tag, time, dT, grd);
+}
+// Collect the data of variable VP on patch PP, including buffer points.
+//
+// On rank 0 the return value is a malloc'ed buffer of PP->shape holding the
+// assembled patch; nothing in this function frees it. On every other rank the
+// function sends the blocks it owns to rank 0 and returns 0.
+double *Parallel::Collect_Data(Patch *PP, var *VP)
+{
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+  MPI_Status sta;
+  int DIM = 3;
+  double llb[3], uub[3];
+
+  double *databuffer = 0;
+  if (myrank == 0)
+  {
+    databuffer = (double *)malloc(sizeof(double) * PP->shape[0] * PP->shape[1] * PP->shape[2]);
+    if (!databuffer)
+    {
+      cout << "Parallel::Collect_Data: out of memory when dumping data." << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+  }
+
+  for (MyList<Block> *node = PP->blb; node; node = node->next)
+  {
+    Block *blk = node->data;
+    int npts = (blk->shape[0]) * (blk->shape[1]) * (blk->shape[2]);
+
+    if (myrank == 0)
+    {
+      // blocks of rank 0 are read in place, all others arrive through a temporary
+      double *recvbuf = 0;
+      double *src = 0;
+      if (blk->rank == 0)
+      {
+        src = blk->fgfs[VP->sgfn];
+      }
+      else
+      {
+        recvbuf = (double *)malloc(sizeof(double) * npts);
+        if (!recvbuf)
+        {
+          cout << "on node#" << myrank << ", out of memory when dumping data." << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+        MPI_Recv(recvbuf, npts, MPI_DOUBLE, blk->rank, 0, MPI_COMM_WORLD, &sta);
+        src = recvbuf;
+      }
+
+      // keep faces shared with the patch, strip ghost_width cells from the others
+      for (int d = 0; d < 3; d++)
+      {
+        double DH = blk->getdX(d);
+        if (feq(blk->bbox[d], PP->bbox[d], DH / 2))
+          llb[d] = blk->bbox[d];
+        else
+          llb[d] = blk->bbox[d] + ghost_width * DH;
+        if (feq(blk->bbox[d + 3], PP->bbox[d + 3], DH / 2))
+          uub[d] = blk->bbox[d + 3];
+        else
+          uub[d] = blk->bbox[d + 3] - ghost_width * DH;
+      }
+      f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, blk->bbox, blk->bbox + DIM, blk->shape, src, llb, uub);
+
+      if (recvbuf)
+        free(recvbuf);
+    }
+    else if (myrank == blk->rank)
+    {
+      MPI_Send(blk->fgfs[VP->sgfn], npts, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
+    }
+
+    // the block list of a patch ends at PP->ble, not necessarily at a null next
+    if (node == PP->ble)
+      break;
+  }
+
+  return databuffer;
+}
+// Dump a 2D slice of one patch (the z = 0 plane, per the original note) into one
+// text file per variable.
+//
+// Rank 0 assembles the whole patch, including buffer points, exactly as
+// Dump_Data does: blocks it owns are copied directly, other blocks are received
+// from their owner, and from each block only the region [llb, uub] is used
+// (faces shared with the patch are kept, other faces lose ghost_width cells).
+// f_d2dump() then reduces the 3D buffer to a PP->shape[0] x PP->shape[1] buffer,
+// which is written with the 2D writefile() to
+//   <output dir>/[<tag>_]2d_Lev<lev>-<grd>_<variable name>_<ncount>.dat
+// with ncount = int(time / dT + 0.5). The output directory is taken from
+// parameters::str_par["output dir"], or read from the "ABE" group of the
+// parameter file named by parameters::str_par["inputpar"] and then cached.
+void Parallel::d2Dump_Data(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT, int grd)
+{
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  //   round at 4 and 5
+  int ncount = int(time / dT + 0.5);
+
+  MPI_Status sta;
+  int DIM = 3;
+  double llb[3], uub[3];
+
+  double *databuffer = 0, *databuffer2 = 0;
+  if (myrank == 0)
+  {
+    databuffer = (double *)malloc(sizeof(double) * PP->shape[0] * PP->shape[1] * PP->shape[2]);
+    databuffer2 = (double *)malloc(sizeof(double) * PP->shape[0] * PP->shape[1]);
+    if (!databuffer || !databuffer2)
+    {
+      cout << "Parallel::d2Dump_Data: out of memory when dumping data." << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+  }
+
+  while (DumpList)
+  {
+    var *VP = DumpList->data;
+
+    MyList<Block> *Bp = PP->blb;
+    while (Bp)
+    {
+      Block *BP = Bp->data;
+      int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]);
+      if (myrank == 0)
+      {
+        // source of the block data: local storage, or a temporary filled by the owner
+        double *bufferhere = 0;
+        double *src = 0;
+        if (BP->rank == 0)
+        {
+          src = BP->fgfs[VP->sgfn];
+        }
+        else
+        {
+          bufferhere = (double *)malloc(sizeof(double) * nnn);
+          if (!bufferhere)
+          {
+            cout << "on node#" << myrank << ", out of memory when dumping data." << endl;
+            MPI_Abort(MPI_COMM_WORLD, 1);
+          }
+          MPI_Recv(bufferhere, nnn, MPI_DOUBLE, BP->rank, 0, MPI_COMM_WORLD, &sta);
+          src = bufferhere;
+        }
+
+        // keep faces shared with the patch, strip ghost_width cells from the others
+        for (int d = 0; d < 3; d++)
+        {
+          double DH = BP->getdX(d);
+          llb[d] = (feq(BP->bbox[d], PP->bbox[d], DH / 2)) ? BP->bbox[d] : BP->bbox[d] + ghost_width * DH;
+          uub[d] = (feq(BP->bbox[d + 3], PP->bbox[d + 3], DH / 2)) ? BP->bbox[d + 3] : BP->bbox[d + 3] - ghost_width * DH;
+        }
+        f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, src, llb, uub);
+
+        if (bufferhere)
+          free(bufferhere);
+      }
+      else if (myrank == BP->rank)
+      {
+        MPI_Send(BP->fgfs[VP->sgfn], nnn, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
+      }
+      if (Bp == PP->ble)
+        break;
+      Bp = Bp->next;
+    }
+    if (myrank == 0)
+    {
+      // resolve the output directory: cached parameter first, parameter file otherwise
+      string out_dir;
+      map<string, string>::iterator iter = parameters::str_par.find("output dir");
+      if (iter != parameters::str_par.end())
+      {
+        out_dir = iter->second;
+      }
+      else
+      {
+        // read parameter from file
+        const int LEN = 256;
+        char pline[LEN];
+        string str, sgrp, skey, sval;
+        int sind;
+        // a std::string avoids overflowing a fixed-size buffer on long paths
+        string pname;
+
+        map<string, string>::iterator piter = parameters::str_par.find("inputpar");
+        if (piter != parameters::str_par.end())
+        {
+          pname = piter->second;
+        }
+        else
+        {
+          cout << "Error inputpar" << endl;
+          // abort the whole MPI job with a failure status, like the other error paths here
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+
+        ifstream inf(pname.c_str(), ifstream::in);
+        if (!inf.good())
+        {
+          cout << "Can not open parameter file " << pname << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+
+        for (int i = 1; inf.good(); i++)
+        {
+          inf.getline(pline, LEN);
+          str = pline;
+
+          int status = misc::parse_parts(str, sgrp, skey, sval, sind);
+          if (status == -1)
+          {
+            cout << "error reading parameter file " << pname << " in line " << i << endl;
+            MPI_Abort(MPI_COMM_WORLD, 1);
+          }
+          else if (status == 0)
+            continue;
+
+          if (sgrp == "ABE" && skey == "output dir")
+            out_dir = sval;
+        }
+        inf.close();
+
+        parameters::str_par.insert(map<string, string>::value_type("output dir", out_dir));
+      }
+
+      // bounded formatting: a long output directory must not overrun the buffer
+      char filename[512];
+      int flen;
+      if (tag)
+        flen = snprintf(filename, sizeof(filename), "%s/%s_2d_Lev%02d-%02d_%s_%05d.dat", out_dir.c_str(), tag, PP->lev, grd, VP->name, ncount);
+      else
+        flen = snprintf(filename, sizeof(filename), "%s/2d_Lev%02d-%02d_%s_%05d.dat", out_dir.c_str(), PP->lev, grd, VP->name, ncount);
+      if (flen < 0 || flen >= int(sizeof(filename)))
+      {
+        cout << "Parallel::d2Dump_Data: output file name is too long." << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+
+      int gord = ghost_width;
+      f_d2dump(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, databuffer2, gord, VP->SoA);
+      writefile(time, PP->shape[0], PP->shape[1], PP->bbox[0], PP->bbox[3], PP->bbox[1], PP->bbox[4],
+                filename, databuffer2);
+    }
+    DumpList = DumpList->next;
+  }
+
+  if (myrank == 0)
+  {
+    free(databuffer);
+    free(databuffer2);
+  }
+}
+// Write the 2D dump of every patch of the list PL; the position of a patch in
+// the list (starting at 0) is passed on as its grid number.
+void Parallel::d2Dump_Data(MyList<Patch> *PL, MyList<var> *DumpList, char *tag, double time, double dT)
+{
+  int grd = 0;
+  for (MyList<Patch> *node = PL; node; node = node->next, grd++)
+    d2Dump_Data(node->data, DumpList, tag, time, dT, grd);
+}
+// Dump the data of the given patch including buffer points and ghost points.
+//
+// The target region is the patch bounding box enlarged by ghost_width cells on
+// every side (tllb/tuub/tshape). Rank 0 assembles it: blocks it owns are copied
+// directly, other blocks are received from their owner. From each block only
+// the region [llb, uub] is copied: a block face that coincides with the
+// enlarged box is kept, any other face is moved inwards by ghost_width cells.
+//
+// Rank 0 then calls writefile() on
+//   <output dir>/[<tag>_]Lev<lev>_<variable name>_<ncount>.bin
+// with ncount = int(time / dT + 0.5). The output directory is taken from
+// parameters::str_par["output dir"], or read from the "ABE" group of the
+// parameter file named by parameters::str_par["inputpar"] and then cached.
+void Parallel::Dump_Data0(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT)
+{
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  //   round at 4 and 5
+  int ncount = int(time / dT + 0.5);
+
+  MPI_Status sta;
+  int DIM = 3;
+  double llb[3], uub[3], tllb[3], tuub[3];
+  int tshape[3];
+
+  // enlarge the patch box by ghost_width cells in every direction
+  for (int i = 0; i < 3; i++)
+  {
+    double DH = PP->blb->data->getdX(i);
+    tshape[i] = PP->shape[i] + 2 * ghost_width;
+    tllb[i] = PP->bbox[i] - ghost_width * DH;
+    tuub[i] = PP->bbox[i + dim] + ghost_width * DH;
+  }
+
+  int NN = tshape[0] * tshape[1] * tshape[2];
+  double *databuffer = 0;
+  if (myrank == 0)
+  {
+    databuffer = (double *)malloc(sizeof(double) * NN);
+    if (!databuffer)
+    {
+      cout << "on node# " << myrank << ", out of memory when dumping data." << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+  }
+
+  while (DumpList)
+  {
+    var *VP = DumpList->data;
+    MyList<Block> *Bp = PP->blb;
+    while (Bp)
+    {
+      Block *BP = Bp->data;
+      int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]);
+      if (myrank == 0)
+      {
+        // source of the block data: local storage, or a temporary filled by the owner
+        double *bufferhere = 0;
+        double *src = 0;
+        if (BP->rank == 0)
+        {
+          src = BP->fgfs[VP->sgfn];
+        }
+        else
+        {
+          bufferhere = (double *)malloc(sizeof(double) * nnn);
+          if (!bufferhere)
+          {
+            cout << "on node#" << myrank << ", out of memory when dumping data." << endl;
+            MPI_Abort(MPI_COMM_WORLD, 1);
+          }
+          MPI_Recv(bufferhere, nnn, MPI_DOUBLE, BP->rank, 0, MPI_COMM_WORLD, &sta);
+          src = bufferhere;
+        }
+
+        // keep faces shared with the enlarged box, strip ghost_width cells from the others
+        for (int d = 0; d < 3; d++)
+        {
+          double DH = BP->getdX(d);
+          llb[d] = (feq(BP->bbox[d], tllb[d], DH / 2)) ? BP->bbox[d] : BP->bbox[d] + ghost_width * DH;
+          uub[d] = (feq(BP->bbox[d + 3], tuub[d], DH / 2)) ? BP->bbox[d + 3] : BP->bbox[d + 3] - ghost_width * DH;
+        }
+        f_copy(DIM, tllb, tuub, tshape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, src, llb, uub);
+
+        if (bufferhere)
+          free(bufferhere);
+      }
+      else if (myrank == BP->rank)
+      {
+        MPI_Send(BP->fgfs[VP->sgfn], nnn, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
+      }
+      if (Bp == PP->ble)
+        break;
+      Bp = Bp->next;
+    }
+    if (myrank == 0)
+    {
+      // resolve the output directory: cached parameter first, parameter file otherwise
+      string out_dir;
+      map<string, string>::iterator iter = parameters::str_par.find("output dir");
+      if (iter != parameters::str_par.end())
+      {
+        out_dir = iter->second;
+      }
+      else
+      {
+        // read parameter from file
+        const int LEN = 256;
+        char pline[LEN];
+        string str, sgrp, skey, sval;
+        int sind;
+        // a std::string avoids overflowing a fixed-size buffer on long paths
+        string pname;
+
+        map<string, string>::iterator piter = parameters::str_par.find("inputpar");
+        if (piter != parameters::str_par.end())
+        {
+          pname = piter->second;
+        }
+        else
+        {
+          cout << "Error inputpar" << endl;
+          // abort the whole MPI job with a failure status, like the other error paths here
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+
+        ifstream inf(pname.c_str(), ifstream::in);
+        if (!inf.good())
+        {
+          cout << "Can not open parameter file " << pname << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+
+        for (int i = 1; inf.good(); i++)
+        {
+          inf.getline(pline, LEN);
+          str = pline;
+
+          int status = misc::parse_parts(str, sgrp, skey, sval, sind);
+          if (status == -1)
+          {
+            cout << "error reading parameter file " << pname << " in line " << i << endl;
+            MPI_Abort(MPI_COMM_WORLD, 1);
+          }
+          else if (status == 0)
+            continue;
+
+          if (sgrp == "ABE" && skey == "output dir")
+            out_dir = sval;
+        }
+        inf.close();
+
+        parameters::str_par.insert(map<string, string>::value_type("output dir", out_dir));
+      }
+
+      // bounded formatting: a long output directory must not overrun the buffer
+      char filename[512];
+      int flen;
+      if (tag)
+        flen = snprintf(filename, sizeof(filename), "%s/%s_Lev%02d_%s_%05d.bin", out_dir.c_str(), tag, PP->lev, VP->name, ncount);
+      else
+        flen = snprintf(filename, sizeof(filename), "%s/Lev%02d_%s_%05d.bin", out_dir.c_str(), PP->lev, VP->name, ncount);
+      if (flen < 0 || flen >= int(sizeof(filename)))
+      {
+        cout << "Parallel::Dump_Data0: output file name is too long." << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+
+      // bounds go in as lower/upper pairs per axis, as in the Dump_Data call;
+      // the upper y bound is tuub[1] (it used to be passed as tuub[2])
+      writefile(time, tshape[0], tshape[1], tshape[2], tllb[0], tuub[0], tllb[1], tuub[1],
+                tllb[2], tuub[2], filename, databuffer);
+    }
+    DumpList = DumpList->next;
+  }
+
+  if (myrank == 0)
+    free(databuffer);
+}
+// Interpolation that honours the grid symmetry.
+// Mapping the point is much easier than mapping the data itself, so a point
+// with negative coordinates is first reflected, and the sign factors SoA of the
+// reflected directions are applied to the result.
+// The main problem is the points near the symmetry boundary: there the grid is
+// extended with mirrored coordinates and data before interpolating.
+// worst case is -ghost -ghost+1 .... 0 * ......
+//
+// Symmetry == 2 reflects all three directions, Symmetry == 1 only the third;
+// any other value leaves point and grid untouched.
+double Parallel::global_interp(int DIM, int *ext, double **CoX, double *datain,
+                               double *poXb, int ordn, double *SoA, int Symmetry)
+{
+  if (DIM != 3)
+  {
+    cout << "Parallel::global_interp does not suport DIM = " << DIM << " for Symmetry." << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+  // number of mirrored points added below the boundary, and how far (in grid
+  // spacings) from the boundary a point triggers the extension
+  const int nmirror = ghost_width - 1;
+  const double reach = ghost_width - 1;
+#else
+#ifdef Cell
+  const int nmirror = ghost_width;
+  const double reach = ghost_width - 0.5;
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+
+  const bool mirror_xy = (Symmetry == 2);
+  const bool mirror_z = (Symmetry == 2 || Symmetry == 1);
+
+  // reflect the point and collect the sign of the reflected directions
+  double poX[3] = {poXb[0], poXb[1], poXb[2]};
+  double parity = 1;
+  for (int d = 0; d < 3; d++)
+  {
+    const bool reflect = (d == 2) ? mirror_z : mirror_xy;
+    if (reflect && poX[d] < 0)
+    {
+      poX[d] = -poX[d];
+      parity = parity * SoA[d];
+    }
+  }
+
+  // extend the grid in every symmetric direction in which the point is close to the boundary
+  int extb[3] = {ext[0], ext[1], ext[2]};
+  for (int d = 0; d < 3; d++)
+  {
+    const bool reflect = (d == 2) ? mirror_z : mirror_xy;
+    if (reflect && poX[d] < reach * (CoX[d][1] - CoX[d][0]))
+      extb[d] = extb[d] + nmirror;
+  }
+
+  double result;
+  const bool extended = (extb[0] > ext[0] || extb[1] > ext[1] || extb[2] > ext[2]);
+  if (!extended)
+  {
+    result = global_interp(DIM, ext, CoX, datain, poX, ordn);
+  }
+  else
+  {
+    double *CoXb[3];
+    int Nb = extb[0] * extb[1] * extb[2];
+    double *datab = new double[Nb];
+
+    // coordinates of the extended grid
+    for (int d = 0; d < 3; d++)
+    {
+      CoXb[d] = new double[extb[d]];
+      double DH = CoX[d][1] - CoX[d][0];
+      if (extb[d] > ext[d])
+      {
+        if (CoX[d][0] > DH)
+        {
+          cout << "lower boundary[" << d << "] = " << CoX[d][0] << ", but SYmmetry = " << Symmetry << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+        // mirrored part first, then the original coordinates shifted up
+        for (int j = 0; j < nmirror; j++)
+          CoXb[d][j] = -CoX[d][ghost_width - 1 - j];
+        for (int j = nmirror; j < extb[d]; j++)
+          CoXb[d][j] = CoX[d][j - nmirror];
+      }
+      else
+      {
+        for (int j = 0; j < extb[d]; j++)
+          CoXb[d][j] = CoX[d][j];
+      }
+    }
+
+    // data of the extended grid: mirrored points take the value of their image times SoA
+    for (int n = 0; n < Nb; n++)
+    {
+      int ind[3], indb[3];
+      getarrayindex(3, extb, indb, n);
+      double sgn = 1;
+      for (int d = 0; d < 3; d++)
+      {
+        if (extb[d] <= ext[d])
+        {
+          ind[d] = indb[d];
+        }
+        else if (indb[d] < nmirror)
+        {
+          ind[d] = ghost_width - 1 - indb[d];
+          sgn = sgn * SoA[d];
+        }
+        else
+        {
+          ind[d] = indb[d] - nmirror;
+        }
+      }
+      int lon = getarraylocation(3, ext, ind);
+      datab[n] = datain[lon] * sgn;
+    }
+
+    result = global_interp(DIM, extb, CoXb, datab, poX, ordn);
+
+    for (int d = 0; d < 3; d++)
+      delete[] CoXb[d];
+    delete[] datab;
+  }
+
+  return result * parity;
+}
+// Interpolate datain, given on the tensor grid CoX with ext[i] points per
+// direction, to the point poX with ordn points per direction.
+//
+// The ordn^DIM values around poX are extracted with copy() and then reduced one
+// direction at a time with Lagrangian_Int(), starting with direction 0.
+// Aborts the MPI job when ordn > 2 * ghost_width.
+double Parallel::global_interp(int DIM, int *ext, double **CoX, double *datain,
+                               double *poX, int ordn)
+{
+  if (ordn > 2 * ghost_width)
+  {
+    cout << "Parallel::global_interp can not handle ordn = " << ordn << " > 2*ghost_width = " << 2 * ghost_width << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+
+  double *bbox, *datainbbox;
+  bbox = new double[2 * DIM];
+  datainbbox = new double[2 * DIM];
+
+  int *NN, *ind, *shape;
+  NN = new int[DIM];
+  ind = new int[DIM];
+  shape = new int[DIM];
+
+  for (int i = 0; i < DIM; i++)
+  {
+    // index of the first stencil point in direction i
+    ind[i] = int((poX[i] - CoX[i][0]) / (CoX[i][1] - CoX[i][0])) - ordn / 2 + 1;
+    // poX may exactly locate on the boundary (exclude ghost)
+    if (ind[i] == -1 && feq(poX[i], CoX[i][0], (CoX[i][1] - CoX[i][0]) / 2))
+      ind[i] = 0;
+    if (ind[i] == ext[i] - ordn + 1 && feq(poX[i], CoX[i][ext[i] - ordn / 2], (CoX[i][1] - CoX[i][0]) / 2))
+      ind[i] = ext[i] - ordn - 1;
+    // NOTE(review): the range checks on ind[i] that used to follow (ind[i] < 0 and
+    // ind[i] + ordn - 1 > ext[i] - 1) are disabled, so a stencil that leaves the
+    // grid is not detected here.
+    bbox[i] = CoX[i][ind[i]];
+    bbox[DIM + i] = CoX[i][ind[i] + ordn - 1];
+    datainbbox[i] = CoX[i][0];
+    datainbbox[DIM + i] = CoX[i][ext[i] - 1];
+    shape[i] = ordn;
+  }
+
+  // NN[i] = ordn^(DIM - i): number of values left before reducing direction i
+  NN[DIM - 1] = ordn;
+  for (int i = DIM - 2; i >= 0; i--)
+    NN[i] = NN[i + 1] * ordn;
+
+  double *xpts, *funcvals;
+  xpts = new double[ordn];
+  funcvals = new double[ordn];
+  double *DDd, *DDd1 = 0, rr = 0;
+
+  DDd = new double[NN[0]];
+
+  copy(DIM, bbox, bbox + DIM, shape, DDd, datainbbox, datainbbox + DIM, ext, datain, bbox, bbox + DIM);
+
+  for (int i = 0; i < DIM; i++)
+  {
+    for (int j = ind[i]; j < ind[i] + ordn; j++)
+    {
+      xpts[j - ind[i]] = CoX[i][j];
+    }
+
+    if (i < DIM - 1)
+    {
+      // reduce direction i: every run of ordn consecutive values becomes one value
+      DDd1 = new double[NN[i + 1]];
+      for (int j = 0; j < NN[i + 1]; j++)
+      {
+        for (int k = 0; k < ordn; k++)
+          funcvals[k] = DDd[k + j * ordn];
+        DDd1[j] = Lagrangian_Int(poX[i], ordn, xpts, funcvals);
+      }
+      delete[] DDd;
+      DDd = DDd1;
+    }
+    else
+    {
+      for (int j = 0; j < ordn; j++)
+        funcvals[j] = DDd[j];
+      rr = Lagrangian_Int(poX[i], ordn, xpts, funcvals);
+    }
+  }
+
+  // DDd always owns the live work array. Releasing it here, instead of DDd1,
+  // is also correct for DIM == 1, where DDd1 is never assigned.
+  delete[] DDd;
+
+  delete[] NN;
+  delete[] ind;
+  delete[] xpts;
+  delete[] funcvals;
+  delete[] bbox;
+  delete[] datainbbox;
+  delete[] shape;
+
+  return rr;
+}
+// Value at x of the Lagrange interpolant through the npts samples
+// (xpts[k], funcvals[k]); the terms are accumulated in index order.
+double Parallel::Lagrangian_Int(double x, int npts, double *xpts, double *funcvals)
+{
+  double acc = 0;
+  int k = 0;
+  while (k < npts)
+  {
+    acc = acc + funcvals[k] * LagrangePoly(x, k, npts, xpts);
+    k++;
+  }
+  return acc;
+}
+// Lagrange basis polynomial of node pt evaluated at x: the product over all
+// nodes k != pt of (x - xpts[k]) / (xpts[pt] - xpts[k]), taken in index order.
+double Parallel::LagrangePoly(double x, int pt, int npts, double *xpts)
+{
+  double basis = 1;
+
+  for (int k = 0; k < npts; k++)
+  {
+    if (k == pt)
+      continue; // the node itself contributes no factor
+    basis = basis * (x - xpts[k]) / (xpts[pt] - xpts[k]);
+  }
+
+  return basis;
+}
+// Build one grid segment per block of the patch, ghost and buffer included:
+// each segment copies the block's bounding box and shape and points back to
+// the block. Returns 0 when the patch has no blocks.
+// The nodes and their data are allocated with new (delete through destroyList()).
+MyList<Parallel::gridseg> *Parallel::build_complete_gsl(Patch *Pat)
+{
+  MyList<Parallel::gridseg> *head = 0, *tail = 0;
+  for (MyList<Block> *node = Pat->blb; node; node = node->next)
+  {
+    // append a fresh segment at the end of the list
+    MyList<Parallel::gridseg> *seg = new MyList<Parallel::gridseg>;
+    if (head)
+      tail->next = seg;
+    else
+      head = seg;
+    tail = seg;
+    tail->data = new Parallel::gridseg;
+
+    Block *blk = node->data;
+    for (int d = 0; d < dim; d++)
+    {
+      tail->data->llb[d] = blk->bbox[d];
+      tail->data->uub[d] = blk->bbox[dim + d];
+      tail->data->shape[d] = blk->shape[d];
+    }
+    tail->data->Bg = blk;
+    tail->next = 0;
+
+    // the block list of a patch ends at Pat->ble
+    if (node == Pat->ble)
+      break;
+  }
+
+  return head;
+}
+// Collect all grid segments (blocks including ghost and buffer) of every patch
+// in the list, concatenated in patch order. Returns 0 when there are none.
+// A patch whose own segment list is empty is skipped; dereferencing its null
+// list, as was done before, would crash.
+MyList<Parallel::gridseg> *Parallel::build_complete_gsl(MyList<Patch> *PatL)
+{
+  MyList<Parallel::gridseg> *cgsl = 0, *gs = 0;
+  while (PatL)
+  {
+    MyList<Parallel::gridseg> *sub = build_complete_gsl(PatL->data);
+    if (sub)
+    {
+      if (!cgsl)
+        cgsl = sub;
+      else
+        gs->next = sub;
+      // move gs to the last node so the next sub-list can be appended
+      gs = sub;
+      while (gs->next)
+        gs = gs->next;
+    }
+    PatL = PatL->next;
+  }
+
+  return cgsl;
+}
+// Collect the information of a patch list: one grid segment per patch, holding
+// the patch bounding box and shape. No block is attached (Bg = 0).
+// Returns 0 for an empty list.
+MyList<Parallel::gridseg> *Parallel::build_complete_gsl_virtual(MyList<Patch> *PatL)
+{
+  MyList<Parallel::gridseg> *head = 0, *tail = 0;
+  for (; PatL; PatL = PatL->next)
+  {
+    // append a fresh segment at the end of the list
+    MyList<Parallel::gridseg> *seg = new MyList<Parallel::gridseg>;
+    if (head)
+      tail->next = seg;
+    else
+      head = seg;
+    tail = seg;
+    tail->data = new Parallel::gridseg;
+
+    Patch *pat = PatL->data;
+    for (int d = 0; d < dim; d++)
+    {
+      tail->data->llb[d] = pat->bbox[d];
+      tail->data->uub[d] = pat->bbox[dim + d];
+      tail->data->shape[d] = pat->shape[d];
+    }
+    tail->data->Bg = 0;
+    tail->next = 0;
+  }
+
+  return head;
+}
+// Collect the information of a Patch list without buffer points: each patch bbox is shrunk by lli/uui points per side.
+MyList<Parallel::gridseg> *Parallel::build_complete_gsl_virtual2(MyList<Patch> *PatL) // - buffer
+{
+  MyList<Parallel::gridseg> *head = 0; // first segment, returned to the caller
+  MyList<Parallel::gridseg> *tail = 0; // last segment appended so far
+  for (; PatL; PatL = PatL->next)
+  {
+    Patch *pat = PatL->data;
+
+    MyList<Parallel::gridseg> *seg = new MyList<Parallel::gridseg>;
+    seg->data = new Parallel::gridseg;
+    seg->data->Bg = 0; // no Block behind this segment
+    seg->next = 0;
+
+    for (int d = 0; d < dim; d++)
+    {
+      // lli / uui are the point counts trimmed at the lower / upper side of the patch.
+      double DH = pat->getdX(d);
+      seg->data->llb[d] = pat->bbox[d] + pat->lli[d] * DH;
+      seg->data->uub[d] = pat->bbox[dim + d] - pat->uui[d] * DH;
+      seg->data->shape[d] = pat->shape[d] - pat->lli[d] - pat->uui[d];
+    }
+
+    // Append at the tail to keep the order of the patch list.
+    if (tail)
+      tail->next = seg;
+    else
+      head = seg;
+    tail = seg;
+  }
+
+  return head;
+}
+// Collect the bulk part (ghost zones removed) of every Block of the given patch, without extension.
+MyList<Parallel::gridseg> *Parallel::build_bulk_gsl(Patch *Pat)
+{
+  MyList<Parallel::gridseg> *head = 0; // first segment, returned to the caller
+  MyList<Parallel::gridseg> *tail = 0; // last segment appended so far
+  for (MyList<Block> *node = Pat->blb; node; node = node->next) // leaves after Pat->ble, see the end of the body
+  {
+    Block *blk = node->data;
+    MyList<Parallel::gridseg> *seg = new MyList<Parallel::gridseg>;
+    seg->data = new Parallel::gridseg;
+    seg->data->Bg = blk;
+    seg->next = 0;
+    for (int d = 0; d < dim; d++)
+    {
+      double DH = blk->getdX(d);
+      // A block face lying on the patch face (within DH/2) is kept; any other face is
+      // moved inwards by ghost_width points.
+      if (feq(blk->bbox[dim + d], Pat->bbox[dim + d], DH / 2))
+        seg->data->uub[d] = blk->bbox[dim + d];
+      else
+        seg->data->uub[d] = blk->bbox[dim + d] - ghost_width * DH;
+      if (feq(blk->bbox[d], Pat->bbox[d], DH / 2))
+        seg->data->llb[d] = blk->bbox[d];
+      else
+        seg->data->llb[d] = blk->bbox[d] + ghost_width * DH;
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+      seg->data->shape[d] = int((seg->data->uub[d] - seg->data->llb[d]) / DH + 0.4) + 1;
+#else
+#ifdef Cell
+      seg->data->shape[d] = int((seg->data->uub[d] - seg->data->llb[d]) / DH + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+    }
+    if (tail)
+      tail->next = seg;
+    else
+      head = seg;
+    tail = seg;
+    if (node == Pat->ble)
+      break;
+  }
+  return head;
+}
+// Bulk part (ghost zones removed) of the given Block within the given patch, without extension.
+MyList<Parallel::gridseg> *Parallel::build_bulk_gsl(Block *bp, Patch *Pat)
+{
+  // The result is a one-element list holding a freshly allocated gridseg.
+  MyList<Parallel::gridseg> *seg = new MyList<Parallel::gridseg>;
+  seg->data = new Parallel::gridseg;
+  seg->data->Bg = bp;
+  seg->next = 0;
+  for (int d = 0; d < dim; d++)
+  {
+    double DH = bp->getdX(d);
+    // Faces shared with the patch boundary (within DH/2) stay; the others lose ghost_width points.
+    const bool upper_on_patch = feq(bp->bbox[dim + d], Pat->bbox[dim + d], DH / 2);
+    const bool lower_on_patch = feq(bp->bbox[d], Pat->bbox[d], DH / 2);
+    seg->data->uub[d] = upper_on_patch ? bp->bbox[dim + d] : bp->bbox[dim + d] - ghost_width * DH;
+    seg->data->llb[d] = lower_on_patch ? bp->bbox[d] : bp->bbox[d] + ghost_width * DH;
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+    seg->data->shape[d] = int((seg->data->uub[d] - seg->data->llb[d]) / DH + 0.4) + 1;
+#else
+#ifdef Cell
+    seg->data->shape[d] = int((seg->data->uub[d] - seg->data->llb[d]) / DH + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+  }
+  return seg;
+}
+// Copy a grid segment list (only its first element when first_only is true); the Bg pointers are shared, not duplicated.
+MyList<Parallel::gridseg> *Parallel::clone_gsl(MyList<Parallel::gridseg> *p, bool first_only)
+{
+  MyList<Parallel::gridseg> *np = 0; // head of the copy, 0 when p is empty
+  MyList<Parallel::gridseg> *pq = 0; // last node of the copy
+  for (; p; p = p->next)
+  {
+    MyList<Parallel::gridseg> *q = new MyList<Parallel::gridseg>;
+    q->data = new Parallel::gridseg;
+    // Terminate every node explicitly (as the other builders do) instead of relying on how MyList initialises 'next'.
+    q->next = 0;
+    q->data->Bg = p->data->Bg;
+    for (int i = 0; i < dim; i++)
+    {
+      q->data->llb[i] = p->data->llb[i];
+      q->data->uub[i] = p->data->uub[i];
+      q->data->shape[i] = p->data->shape[i];
+    }
+    // NOTE(review): illb/iuub are not copied here although build_gstl propagates them - confirm this is intended.
+    if (pq)
+      pq->next = q;
+    else
+      np = q;
+    if (first_only)
+      return np; // np == q here and q->next is already 0
+    pq = q;
+  }
+  return np;
+}
+MyList<Parallel::gridseg> *Parallel::gs_subtract(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B) // first box of A minus first box of B, as a new list
+{
+  if (!A)
+    return 0;
+  if (!B)
+    return clone_gsl(A, true); // nothing to subtract: copy of A's first segment
+  // Only the first elements of A and B are used below; neither list is modified.
+  double cut_plane[2 * dim], DH[dim];
+  // Grid spacing comes from A's Block; when B has a Block its spacing must agree, otherwise the run is aborted.
+  for (int i = 0; i < dim; i++)
+  {
+    DH[i] = A->data->Bg->getdX(i);
+    if (B->data->Bg && !feq(DH[i], B->data->Bg->getdX(i), DH[i] / 2))
+    {
+      cout << "Parallel::gs_subtract meets different grid segment " << DH[i] << " vs " << B->data->Bg->getdX(i) << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+  }
+  // C collects the pieces of A that lie outside B; it stays 0 when no piece is cut off.
+  MyList<Parallel::gridseg> *C = 0, *q;
+  for (int i = 0; i < dim; i++)
+  {
+    if (B->data->llb[i] > A->data->uub[i] || B->data->uub[i] < A->data->llb[i])
+      return clone_gsl(A, true); // disjoint along direction i: A is returned unchanged (as a copy)
+    cut_plane[i] = A->data->llb[i];       // lower bounds start as A's own ...
+    cut_plane[i + dim] = A->data->uub[i]; // ... and so do the upper bounds
+  }
+  // Direction by direction: cut off the slab of A below B, then the slab above B, narrowing cut_plane to the overlap.
+  for (int i = 0; i < dim; i++)
+  {
+    cut_plane[i] = Mymax(A->data->llb[i], B->data->llb[i]);
+    if (cut_plane[i] - A->data->llb[i] > DH[i] / 2) // A extends below B by more than half a spacing
+    {
+      q = clone_gsl(A, true);
+      // prolong the list from head
+      if (C)
+        q->next = C;
+      C = q;
+      for (int j = 0; j < dim; j++)
+      {
+        if (i == j)
+        {
+          C->data->llb[i] = A->data->llb[i];
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+          C->data->uub[i] = Mymax(C->data->llb[i], cut_plane[i] - DH[i]); // end one spacing before the cut plane
+#else
+#ifdef Cell
+          C->data->uub[i] = Mymax(C->data->llb[i], cut_plane[i]);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+        }
+        else
+        {
+          // directions already handled (j < i) hold the overlap range, the others still hold A's full range
+          C->data->llb[j] = cut_plane[j];
+          C->data->uub[j] = cut_plane[j + dim];
+        }
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4) + 1;
+#else
+#ifdef Cell
+        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      }
+    }
+    // same treatment for the part of A above B's upper bound along direction i
+    cut_plane[i + dim] = Mymin(A->data->uub[i], B->data->uub[i]);
+    if (A->data->uub[i] - cut_plane[i + dim] > DH[i] / 2) // A extends above B by more than half a spacing
+    {
+      q = clone_gsl(A, true);
+      if (C)
+        q->next = C;
+      C = q;
+      for (int j = 0; j < dim; j++)
+      {
+        if (i == j)
+        {
+          C->data->uub[i] = A->data->uub[i];
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+          C->data->llb[i] = Mymin(C->data->uub[i], cut_plane[i + dim] + DH[i]); // start one spacing after the cut plane
+#else
+#ifdef Cell
+          C->data->llb[i] = Mymin(C->data->uub[i], cut_plane[i + dim]);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+        }
+        else
+        {
+          // direction i's lower cut plane has already been narrowed above, so this slab does not overlap the lower one
+          C->data->llb[j] = cut_plane[j];
+          C->data->uub[j] = cut_plane[j + dim];
+        }
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4) + 1;
+#else
+#ifdef Cell
+        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      }
+    }
+  }
+  return C; // 0 when B covers A (to within half a spacing on every side)
+}
+// naive method (superseded by the implementation below, kept for reference)
+/*
+MyList<Parallel::gridseg> *Parallel::gsl_subtract(MyList<Parallel::gridseg> *A,MyList<Parallel::gridseg> *B) //A subtract B but with A's information
+{
+// always make return and A, B distinct
+  if(!A) return 0;
+
+  if(!B) return clone_gsl(A,0);
+
+  MyList<Parallel::gridseg> *C=0,*C0,*C1,*Cc,*CC0,*gs;
+
+  while(A)
+  {
+     C0=gs_subtract(A,B);  // note C0 becomes a list after subtraction
+     C1=B->next;
+     while(C1)
+     {
+  CC0=C0;
+  Cc=0;
+  while(CC0)
+  {
+    gs=gs_subtract(CC0,C1);
+    if(Cc) Cc->catList(gs);
+    else   Cc=gs;
+    CC0=CC0->next;
+  }
+  if(C0) C0->destroyList();
+  C0=Cc;
+  C1=C1->next;
+     }
+     if(C) C->catList(C0);
+     else  C=C0;
+     A=A->next;
+  }
+
+  return C;
+}
+*/
+// more clever method: the segments of B are removed one after another from a working copy of A
+// Returns a newly allocated list, or 0 when nothing of A is left; the lists A and B themselves
+// are neither modified nor released here.
+MyList<Parallel::gridseg> *Parallel::gsl_subtract(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B) // A subtract B but with A's information
+{
+  // always make return and A, B distinct
+  if (!A)
+    return 0;
+
+  // Working list: starts as a full copy of A and is replaced after every segment of B.
+  MyList<Parallel::gridseg> *remaining = clone_gsl(A, 0);
+
+  for (; B; B = B->next)
+  {
+    // Everything has already been removed: the result is empty.
+    if (!remaining)
+      return 0;
+
+    // Subtract the current segment of B from every piece that is still left.
+    MyList<Parallel::gridseg> *shrunk = 0;
+    for (MyList<Parallel::gridseg> *piece = remaining; piece; piece = piece->next)
+    {
+      // gs_subtract only looks at the head of 'piece';
+      // diff is 0 when the segment of B covers that piece completely.
+      MyList<Parallel::gridseg> *diff = gs_subtract(piece, B);
+      if (shrunk)
+        shrunk->catList(diff);
+      else
+        shrunk = diff;
+    }
+
+    // The old pieces are no longer needed once their differences have been collected.
+    remaining->destroyList();
+    remaining = shrunk;
+  }
+
+  return remaining;
+}
+// Overlap of the first segment of A with the first segment of B.
+// The result is a new one-element list that keeps A's Block, or 0 when either list is
+// empty or the two boxes are disjoint along some direction.
+MyList<Parallel::gridseg> *Parallel::gs_and(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B)
+{
+  if (!A || !B)
+    return 0;
+
+  // candidate bounds of the common box
+  double lo[dim], hi[dim];
+  for (int d = 0; d < dim; d++)
+  {
+    lo[d] = Mymax(A->data->llb[d], B->data->llb[d]);
+    hi[d] = Mymin(A->data->uub[d], B->data->uub[d]);
+    if (lo[d] > hi[d])
+      return 0; // no overlap along direction d
+  }
+
+  // Start from a copy of A's head so that Bg is inherited, then overwrite the extent.
+  MyList<Parallel::gridseg> *C = clone_gsl(A, true);
+  Parallel::gridseg *box = C->data;
+  for (int d = 0; d < dim; d++)
+  {
+    box->llb[d] = lo[d];
+    box->uub[d] = hi[d];
+    // point count of the common box, measured with the spacing of A's Block
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+    box->shape[d] = int((box->uub[d] - box->llb[d]) / box->Bg->getdX(d) + 0.4) + 1;
+#else
+#ifdef Cell
+    box->shape[d] = int((box->uub[d] - box->llb[d]) / box->Bg->getdX(d) + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+  }
+
+  return C;
+}
+// overlap of A_i and (union of all j of B_j)
+MyList<Parallel::gridseg> *Parallel::gsl_and(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B) // A and B but with A's information
+{
+  // Every segment of A is intersected with every segment of B; the resulting
+  // pieces are chained in that order (outer loop over A, inner loop over B).
+  MyList<Parallel::gridseg> *result = 0;
+  for (; A; A = A->next)
+  {
+    for (MyList<Parallel::gridseg> *b = B; b; b = b->next)
+    {
+      MyList<Parallel::gridseg> *common = gs_and(A, b); // 0 when the two boxes do not overlap
+      if (result)
+        result->catList(common);
+      else
+        result = common;
+    }
+  }
+
+  return result;
+}
+// Collect all ghost grid segments for the given patch: for each Block, the whole block minus its bulk part.
+MyList<Parallel::gridseg> *Parallel::build_ghost_gsl(Patch *Pat)
+{
+  MyList<Parallel::gridseg> *ghosts = 0; // accumulated result
+  for (MyList<Block> *node = Pat->blb; node; node = node->next) // leaves after Pat->ble, see the end of the body
+  {
+    Block *blk = node->data;
+    // Temporary one-element list describing the whole block, ghost zones included.
+    MyList<Parallel::gridseg> *whole = new MyList<Parallel::gridseg>;
+    whole->data = new Parallel::gridseg;
+    whole->data->Bg = blk;
+    whole->next = 0;
+    for (int d = 0; d < dim; d++)
+    {
+      whole->data->llb[d] = blk->bbox[d];
+      whole->data->uub[d] = blk->bbox[dim + d];
+      whole->data->shape[d] = blk->shape[d];
+    }
+
+    // ghost region = whole block minus its bulk part
+    MyList<Parallel::gridseg> *bulk = build_bulk_gsl(blk, Pat);
+    MyList<Parallel::gridseg> *diff = gs_subtract(whole, bulk);
+    if (ghosts)
+      ghosts->catList(diff);
+    else
+      ghosts = diff;
+
+    // both temporaries were only needed for the subtraction
+    bulk->destroyList();
+    whole->destroyList();
+
+    if (node == Pat->ble)
+      break;
+  }
+  return ghosts;
+}
+// Collect all ghost grid segments of every patch in the list, in patch order; returns 0 if no patch has any ghost segment.
+MyList<Parallel::gridseg> *Parallel::build_ghost_gsl(MyList<Patch> *PatL)
+{
+  MyList<Parallel::gridseg> *cgsl = 0; // head of the result
+  MyList<Parallel::gridseg> *gs = 0;   // last node of the result, 0 while the result is empty
+  while (PatL)
+  {
+    MyList<Parallel::gridseg> *sub = build_ghost_gsl(PatL->data);
+    // The per-patch builder returns 0 when gs_subtract leaves nothing for any block of the patch;
+    // the previous code then dereferenced a null pointer (gs->next). Such patches are skipped now.
+    if (sub)
+    {
+      if (gs)
+        gs->next = sub;
+      else
+        cgsl = sub;
+      gs = sub; // walk to the new tail so the next append does not rescan the list
+      while (gs->next)
+        gs = gs->next;
+    }
+    PatL = PatL->next;
+  }
+
+  return cgsl;
+}
+// collect all grid segments or blocks without ghost for given patch, restricted to the blocks of rank rank_in
+// special for Sync usage, so we do not need consider missing points
+MyList<Parallel::gridseg> *Parallel::build_owned_gsl0(Patch *Pat, int rank_in)
+{
+  MyList<Parallel::gridseg> *head = 0; // first segment, returned to the caller
+  MyList<Parallel::gridseg> *tail = 0; // last segment appended so far
+
+  for (MyList<Block> *node = Pat->blb; node; node = node->next) // leaves after Pat->ble, see the end of the body
+  {
+    Block *blk = node->data;
+    if (blk->rank == rank_in) // only blocks assigned to rank_in contribute
+    {
+      MyList<Parallel::gridseg> *seg = new MyList<Parallel::gridseg>;
+      seg->data = new Parallel::gridseg;
+      seg->data->Bg = blk;
+      seg->next = 0;
+
+      for (int d = 0; d < dim; d++)
+      {
+        double DH = blk->getdX(d);
+        // A face on the patch boundary (within DH/2) is kept, any other face loses ghost_width points.
+        const bool upper_on_patch = feq(blk->bbox[dim + d], Pat->bbox[dim + d], DH / 2);
+        const bool lower_on_patch = feq(blk->bbox[d], Pat->bbox[d], DH / 2);
+        seg->data->uub[d] = upper_on_patch ? blk->bbox[dim + d] : blk->bbox[dim + d] - ghost_width * DH;
+        seg->data->llb[d] = lower_on_patch ? blk->bbox[d] : blk->bbox[d] + ghost_width * DH;
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        seg->data->shape[d] = int((seg->data->uub[d] - seg->data->llb[d]) / DH + 0.4) + 1;
+#else
+#ifdef Cell
+        seg->data->shape[d] = int((seg->data->uub[d] - seg->data->llb[d]) / DH + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      }
+
+      // Append at the tail so the segments keep the order of the blocks.
+      if (tail)
+        tail->next = seg;
+      else
+        head = seg;
+      tail = seg;
+    }
+    if (node == Pat->ble)
+      break;
+  }
+
+  return head;
+}
+// collect all grid segments or blocks without ghost for given patch, restricted to the blocks of rank rank_in
+MyList<Parallel::gridseg> *Parallel::build_owned_gsl1(Patch *Pat, int rank_in)
+{
+  MyList<Parallel::gridseg> *head = 0; // first segment, returned to the caller
+  MyList<Parallel::gridseg> *tail = 0; // last segment appended so far
+
+  for (MyList<Block> *node = Pat->blb; node; node = node->next) // leaves after Pat->ble, see the end of the body
+  {
+    Block *blk = node->data;
+    if (blk->rank == rank_in) // only blocks assigned to rank_in contribute
+    {
+      MyList<Parallel::gridseg> *seg = new MyList<Parallel::gridseg>;
+      seg->data = new Parallel::gridseg;
+      seg->data->Bg = blk;
+      seg->next = 0;
+
+      for (int d = 0; d < dim; d++)
+      {
+        double DH = blk->getdX(d);
+        // A face on the patch boundary (within DH/2) is kept; other faces are moved inwards.
+        const bool upper_on_patch = feq(blk->bbox[dim + d], Pat->bbox[dim + d], DH / 2);
+        const bool lower_on_patch = feq(blk->bbox[d], Pat->bbox[d], DH / 2);
+        seg->data->uub[d] = upper_on_patch ? blk->bbox[dim + d] : blk->bbox[dim + d] - ghost_width * DH;
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        // NOTE: our dividing structure is (exclude ghost)
+        // -1 0
+        //       1  2
+        // so (0,1) does not belong to any part for vertex structure, we always put it to right part, this is consistent to
+        // the fortran routine where we always take floor to get index
+        seg->data->llb[d] = lower_on_patch ? blk->bbox[d] : blk->bbox[d] + (ghost_width - 1) * DH;
+        seg->data->shape[d] = int((seg->data->uub[d] - seg->data->llb[d]) / DH + 0.4) + 1;
+#else
+#ifdef Cell
+        seg->data->llb[d] = lower_on_patch ? blk->bbox[d] : blk->bbox[d] + ghost_width * DH;
+        seg->data->shape[d] = int((seg->data->uub[d] - seg->data->llb[d]) / DH + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      }
+
+      // Append at the tail so the segments keep the order of the blocks.
+      if (tail)
+        tail->next = seg;
+      else
+        head = seg;
+      tail = seg;
+    }
+    if (node == Pat->ble)
+      break;
+  }
+
+  return head;
+}
+// collect all grid segments or blocks without ghost nor buffer for given patch, restricted to the blocks of rank rank_in
+MyList<Parallel::gridseg> *Parallel::build_owned_gsl2(Patch *Pat, int rank_in)
+{
+  MyList<Parallel::gridseg> *head = 0; // first segment, returned to the caller
+  MyList<Parallel::gridseg> *tail = 0; // last segment appended so far
+
+  for (MyList<Block> *node = Pat->blb; node; node = node->next) // leaves after Pat->ble, see the end of the body
+  {
+    Block *blk = node->data;
+    if (blk->rank == rank_in) // only blocks assigned to rank_in contribute
+    {
+      MyList<Parallel::gridseg> *seg = new MyList<Parallel::gridseg>;
+      seg->data = new Parallel::gridseg;
+      seg->data->Bg = blk;
+      seg->next = 0;
+
+      for (int d = 0; d < dim; d++)
+      {
+        double DH = blk->getdX(d);
+        // A face on the patch boundary (within DH/2) loses the patch's uui/lli points, other faces the ghost points.
+        const bool upper_on_patch = feq(blk->bbox[dim + d], Pat->bbox[dim + d], DH / 2);
+        const bool lower_on_patch = feq(blk->bbox[d], Pat->bbox[d], DH / 2);
+        seg->data->uub[d] = upper_on_patch ? blk->bbox[dim + d] - Pat->uui[d] * DH : blk->bbox[dim + d] - ghost_width * DH;
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        // NOTE: our dividing structure is (exclude ghost)
+        // -1 0
+        //       1  2
+        // so (0,1) does not belong to any part for vertex structure, we always put it to right part, this is consistent to
+        // the fortran routine where we always take floor to get index
+        seg->data->llb[d] = lower_on_patch ? blk->bbox[d] + Pat->lli[d] * DH : blk->bbox[d] + (ghost_width - 1) * DH;
+        seg->data->shape[d] = int((seg->data->uub[d] - seg->data->llb[d]) / DH + 0.4) + 1;
+#else
+#ifdef Cell
+        seg->data->llb[d] = lower_on_patch ? blk->bbox[d] + Pat->lli[d] * DH : blk->bbox[d] + ghost_width * DH;
+        seg->data->shape[d] = int((seg->data->uub[d] - seg->data->llb[d]) / DH + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      }
+
+      // Append at the tail so the segments keep the order of the blocks.
+      if (tail)
+        tail->next = seg;
+      else
+        head = seg;
+      tail = seg;
+    }
+    if (node == Pat->ble)
+      break;
+  }
+
+  return head;
+}
+// Collect, for the blocks of rank rank_in, the grid segments without ghost; ghost_width is removed on every face (patch boundary included) for interpolation, except at the symmetry planes handled below.
+MyList<Parallel::gridseg> *Parallel::build_owned_gsl3(Patch *Pat, int rank_in, int Symmetry)
+{
+  MyList<Parallel::gridseg> *cgsl = 0, *gs; // cgsl: head of the result, gs: its last node
+  MyList<Block> *BP = Pat->blb;
+  while (BP)
+  {
+    Block *bp = BP->data;
+    if (bp->rank == rank_in) // blocks of other ranks are skipped
+    {
+      if (!cgsl)
+      {
+        cgsl = gs = new MyList<Parallel::gridseg>;
+        gs->data = new Parallel::gridseg;
+      }
+      else
+      {
+        gs->next = new MyList<Parallel::gridseg>;
+        gs = gs->next;
+        gs->data = new Parallel::gridseg;
+      }
+
+      for (int i = 0; i < dim; i++)
+      {
+        double DH = bp->getdX(i);
+        gs->data->uub[i] = bp->bbox[dim + i] - ghost_width * DH; // no test against the patch boundary here
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        // NOTE: our dividing structure is (exclude ghost)
+        // -1 0
+        //       1  2
+        // so (0,1) does not belong to any part for vertex structure, we always put it to right part, this is consistent to
+        // the fortran routine where we always take floor to get index
+        gs->data->llb[i] = bp->bbox[i] + (ghost_width - 1) * DH;
+        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
+#else
+#ifdef Cell
+        gs->data->llb[i] = bp->bbox[i] + ghost_width * DH;
+        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      }
+      // Symmetry consideration: a lower block bound sitting at coordinate 0 (within DH/2) is restored and the shape recomputed
+      if (Symmetry > 0)
+      {
+        double DH = bp->getdX(2);
+        if (feq(bp->bbox[2], 0, DH / 2)) // direction 2 is handled for any Symmetry > 0
+        {
+          gs->data->llb[2] = bp->bbox[2];
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+          gs->data->shape[2] = int((gs->data->uub[2] - gs->data->llb[2]) / DH + 0.4) + 1;
+#else
+#ifdef Cell
+          gs->data->shape[2] = int((gs->data->uub[2] - gs->data->llb[2]) / DH + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+        }
+        if (Symmetry > 1) // directions 0 and 1 get the same treatment only for Symmetry > 1
+        {
+          for (int i = 0; i < 2; i++)
+          {
+            DH = bp->getdX(i);
+            if (feq(bp->bbox[i], 0, DH / 2))
+            {
+              gs->data->llb[i] = bp->bbox[i];
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+              gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
+#else
+#ifdef Cell
+              gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+            }
+          }
+        }
+      }
+      // the segment refers back to the Block it was built from
+      gs->data->Bg = BP->data;
+      gs->next = 0;
+    }
+
+    if (BP == Pat->ble) // Pat->ble is the last block visited for this patch
+      break;
+    BP = BP->next;
+  }
+
+  return cgsl; // 0 when rank_in has no block in this patch
+}
+// Collect, for the blocks of rank rank_in, the grid segments without ghost nor buffer for the given patch,
+// and additionally remove ghost_width on every face for interpolation consideration on the patch boundary
+MyList<Parallel::gridseg> *Parallel::build_owned_gsl4(Patch *Pat, int rank_in, int Symmetry)
+{
+  MyList<Parallel::gridseg> *cgsl = 0, *gs; // cgsl: head of the result, gs: its last node
+  MyList<Block> *BP = Pat->blb;
+  while (BP)
+  {
+    Block *bp = BP->data;
+    if (bp->rank == rank_in) // blocks of other ranks are skipped
+    {
+      if (!cgsl)
+      {
+        cgsl = gs = new MyList<Parallel::gridseg>;
+        gs->data = new Parallel::gridseg;
+      }
+      else
+      {
+        gs->next = new MyList<Parallel::gridseg>;
+        gs = gs->next;
+        gs->data = new Parallel::gridseg;
+      }
+
+      for (int i = 0; i < dim; i++)
+      {
+        double DH = bp->getdX(i);
+        gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] - Pat->uui[i] * DH : bp->bbox[dim + i]; // on the patch face: drop the patch's uui points first
+        gs->data->uub[i] -= ghost_width * DH; // then ghost_width points in every case
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        // NOTE: our dividing structure is (exclude ghost)
+        // -1 0
+        //       1  2
+        // so (0,1) does not belong to any part for vertex structure, we always put it to right part, this is consistent to
+        // the fortran routine where we always take floor to get index
+        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] + Pat->lli[i] * DH : bp->bbox[i];
+        gs->data->llb[i] += (ghost_width - 1) * DH;
+        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
+#else
+#ifdef Cell
+        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] + Pat->lli[i] * DH : bp->bbox[i];
+        gs->data->llb[i] += ghost_width * DH;
+        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      }
+      // Symmetry consideration: a lower block bound sitting at coordinate 0 (within DH/2) is restored and the shape recomputed
+      if (Symmetry > 0)
+      {
+        double DH = bp->getdX(2);
+        if (feq(bp->bbox[2], 0, DH / 2)) // direction 2 is handled for any Symmetry > 0
+        {
+          gs->data->llb[2] = bp->bbox[2];
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+          gs->data->shape[2] = int((gs->data->uub[2] - gs->data->llb[2]) / DH + 0.4) + 1;
+#else
+#ifdef Cell
+          gs->data->shape[2] = int((gs->data->uub[2] - gs->data->llb[2]) / DH + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+        }
+        if (Symmetry > 1) // directions 0 and 1 get the same treatment only for Symmetry > 1
+        {
+          for (int i = 0; i < 2; i++)
+          {
+            DH = bp->getdX(i);
+            if (feq(bp->bbox[i], 0, DH / 2))
+            {
+              gs->data->llb[i] = bp->bbox[i];
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+              gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
+#else
+#ifdef Cell
+              gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+            }
+          }
+        }
+      }
+      // the segment refers back to the Block it was built from
+      gs->data->Bg = BP->data;
+      gs->next = 0;
+    }
+
+    if (BP == Pat->ble) // Pat->ble is the last block visited for this patch
+      break;
+    BP = BP->next;
+  }
+
+  return cgsl; // 0 when rank_in has no block in this patch
+}
+// collect all grid segments or blocks without ghost nor buffer for given patch, no extension (blocks of rank rank_in only)
+MyList<Parallel::gridseg> *Parallel::build_owned_gsl5(Patch *Pat, int rank_in)
+{
+  MyList<Parallel::gridseg> *head = 0; // first segment, returned to the caller
+  MyList<Parallel::gridseg> *tail = 0; // last segment appended so far
+
+  for (MyList<Block> *node = Pat->blb; node; node = node->next) // leaves after Pat->ble, see the end of the body
+  {
+    Block *blk = node->data;
+    if (blk->rank == rank_in) // only blocks assigned to rank_in contribute
+    {
+      MyList<Parallel::gridseg> *seg = new MyList<Parallel::gridseg>;
+      seg->data = new Parallel::gridseg;
+      seg->data->Bg = blk;
+      seg->next = 0;
+
+      for (int d = 0; d < dim; d++)
+      {
+        double DH = blk->getdX(d);
+        // A face on the patch boundary (within DH/2) loses the patch's uui/lli points, other faces ghost_width points.
+        const bool upper_on_patch = feq(blk->bbox[dim + d], Pat->bbox[dim + d], DH / 2);
+        const bool lower_on_patch = feq(blk->bbox[d], Pat->bbox[d], DH / 2);
+        seg->data->uub[d] = upper_on_patch ? blk->bbox[dim + d] - Pat->uui[d] * DH : blk->bbox[dim + d] - ghost_width * DH;
+        seg->data->llb[d] = lower_on_patch ? blk->bbox[d] + Pat->lli[d] * DH : blk->bbox[d] + ghost_width * DH;
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        seg->data->shape[d] = int((seg->data->uub[d] - seg->data->llb[d]) / DH + 0.4) + 1;
+#else
+#ifdef Cell
+        seg->data->shape[d] = int((seg->data->uub[d] - seg->data->llb[d]) / DH + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      }
+
+      // Append at the tail so the segments keep the order of the blocks.
+      if (tail)
+        tail->next = seg;
+      else
+        head = seg;
+      tail = seg;
+    }
+    if (node == Pat->ble)
+      break;
+  }
+
+  return head;
+}
+// collect all grid segments or blocks without ghost for given patch list
+// naive method (superseded by the implementation below, kept for reference)
+/*
+MyList<Parallel::gridseg> *Parallel::build_owned_gsl(MyList<Patch> *PatL,int rank_in,int type,int Symmetry)
+{
+       MyList<Parallel::gridseg> *cgsl=0,*gs;
+       while(PatL)
+       {
+    if(!cgsl)
+    {
+            switch(type)
+      {
+         case 0:
+                  cgsl = build_owned_gsl0(PatL->data,rank_in);
+      break;
+         case 1:
+                  cgsl = build_owned_gsl1(PatL->data,rank_in);
+      break;
+         case 2:
+                  cgsl = build_owned_gsl2(PatL->data,rank_in);
+      break;
+         case 3:
+                  cgsl = build_owned_gsl3(PatL->data,rank_in,Symmetry);
+      break;
+         case 4:
+                  cgsl = build_owned_gsl4(PatL->data,rank_in,Symmetry);
+      break;
+         case 5:
+                  cgsl = build_owned_gsl5(PatL->data,rank_in);
+      break;
+               default:
+      cout<<"Parallel::build_owned_gsl : unknown type = "<<type<<endl;
+                  MPI_Abort(MPI_COMM_WORLD,1);
+      }
+       gs = cgsl;
+       while(gs && gs->next) gs = gs->next;
+    }
+    else
+    {
+       switch(type)
+      {
+         case 0:
+                  gs->next = build_owned_gsl0(PatL->data,rank_in);
+      break;
+         case 1:
+                  gs->next = build_owned_gsl1(PatL->data,rank_in);
+      break;
+         case 2:
+                  gs->next = build_owned_gsl2(PatL->data,rank_in);
+      break;
+         case 3:
+                  gs->next = build_owned_gsl3(PatL->data,rank_in,Symmetry);
+      break;
+         case 4:
+                  gs->next = build_owned_gsl4(PatL->data,rank_in,Symmetry);
+      break;
+         case 5:
+                  gs->next = build_owned_gsl5(PatL->data,rank_in);
+      break;
+               default:
+      cout<<"Parallel::build_owned_gsl : unknown type = "<<type<<endl;
+                  MPI_Abort(MPI_COMM_WORLD,1);
+      }
+       while(gs && gs->next) gs = gs->next;
+    }
+    PatL = PatL->next;
+       }
+
+       return cgsl;
+}
+*/
+// more clever method: build the owned segments patch by patch and chain them with catList
+// 'type' (0..5) picks build_owned_gsl0 .. build_owned_gsl5; Symmetry is only forwarded to
+// types 3 and 4. Any other type prints a message and aborts the MPI run.
+MyList<Parallel::gridseg> *Parallel::build_owned_gsl(MyList<Patch> *PatL, int rank_in, int type, int Symmetry)
+{
+  MyList<Parallel::gridseg> *cgsl = 0, *gs;
+  for (; PatL; PatL = PatL->next)
+  {
+    Patch *pat = PatL->data;
+    // owned segments of this patch, in the flavour requested by 'type'
+    if (type == 0)
+      gs = build_owned_gsl0(pat, rank_in);
+    else if (type == 1)
+      gs = build_owned_gsl1(pat, rank_in);
+    else if (type == 2)
+      gs = build_owned_gsl2(pat, rank_in);
+    else if (type == 3)
+      gs = build_owned_gsl3(pat, rank_in, Symmetry);
+    else if (type == 4)
+      gs = build_owned_gsl4(pat, rank_in, Symmetry);
+    else if (type == 5)
+      gs = build_owned_gsl5(pat, rank_in);
+    else
+    {
+      cout << "Parallel::build_owned_gsl : unknown type = " << type << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    // gs may be 0 when rank_in has no block in this patch; it is handed to
+    // catList unchanged in that case.
+    if (cgsl)
+      cgsl->catList(gs);
+    else
+      cgsl = gs;
+  }
+
+  return cgsl;
+}
+// Determine the real grid segments from the overlap: for every (source, destination) pair whose boxes overlap, one segment is appended to *out_src and one to *out_dst (the two output lists are pairwise matched).
+void Parallel::build_gstl(MyList<Parallel::gridseg> *srci, MyList<Parallel::gridseg> *dsti,
+                          MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst)
+{
+  *out_src = *out_dst = 0;
+  // an empty input on either side leaves both outputs empty
+  if (!srci || !dsti)
+    return;
+  // s, d: cursors over the inputs; s2, d2: last nodes of the two output lists
+  MyList<Parallel::gridseg> *s, *d;
+  MyList<Parallel::gridseg> *s2, *d2;
+  // llb/uub: bounds of the intersection of the current source and destination boxes
+  double llb[dim], uub[dim];
+
+  s = srci;
+  while (s)
+  {
+    Parallel::gridseg *sd = s->data;
+    d = dsti;
+    while (d)
+    {
+      Parallel::gridseg *dd = d->data;
+      bool flag = true; // stays true while the pair overlaps in every direction tested so far
+      for (int i = 0; i < dim; i++)
+      {
+        double SH = sd->Bg->getdX(i), DH = dd->Bg->getdX(i); // source / destination grid spacing
+        llb[i] = Mymax(sd->llb[i], dd->llb[i]);
+        uub[i] = Mymin(sd->uub[i], dd->uub[i]);
+        // make sure the region boundary is consistent to the grids
+        // here we only judge if the domain is empty, so do not need to adjust the align
+        double lb = llb[i], ub = uub[i];
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        // ---*---
+        // x-------x
+        //		if     (int(2*(sd->uub[i]-uub[i])/SH+0.4)%2 == 1) ub = uub[i]-SH/2;
+        //		else if(int(2*(dd->uub[i]-uub[i])/DH+0.4)%2 == 1) ub = uub[i]-DH/2;
+        //		if     (int(2*(llb[i]-sd->llb[i])/SH+0.4)%2 == 1) lb = llb[i]+SH/2;
+        //		else if(int(2*(llb[i]-dd->llb[i])/DH+0.4)%2 == 1) lb = llb[i]+DH/2;
+        if (lb > ub + Mymin(SH, DH) / 2)
+        {
+          flag = false;
+          break;
+        } // special for isolated point
+#else
+#ifdef Cell
+        // |------|
+        // |-------------|
+        //		if     (int(2*(sd->uub[i]-uub[i])/SH+0.4)%2 == 1) ub = uub[i]+SH/2;
+        //		else if(int(2*(dd->uub[i]-uub[i])/DH+0.4)%2 == 1) ub = uub[i]+DH/2;
+        //        |------|
+        // |-------------|
+        //		if     (int(2*(llb[i]-sd->llb[i])/SH+0.4)%2 == 1) lb = llb[i]-SH/2;
+        //		else if(int(2*(llb[i]-dd->llb[i])/DH+0.4)%2 == 1) lb = llb[i]-DH/2;
+        if (ub - lb < Mymin(SH, DH) / 2)
+        {
+          flag = false;
+          break;
+        } // even for isolated point, it has a cell belong to it
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      }
+      // overlapping pair: append one node to each output list
+      if (flag)
+      {
+        if (!(*out_src))
+        {
+          *out_src = s2 = new MyList<Parallel::gridseg>;
+          *out_dst = d2 = new MyList<Parallel::gridseg>;
+          s2->data = new Parallel::gridseg;
+          d2->data = new Parallel::gridseg;
+        }
+        else
+        {
+          s2->next = new MyList<Parallel::gridseg>;
+          s2 = s2->next;
+          d2->next = new MyList<Parallel::gridseg>;
+          d2 = d2->next;
+          s2->data = new Parallel::gridseg;
+          d2->data = new Parallel::gridseg;
+        }
+        // both new segments start from the common box; bounds may then be widened by half a spacing below
+        for (int i = 0; i < dim; i++)
+        {
+          double SH = sd->Bg->getdX(i), DH = dd->Bg->getdX(i);
+          s2->data->llb[i] = d2->data->llb[i] = llb[i];
+          s2->data->uub[i] = d2->data->uub[i] = uub[i];
+// using float method to count point, we do not need following consideration (2012 nov 17)
+#if 1
+          // A bound that is an odd number of half spacings away from a segment's own bound is moved outwards by half of that segment's spacing.
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+          // old code distinguish vertex and cell
+          //		   if     (int(2*(sd->uub[i]-uub[i])/SH+0.4)%2 == 1) s2->data->uub[i] = uub[i]-SH/2;
+          //		   else if(int(2*(dd->uub[i]-uub[i])/DH+0.4)%2 == 1) d2->data->uub[i] = uub[i]-DH/2;
+          //	           if     (int(2*(llb[i]-sd->llb[i])/SH+0.4)%2 == 1) s2->data->llb[i] = llb[i]+SH/2;
+          //		   else if(int(2*(llb[i]-dd->llb[i])/DH+0.4)%2 == 1) d2->data->llb[i] = llb[i]+DH/2;
+          // new code: here we concern much more about missing point, because overlapping domain has been guaranteed above
+          if (int(2 * (sd->uub[i] - uub[i]) / SH + 0.4) % 2 == 1)
+            s2->data->uub[i] = uub[i] + SH / 2;
+          else if (int(2 * (dd->uub[i] - uub[i]) / DH + 0.4) % 2 == 1)
+            d2->data->uub[i] = uub[i] + DH / 2;
+          if (int(2 * (llb[i] - sd->llb[i]) / SH + 0.4) % 2 == 1)
+            s2->data->llb[i] = llb[i] - SH / 2;
+          else if (int(2 * (llb[i] - dd->llb[i]) / DH + 0.4) % 2 == 1)
+            d2->data->llb[i] = llb[i] - DH / 2;
+          s2->data->shape[i] = int((s2->data->uub[i] - s2->data->llb[i]) / SH + 0.4) + 1;
+          d2->data->shape[i] = int((d2->data->uub[i] - d2->data->llb[i]) / DH + 0.4) + 1;
+#else
+#ifdef Cell
+          if (int(2 * (sd->uub[i] - uub[i]) / SH + 0.4) % 2 == 1)
+            s2->data->uub[i] = uub[i] + SH / 2;
+          else if (int(2 * (dd->uub[i] - uub[i]) / DH + 0.4) % 2 == 1)
+            d2->data->uub[i] = uub[i] + DH / 2;
+          if (int(2 * (llb[i] - sd->llb[i]) / SH + 0.4) % 2 == 1)
+            s2->data->llb[i] = llb[i] - SH / 2;
+          else if (int(2 * (llb[i] - dd->llb[i]) / DH + 0.4) % 2 == 1)
+            d2->data->llb[i] = llb[i] - DH / 2;
+          s2->data->shape[i] = int((s2->data->uub[i] - s2->data->llb[i]) / SH + 0.4);
+          d2->data->shape[i] = int((d2->data->uub[i] - d2->data->llb[i]) / DH + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+          // illb/iuub are carried over unchanged from the input segments
+#endif
+          s2->data->illb[i] = sd->illb[i];
+          d2->data->illb[i] = dd->illb[i];
+          s2->data->iuub[i] = sd->iuub[i];
+          d2->data->iuub[i] = dd->iuub[i];
+        }
+        s2->data->Bg = sd->Bg; // each output segment keeps the Block of its own input segment
+        s2->next = 0;
+        d2->data->Bg = dd->Bg;
+        d2->next = 0;
+      }
+      d = d->next;
+    }
+    s = s->next;
+  }
+}
+// data_packer, dir == PACK: fill 'data' with the values this rank must deliver to rank_in (copy, restrict or prolong)
+// data_packer, dir == UNPACK: copy values that came from rank_in out of 'data' into this rank's grids; returns the size in doubles
+int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<Parallel::gridseg> *dst, int rank_in, int dir,
+                          MyList<var> *VarLists /* source */, MyList<var> *VarListd /* target */, int Symmetry)
+{ // src and dst are walked in lockstep as matched pairs; with data == 0 nothing is copied and only the size is computed
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+  int DIM = dim;
+
+  if (dir != PACK && dir != UNPACK)
+  {
+    cout << "error dir " << dir << " for data_packer " << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+
+  int size_out = 0; // running offset into 'data' in doubles; also the return value
+
+  if (!src || !dst)
+    return size_out; // an empty segment list means there is nothing to exchange
+
+  MyList<var> *varls, *varld;
+
+  varls = VarLists;
+  varld = VarListd;
+  while (varls && varld) // advance both variable lists together to compare their lengths
+  {
+    varls = varls->next;
+    varld = varld->next;
+  }
+
+  if (varls || varld) // one list ended before the other
+  {
+    cout << "error in short data packer, var lists does not match." << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+
+  int type; /* 1 copy, 2 restrict, 3 prolong */ // decided once, from the levels of the first segment pair only
+  if (src->data->Bg->lev == dst->data->Bg->lev)
+    type = 1;
+  else if (src->data->Bg->lev > dst->data->Bg->lev)
+    type = 2;
+  else
+    type = 3;
+
+  while (src && dst)
+  {
+    if ((dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) || // source block is ours, target block is on rank_in
+        (dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank)) // source block is on rank_in, target block is ours
+    {
+      varls = VarLists;
+      varld = VarListd;
+      while (varls && varld) // variables are laid out one after another inside each segment's slot
+      {
+        if (data) // data == 0 is the size-query mode
+        {
+          if (dir == PACK)
+            switch (type)
+            {
+              // attention must be paid to the difference between src's llb,uub and dst's llb,uub
+            case 1:
+              f_copy(DIM, dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
+                     src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
+                     dst->data->llb, dst->data->uub);
+              break;
+            case 2:
+              f_restrict3(DIM, dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
+                          src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
+                          dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
+              break;
+            case 3:
+              f_prolong3(DIM, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
+                         dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
+                         dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
+            }
+          if (dir == UNPACK) // from target data to corresponding grid
+            f_copy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn],
+                   dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
+                   dst->data->llb, dst->data->uub);
+        }
+        size_out += dst->data->shape[0] * dst->data->shape[1] * dst->data->shape[2]; // one target-shaped box per variable, in both directions
+        varls = varls->next;
+        varld = varld->next;
+      }
+    }
+    dst = dst->next;
+    src = src->next;
+  }
+
+  return size_out;
+}
+int Parallel::data_packermix(double *data, MyList<Parallel::gridseg> *src, MyList<Parallel::gridseg> *dst, int rank_in, int dir,
+                             MyList<var> *VarLists /* source */, MyList<var> *VarListd /* target */, int Symmetry)
+{ // prolong-only counterpart of data_packer: PACK calls f_prolongcopy3, UNPACK calls f_prolongmix3; returns the size in doubles
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+  int DIM = dim;
+
+  if (dir != PACK && dir != UNPACK)
+  {
+    cout << "Parallel::data_packermix: error dir " << dir << " for data_packermix." << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+
+  int size_out = 0; // running offset into 'data' in doubles; also the return value
+
+  if (!src || !dst)
+    return size_out; // an empty segment list means there is nothing to exchange
+
+  MyList<var> *varls, *varld;
+
+  varls = VarLists;
+  varld = VarListd;
+  while (varls && varld) // advance both variable lists together to compare their lengths
+  {
+    varls = varls->next;
+    varld = varld->next;
+  }
+
+  if (varls || varld) // one list ended before the other
+  {
+    cout << "error in short data packer, var lists does not match." << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+
+  int type; /* 1 copy, 2 restrict, 3 prolong */ // decided once, from the levels of the first segment pair only
+  if (src->data->Bg->lev == dst->data->Bg->lev)
+    type = 1;
+  else if (src->data->Bg->lev > dst->data->Bg->lev)
+    type = 2;
+  else
+    type = 3;
+
+  if (type != 3) // this routine only handles the prolong case
+  {
+    cout << "Parallel::data_packermix: error type " << type << " for data_packermix." << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+
+  while (src && dst)
+  {
+    if ((dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) || // source block is ours, target block is on rank_in
+        (dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank)) // source block is on rank_in, target block is ours
+    {
+      varls = VarLists;
+      varld = VarListd;
+      while (varls && varld) // variables are laid out one after another inside each segment's slot
+      {
+        if (data) // data == 0 is the size-query mode
+        {
+          if (dir == PACK)
+            f_prolongcopy3(DIM, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
+                           dst->data->llb, dst->data->uub, src->data->shape, data + size_out,
+                           src->data->llb, src->data->uub, varls->data->SoA, Symmetry);
+          if (dir == UNPACK) // from target data to corresponding grid
+            f_prolongmix3(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn],
+                          src->data->llb, src->data->uub, src->data->shape, data + size_out,
+                          dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry, dst->data->illb, dst->data->iuub); // NOTE(review): SoA is read from the source variable although the data lands in the target variable - confirm intended
+        }
+        // the symmetry problem should be dealt in prolongcopy3,
+        // so we always have ghost_width for both sides
+        size_out += (src->data->shape[0] + 2 * ghost_width) * (src->data->shape[1] + 2 * ghost_width) * (src->data->shape[2] + 2 * ghost_width);
+        varls = varls->next;
+        varld = varld->next;
+      }
+    }
+    dst = dst->next;
+    src = src->next;
+  }
+
+  return size_out;
+}
+// transfer: carry out the exchange described by src[]/dst[] (one list of matched segment pairs per rank), at most one message per peer
+void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
+                        MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
+                        int Symmetry)
+{ // blocking: on return every posted request has completed and all received data has been unpacked
+  int myrank, cpusize;
+  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+  int node;
+
+  MPI_Request *reqs;
+  MPI_Status *stats;
+  reqs = new MPI_Request[2 * cpusize]; // at most one send and one receive is posted per rank
+  stats = new MPI_Status[2 * cpusize];
+  int req_no = 0;
+
+  double **send_data, **rec_data; // per-rank message buffers; an entry stays 0 when nothing is exchanged with that rank
+  send_data = new double *[cpusize];
+  rec_data = new double *[cpusize];
+  int length;
+
+  for (node = 0; node < cpusize; node++)
+  {
+    send_data[node] = rec_data[node] = 0;
+    if (node == myrank)
+    {
+      if (length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry)) // assignment intended: size query (data == 0)
+      {
+        rec_data[node] = new double[length]; // local data is packed straight into the receive slot, no MPI involved
+        if (!rec_data[node]) // NOTE(review): plain new reports failure by throwing, so this branch is presumably never taken
+        {
+          cout << "out of memory when new in short transfer, place 1" << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+        data_packer(rec_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
+      }
+    }
+    else
+    {
+      // send from this cpu to cpu#node
+      if (length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry)) // assignment intended: size query
+      {
+        send_data[node] = new double[length];
+        if (!send_data[node])
+        {
+          cout << "out of memory when new in short transfer, place 2" << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+        data_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
+        MPI_Isend((void *)send_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++); // message tag 1
+      }
+      // receive from cpu#node to this cpu
+      if (length = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry)) // size of what cpu#node will send us
+      {
+        rec_data[node] = new double[length];
+        if (!rec_data[node])
+        {
+          cout << "out of memory when new in short transfer, place 3" << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+        MPI_Irecv((void *)rec_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
+      }
+    }
+  }
+  // wait for all requests to complete
+  MPI_Waitall(req_no, reqs, stats);
+
+  for (node = 0; node < cpusize; node++) // unpack everything, including the locally packed slot for node == myrank
+    if (rec_data[node])
+      data_packer(rec_data[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
+
+  for (node = 0; node < cpusize; node++)
+  {
+    if (send_data[node])
+      delete[] send_data[node];
+    if (rec_data[node])
+      delete[] rec_data[node];
+  }
+
+  delete[] reqs;
+  delete[] stats;
+  delete[] send_data;
+  delete[] rec_data;
+}
+// transfermix: same exchange scheme as transfer, but sizes, packing and unpacking all go through data_packermix
+void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
+                           MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
+                           int Symmetry)
+{ // blocking: on return every posted request has completed and all received data has been unpacked
+  int myrank, cpusize;
+  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+  int node;
+
+  MPI_Request *reqs;
+  MPI_Status *stats;
+  reqs = new MPI_Request[2 * cpusize]; // at most one send and one receive is posted per rank
+  stats = new MPI_Status[2 * cpusize];
+  int req_no = 0;
+
+  double **send_data, **rec_data; // per-rank message buffers; an entry stays 0 when nothing is exchanged with that rank
+  send_data = new double *[cpusize];
+  rec_data = new double *[cpusize];
+  int length;
+
+  for (node = 0; node < cpusize; node++)
+  {
+    send_data[node] = rec_data[node] = 0;
+    if (node == myrank)
+    {
+      if (length = data_packermix(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry)) // assignment intended: size query (data == 0)
+      {
+        rec_data[node] = new double[length]; // local data is packed straight into the receive slot, no MPI involved
+        if (!rec_data[node]) // NOTE(review): plain new reports failure by throwing, so this branch is presumably never taken
+        {
+          cout << "out of memory when new in short transfer, place 1" << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+        data_packermix(rec_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
+      }
+    }
+    else
+    {
+      // send from this cpu to cpu#node
+      if (length = data_packermix(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry)) // assignment intended: size query
+      {
+        send_data[node] = new double[length];
+        if (!send_data[node])
+        {
+          cout << "out of memory when new in short transfer, place 2" << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+        data_packermix(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
+        MPI_Isend((void *)send_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++); // message tag 1
+      }
+      // receive from cpu#node to this cpu
+      if (length = data_packermix(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry)) // size of what cpu#node will send us
+      {
+        rec_data[node] = new double[length];
+        if (!rec_data[node])
+        {
+          cout << "out of memory when new in short transfer, place 3" << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+        MPI_Irecv((void *)rec_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
+      }
+    }
+  }
+  // wait for all requests to complete
+  MPI_Waitall(req_no, reqs, stats);
+
+  for (node = 0; node < cpusize; node++) // unpack everything, including the locally packed slot for node == myrank
+    if (rec_data[node])
+      data_packermix(rec_data[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
+
+  for (node = 0; node < cpusize; node++)
+  {
+    if (send_data[node])
+      delete[] send_data[node];
+    if (rec_data[node])
+      delete[] rec_data[node];
+  }
+
+  delete[] reqs;
+  delete[] stats;
+  delete[] send_data;
+  delete[] rec_data;
+}
+void Parallel::Sync(Patch *Pat, MyList<var> *VarList, int Symmetry)
+{ // patch inner synch: the ghost region of Pat is the target, the owned (non-ghost) parts are the source
+  int nprocs;
+  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+  // per-rank list heads: owned source regions and the matched transfer pairs
+  MyList<Parallel::gridseg> **owned = new MyList<Parallel::gridseg> *[nprocs];
+  MyList<Parallel::gridseg> **pair_src = new MyList<Parallel::gridseg> *[nprocs];
+  MyList<Parallel::gridseg> **pair_dst = new MyList<Parallel::gridseg> *[nprocs];
+
+  // target: ghost region only
+  MyList<Parallel::gridseg> *ghost = build_ghost_gsl(Pat);
+
+  for (int rank = 0; rank < nprocs; ++rank)
+  {
+    // source: the part without ghost points and do not extend
+    owned[rank] = build_owned_gsl0(Pat, rank);
+    // for pair_src[rank], data locate on cpu#rank; for pair_dst[rank] the data may locate on any node
+    build_gstl(owned[rank], ghost, &pair_src[rank], &pair_dst[rank]);
+  }
+
+  transfer(pair_src, pair_dst, VarList, VarList, Symmetry);
+
+  // release every temporary segment list, then the arrays of list heads
+  for (int rank = 0; rank < nprocs; ++rank)
+  {
+    if (owned[rank]) owned[rank]->destroyList();
+    if (pair_src[rank]) pair_src[rank]->destroyList();
+    if (pair_dst[rank]) pair_dst[rank]->destroyList();
+  }
+  if (ghost)
+    ghost->destroyList();
+
+  delete[] owned;
+  delete[] pair_src;
+  delete[] pair_dst;
+}
+void Parallel::Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
+{ // synchronise a patch list: first inside every patch, then the buffer regions between patches
+  // Patch inner Synch: one single-patch Sync (and therefore one transfer) per patch
+  for (MyList<Patch> *Pp = PatL; Pp; Pp = Pp->next)
+    Sync(Pp->data, VarList, Symmetry);
+
+  // Patch inter Synch
+  int nprocs;
+  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+  // per-rank list heads: owned source regions and the matched transfer pairs
+  MyList<Parallel::gridseg> **owned = new MyList<Parallel::gridseg> *[nprocs];
+  MyList<Parallel::gridseg> **pair_src = new MyList<Parallel::gridseg> *[nprocs];
+  MyList<Parallel::gridseg> **pair_dst = new MyList<Parallel::gridseg> *[nprocs];
+
+  // target: buffer region only
+  MyList<Parallel::gridseg> *buffer = build_buffer_gsl(PatL);
+
+  for (int rank = 0; rank < nprocs; ++rank)
+  {
+    // source: the part without ghost nor buffer points and do not extend
+    owned[rank] = build_owned_gsl(PatL, rank, 5, Symmetry);
+    // for pair_src[rank], data locate on cpu#rank
+    build_gstl(owned[rank], buffer, &pair_src[rank], &pair_dst[rank]);
+  }
+
+  transfer(pair_src, pair_dst, VarList, VarList, Symmetry);
+
+  // release every temporary segment list, then the arrays of list heads
+  for (int rank = 0; rank < nprocs; ++rank)
+  {
+    if (owned[rank])
+      owned[rank]->destroyList();
+    if (pair_src[rank])
+      pair_src[rank]->destroyList();
+    if (pair_dst[rank])
+      pair_dst[rank]->destroyList();
+  }
+  if (buffer)
+    buffer->destroyList();
+
+  delete[] owned;
+  delete[] pair_src;
+  delete[] pair_dst;
+}
+// Merged Sync: collect all intra-patch (ghost) and inter-patch (buffer) grid segment lists per rank,
+// then issue a single transfer() call instead of N+1 separate ones (N = number of patches in PatL).
+void Parallel::Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
+{
+  int cpusize;
+  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
+
+  MyList<Parallel::gridseg> **combined_src = new MyList<Parallel::gridseg> *[cpusize]; // combined_src[node]: all source segments for rank node
+  MyList<Parallel::gridseg> **combined_dst = new MyList<Parallel::gridseg> *[cpusize]; // combined_dst[node]: the matching target segments
+  for (int node = 0; node < cpusize; node++)
+    combined_src[node] = combined_dst[node] = 0;
+
+  // Phase A: Intra-patch ghost exchange segments
+  MyList<Patch> *Pp = PatL;
+  while (Pp)
+  {
+    Patch *Pat = Pp->data;
+    MyList<Parallel::gridseg> *dst_ghost = build_ghost_gsl(Pat); // targets: ghost region of this patch
+
+    for (int node = 0; node < cpusize; node++)
+    {
+      MyList<Parallel::gridseg> *src_owned = build_owned_gsl0(Pat, node); // sources for rank node
+      MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
+      build_gstl(src_owned, dst_ghost, &tsrc, &tdst);
+
+      if (tsrc)
+      {
+        if (combined_src[node])
+          combined_src[node]->catList(tsrc); // presumably appends tsrc, so it is later freed together with combined_src[node]
+        else
+          combined_src[node] = tsrc;
+      }
+      if (tdst)
+      {
+        if (combined_dst[node])
+          combined_dst[node]->catList(tdst); // src and dst lists must grow in the same order to stay matched pair by pair
+        else
+          combined_dst[node] = tdst;
+      }
+
+      if (src_owned)
+        src_owned->destroyList(); // only the tsrc/tdst lists are kept from here on
+    }
+
+    if (dst_ghost)
+      dst_ghost->destroyList();
+
+    Pp = Pp->next;
+  }
+
+  // Phase B: Inter-patch buffer exchange segments
+  MyList<Parallel::gridseg> *dst_buffer = build_buffer_gsl(PatL); // targets: buffer regions of the whole patch list
+  for (int node = 0; node < cpusize; node++)
+  {
+    MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatL, node, 5, Symmetry); // same arguments as in Sync(MyList<Patch> *, ...)
+    MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
+    build_gstl(src_owned, dst_buffer, &tsrc, &tdst);
+
+    if (tsrc)
+    {
+      if (combined_src[node])
+        combined_src[node]->catList(tsrc);
+      else
+        combined_src[node] = tsrc;
+    }
+    if (tdst)
+    {
+      if (combined_dst[node])
+        combined_dst[node]->catList(tdst);
+      else
+        combined_dst[node] = tdst;
+    }
+
+    if (src_owned)
+      src_owned->destroyList();
+  }
+  if (dst_buffer)
+    dst_buffer->destroyList();
+
+  // Phase C: Single transfer
+  transfer(combined_src, combined_dst, VarList, VarList, Symmetry);
+
+  // Phase D: Cleanup
+  for (int node = 0; node < cpusize; node++)
+  {
+    if (combined_src[node])
+      combined_src[node]->destroyList();
+    if (combined_dst[node])
+      combined_dst[node]->destroyList();
+  }
+  delete[] combined_src;
+  delete[] combined_dst;
+}
+// SyncCache constructor: an empty, invalid cache; every pointer member is 0 and every size is 0
+Parallel::SyncCache::SyncCache()
+    : valid(false), cpusize(0),
+      combined_src(0), combined_dst(0), send_lengths(0), recv_lengths(0),
+      send_bufs(0), recv_bufs(0), send_buf_caps(0), recv_buf_caps(0),
+      reqs(0), stats(0), max_reqs(0), lengths_valid(false)
+{
+}
+// SyncCache invalidate: free the cached grid segment lists and zero the lengths, but keep the buffers
+void Parallel::SyncCache::invalidate()
+{
+  if (valid) // otherwise there is nothing cached to drop
+  {
+    for (int node = 0; node < cpusize; ++node)
+    {
+      if (combined_src[node]) combined_src[node]->destroyList();
+      if (combined_dst[node]) combined_dst[node]->destroyList();
+      combined_src[node] = 0;
+      combined_dst[node] = 0;
+      send_lengths[node] = 0;
+      recv_lengths[node] = 0;
+    }
+    valid = lengths_valid = false; // both the lists and the lengths must be rebuilt on next use
+  }
+}
+// SyncCache destroy: free everything - segment lists, per-rank buffers and all bookkeeping arrays
+void Parallel::SyncCache::destroy()
+{
+  invalidate(); // frees the cached segment lists while the arrays holding them still exist
+  // per-rank message buffers first; cpusize is only reset at the very end
+  if (send_bufs)
+    for (int node = 0; node < cpusize; ++node)
+      delete[] send_bufs[node]; // delete[] on a null pointer is a no-op
+  if (recv_bufs)
+    for (int node = 0; node < cpusize; ++node)
+      delete[] recv_bufs[node];
+  delete[] send_bufs;
+  delete[] recv_bufs;
+  delete[] combined_src;
+  delete[] combined_dst;
+  delete[] send_lengths;
+  delete[] recv_lengths;
+  delete[] send_buf_caps;
+  delete[] recv_buf_caps;
+  delete[] reqs;
+  delete[] stats;
+  // null the pointers and zero the sizes so nothing dangling is left behind
+  combined_src = combined_dst = 0; send_lengths = recv_lengths = 0;
+  send_buf_caps = recv_buf_caps = 0; send_bufs = recv_bufs = 0;
+  reqs = 0; stats = 0; cpusize = 0; max_reqs = 0;
+}
+// transfer_cached: same exchange as transfer(), but buffers, lengths and request arrays live in 'cache' and are reused across calls
+void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
+                               MyList<var> *VarList1, MyList<var> *VarList2,
+                               int Symmetry, SyncCache &cache)
+{ // the cache arrays are not allocated here; Sync_cached allocates them before calling this routine
+  int myrank;
+  MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  int cpusize = cache.cpusize;
+
+  int req_no = 0;
+  int node;
+
+  for (node = 0; node < cpusize; node++)
+  {
+    if (node == myrank)
+    {
+      int length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); // data == 0: size query only
+      cache.recv_lengths[node] = length;
+      if (length > 0)
+      {
+        if (length > cache.recv_buf_caps[node]) // buffers only ever grow; smaller messages reuse the existing allocation
+        {
+          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
+          cache.recv_bufs[node] = new double[length];
+          cache.recv_buf_caps[node] = length;
+        }
+        data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); // local data goes straight into the receive slot
+      }
+    }
+    else
+    {
+      // send
+      int slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
+      cache.send_lengths[node] = slength;
+      if (slength > 0)
+      {
+        if (slength > cache.send_buf_caps[node])
+        {
+          if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
+          cache.send_bufs[node] = new double[slength];
+          cache.send_buf_caps[node] = slength;
+        }
+        data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
+        MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++); // message tag 1
+      }
+      // recv
+      int rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); // size of what cpu#node will send us
+      cache.recv_lengths[node] = rlength;
+      if (rlength > 0)
+      {
+        if (rlength > cache.recv_buf_caps[node])
+        {
+          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
+          cache.recv_bufs[node] = new double[rlength];
+          cache.recv_buf_caps[node] = rlength;
+        }
+        MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
+      }
+    }
+  }
+
+  MPI_Waitall(req_no, cache.reqs, cache.stats); // at most 2 requests per rank were posted
+
+  for (node = 0; node < cpusize; node++) // a kept buffer may be stale, so the length recorded in this call decides whether to unpack
+    if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0)
+      data_packer(cache.recv_bufs[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
+}
+// Sync_cached: build the combined grid segment lists while cache.valid is false, reuse them otherwise; always ends with transfer_cached
+void Parallel::Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache)
+{
+  if (!cache.valid) // first use, or the cache was invalidated
+  {
+    int cpusize;
+    MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
+    cache.cpusize = cpusize;
+
+    // Allocate cache arrays if needed
+    if (!cache.combined_src) // all arrays are allocated together, so one pointer stands for the whole set
+    {
+      cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
+      cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
+      cache.send_lengths = new int[cpusize];
+      cache.recv_lengths = new int[cpusize];
+      cache.send_bufs = new double *[cpusize];
+      cache.recv_bufs = new double *[cpusize];
+      cache.send_buf_caps = new int[cpusize];
+      cache.recv_buf_caps = new int[cpusize];
+      for (int i = 0; i < cpusize; i++)
+      {
+        cache.send_bufs[i] = cache.recv_bufs[i] = 0; // buffers are allocated on demand by the transfer step
+        cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
+      }
+      cache.max_reqs = 2 * cpusize; // at most one send and one receive per rank
+      cache.reqs = new MPI_Request[cache.max_reqs];
+      cache.stats = new MPI_Status[cache.max_reqs];
+    }
+
+    for (int node = 0; node < cpusize; node++)
+    {
+      cache.combined_src[node] = cache.combined_dst[node] = 0;
+      cache.send_lengths[node] = cache.recv_lengths[node] = 0;
+    }
+
+    // Build intra-patch segments (same as Sync_merged Phase A)
+    MyList<Patch> *Pp = PatL;
+    while (Pp)
+    {
+      Patch *Pat = Pp->data;
+      MyList<Parallel::gridseg> *dst_ghost = build_ghost_gsl(Pat); // targets: ghost region of this patch
+      for (int node = 0; node < cpusize; node++)
+      {
+        MyList<Parallel::gridseg> *src_owned = build_owned_gsl0(Pat, node); // sources for rank node
+        MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
+        build_gstl(src_owned, dst_ghost, &tsrc, &tdst);
+        if (tsrc)
+        {
+          if (cache.combined_src[node])
+            cache.combined_src[node]->catList(tsrc); // presumably appends tsrc; the cache then owns it
+          else
+            cache.combined_src[node] = tsrc;
+        }
+        if (tdst)
+        {
+          if (cache.combined_dst[node])
+            cache.combined_dst[node]->catList(tdst); // src and dst lists must grow in the same order to stay matched
+          else
+            cache.combined_dst[node] = tdst;
+        }
+        if (src_owned) src_owned->destroyList();
+      }
+      if (dst_ghost) dst_ghost->destroyList();
+      Pp = Pp->next;
+    }
+
+    // Build inter-patch segments (same as Sync_merged Phase B)
+    MyList<Parallel::gridseg> *dst_buffer = build_buffer_gsl(PatL); // targets: buffer regions of the whole patch list
+    for (int node = 0; node < cpusize; node++)
+    {
+      MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatL, node, 5, Symmetry);
+      MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
+      build_gstl(src_owned, dst_buffer, &tsrc, &tdst);
+      if (tsrc)
+      {
+        if (cache.combined_src[node])
+          cache.combined_src[node]->catList(tsrc);
+        else
+          cache.combined_src[node] = tsrc;
+      }
+      if (tdst)
+      {
+        if (cache.combined_dst[node])
+          cache.combined_dst[node]->catList(tdst);
+        else
+          cache.combined_dst[node] = tdst;
+      }
+      if (src_owned) src_owned->destroyList();
+    }
+    if (dst_buffer) dst_buffer->destroyList();
+
+    cache.valid = true; // the lists stay in the cache until SyncCache::invalidate() or destroy()
+  }
+
+  // Use cached lists with buffer-reusing transfer
+  transfer_cached(cache.combined_src, cache.combined_dst, VarList, VarList, Symmetry, cache);
+}
+// Sync_start: pack and post MPI_Isend/Irecv, return immediately; the requests stay pending in cache.reqs until Sync_finish
+void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
+                          SyncCache &cache, AsyncSyncState &state)
+{ // NOTE(review): requests and buffers belong to 'cache', so one cache presumably supports only one exchange in flight - confirm with callers
+  // Ensure cache is built
+  if (!cache.valid)
+  {
+    // Build cache (same logic as Sync_cached)
+    int cpusize;
+    MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
+    cache.cpusize = cpusize;
+
+    if (!cache.combined_src) // all arrays are allocated together, so one pointer stands for the whole set
+    {
+      cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
+      cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
+      cache.send_lengths = new int[cpusize];
+      cache.recv_lengths = new int[cpusize];
+      cache.send_bufs = new double *[cpusize];
+      cache.recv_bufs = new double *[cpusize];
+      cache.send_buf_caps = new int[cpusize];
+      cache.recv_buf_caps = new int[cpusize];
+      for (int i = 0; i < cpusize; i++)
+      {
+        cache.send_bufs[i] = cache.recv_bufs[i] = 0; // buffers are allocated on demand further down
+        cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
+      }
+      cache.max_reqs = 2 * cpusize; // at most one send and one receive per rank
+      cache.reqs = new MPI_Request[cache.max_reqs];
+      cache.stats = new MPI_Status[cache.max_reqs];
+    }
+
+    for (int node = 0; node < cpusize; node++)
+    {
+      cache.combined_src[node] = cache.combined_dst[node] = 0;
+      cache.send_lengths[node] = cache.recv_lengths[node] = 0;
+    }
+
+    MyList<Patch> *Pp = PatL; // intra-patch segments: ghost region of every patch
+    while (Pp)
+    {
+      Patch *Pat = Pp->data;
+      MyList<Parallel::gridseg> *dst_ghost = build_ghost_gsl(Pat);
+      for (int node = 0; node < cpusize; node++)
+      {
+        MyList<Parallel::gridseg> *src_owned = build_owned_gsl0(Pat, node);
+        MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
+        build_gstl(src_owned, dst_ghost, &tsrc, &tdst);
+        if (tsrc)
+        {
+          if (cache.combined_src[node])
+            cache.combined_src[node]->catList(tsrc); // presumably appends tsrc; the cache then owns it
+          else
+            cache.combined_src[node] = tsrc;
+        }
+        if (tdst)
+        {
+          if (cache.combined_dst[node])
+            cache.combined_dst[node]->catList(tdst); // src and dst lists must grow in the same order to stay matched
+          else
+            cache.combined_dst[node] = tdst;
+        }
+        if (src_owned) src_owned->destroyList();
+      }
+      if (dst_ghost) dst_ghost->destroyList();
+      Pp = Pp->next;
+    }
+
+    MyList<Parallel::gridseg> *dst_buffer = build_buffer_gsl(PatL); // inter-patch segments: buffer regions of the patch list
+    for (int node = 0; node < cpusize; node++)
+    {
+      MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatL, node, 5, Symmetry);
+      MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
+      build_gstl(src_owned, dst_buffer, &tsrc, &tdst);
+      if (tsrc)
+      {
+        if (cache.combined_src[node])
+          cache.combined_src[node]->catList(tsrc);
+        else
+          cache.combined_src[node] = tsrc;
+      }
+      if (tdst)
+      {
+        if (cache.combined_dst[node])
+          cache.combined_dst[node]->catList(tdst);
+        else
+          cache.combined_dst[node] = tdst;
+      }
+      if (src_owned) src_owned->destroyList();
+    }
+    if (dst_buffer) dst_buffer->destroyList();
+    cache.valid = true;
+  }
+
+  // Now pack and post async MPI operations
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  int cpusize = cache.cpusize;
+  state.req_no = 0;
+  state.active = true; // tells Sync_finish that there is an exchange to complete
+
+  MyList<Parallel::gridseg> **src = cache.combined_src;
+  MyList<Parallel::gridseg> **dst = cache.combined_dst;
+
+  for (int node = 0; node < cpusize; node++)
+  {
+    if (node == myrank)
+    {
+      int length;
+      if (!cache.lengths_valid) { // lengths are computed once and then reused; data_packer sizes grow with the number of variables in VarList,
+        length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
+        cache.recv_lengths[node] = length;
+      } else { // so reusing them is only correct if every call on this cache passes a VarList of the same length
+        length = cache.recv_lengths[node];
+      }
+      if (length > 0)
+      {
+        if (length > cache.recv_buf_caps[node]) // buffers only ever grow
+        {
+          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
+          cache.recv_bufs[node] = new double[length];
+          cache.recv_buf_caps[node] = length;
+        }
+        data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry); // local data goes straight into the receive slot
+      }
+    }
+    else
+    {
+      int slength;
+      if (!cache.lengths_valid) {
+        slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
+        cache.send_lengths[node] = slength;
+      } else {
+        slength = cache.send_lengths[node];
+      }
+      if (slength > 0)
+      {
+        if (slength > cache.send_buf_caps[node])
+        {
+          if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
+          cache.send_bufs[node] = new double[slength];
+          cache.send_buf_caps[node] = slength;
+        }
+        data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
+        MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++); // tag 2, while transfer() and transfer_cached() use tag 1
+      }
+      int rlength;
+      if (!cache.lengths_valid) {
+        rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList, VarList, Symmetry); // size of what cpu#node will send us
+        cache.recv_lengths[node] = rlength;
+      } else {
+        rlength = cache.recv_lengths[node];
+      }
+      if (rlength > 0)
+      {
+        if (rlength > cache.recv_buf_caps[node])
+        {
+          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
+          cache.recv_bufs[node] = new double[rlength];
+          cache.recv_buf_caps[node] = rlength;
+        }
+        MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
+      }
+    }
+  }
+  cache.lengths_valid = true; // reset only by SyncCache::invalidate()
+}
+// Sync_finish: second half of an asynchronous synchronisation.
+// Does nothing unless state.active is set. Otherwise it waits for the
+// state.req_no requests stored in cache.reqs, unpacks every non-empty
+// receive buffer held in the cache, and clears state.active.
+//   cache    - cached segment lists, buffers and MPI request/status arrays
+//   state    - bookkeeping of the pending exchange (active flag, request count)
+//   VarList  - variables to unpack (used as both source and target list)
+//   Symmetry - forwarded unchanged to data_packer
+void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state,
+                           MyList<var> *VarList, int Symmetry)
+{
+  if (state.active)
+  {
+    // block until all outstanding requests have completed
+    MPI_Waitall(state.req_no, cache.reqs, cache.stats);
+
+    const int nproc = cache.cpusize;
+    for (int rank = 0; rank < nproc; ++rank)
+    {
+      double *incoming = cache.recv_bufs[rank];
+      // skip ranks for which no buffer exists or nothing is expected
+      if (!incoming || cache.recv_lengths[rank] <= 0)
+        continue;
+      data_packer(incoming, cache.combined_src[rank], cache.combined_dst[rank], rank, UNPACK, VarList, VarList, Symmetry);
+    }
+
+    state.active = false;
+  }
+}
+// collect buffer grid segments or blocks for the periodic boundary condition of given patch
+// ---------------------------------------------------
+// |con |                                       |con |
+// |ner |                PhysBD                 |ner |
+// |-------------------------------------------------|
+// |    |                                       |    |
+// |Phy |                                       |Phy |
+// |sBD |                                       |BD  |
+// |    |                                       |    |
+// |    |                                       |    |
+// |    |                                       |    |
+// |-------------------------------------------------|
+// |con |               PhysBD                  |con |
+// |ner |                                       |ner |
+// ---------------------------------------------------
+// first-order derivatives do not need corner information,
+// but second-order derivatives do!
+/* the following code does not include conner part
+MyList<Parallel::gridseg> *Parallel::build_PhysBD_gsl(Patch *Pat)
+{
+       MyList<Parallel::gridseg> *cgsl,*gsc,*gsb=0,*p;
+       gsc = build_ghost_gsl(Pat);
+       for(int i=0;i<dim;i++)
+       {
+         double DH = gsc->data->Bg->getdX(i);
+// lower boundary
+         if(gsb)
+   {
+          p = new MyList<Parallel::gridseg>;
+          p->data = new Parallel::gridseg;
+          p->next=gsb;
+    gsb=p;
+   }
+   else
+   {
+          gsb = new MyList<Parallel::gridseg>;
+          gsb->data = new Parallel::gridseg;
+          gsb->next=0;
+   }
+         for(int j=0;j<dim;j++)
+   {
+           if(i == j)
+     {
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+             gsb->data->llb[i] = Pat->bbox[i]-ghost_width*DH;
+             gsb->data->uub[i] = Pat->bbox[i]-DH;
+#else
+#ifdef Cell
+             gsb->data->llb[i] = Pat->bbox[i]-ghost_width*DH;
+             gsb->data->uub[i] = Pat->bbox[i];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+             gsb->data->shape[i] = ghost_width;
+     }
+     else
+     {
+             gsb->data->llb[j] = Pat->bbox[j];
+             gsb->data->uub[j] = Pat->bbox[j+dim];
+             gsb->data->shape[j] = Pat->shape[j];
+     }
+   }
+   gsb->data->Bg = 0;  //vertual grid segment
+// upper boundary
+         p = new MyList<Parallel::gridseg>;
+         p->data = new Parallel::gridseg;
+         p->next=gsb;
+   gsb=p;
+         for(int j=0;j<dim;j++)
+   {
+           if(i == j)
+     {
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+             gsb->data->llb[i] = Pat->bbox[i+dim]+DH;
+             gsb->data->uub[i] = Pat->bbox[i+dim]+ghost_width*DH;
+#else
+#ifdef Cell
+             gsb->data->llb[i] = Pat->bbox[i+dim];
+             gsb->data->uub[i] = Pat->bbox[i+dim]+ghost_width*DH;
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+             gsb->data->shape[i] = ghost_width;
+     }
+     else
+     {
+             gsb->data->llb[j] = Pat->bbox[j];
+             gsb->data->uub[j] = Pat->bbox[j+dim];
+             gsb->data->shape[j] = Pat->shape[j];
+     }
+   }
+   gsb->data->Bg = 0;  //vertual grid segment
+       }
+
+       cgsl = gsl_and(gsc,gsb);
+
+       gsc->destroyList();
+       gsb->destroyList();
+
+       return cgsl;
+}
+*/
+// build_PhysBD_gsl (variant that includes the corner parts):
+// collect the grid segments of Pat that remain after the patch's own
+// coordinate box (Pat->bbox) is subtracted from the complete segment list,
+// then cut them with divide_gsl so every piece is entirely inside or
+// entirely outside the patch's coordinate range.
+//   Pat     - patch whose outside (periodic boundary) segments are wanted
+//   returns - newly built list owned by the caller; may be 0 when nothing
+//             remains after the subtraction
+MyList<Parallel::gridseg> *Parallel::build_PhysBD_gsl(Patch *Pat)
+{
+  // all segments of the patch as produced by build_complete_gsl
+  MyList<Parallel::gridseg> *gsc = build_complete_gsl(Pat);
+
+  // one virtual segment (Bg == 0) that covers exactly the patch box
+  MyList<Parallel::gridseg> *gsb = new MyList<Parallel::gridseg>;
+  gsb->data = new Parallel::gridseg;
+  gsb->next = 0;
+  gsb->data->Bg = 0;
+
+  for (int j = 0; j < dim; j++)
+  {
+    gsb->data->llb[j] = Pat->bbox[j];
+    gsb->data->uub[j] = Pat->bbox[j + dim];
+    gsb->data->shape[j] = Pat->shape[j];
+  }
+
+  // presumably gsc with the patch box removed - see gsl_subtract
+  MyList<Parallel::gridseg> *p = gsl_subtract(gsc, gsb);
+
+  // the intermediate lists are no longer needed; guard against an empty
+  // list the same way the callers of this routine do
+  if (gsc)
+    gsc->destroyList();
+  gsb->destroyList();
+
+  MyList<Parallel::gridseg> *cgsl = divide_gsl(p, Pat);
+
+  // p is empty when nothing lies outside the patch box
+  if (p)
+    p->destroyList();
+
+  return cgsl;
+}
+// divide_gsl: apply divide_gs to every node of the list p and chain the
+// resulting pieces, in order, into one list.
+//   p       - list of grid segments to cut (not modified)
+//   Pat     - patch whose coordinate range defines the cuts
+//   returns - head of the concatenated piece list, or 0 when p is empty
+MyList<Parallel::gridseg> *Parallel::divide_gsl(MyList<Parallel::gridseg> *p, Patch *Pat)
+{
+  MyList<Parallel::gridseg> *pieces = 0;
+
+  for (; p; p = p->next)
+  {
+    MyList<Parallel::gridseg> *split = divide_gs(p, Pat);
+    if (!pieces)
+      pieces = split; // first batch becomes the head
+    else
+      pieces->catList(split);
+  }
+
+  return pieces;
+}
+// Divide the grid segment p->data into pieces that each lie either entirely outside or entirely inside the coordinate
+// range Pat->bbox (up to 3 pieces per dimension, combined over all dimensions). Useful for the periodic boundary condition.
+MyList<Parallel::gridseg> *Parallel::divide_gs(MyList<Parallel::gridseg> *p, Patch *Pat)
+{
+  double DH[dim]; // grid spacing of the segment's block, one value per dimension
+  for (int i = 0; i < dim; i++)
+  {
+    DH[i] = p->data->Bg->getdX(i); // dereferences Bg, so p->data->Bg must not be null here
+  }
+
+  int num[dim]; // number of pieces (1, 2 or 3) the segment is cut into along each dimension
+  double llb[3][dim], uub[3][dim]; // llb[k][i] / uub[k][i]: lower / upper bound of piece k along dimension i
+  for (int i = 0; i < dim; i++)
+  {
+    if (p->data->llb[i] < Pat->bbox[i] - DH[i] / 2) // segment starts below the patch's lower bound (half-spacing tolerance)
+    {
+      if (p->data->uub[i] > Pat->bbox[i + dim] + DH[i] / 2) // ...and ends above the upper bound: pieces below / inside / above
+      {
+        num[i] = 3;
+        llb[0][i] = p->data->llb[i];
+        llb[1][i] = Pat->bbox[i];
+        uub[1][i] = Pat->bbox[i + dim];
+        uub[2][i] = p->data->uub[i];
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        uub[0][i] = Pat->bbox[i] - DH[i]; // Vertex: outside pieces stop/start one spacing away from the patch bound
+        llb[2][i] = Pat->bbox[i + dim] + DH[i];
+#else
+#ifdef Cell
+        uub[0][i] = Pat->bbox[i]; // Cell: neighbouring pieces share the patch bound coordinate
+        llb[2][i] = Pat->bbox[i + dim];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      }
+      else if (p->data->uub[i] > Pat->bbox[i] + DH[i] / 2) // ...and ends inside the patch range: pieces below / inside
+      {
+        num[i] = 2;
+        llb[0][i] = p->data->llb[i];
+        llb[1][i] = Pat->bbox[i];
+        uub[1][i] = p->data->uub[i];
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        uub[0][i] = Pat->bbox[i] - DH[i];
+#else
+#ifdef Cell
+        uub[0][i] = Pat->bbox[i];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      }
+      else // segment lies completely below the patch range: keep it whole
+      {
+        num[i] = 1;
+        llb[0][i] = p->data->llb[i];
+        uub[0][i] = p->data->uub[i];
+      }
+    }
+    else if (p->data->llb[i] < Pat->bbox[i + dim] - DH[i] / 2) // segment starts inside the patch range
+    {
+      if (p->data->uub[i] > Pat->bbox[i + dim] + DH[i] / 2) // ...and ends above the upper bound: pieces inside / above
+      {
+        num[i] = 2;
+        llb[0][i] = p->data->llb[i];
+        uub[0][i] = Pat->bbox[i + dim];
+        uub[1][i] = p->data->uub[i];
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        llb[1][i] = Pat->bbox[i + dim] + DH[i];
+#else
+#ifdef Cell
+        llb[1][i] = Pat->bbox[i + dim];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      }
+      else // segment lies completely inside the patch range: keep it whole
+      {
+        num[i] = 1;
+        llb[0][i] = p->data->llb[i];
+        uub[0][i] = p->data->uub[i];
+      }
+    }
+    else // segment starts at or above the patch's upper bound: keep it whole
+    {
+      num[i] = 1;
+      llb[0][i] = p->data->llb[i];
+      uub[0][i] = p->data->uub[i];
+    }
+  }
+  MyList<Parallel::gridseg> *cgsl = 0, *gg; // cgsl: head of the result list; gg: piece currently being built
+  int NN = 1; // total number of pieces = product of num[i] over all dimensions
+  for (int i = 0; i < dim; i++)
+    NN = NN * num[i];
+
+  for (int i = 0; i < NN; i++)
+  {
+    int ind[dim]; // per-dimension piece index belonging to flat piece number i
+    getarrayindex(dim, num, ind, i); // presumably decodes i into ind[] using the extents num[] - confirm against getarrayindex
+    gg = clone_gsl(p, true); // copy taken from p (presumably only its first node - confirm); bounds and shape are overwritten below
+    for (int k = 0; k < dim; k++)
+    {
+      gg->data->llb[k] = llb[ind[k]][k];
+      gg->data->uub[k] = uub[ind[k]][k];
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+      gg->data->shape[k] = int((uub[ind[k]][k] - llb[ind[k]][k]) / DH[k] + 0.4) + 1; // Vertex: intervals + 1; the 0.4 presumably absorbs round-off before truncation
+#else
+#ifdef Cell
+      gg->data->shape[k] = int((uub[ind[k]][k] - llb[ind[k]][k]) / DH[k] + 0.4); // Cell: one entry per interval
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+    }
+
+    if (cgsl)
+      cgsl->catList(gg);
+    else
+      cgsl = gg;
+  }
+
+  return cgsl; // caller owns the returned list
+}
+// For every (src, dst) pair: shift the dst bounds that lie outside Pat->bbox by one patch length (the periodic "mod"), intersect with src, and for each overlap append one node to *out_src (overlap) and one to *out_dst (same region shifted back).
+void Parallel::build_PhysBD_gstl(Patch *Pat, MyList<Parallel::gridseg> *srci, MyList<Parallel::gridseg> *dsti,
+                                 MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst)
+{
+  *out_src = *out_dst = 0; // both outputs stay 0 when no overlap is found
+
+  if (!srci || !dsti)
+    return;
+
+  MyList<Parallel::gridseg> *s, *d; // cursors over the input lists
+  MyList<Parallel::gridseg> *s2, *d2; // tails of the two output lists; assigned when the first overlap is found
+
+  double llb[dim], uub[dim]; // bounds of the current overlap, expressed in src coordinates
+
+  s = srci;
+  while (s)
+  {
+    Parallel::gridseg *sd = s->data;
+    d = dsti;
+    while (d)
+    {
+      Parallel::gridseg *dd = d->data;
+      bool flag = true; // cleared as soon as one dimension shows no overlap
+      for (int i = 0; i < dim; i++)
+      {
+        double SH = sd->Bg->getdX(i), DH = dd->Bg->getdX(i); // grid spacing of the src / dst block; both Bg must be non-null
+        if (!feq(SH, DH, SH / 2)) // src and dst are required to have the same grid spacing
+        {
+          cout << "Parallel::build_PhysBD_gstl meets different grid space SH = " << SH << ", DH = " << DH << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+        // we assume dst and src lie on the same Patch; a dst bound outside the patch range is shifted by the patch length bbox[dim+i]-bbox[i]
+        if (dd->llb[i] < Pat->bbox[i])
+          llb[i] = Mymax(sd->llb[i], dd->llb[i] + Pat->bbox[dim + i] - Pat->bbox[i]);
+        else if (dd->llb[i] > Pat->bbox[i + dim]) // NOTE(review): strict test - in the Cell build divide_gs yields llb == bbox[i+dim] (and uub == bbox[i]) for outside pieces, which are then not shifted; confirm intended
+          llb[i] = Mymax(sd->llb[i], dd->llb[i] - Pat->bbox[dim + i] + Pat->bbox[i]);
+        else
+          llb[i] = Mymax(sd->llb[i], dd->llb[i]);
+
+        if (dd->uub[i] < Pat->bbox[i])
+          uub[i] = Mymin(sd->uub[i], dd->uub[i] + Pat->bbox[dim + i] - Pat->bbox[i]);
+        else if (dd->uub[i] > Pat->bbox[dim + i])
+          uub[i] = Mymin(sd->uub[i], dd->uub[i] - Pat->bbox[dim + i] + Pat->bbox[i]);
+        else
+          uub[i] = Mymin(sd->uub[i], dd->uub[i]);
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        if (llb[i] > uub[i] + SH / 2)
+        {
+          flag = false;
+          break;
+        } // Vertex: llb == uub (within half a spacing) still counts as overlap - special for isolated point
+#else
+#ifdef Cell
+        if (llb[i] > uub[i])
+        {
+          flag = false;
+          break;
+        }
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      }
+
+      if (flag)
+      {
+        if (!(*out_src)) // first overlap: create the heads of both output lists
+        {
+          *out_src = s2 = new MyList<Parallel::gridseg>;
+          *out_dst = d2 = new MyList<Parallel::gridseg>;
+          s2->data = new Parallel::gridseg;
+          d2->data = new Parallel::gridseg;
+        }
+        else // later overlaps: append one node to each list so the two lists stay paired
+        {
+          s2->next = new MyList<Parallel::gridseg>;
+          s2 = s2->next;
+          d2->next = new MyList<Parallel::gridseg>;
+          d2 = d2->next;
+          s2->data = new Parallel::gridseg;
+          d2->data = new Parallel::gridseg;
+        }
+
+        for (int i = 0; i < dim; i++)
+        {
+          double SH = sd->Bg->getdX(i), DH = dd->Bg->getdX(i);
+          s2->data->llb[i] = llb[i]; // the src node stores the overlap as computed
+          s2->data->uub[i] = uub[i];
+
+          if (dd->llb[i] < Pat->bbox[i]) // the dst node undoes the shift applied above, using the same tests on dd
+            d2->data->llb[i] = llb[i] - Pat->bbox[dim + i] + Pat->bbox[i];
+          else if (dd->llb[i] > Pat->bbox[i + dim])
+            d2->data->llb[i] = llb[i] + Pat->bbox[dim + i] - Pat->bbox[i];
+          else
+            d2->data->llb[i] = llb[i];
+
+          if (dd->uub[i] < Pat->bbox[i])
+            d2->data->uub[i] = uub[i] - Pat->bbox[dim + i] + Pat->bbox[i];
+          else if (dd->uub[i] > Pat->bbox[dim + i])
+            d2->data->uub[i] = uub[i] + Pat->bbox[dim + i] - Pat->bbox[i];
+          else
+            d2->data->uub[i] = uub[i];
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+          s2->data->shape[i] = int((s2->data->uub[i] - s2->data->llb[i]) / SH + 0.4) + 1; // Vertex: intervals + 1
+          d2->data->shape[i] = int((d2->data->uub[i] - d2->data->llb[i]) / DH + 0.4) + 1;
+#else
+#ifdef Cell
+          s2->data->shape[i] = int((s2->data->uub[i] - s2->data->llb[i]) / SH + 0.4); // Cell: one entry per interval
+          d2->data->shape[i] = int((d2->data->uub[i] - d2->data->llb[i]) / DH + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+        }
+        s2->data->Bg = sd->Bg; // the new nodes refer to the same blocks as the inputs
+        s2->next = 0;
+        d2->data->Bg = dd->Bg;
+        d2->next = 0;
+      }
+      d = d->next;
+    }
+    s = s->next;
+  }
+}
+// PeriodicBD: fill the periodic-boundary segments of Pat.
+// The target segments come from build_PhysBD_gsl; for every rank the owned
+// interior segments (build_owned_gsl0) are matched against them with
+// build_PhysBD_gstl and the resulting source/target pairs are handed to
+// transfer(). All temporary lists are released before returning.
+//   Pat      - patch to update
+//   VarList  - variables to copy (same list used as source and target)
+//   Symmetry - forwarded unchanged to transfer()
+void Parallel::PeriodicBD(Patch *Pat, MyList<var> *VarList, int Symmetry)
+{
+  typedef MyList<Parallel::gridseg> SegList;
+
+  int nproc;
+  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+  // one list head per rank
+  SegList **owned = new SegList *[nproc];
+  SegList **xfer_from = new SegList *[nproc];
+  SegList **xfer_to = new SegList *[nproc];
+
+  SegList *target = build_PhysBD_gsl(Pat);
+  for (int rank = 0; rank < nproc; ++rank)
+  {
+    // the part without ghost points, not extended
+    owned[rank] = build_owned_gsl0(Pat, rank);
+    // entry [rank] describes data located on cpu#rank
+    build_PhysBD_gstl(Pat, owned[rank], target, &xfer_from[rank], &xfer_to[rank]);
+  }
+
+  transfer(xfer_from, xfer_to, VarList, VarList, Symmetry);
+
+  // release everything that was built above
+  if (target)
+    target->destroyList();
+  for (int rank = 0; rank < nproc; ++rank)
+  {
+    if (owned[rank])
+      owned[rank]->destroyList();
+    if (xfer_from[rank])
+      xfer_from[rank]->destroyList();
+    if (xfer_to[rank])
+      xfer_to[rank]->destroyList();
+  }
+
+  delete[] owned;
+  delete[] xfer_from;
+  delete[] xfer_to;
+}
+// L2Norm: norm of variable vf over patch Pat, reduced over MPI_COMM_WORLD.
+// Each rank calls f_l2normhelper for the blocks it owns, the per-block
+// results are added up, summed across all ranks, and the square root of
+// that total is returned (identical on every rank).
+//   Pat - patch whose blocks (Pat->blb .. Pat->ble) are visited
+//   vf  - variable; its sgfn selects the grid function of each block
+double Parallel::L2Norm(Patch *Pat, var *vf)
+{
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+  int bdw = ghost_width;  // width argument handed to the helper
+  double local_total = 0; // sum over the blocks owned by this rank
+
+  for (MyList<Block> *node = Pat->blb; node; node = node->next)
+  {
+    Block *blk = node->data;
+    if (blk->rank == myrank)
+    {
+      double block_part; // written by f_l2normhelper
+      f_l2normhelper(blk->shape, blk->X[0], blk->X[1], blk->X[2],
+                     Pat->bbox[0], Pat->bbox[1], Pat->bbox[2],
+                     Pat->bbox[3], Pat->bbox[4], Pat->bbox[5],
+                     blk->fgfs[vf->sgfn], block_part, bdw);
+      local_total += block_part;
+    }
+    // Pat->ble marks the last block that belongs to this patch
+    if (node == Pat->ble)
+      break;
+  }
+
+  double global_total;
+  MPI_Allreduce(&local_total, &global_total, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+
+  return sqrt(global_total);
+}
+// L2Norm: norm of variable vf over patch Pat, reduced over Comm_here.
+// Same computation as the two-argument overload, except that the sum of
+// the per-rank contributions is taken over the communicator Comm_here.
+// Block ownership is still decided with the rank in MPI_COMM_WORLD.
+//   Pat       - patch whose blocks (Pat->blb .. Pat->ble) are visited
+//   vf        - variable; its sgfn selects the grid function of each block
+//   Comm_here - communicator used for the MPI_Allreduce
+double Parallel::L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here)
+{
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+  int bdw = ghost_width;  // width argument handed to the helper
+  double local_total = 0; // sum over the blocks owned by this rank
+
+  for (MyList<Block> *node = Pat->blb; node; node = node->next)
+  {
+    Block *blk = node->data;
+    if (blk->rank == myrank)
+    {
+      double block_part; // written by f_l2normhelper
+      f_l2normhelper(blk->shape, blk->X[0], blk->X[1], blk->X[2],
+                     Pat->bbox[0], Pat->bbox[1], Pat->bbox[2],
+                     Pat->bbox[3], Pat->bbox[4], Pat->bbox[5],
+                     blk->fgfs[vf->sgfn], block_part, bdw);
+      local_total += block_part;
+    }
+    // Pat->ble marks the last block that belongs to this patch
+    if (node == Pat->ble)
+      break;
+  }
+
+  double global_total;
+  MPI_Allreduce(&local_total, &global_total, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
+
+  return sqrt(global_total);
+}
+// checkgsl: debugging aid that prints a grid segment list on rank 0.
+// For each node it reports the owning rank (or that the segment is
+// virtual, i.e. has no block), the shape and the coordinate range.
+//   pp         - list to print; an empty list produces a notice
+//   first_only - when true, stop after the first node
+void Parallel::checkgsl(MyList<Parallel::gridseg> *pp, bool first_only)
+{
+  int myrank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  if (myrank != 0)
+    return; // only rank 0 prints
+
+  if (!pp)
+    cout << " Parallel::checkgsl meets empty gsl" << endl;
+
+  for (; pp; pp = pp->next)
+  {
+    Parallel::gridseg *seg = pp->data;
+
+    if (seg->Bg)
+      cout << " on node#" << seg->Bg->rank << endl;
+    else
+      cout << " virtual grid segment" << endl;
+
+    cout << " shape: (";
+    for (int i = 0; i < dim; i++)
+    {
+      cout << seg->shape[i];
+      if (i < dim - 1)
+        cout << ",";
+      else
+        cout << ")" << endl;
+    }
+
+    cout << " range: (";
+    for (int i = 0; i < dim; i++)
+    {
+      cout << seg->llb[i] << ":" << seg->uub[i];
+      if (i < dim - 1)
+        cout << ",";
+      else
+        cout << ")" << endl;
+    }
+
+    if (first_only)
+      break;
+  }
+}
+// checkvarl: debugging aid that prints a variable list on rank 0.
+// For each variable its name, the three SoA entries and sgfn are written.
+//   pp         - list to print
+//   first_only - when true, stop after the first variable
+void Parallel::checkvarl(MyList<var> *pp, bool first_only)
+{
+  int myrank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  if (myrank != 0)
+    return; // only rank 0 prints
+
+  for (; pp; pp = pp->next)
+  {
+    var *v = pp->data;
+    cout << "name: " << v->name << endl;
+    cout << "SoA = (" << v->SoA[0] << "," << v->SoA[1] << "," << v->SoA[2] << ")" << endl;
+    cout << "sgfn = " << v->sgfn << endl;
+    if (first_only)
+      break;
+  }
+}
+// prepare_inter_time_level (patch list, two source levels):
+// run the single-patch version on every patch of PatL.
+void Parallel::prepare_inter_time_level(MyList<Patch> *PatL,
+                                        MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
+                                        MyList<var> *VarList3 /* target (t+a*dt) */, int tindex)
+{
+  for (MyList<Patch> *cur = PatL; cur; cur = cur->next)
+    prepare_inter_time_level(cur->data, VarList1, VarList2, VarList3, tindex);
+}
+// prepare_inter_time_level (single patch, two source levels):
+// for every block of Pat owned by this rank, combine the grid functions
+// of VarList1 and VarList2 into those of VarList3. The three lists are
+// walked in lock-step, so they must have matching lengths.
+//   tindex ==  0 : f_average (src1, src2 -> target)
+//   tindex ==  1 : f_average3(src1, src2 -> target)
+//   tindex == -1 : f_average3(src2, src1 -> target), i.e. sources swapped
+//   otherwise    : print an error and call MPI_Abort
+void Parallel::prepare_inter_time_level(Patch *Pat,
+                                        MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
+                                        MyList<var> *VarList3 /* target (t+a*dt) */, int tindex)
+{
+  int myrank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+  for (MyList<Block> *node = Pat->blb; node; node = node->next)
+  {
+    Block *blk = node->data;
+    if (blk->rank == myrank)
+    {
+      MyList<var> *v1 = VarList1;
+      MyList<var> *v2 = VarList2;
+      MyList<var> *v3 = VarList3;
+      for (; v1; v1 = v1->next, v2 = v2->next, v3 = v3->next)
+      {
+        switch (tindex)
+        {
+        case 0:
+          f_average(blk->shape, blk->fgfs[v1->data->sgfn], blk->fgfs[v2->data->sgfn], blk->fgfs[v3->data->sgfn]);
+          break;
+        case 1:
+          f_average3(blk->shape, blk->fgfs[v1->data->sgfn], blk->fgfs[v2->data->sgfn], blk->fgfs[v3->data->sgfn]);
+          break;
+        case -1:
+          // same routine as for +1, with the two sources exchanged
+          f_average3(blk->shape, blk->fgfs[v2->data->sgfn], blk->fgfs[v1->data->sgfn], blk->fgfs[v3->data->sgfn]);
+          break;
+        default:
+          cout << "error tindex in Parallel::prepare_inter_time_level" << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+          break;
+        }
+      }
+    }
+    // Pat->ble marks the last block that belongs to this patch
+    if (node == Pat->ble)
+      break;
+  }
+}
+// prepare_inter_time_level (patch list, three source levels):
+// run the single-patch version on every patch of PatL.
+void Parallel::prepare_inter_time_level(MyList<Patch> *PatL,
+                                        MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
+                                        MyList<var> *VarList3 /* source (t-dt) */, MyList<var> *VarList4 /* target (t+a*dt) */, int tindex)
+{
+  for (MyList<Patch> *cur = PatL; cur; cur = cur->next)
+    prepare_inter_time_level(cur->data, VarList1, VarList2, VarList3, VarList4, tindex);
+}
+// prepare_inter_time_level (single patch, three source levels):
+// for every block of Pat owned by this rank, combine the grid functions
+// of VarList1, VarList2 and VarList3 into those of VarList4. The four
+// lists are walked in lock-step, so they must have matching lengths.
+//   tindex ==  0 : f_average2
+//   tindex ==  1 : f_average2p
+//   tindex == -1 : f_average2m
+//   otherwise    : print an error and call MPI_Abort
+void Parallel::prepare_inter_time_level(Patch *Pat,
+                                        MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
+                                        MyList<var> *VarList3 /* source (t-dt) */, MyList<var> *VarList4 /* target (t+a*dt) */, int tindex)
+{
+  int myrank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+  for (MyList<Block> *node = Pat->blb; node; node = node->next)
+  {
+    Block *blk = node->data;
+    if (blk->rank == myrank)
+    {
+      MyList<var> *v1 = VarList1;
+      MyList<var> *v2 = VarList2;
+      MyList<var> *v3 = VarList3;
+      MyList<var> *v4 = VarList4;
+      for (; v1; v1 = v1->next, v2 = v2->next, v3 = v3->next, v4 = v4->next)
+      {
+        switch (tindex)
+        {
+        case 0:
+          f_average2(blk->shape, blk->fgfs[v1->data->sgfn], blk->fgfs[v2->data->sgfn],
+                     blk->fgfs[v3->data->sgfn], blk->fgfs[v4->data->sgfn]);
+          break;
+        case 1:
+          f_average2p(blk->shape, blk->fgfs[v1->data->sgfn], blk->fgfs[v2->data->sgfn],
+                      blk->fgfs[v3->data->sgfn], blk->fgfs[v4->data->sgfn]);
+          break;
+        case -1:
+          f_average2m(blk->shape, blk->fgfs[v1->data->sgfn], blk->fgfs[v2->data->sgfn],
+                      blk->fgfs[v3->data->sgfn], blk->fgfs[v4->data->sgfn]);
+          break;
+        default:
+          // name this routine in the diagnostic, as the three-list overload does
+          cout << "error tindex in Parallel::prepare_inter_time_level" << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+          break;
+        }
+      }
+    }
+    // Pat->ble marks the last block that belongs to this patch
+    if (node == Pat->ble)
+      break;
+  }
+}
+// Prolong: move data from coarse patch Patc to fine patch Patf.
+// Targets are all segments of Patf (build_complete_gsl, ghost included);
+// sources are the segments of Patc owned by each rank (build_owned_gsl4).
+// The matched pairs from build_gstl are passed to transfer().
+// Aborts when Patc is not on a lower level than Patf.
+//   VarList1 - source variables, VarList2 - target variables
+//   Symmetry - forwarded unchanged to the helpers
+void Parallel::Prolong(Patch *Patc, Patch *Patf,
+                       MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
+                       int Symmetry)
+{
+  typedef MyList<Parallel::gridseg> SegList;
+
+  if (Patf->lev <= Patc->lev)
+  {
+    cout << "Parallel::Prolong: meet requst of Prolong from lev#" << Patc->lev << " to lev#" << Patf->lev << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+
+  int nproc;
+  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+  // one list head per rank
+  SegList **owned = new SegList *[nproc];
+  SegList **xfer_from = new SegList *[nproc];
+  SegList **xfer_to = new SegList *[nproc];
+
+  SegList *target = build_complete_gsl(Patf); // including ghost
+  for (int rank = 0; rank < nproc; ++rank)
+  {
+    // - buffer - ghost - BD ghost
+    owned[rank] = build_owned_gsl4(Patc, rank, Symmetry);
+    // entry [rank] describes data located on cpu#rank
+    build_gstl(owned[rank], target, &xfer_from[rank], &xfer_to[rank]);
+  }
+
+  transfer(xfer_from, xfer_to, VarList1, VarList2, Symmetry);
+
+  // release everything that was built above
+  if (target)
+    target->destroyList();
+  for (int rank = 0; rank < nproc; ++rank)
+  {
+    if (owned[rank])
+      owned[rank]->destroyList();
+    if (xfer_from[rank])
+      xfer_from[rank]->destroyList();
+    if (xfer_to[rank])
+      xfer_to[rank]->destroyList();
+  }
+
+  delete[] owned;
+  delete[] xfer_from;
+  delete[] xfer_to;
+}
+// Restrict: move data from the fine patch list PatfL to the coarse patch
+// list PatcL. Targets are all segments of PatcL (build_complete_gsl, ghost
+// included); sources are the segments of PatfL owned by each rank, built
+// with build_owned_gsl type 2 (- buffer - ghost). The matched pairs from
+// build_gstl are passed to transfer().
+// Aborts when the first coarse patch is not on a lower level than the first
+// fine patch. Both lists must be non-empty (their heads are dereferenced).
+//   VarList1 - source variables, VarList2 - target variables
+//   Symmetry - forwarded unchanged to the helpers
+void Parallel::Restrict(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
+                        MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
+                        int Symmetry)
+{
+  typedef MyList<Parallel::gridseg> SegList;
+
+  if (PatfL->data->lev <= PatcL->data->lev)
+  {
+    cout << "Parallel::Restrict: meet requst of Restrict from lev#" << PatfL->data->lev << " to lev#" << PatcL->data->lev << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+
+  int nproc;
+  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+  // one list head per rank
+  SegList **owned = new SegList *[nproc];
+  SegList **xfer_from = new SegList *[nproc];
+  SegList **xfer_to = new SegList *[nproc];
+
+  SegList *target = build_complete_gsl(PatcL); // including ghost
+  for (int rank = 0; rank < nproc; ++rank)
+  {
+    // Type 2 (- buffer - ghost) is used for both Vertex and Cell builds;
+    // the original note says "it seems bam always use this".
+    // A formerly disabled (#if 0) alternative picked type 2 for Vertex and
+    // type 4 (- buffer - ghost - BD ghost) for Cell.
+    owned[rank] = build_owned_gsl(PatfL, rank, 2, Symmetry);
+    // entry [rank] describes data located on cpu#rank
+    build_gstl(owned[rank], target, &xfer_from[rank], &xfer_to[rank]);
+  }
+
+  transfer(xfer_from, xfer_to, VarList1, VarList2, Symmetry);
+
+  // release everything that was built above
+  if (target)
+    target->destroyList();
+  for (int rank = 0; rank < nproc; ++rank)
+  {
+    if (owned[rank])
+      owned[rank]->destroyList();
+    if (xfer_from[rank])
+      xfer_from[rank]->destroyList();
+    if (xfer_to[rank])
+      xfer_to[rank]->destroyList();
+  }
+
+  delete[] owned;
+  delete[] xfer_from;
+  delete[] xfer_to;
+}
+// Restrict_after: same data movement as Restrict (fine patch list PatfL to
+// coarse patch list PatcL), but the per-rank source segments are built with
+// build_owned_gsl type 3 (- ghost - BD ghost).
+// Aborts when the first coarse patch is not on a lower level than the first
+// fine patch. Both lists must be non-empty (their heads are dereferenced).
+//   VarList1 - source variables, VarList2 - target variables
+//   Symmetry - forwarded unchanged to the helpers
+void Parallel::Restrict_after(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
+                              MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
+                              int Symmetry)
+{
+  if (PatcL->data->lev >= PatfL->data->lev)
+  {
+    // report this routine's own name so the failure is not attributed to Restrict
+    cout << "Parallel::Restrict_after: meet requst of Restrict from lev#" << PatfL->data->lev << " to lev#" << PatcL->data->lev << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+  int cpusize;
+  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
+
+  // one list head per rank
+  MyList<Parallel::gridseg> **src = new MyList<Parallel::gridseg> *[cpusize];
+  MyList<Parallel::gridseg> **transfer_src = new MyList<Parallel::gridseg> *[cpusize];
+  MyList<Parallel::gridseg> **transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
+
+  MyList<Parallel::gridseg> *dst = build_complete_gsl(PatcL); // including ghost
+  for (int node = 0; node < cpusize; node++)
+  {
+    src[node] = build_owned_gsl(PatfL, node, 3, Symmetry); // - ghost - BD ghost
+
+    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
+  }
+
+  transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
+
+  // release everything that was built above
+  if (dst)
+    dst->destroyList();
+  for (int node = 0; node < cpusize; node++)
+  {
+    if (src[node])
+      src[node]->destroyList();
+    if (transfer_src[node])
+      transfer_src[node]->destroyList();
+    if (transfer_dst[node])
+      transfer_dst[node]->destroyList();
+  }
+
+  delete[] src;
+  delete[] transfer_src;
+  delete[] transfer_dst;
+}
+// OutBdLow2Hi (single patches, same time level):
+// fill the buffer region of fine patch Patf (build_buffer_gsl) from the
+// segments of coarse patch Patc owned by each rank (build_owned_gsl4).
+// The matched pairs from build_gstl are passed to transfer().
+// Aborts when Patc is not on a lower level than Patf.
+//   VarList1 - source variables, VarList2 - target variables
+//   Symmetry - forwarded unchanged to the helpers
+void Parallel::OutBdLow2Hi(Patch *Patc, Patch *Patf,
+                           MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
+                           int Symmetry)
+{
+  typedef MyList<Parallel::gridseg> SegList;
+
+  if (Patf->lev <= Patc->lev)
+  {
+    cout << "Parallel::OutBdLow2Hi: meet requst of Prolong from lev#" << Patc->lev << " to lev#" << Patf->lev << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+
+  int nproc;
+  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+  // one list head per rank
+  SegList **owned = new SegList *[nproc];
+  SegList **xfer_from = new SegList *[nproc];
+  SegList **xfer_to = new SegList *[nproc];
+
+  SegList *target = build_buffer_gsl(Patf); // buffer region only
+
+  for (int rank = 0; rank < nproc; ++rank)
+  {
+    // - buffer - ghost - BD ghost
+    owned[rank] = build_owned_gsl4(Patc, rank, Symmetry);
+    // entry [rank] describes data located on cpu#rank
+    build_gstl(owned[rank], target, &xfer_from[rank], &xfer_to[rank]);
+  }
+
+  transfer(xfer_from, xfer_to, VarList1, VarList2, Symmetry);
+
+  // release everything that was built above
+  if (target)
+    target->destroyList();
+  for (int rank = 0; rank < nproc; ++rank)
+  {
+    if (owned[rank])
+      owned[rank]->destroyList();
+    if (xfer_from[rank])
+      xfer_from[rank]->destroyList();
+    if (xfer_to[rank])
+      xfer_to[rank]->destroyList();
+  }
+
+  delete[] owned;
+  delete[] xfer_from;
+  delete[] xfer_to;
+}
+// OutBdLow2Hi (patch lists):
+// fill the buffer region of the fine patches PatfL (build_buffer_gsl) from
+// the segments of the coarse patches PatcL owned by each rank
+// (build_owned_gsl type 4). The matched pairs from build_gstl are passed
+// to transfer().
+// Aborts when any coarse patch is not on a lower level than any fine patch.
+//   VarList1 - source variables, VarList2 - target variables
+//   Symmetry - forwarded unchanged to the helpers
+void Parallel::OutBdLow2Hi(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
+                           MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
+                           int Symmetry)
+{
+  typedef MyList<Parallel::gridseg> SegList;
+
+  // check every coarse/fine combination
+  for (MyList<Patch> *coarse = PatcL; coarse; coarse = coarse->next)
+  {
+    for (MyList<Patch> *fine = PatfL; fine; fine = fine->next)
+    {
+      if (fine->data->lev <= coarse->data->lev)
+      {
+        cout << "Parallel::OutBdLow2Hi(list): meet requst of Prolong from lev#" << coarse->data->lev << " to lev#" << fine->data->lev << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+    }
+  }
+
+  int nproc;
+  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+  // one list head per rank
+  SegList **owned = new SegList *[nproc];
+  SegList **xfer_from = new SegList *[nproc];
+  SegList **xfer_to = new SegList *[nproc];
+
+  SegList *target = build_buffer_gsl(PatfL); // buffer region only
+
+  for (int rank = 0; rank < nproc; ++rank)
+  {
+    // - buffer - ghost - BD ghost
+    owned[rank] = build_owned_gsl(PatcL, rank, 4, Symmetry);
+    // entry [rank] describes data located on cpu#rank
+    build_gstl(owned[rank], target, &xfer_from[rank], &xfer_to[rank]);
+  }
+
+  transfer(xfer_from, xfer_to, VarList1, VarList2, Symmetry);
+
+  // release everything that was built above
+  if (target)
+    target->destroyList();
+  for (int rank = 0; rank < nproc; ++rank)
+  {
+    if (owned[rank])
+      owned[rank]->destroyList();
+    if (xfer_from[rank])
+      xfer_from[rank]->destroyList();
+    if (xfer_to[rank])
+      xfer_to[rank]->destroyList();
+  }
+
+  delete[] owned;
+  delete[] xfer_from;
+  delete[] xfer_to;
+}
+// OutBdLow2Himix (single patches, same time level):
+// identical set-up to OutBdLow2Hi - buffer region of Patf as target,
+// owned segments of Patc (build_owned_gsl4) as source - but the pairs are
+// handed to transfermix() instead of transfer().
+// Aborts when Patc is not on a lower level than Patf.
+//   VarList1 - source variables, VarList2 - target variables
+//   Symmetry - forwarded unchanged to the helpers
+void Parallel::OutBdLow2Himix(Patch *Patc, Patch *Patf,
+                              MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
+                              int Symmetry)
+{
+  typedef MyList<Parallel::gridseg> SegList;
+
+  if (Patf->lev <= Patc->lev)
+  {
+    cout << "Parallel::OutBdLow2Himix: meet requst of Prolong from lev#" << Patc->lev << " to lev#" << Patf->lev << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+
+  int nproc;
+  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+  // one list head per rank
+  SegList **owned = new SegList *[nproc];
+  SegList **xfer_from = new SegList *[nproc];
+  SegList **xfer_to = new SegList *[nproc];
+
+  SegList *target = build_buffer_gsl(Patf); // buffer region only
+
+  for (int rank = 0; rank < nproc; ++rank)
+  {
+    // - buffer - ghost - BD ghost
+    owned[rank] = build_owned_gsl4(Patc, rank, Symmetry);
+    // entry [rank] describes data located on cpu#rank
+    build_gstl(owned[rank], target, &xfer_from[rank], &xfer_to[rank]);
+  }
+
+  transfermix(xfer_from, xfer_to, VarList1, VarList2, Symmetry);
+
+  // release everything that was built above
+  if (target)
+    target->destroyList();
+  for (int rank = 0; rank < nproc; ++rank)
+  {
+    if (owned[rank])
+      owned[rank]->destroyList();
+    if (xfer_from[rank])
+      xfer_from[rank]->destroyList();
+    if (xfer_to[rank])
+      xfer_to[rank]->destroyList();
+  }
+
+  delete[] owned;
+  delete[] xfer_from;
+  delete[] xfer_to;
+
+  // No Sync(Patf,VarList2,Symmetry) here: per the original note it is done by the
+  // caller (ProlongRestrict or RestrictProlong) after this routine returns, because
+  // fine level points may be not enough for interpolation.
+}
+// OutBdLow2Himix (patch lists):
+// identical set-up to the list version of OutBdLow2Hi - buffer region of
+// PatfL as target, owned segments of PatcL (build_owned_gsl type 4) as
+// source - but the pairs are handed to transfermix() instead of transfer().
+// Aborts when any coarse patch is not on a lower level than any fine patch.
+//   VarList1 - source variables, VarList2 - target variables
+//   Symmetry - forwarded unchanged to the helpers
+void Parallel::OutBdLow2Himix(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
+                              MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
+                              int Symmetry)
+{
+  typedef MyList<Parallel::gridseg> SegList;
+
+  // check every coarse/fine combination
+  for (MyList<Patch> *coarse = PatcL; coarse; coarse = coarse->next)
+  {
+    for (MyList<Patch> *fine = PatfL; fine; fine = fine->next)
+    {
+      if (fine->data->lev <= coarse->data->lev)
+      {
+        cout << "Parallel::OutBdLow2Himix(list): meet requst of Prolong from lev#" << coarse->data->lev << " to lev#" << fine->data->lev << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+    }
+  }
+
+  int nproc;
+  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+  // one list head per rank
+  SegList **owned = new SegList *[nproc];
+  SegList **xfer_from = new SegList *[nproc];
+  SegList **xfer_to = new SegList *[nproc];
+
+  SegList *target = build_buffer_gsl(PatfL); // buffer region only
+
+  for (int rank = 0; rank < nproc; ++rank)
+  {
+    // - buffer - ghost - BD ghost
+    owned[rank] = build_owned_gsl(PatcL, rank, 4, Symmetry);
+    // entry [rank] describes data located on cpu#rank
+    build_gstl(owned[rank], target, &xfer_from[rank], &xfer_to[rank]);
+  }
+
+  transfermix(xfer_from, xfer_to, VarList1, VarList2, Symmetry);
+
+  // release everything that was built above
+  if (target)
+    target->destroyList();
+  for (int rank = 0; rank < nproc; ++rank)
+  {
+    if (owned[rank])
+      owned[rank]->destroyList();
+    if (xfer_from[rank])
+      xfer_from[rank]->destroyList();
+    if (xfer_to[rank])
+      xfer_to[rank]->destroyList();
+  }
+
+  delete[] owned;
+  delete[] xfer_from;
+  delete[] xfer_to;
+}
+
+// Restrict_cached: Restrict variant that keeps its grid segment lists in `cache` and exchanges via transfer_cached
+void Parallel::Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
+                               MyList<var> *VarList1, MyList<var> *VarList2,
+                               int Symmetry, SyncCache &cache)
+{
+  if (!cache.valid)
+  {
+    MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize);
+    const int nprocs = cache.cpusize;
+    if (!cache.combined_src)
+    {
+      cache.combined_src = new MyList<Parallel::gridseg> *[nprocs];
+      cache.combined_dst = new MyList<Parallel::gridseg> *[nprocs];
+      cache.send_lengths = new int[nprocs];
+      cache.recv_lengths = new int[nprocs];
+      cache.send_bufs = new double *[nprocs];
+      cache.recv_bufs = new double *[nprocs];
+      cache.send_buf_caps = new int[nprocs];
+      cache.recv_buf_caps = new int[nprocs];
+      for (int rank = 0; rank < nprocs; rank++)
+      {
+        cache.recv_bufs[rank] = 0;
+        cache.send_bufs[rank] = 0;
+        cache.recv_buf_caps[rank] = 0;
+        cache.send_buf_caps[rank] = 0;
+      }
+      cache.max_reqs = 2 * nprocs;
+      cache.reqs = new MPI_Request[cache.max_reqs];
+      cache.stats = new MPI_Status[cache.max_reqs];
+    }
+    // target: complete segment list of PatcL; source: per-rank owned segments of PatfL (type 2)
+    MyList<Parallel::gridseg> *target = build_complete_gsl(PatcL);
+    for (int rank = 0; rank < nprocs; rank++)
+    {
+      MyList<Parallel::gridseg> *owned = build_owned_gsl(PatfL, rank, 2, Symmetry);
+      build_gstl(owned, target, &cache.combined_src[rank], &cache.combined_dst[rank]);
+      if (owned)
+        owned->destroyList();
+    }
+    if (target)
+      target->destroyList();
+    cache.valid = true;
+  }
+  transfer_cached(cache.combined_src, cache.combined_dst, VarList1, VarList2, Symmetry, cache);
+}
+
+// OutBdLow2Hi_cached: OutBdLow2Hi variant that keeps its grid segment lists in `cache` and exchanges via transfer_cached
+void Parallel::OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
+                                  MyList<var> *VarList1, MyList<var> *VarList2,
+                                  int Symmetry, SyncCache &cache)
+{
+  if (!cache.valid)
+  {
+    MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize);
+    const int nprocs = cache.cpusize;
+    if (!cache.combined_src)
+    {
+      cache.combined_src = new MyList<Parallel::gridseg> *[nprocs];
+      cache.combined_dst = new MyList<Parallel::gridseg> *[nprocs];
+      cache.send_lengths = new int[nprocs];
+      cache.recv_lengths = new int[nprocs];
+      cache.send_bufs = new double *[nprocs];
+      cache.recv_bufs = new double *[nprocs];
+      cache.send_buf_caps = new int[nprocs];
+      cache.recv_buf_caps = new int[nprocs];
+      for (int rank = 0; rank < nprocs; rank++)
+      {
+        cache.recv_bufs[rank] = 0;
+        cache.send_bufs[rank] = 0;
+        cache.recv_buf_caps[rank] = 0;
+        cache.send_buf_caps[rank] = 0;
+      }
+      cache.max_reqs = 2 * nprocs;
+      cache.reqs = new MPI_Request[cache.max_reqs];
+      cache.stats = new MPI_Status[cache.max_reqs];
+    }
+    // target: buffer segment list of PatfL; source: per-rank owned segments of PatcL (type 4)
+    MyList<Parallel::gridseg> *target = build_buffer_gsl(PatfL);
+    for (int rank = 0; rank < nprocs; rank++)
+    {
+      MyList<Parallel::gridseg> *owned = build_owned_gsl(PatcL, rank, 4, Symmetry);
+      build_gstl(owned, target, &cache.combined_src[rank], &cache.combined_dst[rank]);
+      if (owned)
+        owned->destroyList();
+    }
+    if (target)
+      target->destroyList();
+    cache.valid = true;
+  }
+  transfer_cached(cache.combined_src, cache.combined_dst, VarList1, VarList2, Symmetry, cache);
+}
+
+// OutBdLow2Himix_cached: caches the same segment lists as OutBdLow2Hi_cached, but does the exchange inline with data_packermix (mix mode) instead of calling transfer_cached
+void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
+                                     MyList<var> *VarList1, MyList<var> *VarList2,
+                                     int Symmetry, SyncCache &cache)
+{
+  if (!cache.valid) // (re)build the cached segment lists only when the cache is not flagged valid
+  {
+    int cpusize;
+    MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
+    cache.cpusize = cpusize;
+
+    if (!cache.combined_src) // first use of this cache: allocate the per-rank bookkeeping arrays
+    {
+      cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
+      cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
+      cache.send_lengths = new int[cpusize];
+      cache.recv_lengths = new int[cpusize];
+      cache.send_bufs = new double *[cpusize];
+      cache.recv_bufs = new double *[cpusize];
+      cache.send_buf_caps = new int[cpusize];
+      cache.recv_buf_caps = new int[cpusize];
+      for (int i = 0; i < cpusize; i++)
+      {
+        cache.send_bufs[i] = cache.recv_bufs[i] = 0;         // no buffer yet; allocated on demand below
+        cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0; // capacity (in doubles) of each buffer
+      }
+      cache.max_reqs = 2 * cpusize; // the loop below posts at most one send and one receive per rank
+      cache.reqs = new MPI_Request[cache.max_reqs];
+      cache.stats = new MPI_Status[cache.max_reqs];
+    }
+
+    MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL); // target: buffer segments of the fine patches
+    for (int node = 0; node < cpusize; node++)
+    {
+      MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatcL, node, 4, Symmetry); // coarse segments owned by rank `node`
+      build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]); // NOTE(review): previous list heads are overwritten here; presumably whoever clears cache.valid destroys them - confirm
+      if (src_owned) src_owned->destroyList();
+    }
+    if (dst) dst->destroyList();
+
+    cache.valid = true;
+  }
+
+  // Inline mix-mode exchange built on data_packermix, reusing the buffers and request arrays stored in cache
+  int myrank;
+  MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize); // queried again on every call, also when the cached lists are reused
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  int cpusize = cache.cpusize;
+
+  int req_no = 0; // number of non-blocking requests posted so far
+  for (int node = 0; node < cpusize; node++)
+  {
+    if (node == myrank)
+    {
+      // local part: pack straight into the receive buffer, no MPI call involved
+      int length = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry); // null buffer: call is used only for its returned length
+      cache.recv_lengths[node] = length;
+      if (length > 0)
+      {
+        if (length > cache.recv_buf_caps[node]) // grow the buffer only when the cached one is too small
+        {
+          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
+          cache.recv_bufs[node] = new double[length];
+          cache.recv_buf_caps[node] = length;
+        }
+        data_packermix(cache.recv_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
+      }
+    }
+    else
+    {
+      // outgoing part: data held by this rank that is destined for `node`
+      int slength = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
+      cache.send_lengths[node] = slength;
+      if (slength > 0)
+      {
+        if (slength > cache.send_buf_caps[node])
+        {
+          if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
+          cache.send_bufs[node] = new double[slength];
+          cache.send_buf_caps[node] = slength;
+        }
+        data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
+        MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
+      }
+      // incoming part: length of what `node` will send, computed from the lists built for `node`
+      int rlength = data_packermix(0, cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
+      cache.recv_lengths[node] = rlength;
+      if (rlength > 0)
+      {
+        if (rlength > cache.recv_buf_caps[node])
+        {
+          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
+          cache.recv_bufs[node] = new double[rlength];
+          cache.recv_buf_caps[node] = rlength;
+        }
+        MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
+      }
+    }
+  }
+
+  MPI_Waitall(req_no, cache.reqs, cache.stats); // all sends and receives must be complete before unpacking
+
+  for (int node = 0; node < cpusize; node++)
+    if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0) // recv_lengths is refreshed every call, so a stale buffer from an earlier call is skipped
+      data_packermix(cache.recv_bufs[node], cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
+}
+
+// Collect all buffer grid segments of the given patch: its complete segment list minus the inner box that excludes lli/uui grid spacings on each side
+MyList<Parallel::gridseg> *Parallel::build_buffer_gsl(Patch *Pat)
+{
+  MyList<Parallel::gridseg> *cgsl, *gsc, *gsb;
+
+  gsc = build_complete_gsl(Pat); // including ghost
+
+  gsb = new MyList<Parallel::gridseg>; // single-node list describing the inner box to subtract
+  gsb->data = new Parallel::gridseg;
+
+  for (int i = 0; i < dim; i++)
+  {
+    double DH = Pat->blb->data->getdX(i); // grid spacing in direction i, read from the patch's first block
+    gsb->data->uub[i] = Pat->bbox[dim + i] - Pat->uui[i] * DH; // upper bound pulled in by uui spacings
+    gsb->data->llb[i] = Pat->bbox[i] + Pat->lli[i] * DH;       // lower bound pushed in by lli spacings
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+    gsb->data->shape[i] = int((gsb->data->uub[i] - gsb->data->llb[i]) / DH + 0.4) + 1; // vertex centered: one more point than intervals
+#else
+#ifdef Cell
+    gsb->data->shape[i] = int((gsb->data->uub[i] - gsb->data->llb[i]) / DH + 0.4); // cell centered: one point per interval
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+  }
+  gsb->data->Bg = 0; // the inner box is not tied to any block
+  gsb->next = 0;
+
+  cgsl = gsl_subtract(gsc, gsb); // what is left of the complete list after removing the inner box
+
+  gsc->destroyList(); // NOTE(review): gsc is used without a null check; assumes build_complete_gsl never returns 0 here
+  gsb->destroyList();
+
+  //  set illb and iuub of every resulting segment to the bounds of the inner box
+  gsb = cgsl; // gsb is reused as the iteration cursor
+  while (gsb)
+  {
+    for (int i = 0; i < dim; i++)
+    {
+      double DH = Pat->blb->data->getdX(i);
+      gsb->data->iuub[i] = Pat->bbox[dim + i] - Pat->uui[i] * DH;
+      gsb->data->illb[i] = Pat->bbox[i] + Pat->lli[i] * DH;
+    }
+    gsb = gsb->next;
+  }
+
+  return cgsl; // caller owns the returned list (callers in this file release it with destroyList)
+}
+// Concatenate the buffer grid segments of every patch in PatL into one list (0 if there are none).
+// A patch whose buffer list is empty is skipped, so the running tail pointer is never null.
+MyList<Parallel::gridseg> *Parallel::build_buffer_gsl(MyList<Patch> *PatL)
+{
+  MyList<Parallel::gridseg> *cgsl = 0; // head of the combined list
+  MyList<Parallel::gridseg> *gs = 0;   // last node of the combined list (valid whenever cgsl != 0)
+  while (PatL)
+  {
+    MyList<Parallel::gridseg> *part = build_buffer_gsl(PatL->data);
+    if (part)
+    {
+      if (cgsl)
+        gs->next = part;
+      else
+        cgsl = part;
+      // move the tail to the end of the piece just appended
+      gs = part;
+      while (gs->next)
+        gs = gs->next;
+    }
+    // (an empty piece leaves head and tail untouched instead of nulling the tail)
+    PatL = PatL->next;
+  }
+
+  return cgsl;
+}
+void Parallel::Prolongint(Patch *Patc, Patch *Patf, // fill every grid point of every block of Patf by interpolating on Patc
+                          MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
+                          int Symmetry)
+{
+  if (Patc->lev >= Patf->lev) // the source patch must be on a strictly lower level than the target
+  {
+    cout << "Parallel::Prolong: meet requst of Prolong from lev#" << Patc->lev << " to lev#" << Patf->lev << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+
+  int myrank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+  int num_var = 0; // number of variables in the source list
+  MyList<var> *varl;
+  varl = VarList1;
+  while (varl)
+  {
+    num_var++;
+    varl = varl->next;
+  }
+
+  MyList<Block> *BP = Patf->blb; // NOTE(review): walks until next is 0, not until Patf->ble - confirm this is intended
+  while (BP)
+  {
+    int Npts; // only the owning rank sets it; every other rank receives it through the broadcast
+    if (myrank == BP->data->rank)
+      Npts = BP->data->shape[0] * BP->data->shape[1] * BP->data->shape[2];
+    MPI_Bcast(&Npts, 1, MPI_INT, BP->data->rank, MPI_COMM_WORLD);
+    double *pox[3]; // coordinates of all block points, one array per direction (3 is hard-coded here)
+    for (int i = 0; i < 3; i++)
+      pox[i] = new double[Npts];
+    if (myrank == BP->data->rank)
+    {
+      for (int i = 0; i < Npts; i++)
+      {
+        int ind[3];
+        Parallel::getarrayindex(3, BP->data->shape, ind, i); // flat index i -> per-direction indices
+        pox[0][i] = BP->data->X[0][ind[0]];
+        pox[1][i] = BP->data->X[1][ind[1]];
+        pox[2][i] = BP->data->X[2][ind[2]];
+      }
+    }
+    for (int i = 0; i < 3; i++)
+      MPI_Bcast(pox[i], Npts, MPI_DOUBLE, BP->data->rank, MPI_COMM_WORLD); // every rank needs the coordinates for the collective interpolation
+    double *res;
+    res = new double[num_var * Npts]; // layout: res[var + point * num_var]
+    Patc->Interp_Points(VarList1, Npts, pox, res, Symmetry); // because this operation is a global operation (for all processors)
+                                                             // we have to isolate it out of myrank==BP->data->rank
+    if (myrank == BP->data->rank)
+    {
+      for (int i = 0; i < Npts; i++)
+      {
+        varl = VarList2;
+        int j = 0;
+        while (varl) // NOTE(review): assumes VarList2 has no more entries than VarList1, otherwise res is read out of range
+        {
+          (BP->data->fgfs[varl->data->sgfn])[i] = res[j + i * num_var];
+          j++;
+          varl = varl->next;
+        }
+      }
+    }
+    delete[] pox[0];
+    delete[] pox[1];
+    delete[] pox[2];
+    delete[] res;
+    BP = BP->next;
+  }
+}
+// Repeatedly replace the first pair of segments that merge_gs accepts by their merged segment until no pair merges.
+void Parallel::merge_gsl(MyList<gridseg> *&A, const double ratio)
+{
+  if (!A)
+    return;
+
+  MyList<gridseg> *B = 0, *C = 0, *D = A; // D: first of the pair, B: second (always after D), C: merged result
+  bool flag = false;
+  while (D->next)
+  {
+    B = D->next;
+    while (B)
+    {
+      flag = merge_gs(D, B, C, ratio);
+      if (flag)
+        break;
+      B = B->next;
+    }
+    if (flag)
+      break;
+    D = D->next;
+  }
+
+  if (flag)
+  {
+    // delete D and B from A (every node except the head is handled here)
+    MyList<gridseg> *E = A;
+    while (E->next)
+    {
+      MyList<gridseg> *tp = E->next;
+      if (D == tp || B == tp)
+      {
+        E->next = tp->next;
+        delete tp->data;
+        delete tp;
+      }
+      else // advance only when nothing was removed, otherwise the node following a removed one is never examined
+        E = E->next;
+    }
+
+    if (D == A) // D was the head: drop it separately
+    {
+      MyList<gridseg> *tp = A;
+      A = (A->next) ? A->next : 0;
+      delete tp->data;
+      delete tp;
+    }
+    // cat C to A
+    if (A)
+      A->catList(C);
+    else
+      A = C;
+
+    merge_gsl(A, ratio); // start over on the shortened list
+  }
+}
+// Try to merge segments D and B; on success C receives a newly allocated bounding-box segment and true is returned, otherwise C = 0 and false
+bool Parallel::merge_gs(MyList<gridseg> *D, MyList<gridseg> *B, MyList<gridseg> *&C, const double ratio)
+{
+  if (!B || !D)
+    return false; // NOTE: C is left untouched on this early exit
+
+  C = 0;
+  double llb[dim], uub[dim], DH[dim]; // llb/uub: bounds of the overlap box; DH: grid spacing taken from D
+  for (int i = 0; i < dim; i++)
+  {
+    double tdh; // grid spacing of B in direction i
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+    DH[i] = (D->data->uub[i] - D->data->llb[i]) / (D->data->shape[i] - 1);
+    tdh = (B->data->uub[i] - B->data->llb[i]) / (B->data->shape[i] - 1);
+#else
+#ifdef Cell
+    DH[i] = (D->data->uub[i] - D->data->llb[i]) / D->data->shape[i];
+    tdh = (B->data->uub[i] - B->data->llb[i]) / B->data->shape[i];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+    if (!feq(DH[i], tdh, DH[i] / 2)) // both segments must share the same spacing (feq presumably compares within the given tolerance)
+    {
+      cout << "Parallel::merge_gs meets different grid segment " << DH[i] << " vs " << tdh << endl;
+      checkgsl(B, true);
+      checkgsl(D, true);
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+    llb[i] = Mymax(D->data->llb[i], B->data->llb[i]);
+    uub[i] = Mymin(D->data->uub[i], B->data->uub[i]);
+    //    if(uub[i]-llb[i] < DH[i]/2) return false;  //here this is valid for both vertex and cell
+
+    // use 0 instead of DH[i]/2, we consider contact case, 2012 Aug 8
+    if (uub[i] - llb[i] < 0)
+      return false; // separated in direction i; valid for both vertex and cell
+  }
+
+  // vb: volume of B
+  // vd: volume of D
+  // vo: volume of overlap
+  // vt: volume of smallest common box (virtual merged box)
+  double vd = 1, vb = 1, vt = 1, vo = 1;
+  for (int i = 0; i < dim; i++)
+  {
+    vt = vt * (Mymax(D->data->uub[i], B->data->uub[i]) - Mymin(D->data->llb[i], B->data->llb[i]));
+    vo = vo * (uub[i] - llb[i]);
+    vd = vd * (D->data->uub[i] - D->data->llb[i]);
+    vb = vb * (B->data->uub[i] - B->data->llb[i]);
+  }
+
+  // smaller ratio => more likely to merge: (vd + vb - vo) / vt is the fraction of the common box covered by D and B together
+  if ((vd + vb - vo) / vt > ratio)
+  {
+    C = new MyList<gridseg>;
+    C->data = new gridseg;
+    for (int i = 0; i < dim; i++)
+    {
+      C->data->uub[i] = Mymax(D->data->uub[i], B->data->uub[i]); // C is the smallest box containing both D and B
+      C->data->llb[i] = Mymin(D->data->llb[i], B->data->llb[i]);
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+      C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / DH[i] + 0.4) + 1;
+#else
+#ifdef Cell
+      C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / DH[i] + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+    }
+    if (D->data->Bg == B->data->Bg) // keep the block pointer only when both inputs share it
+      C->data->Bg = D->data->Bg;
+    else
+      C->data->Bg = 0;
+
+    C->next = 0;
+
+    return true;
+  }
+  else
+  {
+    return false;
+  }
+}
+// Add a ghost region of ghost_width points across the tangent plane of every pair of segments in A that touch each other
+// we assume the grids have the same resolution
+void Parallel::add_ghost_touch(MyList<gridseg> *&A)
+{
+  if (!A || !(A->next))
+    return; // fewer than two segments: nothing can touch
+
+  double DH[dim]; // HALF of the grid spacing (note the trailing / 2), taken from the first segment
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+  for (int i = 0; i < dim; i++)
+    DH[i] = (A->data->uub[i] - A->data->llb[i]) / (A->data->shape[i] - 1) / 2;
+#else
+#ifdef Cell
+  for (int i = 0; i < dim; i++)
+    DH[i] = (A->data->uub[i] - A->data->llb[i]) / A->data->shape[i] / 2;
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+
+  MyList<gridseg> *C1, *C2, *A1 = A, *A2, *dc; // C1/C2 walk an unmodified copy, A1/A2 walk A in lock step and are the ones modified
+  dc = C1 = clone_gsl(A, false); // NOTE(review): relies on the clone having as many nodes as A, i.e. the false flag must not truncate the copy - confirm
+  while (C1)
+  {
+    C2 = C1->next;
+    A2 = A1->next;
+    while (C2)
+    {
+      for (int i = 0; i < dim; i++)
+      {
+        if (feq(C1->data->llb[i], C2->data->uub[i], DH[i])) // lower face of C1 meets upper face of C2
+        {
+          // direction i touch, other directions overlap
+          bool flag = true;
+          for (int j = 0; j < i; j++)
+            if ((C1->data->llb[j] - C2->data->llb[j]) * (C1->data->uub[j] - C2->data->llb[j]) > 0 && // C2's lower bound lies outside C1's extent ...
+                (C2->data->llb[j] - C1->data->llb[j]) * (C2->data->uub[j] - C1->data->llb[j]) > 0)   // ... and C1's lower bound lies outside C2's extent
+              flag = false;
+          for (int j = i + 1; j < dim; j++)
+            if ((C1->data->llb[j] - C2->data->llb[j]) * (C1->data->uub[j] - C2->data->llb[j]) > 0 &&
+                (C2->data->llb[j] - C1->data->llb[j]) * (C2->data->uub[j] - C1->data->llb[j]) > 0)
+              flag = false;
+
+          if (flag)
+          {
+            // only add one ghost region: skip a face that no longer matches the unmodified copy
+            if (feq(A1->data->llb[i], C1->data->llb[i], DH[i]))
+            {
+              A1->data->llb[i] -= ghost_width * 2 * DH[i]; // 2 * DH is one full grid spacing
+              A1->data->shape[i] += ghost_width;
+            }
+            if (feq(A2->data->uub[i], C2->data->uub[i], DH[i]))
+            {
+              A2->data->uub[i] += ghost_width * 2 * DH[i];
+              A2->data->shape[i] += ghost_width;
+            }
+          }
+        }
+        if (feq(C1->data->uub[i], C2->data->llb[i], DH[i])) // upper face of C1 meets lower face of C2
+        {
+          // direction i touch, other directions overlap
+          bool flag = true;
+          for (int j = 0; j < i; j++)
+            if ((C1->data->llb[j] - C2->data->llb[j]) * (C1->data->uub[j] - C2->data->llb[j]) > 0 &&
+                (C2->data->llb[j] - C1->data->llb[j]) * (C2->data->uub[j] - C1->data->llb[j]) > 0)
+              flag = false;
+          for (int j = i + 1; j < dim; j++)
+            if ((C1->data->llb[j] - C2->data->llb[j]) * (C1->data->uub[j] - C2->data->llb[j]) > 0 &&
+                (C2->data->llb[j] - C1->data->llb[j]) * (C2->data->uub[j] - C1->data->llb[j]) > 0)
+              flag = false;
+
+          if (flag)
+          {
+            // only add one ghost region: skip a face that no longer matches the unmodified copy
+            if (feq(A1->data->uub[i], C1->data->uub[i], DH[i]))
+            {
+              A1->data->uub[i] += ghost_width * 2 * DH[i];
+              A1->data->shape[i] += ghost_width;
+            }
+            if (feq(A2->data->llb[i], C2->data->llb[i], DH[i]))
+            {
+              A2->data->llb[i] -= ghost_width * 2 * DH[i];
+              A2->data->shape[i] += ghost_width;
+            }
+          }
+        }
+      }
+      C2 = C2->next;
+      A2 = A2->next;
+    }
+    C1 = C1->next;
+    A1 = A1->next;
+  }
+
+  if (dc)
+    dc->destroyList(); // release the working copy
+}
+// Cut the segments of A into regular pieces wherever two of them overlap (repeated until cut_gs finds no more pairs).
+void Parallel::cut_gsl(MyList<gridseg> *&A)
+{
+  if (!A)
+    return;
+
+  MyList<gridseg> *B = 0, *C = 0, *D = A; // D: first of the pair, B: second (always after D), C: pieces replacing them
+  bool flag = false;
+  while (D->next)
+  {
+    B = D->next;
+    while (B)
+    {
+      flag = cut_gs(D, B, C);
+      if (flag)
+        break;
+      B = B->next;
+    }
+    if (flag)
+      break;
+    D = D->next;
+  }
+
+  if (flag)
+  {
+    // delete D and B from A (every node except the head is handled here)
+    MyList<gridseg> *E = A;
+    while (E->next)
+    {
+      MyList<gridseg> *tp = E->next;
+      if (D == tp || B == tp)
+      {
+        E->next = tp->next;
+        delete tp->data;
+        delete tp;
+      }
+      else // advance only when nothing was removed, otherwise the node following a removed one is never examined
+        E = E->next;
+    }
+
+    if (D == A) // D was the head: drop it separately
+    {
+      MyList<gridseg> *tp = A;
+      A = (A->next) ? A->next : 0;
+      delete tp->data;
+      delete tp;
+    }
+    // cat C to A
+    if (A)
+      A->catList(C);
+    else
+      A = C;
+
+    cut_gsl(A); // start over on the updated list
+  }
+}
+// when D and B have a sufficiently wide overlap, cut them into the list C (overlap box first, then the rest of D, then the rest of B) and return true
+// otherwise return false and C=0
+bool Parallel::cut_gs(MyList<gridseg> *D, MyList<gridseg> *B, MyList<gridseg> *&C)
+{
+  C = 0;
+  double llb[dim], uub[dim], DH[dim]; // llb/uub: bounds of the overlap box; DH: grid spacing taken from D
+  for (int i = 0; i < dim; i++)
+  {
+    double tdh; // grid spacing of B in direction i
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+    DH[i] = (D->data->uub[i] - D->data->llb[i]) / (D->data->shape[i] - 1);
+    tdh = (B->data->uub[i] - B->data->llb[i]) / (B->data->shape[i] - 1);
+#else
+#ifdef Cell
+    DH[i] = (D->data->uub[i] - D->data->llb[i]) / D->data->shape[i];
+    tdh = (B->data->uub[i] - B->data->llb[i]) / B->data->shape[i];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+    if (!feq(DH[i], tdh, DH[i] / 2)) // both segments must share the same spacing
+    {
+      cout << "Parallel::cut_gs meets different grid segment " << DH[i] << " vs " << tdh << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+    llb[i] = Mymax(D->data->llb[i], B->data->llb[i]);
+    uub[i] = Mymin(D->data->uub[i], B->data->uub[i]);
+    // for efficiency we ask the width of the patch at least 2(buffer+ghost+BD ghost)
+    if (uub[i] - llb[i] < DH[i] * 2 * (buffer_width + 2 * ghost_width))
+      return false; // overlap too thin (or absent) in direction i; valid for both vertex and cell
+  }
+
+  // this part code results in 5 patches generally
+
+  C = new MyList<gridseg>; // head of the result: the overlap box itself
+  C->data = new gridseg;
+  for (int i = 0; i < dim; i++)
+  {
+    C->data->llb[i] = llb[i];
+    C->data->uub[i] = uub[i];
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+    C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / DH[i] + 0.4) + 1;
+#else
+#ifdef Cell
+    C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / DH[i] + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+  }
+  if (D->data->Bg == B->data->Bg) // keep the block pointer only when both inputs share it
+    C->data->Bg = D->data->Bg;
+  else
+    C->data->Bg = 0;
+
+  C->next = gs_subtract_virtual(D, C); // pieces of D outside the overlap (may be an empty list)
+
+  MyList<gridseg> *E = C;
+
+  while (E->next) // find the current tail of C
+    E = E->next;
+
+  E->next = gs_subtract_virtual(B, C); // pieces of B outside the overlap; only the head node of C is read by the callee
+
+  // this part code results in 3 patches generally
+  /*
+       C = clone_gsl(D,true);
+       C->next = gs_subtract_virtual(B,C);
+  */
+
+  return true;
+}
+// Return a new list with the parts of segment A lying outside segment B (only the head nodes of A and B are read). Note here it is different to real cut, we need leave the cutting edge for both vertex center and cell center
+MyList<Parallel::gridseg> *Parallel::gs_subtract_virtual(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B)
+{
+  if (!A)
+    return 0;
+  if (!B)
+    return clone_gsl(A, true); // nothing to subtract: hand back a copy of A
+
+  double cut_plane[2 * dim], DH[dim]; // cut_plane[i] / cut_plane[i + dim]: lower / upper bound of the remaining box in direction i
+
+  for (int i = 0; i < dim; i++)
+  {
+    double tdh; // grid spacing of B in direction i
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+    DH[i] = (A->data->uub[i] - A->data->llb[i]) / (A->data->shape[i] - 1);
+    tdh = (B->data->uub[i] - B->data->llb[i]) / (B->data->shape[i] - 1);
+#else
+#ifdef Cell
+    DH[i] = (A->data->uub[i] - A->data->llb[i]) / A->data->shape[i];
+    tdh = (B->data->uub[i] - B->data->llb[i]) / B->data->shape[i];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+    if (!feq(DH[i], tdh, DH[i] / 2)) // both segments must share the same spacing
+    {
+      cout << "Parallel::gs_subtract_virtual meets different grid segment " << DH[i] << " vs " << tdh << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+  }
+
+  MyList<Parallel::gridseg> *C = 0, *q; // C: result list, grown at the head; q: freshly cloned piece
+  for (int i = 0; i < dim; i++)
+  {
+    if (B->data->llb[i] > A->data->uub[i] || B->data->uub[i] < A->data->llb[i])
+      return clone_gsl(A, true); // A and B are disjoint in direction i: all of A survives
+    cut_plane[i] = A->data->llb[i]; // start from the full extent of A
+    cut_plane[i + dim] = A->data->uub[i];
+  }
+
+  for (int i = 0; i < dim; i++)
+  {
+    cut_plane[i] = Mymax(A->data->llb[i], B->data->llb[i]);
+    if (cut_plane[i] > A->data->llb[i]) // part of A lies below B in direction i
+    {
+      q = clone_gsl(A, true);
+      // prolong the list from head
+      if (C)
+        q->next = C;
+      C = q;
+      for (int j = 0; j < dim; j++)
+      {
+        if (i == j)
+        {
+          C->data->llb[i] = A->data->llb[i];
+          // **note here it is different to real cut, we need leave the cutting edge for both vertex center and cell center**
+          C->data->uub[i] = Mymax(C->data->llb[i], cut_plane[i]);
+        }
+        else
+        {
+          C->data->llb[j] = cut_plane[j]; // directions already processed are narrowed to B's extent, later ones still span all of A
+          C->data->uub[j] = cut_plane[j + dim];
+        }
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4) + 1;
+#else
+#ifdef Cell
+        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      }
+    }
+
+    cut_plane[i + dim] = Mymin(A->data->uub[i], B->data->uub[i]);
+    if (cut_plane[i + dim] < A->data->uub[i]) // part of A lies above B in direction i
+    {
+      q = clone_gsl(A, true);
+      if (C)
+        q->next = C;
+      C = q;
+      for (int j = 0; j < dim; j++)
+      {
+        if (i == j)
+        {
+          C->data->uub[i] = A->data->uub[i];
+          // note here it is different to real cut, we need leave the cutting edge for both vertex center and cell center
+          C->data->llb[i] = Mymin(C->data->uub[i], cut_plane[i + dim]);
+        }
+        else
+        {
+          C->data->llb[j] = cut_plane[j];
+          C->data->uub[j] = cut_plane[j + dim];
+        }
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4) + 1;
+#else
+#ifdef Cell
+        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      }
+    }
+  }
+  return C; // 0 when B covers A completely
+}
+// fill_level_data: note the data structure (time-level diagram; the two columns presumably show the coarse and the fine level - TODO confirm)
+// if CC is true
+// 1   -----------  1   ------  ^
+//                  0   ------  |  t
+// 0   -----------  old ------  |
+//
+// old -----------
+// if CC is false
+// 1   -----------  1   ------  ^
+// 0   -----------  0   ------  |  t
+// old -----------  old ------  |
+void Parallel::fill_level_data(MyList<Patch> *PatLd, MyList<Patch> *PatLs, MyList<Patch> *PatcL, // PatLd: patches to fill, PatLs: same-level source patches, PatcL: coarser patches
+                               MyList<var> *OldList, MyList<var> *StateList, MyList<var> *FutureList,
+                               MyList<var> *tmList, int Symmetry, bool BB, bool CC)
+{
+  if (PatLd->data->lev != PatLs->data->lev) // destination and same-level source must be on one level (only the first patch of each list is checked)
+  {
+    cout << "Parallel::fill_level_data: meet requst from lev#" << PatLs->data->lev << " to lev#" << PatLd->data->lev << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+  if (PatLd->data->lev <= PatcL->data->lev) // PatcL must be on a strictly lower level
+  {
+    cout << "Parallel::fill_level_data: meet prolong requst from lev#" << PatcL->data->lev << " to lev#" << PatLd->data->lev << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+
+  int cpusize;
+  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
+
+  MyList<var> *VarList = 0; // StateList followed by FutureList, so both can be moved in one pass
+  MyList<var> *p;
+  p = StateList;
+  while (p)
+  {
+    if (VarList)
+      VarList->insert(p->data);
+    else
+      VarList = new MyList<var>(p->data);
+    p = p->next;
+  }
+  p = FutureList;
+  while (p)
+  {
+    if (VarList)
+      VarList->insert(p->data);
+    else
+      VarList = new MyList<var>(p->data);
+    p = p->next;
+  }
+
+  MyList<Parallel::gridseg> *dst;
+  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst; // one list head per rank
+  src = new MyList<Parallel::gridseg> *[cpusize];
+  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
+  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
+
+  dst = build_complete_gsl(PatLd); // including ghost
+  // copy part: move data from the same-level source patches PatLs into PatLd
+  for (int node = 0; node < cpusize; node++)
+  {
+    src[node] = build_owned_gsl(PatLs, node, 0, Symmetry);                // similar to Sync
+    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
+  }
+
+  transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
+
+  for (int node = 0; node < cpusize; node++) // release the lists of the copy part; the arrays are reused below
+  {
+    if (src[node])
+      src[node]->destroyList();
+    if (transfer_src[node])
+      transfer_src[node]->destroyList();
+    if (transfer_dst[node])
+      transfer_dst[node]->destroyList();
+  }
+
+  MyList<Parallel::gridseg> *dsts, *dstd;
+  dsts = build_complete_gsl_virtual(PatLs);
+  dstd = dst;
+  dst = gsl_subtract(dstd, dsts); // what remains of PatLd after removing the region described by PatLs
+  if (dstd)
+    dstd->destroyList();
+  if (dsts)
+    dsts->destroyList();
+
+  if (dst) // only when some part of PatLd is left over
+  {
+    // prolongation part: the remaining region is filled from the coarser patches PatcL
+    for (int node = 0; node < cpusize; node++)
+    {
+      src[node] = build_owned_gsl(PatcL, node, 4, Symmetry);                // - buffer - ghost - BD ghost
+      build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
+    }
+
+    if (CC)
+    {
+      // for FutureList
+      // restrict first~~~>
+      {
+        Restrict(PatcL, PatLs, FutureList, FutureList, Symmetry);
+        Sync(PatcL, FutureList, Symmetry);
+      }
+      //<~~~prolong then
+      transfer(transfer_src, transfer_dst, FutureList, FutureList, Symmetry);
+
+      // for StateList
+      // time interpolation part: BB selects the overload that also takes OldList
+      if (BB)
+        prepare_inter_time_level(PatcL, FutureList, StateList, OldList,
+                                 tmList, 0); // use SynchList_pre as temporal storage space
+      else
+        prepare_inter_time_level(PatcL, FutureList, StateList,
+                                 tmList, 0); // use SynchList_pre as temporal storage space
+                                             // restrict first~~~>
+      {
+        Restrict(PatcL, PatLs, StateList, tmList, Symmetry);
+        Sync(PatcL, tmList, Symmetry);
+      }
+      //<~~~prolong then
+      transfer(transfer_src, transfer_dst, tmList, StateList, Symmetry); // coarse tmList feeds the fine StateList
+    }
+    else
+    {
+      // for both FutureList and StateList
+      // restrict first~~~>
+      {
+        Restrict(PatcL, PatLs, VarList, VarList, Symmetry);
+        Sync(PatcL, VarList, Symmetry);
+      }
+      //<~~~prolong then
+      transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
+    }
+
+    for (int node = 0; node < cpusize; node++) // release the lists of the prolongation part
+    {
+      if (src[node])
+        src[node]->destroyList();
+      if (transfer_src[node])
+        transfer_src[node]->destroyList();
+      if (transfer_dst[node])
+        transfer_dst[node]->destroyList();
+    }
+
+    dst->destroyList();
+  }
+
+  delete[] src;
+  delete[] transfer_src;
+  delete[] transfer_dst;
+
+  VarList->clearList(); // NOTE(review): VarList is 0 when both StateList and FutureList are empty; this call is not guarded
+}
+// Free every Block (list node and payload) referenced by the patches in PatchLIST and reset blb/ble.
+void Parallel::KillBlocks(MyList<Patch> *PatchLIST)
+{
+  for (MyList<Patch> *cur = PatchLIST; cur; cur = cur->next)
+  {
+    Patch *pat = cur->data;
+    // release the nodes in front of ble, stopping early if the chain runs out
+    while (pat->blb && pat->blb != pat->ble)
+    {
+      MyList<Block> *rest = pat->blb->next;
+      delete pat->blb->data;
+      delete pat->blb;
+      pat->blb = rest;
+    }
+    // ble itself is released separately
+    if (pat->ble)
+    {
+      delete pat->ble->data;
+      delete pat->ble;
+    }
+    pat->ble = 0;
+    pat->blb = 0;
+  }
+}
+// Interpolate VarList at the NN points XX[d][i], each on the first patch of PatL whose box
+// (bbox shrunk by lli/uui grid spacings) contains the point; the num_var results of point i
+// are written to Shellf + i * num_var. Returns false as soon as a point lies in no patch.
+bool Parallel::PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
+                                     int NN, double **XX,
+                                     double *Shellf, int Symmetry)
+{
+  int num_var = 0;
+  for (MyList<var> *varl = VarList; varl; varl = varl->next)
+    num_var++;
+
+  double lld[dim], uud[dim];
+  // One-point coordinate buffer handed to Patch::Interp_Points. It lives on the stack so that
+  // the early "return false" below cannot leak it (the former heap buffers were never freed there).
+  double coord[dim];
+  double *pox[dim];
+  for (int j = 0; j < dim; j++)
+    pox[j] = &coord[j];
+  for (int i = 0; i < NN; i++)
+  {
+    MyList<Patch> *PL = PatL;
+    while (PL)
+    {
+      bool flag = true; // stays true while point i is inside the patch in every direction tested
+      for (int j = 0; j < dim; j++)
+      {
+        double h = PL->data->getdX(j);
+        lld[j] = PL->data->lli[j] * h;
+        uud[j] = PL->data->uui[j] * h;
+        if (XX[j][i] < PL->data->bbox[j] + lld[j] || XX[j][i] > PL->data->bbox[j + dim] - uud[j])
+        {
+          flag = false;
+          break;
+        }
+        pox[j][0] = XX[j][i];
+      }
+      // the first patch containing the point does the interpolation
+      if (flag)
+      {
+        PL->data->Interp_Points(VarList, 1, pox, Shellf + i * num_var, Symmetry);
+        break;
+      }
+      PL = PL->next;
+    }
+    // PL ran off the end of the list: no patch contains point i
+    if (!PL)
+    {
+      checkpatchlist(PatL, false);
+      return false;
+    }
+  }
+
+  // coord/pox are automatic storage: nothing to release here
+  return true;
+}
+// Communicator-aware variant: interpolate every variable of VarList at NN
+// points, forwarding Comm_here to Patch::Interp_Points. XX[j][i] is coordinate
+// j of point i; results for point i start at Shellf[i * num_var]. Each point is
+// given to the first patch of PatL whose bbox, inset by lli[j]*getdX(j) below
+// and uui[j]*getdX(j) above, contains it. Returns false, after calling
+// checkpatchlist(PatL, false), as soon as a point lies in no patch; else true.
+bool Parallel::PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
+                                     int NN, double **XX,
+                                     double *Shellf, int Symmetry, MPI_Comm Comm_here)
+{
+  int num_var = 0;
+  for (MyList<var> *varl = VarList; varl; varl = varl->next)
+    num_var++;
+
+  // Single-point buffer in the double** layout expected by Interp_Points.
+  // Stack storage replaces the former new[]/delete[] pair, which leaked the
+  // buffer whenever the function left through the early "return false".
+  double pox_val[dim];
+  double *pox[dim];
+  for (int j = 0; j < dim; j++)
+    pox[j] = &pox_val[j];
+
+  for (int i = 0; i < NN; i++)
+  {
+    MyList<Patch> *PL = PatL;
+    while (PL)
+    {
+      bool flag = true;
+      for (int j = 0; j < dim; j++)
+      {
+        double h = PL->data->getdX(j);
+        double lld = PL->data->lli[j] * h;
+        double uud = PL->data->uui[j] * h;
+        if (XX[j][i] < PL->data->bbox[j] + lld || XX[j][i] > PL->data->bbox[j + dim] - uud)
+        {
+          flag = false;
+          break;
+        }
+        pox_val[j] = XX[j][i];
+      }
+      if (flag)
+      {
+        PL->data->Interp_Points(VarList, 1, pox, Shellf + i * num_var, Symmetry, Comm_here);
+        break;
+      }
+      PL = PL->next;
+    }
+    if (!PL)
+    {
+      checkpatchlist(PatL, false);
+      return false;
+    }
+  }
+  return true;
+}
+// Snap the level-`lev` box bboxl (modified in place) onto the grid of spacing
+// DH0[d] * 0.5^lev measured from the faces of bbox0, then compare the resulting
+// count per direction with shape[d]; on a mismatch rank 0 reports and aborts.
+void Parallel::aligncheck(double *bbox0, double *bboxl, int lev, double *DH0, int *shape)
+{
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+  const int extra_pts = 1; // Vertex build: count is number of spacings + 1
+#else
+#ifdef Cell
+  const int extra_pts = 0; // Cell build: count is the number of spacings
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+  for (int d = 0; d < dim; d++)
+  {
+    const double DHl = DH0[d] * pow(0.5, lev);
+    // int(x + 0.4) is the rounding rule used throughout this routine.
+    const double low_gap = bboxl[d] - bbox0[d];
+    bboxl[d] = bbox0[d] + int(low_gap / DHl + 0.4) * DHl;
+    const double up_gap = bbox0[d + dim] - bboxl[d + dim];
+    bboxl[d + dim] = bbox0[d + dim] - int(up_gap / DHl + 0.4) * DHl;
+    const int NN = int((bboxl[d + dim] - bboxl[d]) / DHl + 0.4) + extra_pts;
+    if (NN == shape[d])
+      continue;
+    int myrank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+    if (myrank != 0)
+      continue;
+    cout << "Parallel::aligncheck want shape " << NN << " for lev#" << lev << ", but " << shape[d] << endl;
+    cout << "i = " << d << ", low = " << bboxl[d] << ", up = " << bboxl[d + dim] << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+}
+// Report whether the point pox lies strictly inside (faces excluded) the
+// llb..uub box of at least one grid segment in the list gsl.
+// pox must provide dim coordinates. An empty list yields false.
+bool Parallel::point_locat_gsl(double *pox, MyList<Parallel::gridseg> *gsl)
+{
+  for (MyList<Parallel::gridseg> *seg = gsl; seg; seg = seg->next)
+  {
+    // A segment matches only if every coordinate passes the strict test.
+    bool inside = false;
+    for (int d = 0; d < dim; d++)
+    {
+      inside = (pox[d] > seg->data->llb[d] && pox[d] < seg->data->uub[d]);
+      if (!inside)
+        break;
+    }
+    if (inside)
+      return true;
+  }
+
+  // No segment contains the point.
+  return false;
+}
+// Invoke checkPatch(buflog) on every patch of PatL, in list order.
+// A null list is accepted and results in no calls.
+void Parallel::checkpatchlist(MyList<Patch> *PatL, bool buflog)
+{
+  for (MyList<Patch> *node = PatL; node; node = node->next)
+  {
+    node->data->checkPatch(buflog);
+  }
+}
+// Decide from per-rank interpolation timings whether load balancing is needed.
+//   rank_times  - nprocs timing values, indexed by rank (read only)
+//   nprocs      - number of entries in rank_times
+//   num_heavy   - out: number of ranks written to heavy_ranks (0..4)
+//   heavy_ranks - out: heavy rank ids, slowest first; the caller must provide
+//                 room for at least 4 entries
+// A rank counts as heavy when its time exceeds 1.5x the mean of rank_times.
+// Returns true when at least one heavy rank was found.
+bool Parallel::check_load_balance_need(double *rank_times, int nprocs, int &num_heavy, int *heavy_ranks)
+{
+  // Mean time over all ranks.
+  double avg_time = 0;
+  for (int r = 0; r < nprocs; r++)
+    avg_time += rank_times[r];
+  avg_time /= nprocs;
+
+  // Gather (rank, time) for every rank above the heavy threshold.
+  typedef std::pair<int, double> RankTime;
+  std::vector<RankTime> heavy;
+  for (int r = 0; r < nprocs; r++)
+  {
+    const double t = rank_times[r];
+    if (t > avg_time * 1.5)
+      heavy.push_back(RankTime(r, t));
+  }
+
+  // Slowest first.
+  struct SlowerFirst
+  {
+    bool operator()(const RankTime &lhs, const RankTime &rhs) const { return lhs.second > rhs.second; }
+  };
+  std::sort(heavy.begin(), heavy.end(), SlowerFirst());
+
+  // Hand back at most four ranks.
+  num_heavy = std::min(4, (int)heavy.size());
+  for (int k = 0; k < num_heavy; k++)
+    heavy_ranks[k] = heavy[k].first;
+
+  return num_heavy > 0;
+}
+
+// Rank 0 only: print a load-balancing plan (which blocks on heavy ranks would be split in half, and which
+// light ranks would be merged to free ranks). Nothing is modified; split_factor/cpusize/ingfsi/fngfsi are unused.
+void Parallel::split_heavy_blocks(MyList<Patch> *PatL, int *heavy_ranks, int num_heavy,
+                                  int split_factor, int cpusize, int ingfsi, int fngfsi)
+{
+  int myrank, nprocs;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  MPI_Comm_size(MPI_COMM_WORLD, &nprocs); // nprocs is not used below
+
+  if (myrank != 0) return; // Only rank 0 performs the analysis
+
+  cout << "\n=== Load Balancing Strategy ===" << endl;
+  cout << "Heavy ranks to split (in half): " << num_heavy << endl;
+  for (int i = 0; i < num_heavy; i++)
+    cout << "  Heavy rank " << heavy_ranks[i] << endl;
+
+  // Step 1: Identify all blocks and their ranks
+  std::vector<int> all_ranks;
+  std::map<int, std::vector<Block*>> rank_to_blocks; // filled below but never read
+
+  MyList<Patch> *PL = PatL;
+  while (PL)
+  {
+    Patch *PP = PL->data;
+    MyList<Block> *BP = PP->blb;
+    while (BP)
+    {
+      Block *block = BP->data;
+      all_ranks.push_back(block->rank);
+      rank_to_blocks[block->rank].push_back(block);
+      BP = BP->next;
+    }
+    PL = PL->next;
+  }
+
+  // Step 2: Identify light ranks (not in heavy_ranks list)
+  std::set<int> heavy_set(heavy_ranks, heavy_ranks + num_heavy);
+  std::vector<int> light_ranks;
+  for (int r : all_ranks)
+  {
+    if (heavy_set.find(r) == heavy_set.end())
+    {
+      light_ranks.push_back(r);
+    }
+  }
+
+  // Remove duplicates from light_ranks
+  std::sort(light_ranks.begin(), light_ranks.end());
+  light_ranks.erase(std::unique(light_ranks.begin(), light_ranks.end()), light_ranks.end());
+
+  cout << "Found " << light_ranks.size() << " light ranks (candidates for merging)" << endl;
+
+  // Step 3: Select up to 8 light ranks to merge.
+  // Workload is not consulted: the 8 lowest-numbered light ranks are taken (light_ranks is sorted).
+  int num_to_merge = 8;
+  if (light_ranks.size() < num_to_merge) // NOTE(review): compares size_t with int
+  {
+    cout << "WARNING: Not enough light ranks to merge. Found " << light_ranks.size()
+         << ", need " << num_to_merge << endl;
+    num_to_merge = light_ranks.size();
+  }
+
+  std::vector<int> ranks_to_merge(light_ranks.begin(), light_ranks.begin() + num_to_merge);
+
+  cout << "Light ranks to merge (8 -> 4 merged ranks):" << endl; // text is fixed even when fewer than 8 are merged
+  for (int i = 0; i < num_to_merge; i++)
+    cout << "  Rank " << ranks_to_merge[i] << endl;
+
+  // Step 4: Analyze blocks that need to be split
+  cout << "\n=== Analyzing blocks for splitting ===" << endl;
+
+  struct BlockSplitInfo {
+    Block *original_block;
+    int split_dim;
+    int split_point;
+  };
+
+  std::vector<BlockSplitInfo> blocks_to_split;
+
+  PL = PatL;
+  while (PL)
+  {
+    Patch *PP = PL->data;
+    MyList<Block> *BP = PP->blb;
+    while (BP)
+    {
+      Block *block = BP->data;
+
+      // Check if this block belongs to a heavy rank
+      for (int i = 0; i < num_heavy; i++)
+      {
+        if (block->rank == heavy_ranks[i])
+        {
+          // Find the largest dimension for splitting
+          int max_dim = 0;
+          int max_size = block->shape[0];
+          for (int d = 1; d < dim; d++)
+          {
+            if (block->shape[d] > max_size)
+            {
+              max_size = block->shape[d];
+              max_dim = d;
+            }
+          }
+
+          int split_point = max_size / 2;
+
+          BlockSplitInfo info;
+          info.original_block = block;
+          info.split_dim = max_dim;
+          info.split_point = split_point;
+          blocks_to_split.push_back(info);
+
+          cout << "Block at rank " << block->rank << " will be split" << endl;
+          cout << "  Shape: [" << block->shape[0] << ", " << block->shape[1] << ", " << block->shape[2] << "]" << endl; // NOTE(review): reads shape[0..2], i.e. assumes dim >= 3
+          cout << "  Split along dimension " << max_dim << " at index " << split_point << endl;
+          break;
+        }
+      }
+
+      BP = BP->next;
+    }
+    PL = PL->next;
+  }
+
+  cout << "\nTotal blocks to split: " << blocks_to_split.size() << endl;
+
+  // Step 5: Calculate new rank assignments
+  // Strategy:
+  // - For each heavy rank, its blocks are split in half
+  // - First half keeps the original rank
+  // - Second half gets a new rank (from the freed light ranks)
+  // - 8 light ranks are merged into 4 ranks, freeing up 4 ranks
+
+  std::vector<int> freed_ranks;
+  for (size_t i = 0; i < ranks_to_merge.size(); i += 2)
+  {
+    // Merge pairs of light ranks: (ranks_to_merge[i], ranks_to_merge[i+1]) -> ranks_to_merge[i]
+    // This frees up ranks_to_merge[i+1]
+    if (i + 1 < ranks_to_merge.size())
+    {
+      freed_ranks.push_back(ranks_to_merge[i + 1]);
+      cout << "Merging ranks " << ranks_to_merge[i] << " and " << ranks_to_merge[i + 1]
+           << " -> keeping rank " << ranks_to_merge[i] << ", freeing rank " << ranks_to_merge[i + 1] << endl;
+    }
+  }
+
+  cout << "\nFreed ranks available for split blocks: ";
+  for (int r : freed_ranks)
+    cout << r << " ";
+  cout << endl;
+
+  // Step 6: Report the rank each split block would get (one freed rank per block; nothing is assigned)
+  int freed_idx = 0;
+  for (size_t i = 0; i < blocks_to_split.size(); i++)
+  {
+    BlockSplitInfo &info = blocks_to_split[i];
+    Block *original = info.original_block;
+
+    if (freed_idx < freed_ranks.size())
+    {
+      cout << "\nSplitting block at rank " << original->rank << endl;
+      cout << "  First half: keeps rank " << original->rank << endl;
+      cout << "  Second half: gets new rank " << freed_ranks[freed_idx] << endl;
+      freed_idx++;
+    }
+    else
+    {
+      cout << "WARNING: Not enough freed ranks for all split blocks!" << endl;
+      break;
+    }
+  }
+
+  cout << "\n=== Load Balancing Analysis Complete ===" << endl;
+  cout << "Next steps:" << endl;
+  cout << "  1. Recompose the grid with new rank assignments" << endl;
+  cout << "  2. Data migration will be handled by recompose_cgh" << endl;
+  cout << "  3. Ghost zone communication will be updated automatically" << endl;
+}
+
diff --git a/AMSS_NCKU_source/Parallel.h b/AMSS_NCKU_source/Parallel.h
index a6ef351..7538958 100644
--- a/AMSS_NCKU_source/Parallel.h
+++ b/AMSS_NCKU_source/Parallel.h
@@ -1,213 +1,235 @@
-
-#ifndef PARALLEL_H
-#define PARALLEL_H
-
-#include <iostream>
-#include <iomanip>
-#include <fstream>
-#include <cstdlib>
-#include <cstdio>
-#include <string>
-#include <cmath>
-#include <new>
-using namespace std;
-
-#include "Parallel_bam.h"
-#include "var.h"
-#include "MPatch.h"
-#include "Block.h"
-#include "MyList.h"
-#include "macrodef.h" //need dim; ghost_width; CONTRACT
-namespace Parallel
-{
-  struct gridseg
-  {
-    double llb[dim];
-    double uub[dim];
-    int shape[dim];
-    double illb[dim], iuub[dim]; // only use for OutBdLow2Hi
-    Block *Bg;
-  };
-  int partition1(int &nx, int split_size, int min_width, int cpusize, int shape);    // special for 1 diemnsion
-  int partition2(int *nxy, int split_size, int *min_width, int cpusize, int *shape); // special for 2 diemnsions
-  int partition3(int *nxyz, int split_size, int *min_width, int cpusize, int *shape);
-  MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); // produce corresponding Blocks
-  void KillBlocks(MyList<Patch> *PatchLIST);
-
-  void setfunction(MyList<Block> *BlL, var *vn, double func(double x, double y, double z));
-  void setfunction(int rank, MyList<Block> *BlL, var *vn, double func(double x, double y, double z));
-  void writefile(double time, int nx, int ny, int nz, double xmin, double xmax, double ymin, double ymax,
-                 double zmin, double zmax, char *filename, double *data_out);
-  void writefile(double time, int nx, int ny, double xmin, double xmax, double ymin, double ymax,
-                 char *filename, double *datain);
-  void getarrayindex(int DIM, int *shape, int *index, int n);
-  int getarraylocation(int DIM, int *shape, int *index);
-  void copy(int DIM, double *llbout, double *uubout, int *Dshape, double *DD, double *llbin, double *uubin,
-            int *shape, double *datain, double *llb, double *uub);
-  void Dump_CPU_Data(MyList<Block> *BlL, MyList<var> *DumpList, char *tag, double time, double dT);
-  void Dump_Data(MyList<Patch> *PL, MyList<var> *DumpList, char *tag, double time, double dT);
-  void Dump_Data(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT, int grd);
-  double *Collect_Data(Patch *PP, var *VP);
-  void d2Dump_Data(MyList<Patch> *PL, MyList<var> *DumpList, char *tag, double time, double dT);
-  void d2Dump_Data(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT, int grd);
-  void Dump_Data0(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT);
-  double global_interp(int DIM, int *ext, double **CoX, double *datain,
-                       double *poX, int ordn, double *SoA, int Symmetry);
-  double global_interp(int DIM, int *ext, double **CoX, double *datain,
-                       double *poX, int ordn);
-  double Lagrangian_Int(double x, int npts, double *xpts, double *funcvals);
-  double LagrangePoly(double x, int pt, int npts, double *xpts);
-  MyList<gridseg> *build_complete_gsl(Patch *Pat);
-  MyList<gridseg> *build_complete_gsl(MyList<Patch> *PatL);
-  MyList<gridseg> *build_complete_gsl_virtual(MyList<Patch> *PatL);
-  MyList<gridseg> *build_complete_gsl_virtual2(MyList<Patch> *PatL);        // - buffer
-  MyList<gridseg> *build_owned_gsl0(Patch *Pat, int rank_in);               // - ghost without extension, special for Sync usage
-  MyList<gridseg> *build_owned_gsl1(Patch *Pat, int rank_in);               // - ghost, similar to build_owned_gsl0 but extend one point on left side for vertex grid
-  MyList<gridseg> *build_owned_gsl2(Patch *Pat, int rank_in);               // - buffer - ghost
-  MyList<gridseg> *build_owned_gsl3(Patch *Pat, int rank_in, int Symmetry); // - ghost - BD ghost
-  MyList<gridseg> *build_owned_gsl4(Patch *Pat, int rank_in, int Symmetry); // - buffer - ghost - BD ghost
-  MyList<gridseg> *build_owned_gsl5(Patch *Pat, int rank_in);               // similar to build_owned_gsl2 but no extension
-  MyList<gridseg> *build_owned_gsl(MyList<Patch> *PatL, int rank_in, int type, int Symmetry);
-  void build_gstl(MyList<gridseg> *srci, MyList<gridseg> *dsti, MyList<gridseg> **out_src, MyList<gridseg> **out_dst);
-  int data_packer(double *data, MyList<gridseg> *src, MyList<gridseg> *dst, int rank_in, int dir,
-                  MyList<var> *VarLists, MyList<var> *VarListd, int Symmetry);
-  void transfer(MyList<gridseg> **src, MyList<gridseg> **dst,
-                MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
-                int Symmetry);
-  int data_packermix(double *data, MyList<gridseg> *src, MyList<gridseg> *dst, int rank_in, int dir,
-                     MyList<var> *VarLists, MyList<var> *VarListd, int Symmetry);
-  void transfermix(MyList<gridseg> **src, MyList<gridseg> **dst,
-                   MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
-                   int Symmetry);
-  void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry);
-  void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
-  void Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
-
-  struct SyncCache {
-    bool valid;
-    int cpusize;
-    MyList<gridseg> **combined_src;
-    MyList<gridseg> **combined_dst;
-    int *send_lengths;
-    int *recv_lengths;
-    double **send_bufs;
-    double **recv_bufs;
-    int *send_buf_caps;
-    int *recv_buf_caps;
-    MPI_Request *reqs;
-    MPI_Status *stats;
-    int max_reqs;
-    bool lengths_valid;
-    SyncCache();
-    void invalidate();
-    void destroy();
-  };
-
-  void Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache);
-  void transfer_cached(MyList<gridseg> **src, MyList<gridseg> **dst,
-                       MyList<var> *VarList1, MyList<var> *VarList2,
-                       int Symmetry, SyncCache &cache);
-
-  struct AsyncSyncState {
-    int req_no;
-    bool active;
-    AsyncSyncState() : req_no(0), active(false) {}
-  };
-
-  void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
-                  SyncCache &cache, AsyncSyncState &state);
-  void Sync_finish(SyncCache &cache, AsyncSyncState &state,
-                   MyList<var> *VarList, int Symmetry);
-  void OutBdLow2Hi(Patch *Patc, Patch *Patf,
-                   MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
-                   int Symmetry);
-  void OutBdLow2Hi(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
-                   MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
-                   int Symmetry);
-  void OutBdLow2Himix(Patch *Patc, Patch *Patf,
-                      MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
-                      int Symmetry);
-  void OutBdLow2Himix(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
-                      MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
-                      int Symmetry);
-  void Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
-                       MyList<var> *VarList1, MyList<var> *VarList2,
-                       int Symmetry, SyncCache &cache);
-  void OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
-                          MyList<var> *VarList1, MyList<var> *VarList2,
-                          int Symmetry, SyncCache &cache);
-  void OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
-                             MyList<var> *VarList1, MyList<var> *VarList2,
-                             int Symmetry, SyncCache &cache);
-  void Prolong(Patch *Patc, Patch *Patf,
-               MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
-               int Symmetry);
-  void Prolongint(Patch *Patc, Patch *Patf,
-                  MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
-                  int Symmetry);
-  void Restrict(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
-                MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
-                int Symmetry);
-  void Restrict_after(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
-                      MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
-                      int Symmetry); // for -ghost - BDghost
-  MyList<Parallel::gridseg> *build_PhysBD_gsl(Patch *Pat);
-  MyList<Parallel::gridseg> *build_ghost_gsl(MyList<Patch> *PatL);
-  MyList<Parallel::gridseg> *build_ghost_gsl(Patch *Pat);
-  MyList<Parallel::gridseg> *build_buffer_gsl(Patch *Pat);
-  MyList<Parallel::gridseg> *build_buffer_gsl(MyList<Patch> *PatL);
-  MyList<Parallel::gridseg> *gsl_subtract(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B);
-  MyList<Parallel::gridseg> *gs_subtract(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B);
-  MyList<Parallel::gridseg> *gsl_and(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B);
-  MyList<Parallel::gridseg> *gs_and(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B);
-  MyList<Parallel::gridseg> *clone_gsl(MyList<Parallel::gridseg> *p, bool first_only);
-  MyList<Parallel::gridseg> *build_bulk_gsl(Patch *Pat); // similar to build_owned_gsl0 but does not care rank issue
-  MyList<Parallel::gridseg> *build_bulk_gsl(Block *bp, Patch *Pat);
-  void build_PhysBD_gstl(Patch *Pat, MyList<Parallel::gridseg> *srci, MyList<Parallel::gridseg> *dsti,
-                         MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst);
-  void PeriodicBD(Patch *Pat, MyList<var> *VarList, int Symmetry);
-  double L2Norm(Patch *Pat, var *vf);
-  void checkgsl(MyList<Parallel::gridseg> *pp, bool first_only);
-  void checkvarl(MyList<var> *pp, bool first_only);
-  MyList<Parallel::gridseg> *divide_gsl(MyList<Parallel::gridseg> *p, Patch *Pat);
-  MyList<Parallel::gridseg> *divide_gs(MyList<Parallel::gridseg> *p, Patch *Pat);
-  void prepare_inter_time_level(Patch *Pat,
-                                MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
-                                MyList<var> *VarList3 /* target (t+a*dt) */, int tindex);
-  void prepare_inter_time_level(Patch *Pat,
-                                MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
-                                MyList<var> *VarList3 /* source (t-dt) */, MyList<var> *VarList4 /* target (t+a*dt) */, int tindex);
-  void prepare_inter_time_level(MyList<Patch> *PatL,
-                                MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
-                                MyList<var> *VarList3 /* target (t+a*dt) */, int tindex);
-  void prepare_inter_time_level(MyList<Patch> *Pat,
-                                MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
-                                MyList<var> *VarList3 /* source (t-dt) */, MyList<var> *VarList4 /* target (t+a*dt) */, int tindex);
-  void merge_gsl(MyList<gridseg> *&A, const double ratio);
-  bool merge_gs(MyList<gridseg> *D, MyList<gridseg> *B, MyList<gridseg> *&C, const double ratio);
-  // Add ghost region to tangent plane
-  // we assume the grids have the same resolution
-  void add_ghost_touch(MyList<gridseg> *&A);
-  void cut_gsl(MyList<gridseg> *&A);
-  bool cut_gs(MyList<gridseg> *D, MyList<gridseg> *B, MyList<gridseg> *&C);
-  MyList<Parallel::gridseg> *gs_subtract_virtual(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B);
-  void fill_level_data(MyList<Patch> *PatLd, MyList<Patch> *PatLs, MyList<Patch> *PatcL,
-                       MyList<var> *OldList, MyList<var> *StateList, MyList<var> *FutureList,
-                       MyList<var> *tmList, int Symmetry, bool BB, bool CC);
-  bool PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
-                             int NN, double **XX,
-                             double *Shellf, int Symmetry);
-  void aligncheck(double *bbox0, double *bboxl, int lev, double *DH0, int *shape);
-  bool point_locat_gsl(double *pox, MyList<Parallel::gridseg> *gsl);
-  void checkpatchlist(MyList<Patch> *PatL, bool buflog);
-
-  double L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here);
-  bool PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
-                             int NN, double **XX,
-                             double *Shellf, int Symmetry, MPI_Comm Comm_here);
-#if (PSTR == 1 || PSTR == 2 || PSTR == 3)
-  MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
-                            bool periodic, int start_rank, int end_rank, int nodes = 0);
-#endif
-}
-#endif /*PARALLEL_H */
+
+#ifndef PARALLEL_H
+#define PARALLEL_H
+
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <cstdlib>
+#include <cstdio>
+#include <string>
+#include <cmath>
+#include <new>
+using namespace std;
+#include <memory.h>
+#include "Parallel_bam.h"
+#include "var.h"
+#include "MPatch.h"
+#include "Block.h"
+#include "MyList.h"
+#include "macrodef.h" //need dim; ghost_width; CONTRACT
+namespace Parallel
+{
+  struct gridseg // box-shaped grid segment; element type of the MyList<gridseg> lists built and consumed in this namespace
+  {
+    double llb[dim]; // lower bound per dimension (point_locat_gsl requires pox[i] > llb[i])
+    double uub[dim]; // upper bound per dimension (point_locat_gsl requires pox[i] < uub[i])
+    int shape[dim]; // NOTE(review): presumably the grid-point count per dimension - confirm
+    double illb[dim], iuub[dim]; // only used by OutBdLow2Hi
+    Block *Bg; // NOTE(review): presumably the Block this segment refers to - confirm
+  };
+  int partition1(int &nx, int split_size, int min_width, int cpusize, int shape);    // special case for 1 dimension
+  int partition2(int *nxy, int split_size, int *min_width, int cpusize, int *shape); // special case for 2 dimensions
+  int partition3(int *nxyz, int split_size, int *min_width, int cpusize, int *shape);
+  MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); // produce corresponding Blocks
+  MyList<Block> *distribute_hard(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); // produce corresponding Blocks
+  Block* splitHotspotBlock(MyList<Block>* &BlL, int _dim, 
+                                 int ib0_orig, int ib3_orig, 
+                                 int jb1_orig, int jb4_orig, 
+                                 int kb2_orig, int kb5_orig, 
+                                 Patch* PP, int r_left, int r_right, 
+                                 int ingfsi, int fngfsi, bool periodic,
+                                 Block* &split_first_block, Block* &split_last_block);
+  Block* createMappedBlock(MyList<Block>* &BlL, int _dim, int* shape, double* bbox,
+                        int block_id, int ingfsi, int fngfsi, int lev); 
+  void KillBlocks(MyList<Patch> *PatchLIST);
+
+  void setfunction(MyList<Block> *BlL, var *vn, double func(double x, double y, double z));
+  void setfunction(int rank, MyList<Block> *BlL, var *vn, double func(double x, double y, double z));
+  void writefile(double time, int nx, int ny, int nz, double xmin, double xmax, double ymin, double ymax,
+                 double zmin, double zmax, char *filename, double *data_out);
+  void writefile(double time, int nx, int ny, double xmin, double xmax, double ymin, double ymax,
+                 char *filename, double *datain);
+  void getarrayindex(int DIM, int *shape, int *index, int n);
+  int getarraylocation(int DIM, int *shape, int *index);
+  void copy(int DIM, double *llbout, double *uubout, int *Dshape, double *DD, double *llbin, double *uubin,
+            int *shape, double *datain, double *llb, double *uub);
+  void Dump_CPU_Data(MyList<Block> *BlL, MyList<var> *DumpList, char *tag, double time, double dT);
+  void Dump_Data(MyList<Patch> *PL, MyList<var> *DumpList, char *tag, double time, double dT);
+  void Dump_Data(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT, int grd);
+  double *Collect_Data(Patch *PP, var *VP);
+  void d2Dump_Data(MyList<Patch> *PL, MyList<var> *DumpList, char *tag, double time, double dT);
+  void d2Dump_Data(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT, int grd);
+  void Dump_Data0(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT);
+  double global_interp(int DIM, int *ext, double **CoX, double *datain,
+                       double *poX, int ordn, double *SoA, int Symmetry);
+  double global_interp(int DIM, int *ext, double **CoX, double *datain,
+                       double *poX, int ordn);
+  double Lagrangian_Int(double x, int npts, double *xpts, double *funcvals);
+  double LagrangePoly(double x, int pt, int npts, double *xpts);
+  MyList<gridseg> *build_complete_gsl(Patch *Pat);
+  MyList<gridseg> *build_complete_gsl(MyList<Patch> *PatL);
+  MyList<gridseg> *build_complete_gsl_virtual(MyList<Patch> *PatL);
+  MyList<gridseg> *build_complete_gsl_virtual2(MyList<Patch> *PatL);        // - buffer
+  MyList<gridseg> *build_owned_gsl0(Patch *Pat, int rank_in);               // - ghost without extension, special for Sync usage
+  MyList<gridseg> *build_owned_gsl1(Patch *Pat, int rank_in);               // - ghost, similar to build_owned_gsl0 but extend one point on left side for vertex grid
+  MyList<gridseg> *build_owned_gsl2(Patch *Pat, int rank_in);               // - buffer - ghost
+  MyList<gridseg> *build_owned_gsl3(Patch *Pat, int rank_in, int Symmetry); // - ghost - BD ghost
+  MyList<gridseg> *build_owned_gsl4(Patch *Pat, int rank_in, int Symmetry); // - buffer - ghost - BD ghost
+  MyList<gridseg> *build_owned_gsl5(Patch *Pat, int rank_in);               // similar to build_owned_gsl2 but no extension
+  MyList<gridseg> *build_owned_gsl(MyList<Patch> *PatL, int rank_in, int type, int Symmetry);
+  void build_gstl(MyList<gridseg> *srci, MyList<gridseg> *dsti, MyList<gridseg> **out_src, MyList<gridseg> **out_dst);
+  int data_packer(double *data, MyList<gridseg> *src, MyList<gridseg> *dst, int rank_in, int dir,
+                  MyList<var> *VarLists, MyList<var> *VarListd, int Symmetry);
+  void transfer(MyList<gridseg> **src, MyList<gridseg> **dst,
+                MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
+                int Symmetry);
+  int data_packermix(double *data, MyList<gridseg> *src, MyList<gridseg> *dst, int rank_in, int dir,
+                     MyList<var> *VarLists, MyList<var> *VarListd, int Symmetry);
+  void transfermix(MyList<gridseg> **src, MyList<gridseg> **dst,
+                   MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
+                   int Symmetry);
+  void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry);
+  void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
+  void Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
+
+  struct SyncCache { // state taken by reference by Sync_cached, transfer_cached, Sync_start/Sync_finish and the *_cached routines declared in this header
+    bool valid;
+    int cpusize;
+    MyList<gridseg> **combined_src; // NOTE(review): presumably one source segment list per rank - confirm in Parallel.C
+    MyList<gridseg> **combined_dst; // NOTE(review): presumably one destination segment list per rank - confirm in Parallel.C
+    int *send_lengths;
+    int *recv_lengths;
+    double **send_bufs;
+    double **recv_bufs;
+    int *send_buf_caps; // NOTE(review): presumably allocated capacity of each send_bufs entry - confirm
+    int *recv_buf_caps; // NOTE(review): presumably allocated capacity of each recv_bufs entry - confirm
+    MPI_Request *reqs;
+    MPI_Status *stats;
+    int max_reqs;
+    bool lengths_valid;
+    SyncCache();
+    void invalidate();
+    void destroy(); // not a destructor: must be called explicitly; its effect is defined outside this header
+  };
+
+  void Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache);
+  void transfer_cached(MyList<gridseg> **src, MyList<gridseg> **dst,
+                       MyList<var> *VarList1, MyList<var> *VarList2,
+                       int Symmetry, SyncCache &cache);
+
+  struct AsyncSyncState { // passed by reference, together with a SyncCache, to Sync_start and Sync_finish
+    int req_no; // initialised to 0 by the constructor
+    bool active; // initialised to false by the constructor
+    AsyncSyncState() : req_no(0), active(false) {}
+  };
+
+  void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
+                  SyncCache &cache, AsyncSyncState &state);
+  void Sync_finish(SyncCache &cache, AsyncSyncState &state,
+                   MyList<var> *VarList, int Symmetry);
+  void OutBdLow2Hi(Patch *Patc, Patch *Patf,
+                   MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
+                   int Symmetry);
+  void OutBdLow2Hi(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
+                   MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
+                   int Symmetry);
+  void OutBdLow2Himix(Patch *Patc, Patch *Patf,
+                      MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
+                      int Symmetry);
+  void OutBdLow2Himix(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
+                      MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
+                      int Symmetry);
+  void Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
+                       MyList<var> *VarList1, MyList<var> *VarList2,
+                       int Symmetry, SyncCache &cache);
+  void OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
+                          MyList<var> *VarList1, MyList<var> *VarList2,
+                          int Symmetry, SyncCache &cache);
+  void OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
+                             MyList<var> *VarList1, MyList<var> *VarList2,
+                             int Symmetry, SyncCache &cache);
+  void Prolong(Patch *Patc, Patch *Patf,
+               MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
+               int Symmetry);
+  void Prolongint(Patch *Patc, Patch *Patf,
+                  MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
+                  int Symmetry);
+  void Restrict(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
+                MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
+                int Symmetry);
+  void Restrict_after(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
+                      MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
+                      int Symmetry); // for -ghost - BDghost
+  MyList<Parallel::gridseg> *build_PhysBD_gsl(Patch *Pat);
+  MyList<Parallel::gridseg> *build_ghost_gsl(MyList<Patch> *PatL);
+  MyList<Parallel::gridseg> *build_ghost_gsl(Patch *Pat);
+  MyList<Parallel::gridseg> *build_buffer_gsl(Patch *Pat);
+  MyList<Parallel::gridseg> *build_buffer_gsl(MyList<Patch> *PatL);
+  MyList<Parallel::gridseg> *gsl_subtract(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B);
+  MyList<Parallel::gridseg> *gs_subtract(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B);
+  MyList<Parallel::gridseg> *gsl_and(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B);
+  MyList<Parallel::gridseg> *gs_and(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B);
+  MyList<Parallel::gridseg> *clone_gsl(MyList<Parallel::gridseg> *p, bool first_only);
+  MyList<Parallel::gridseg> *build_bulk_gsl(Patch *Pat); // similar to build_owned_gsl0, but does not care about rank issues
+  MyList<Parallel::gridseg> *build_bulk_gsl(Block *bp, Patch *Pat);
+  void build_PhysBD_gstl(Patch *Pat, MyList<Parallel::gridseg> *srci, MyList<Parallel::gridseg> *dsti,
+                         MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst);
+  void PeriodicBD(Patch *Pat, MyList<var> *VarList, int Symmetry);
+  double L2Norm(Patch *Pat, var *vf);
+  void checkgsl(MyList<Parallel::gridseg> *pp, bool first_only);
+  void checkvarl(MyList<var> *pp, bool first_only);
+  MyList<Parallel::gridseg> *divide_gsl(MyList<Parallel::gridseg> *p, Patch *Pat);
+  MyList<Parallel::gridseg> *divide_gs(MyList<Parallel::gridseg> *p, Patch *Pat);
+  void prepare_inter_time_level(Patch *Pat,
+                                MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
+                                MyList<var> *VarList3 /* target (t+a*dt) */, int tindex);
+  void prepare_inter_time_level(Patch *Pat,
+                                MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
+                                MyList<var> *VarList3 /* source (t-dt) */, MyList<var> *VarList4 /* target (t+a*dt) */, int tindex);
+  void prepare_inter_time_level(MyList<Patch> *PatL,
+                                MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
+                                MyList<var> *VarList3 /* target (t+a*dt) */, int tindex);
+  void prepare_inter_time_level(MyList<Patch> *Pat,
+                                MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
+                                MyList<var> *VarList3 /* source (t-dt) */, MyList<var> *VarList4 /* target (t+a*dt) */, int tindex);
+  void merge_gsl(MyList<gridseg> *&A, const double ratio);
+  bool merge_gs(MyList<gridseg> *D, MyList<gridseg> *B, MyList<gridseg> *&C, const double ratio);
+  // Add a ghost region to the tangent plane.
+  // This assumes that the grids have the same resolution.
+  void add_ghost_touch(MyList<gridseg> *&A);
+  void cut_gsl(MyList<gridseg> *&A);
+  bool cut_gs(MyList<gridseg> *D, MyList<gridseg> *B, MyList<gridseg> *&C);
+  MyList<Parallel::gridseg> *gs_subtract_virtual(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B);
+  void fill_level_data(MyList<Patch> *PatLd, MyList<Patch> *PatLs, MyList<Patch> *PatcL,
+                       MyList<var> *OldList, MyList<var> *StateList, MyList<var> *FutureList,
+                       MyList<var> *tmList, int Symmetry, bool BB, bool CC);
+  bool PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
+                             int NN, double **XX,
+                             double *Shellf, int Symmetry);
+  void aligncheck(double *bbox0, double *bboxl, int lev, double *DH0, int *shape);
+  bool point_locat_gsl(double *pox, MyList<Parallel::gridseg> *gsl);
+  void checkpatchlist(MyList<Patch> *PatL, bool buflog);
+
+  double L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here);
+  bool PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
+                             int NN, double **XX,
+                             double *Shellf, int Symmetry, MPI_Comm Comm_here);
+#if (PSTR == 1 || PSTR == 2 || PSTR == 3)
+  MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
+                            bool periodic, int start_rank, int end_rank, int nodes = 0);
+  
+  // Redistribute blocks with time statistics for load balancing
+  MyList<Block> *distribute(MyList<Patch> *PatchLIST, MyList<Block> *OldBlockL,
+                            int cpusize, int ingfsi, int fngfsi,
+                            bool periodic, int start_rank, int end_rank, int nodes = 0);
+#endif
+
+  // Dynamic load balancing: split blocks for heavy ranks
+    void split_heavy_blocks(MyList<Patch> *PatL, int *heavy_ranks, int num_heavy,
+                            int split_factor, int cpusize, int ingfsi, int fngfsi);
+
+    // Check if load balancing is needed based on interpolation times
+    bool check_load_balance_need(double *rank_times, int nprocs, int &num_heavy, int *heavy_ranks);
+  }
+  #endif /*PARALLEL_H */
diff --git a/AMSS_NCKU_source/cgh.C b/AMSS_NCKU_source/cgh.C
index e27ccd6..a72ba42 100644
--- a/AMSS_NCKU_source/cgh.C
+++ b/AMSS_NCKU_source/cgh.C
@@ -1,1707 +1,1839 @@
-
-#ifdef newc
-#include <iostream>
-#include <iomanip>
-#include <fstream>
-#include <cstdlib>
-#include <cstdio>
-#include <string>
-#include <cmath>
-#include <map>
-using namespace std;
-#else
-#include <iostream.h>
-#include <iomanip.h>
-#include <fstream.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <math.h>
-#include <map.h>
-#endif
-
-#include <mpi.h>
-
-#include "macrodef.h"
-#include "misc.h"
-#include "cgh.h"
-#include "Parallel.h"
-#include "parameters.h"
-
-//================================================================================================
-
-// define cgh class
-
-//================================================================================================
-
-cgh::cgh(int ingfsi, int fngfsi, int Symmetry, char *filename, int checkrun,
-         monitor *ErrorMonitor) : ingfs(ingfsi), fngfs(fngfsi), trfls(0)
-{
-#if (PSTR == 1 || PSTR == 2 || PSTR == 3)
-  Commlev = 0;
-  start_rank = 0;
-  end_rank = 0;
-#endif
-
-  if (!checkrun)
-  {
-    read_bbox(Symmetry, filename);
-    sethandle(ErrorMonitor);
-    for (int lev = 0; lev < levels; lev++)
-      PatL[lev] = construct_patchlist(lev, Symmetry);
-  }
-}
-
-//================================================================================================
-
-
-
-//================================================================================================
-
-// This member function is the destructor; it releases allocated resources and deletes variables
-
-//================================================================================================
-
-cgh::~cgh()
-{
-  for (int lev = 0; lev < levels; lev++)
-  {
-    for (int grd = 0; grd < grids[lev]; grd++)
-    {
-      delete[] bbox[lev][grd];
-      delete[] shape[lev][grd];
-      delete[] handle[lev][grd];
-    }
-    delete[] bbox[lev];
-    delete[] shape[lev];
-    delete[] handle[lev];
-    Parallel::KillBlocks(PatL[lev]);
-    PatL[lev]->destroyList();
-#if (RPB == 1)
-    Parallel::destroypsuList_bam(bdsul[lev]);
-    Parallel::destroypsuList_bam(rsul[lev]);
-#endif
-  }
-  delete[] grids;
-  delete[] Lt;
-  delete[] bbox;
-  delete[] shape;
-  delete[] handle;
-  delete[] PatL;
-#if (RPB == 1)
-  delete[] bdsul;
-  delete[] rsul;
-#endif
-
-#if (PSTR == 1 || PSTR == 2 || PSTR == 3)
-  for (int lev = 0; lev < levels; lev++)
-  {
-    MPI_Comm_free(&Commlev[lev]);
-  }
-
-  if (Commlev)
-    delete[] Commlev;
-  if (start_rank)
-    delete[] start_rank;
-  if (end_rank)
-    delete[] end_rank;
-#endif
-  for (int lev = 0; lev < levels; lev++)
-  {
-    for (int ibh = 0; ibh < BH_num_in; ibh++)
-      delete[] Porgls[lev][ibh];
-    delete[] Porgls[lev];
-  }
-  delete[] Porgls;
-}
-
-//================================================================================================
-
-
-//================================================================================================
-
-// This member function constructs the computational grid
-
-//================================================================================================
-
-#if (PSTR == 0)
-void cgh::compose_cgh(int nprocs)
-{
-  for (int lev = 0; lev < levels; lev++)
-  {
-    checkPatchList(PatL[lev], false);
-    Parallel::distribute(PatL[lev], nprocs, ingfs, fngfs, false);
-#if (RPB == 1)
-    // we need distributed box of PatL[lev] and PatL[lev-1]
-    if (lev > 0)
-    {
-      Parallel::Constr_pointstr_OutBdLow2Hi(PatL[lev], PatL[lev - 1], bdsul[lev]);
-      Parallel::Constr_pointstr_Restrict(PatL[lev], PatL[lev - 1], rsul[lev]);
-    }
-    else
-    {
-      bdsul[lev] = 0;
-      rsul[lev] = 0;
-    }
-#endif
-  }
-}
-
-//================================================================================================
-
-
-//================================================================================================
-
-// This member function constructs the computational grid
-// For the cases PSTR == 1 and PSTR == 2
-
-//================================================================================================
-
-#elif (PSTR == 1 || PSTR == 2)
-void cgh::compose_cgh(int nprocs)
-{
-  Commlev = new MPI_Comm[levels];
-  construct_mylev(nprocs);
-  for (int lev = 0; lev < levels; lev++)
-  {
-    MPI_Comm_split(MPI_COMM_WORLD, mylev, lev, &Commlev[lev]);
-    checkPatchList(PatL[lev], false);
-    Parallel::distribute(PatL[lev], end_rank[lev] - start_rank[lev] + 1, ingfs, fngfs, false, start_rank[lev], end_rank[lev]);
-#if (RPB == 1)
-#error "not support yet"
-#endif
-  }
-  /* note different comm field has its own rank index
-    int myrank;
-    MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
-    if(myrank==nprocs-1)
-    {
-        cout<<"myrank = "<<myrank<<", mylev = "<<mylev<<endl;
-        MPI_Comm_rank(Commlev[levels-1],&myrank);
-        cout<<myrank<<" :)"<<endl;
-    }
-  */
-}
-
-//================================================================================================
-
-#if (PSTR == 1)
-void cgh::construct_mylev(int nprocs)
-{
-  if (nprocs < levels)
-  {
-    cout << "Too few procs to use parallel level methods!" << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-
-  start_rank = new int[levels];
-  end_rank = new int[levels];
-
-  int myrank;
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
-  int mp;
-  mp = nprocs / levels;
-
-  start_rank[0] = 0;
-  end_rank[0] = mp - 1;
-  for (int lev = 1; lev < levels - 1; lev++)
-  {
-    start_rank[lev] = end_rank[lev - 1] + 1;
-    end_rank[lev] = end_rank[lev - 1] + mp;
-  }
-  start_rank[levels - 1] = end_rank[levels - 2] + 1;
-  end_rank[levels - 1] = nprocs - 1;
-
-  for (int lev = 0; lev < levels; lev++)
-  {
-    if (myrank >= start_rank[lev] && myrank <= end_rank[lev])
-      mylev = lev;
-  }
-}
-#elif (PSTR == 2)
-void cgh::construct_mylev(int nprocs)
-{
-  if (nprocs < levels)
-  {
-    cout << "Too few procs to use parallel level methods!" << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-
-  start_rank = new int[levels];
-  end_rank = new int[levels];
-
-  int myrank;
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
-  int mp;
-  mp = nprocs / levels;
-
-  start_rank[levels - 1] = 0;
-  end_rank[levels - 1] = mp - 1;
-  for (int lev = levels - 2; lev > 0; lev--)
-  {
-    start_rank[lev] = end_rank[lev - 1] + 1;
-    end_rank[lev] = end_rank[lev - 1] + mp;
-  }
-  start_rank[0] = end_rank[1] + 1;
-  end_rank[0] = nprocs - 1;
-
-  for (int lev = levels - 1; lev >= 0; lev--)
-  {
-    if (myrank >= start_rank[lev] && myrank <= end_rank[lev])
-      mylev = lev;
-  }
-}
-#endif
-
-#elif (PSTR == 3)
-void cgh::construct_mylev(int nprocs)
-{
-  if (nprocs <= 1)
-  {
-    cout << " cgh::construct_mylev requires at least 2 procs" << endl;
-    exit(0);
-  }
-
-  start_rank = new int[2];
-  end_rank = new int[2];
-
-  int myrank;
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
-  int mp;
-  mp = nprocs / 2;
-
-  // for other levels
-  for (int lev = 0; lev < levels - 1; lev++)
-  {
-    start_rank[lev] = 0;
-    end_rank[lev] = mp - 1;
-  }
-  // for finest level
-  start_rank[levels - 1] = end_rank[0] + 1;
-  end_rank[levels - 1] = nprocs - 1;
-
-  if (myrank >= start_rank[0] && myrank <= end_rank[0])
-    mylev = -1; // for other levels
-  else
-    mylev = 1; // for finest level
-}
-
-
-//-----------------------------------------------------------------------
-
-
-void cgh::compose_cgh(int nprocs)
-{
-  Commlev = new MPI_Comm[levels];
-  construct_mylev(nprocs);
-
-  for (int lev = 0; lev < levels - 1; lev++)
-  {
-    MPI_Comm_split(MPI_COMM_WORLD, mylev, -1, &Commlev[lev]);
-  }
-  MPI_Comm_split(MPI_COMM_WORLD, mylev, 1, &Commlev[levels - 1]);
-
-  for (int lev = 0; lev < levels; lev++)
-  {
-    checkPatchList(PatL[lev], false);
-    Parallel::distribute(PatL[lev], end_rank[lev] - start_rank[lev] + 1, ingfs, fngfs, false, start_rank[lev], end_rank[lev]);
-#if (RPB == 1)
-#error "not support yet"
-#endif
-  }
-}
-#endif
-
-
-void cgh::sethandle(monitor *ErrorMonitor)
-{
-  int BH_num;
-  Porgls = new double **[levels];
-  char filename[100];
-  {
-    map<string, string>::iterator iter = parameters::str_par.find("inputpar");
-    if (iter != parameters::str_par.end())
-    {
-      strcpy(filename, (iter->second).c_str());
-    }
-    else
-    {
-      cout << "Error inputpar" << endl;
-      exit(0);
-    }
-  }
-  // read parameter from file
-  {
-    const int LEN = 256;
-    char pline[LEN];
-    string str, sgrp, skey, sval;
-    int sind;
-    ifstream inf(filename, ifstream::in);
-    if (!inf.good() && ErrorMonitor && ErrorMonitor->outfile)
-    {
-      ErrorMonitor->outfile << "Can not open parameter file " << filename << " for inputing information of black holes" << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-
-    for (int i = 1; inf.good(); i++)
-    {
-      inf.getline(pline, LEN);
-      str = pline;
-
-      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
-      if (status == -1)
-      {
-        if (ErrorMonitor && ErrorMonitor->outfile)
-          ErrorMonitor->outfile << "error reading parameter file " << filename << " in line " << i << endl;
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-      else if (status == 0)
-        continue;
-
-      if (sgrp == "BSSN" && skey == "BH_num")
-        BH_num = atoi(sval.c_str());
-      else if (sgrp == "cgh" && skey == "moving levels start from")
-      {
-        movls = atoi(sval.c_str());
-        movls = Mymin(movls, levels);
-        movls = Mymax(0, movls);
-      }
-    }
-    inf.close();
-  }
-  for (int lev = 0; lev < levels; lev++)
-  {
-    Porgls[lev] = new double *[BH_num];
-    for (int i = 0; i < BH_num; i++)
-      Porgls[lev][i] = new double[dim];
-  }
-  // read parameter from file
-  {
-    const int LEN = 256;
-    char pline[LEN];
-    string str, sgrp, skey, sval;
-    int sind;
-    ifstream inf(filename, ifstream::in);
-    if (!inf.good() && ErrorMonitor && ErrorMonitor->outfile)
-    {
-      ErrorMonitor->outfile << "Can not open parameter file " << filename
-                            << " for inputing information of black holes" << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-
-    for (int i = 1; inf.good(); i++)
-    {
-      inf.getline(pline, LEN);
-      str = pline;
-
-      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
-      if (status == -1)
-      {
-        if (ErrorMonitor && ErrorMonitor->outfile)
-          ErrorMonitor->outfile << "error reading parameter file " << filename << " in line " << i << endl;
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-      else if (status == 0)
-        continue;
-
-      if (sgrp == "BSSN" && sind < BH_num)
-      {
-        if (skey == "Porgx")
-        {
-          for (int lev = 0; lev < levels; lev++)
-            Porgls[lev][sind][0] = atof(sval.c_str());
-        }
-        else if (skey == "Porgy")
-        {
-          for (int lev = 0; lev < levels; lev++)
-            Porgls[lev][sind][1] = atof(sval.c_str());
-        }
-        else if (skey == "Porgz")
-        {
-          for (int lev = 0; lev < levels; lev++)
-            Porgls[lev][sind][2] = atof(sval.c_str());
-        }
-      }
-    }
-    inf.close();
-  }
-
-  for (int lev = 0; lev < movls; lev++)
-    for (int grd = 0; grd < grids[lev]; grd++)
-      for (int i = 0; i < dim; i++)
-        handle[lev][grd][i] = 0;
-
-  if (movls < levels)
-  {
-    if (ErrorMonitor && ErrorMonitor->I_Print)
-    {
-      cout << endl;
-      cout << " moving levels are lev #" << movls << "--" << levels - 1 << endl;
-      cout << endl;
-    }
-
-    for (int lev = movls; lev < levels; lev++)
-      for (int grd = 0; grd < grids[lev]; grd++)
-      {
-#if 0	
-	 int bht=0;
-	 for(int bhi=0;bhi<BH_num;bhi++)
-	 {
-	    bool flag=false;
-    
-	    for(int i=0;i<dim;i++)
-               if(Porgls[0][bhi][i] < bbox[lev][grd][i] || Porgls[0][bhi][i] > bbox[lev][grd][i+dim]) {flag=true; break;}
-	    if(flag) continue;
-	    bht++;
-	    if(bht==1)  for(int i=0;i<dim;i++) handle[lev][grd][i]=Porgls[0][bhi][i];
-	    else if(ErrorMonitor && ErrorMonitor->outfile) 
-	    {
-               ErrorMonitor->outfile<<"cgh::sethandle: lev#"<<lev<<" grd#"<<grd<<" has too many black holes"<<endl;
-               MPI_Abort(MPI_COMM_WORLD,1);
-	    }
-	 }
-#else
-        double xxc[dim], dis0, dis1;
-        for (int i = 0; i < dim; i++)
-          xxc[i] = (bbox[lev][grd][i] + bbox[lev][grd][i + dim]) / 2;
-        int bht = 0;
-        for (int bhi = 0; bhi < BH_num; bhi++)
-        {
-          if (bhi == 0)
-          {
-            dis0 = 0;
-            for (int i = 0; i < dim; i++)
-              dis0 += pow(Porgls[0][bhi][i] - xxc[i], 2);
-            dis0 = sqrt(dis0);
-          }
-          else
-          {
-            dis1 = 0;
-            for (int i = 0; i < dim; i++)
-              dis1 += pow(Porgls[0][bhi][i] - xxc[i], 2);
-            dis1 = sqrt(dis1);
-            if (dis0 > dis1)
-            {
-              bht = bhi;
-              dis0 = dis1;
-            } // chose nearest one
-          }
-        }
-        for (int i = 0; i < dim; i++)
-          handle[lev][grd][i] = Porgls[0][bht][i];
-#endif
-      }
-  }
-  else if (ErrorMonitor && ErrorMonitor->I_Print)
-  {
-    if (levels > 1)
-      cout << "fixed mesh refinement!" << endl;
-    else
-      cout << "unigrid simulation!" << endl;
-  }
-
-  BH_num_in = BH_num;
-}
-void cgh::checkPatchList(MyList<Patch> *PatL, bool buflog)
-{
-  while (PatL)
-  {
-    PatL->data->checkPatch(buflog);
-    PatL = PatL->next;
-  }
-}
-
-
-//================================================================================================
-
-// This member function moves the grid
-
-//================================================================================================
-
-void cgh::Regrid(int Symmetry, int BH_num, double **Porgbr, double **Porg0,
-                 MyList<var> *OldList, MyList<var> *StateList,
-                 MyList<var> *FutureList, MyList<var> *tmList, bool BB,
-                 monitor *ErrorMonitor)
-{
-  // for moving part
-  if (movls < levels)
-  {
-    bool tot_flag = false;
-    bool *lev_flag;
-    double **tmpPorg;
-    tmpPorg = new double *[BH_num];
-    for (int bhi = 0; bhi < BH_num; bhi++)
-    {
-      tmpPorg[bhi] = new double[dim];
-      for (int i = 0; i < dim; i++)
-        tmpPorg[bhi][i] = Porgbr[bhi][i];
-    }
-    lev_flag = new bool[levels - movls];
-    for (int lev = movls; lev < levels; lev++)
-    {
-      lev_flag[lev - movls] = false;
-      for (int grd = 0; grd < grids[lev]; grd++)
-      {
-        int flag;
-        int do_every = 2;
-        double dX = PatL[lev]->data->blb->data->getdX(0);
-        double dY = PatL[lev]->data->blb->data->getdX(1);
-        double dZ = PatL[lev]->data->blb->data->getdX(2);
-        double rr;
-        // make sure that the grid corresponds to the black hole
-        int bhi = 0;
-        for (bhi = 0; bhi < BH_num; bhi++)
-        {
-          // because finner level may also change Porgbr, so we need factor 2
-          if (feq(Porgbr[bhi][0], handle[lev][grd][0], 2 * do_every * dX) &&
-              feq(Porgbr[bhi][1], handle[lev][grd][1], 2 * do_every * dY) &&
-              feq(Porgbr[bhi][2], handle[lev][grd][2], 2 * do_every * dZ))
-            break;
-        }
-        if (bhi == BH_num)
-        {
-          // if the box has already touched the original point
-          if (feq(0, bbox[lev][grd][0], dX / 2) &&
-              feq(0, bbox[lev][grd][1], dY / 2) &&
-              feq(0, bbox[lev][grd][2], dZ / 2))
-            break;
-
-          if (BH_num == 1)
-          {
-            bhi = 0;
-            break;
-          } // if only one black hole, it definitely match!
-
-          if (ErrorMonitor->outfile)
-          {
-            ErrorMonitor->outfile << "cgh::Regrid: no black hole matches with grid lev#" << lev << " grd#" << grd
-                                  << " with handle (" << handle[lev][grd][0] << "," << handle[lev][grd][1] << "," << handle[lev][grd][2] << ")" << endl;
-            ErrorMonitor->outfile << "black holes' old positions:" << endl;
-            for (bhi = 0; bhi < BH_num; bhi++)
-              ErrorMonitor->outfile << "#" << bhi << ": (" << Porgbr[bhi][0] << "," << Porgbr[bhi][1] << "," << Porgbr[bhi][2] << ")" << endl;
-            ErrorMonitor->outfile << "tolerance:" << endl;
-            ErrorMonitor->outfile << "(" << 2 * do_every * dX << "," << 2 * do_every * dY << "," << 2 * do_every * dZ << ")" << endl;
-            ErrorMonitor->outfile << "box lower boundary: (" << bbox[lev][grd][0] << "," << bbox[lev][grd][1] << "," << bbox[lev][grd][2] << ")" << endl;
-            MPI_Abort(MPI_COMM_WORLD, 1);
-          }
-
-          delete[] lev_flag;
-          for (bhi = 0; bhi < BH_num; bhi++)
-            delete[] tmpPorg[bhi];
-          delete[] tmpPorg;
-          return;
-        }
-        // x direction
-        rr = (Porg0[bhi][0] - handle[lev][grd][0]) / dX;
-        if (rr > 0)
-          flag = int(rr + 0.5) / do_every;
-        else
-          flag = int(rr - 0.5) / do_every;
-        flag = flag * do_every;
-        rr = bbox[lev][grd][0] + flag * dX;
-        // pay attention to the symmetric case
-        if (Symmetry == 2 && rr < 0)
-          rr = -bbox[lev][grd][0];
-        else
-          rr = flag * dX;
-
-        if (fabs(rr) > dX / 2)
-        {
-          lev_flag[lev - movls] = tot_flag = true;
-          bbox[lev][grd][0] = bbox[lev][grd][0] + rr;
-          bbox[lev][grd][3] = bbox[lev][grd][3] + rr;
-          handle[lev][grd][0] += rr;
-          tmpPorg[bhi][0] = Porg0[bhi][0];
-        }
-
-        // y direction
-        rr = (Porg0[bhi][1] - handle[lev][grd][1]) / dY;
-        if (rr > 0)
-          flag = int(rr + 0.5) / do_every;
-        else
-          flag = int(rr - 0.5) / do_every;
-        flag = flag * do_every;
-        rr = bbox[lev][grd][1] + flag * dY;
-        // pay attention to the symmetric case
-        if (Symmetry == 2 && rr < 0)
-          rr = -bbox[lev][grd][1];
-        else
-          rr = flag * dY;
-
-        if (fabs(rr) > dY / 2)
-        {
-          lev_flag[lev - movls] = tot_flag = true;
-          bbox[lev][grd][1] = bbox[lev][grd][1] + rr;
-          bbox[lev][grd][4] = bbox[lev][grd][4] + rr;
-          handle[lev][grd][1] += rr;
-          tmpPorg[bhi][1] = Porg0[bhi][1];
-        }
-
-        // z direction
-        rr = (Porg0[bhi][2] - handle[lev][grd][2]) / dZ;
-        if (rr > 0)
-          flag = int(rr + 0.5) / do_every;
-        else
-          flag = int(rr - 0.5) / do_every;
-        flag = flag * do_every;
-        rr = bbox[lev][grd][2] + flag * dZ;
-        // pay attention to the symmetric case
-        if (Symmetry > 0 && rr < 0)
-          rr = -bbox[lev][grd][1];
-        else
-          rr = flag * dZ;
-
-        if (fabs(rr) > dZ / 2)
-        {
-          lev_flag[lev - movls] = tot_flag = true;
-          bbox[lev][grd][2] = bbox[lev][grd][2] + rr;
-          bbox[lev][grd][5] = bbox[lev][grd][5] + rr;
-          handle[lev][grd][2] += rr;
-          tmpPorg[bhi][2] = Porg0[bhi][2];
-        }
-      }
-      //   if(ErrorMonitor->outfile && lev_flag[lev-movls]) cout<<"lev#"<<lev<<"'s boxes moved"<<endl;
-    }
-
-    if (tot_flag)
-    {
-      int nprocs;
-      MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
-      recompose_cgh(nprocs, lev_flag, OldList, StateList, FutureList, tmList, Symmetry, BB);
-      for (int bhi = 0; bhi < BH_num; bhi++)
-      {
-        for (int i = 0; i < dim; i++)
-          Porgbr[bhi][i] = tmpPorg[bhi][i];
-      }
-    }
-
-    delete[] lev_flag;
-    for (int bhi = 0; bhi < BH_num; bhi++)
-      delete[] tmpPorg[bhi];
-    delete[] tmpPorg;
-  }
-}
-
-//================================================================================================
-
-#if (PSTR == 1 || PSTR == 2 || PSTR == 3)
-#warning "Regrid is not implimented yet"
-void cgh::Regrid_fake(int Symmetry, int BH_num, double **Porgbr, double **Porg0,
-                      MyList<var> *OldList, MyList<var> *StateList,
-                      MyList<var> *FutureList, MyList<var> *tmList, bool BB,
-                      monitor *ErrorMonitor)
-{
-  // for moving part
-  if (movls < levels)
-  {
-    bool tot_flag = false;
-    bool *lev_flag;
-    double **tmpPorg;
-    tmpPorg = new double *[BH_num];
-    for (int bhi = 0; bhi < BH_num; bhi++)
-    {
-      tmpPorg[bhi] = new double[dim];
-      for (int i = 0; i < dim; i++)
-        tmpPorg[bhi][i] = Porgbr[bhi][i];
-    }
-    lev_flag = new bool[levels - movls];
-    for (int lev = movls; lev < levels; lev++)
-    {
-      lev_flag[lev - movls] = false;
-      for (int grd = 0; grd < grids[lev]; grd++)
-      {
-        int flag;
-        int do_every = 2;
-        double dX = PatL[lev]->data->blb->data->getdX(0);
-        double dY = PatL[lev]->data->blb->data->getdX(1);
-        double dZ = PatL[lev]->data->blb->data->getdX(2);
-        double rr;
-        // make sure that the grid corresponds to the black hole
-        int bhi = 0;
-        for (bhi = 0; bhi < BH_num; bhi++)
-        {
-          // because finner level may also change Porgbr, so we need factor 2
-          if (feq(Porgbr[bhi][0], handle[lev][grd][0], 2 * do_every * dX) &&
-              feq(Porgbr[bhi][1], handle[lev][grd][1], 2 * do_every * dY) &&
-              feq(Porgbr[bhi][2], handle[lev][grd][2], 2 * do_every * dZ))
-            break;
-        }
-        if (bhi == BH_num)
-        {
-          // if the box has already touched the original point
-          if (feq(0, bbox[lev][grd][0], dX / 2) &&
-              feq(0, bbox[lev][grd][1], dY / 2) &&
-              feq(0, bbox[lev][grd][2], dZ / 2))
-            break;
-
-          if (BH_num == 1)
-          {
-            bhi = 0;
-            break;
-          } // if only one black hole, it definitely match!
-
-          if (ErrorMonitor->outfile)
-          {
-            ErrorMonitor->outfile << "cgh::Regrid: no black hole matches with grid lev#" << lev << " grd#" << grd
-                                  << " with handle (" << handle[lev][grd][0] << "," << handle[lev][grd][1] << "," << handle[lev][grd][2] << ")" << endl;
-            ErrorMonitor->outfile << "black holes' old positions:" << endl;
-            for (bhi = 0; bhi < BH_num; bhi++)
-              ErrorMonitor->outfile << "#" << bhi << ": (" << Porgbr[bhi][0] << "," << Porgbr[bhi][1] << "," << Porgbr[bhi][2] << ")" << endl;
-            ErrorMonitor->outfile << "tolerance:" << endl;
-            ErrorMonitor->outfile << "(" << 2 * do_every * dX << "," << 2 * do_every * dY << "," << 2 * do_every * dZ << ")" << endl;
-            ErrorMonitor->outfile << "box lower boundary: (" << bbox[lev][grd][0] << "," << bbox[lev][grd][1] << "," << bbox[lev][grd][2] << ")" << endl;
-            MPI_Abort(MPI_COMM_WORLD, 1);
-          }
-
-          delete[] lev_flag;
-          for (bhi = 0; bhi < BH_num; bhi++)
-            delete[] tmpPorg[bhi];
-          delete[] tmpPorg;
-          return;
-        }
-        // x direction
-        rr = (Porg0[bhi][0] - handle[lev][grd][0]) / dX;
-        if (rr > 0)
-          flag = int(rr + 0.5) / do_every;
-        else
-          flag = int(rr - 0.5) / do_every;
-        flag = flag * do_every;
-        rr = bbox[lev][grd][0] + flag * dX;
-        // pay attention to the symmetric case
-        if (Symmetry == 2 && rr < 0)
-          rr = -bbox[lev][grd][0];
-        else
-          rr = flag * dX;
-
-        if (fabs(rr) > dX / 2)
-        {
-          lev_flag[lev - movls] = tot_flag = true;
-          bbox[lev][grd][0] = bbox[lev][grd][0] + rr;
-          bbox[lev][grd][3] = bbox[lev][grd][3] + rr;
-          handle[lev][grd][0] += rr;
-          tmpPorg[bhi][0] = Porg0[bhi][0];
-        }
-
-        // y direction
-        rr = (Porg0[bhi][1] - handle[lev][grd][1]) / dY;
-        if (rr > 0)
-          flag = int(rr + 0.5) / do_every;
-        else
-          flag = int(rr - 0.5) / do_every;
-        flag = flag * do_every;
-        rr = bbox[lev][grd][1] + flag * dY;
-        // pay attention to the symmetric case
-        if (Symmetry == 2 && rr < 0)
-          rr = -bbox[lev][grd][1];
-        else
-          rr = flag * dY;
-
-        if (fabs(rr) > dY / 2)
-        {
-          lev_flag[lev - movls] = tot_flag = true;
-          bbox[lev][grd][1] = bbox[lev][grd][1] + rr;
-          bbox[lev][grd][4] = bbox[lev][grd][4] + rr;
-          handle[lev][grd][1] += rr;
-          tmpPorg[bhi][1] = Porg0[bhi][1];
-        }
-
-        // z direction
-        rr = (Porg0[bhi][2] - handle[lev][grd][2]) / dZ;
-        if (rr > 0)
-          flag = int(rr + 0.5) / do_every;
-        else
-          flag = int(rr - 0.5) / do_every;
-        flag = flag * do_every;
-        rr = bbox[lev][grd][2] + flag * dZ;
-        // pay attention to the symmetric case
-        if (Symmetry > 0 && rr < 0)
-          rr = -bbox[lev][grd][1];
-        else
-          rr = flag * dZ;
-
-        if (fabs(rr) > dZ / 2)
-        {
-          lev_flag[lev - movls] = tot_flag = true;
-          bbox[lev][grd][2] = bbox[lev][grd][2] + rr;
-          bbox[lev][grd][5] = bbox[lev][grd][5] + rr;
-          handle[lev][grd][2] += rr;
-          tmpPorg[bhi][2] = Porg0[bhi][2];
-        }
-      }
-      //   if(ErrorMonitor->outfile && lev_flag[lev-movls]) cout<<"lev#"<<lev<<"'s boxes moved"<<endl;
-    }
-
-    if (tot_flag)
-    {
-      int nprocs;
-      MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
-      recompose_cgh_fake(nprocs, lev_flag, OldList, StateList, FutureList, tmList, Symmetry, BB);
-      for (int bhi = 0; bhi < BH_num; bhi++)
-      {
-        for (int i = 0; i < dim; i++)
-          Porgbr[bhi][i] = tmpPorg[bhi][i];
-      }
-    }
-
-    delete[] lev_flag;
-    for (int bhi = 0; bhi < BH_num; bhi++)
-      delete[] tmpPorg[bhi];
-    delete[] tmpPorg;
-  }
-}
-#endif
-
-
-//================================================================================================
-
-// This member function rebuilds the grid (regrid)
-
-//================================================================================================
-
-#if (PSTR == 0)
-void cgh::recompose_cgh(int nprocs, bool *lev_flag,
-                        MyList<var> *OldList, MyList<var> *StateList,
-                        MyList<var> *FutureList, MyList<var> *tmList,
-                        int Symmetry, bool BB)
-{
-  for (int lev = movls; lev < levels; lev++)
-    if (lev_flag[lev - movls])
-    {
-      MyList<Patch> *tmPat = 0;
-      tmPat = construct_patchlist(lev, Symmetry);
-      // tmPat construction completes
-      Parallel::distribute(tmPat, nprocs, ingfs, fngfs, false);
-      //    checkPatchList(tmPat,true);
-      bool CC = (lev > trfls);
-      Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
-
-      Parallel::KillBlocks(PatL[lev]);
-      PatL[lev]->destroyList();
-      PatL[lev] = tmPat;
-#if (RPB == 1)
-      Parallel::destroypsuList_bam(bdsul[lev]);
-      Parallel::destroypsuList_bam(rsul[lev]);
-      Parallel::Constr_pointstr_OutBdLow2Hi(PatL[lev], PatL[lev - 1], bdsul[lev]);
-      Parallel::Constr_pointstr_Restrict(PatL[lev], PatL[lev - 1], rsul[lev]);
-#endif
-    }
-}
-#elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
-#warning "recompose_cgh is not implimented yet"
-void cgh::recompose_cgh(int nprocs, bool *lev_flag,
-                        MyList<var> *OldList, MyList<var> *StateList,
-                        MyList<var> *FutureList, MyList<var> *tmList,
-                        int Symmetry, bool BB)
-{
-  for (int lev = movls; lev < levels; lev++)
-    if (lev_flag[lev - movls])
-    {
-      MyList<Patch> *tmPat = 0;
-      tmPat = construct_patchlist(lev, Symmetry);
-      // tmPat construction completes
-      Parallel::distribute(tmPat, end_rank[lev] - start_rank[lev] + 1, ingfs, fngfs, false, start_rank[lev], end_rank[lev]);
-      //    checkPatchList(tmPat,true);
-      bool CC = (lev > trfls);
-      Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
-
-      Parallel::KillBlocks(PatL[lev]);
-      PatL[lev]->destroyList();
-      PatL[lev] = tmPat;
-#if (RPB == 1)
-#error "not support yet"
-#endif
-    }
-}
-
-//================================================================================================
-
-void cgh::recompose_cgh_fake(int nprocs, bool *lev_flag,
-                             MyList<var> *OldList, MyList<var> *StateList,
-                             MyList<var> *FutureList, MyList<var> *tmList,
-                             int Symmetry, bool BB)
-{
-  for (int lev = movls; lev < levels; lev++)
-    if (lev_flag[lev - movls] && lev != mylev)
-    {
-      MyList<Patch> *tmPat = 0;
-      tmPat = construct_patchlist(lev, Symmetry);
-      // tmPat construction completes
-      Parallel::distribute(tmPat, end_rank[lev] - start_rank[lev] + 1, ingfs, fngfs, false, start_rank[lev], end_rank[lev]);
-
-      Parallel::KillBlocks(PatL[lev]);
-      PatL[lev]->destroyList();
-      PatL[lev] = tmPat;
-    }
-}
-#endif
-
-//================================================================================================
-
-// This member function reads grid information from input files
-
-//================================================================================================
-
-void cgh::read_bbox(int Symmetry, char *filename)
-{
-  int myrank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-  // read parameter from file
-  {
-    const int LEN = 256;
-    char pline[LEN];
-    string str, sgrp, skey, sval;
-    int sind1, sind2, sind3;
-    ifstream inf(filename, ifstream::in);
-    if (!inf.good() && myrank == 0)
-    {
-      cout << "cgh::cgh: Can not open parameter file " << filename << " for inputing information of black holes" << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-
-    for (int i = 1; inf.good(); i++)
-    {
-      inf.getline(pline, LEN);
-      str = pline;
-
-      int status = misc::parse_parts(str, sgrp, skey, sval, sind1);
-      if (status == -1)
-      {
-        cout << "error reading parameter file " << filename << " in line " << i << endl;
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-      else if (status == 0)
-        continue;
-
-      if (sgrp == "cgh" && skey == "levels")
-      {
-        levels = atoi(sval.c_str());
-        break;
-      }
-    }
-    inf.close();
-  }
-
-  grids = new int[levels];
-  shape = new int **[levels];
-  handle = new double **[levels];
-  bbox = new double **[levels];
-  PatL = new MyList<Patch> *[levels];
-  Lt = new double[levels];
-#if (RPB == 1)
-  bdsul = new MyList<Parallel::pointstru_bam> *[levels];
-  rsul = new MyList<Parallel::pointstru_bam> *[levels];
-#endif
-  // read parameter from file
-  {
-    const int LEN = 256;
-    char pline[LEN];
-    string str, sgrp, skey, sval;
-    int sind1, sind2, sind3;
-    ifstream inf(filename, ifstream::in);
-    if (!inf.good() && myrank == 0)
-    {
-      cout << "cgh::cgh: Can not open parameter file " << filename << " for inputing information of black holes" << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-
-    for (int i = 1; inf.good(); i++)
-    {
-      inf.getline(pline, LEN);
-      str = pline;
-
-      int status = misc::parse_parts(str, sgrp, skey, sval, sind1, sind2, sind3);
-      if (status == -1)
-      {
-        cout << "error reading parameter file " << filename << " in line " << i << endl;
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-      else if (status == 0)
-        continue;
-
-      if (sgrp == "cgh" && skey == "grids" && sind1 < levels)
-        grids[sind1] = atoi(sval.c_str());
-    }
-    inf.close();
-  }
-
-  for (int sind1 = 0; sind1 < levels; sind1++)
-  {
-    shape[sind1] = new int *[grids[sind1]];
-    handle[sind1] = new double *[grids[sind1]];
-    bbox[sind1] = new double *[grids[sind1]];
-    for (int sind2 = 0; sind2 < grids[sind1]; sind2++)
-    {
-      shape[sind1][sind2] = new int[dim];
-      handle[sind1][sind2] = new double[dim];
-      bbox[sind1][sind2] = new double[2 * dim];
-    }
-  }
-  // read parameter from file
-  {
-    const int LEN = 256;
-    char pline[LEN];
-    string str, sgrp, skey, sval;
-    int sind1, sind2, sind3;
-    ifstream inf(filename, ifstream::in);
-    if (!inf.good() && myrank == 0)
-    {
-      cout << "cgh::cgh: Can not open parameter file " << filename << " for inputing information of black holes" << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-
-    for (int i = 1; inf.good(); i++)
-    {
-      inf.getline(pline, LEN);
-      str = pline;
-
-      int status = misc::parse_parts(str, sgrp, skey, sval, sind1, sind2, sind3);
-
-      if (status == -1)
-      {
-        cout << "error reading parameter file " << filename << " in line " << i << endl;
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-      else if (status == 0)
-        continue;
-
-      if (sgrp == "cgh" && sind1 < levels && sind2 < grids[sind1])
-      {
-        if (skey == "bbox")
-          bbox[sind1][sind2][sind3] = atof(sval.c_str());
-        else if (skey == "shape")
-          shape[sind1][sind2][sind3] = atoi(sval.c_str());
-      }
-    }
-    inf.close();
-  }
-// we always assume the input parameter is in cell center style
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-  for (int lev = 0; lev < levels; lev++)
-    for (int grd = 0; grd < grids[lev]; grd++)
-    {
-      for (int i = 0; i < dim; i++)
-      {
-
-        shape[lev][grd][i] = shape[lev][grd][i] + 1;
-      }
-    }
-#endif
-
-  {
-
-    // boxes align check
-    double DH0[dim];
-    for (int i = 0; i < dim; i++)
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-      DH0[i] = (bbox[0][0][i + dim] - bbox[0][0][i]) / (shape[0][0][i] - 1);
-#else
-#ifdef Cell
-      DH0[i] = (bbox[0][0][i + dim] - bbox[0][0][i]) / shape[0][0][i];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-    for (int lev = 0; lev < levels; lev++)
-      for (int grd = 0; grd < grids[lev]; grd++)
-        Parallel::aligncheck(bbox[0][0], bbox[lev][grd], lev, DH0, shape[lev][grd]);
-
-#if 0 // we do not need it here, because we do it in construct_patchlist
-// extend buffer points for shell overlap
-#ifdef WithShell   
-   for(int i=0;i<dim;i++)
-   {
-     shape[0][0][i] += buffer_width;
-     bbox[0][0][i+dim] += DH0[i]*buffer_width;
-   }
-   if(Symmetry < 2 )
-   {
-     shape[0][0][0] += buffer_width;
-     bbox[0][0][0] -= DH0[0]*buffer_width;
-     shape[0][0][1] += buffer_width;
-     bbox[0][0][1] -= DH0[1]*buffer_width;
-     if(Symmetry < 1)
-     {
-       shape[0][0][2] += buffer_width;
-       bbox[0][0][2] -= DH0[2]*buffer_width;
-     }
-   }
-#endif
-#endif
-  }
-  // print information of cgh
-  if (myrank == 0)
-  {
-    cout << endl;
-    cout << " cgh has levels: " << levels << endl;
-    cout << endl;
-    for (int lev = 0; lev < levels; lev++)
-    {
-      cout << " level #" << lev << " has boxes: " << grids[lev] << endl;
-      for (int grd = 0; grd < grids[lev]; grd++)
-      {
-        cout << " #" << grd << " box is" << "  (" << bbox[lev][grd][0] << ":" << bbox[lev][grd][3]
-             << "," << bbox[lev][grd][1] << ":" << bbox[lev][grd][4]
-             << "," << bbox[lev][grd][2] << ":" << bbox[lev][grd][5]
-             << ")." << endl;
-      }
-    }
-  }
-}
-
-//================================================================================================
-
-
-//================================================================================================
-
-// This member function generates required grid information
-
-//================================================================================================
-
-MyList<Patch> *cgh::construct_patchlist(int lev, int Symmetry)
-{
-  // Construct Patches
-  MyList<Patch> *tmPat = 0;
-  // construct box list
-  MyList<Parallel::gridseg> *boxes = 0, *gs;
-
-  /*
-  int myrank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-  if (myrank == 0)
-  {
-      cout << " construct patchlist: " << " level = " << lev << ",  grids in this level = " << grids[lev] << endl;
-  }
-  */
-
-  for (int grd = 0; grd < grids[lev]; grd++)
-  {
-    if (boxes)
-    {
-      gs->next = new MyList<Parallel::gridseg>;
-      gs = gs->next;
-      gs->data = new Parallel::gridseg;
-    }
-    else
-    {
-      boxes = gs = new MyList<Parallel::gridseg>;
-      gs->data = new Parallel::gridseg;
-    }
-    for (int i = 0; i < dim; i++)
-    {
-      gs->data->llb[i] = bbox[lev][grd][i];
-      gs->data->uub[i] = bbox[lev][grd][dim + i];
-      gs->data->shape[i] = shape[lev][grd][i];
-    }
-    gs->data->Bg = 0;
-    gs->next = 0;
-  }
-
-  // Merge grid boxes (merging more than three boxes may cause bugs)
-  // Parallel::merge_gsl(boxes, ratio);
-  if (grids[lev] < 3)
-  {
-    Parallel::merge_gsl(boxes, ratio);
-  }
-
-  // When grid boxes overlap, re-split the boxes
-  // Parallel::cut_gsl(boxes);
-  if (grids[lev] < 3)
-  {
-    Parallel::cut_gsl(boxes);
-  }
-
-  // After splitting, add new ghost regions?
-  // Parallel::add_ghost_touch(boxes);
-  if (grids[lev] < 3)
-  {
-    Parallel::add_ghost_touch(boxes);
-  }
-
-  MyList<Patch> *gp;
-  gs = boxes;
-  while (gs)
-  {
-    double tbb[2 * dim];
-    if (tmPat)
-    {
-      gp->next = new MyList<Patch>;
-      gp = gp->next;
-      for (int i = 0; i < dim; i++)
-      {
-        tbb[i] = gs->data->llb[i];
-        tbb[dim + i] = gs->data->uub[i];
-      }
-#ifdef WithShell
-      gp->data = new Patch(3, gs->data->shape, tbb, lev, true, Symmetry);
-#else
-      gp->data = new Patch(3, gs->data->shape, tbb, lev, (lev > 0), Symmetry);
-#endif
-    }
-    else
-    {
-      tmPat = gp = new MyList<Patch>;
-      for (int i = 0; i < dim; i++)
-      {
-        tbb[i] = gs->data->llb[i];
-        tbb[dim + i] = gs->data->uub[i];
-      }
-#ifdef WithShell
-      gp->data = new Patch(3, gs->data->shape, tbb, lev, true, Symmetry);
-#else
-      gp->data = new Patch(3, gs->data->shape, tbb, lev, (lev > 0), Symmetry);
-#endif
-    }
-    gp->next = 0;
-
-    gs = gs->next;
-  }
-
-  boxes->destroyList();
-
-  return tmPat;
-}
-
-//================================================================================================
-
-
-bool cgh::Interp_One_Point(MyList<var> *VarList,
-                           double *XX, /*input global Cartesian coordinate*/
-                           double *Shellf, int Symmetry)
-{
-  int lev = levels - 1;
-  while (lev >= 0)
-  {
-    MyList<Patch> *Pp = PatL[lev];
-    while (Pp)
-    {
-#if (PSTR == 0)
-      if (Pp->data->Interp_ONE_Point(VarList, XX, Shellf, Symmetry))
-        return true;
-#elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
-      if (Pp->data->Interp_ONE_Point(VarList, XX, Shellf, Symmetry, Commlev[lev]))
-        return true;
-#endif
-      Pp = Pp->next;
-    }
-    lev--;
-  }
-  return false;
-}
-
-
-void cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
-                          MyList<var> *OldList, MyList<var> *StateList,
-                          MyList<var> *FutureList, MyList<var> *tmList, bool BB,
-                          monitor *ErrorMonitor)
-{
-  if (lev < movls)
-    return;
-
-#if (0)
-  // #if (PSTR == 1 || PSTR == 2)
-  MyList<Patch> *Pp = PatL[lev];
-  while (Pp)
-  {
-    Pp->data->checkPatch(0, start_rank[mylev]);
-    Pp = Pp->next;
-  }
-  int myrank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-  if (myrank == start_rank[mylev])
-  {
-    cout << "out_rank = " << myrank << endl;
-    for (int grd = 0; grd < grids[lev]; grd++)
-    {
-      cout << handle[lev][grd][0] << "," << handle[lev][grd][1] << "," << handle[lev][grd][2] << endl;
-    }
-    for (int bhi = 0; bhi < BH_num; bhi++)
-    {
-      cout << Porgls[lev][bhi][0] << "," << Porgls[lev][bhi][1] << "," << Porgls[lev][bhi][2] << endl;
-      cout << Porg0[bhi][0] << "," << Porg0[bhi][1] << "," << Porg0[bhi][2] << endl;
-    }
-  }
-#endif
-
-  //   misc::tillherecheck(Commlev[lev],start_rank[lev],"start Regrid_Onelevel");
-  // for moving part
-  bool tot_flag = false;
-  double **tmpPorg;
-  tmpPorg = new double *[BH_num];
-  for (int bhi = 0; bhi < BH_num; bhi++)
-  {
-    tmpPorg[bhi] = new double[dim];
-    for (int i = 0; i < dim; i++)
-      tmpPorg[bhi][i] = Porgls[lev][bhi][i];
-  }
-
-  for (int grd = 0; grd < grids[lev]; grd++)
-  {
-    int flag;
-    int do_every = 2;
-    double dX = PatL[lev]->data->blb->data->getdX(0);
-    double dY = PatL[lev]->data->blb->data->getdX(1);
-    double dZ = PatL[lev]->data->blb->data->getdX(2);
-    double rr;
-    // make sure that the grid corresponds to the black hole
-    int bhi = 0;
-    for (bhi = 0; bhi < BH_num; bhi++)
-    {
-      // because finner level may also change Porgbr, so we need factor 2
-      // now I used Porgls
-      if (feq(Porgls[lev][bhi][0], handle[lev][grd][0], 2 * do_every * dX) &&
-          feq(Porgls[lev][bhi][1], handle[lev][grd][1], 2 * do_every * dY) &&
-          feq(Porgls[lev][bhi][2], handle[lev][grd][2], 2 * do_every * dZ))
-        break;
-    }
-    if (bhi == BH_num)
-    {
-      // if the box has already touched the original point
-      if (feq(0, bbox[lev][grd][0], dX / 2) &&
-          feq(0, bbox[lev][grd][1], dY / 2) &&
-          feq(0, bbox[lev][grd][2], dZ / 2))
-        break;
-
-      if (BH_num == 1)
-      {
-        bhi = 0;
-        break;
-      } // if only one black hole, it definitely match!
-
-      if (ErrorMonitor->outfile)
-      {
-        ErrorMonitor->outfile << "cgh::Regrid: no black hole matches with grid lev#" << lev << " grd#" << grd
-                              << " with handle (" << handle[lev][grd][0] << "," << handle[lev][grd][1] << "," << handle[lev][grd][2] << ")" << endl;
-        ErrorMonitor->outfile << "black holes' old positions:" << endl;
-        for (bhi = 0; bhi < BH_num; bhi++)
-          ErrorMonitor->outfile << "#" << bhi << ": (" << Porgls[lev][bhi][0] << "," << Porgls[lev][bhi][1] << ","
-                                << Porgls[lev][bhi][2] << ")" << endl;
-        ErrorMonitor->outfile << "tolerance:" << endl;
-        ErrorMonitor->outfile << "(" << 2 * do_every * dX << "," << 2 * do_every * dY << "," << 2 * do_every * dZ << ")" << endl;
-        ErrorMonitor->outfile << "box lower boundary: (" << bbox[lev][grd][0] << "," << bbox[lev][grd][1] << "," << bbox[lev][grd][2] << ")" << endl;
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-
-      for (bhi = 0; bhi < BH_num; bhi++)
-        delete[] tmpPorg[bhi];
-      delete[] tmpPorg;
-      return;
-    }
-    // x direction
-    rr = (Porg0[bhi][0] - handle[lev][grd][0]) / dX;
-    if (rr > 0)
-      flag = int(rr + 0.5) / do_every;
-    else
-      flag = int(rr - 0.5) / do_every;
-    flag = flag * do_every;
-    rr = bbox[lev][grd][0] + flag * dX;
-    // pay attention to the symmetric case
-    if (Symmetry == 2 && rr < 0)
-      rr = -bbox[lev][grd][0];
-    else
-      rr = flag * dX;
-
-    if (fabs(rr) > dX / 2)
-    {
-      tot_flag = true;
-      bbox[lev][grd][0] = bbox[lev][grd][0] + rr;
-      bbox[lev][grd][3] = bbox[lev][grd][3] + rr;
-      handle[lev][grd][0] += rr;
-      tmpPorg[bhi][0] = Porg0[bhi][0];
-    }
-
-    // y direction
-    rr = (Porg0[bhi][1] - handle[lev][grd][1]) / dY;
-    if (rr > 0)
-      flag = int(rr + 0.5) / do_every;
-    else
-      flag = int(rr - 0.5) / do_every;
-    flag = flag * do_every;
-    rr = bbox[lev][grd][1] + flag * dY;
-    // pay attention to the symmetric case
-    if (Symmetry == 2 && rr < 0)
-      rr = -bbox[lev][grd][1];
-    else
-      rr = flag * dY;
-
-    if (fabs(rr) > dY / 2)
-    {
-      tot_flag = true;
-      bbox[lev][grd][1] = bbox[lev][grd][1] + rr;
-      bbox[lev][grd][4] = bbox[lev][grd][4] + rr;
-      handle[lev][grd][1] += rr;
-      tmpPorg[bhi][1] = Porg0[bhi][1];
-    }
-
-    // z direction
-    rr = (Porg0[bhi][2] - handle[lev][grd][2]) / dZ;
-    if (rr > 0)
-      flag = int(rr + 0.5) / do_every;
-    else
-      flag = int(rr - 0.5) / do_every;
-    flag = flag * do_every;
-    rr = bbox[lev][grd][2] + flag * dZ;
-    // pay attention to the symmetric case
-    if (Symmetry > 0 && rr < 0)
-      rr = -bbox[lev][grd][1];
-    else
-      rr = flag * dZ;
-
-    if (fabs(rr) > dZ / 2)
-    {
-      tot_flag = true;
-      bbox[lev][grd][2] = bbox[lev][grd][2] + rr;
-      bbox[lev][grd][5] = bbox[lev][grd][5] + rr;
-      handle[lev][grd][2] += rr;
-      tmpPorg[bhi][2] = Porg0[bhi][2];
-    }
-  }
-
-  //   misc::tillherecheck(Commlev[lev],start_rank[lev],"after tot_flag check");
-
-  if (tot_flag)
-  {
-    int nprocs;
-    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
-
-    //     misc::tillherecheck(Commlev[lev],start_rank[lev],"before recompose_cgh_Onelevel");
-
-    recompose_cgh_Onelevel(nprocs, lev, OldList, StateList, FutureList, tmList, Symmetry, BB);
-
-    //     misc::tillherecheck(Commlev[lev],start_rank[lev],"after recompose_cgh_Onelevel");
-
-    for (int bhi = 0; bhi < BH_num; bhi++)
-    {
-      for (int i = 0; i < dim; i++)
-        Porgls[lev][bhi][i] = tmpPorg[bhi][i];
-    }
-
-#if (PSTR == 1 || PSTR == 2)
-//       MyList<Patch> *Pp=PatL[lev];
-//       while(Pp)
-//       {
-//	 Pp->data->checkPatch(0,start_rank[mylev]);
-//	 Pp=Pp->next;
-//       }
-#endif
-  }
-
-  for (int bhi = 0; bhi < BH_num; bhi++)
-    delete[] tmpPorg[bhi];
-  delete[] tmpPorg;
-}
-
-
-#if (PSTR == 0)
-void cgh::recompose_cgh_Onelevel(int nprocs, int lev,
-                                 MyList<var> *OldList, MyList<var> *StateList,
-                                 MyList<var> *FutureList, MyList<var> *tmList,
-                                 int Symmetry, bool BB)
-{
-  MyList<Patch> *tmPat = 0;
-  tmPat = construct_patchlist(lev, Symmetry);
-  // tmPat construction completes
-  Parallel::distribute(tmPat, nprocs, ingfs, fngfs, false);
-  //    checkPatchList(tmPat,true);
-  bool CC = (lev > trfls);
-  Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
-
-  Parallel::KillBlocks(PatL[lev]);
-  PatL[lev]->destroyList();
-  PatL[lev] = tmPat;
-}
-#elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
-#warning "recompose_cgh_Onelevel is not implimented yet"
-void cgh::recompose_cgh_Onelevel(int nprocs, int lev,
-                                 MyList<var> *OldList, MyList<var> *StateList,
-                                 MyList<var> *FutureList, MyList<var> *tmList,
-                                 int Symmetry, bool BB)
-{
-  MyList<Patch> *tmPat = 0;
-  misc::tillherecheck(Commlev[lev], start_rank[lev], "before construct_patchlist");
-  tmPat = construct_patchlist(lev, Symmetry);
-  misc::tillherecheck(Commlev[lev], start_rank[lev], "after construct_patchlist");
-  // tmPat construction completes
-  Parallel::distribute(tmPat, end_rank[lev] - start_rank[lev] + 1, ingfs, fngfs, false, start_rank[lev], end_rank[lev]);
-  misc::tillherecheck(Commlev[lev], start_rank[lev], "after distribute");
-  //    checkPatchList(tmPat,true);
-  bool CC = (lev > trfls);
-  Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
-  misc::tillherecheck(Commlev[lev], start_rank[lev], "after fill_level_data");
-
-  Parallel::KillBlocks(PatL[lev]);
-  PatL[lev]->destroyList();
-  PatL[lev] = tmPat;
-}
-
-
-// the input lev is lower level for regrid
-void cgh::Regrid_Onelevel_aux(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
-                              MyList<var> *OldList, MyList<var> *StateList,
-                              MyList<var> *FutureList, MyList<var> *tmList, bool BB,
-                              monitor *ErrorMonitor)
-{
-  lev++;
-  if (lev < movls)
-    return;
-
-  // for moving part
-  bool tot_flag = false;
-  double **tmpPorg;
-  tmpPorg = new double *[BH_num];
-  for (int bhi = 0; bhi < BH_num; bhi++)
-  {
-    tmpPorg[bhi] = new double[dim];
-    for (int i = 0; i < dim; i++)
-      tmpPorg[bhi][i] = Porgbr[bhi][i];
-  }
-
-  for (int grd = 0; grd < grids[lev]; grd++)
-  {
-    int flag;
-    int do_every = 2;
-    double dX = PatL[lev]->data->blb->data->getdX(0);
-    double dY = PatL[lev]->data->blb->data->getdX(1);
-    double dZ = PatL[lev]->data->blb->data->getdX(2);
-    double rr;
-    // make sure that the grid corresponds to the black hole
-    int bhi = 0;
-    for (bhi = 0; bhi < BH_num; bhi++)
-    {
-      // because finner level may also change Porgbr, so we need factor 2
-      if (feq(Porgbr[bhi][0], handle[lev][grd][0], 2 * do_every * dX) &&
-          feq(Porgbr[bhi][1], handle[lev][grd][1], 2 * do_every * dY) &&
-          feq(Porgbr[bhi][2], handle[lev][grd][2], 2 * do_every * dZ))
-        break;
-    }
-    if (bhi == BH_num)
-    {
-      // if the box has already touched the original point
-      if (feq(0, bbox[lev][grd][0], dX / 2) &&
-          feq(0, bbox[lev][grd][1], dY / 2) &&
-          feq(0, bbox[lev][grd][2], dZ / 2))
-        break;
-
-      if (BH_num == 1)
-      {
-        bhi = 0;
-        break;
-      } // if only one black hole, it definitely match!
-
-      if (ErrorMonitor->outfile)
-      {
-        ErrorMonitor->outfile << "cgh::Regrid: no black hole matches with grid lev#" << lev << " grd#" << grd
-                              << " with handle (" << handle[lev][grd][0] << "," << handle[lev][grd][1] << "," << handle[lev][grd][2] << ")" << endl;
-        ErrorMonitor->outfile << "black holes' old positions:" << endl;
-        for (bhi = 0; bhi < BH_num; bhi++)
-          ErrorMonitor->outfile << "#" << bhi << ": (" << Porgbr[bhi][0] << "," << Porgbr[bhi][1] << "," << Porgbr[bhi][2] << ")" << endl;
-        ErrorMonitor->outfile << "tolerance:" << endl;
-        ErrorMonitor->outfile << "(" << 2 * do_every * dX << "," << 2 * do_every * dY << "," << 2 * do_every * dZ << ")" << endl;
-        ErrorMonitor->outfile << "box lower boundary: (" << bbox[lev][grd][0] << "," << bbox[lev][grd][1] << "," << bbox[lev][grd][2] << ")" << endl;
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-
-      for (bhi = 0; bhi < BH_num; bhi++)
-        delete[] tmpPorg[bhi];
-      delete[] tmpPorg;
-      return;
-    }
-    // x direction
-    rr = (Porg0[bhi][0] - handle[lev][grd][0]) / dX;
-    if (rr > 0)
-      flag = int(rr + 0.5) / do_every;
-    else
-      flag = int(rr - 0.5) / do_every;
-    flag = flag * do_every;
-    rr = bbox[lev][grd][0] + flag * dX;
-    // pay attention to the symmetric case
-    if (Symmetry == 2 && rr < 0)
-      rr = -bbox[lev][grd][0];
-    else
-      rr = flag * dX;
-
-    if (fabs(rr) > dX / 2)
-    {
-      tot_flag = true;
-      bbox[lev][grd][0] = bbox[lev][grd][0] + rr;
-      bbox[lev][grd][3] = bbox[lev][grd][3] + rr;
-      handle[lev][grd][0] += rr;
-      tmpPorg[bhi][0] = Porg0[bhi][0];
-    }
-
-    // y direction
-    rr = (Porg0[bhi][1] - handle[lev][grd][1]) / dY;
-    if (rr > 0)
-      flag = int(rr + 0.5) / do_every;
-    else
-      flag = int(rr - 0.5) / do_every;
-    flag = flag * do_every;
-    rr = bbox[lev][grd][1] + flag * dY;
-    // pay attention to the symmetric case
-    if (Symmetry == 2 && rr < 0)
-      rr = -bbox[lev][grd][1];
-    else
-      rr = flag * dY;
-
-    if (fabs(rr) > dY / 2)
-    {
-      tot_flag = true;
-      bbox[lev][grd][1] = bbox[lev][grd][1] + rr;
-      bbox[lev][grd][4] = bbox[lev][grd][4] + rr;
-      handle[lev][grd][1] += rr;
-      tmpPorg[bhi][1] = Porg0[bhi][1];
-    }
-
-    // z direction
-    rr = (Porg0[bhi][2] - handle[lev][grd][2]) / dZ;
-    if (rr > 0)
-      flag = int(rr + 0.5) / do_every;
-    else
-      flag = int(rr - 0.5) / do_every;
-    flag = flag * do_every;
-    rr = bbox[lev][grd][2] + flag * dZ;
-    // pay attention to the symmetric case
-    if (Symmetry > 0 && rr < 0)
-      rr = -bbox[lev][grd][1];
-    else
-      rr = flag * dZ;
-
-    if (fabs(rr) > dZ / 2)
-    {
-      tot_flag = true;
-      bbox[lev][grd][2] = bbox[lev][grd][2] + rr;
-      bbox[lev][grd][5] = bbox[lev][grd][5] + rr;
-      handle[lev][grd][2] += rr;
-      tmpPorg[bhi][2] = Porg0[bhi][2];
-    }
-  }
-
-  if (tot_flag)
-  {
-    int nprocs;
-    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
-    recompose_cgh_Onelevel(nprocs, lev, OldList, StateList, FutureList, tmList, Symmetry, BB);
-  }
-
-  for (int bhi = 0; bhi < BH_num; bhi++)
-    delete[] tmpPorg[bhi];
-  delete[] tmpPorg;
-}
-#endif
-
-
-void cgh::settrfls(const int lev)
-{
-  trfls = lev;
-}
+
+#ifdef newc
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <cstdlib>
+#include <cstdio>
+#include <string>
+#include <cmath>
+#include <map>
+using namespace std;
+#else
+#include <iostream.h>
+#include <iomanip.h>
+#include <fstream.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <map.h>
+#endif
+
+#include <mpi.h>
+
+#include "macrodef.h"
+#include "misc.h"
+#include "cgh.h"
+#include "Parallel.h"
+#include "parameters.h"
+
+//================================================================================================
+
+// cgh class constructor
+
+//================================================================================================
+
+cgh::cgh(int ingfsi, int fngfsi, int Symmetry, char *filename, int checkrun,
+         monitor *ErrorMonitor) : ingfs(ingfsi), fngfs(fngfsi), trfls(0)
+{
+  // Load balancing bookkeeping: switched off, counters zeroed, no arrays allocated
+  heavy_ranks = nullptr;
+  rank_interp_times = nullptr;
+  num_heavy_ranks = 0;
+  current_time_step = 0;
+  load_balance_check_interval = 10; // Check every 10 time steps
+  enable_load_balance = false;
+
+#if (PSTR == 1 || PSTR == 2 || PSTR == 3)
+  end_rank = 0; // the three per-level tables start out as null pointers
+  start_rank = 0;
+  Commlev = 0;
+#endif
+
+  // When checkrun is nonzero, nothing is read from filename and no patch list is built here
+  if (checkrun)
+    return;
+  read_bbox(Symmetry, filename);
+  sethandle(ErrorMonitor);
+  for (int level = 0; level < levels; ++level)
+    PatL[level] = construct_patchlist(level, Symmetry);
+}
+
+//================================================================================================
+
+
+
+//================================================================================================
+
+// This member function is the destructor; it releases allocated resources and deletes variables
+
+//================================================================================================
+
+cgh::~cgh()
+{
+  for (int lev = 0; lev < levels; lev++) // per-level storage: boxes, shapes, handles, patch list
+  {
+    for (int grd = 0; grd < grids[lev]; grd++)
+    {
+      delete[] bbox[lev][grd];
+      delete[] shape[lev][grd];
+      delete[] handle[lev][grd];
+    }
+    delete[] bbox[lev];
+    delete[] shape[lev];
+    delete[] handle[lev];
+    Parallel::KillBlocks(PatL[lev]); // called before the list itself is destroyed
+    PatL[lev]->destroyList();
+#if (RPB == 1)
+    Parallel::destroypsuList_bam(bdsul[lev]);
+    Parallel::destroypsuList_bam(rsul[lev]);
+#endif
+  }
+  delete[] grids; // outer tables go last; grids[] is still read by the loop above
+  delete[] Lt;
+  delete[] bbox;
+  delete[] shape;
+  delete[] handle;
+  delete[] PatL;
+#if (RPB == 1)
+  delete[] bdsul;
+  delete[] rsul;
+#endif
+
+#if (PSTR == 1 || PSTR == 2 || PSTR == 3)
+  // The constructor sets Commlev to 0, so test it before touching any element.
+  if (Commlev)
+  {
+    for (int lev = 0; lev < levels; lev++)
+      MPI_Comm_free(&Commlev[lev]);
+    delete[] Commlev;
+  }
+  if (start_rank)
+    delete[] start_rank;
+  if (end_rank)
+    delete[] end_rank;
+#endif
+  for (int lev = 0; lev < levels; lev++) // Porgls is indexed [level][black hole]
+  {
+    for (int ibh = 0; ibh < BH_num_in; ibh++)
+      delete[] Porgls[lev][ibh];
+    delete[] Porgls[lev];
+  }
+  delete[] Porgls;
+
+  // Clean up load balancing memory
+  if (rank_interp_times)
+    delete[] rank_interp_times;
+  if (heavy_ranks)
+    delete[] heavy_ranks;
+}
+
+//================================================================================================
+
+
+//================================================================================================
+
+// This member function constructs the computational grid
+
+//================================================================================================
+
+#if (PSTR == 0)
+void cgh::compose_cgh(int nprocs)
+{
+  // Levels are visited in increasing index order; nprocs is forwarded to distribute_hard.
+  for (int level = 0; level < levels; ++level)
+  {
+    checkPatchList(PatL[level], false);
+    Parallel::distribute_hard(PatL[level], nprocs, ingfs, fngfs, false);
+#if (RPB == 1)
+    if (level == 0)
+    {
+      // level 0 has no PatL[level - 1] to pair with, so both lists stay empty
+      bdsul[level] = 0;
+      rsul[level] = 0;
+      continue;
+    }
+    // we need distributed box of PatL[level] and PatL[level - 1]
+    Parallel::Constr_pointstr_OutBdLow2Hi(PatL[level], PatL[level - 1], bdsul[level]);
+    Parallel::Constr_pointstr_Restrict(PatL[level], PatL[level - 1], rsul[level]);
+#endif
+  }
+}
+
+//================================================================================================
+
+
+//================================================================================================
+
+// This member function constructs the computational grid
+// For the cases PSTR == 1 and PSTR == 2
+
+//================================================================================================
+
+#elif (PSTR == 1 || PSTR == 2)
+void cgh::compose_cgh(int nprocs) // PSTR == 1 or 2 variant: every level is distributed over its own rank range [start_rank, end_rank]
+{
+  Commlev = new MPI_Comm[levels]; // one communicator slot per level
+  construct_mylev(nprocs); // fills start_rank/end_rank and this process's mylev
+  for (int lev = 0; lev < levels; lev++)
+  {
+    MPI_Comm_split(MPI_COMM_WORLD, mylev, lev, &Commlev[lev]); // color = mylev, key = lev
+    checkPatchList(PatL[lev], false);
+    Parallel::distribute(PatL[lev], end_rank[lev] - start_rank[lev] + 1, ingfs, fngfs, false, start_rank[lev], end_rank[lev]); // second argument is the inclusive size of the level's rank range
+#if (RPB == 1)
+#error "not support yet"
+#endif
+  }
+  /* note different comm field has its own rank index
+    int myrank;
+    MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
+    if(myrank==nprocs-1)
+    {
+        cout<<"myrank = "<<myrank<<", mylev = "<<mylev<<endl;
+        MPI_Comm_rank(Commlev[levels-1],&myrank);
+        cout<<myrank<<" :)"<<endl;
+    }
+  */
+}
+
+//================================================================================================
+
+#if (PSTR == 1)
+void cgh::construct_mylev(int nprocs) // PSTR == 1 variant: assigns each level a contiguous rank range, level 0 first, and records this rank's level in mylev
+{
+  if (nprocs < levels)
+  {
+    cout << "Too few procs to use parallel level methods!" << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+
+  start_rank = new int[levels];
+  end_rank = new int[levels];
+
+  int myrank;
+
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+  int mp;
+  mp = nprocs / levels; // ranks per level (integer division)
+
+  start_rank[0] = 0;
+  end_rank[0] = mp - 1;
+  for (int lev = 1; lev < levels - 1; lev++)
+  {
+    start_rank[lev] = end_rank[lev - 1] + 1;
+    end_rank[lev] = end_rank[lev - 1] + mp;
+  }
+  start_rank[levels - 1] = (levels > 1) ? end_rank[levels - 2] + 1 : 0; // a single level has no predecessor: start at rank 0 instead of reading end_rank[-1]
+  end_rank[levels - 1] = nprocs - 1; // the last level also takes the ranks left over by the integer division
+
+  for (int lev = 0; lev < levels; lev++)
+  {
+    if (myrank >= start_rank[lev] && myrank <= end_rank[lev])
+      mylev = lev;
+  }
+}
+#elif (PSTR == 2)
+void cgh::construct_mylev(int nprocs) // PSTR == 2 variant: assigns contiguous rank ranges starting from the last level (levels - 1) and records this rank's level in mylev
+{
+  if (nprocs < levels)
+  {
+    cout << "Too few procs to use parallel level methods!" << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+
+  start_rank = new int[levels];
+  end_rank = new int[levels];
+
+  int myrank;
+
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+  int mp;
+  mp = nprocs / levels; // ranks per level (integer division)
+
+  start_rank[levels - 1] = 0;
+  end_rank[levels - 1] = mp - 1;
+  for (int lev = levels - 2; lev > 0; lev--)
+  {
+    start_rank[lev] = end_rank[lev + 1] + 1; // continue after the range of the previously assigned level (lev + 1); end_rank[lev - 1] is not set yet
+    end_rank[lev] = end_rank[lev + 1] + mp;
+  }
+  start_rank[0] = (levels > 1) ? end_rank[1] + 1 : 0; // with a single level there is no end_rank[1]; that level starts at rank 0
+  end_rank[0] = nprocs - 1; // level 0 also takes the ranks left over by the integer division
+
+  for (int lev = levels - 1; lev >= 0; lev--)
+  {
+    if (myrank >= start_rank[lev] && myrank <= end_rank[lev])
+      mylev = lev;
+  }
+}
+#endif
+
+#elif (PSTR == 3)
+void cgh::construct_mylev(int nprocs) // PSTR == 3 variant: first half of the ranks serves all levels but the last, second half serves the last level
+{
+  if (nprocs <= 1)
+  {
+    cout << " cgh::construct_mylev requires at least 2 procs" << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1); // abort with a failure status, as the PSTR == 1/2 variants do, instead of exit(0)
+  }
+
+  start_rank = new int[levels]; // indexed by level below (up to levels - 1), so two entries overflow when levels > 2
+  end_rank = new int[levels];
+
+  int myrank;
+
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+  int mp;
+  mp = nprocs / 2; // size of the first rank group
+
+  // for other levels
+  for (int lev = 0; lev < levels - 1; lev++)
+  {
+    start_rank[lev] = 0;
+    end_rank[lev] = mp - 1;
+  }
+  // for finest level
+  start_rank[levels - 1] = end_rank[0] + 1; // assumes levels >= 2 so that end_rank[0] was set by the loop above -- TODO confirm
+  end_rank[levels - 1] = nprocs - 1;
+
+  if (myrank >= start_rank[0] && myrank <= end_rank[0])
+    mylev = -1; // for other levels
+  else
+    mylev = 1; // for finest level
+}
+
+
+//-----------------------------------------------------------------------
+
+
+void cgh::compose_cgh(int nprocs) // PSTR == 3 variant: splits the ranks into two groups by mylev, then distributes every level over its rank range
+{
+  Commlev = new MPI_Comm[levels]; // one communicator slot per level
+  construct_mylev(nprocs); // fills start_rank/end_rank and sets mylev to -1 or 1
+
+  for (int lev = 0; lev < levels - 1; lev++)
+  {
+    MPI_Comm_split(MPI_COMM_WORLD, (mylev < 0) ? 0 : 1, -1, &Commlev[lev]); // MPI requires a non-negative color; mylev == -1 maps to 0, giving the same two groups
+  }
+  MPI_Comm_split(MPI_COMM_WORLD, (mylev < 0) ? 0 : 1, 1, &Commlev[levels - 1]); // same grouping for the last level, key = 1
+
+  for (int lev = 0; lev < levels; lev++)
+  {
+    checkPatchList(PatL[lev], false);
+    Parallel::distribute(PatL[lev], end_rank[lev] - start_rank[lev] + 1, ingfs, fngfs, false, start_rank[lev], end_rank[lev]); // second argument is the inclusive size of the level's rank range
+#if (RPB == 1)
+#error "not support yet"
+#endif
+  }
+}
+#endif
+
+
+void cgh::sethandle(monitor *ErrorMonitor) // reads BH_num, movls and the black-hole positions from the "inputpar" file, then sets handle[lev][grd] for every box
+{
+  int BH_num = 0; // stays 0 (instead of indeterminate) when the file has no BSSN BH_num entry
+  Porgls = new double **[levels];
+  char filename[100] = {0}; // zero-filled so the bounded copy below is always terminated
+  {
+    map<string, string>::iterator iter = parameters::str_par.find("inputpar");
+    if (iter != parameters::str_par.end())
+    {
+      strncpy(filename, (iter->second).c_str(), sizeof(filename) - 1); // bounded: an over-long path is truncated rather than overflowing filename
+    }
+    else
+    {
+      cout << "Error inputpar" << endl;
+      exit(0);
+    }
+  }
+  // read parameter from file
+  {
+    const int LEN = 256;
+    char pline[LEN];
+    string str, sgrp, skey, sval;
+    int sind;
+    ifstream inf(filename, ifstream::in);
+    if (!inf.good() && ErrorMonitor && ErrorMonitor->outfile)
+    {
+      ErrorMonitor->outfile << "Can not open parameter file " << filename << " for inputing information of black holes" << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    for (int i = 1; inf.good(); i++) // first pass: i is the 1-based line number used in error messages
+    {
+      inf.getline(pline, LEN);
+      str = pline;
+
+      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
+      if (status == -1)
+      {
+        if (ErrorMonitor && ErrorMonitor->outfile)
+          ErrorMonitor->outfile << "error reading parameter file " << filename << " in line " << i << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+      else if (status == 0)
+        continue;
+
+      if (sgrp == "BSSN" && skey == "BH_num")
+        BH_num = atoi(sval.c_str());
+      else if (sgrp == "cgh" && skey == "moving levels start from")
+      {
+        movls = atoi(sval.c_str());
+        movls = Mymin(movls, levels); // clamp movls into [0, levels]
+        movls = Mymax(0, movls);
+      }
+    }
+    inf.close();
+  }
+  for (int lev = 0; lev < levels; lev++) // Porgls[lev][bh][0..dim-1]: one position per black hole, stored per level
+  {
+    Porgls[lev] = new double *[BH_num];
+    for (int i = 0; i < BH_num; i++)
+      Porgls[lev][i] = new double[dim];
+  }
+  // read parameter from file
+  {
+    const int LEN = 256;
+    char pline[LEN];
+    string str, sgrp, skey, sval;
+    int sind;
+    ifstream inf(filename, ifstream::in);
+    if (!inf.good() && ErrorMonitor && ErrorMonitor->outfile)
+    {
+      ErrorMonitor->outfile << "Can not open parameter file " << filename
+                            << " for inputing information of black holes" << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    for (int i = 1; inf.good(); i++) // second pass: fill the positions now that BH_num is known
+    {
+      inf.getline(pline, LEN);
+      str = pline;
+
+      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
+      if (status == -1)
+      {
+        if (ErrorMonitor && ErrorMonitor->outfile)
+          ErrorMonitor->outfile << "error reading parameter file " << filename << " in line " << i << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+      else if (status == 0)
+        continue;
+
+      if (sgrp == "BSSN" && sind >= 0 && sind < BH_num) // also reject a negative index before it is used to subscript Porgls
+      {
+        if (skey == "Porgx")
+        {
+          for (int lev = 0; lev < levels; lev++)
+            Porgls[lev][sind][0] = atof(sval.c_str());
+        }
+        else if (skey == "Porgy")
+        {
+          for (int lev = 0; lev < levels; lev++)
+            Porgls[lev][sind][1] = atof(sval.c_str());
+        }
+        else if (skey == "Porgz")
+        {
+          for (int lev = 0; lev < levels; lev++)
+            Porgls[lev][sind][2] = atof(sval.c_str());
+        }
+      }
+    }
+    inf.close();
+  }
+
+  for (int lev = 0; lev < movls; lev++) // levels below movls get a zero handle
+    for (int grd = 0; grd < grids[lev]; grd++)
+      for (int i = 0; i < dim; i++)
+        handle[lev][grd][i] = 0;
+
+  if (movls < levels)
+  {
+    if (ErrorMonitor && ErrorMonitor->I_Print)
+    {
+      cout << endl;
+      cout << " moving levels are lev #" << movls << "--" << levels - 1 << endl;
+      cout << endl;
+    }
+
+    for (int lev = movls; lev < levels; lev++)
+      for (int grd = 0; grd < grids[lev]; grd++)
+      {
+#if 0	
+	 int bht=0;
+	 for(int bhi=0;bhi<BH_num;bhi++)
+	 {
+	    bool flag=false;
+    
+	    for(int i=0;i<dim;i++)
+               if(Porgls[0][bhi][i] < bbox[lev][grd][i] || Porgls[0][bhi][i] > bbox[lev][grd][i+dim]) {flag=true; break;}
+	    if(flag) continue;
+	    bht++;
+	    if(bht==1)  for(int i=0;i<dim;i++) handle[lev][grd][i]=Porgls[0][bhi][i];
+	    else if(ErrorMonitor && ErrorMonitor->outfile) 
+	    {
+               ErrorMonitor->outfile<<"cgh::sethandle: lev#"<<lev<<" grd#"<<grd<<" has too many black holes"<<endl;
+               MPI_Abort(MPI_COMM_WORLD,1);
+	    }
+	 }
+#else
+        double xxc[dim], dis0, dis1; // xxc: centre of the box; dis0: smallest distance so far; dis1: distance of the current candidate
+        for (int i = 0; i < dim; i++)
+          xxc[i] = (bbox[lev][grd][i] + bbox[lev][grd][i + dim]) / 2;
+        int bht = 0; // index of the black hole nearest to the box centre
+        for (int bhi = 0; bhi < BH_num; bhi++)
+        {
+          if (bhi == 0)
+          {
+            dis0 = 0;
+            for (int i = 0; i < dim; i++)
+              dis0 += pow(Porgls[0][bhi][i] - xxc[i], 2);
+            dis0 = sqrt(dis0);
+          }
+          else
+          {
+            dis1 = 0;
+            for (int i = 0; i < dim; i++)
+              dis1 += pow(Porgls[0][bhi][i] - xxc[i], 2);
+            dis1 = sqrt(dis1);
+            if (dis0 > dis1)
+            {
+              bht = bhi;
+              dis0 = dis1;
+            } // chose nearest one
+          }
+        }
+        for (int i = 0; i < dim; i++)
+          handle[lev][grd][i] = Porgls[0][bht][i]; // NOTE(review): with BH_num == 0 this still reads Porgls[0][0]; moving levels need at least one black hole
+#endif
+      }
+  }
+  else if (ErrorMonitor && ErrorMonitor->I_Print)
+  {
+    if (levels > 1)
+      cout << "fixed mesh refinement!" << endl;
+    else
+      cout << "unigrid simulation!" << endl;
+  }
+
+  BH_num_in = BH_num; // stored in the BH_num_in member
+}
+void cgh::checkPatchList(MyList<Patch> *PatL, bool buflog) // calls checkPatch(buflog) on every patch of the linked list
+{
+  for (MyList<Patch> *node = PatL; node; node = node->next)
+  {
+    Patch *current = node->data;
+    current->checkPatch(buflog);
+  }
+}
+
+
+//================================================================================================
+
+// This member function moves the grid
+
+//================================================================================================
+
+void cgh::Regrid(int Symmetry, int BH_num, double **Porgbr, double **Porg0, // Porgbr: positions at the last regrid (updated here); Porg0: current positions
+                 MyList<var> *OldList, MyList<var> *StateList,
+                 MyList<var> *FutureList, MyList<var> *tmList, bool BB,
+                 monitor *ErrorMonitor)
+{
+  // for moving part
+  if (movls < levels)
+  {
+    bool tot_flag = false; // true once any box on any level has moved
+    bool *lev_flag; // lev_flag[lev - movls]: did a box on this level move?
+    double **tmpPorg; // working copy of Porgbr, written back only if something moved
+    tmpPorg = new double *[BH_num];
+    for (int bhi = 0; bhi < BH_num; bhi++)
+    {
+      tmpPorg[bhi] = new double[dim];
+      for (int i = 0; i < dim; i++)
+        tmpPorg[bhi][i] = Porgbr[bhi][i];
+    }
+    lev_flag = new bool[levels - movls];
+    for (int lev = movls; lev < levels; lev++)
+    {
+      lev_flag[lev - movls] = false;
+      for (int grd = 0; grd < grids[lev]; grd++)
+      {
+        int flag; // shift in grid cells, rounded to a multiple of do_every
+        int do_every = 2;
+        double dX = PatL[lev]->data->blb->data->getdX(0); // grid spacings taken from the first block of the first patch of this level
+        double dY = PatL[lev]->data->blb->data->getdX(1);
+        double dZ = PatL[lev]->data->blb->data->getdX(2);
+        double rr; // scratch: offset in cells, then candidate lower bound, then the applied shift
+        // make sure that the grid corresponds to the black hole
+        int bhi = 0;
+        for (bhi = 0; bhi < BH_num; bhi++)
+        {
+          // because finner level may also change Porgbr, so we need factor 2
+          if (feq(Porgbr[bhi][0], handle[lev][grd][0], 2 * do_every * dX) &&
+              feq(Porgbr[bhi][1], handle[lev][grd][1], 2 * do_every * dY) &&
+              feq(Porgbr[bhi][2], handle[lev][grd][2], 2 * do_every * dZ))
+            break;
+        }
+        if (bhi == BH_num) // no black hole matched this box's handle
+        {
+          // if the box has already touched the original point
+          if (feq(0, bbox[lev][grd][0], dX / 2) &&
+              feq(0, bbox[lev][grd][1], dY / 2) &&
+              feq(0, bbox[lev][grd][2], dZ / 2))
+            break; // NOTE(review): the bhi loop has ended, so this leaves the grd loop and skips the remaining boxes of this level -- confirm intended
+
+          if (BH_num == 1)
+          {
+            bhi = 0;
+            break; // NOTE(review): also leaves the grd loop, so the bhi = 0 above is never used -- confirm intended
+          } // if only one black hole, it definitely match!
+
+          if (ErrorMonitor->outfile)
+          {
+            ErrorMonitor->outfile << "cgh::Regrid: no black hole matches with grid lev#" << lev << " grd#" << grd
+                                  << " with handle (" << handle[lev][grd][0] << "," << handle[lev][grd][1] << "," << handle[lev][grd][2] << ")" << endl;
+            ErrorMonitor->outfile << "black holes' old positions:" << endl;
+            for (bhi = 0; bhi < BH_num; bhi++)
+              ErrorMonitor->outfile << "#" << bhi << ": (" << Porgbr[bhi][0] << "," << Porgbr[bhi][1] << "," << Porgbr[bhi][2] << ")" << endl;
+            ErrorMonitor->outfile << "tolerance:" << endl;
+            ErrorMonitor->outfile << "(" << 2 * do_every * dX << "," << 2 * do_every * dY << "," << 2 * do_every * dZ << ")" << endl;
+            ErrorMonitor->outfile << "box lower boundary: (" << bbox[lev][grd][0] << "," << bbox[lev][grd][1] << "," << bbox[lev][grd][2] << ")" << endl;
+            MPI_Abort(MPI_COMM_WORLD, 1);
+          }
+
+          delete[] lev_flag; // processes without an outfile release their scratch memory and return
+          for (bhi = 0; bhi < BH_num; bhi++)
+            delete[] tmpPorg[bhi];
+          delete[] tmpPorg;
+          return;
+        }
+        // x direction
+        rr = (Porg0[bhi][0] - handle[lev][grd][0]) / dX; // displacement of the black hole from the handle, in cells
+        if (rr > 0)
+          flag = int(rr + 0.5) / do_every; // round to nearest cell, then integer-divide by do_every
+        else
+          flag = int(rr - 0.5) / do_every;
+        flag = flag * do_every;
+        rr = bbox[lev][grd][0] + flag * dX; // lower bound the box would have after the shift
+        // pay attention to the symmetric case
+        if (Symmetry == 2 && rr < 0)
+          rr = -bbox[lev][grd][0]; // shift only as far as puts the lower bound at 0
+        else
+          rr = flag * dX;
+
+        if (fabs(rr) > dX / 2)
+        {
+          lev_flag[lev - movls] = tot_flag = true;
+          bbox[lev][grd][0] = bbox[lev][grd][0] + rr;
+          bbox[lev][grd][3] = bbox[lev][grd][3] + rr;
+          handle[lev][grd][0] += rr;
+          tmpPorg[bhi][0] = Porg0[bhi][0];
+        }
+
+        // y direction
+        rr = (Porg0[bhi][1] - handle[lev][grd][1]) / dY;
+        if (rr > 0)
+          flag = int(rr + 0.5) / do_every;
+        else
+          flag = int(rr - 0.5) / do_every;
+        flag = flag * do_every;
+        rr = bbox[lev][grd][1] + flag * dY;
+        // pay attention to the symmetric case
+        if (Symmetry == 2 && rr < 0)
+          rr = -bbox[lev][grd][1];
+        else
+          rr = flag * dY;
+
+        if (fabs(rr) > dY / 2)
+        {
+          lev_flag[lev - movls] = tot_flag = true;
+          bbox[lev][grd][1] = bbox[lev][grd][1] + rr;
+          bbox[lev][grd][4] = bbox[lev][grd][4] + rr;
+          handle[lev][grd][1] += rr;
+          tmpPorg[bhi][1] = Porg0[bhi][1];
+        }
+
+        // z direction
+        rr = (Porg0[bhi][2] - handle[lev][grd][2]) / dZ;
+        if (rr > 0)
+          flag = int(rr + 0.5) / do_every;
+        else
+          flag = int(rr - 0.5) / do_every;
+        flag = flag * do_every;
+        rr = bbox[lev][grd][2] + flag * dZ;
+        // pay attention to the symmetric case
+        if (Symmetry > 0 && rr < 0)
+          rr = -bbox[lev][grd][2]; // clamp against the z lower bound (index 2), matching the x and y branches; index 1 is the y bound
+        else
+          rr = flag * dZ;
+
+        if (fabs(rr) > dZ / 2)
+        {
+          lev_flag[lev - movls] = tot_flag = true;
+          bbox[lev][grd][2] = bbox[lev][grd][2] + rr;
+          bbox[lev][grd][5] = bbox[lev][grd][5] + rr;
+          handle[lev][grd][2] += rr;
+          tmpPorg[bhi][2] = Porg0[bhi][2];
+        }
+      }
+      //   if(ErrorMonitor->outfile && lev_flag[lev-movls]) cout<<"lev#"<<lev<<"'s boxes moved"<<endl;
+    }
+
+    if (tot_flag)
+    {
+      int nprocs;
+      MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+      recompose_cgh(nprocs, lev_flag, OldList, StateList, FutureList, tmList, Symmetry, BB); // rebuild the levels flagged in lev_flag
+      for (int bhi = 0; bhi < BH_num; bhi++)
+      {
+        for (int i = 0; i < dim; i++)
+          Porgbr[bhi][i] = tmpPorg[bhi][i]; // publish the updated reference positions to the caller
+      }
+    }
+
+    delete[] lev_flag;
+    for (int bhi = 0; bhi < BH_num; bhi++)
+      delete[] tmpPorg[bhi];
+    delete[] tmpPorg;
+  }
+}
+
+//================================================================================================
+
+#if (PSTR == 1 || PSTR == 2 || PSTR == 3)
+#warning "Regrid is not implimented yet"
+void cgh::Regrid_fake(int Symmetry, int BH_num, double **Porgbr, double **Porg0, // same box-moving logic as Regrid, but rebuilds through recompose_cgh_fake
+                      MyList<var> *OldList, MyList<var> *StateList,
+                      MyList<var> *FutureList, MyList<var> *tmList, bool BB,
+                      monitor *ErrorMonitor)
+{
+  // for moving part
+  if (movls < levels)
+  {
+    bool tot_flag = false; // true once any box on any level has moved
+    bool *lev_flag; // lev_flag[lev - movls]: did a box on this level move?
+    double **tmpPorg; // working copy of Porgbr, written back only if something moved
+    tmpPorg = new double *[BH_num];
+    for (int bhi = 0; bhi < BH_num; bhi++)
+    {
+      tmpPorg[bhi] = new double[dim];
+      for (int i = 0; i < dim; i++)
+        tmpPorg[bhi][i] = Porgbr[bhi][i];
+    }
+    lev_flag = new bool[levels - movls];
+    for (int lev = movls; lev < levels; lev++)
+    {
+      lev_flag[lev - movls] = false;
+      for (int grd = 0; grd < grids[lev]; grd++)
+      {
+        int flag; // shift in grid cells, rounded to a multiple of do_every
+        int do_every = 2;
+        double dX = PatL[lev]->data->blb->data->getdX(0); // grid spacings taken from the first block of the first patch of this level
+        double dY = PatL[lev]->data->blb->data->getdX(1);
+        double dZ = PatL[lev]->data->blb->data->getdX(2);
+        double rr; // scratch: offset in cells, then candidate lower bound, then the applied shift
+        // make sure that the grid corresponds to the black hole
+        int bhi = 0;
+        for (bhi = 0; bhi < BH_num; bhi++)
+        {
+          // because finner level may also change Porgbr, so we need factor 2
+          if (feq(Porgbr[bhi][0], handle[lev][grd][0], 2 * do_every * dX) &&
+              feq(Porgbr[bhi][1], handle[lev][grd][1], 2 * do_every * dY) &&
+              feq(Porgbr[bhi][2], handle[lev][grd][2], 2 * do_every * dZ))
+            break;
+        }
+        if (bhi == BH_num) // no black hole matched this box's handle
+        {
+          // if the box has already touched the original point
+          if (feq(0, bbox[lev][grd][0], dX / 2) &&
+              feq(0, bbox[lev][grd][1], dY / 2) &&
+              feq(0, bbox[lev][grd][2], dZ / 2))
+            break; // NOTE(review): the bhi loop has ended, so this leaves the grd loop and skips the remaining boxes of this level -- confirm intended
+
+          if (BH_num == 1)
+          {
+            bhi = 0;
+            break; // NOTE(review): also leaves the grd loop, so the bhi = 0 above is never used -- confirm intended
+          } // if only one black hole, it definitely match!
+
+          if (ErrorMonitor->outfile)
+          {
+            ErrorMonitor->outfile << "cgh::Regrid: no black hole matches with grid lev#" << lev << " grd#" << grd
+                                  << " with handle (" << handle[lev][grd][0] << "," << handle[lev][grd][1] << "," << handle[lev][grd][2] << ")" << endl;
+            ErrorMonitor->outfile << "black holes' old positions:" << endl;
+            for (bhi = 0; bhi < BH_num; bhi++)
+              ErrorMonitor->outfile << "#" << bhi << ": (" << Porgbr[bhi][0] << "," << Porgbr[bhi][1] << "," << Porgbr[bhi][2] << ")" << endl;
+            ErrorMonitor->outfile << "tolerance:" << endl;
+            ErrorMonitor->outfile << "(" << 2 * do_every * dX << "," << 2 * do_every * dY << "," << 2 * do_every * dZ << ")" << endl;
+            ErrorMonitor->outfile << "box lower boundary: (" << bbox[lev][grd][0] << "," << bbox[lev][grd][1] << "," << bbox[lev][grd][2] << ")" << endl;
+            MPI_Abort(MPI_COMM_WORLD, 1);
+          }
+
+          delete[] lev_flag; // processes without an outfile release their scratch memory and return
+          for (bhi = 0; bhi < BH_num; bhi++)
+            delete[] tmpPorg[bhi];
+          delete[] tmpPorg;
+          return;
+        }
+        // x direction
+        rr = (Porg0[bhi][0] - handle[lev][grd][0]) / dX; // displacement of the black hole from the handle, in cells
+        if (rr > 0)
+          flag = int(rr + 0.5) / do_every; // round to nearest cell, then integer-divide by do_every
+        else
+          flag = int(rr - 0.5) / do_every;
+        flag = flag * do_every;
+        rr = bbox[lev][grd][0] + flag * dX; // lower bound the box would have after the shift
+        // pay attention to the symmetric case
+        if (Symmetry == 2 && rr < 0)
+          rr = -bbox[lev][grd][0]; // shift only as far as puts the lower bound at 0
+        else
+          rr = flag * dX;
+
+        if (fabs(rr) > dX / 2)
+        {
+          lev_flag[lev - movls] = tot_flag = true;
+          bbox[lev][grd][0] = bbox[lev][grd][0] + rr;
+          bbox[lev][grd][3] = bbox[lev][grd][3] + rr;
+          handle[lev][grd][0] += rr;
+          tmpPorg[bhi][0] = Porg0[bhi][0];
+        }
+
+        // y direction
+        rr = (Porg0[bhi][1] - handle[lev][grd][1]) / dY;
+        if (rr > 0)
+          flag = int(rr + 0.5) / do_every;
+        else
+          flag = int(rr - 0.5) / do_every;
+        flag = flag * do_every;
+        rr = bbox[lev][grd][1] + flag * dY;
+        // pay attention to the symmetric case
+        if (Symmetry == 2 && rr < 0)
+          rr = -bbox[lev][grd][1];
+        else
+          rr = flag * dY;
+
+        if (fabs(rr) > dY / 2)
+        {
+          lev_flag[lev - movls] = tot_flag = true;
+          bbox[lev][grd][1] = bbox[lev][grd][1] + rr;
+          bbox[lev][grd][4] = bbox[lev][grd][4] + rr;
+          handle[lev][grd][1] += rr;
+          tmpPorg[bhi][1] = Porg0[bhi][1];
+        }
+
+        // z direction
+        rr = (Porg0[bhi][2] - handle[lev][grd][2]) / dZ;
+        if (rr > 0)
+          flag = int(rr + 0.5) / do_every;
+        else
+          flag = int(rr - 0.5) / do_every;
+        flag = flag * do_every;
+        rr = bbox[lev][grd][2] + flag * dZ;
+        // pay attention to the symmetric case
+        if (Symmetry > 0 && rr < 0)
+          rr = -bbox[lev][grd][2]; // clamp against the z lower bound (index 2), matching the x and y branches; index 1 is the y bound
+        else
+          rr = flag * dZ;
+
+        if (fabs(rr) > dZ / 2)
+        {
+          lev_flag[lev - movls] = tot_flag = true;
+          bbox[lev][grd][2] = bbox[lev][grd][2] + rr;
+          bbox[lev][grd][5] = bbox[lev][grd][5] + rr;
+          handle[lev][grd][2] += rr;
+          tmpPorg[bhi][2] = Porg0[bhi][2];
+        }
+      }
+      //   if(ErrorMonitor->outfile && lev_flag[lev-movls]) cout<<"lev#"<<lev<<"'s boxes moved"<<endl;
+    }
+
+    if (tot_flag)
+    {
+      int nprocs;
+      MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+      recompose_cgh_fake(nprocs, lev_flag, OldList, StateList, FutureList, tmList, Symmetry, BB); // rebuild the levels flagged in lev_flag
+      for (int bhi = 0; bhi < BH_num; bhi++)
+      {
+        for (int i = 0; i < dim; i++)
+          Porgbr[bhi][i] = tmpPorg[bhi][i]; // publish the updated reference positions to the caller
+      }
+    }
+
+    delete[] lev_flag;
+    for (int bhi = 0; bhi < BH_num; bhi++)
+      delete[] tmpPorg[bhi];
+    delete[] tmpPorg;
+  }
+}
+#endif
+
+
+//================================================================================================
+
+// This member function rebuilds the grid (regrid)
+
+//================================================================================================
+
+#if (PSTR == 0)
+void cgh::recompose_cgh(int nprocs, bool *lev_flag, // PSTR == 0 variant: rebuilds every moving level whose lev_flag[lev - movls] is set
+                        MyList<var> *OldList, MyList<var> *StateList,
+                        MyList<var> *FutureList, MyList<var> *tmList,
+                        int Symmetry, bool BB)
+{
+  for (int lev = movls; lev < levels; lev++)
+    if (lev_flag[lev - movls])
+    {
+      MyList<Patch> *tmPat = 0;
+      tmPat = construct_patchlist(lev, Symmetry); // new patch list built from the current bbox/shape of this level
+      // tmPat construction completes
+      Parallel::distribute(tmPat, nprocs, ingfs, fngfs, false);
+      //    checkPatchList(tmPat,true);
+      bool CC = (lev > trfls);
+      Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC); // NOTE(review): lev == 0 would index PatL[-1]; presumably movls >= 1 whenever this runs -- confirm
+
+      Parallel::KillBlocks(PatL[lev]); // release the old list, then install the new one
+      PatL[lev]->destroyList();
+      PatL[lev] = tmPat;
+#if (RPB == 1)
+      Parallel::destroypsuList_bam(bdsul[lev]); // rebuild both point lists against the new patch list
+      Parallel::destroypsuList_bam(rsul[lev]);
+      Parallel::Constr_pointstr_OutBdLow2Hi(PatL[lev], PatL[lev - 1], bdsul[lev]);
+      Parallel::Constr_pointstr_Restrict(PatL[lev], PatL[lev - 1], rsul[lev]);
+#endif
+    }
+}
+#elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
+#warning "recompose_cgh is not implimented yet"
+void cgh::recompose_cgh(int nprocs, bool *lev_flag, // PSTR == 1/2/3 variant: rebuilds flagged levels inside each level's own rank range; nprocs is not used here
+                        MyList<var> *OldList, MyList<var> *StateList,
+                        MyList<var> *FutureList, MyList<var> *tmList,
+                        int Symmetry, bool BB)
+{
+  for (int lev = movls; lev < levels; lev++)
+    if (lev_flag[lev - movls])
+    {
+      MyList<Patch> *tmPat = 0;
+      tmPat = construct_patchlist(lev, Symmetry); // new patch list built from the current bbox/shape of this level
+      // tmPat construction completes
+      Parallel::distribute(tmPat, end_rank[lev] - start_rank[lev] + 1, ingfs, fngfs, false, start_rank[lev], end_rank[lev]); // second argument is the inclusive size of the level's rank range
+      //    checkPatchList(tmPat,true);
+      bool CC = (lev > trfls);
+      Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC); // NOTE(review): lev == 0 would index PatL[-1]; presumably movls >= 1 whenever this runs -- confirm
+
+      Parallel::KillBlocks(PatL[lev]); // release the old list, then install the new one
+      PatL[lev]->destroyList();
+      PatL[lev] = tmPat;
+#if (RPB == 1)
+#error "not support yet"
+#endif
+    }
+}
+
+//================================================================================================
+
+void cgh::recompose_cgh_fake(int nprocs, bool *lev_flag, // rebuilds flagged levels other than mylev, without transferring field data; nprocs and the var lists are not used here
+                             MyList<var> *OldList, MyList<var> *StateList,
+                             MyList<var> *FutureList, MyList<var> *tmList,
+                             int Symmetry, bool BB)
+{
+  for (int lev = movls; lev < levels; lev++)
+    if (lev_flag[lev - movls] && lev != mylev) // this process's own level (mylev) is skipped
+    {
+      MyList<Patch> *tmPat = 0;
+      tmPat = construct_patchlist(lev, Symmetry);
+      // tmPat construction completes
+      Parallel::distribute(tmPat, end_rank[lev] - start_rank[lev] + 1, ingfs, fngfs, false, start_rank[lev], end_rank[lev]); // second argument is the inclusive size of the level's rank range
+
+      Parallel::KillBlocks(PatL[lev]); // no fill_level_data call here: the old list is simply replaced
+      PatL[lev]->destroyList();
+      PatL[lev] = tmPat;
+    }
+}
+#endif
+
+//================================================================================================
+
+// This member function reads grid information from input files
+
+//================================================================================================
+
+void cgh::read_bbox(int Symmetry, char *filename) // reads levels, grids per level, and each box's bbox/shape from filename in three passes, then allocates the per-level arrays
+{
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  // read parameter from file
+  {
+    const int LEN = 256;
+    char pline[LEN];
+    string str, sgrp, skey, sval;
+    int sind1, sind2, sind3;
+    ifstream inf(filename, ifstream::in);
+    if (!inf.good() && myrank == 0)
+    {
+      cout << "cgh::cgh: Can not open parameter file " << filename << " for inputing information of black holes" << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    for (int i = 1; inf.good(); i++) // pass 1: find the number of levels; i is the 1-based line number for error messages
+    {
+      inf.getline(pline, LEN);
+      str = pline;
+
+      int status = misc::parse_parts(str, sgrp, skey, sval, sind1);
+      if (status == -1)
+      {
+        cout << "error reading parameter file " << filename << " in line " << i << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+      else if (status == 0)
+        continue;
+
+      if (sgrp == "cgh" && skey == "levels")
+      {
+        levels = atoi(sval.c_str());
+        break;
+      }
+    }
+    inf.close();
+  }
+
+  grids = new int[levels](); // zero-initialised: a level with no "grids" entry gets 0 boxes instead of an indeterminate count
+  shape = new int **[levels];
+  handle = new double **[levels];
+  bbox = new double **[levels];
+  PatL = new MyList<Patch> *[levels];
+  Lt = new double[levels];
+#if (RPB == 1)
+  bdsul = new MyList<Parallel::pointstru_bam> *[levels];
+  rsul = new MyList<Parallel::pointstru_bam> *[levels];
+#endif
+  // read parameter from file
+  {
+    const int LEN = 256;
+    char pline[LEN];
+    string str, sgrp, skey, sval;
+    int sind1, sind2, sind3;
+    ifstream inf(filename, ifstream::in);
+    if (!inf.good() && myrank == 0)
+    {
+      cout << "cgh::cgh: Can not open parameter file " << filename << " for inputing information of black holes" << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    for (int i = 1; inf.good(); i++) // pass 2: number of boxes on each level
+    {
+      inf.getline(pline, LEN);
+      str = pline;
+
+      int status = misc::parse_parts(str, sgrp, skey, sval, sind1, sind2, sind3);
+      if (status == -1)
+      {
+        cout << "error reading parameter file " << filename << " in line " << i << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+      else if (status == 0)
+        continue;
+
+      if (sgrp == "cgh" && skey == "grids" && sind1 >= 0 && sind1 < levels) // ignore level indices outside [0, levels)
+        grids[sind1] = atoi(sval.c_str());
+    }
+    inf.close();
+  }
+
+  for (int sind1 = 0; sind1 < levels; sind1++) // allocate shape (dim), handle (dim) and bbox (2 * dim) for every box
+  {
+    shape[sind1] = new int *[grids[sind1]];
+    handle[sind1] = new double *[grids[sind1]];
+    bbox[sind1] = new double *[grids[sind1]];
+    for (int sind2 = 0; sind2 < grids[sind1]; sind2++)
+    {
+      shape[sind1][sind2] = new int[dim];
+      handle[sind1][sind2] = new double[dim];
+      bbox[sind1][sind2] = new double[2 * dim];
+    }
+  }
+  // read parameter from file
+  {
+    const int LEN = 256;
+    char pline[LEN];
+    string str, sgrp, skey, sval;
+    int sind1, sind2, sind3;
+    ifstream inf(filename, ifstream::in);
+    if (!inf.good() && myrank == 0)
+    {
+      cout << "cgh::cgh: Can not open parameter file " << filename << " for inputing information of black holes" << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    for (int i = 1; inf.good(); i++) // pass 3: bounding box and shape entries, indexed [level][box][component]
+    {
+      inf.getline(pline, LEN);
+      str = pline;
+
+      int status = misc::parse_parts(str, sgrp, skey, sval, sind1, sind2, sind3);
+
+      if (status == -1)
+      {
+        cout << "error reading parameter file " << filename << " in line " << i << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+      else if (status == 0)
+        continue;
+
+      if (sgrp == "cgh" && sind1 >= 0 && sind1 < levels && sind2 >= 0 && sind2 < grids[sind1]) // level and box indices must address an allocated box
+      {
+        if (skey == "bbox" && sind3 >= 0 && sind3 < 2 * dim) // bbox holds 2 * dim values per box
+          bbox[sind1][sind2][sind3] = atof(sval.c_str());
+        else if (skey == "shape" && sind3 >= 0 && sind3 < dim) // shape holds dim values per box
+          shape[sind1][sind2][sind3] = atoi(sval.c_str());
+      }
+    }
+    inf.close();
+  }
+// we always assume the input parameter is in cell center style
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+  for (int lev = 0; lev < levels; lev++)
+    for (int grd = 0; grd < grids[lev]; grd++)
+    {
+      for (int i = 0; i < dim; i++)
+      {
+
+        shape[lev][grd][i] = shape[lev][grd][i] + 1; // Vertex build: one more point than cells in each direction
+      }
+    }
+#endif
+
+  {
+
+    // boxes align check
+    double DH0[dim]; // grid spacing of the first box on level 0
+    for (int i = 0; i < dim; i++)
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+      DH0[i] = (bbox[0][0][i + dim] - bbox[0][0][i]) / (shape[0][0][i] - 1);
+#else
+#ifdef Cell
+      DH0[i] = (bbox[0][0][i + dim] - bbox[0][0][i]) / shape[0][0][i];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+    for (int lev = 0; lev < levels; lev++)
+      for (int grd = 0; grd < grids[lev]; grd++)
+        Parallel::aligncheck(bbox[0][0], bbox[lev][grd], lev, DH0, shape[lev][grd]); // every box is checked against the first level-0 box
+
+#if 0 // we do not need it here, because we do it in construct_patchlist
+// extend buffer points for shell overlap
+#ifdef WithShell   
+   for(int i=0;i<dim;i++)
+   {
+     shape[0][0][i] += buffer_width;
+     bbox[0][0][i+dim] += DH0[i]*buffer_width;
+   }
+   if(Symmetry < 2 )
+   {
+     shape[0][0][0] += buffer_width;
+     bbox[0][0][0] -= DH0[0]*buffer_width;
+     shape[0][0][1] += buffer_width;
+     bbox[0][0][1] -= DH0[1]*buffer_width;
+     if(Symmetry < 1)
+     {
+       shape[0][0][2] += buffer_width;
+       bbox[0][0][2] -= DH0[2]*buffer_width;
+     }
+   }
+#endif
+#endif
+  }
+  // print information of cgh
+  if (myrank == 0)
+  {
+    cout << endl;
+    cout << " cgh has levels: " << levels << endl;
+    cout << endl;
+    for (int lev = 0; lev < levels; lev++)
+    {
+      cout << " level #" << lev << " has boxes: " << grids[lev] << endl;
+      for (int grd = 0; grd < grids[lev]; grd++)
+      {
+        cout << " #" << grd << " box is" << "  (" << bbox[lev][grd][0] << ":" << bbox[lev][grd][3]
+             << "," << bbox[lev][grd][1] << ":" << bbox[lev][grd][4]
+             << "," << bbox[lev][grd][2] << ":" << bbox[lev][grd][5]
+             << ")." << endl;
+      }
+    }
+  }
+}
+
+//================================================================================================
+
+
+//================================================================================================
+
+// This member function generates required grid information
+
+//================================================================================================
+
+// Build the Patch list of one refinement level from the stored box table.
+// The work is done in stages: copy the boxes into gridseg nodes, optionally
+// merge / cut / add ghost-touch regions, then create one Patch per resulting box.
+//
+// Parameters:
+//   lev      - level whose boxes (bbox[lev][*], shape[lev][*]) are converted
+//   Symmetry - forwarded unchanged to every Patch constructor
+// Returns:
+//   head of a newly allocated MyList<Patch>, or 0 if no box produced a patch.
+//   The nodes and the Patch objects are allocated with new inside this function.
+// Notes:
+//   * With WithShell defined every Patch gets buflog == true; otherwise buflog is (lev > 0).
+//   * NOTE(review): boxes->destroyList() is also reached when grids[lev] == 0, i.e. with a
+//     null list head - confirm that MyList::destroyList tolerates that.
+//   * NOTE(review): whether destroyList also frees each gridseg payload is not visible
+//     here and should be checked in MyList.
+MyList<Patch> *cgh::construct_patchlist(int lev, int Symmetry)
+{
+  // Stage 1: copy every box of this level into a singly linked gridseg list.
+  MyList<Parallel::gridseg> *boxes = 0;
+  MyList<Parallel::gridseg> *gs; // tail while appending, cursor afterwards
+
+  /*
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  if (myrank == 0)
+  {
+      cout << " construct patchlist: " << " level = " << lev << ",  grids in this level = " << grids[lev] << endl;
+  }
+  */
+
+  for (int grd = 0; grd < grids[lev]; grd++)
+  {
+    // allocate the node and its payload, then hook it behind the current tail
+    MyList<Parallel::gridseg> *seg = new MyList<Parallel::gridseg>;
+    seg->data = new Parallel::gridseg;
+    if (boxes)
+      gs->next = seg;
+    else
+      boxes = seg;
+    gs = seg;
+
+    for (int i = 0; i < dim; i++)
+    {
+      gs->data->llb[i] = bbox[lev][grd][i];
+      gs->data->uub[i] = bbox[lev][grd][dim + i];
+      gs->data->shape[i] = shape[lev][grd][i];
+    }
+    gs->data->Bg = 0;
+    gs->next = 0;
+  }
+
+  // Stages 2-4 run only when this level has fewer than three boxes
+  // (original note: merging more than three boxes may cause bugs).
+  const bool fewBoxes = (grids[lev] < 3);
+
+  // Stage 2: merge grid boxes.
+  if (fewBoxes)
+    Parallel::merge_gsl(boxes, ratio);
+
+  // Stage 3: when grid boxes overlap, re-split the boxes.
+  if (fewBoxes)
+    Parallel::cut_gsl(boxes);
+
+  // Stage 4: after splitting, add new ghost regions.
+  if (fewBoxes)
+    Parallel::add_ghost_touch(boxes);
+
+  // Stage 5: create one Patch per remaining box, preserving the list order.
+  MyList<Patch> *tmPat = 0;
+  MyList<Patch> *gp; // tail of the Patch list
+  for (gs = boxes; gs; gs = gs->next)
+  {
+    // tbb packs the lower corner followed by the upper corner
+    double tbb[2 * dim];
+    for (int i = 0; i < dim; i++)
+    {
+      tbb[i] = gs->data->llb[i];
+      tbb[dim + i] = gs->data->uub[i];
+    }
+
+    MyList<Patch> *node = new MyList<Patch>;
+    if (tmPat)
+      gp->next = node;
+    else
+      tmPat = node;
+    gp = node;
+
+    // buflog argument: always true with shells, otherwise only for lev > 0
+#ifdef WithShell
+    gp->data = new Patch(3, gs->data->shape, tbb, lev, true, Symmetry);
+#else
+    gp->data = new Patch(3, gs->data->shape, tbb, lev, (lev > 0), Symmetry);
+#endif
+    gp->next = 0;
+  }
+
+  // the temporary box list is no longer needed
+  boxes->destroyList();
+
+  // the list was allocated here and is handed to the caller
+  return tmPat;
+}
+
+//================================================================================================
+
+
+bool cgh::Interp_One_Point(MyList<var> *VarList,
+                           double *XX, /*input global Cartesian coordinate*/
+                           double *Shellf, int Symmetry)
+{
+  // Search from level levels - 1 down to level 0; the first patch that reports success wins.
+  for (int ilev = levels - 1; ilev >= 0; --ilev)
+  {
+    for (MyList<Patch> *node = PatL[ilev]; node; node = node->next)
+    {
+      bool found = false;
+#if (PSTR == 0)
+      found = node->data->Interp_ONE_Point(VarList, XX, Shellf, Symmetry);
+#elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
+      // these builds additionally pass the communicator of the level being searched
+      found = node->data->Interp_ONE_Point(VarList, XX, Shellf, Symmetry, Commlev[ilev]);
+#endif
+      if (found)
+        return true;
+    }
+  }
+  // no patch on any level reported success
+  return false;
+}
+
+
+// Move the boxes of level `lev` so that they follow their black holes, then rebuild the level.
+//   lev          - level to regrid; the routine returns at once when lev < movls
+//   Symmetry     - 2 clamps x/y shifts so the lower bound lands on 0, > 0 does the same for z
+//   BH_num       - number of black holes
+//   Porgbr       - not referenced here (boxes are matched against Porgls[lev])
+//   Porg0        - per-hole positions the box handles are moved towards
+//   Old/State/Future/tmList, BB - forwarded to recompose_cgh_Onelevel
+//   ErrorMonitor - its outfile receives the diagnostic when a box matches no black hole
+// Shifts are multiples of do_every grid spacings; Porgls[lev] takes Porg0 on each axis that moved.
+void cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
+                          MyList<var> *OldList, MyList<var> *StateList,
+                          MyList<var> *FutureList, MyList<var> *tmList, bool BB,
+                          monitor *ErrorMonitor)
+{
+  if (lev < movls)
+    return;
+
+#if (0)
+  // #if (PSTR == 1 || PSTR == 2)
+  MyList<Patch> *Pp = PatL[lev];
+  while (Pp)
+  {
+    Pp->data->checkPatch(0, start_rank[mylev]);
+    Pp = Pp->next;
+  }
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  if (myrank == start_rank[mylev])
+  {
+    cout << "out_rank = " << myrank << endl;
+    for (int grd = 0; grd < grids[lev]; grd++)
+    {
+      cout << handle[lev][grd][0] << "," << handle[lev][grd][1] << "," << handle[lev][grd][2] << endl;
+    }
+    for (int bhi = 0; bhi < BH_num; bhi++)
+    {
+      cout << Porgls[lev][bhi][0] << "," << Porgls[lev][bhi][1] << "," << Porgls[lev][bhi][2] << endl;
+      cout << Porg0[bhi][0] << "," << Porg0[bhi][1] << "," << Porg0[bhi][2] << endl;
+    }
+  }
+#endif
+
+  //   misc::tillherecheck(Commlev[lev],start_rank[lev],"start Regrid_Onelevel");
+  // for moving part: tmpPorg starts as a copy of Porgls[lev] and collects the new reference positions
+  bool tot_flag = false;
+  double **tmpPorg;
+  tmpPorg = new double *[BH_num];
+  for (int bhi = 0; bhi < BH_num; bhi++)
+  {
+    tmpPorg[bhi] = new double[dim];
+    for (int i = 0; i < dim; i++)
+      tmpPorg[bhi][i] = Porgls[lev][bhi][i];
+  }
+
+  for (int grd = 0; grd < grids[lev]; grd++)
+  {
+    int flag;
+    int do_every = 2; // boxes move in steps of do_every grid spacings
+    double dX = PatL[lev]->data->blb->data->getdX(0);
+    double dY = PatL[lev]->data->blb->data->getdX(1);
+    double dZ = PatL[lev]->data->blb->data->getdX(2);
+    double rr;
+    // make sure that the grid corresponds to the black hole
+    int bhi = 0;
+    for (bhi = 0; bhi < BH_num; bhi++)
+    {
+      // because finer level may also change Porgbr, so we need factor 2
+      // now I used Porgls
+      if (feq(Porgls[lev][bhi][0], handle[lev][grd][0], 2 * do_every * dX) &&
+          feq(Porgls[lev][bhi][1], handle[lev][grd][1], 2 * do_every * dY) &&
+          feq(Porgls[lev][bhi][2], handle[lev][grd][2], 2 * do_every * dZ))
+        break;
+    }
+    if (bhi == BH_num)
+    {
+      // if the box has already touched the original point
+      if (feq(0, bbox[lev][grd][0], dX / 2) &&
+          feq(0, bbox[lev][grd][1], dY / 2) &&
+          feq(0, bbox[lev][grd][2], dZ / 2))
+        break; // NOTE(review): this leaves the grd loop, so later boxes are skipped too - confirm intent
+
+      if (BH_num == 1)
+      {
+        bhi = 0;
+        break; // NOTE(review): also leaves the grd loop, so bhi = 0 is never used for a shift - confirm
+      } // if only one black hole, it definitely match!
+
+      if (ErrorMonitor->outfile)
+      {
+        ErrorMonitor->outfile << "cgh::Regrid: no black hole matches with grid lev#" << lev << " grd#" << grd
+                              << " with handle (" << handle[lev][grd][0] << "," << handle[lev][grd][1] << "," << handle[lev][grd][2] << ")" << endl;
+        ErrorMonitor->outfile << "black holes' old positions:" << endl;
+        for (bhi = 0; bhi < BH_num; bhi++)
+          ErrorMonitor->outfile << "#" << bhi << ": (" << Porgls[lev][bhi][0] << "," << Porgls[lev][bhi][1] << ","
+                                << Porgls[lev][bhi][2] << ")" << endl;
+        ErrorMonitor->outfile << "tolerance:" << endl;
+        ErrorMonitor->outfile << "(" << 2 * do_every * dX << "," << 2 * do_every * dY << "," << 2 * do_every * dZ << ")" << endl;
+        ErrorMonitor->outfile << "box lower boundary: (" << bbox[lev][grd][0] << "," << bbox[lev][grd][1] << "," << bbox[lev][grd][2] << ")" << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+
+      for (bhi = 0; bhi < BH_num; bhi++)
+        delete[] tmpPorg[bhi];
+      delete[] tmpPorg;
+      return;
+    }
+    // x direction: round the offset to a whole number of cells, then down to a multiple of do_every
+    rr = (Porg0[bhi][0] - handle[lev][grd][0]) / dX;
+    if (rr > 0)
+      flag = int(rr + 0.5) / do_every;
+    else
+      flag = int(rr - 0.5) / do_every;
+    flag = flag * do_every;
+    rr = bbox[lev][grd][0] + flag * dX; // tentative new lower bound
+    // pay attention to the symmetric case
+    if (Symmetry == 2 && rr < 0)
+      rr = -bbox[lev][grd][0];
+    else
+      rr = flag * dX;
+
+    if (fabs(rr) > dX / 2)
+    {
+      tot_flag = true;
+      bbox[lev][grd][0] = bbox[lev][grd][0] + rr;
+      bbox[lev][grd][3] = bbox[lev][grd][3] + rr;
+      handle[lev][grd][0] += rr;
+      tmpPorg[bhi][0] = Porg0[bhi][0];
+    }
+
+    // y direction
+    rr = (Porg0[bhi][1] - handle[lev][grd][1]) / dY;
+    if (rr > 0)
+      flag = int(rr + 0.5) / do_every;
+    else
+      flag = int(rr - 0.5) / do_every;
+    flag = flag * do_every;
+    rr = bbox[lev][grd][1] + flag * dY;
+    // pay attention to the symmetric case
+    if (Symmetry == 2 && rr < 0)
+      rr = -bbox[lev][grd][1];
+    else
+      rr = flag * dY;
+
+    if (fabs(rr) > dY / 2)
+    {
+      tot_flag = true;
+      bbox[lev][grd][1] = bbox[lev][grd][1] + rr;
+      bbox[lev][grd][4] = bbox[lev][grd][4] + rr;
+      handle[lev][grd][1] += rr;
+      tmpPorg[bhi][1] = Porg0[bhi][1];
+    }
+
+    // z direction
+    rr = (Porg0[bhi][2] - handle[lev][grd][2]) / dZ;
+    if (rr > 0)
+      flag = int(rr + 0.5) / do_every;
+    else
+      flag = int(rr - 0.5) / do_every;
+    flag = flag * do_every;
+    rr = bbox[lev][grd][2] + flag * dZ;
+    // pay attention to the symmetric case (the z lower bound is index 2, as x and y use 0 and 1)
+    if (Symmetry > 0 && rr < 0)
+      rr = -bbox[lev][grd][2];
+    else
+      rr = flag * dZ;
+
+    if (fabs(rr) > dZ / 2)
+    {
+      tot_flag = true;
+      bbox[lev][grd][2] = bbox[lev][grd][2] + rr;
+      bbox[lev][grd][5] = bbox[lev][grd][5] + rr;
+      handle[lev][grd][2] += rr;
+      tmpPorg[bhi][2] = Porg0[bhi][2];
+    }
+  }
+
+  //   misc::tillherecheck(Commlev[lev],start_rank[lev],"after tot_flag check");
+
+  if (tot_flag)
+  {
+    int nprocs;
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+    //     misc::tillherecheck(Commlev[lev],start_rank[lev],"before recompose_cgh_Onelevel");
+
+    recompose_cgh_Onelevel(nprocs, lev, OldList, StateList, FutureList, tmList, Symmetry, BB);
+
+    //     misc::tillherecheck(Commlev[lev],start_rank[lev],"after recompose_cgh_Onelevel");
+
+    for (int bhi = 0; bhi < BH_num; bhi++)
+    {
+      for (int i = 0; i < dim; i++)
+        Porgls[lev][bhi][i] = tmpPorg[bhi][i];
+    }
+  }
+
+  for (int bhi = 0; bhi < BH_num; bhi++)
+    delete[] tmpPorg[bhi];
+  delete[] tmpPorg;
+}
+
+
+#if (PSTR == 0)
+void cgh::recompose_cgh_Onelevel(int nprocs, int lev,
+                                 MyList<var> *OldList, MyList<var> *StateList,
+                                 MyList<var> *FutureList, MyList<var> *tmList,
+                                 int Symmetry, bool BB)
+{
+  // Rebuild level `lev`: create new patches from the current boxes and distribute them over nprocs.
+  MyList<Patch> *freshPatches = construct_patchlist(lev, Symmetry);
+  Parallel::distribute(freshPatches, nprocs, ingfs, fngfs, false);
+  // fill_level_data receives the new list, the old list of this level and the list of level lev - 1.
+  // NOTE(review): PatL[lev - 1] is read, so lev is assumed to be >= 1 - confirm at the call sites.
+  const bool CC = (lev > trfls);
+  Parallel::fill_level_data(freshPatches, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
+  // Retire the old list of this level and install the new one.
+  Parallel::KillBlocks(PatL[lev]);
+  PatL[lev]->destroyList();
+  PatL[lev] = freshPatches;
+}
+#elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
+#warning "recompose_cgh_Onelevel is not implimented yet"
+void cgh::recompose_cgh_Onelevel(int nprocs, int lev,
+                                 MyList<var> *OldList, MyList<var> *StateList,
+                                 MyList<var> *FutureList, MyList<var> *tmList,
+                                 int Symmetry, bool BB)
+{
+  // Variant for PSTR 1/2/3: distribution uses start_rank[lev]..end_rank[lev]; `nprocs` is not used here.
+  misc::tillherecheck(Commlev[lev], start_rank[lev], "before construct_patchlist");
+  MyList<Patch> *freshPatches = construct_patchlist(lev, Symmetry);
+  misc::tillherecheck(Commlev[lev], start_rank[lev], "after construct_patchlist");
+  const int levProcs = end_rank[lev] - start_rank[lev] + 1; // size of this level's rank range
+  Parallel::distribute(freshPatches, levProcs, ingfs, fngfs, false, start_rank[lev], end_rank[lev]);
+  misc::tillherecheck(Commlev[lev], start_rank[lev], "after distribute");
+  // NOTE(review): PatL[lev - 1] is read, so lev is assumed to be >= 1 - confirm at the call sites.
+  const bool CC = (lev > trfls);
+  Parallel::fill_level_data(freshPatches, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
+  misc::tillherecheck(Commlev[lev], start_rank[lev], "after fill_level_data");
+  // Retire the old list of this level and install the new one.
+  Parallel::KillBlocks(PatL[lev]);
+  PatL[lev]->destroyList();
+  PatL[lev] = freshPatches;
+}
+
+
+// Regrid level lev + 1 (the input lev is the lower level); boxes are matched against Porgbr and moved towards Porg0.
+void cgh::Regrid_Onelevel_aux(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
+                              MyList<var> *OldList, MyList<var> *StateList,
+                              MyList<var> *FutureList, MyList<var> *tmList, bool BB,
+                              monitor *ErrorMonitor)
+{
+  lev++; // from here on `lev` is the level being regridded
+  if (lev < movls)
+    return;
+
+  // for moving part; NOTE(review): tmpPorg is filled below but never read back in this routine - confirm intent
+  bool tot_flag = false;
+  double **tmpPorg;
+  tmpPorg = new double *[BH_num];
+  for (int bhi = 0; bhi < BH_num; bhi++)
+  {
+    tmpPorg[bhi] = new double[dim];
+    for (int i = 0; i < dim; i++)
+      tmpPorg[bhi][i] = Porgbr[bhi][i];
+  }
+
+  for (int grd = 0; grd < grids[lev]; grd++)
+  {
+    int flag;
+    int do_every = 2; // boxes move in steps of do_every grid spacings
+    double dX = PatL[lev]->data->blb->data->getdX(0);
+    double dY = PatL[lev]->data->blb->data->getdX(1);
+    double dZ = PatL[lev]->data->blb->data->getdX(2);
+    double rr;
+    // make sure that the grid corresponds to the black hole
+    int bhi = 0;
+    for (bhi = 0; bhi < BH_num; bhi++)
+    {
+      // because finer level may also change Porgbr, so we need factor 2
+      if (feq(Porgbr[bhi][0], handle[lev][grd][0], 2 * do_every * dX) &&
+          feq(Porgbr[bhi][1], handle[lev][grd][1], 2 * do_every * dY) &&
+          feq(Porgbr[bhi][2], handle[lev][grd][2], 2 * do_every * dZ))
+        break;
+    }
+    if (bhi == BH_num)
+    {
+      // if the box has already touched the original point
+      if (feq(0, bbox[lev][grd][0], dX / 2) &&
+          feq(0, bbox[lev][grd][1], dY / 2) &&
+          feq(0, bbox[lev][grd][2], dZ / 2))
+        break; // NOTE(review): this leaves the grd loop, so later boxes are skipped too - confirm intent
+
+      if (BH_num == 1)
+      {
+        bhi = 0;
+        break; // NOTE(review): also leaves the grd loop, so bhi = 0 is never used for a shift - confirm
+      } // if only one black hole, it definitely match!
+
+      if (ErrorMonitor->outfile)
+      {
+        ErrorMonitor->outfile << "cgh::Regrid: no black hole matches with grid lev#" << lev << " grd#" << grd
+                              << " with handle (" << handle[lev][grd][0] << "," << handle[lev][grd][1] << "," << handle[lev][grd][2] << ")" << endl;
+        ErrorMonitor->outfile << "black holes' old positions:" << endl;
+        for (bhi = 0; bhi < BH_num; bhi++)
+          ErrorMonitor->outfile << "#" << bhi << ": (" << Porgbr[bhi][0] << "," << Porgbr[bhi][1] << "," << Porgbr[bhi][2] << ")" << endl;
+        ErrorMonitor->outfile << "tolerance:" << endl;
+        ErrorMonitor->outfile << "(" << 2 * do_every * dX << "," << 2 * do_every * dY << "," << 2 * do_every * dZ << ")" << endl;
+        ErrorMonitor->outfile << "box lower boundary: (" << bbox[lev][grd][0] << "," << bbox[lev][grd][1] << "," << bbox[lev][grd][2] << ")" << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+
+      for (bhi = 0; bhi < BH_num; bhi++)
+        delete[] tmpPorg[bhi];
+      delete[] tmpPorg;
+      return;
+    }
+    // x direction: round the offset to a whole number of cells, then down to a multiple of do_every
+    rr = (Porg0[bhi][0] - handle[lev][grd][0]) / dX;
+    if (rr > 0)
+      flag = int(rr + 0.5) / do_every;
+    else
+      flag = int(rr - 0.5) / do_every;
+    flag = flag * do_every;
+    rr = bbox[lev][grd][0] + flag * dX; // tentative new lower bound
+    // pay attention to the symmetric case
+    if (Symmetry == 2 && rr < 0)
+      rr = -bbox[lev][grd][0];
+    else
+      rr = flag * dX;
+
+    if (fabs(rr) > dX / 2)
+    {
+      tot_flag = true;
+      bbox[lev][grd][0] = bbox[lev][grd][0] + rr;
+      bbox[lev][grd][3] = bbox[lev][grd][3] + rr;
+      handle[lev][grd][0] += rr;
+      tmpPorg[bhi][0] = Porg0[bhi][0];
+    }
+
+    // y direction
+    rr = (Porg0[bhi][1] - handle[lev][grd][1]) / dY;
+    if (rr > 0)
+      flag = int(rr + 0.5) / do_every;
+    else
+      flag = int(rr - 0.5) / do_every;
+    flag = flag * do_every;
+    rr = bbox[lev][grd][1] + flag * dY;
+    // pay attention to the symmetric case
+    if (Symmetry == 2 && rr < 0)
+      rr = -bbox[lev][grd][1];
+    else
+      rr = flag * dY;
+
+    if (fabs(rr) > dY / 2)
+    {
+      tot_flag = true;
+      bbox[lev][grd][1] = bbox[lev][grd][1] + rr;
+      bbox[lev][grd][4] = bbox[lev][grd][4] + rr;
+      handle[lev][grd][1] += rr;
+      tmpPorg[bhi][1] = Porg0[bhi][1];
+    }
+
+    // z direction
+    rr = (Porg0[bhi][2] - handle[lev][grd][2]) / dZ;
+    if (rr > 0)
+      flag = int(rr + 0.5) / do_every;
+    else
+      flag = int(rr - 0.5) / do_every;
+    flag = flag * do_every;
+    rr = bbox[lev][grd][2] + flag * dZ;
+    // pay attention to the symmetric case (the z lower bound is index 2, as x and y use 0 and 1)
+    if (Symmetry > 0 && rr < 0)
+      rr = -bbox[lev][grd][2];
+    else
+      rr = flag * dZ;
+
+    if (fabs(rr) > dZ / 2)
+    {
+      tot_flag = true;
+      bbox[lev][grd][2] = bbox[lev][grd][2] + rr;
+      bbox[lev][grd][5] = bbox[lev][grd][5] + rr;
+      handle[lev][grd][2] += rr;
+      tmpPorg[bhi][2] = Porg0[bhi][2];
+    }
+  }
+
+  if (tot_flag)
+  {
+    int nprocs;
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    recompose_cgh_Onelevel(nprocs, lev, OldList, StateList, FutureList, tmList, Symmetry, BB);
+  }
+
+  for (int bhi = 0; bhi < BH_num; bhi++)
+    delete[] tmpPorg[bhi];
+  delete[] tmpPorg;
+}
+#endif
+
+
+void cgh::settrfls(const int lev)
+{
+  this->trfls = lev; // stored threshold that the `lev > trfls` tests in recompose_cgh_Onelevel read
+}
+
+//================================================================================================
+// Load Balancing Functions
+//================================================================================================
+
+// Initialize load balancing
+void cgh::init_load_balance(int nprocs)
+{
+  // Release buffers left by an earlier call; delete[] on a null pointer is a no-op.
+  // NOTE(review): assumes both pointers are null (or valid) before the first call - confirm in the constructor.
+  delete[] rank_interp_times;
+  delete[] heavy_ranks;
+
+  // One timing slot per rank, value-initialised to 0.0.
+  rank_interp_times = new double[nprocs]();
+
+  // Room for at most 4 heavy ranks; none recorded yet.
+  heavy_ranks = new int[4];
+  num_heavy_ranks = 0;
+}
+
+// Update interpolation time for a rank
+void cgh::update_interp_time(int rank, double time)
+{
+  // Drop the sample if the table is unallocated or the index is negative (no upper bound is checked here).
+  if (!rank_interp_times || rank < 0)
+    return;
+  rank_interp_times[rank] = time;
+}
+
+// Check and perform load balancing if needed
+bool cgh::check_and_rebalance(int nprocs, int lev,
+                               MyList<var> *OldList, MyList<var> *StateList,
+                               MyList<var> *FutureList, MyList<var> *tmList,
+                               int Symmetry, bool BB)
+{
+  // Uses collectives on MPI_COMM_WORLD, so every rank has to call this in step with the others.
+  // Returns true when level `lev` was recomposed, false otherwise.
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+  // Only check at specified intervals; an interval of 0 never triggers (and would be a modulo by zero)
+  current_time_step++;
+  if (load_balance_check_interval == 0 || current_time_step % load_balance_check_interval != 0)
+    return false;
+
+  // Nothing can be measured before init_load_balance() has allocated the tables
+  if (!rank_interp_times || !heavy_ranks)
+    return false;
+
+  if (myrank == 0)
+    cout << "\n=== Checking load balance at time step " << current_time_step << " ===" << endl;
+
+  // Collect all rank times on rank 0 (only the root needs a receive buffer)
+  double *all_times = (myrank == 0) ? new double[nprocs] : nullptr;
+
+  // NOTE(review): this sends element 0 of each rank's table; if callers store via update_interp_time(myrank, t) the slot to send is [myrank] - confirm
+  MPI_Gather(rank_interp_times, 1, MPI_DOUBLE, all_times, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+
+  bool need_rebalance = false;
+
+  if (myrank == 0)
+  {
+    // Check if load balancing is needed
+    need_rebalance = Parallel::check_load_balance_need(all_times, nprocs, num_heavy_ranks, heavy_ranks);
+
+    if (need_rebalance)
+    {
+      cout << "=== Load imbalance detected! Need to rebalance ===" << endl;
+      cout << "Top " << num_heavy_ranks << " heavy ranks: ";
+      for (int i = 0; i < num_heavy_ranks; i++)
+      {
+        cout << heavy_ranks[i] << " (" << all_times[heavy_ranks[i]] << " s) ";
+      }
+      cout << endl;
+
+      // Analyze blocks that need to be split (rank 0 only; NOTE(review): confirm split_heavy_blocks is not collective)
+      Parallel::split_heavy_blocks(PatL[lev], heavy_ranks, num_heavy_ranks, 2, nprocs, ingfs, fngfs);
+
+      // Announce the recomposition that follows the broadcast below
+      cout << "=== Triggering recompose_cgh for level " << lev << " ===" << endl;
+    }
+    else
+    {
+      cout << "=== Load is balanced, no rebalancing needed ===" << endl;
+    }
+
+    delete[] all_times;
+  }
+
+  // Broadcast the decision to all ranks as an int: MPI_C_BOOL describes C's _Bool, not C++ bool
+  int rebalance_flag = need_rebalance ? 1 : 0;
+  MPI_Bcast(&rebalance_flag, 1, MPI_INT, 0, MPI_COMM_WORLD);
+  need_rebalance = (rebalance_flag != 0);
+
+  if (need_rebalance)
+  {
+    // Broadcast heavy ranks information
+    MPI_Bcast(&num_heavy_ranks, 1, MPI_INT, 0, MPI_COMM_WORLD);
+    MPI_Bcast(heavy_ranks, num_heavy_ranks, MPI_INT, 0, MPI_COMM_WORLD);
+
+    // Perform recompose_cgh on the specified level
+    if (myrank == 0)
+    {
+      cout << "=== Performing recompose_cgh ===" << endl;
+    }
+
+    // Rebuild and redistribute the specified level on every rank
+    recompose_cgh_Onelevel(nprocs, lev, OldList, StateList, FutureList, tmList, Symmetry, BB);
+
+    // Reset time counter after rebalancing
+    current_time_step = 0;
+
+    return true;
+  }
+
+  return false;
+}
diff --git a/AMSS_NCKU_source/cgh.h b/AMSS_NCKU_source/cgh.h
index 79e7bf6..0402481 100644
--- a/AMSS_NCKU_source/cgh.h
+++ b/AMSS_NCKU_source/cgh.h
@@ -1,92 +1,107 @@
-
-#ifndef CGH_H
-#define CGH_H
-
-#include <mpi.h>
-#include "MyList.h"
-#include "MPatch.h"
-#include "macrodef.h"
-#include "monitor.h"
-#include "Parallel.h"
-
-class cgh
-{
-
-public:
-   int levels, movls, BH_num_in;
-   // information of boxes
-   int *grids;
-   double ***bbox;
-   int ***shape;
-   double ***handle;
-   double ***Porgls;
-   double *Lt;
-
-   // information of Patch list
-   MyList<Patch> **PatL;
-
-// information of OutBdLow2Hi point list and Restrict point list
-#if (RPB == 1)
-   MyList<Parallel::pointstru_bam> **bdsul, **rsul;
-#endif
-
-#if (PSTR == 1 || PSTR == 2 || PSTR == 3)
-   int mylev;
-   int *start_rank, *end_rank;
-   MPI_Comm *Commlev;
-#endif
-
-protected:
-   int ingfs, fngfs;
-   static constexpr double ratio = 0.75;
-   int trfls;
-
-public:
-   cgh(int ingfsi, int fngfsi, int Symmetry, char *filename, int checkrun, monitor *ErrorMonitor);
-
-   ~cgh();
-
-   void compose_cgh(int nprocs);
-   void sethandle(monitor *ErrorMonitor);
-   void checkPatchList(MyList<Patch> *PatL, bool buflog);
-   void Regrid(int Symmetry, int BH_num, double **Porgbr, double **Porg0,
-               MyList<var> *OldList, MyList<var> *StateList,
-               MyList<var> *FutureList, MyList<var> *tmList, bool BB,
-               monitor *ErrorMonitor);
-   void Regrid_fake(int Symmetry, int BH_num, double **Porgbr, double **Porg0,
-                    MyList<var> *OldList, MyList<var> *StateList,
-                    MyList<var> *FutureList, MyList<var> *tmList, bool BB,
-                    monitor *ErrorMonitor);
-   void recompose_cgh(int nprocs, bool *lev_flag,
-                      MyList<var> *OldList, MyList<var> *StateList,
-                      MyList<var> *FutureList, MyList<var> *tmList,
-                      int Symmetry, bool BB);
-   void recompose_cgh_fake(int nprocs, bool *lev_flag,
-                           MyList<var> *OldList, MyList<var> *StateList,
-                           MyList<var> *FutureList, MyList<var> *tmList,
-                           int Symmetry, bool BB);
-   void read_bbox(int Symmetry, char *filename);
-   MyList<Patch> *construct_patchlist(int lev, int Symmetry);
-   bool Interp_One_Point(MyList<var> *VarList,
-                         double *XX, /*input global Cartesian coordinate*/
-                         double *Shellf, int Symmetry);
-   void recompose_cgh_Onelevel(int nprocs, int lev,
-                               MyList<var> *OldList, MyList<var> *StateList,
-                               MyList<var> *FutureList, MyList<var> *tmList,
-                               int Symmetry, bool BB);
-   void Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
-                        MyList<var> *OldList, MyList<var> *StateList,
-                        MyList<var> *FutureList, MyList<var> *tmList, bool BB,
-                        monitor *ErrorMonitor);
-   void Regrid_Onelevel_aux(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
-                            MyList<var> *OldList, MyList<var> *StateList,
-                            MyList<var> *FutureList, MyList<var> *tmList, bool BB,
-                            monitor *ErrorMonitor);
-   void settrfls(const int lev);
-
-#if (PSTR == 1 || PSTR == 2 || PSTR == 3)
-   void construct_mylev(int nprocs);
-#endif
-};
-
-#endif /* CGH_H */
+
+#ifndef CGH_H
+#define CGH_H
+
+#include <mpi.h>
+#include "MyList.h"
+#include "MPatch.h"
+#include "macrodef.h"
+#include "monitor.h"
+#include "Parallel.h"
+
+class cgh
+{
+// Box hierarchy (levels, grids, bbox, shape, handle) plus the per-level Patch lists, with regrid and load-balance entry points.
+public:
+   int levels, movls, BH_num_in;
+   // information of boxes
+   int *grids;
+   double ***bbox;
+   int ***shape;
+   double ***handle;
+   double ***Porgls;
+   double *Lt;
+
+   // information of Patch list
+   MyList<Patch> **PatL;
+
+// information of OutBdLow2Hi point list and Restrict point list
+#if (RPB == 1)
+   MyList<Parallel::pointstru_bam> **bdsul, **rsul;
+#endif
+
+#if (PSTR == 1 || PSTR == 2 || PSTR == 3)
+   int mylev;
+   int *start_rank, *end_rank;
+   MPI_Comm *Commlev;
+#endif
+
+protected:
+   int ingfs, fngfs;
+   static constexpr double ratio = 0.75;
+   int trfls;
+
+public:
+   cgh(int ingfsi, int fngfsi, int Symmetry, char *filename, int checkrun, monitor *ErrorMonitor);
+
+   ~cgh();
+
+   void compose_cgh(int nprocs);
+   void sethandle(monitor *ErrorMonitor);
+   void checkPatchList(MyList<Patch> *PatL, bool buflog);
+   void Regrid(int Symmetry, int BH_num, double **Porgbr, double **Porg0,
+               MyList<var> *OldList, MyList<var> *StateList,
+               MyList<var> *FutureList, MyList<var> *tmList, bool BB,
+               monitor *ErrorMonitor);
+   void Regrid_fake(int Symmetry, int BH_num, double **Porgbr, double **Porg0,
+                    MyList<var> *OldList, MyList<var> *StateList,
+                    MyList<var> *FutureList, MyList<var> *tmList, bool BB,
+                    monitor *ErrorMonitor);
+   void recompose_cgh(int nprocs, bool *lev_flag,
+                      MyList<var> *OldList, MyList<var> *StateList,
+                      MyList<var> *FutureList, MyList<var> *tmList,
+                      int Symmetry, bool BB);
+   void recompose_cgh_fake(int nprocs, bool *lev_flag,
+                           MyList<var> *OldList, MyList<var> *StateList,
+                           MyList<var> *FutureList, MyList<var> *tmList,
+                           int Symmetry, bool BB);
+   void read_bbox(int Symmetry, char *filename);
+   MyList<Patch> *construct_patchlist(int lev, int Symmetry);
+   bool Interp_One_Point(MyList<var> *VarList,
+                         double *XX, /*input global Cartesian coordinate*/
+                         double *Shellf, int Symmetry);
+   void recompose_cgh_Onelevel(int nprocs, int lev,
+                               MyList<var> *OldList, MyList<var> *StateList,
+                               MyList<var> *FutureList, MyList<var> *tmList,
+                               int Symmetry, bool BB);
+   void Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
+                        MyList<var> *OldList, MyList<var> *StateList,
+                        MyList<var> *FutureList, MyList<var> *tmList, bool BB,
+                        monitor *ErrorMonitor);
+   void Regrid_Onelevel_aux(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
+                            MyList<var> *OldList, MyList<var> *StateList,
+                            MyList<var> *FutureList, MyList<var> *tmList, bool BB,
+                            monitor *ErrorMonitor);
+   void settrfls(const int lev);
+
+#if (PSTR == 1 || PSTR == 2 || PSTR == 3)
+   void construct_mylev(int nprocs);
+#endif
+
+   // Load balancing support (defaults keep the members well-defined even if a constructor does not set them)
+   bool enable_load_balance = false;        // Enable load balancing
+   int load_balance_check_interval = 1;     // Check interval (in time steps); used as a modulus, keep it non-zero
+   int current_time_step = 0;               // Current time step counter
+   double *rank_interp_times = nullptr;     // Store interpolation times for each rank (null until init_load_balance)
+   int *heavy_ranks = nullptr;              // Store heavy rank numbers (null until init_load_balance)
+   int num_heavy_ranks = 0;                 // Number of heavy ranks
+
+   void init_load_balance(int nprocs);
+   void update_interp_time(int rank, double time);
+   bool check_and_rebalance(int nprocs, int lev,
+                           MyList<var> *OldList, MyList<var> *StateList,
+                           MyList<var> *FutureList, MyList<var> *tmList,
+                           int Symmetry, bool BB);
+};
+
+#endif /* CGH_H */
diff --git a/AMSS_NCKU_source/surface_integral.C b/AMSS_NCKU_source/surface_integral.C
index c2b7b67..44edce3 100644
--- a/AMSS_NCKU_source/surface_integral.C
+++ b/AMSS_NCKU_source/surface_integral.C
@@ -1,3751 +1,3750 @@
-
-//----------------------------------------------------------------
-// Using Gauss-Legendre quadrature in theta direction
-// and   trapezoidal rule in phi direction (from Second Euler-Maclaurin summation formula, we can see that
-// this method gives expolential convergence for periodic function)
-//----------------------------------------------------------------
-#ifdef newc
-#include <iostream>
-#include <iomanip>
-#include <fstream>
-#include <strstream>
-#include <cmath>
-#include <map>
-using namespace std;
-#else
-#include <iostream.h>
-#include <iomanip.h>
-#include <fstream.h>
-#include <string.h>
-#include <math.h>
-#include <map.h>
-#endif
-#include <mpi.h>
-
-#include "misc.h"
-#include "cgh.h"
-#include "Parallel.h"
-#include "surface_integral.h"
-#include "fadmquantites_bssn.h"
-#include "getnpem2.h"
-#include "getnp4.h"
-#include "parameters.h"
-
-#define PI M_PI
-//|============================================================================
-//| Constructor
-//|============================================================================
-
-surface_integral::surface_integral(int iSymmetry) : Symmetry(iSymmetry)
-{
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-  int N = 40;
-  // read parameter from file
-  {
-    const int LEN = 256;
-    char pline[LEN];
-    string str, sgrp, skey, sval;
-    int sind;
-    char pname[50];
-    {
-      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
-      if (iter != parameters::str_par.end())
-      {
-        strcpy(pname, (iter->second).c_str());
-      }
-      else
-      {
-        cout << "Error inputpar" << endl;
-        exit(0);
-      }
-    }
-    ifstream inf(pname, ifstream::in);
-    if (!inf.good() && myrank == 0)
-    {
-      cout << "Can not open parameter file " << pname << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-
-    for (int i = 1; inf.good(); i++)
-    {
-      inf.getline(pline, LEN);
-      str = pline;
-
-      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
-      if (status == -1)
-      {
-        cout << "error reading parameter file " << pname << " in line " << i << endl;
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-      else if (status == 0)
-        continue;
-
-      if (sgrp == "SurfaceIntegral")
-      {
-        if (skey == "number of points for quarter sphere")
-          N = atoi(sval.c_str());
-      }
-    }
-    inf.close();
-  }
-  //|-----number of points for whole [0,pi] x [0,2pi]
-  N_phi   = 4 * N;   // for simplicity, we require this number must be 4*N
-  N_theta = 2 * N;   //                                                2*N
-
-  if (myrank == 0)
-  {
-    cout << "-----------------------------------------------------------------------" << endl;
-#ifdef GaussInt
-    cout << " spherical integration for wave form extraction with Gauss method      " << endl;
-#else
-    cout << " spherical integration for wave form extraction with mid point method  " << endl;
-#endif
-    cout << " N_phi   = " << N_phi   << endl;
-    cout << " N_theta = " << N_theta << endl;
-    cout << "-----------------------------------------------------------------------" << endl;
-  }
-
-#ifdef GaussInt
-  //  weight function cover all of [0,pi]
-  arcostheta = new double[N_theta];
-  wtcostheta = new double[N_theta];
-
-  // note: theta in [0,pi/2], upper half sphere, corresponds to 1 < costheta < 0
-  misc::gaulegf(-1.0, 1.0, arcostheta, wtcostheta, N_theta);
-  // due to symmetry, I need first half array corresponds to upper sphere, note these two arrays must match each other
-  misc::inversearray(arcostheta, N_theta);
-  misc::inversearray(wtcostheta, N_theta);
-#endif
-
-  if (Symmetry == 2)
-  {
-    N_phi = N_phi / 4;
-    N_theta = N_theta / 2;
-    dphi = PI / (2.0 * N_phi);
-    dcostheta = 1.0 / N_theta;
-    factor = 8;
-  }
-  else if (Symmetry == 1)
-  {
-    N_theta = N_theta / 2;
-    dphi = 2.0 * PI / N_phi;
-    dcostheta = 1.0 / N_theta;
-    factor = 2;
-  }
-  else if (Symmetry == 0)
-  {
-    dphi = 2.0 * PI / N_phi;
-    dcostheta = 2.0 / N_theta;
-    factor = 1;
-  }
-  else if (myrank == 0)
-  {
-    cout << "surface_integral::surface_integral: not supported Symmetry setting!" << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-
-#ifndef GaussInt
-  //  weight function cover all of [0,pi]
-  arcostheta = new double[N_theta];
-#endif
-  n_tot = N_theta * N_phi;
-  nx_g = new double[n_tot];
-  ny_g = new double[n_tot];
-  nz_g = new double[n_tot];
-
-  int n = 0;
-  double costheta, sintheta, ph;
-
-  for (int i = 0; i < N_theta; ++i)
-  {
-#ifndef GaussInt
-    arcostheta[i] = 1.0 - (i + 0.5) * dcostheta;
-#endif
-    costheta = arcostheta[i];
-    sintheta = sqrt(1.0 - costheta * costheta);
-
-    for (int j = 0; j < N_phi; ++j)
-    {
-      ph = (j + 0.5) * dphi;
-      // normal vector respect to the constant R sphere
-      nx_g[n] = sintheta * cos(ph);
-      ny_g[n] = sintheta * sin(ph);
-      nz_g[n] = costheta;
-      n++;
-    }
-  }
-}
-
-//|============================================================================
-//| Destructor
-//|============================================================================
-surface_integral::~surface_integral()
-{
-  delete[] nx_g;
-  delete[] ny_g;
-  delete[] nz_g;
-  delete[] arcostheta;
-#ifdef GaussInt
-  delete[] wtcostheta;
-#endif
-}
-//|----------------------------------------------------------------
-//  spin weighted spinw component of psi4, general routine
-//  l takes from spinw to maxl; m takes from -l to l
-//|----------------------------------------------------------------
-void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,
-                                 int spinw, int maxl, int NN, double *RP, double *IP,
-                                 monitor *Monitor) // NN is the length of RP and IP
-{
-  if (myrank == 0 && GH->grids[lev] != 1)
-    if (Monitor->outfile)
-      Monitor->outfile << "WARNING: surface integral on multipatches" << endl;
-    else
-      cout << "WARNING: surface integral on multipatches" << endl;
-
-  const int InList = 2;
-
-  MyList<var> *DG_List = new MyList<var>(Rpsi4);
-  DG_List->insert(Ipsi4);
-
-  int n;
-  double *pox[3];
-  for (int i = 0; i < 3; i++)
-    pox[i] = new double[n_tot];
-  for (n = 0; n < n_tot; n++)
-  {
-    pox[0][n] = rex * nx_g[n];
-    pox[1][n] = rex * ny_g[n];
-    pox[2][n] = rex * nz_g[n];
-  }
-
-  int mp, Lp, Nmin, Nmax;
-  mp = n_tot / cpusize;
-  Lp = n_tot - cpusize * mp;
-  if (Lp > myrank)
-  {
-    Nmin = myrank * mp + myrank;
-    Nmax = Nmin + mp;
-  }
-  else
-  {
-    Nmin = myrank * mp + Lp;
-    Nmax = Nmin + mp - 1;
-  }
-
-  double *shellf;
-  shellf = new double[n_tot * InList];
-
-  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);
-
-  //|~~~~~> Integrate the dot product of Dphi with the surface normal.
-
-  double *RP_out, *IP_out;
-  RP_out = new double[NN];
-  IP_out = new double[NN];
-
-  for (int ii = 0; ii < NN; ii++)
-  {
-    RP_out[ii] = 0;
-    IP_out[ii] = 0;
-  }
-  // theta part
-  double costheta, thetap;
-  double cosmphi, sinmphi;
-
-  int i, j;
-  int lpsy = 0;
-  if (Symmetry == 0)
-    lpsy = 1;
-  else if (Symmetry == 1)
-    lpsy = 2;
-  else if (Symmetry == 2)
-    lpsy = 8;
-
-  double psi4RR, psi4II;
-  for (n = Nmin; n <= Nmax; n++)
-  {
-    //       need round off always
-    i = int(n / N_phi); // int(1.723) = 1, int(-1.732) = -1
-    j = n - i * N_phi;
-
-    int countlm = 0;
-    for (int pl = spinw; pl < maxl + 1; pl++)
-      for (int pm = -pl; pm < pl + 1; pm++)
-      {
-        for (int lp = 0; lp < lpsy; lp++)
-        {
-          switch (lp)
-          {
-          case 0: //+++ (theta, phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = sin(pm * (j + 0.5) * dphi);
-            psi4RR = shellf[InList * n];
-            psi4II = shellf[InList * n + 1];
-            break;
-          case 1: //++- (pi-theta, phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = sin(pm * (j + 0.5) * dphi);
-            psi4RR = Rpsi4->SoA[2] * shellf[InList * n];
-            psi4II = Ipsi4->SoA[2] * shellf[InList * n + 1];
-            break;
-          case 2: //+-+ (theta, 2*pi-phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = -sin(pm * (j + 0.5) * dphi);
-            psi4RR = Rpsi4->SoA[1] * shellf[InList * n];
-            psi4II = Ipsi4->SoA[1] * shellf[InList * n + 1];
-            break;
-          case 3: //+-- (pi-theta, 2*pi-phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = -sin(pm * (j + 0.5) * dphi);
-            psi4RR = Rpsi4->SoA[2] * Rpsi4->SoA[1] * shellf[InList * n];
-            psi4II = Ipsi4->SoA[2] * Ipsi4->SoA[1] * shellf[InList * n + 1];
-            break;
-          case 4: //-++ (theta, pi-phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
-            psi4RR = Rpsi4->SoA[0] * shellf[InList * n];
-            psi4II = Ipsi4->SoA[0] * shellf[InList * n + 1];
-            break;
-          case 5: //-+- (pi-theta, pi-phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
-            psi4RR = Rpsi4->SoA[2] * Rpsi4->SoA[0] * shellf[InList * n];
-            psi4II = Ipsi4->SoA[2] * Ipsi4->SoA[0] * shellf[InList * n + 1];
-            break;
-          case 6: //--+ (theta, pi+phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
-            psi4RR = Rpsi4->SoA[1] * Rpsi4->SoA[0] * shellf[InList * n];
-            psi4II = Ipsi4->SoA[1] * Ipsi4->SoA[0] * shellf[InList * n + 1];
-            break;
-          case 7: //--- (pi-theta, pi+phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
-            psi4RR = Rpsi4->SoA[2] * Rpsi4->SoA[1] * Rpsi4->SoA[0] * shellf[InList * n];
-            psi4II = Ipsi4->SoA[2] * Ipsi4->SoA[1] * Ipsi4->SoA[0] * shellf[InList * n + 1];
-          }
-
-          thetap = sqrt((2 * pl + 1.0) / 4.0 / PI) * misc::Wigner_d_function(pl, pm, spinw, costheta); // note the variation from -2 to 2
-#ifdef GaussInt
-          // wtcostheta is even function respect costheta
-          RP_out[countlm] = RP_out[countlm] + thetap * (psi4RR * cosmphi + psi4II * sinmphi) * wtcostheta[i];
-          IP_out[countlm] = IP_out[countlm] + thetap * (psi4II * cosmphi - psi4RR * sinmphi) * wtcostheta[i];
-#else
-          RP_out[countlm] = RP_out[countlm] + thetap * (psi4RR * cosmphi + psi4II * sinmphi);
-          IP_out[countlm] = IP_out[countlm] + thetap * (psi4II * cosmphi - psi4RR * sinmphi);
-#endif
-        }
-        countlm++; // no sanity check for countlm and NN which should be noted in the input parameters
-      }
-  }
-
-  for (int ii = 0; ii < NN; ii++)
-  {
-#ifdef GaussInt
-    RP_out[ii] = RP_out[ii] * rex * dphi;
-    IP_out[ii] = IP_out[ii] * rex * dphi;
-#else
-    RP_out[ii] = RP_out[ii] * rex * dphi * dcostheta;
-    IP_out[ii] = IP_out[ii] * rex * dphi * dcostheta;
-#endif
-  }
-  //|------+  Communicate and sum the results from each processor.
-
-  {
-    double *RPIP_out = new double[2 * NN];
-    double *RPIP = new double[2 * NN];
-    memcpy(RPIP_out, RP_out, NN * sizeof(double));
-    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
-    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    memcpy(RP, RPIP, NN * sizeof(double));
-    memcpy(IP, RPIP + NN, NN * sizeof(double));
-    delete[] RPIP_out;
-    delete[] RPIP;
-  }
-
-  //|------= Free memory.
-
-  delete[] pox[0];
-  delete[] pox[1];
-  delete[] pox[2];
-  delete[] shellf;
-  delete[] RP_out;
-  delete[] IP_out;
-  DG_List->clearList();
-}
-void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,
-                                 int spinw, int maxl, int NN, double *RP, double *IP,
-                                 monitor *Monitor, MPI_Comm Comm_here) // NN is the length of RP and IP
-{
-  //   misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"start surface_integral::surf_Wave");
-
-  int lmyrank;
-  MPI_Comm_rank(Comm_here, &lmyrank);
-  if (lmyrank == 0 && GH->grids[lev] != 1)
-    if (Monitor->outfile)
-      Monitor->outfile << "WARNING: surface integral on multipatches" << endl;
-    else
-      cout << "WARNING: surface integral on multipatches" << endl;
-
-  const int InList = 2;
-
-  MyList<var> *DG_List = new MyList<var>(Rpsi4);
-  DG_List->insert(Ipsi4);
-
-  int n;
-  double *pox[3];
-  for (int i = 0; i < 3; i++)
-    pox[i] = new double[n_tot];
-  for (n = 0; n < n_tot; n++)
-  {
-    pox[0][n] = rex * nx_g[n];
-    pox[1][n] = rex * ny_g[n];
-    pox[2][n] = rex * nz_g[n];
-  }
-
-  double *shellf;
-  shellf = new double[n_tot * InList];
-
-  //    misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Interp_Points");
-
-  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Comm_here);
-
-  //    misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Interp_Points");
-
-  int mp, Lp, Nmin, Nmax;
-
-  int cpusize_here;
-  MPI_Comm_size(Comm_here, &cpusize_here);
-
-  mp = n_tot / cpusize_here;
-  Lp = n_tot - cpusize_here * mp;
-
-  if (Lp > lmyrank)
-  {
-    Nmin = lmyrank * mp + lmyrank;
-    Nmax = Nmin + mp;
-  }
-  else
-  {
-    Nmin = lmyrank * mp + Lp;
-    Nmax = Nmin + mp - 1;
-  }
-
-  //|~~~~~> Integrate the dot product of Dphi with the surface normal.
-
-  double *RP_out, *IP_out;
-  RP_out = new double[NN];
-  IP_out = new double[NN];
-
-  for (int ii = 0; ii < NN; ii++)
-  {
-    RP_out[ii] = 0;
-    IP_out[ii] = 0;
-  }
-  // theta part
-  double costheta, thetap;
-  double cosmphi, sinmphi;
-
-  int i, j;
-  int lpsy = 0;
-  if (Symmetry == 0)
-    lpsy = 1;
-  else if (Symmetry == 1)
-    lpsy = 2;
-  else if (Symmetry == 2)
-    lpsy = 8;
-
-  double psi4RR, psi4II;
-  for (n = Nmin; n <= Nmax; n++)
-  {
-    //       need round off always
-    i = int(n / N_phi); // int(1.723) = 1, int(-1.732) = -1
-    j = n - i * N_phi;
-
-    int countlm = 0;
-    for (int pl = spinw; pl < maxl + 1; pl++)
-      for (int pm = -pl; pm < pl + 1; pm++)
-      {
-        for (int lp = 0; lp < lpsy; lp++)
-        {
-          switch (lp)
-          {
-          case 0: //+++ (theta, phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = sin(pm * (j + 0.5) * dphi);
-            psi4RR = shellf[InList * n];
-            psi4II = shellf[InList * n + 1];
-            break;
-          case 1: //++- (pi-theta, phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = sin(pm * (j + 0.5) * dphi);
-            psi4RR = Rpsi4->SoA[2] * shellf[InList * n];
-            psi4II = Ipsi4->SoA[2] * shellf[InList * n + 1];
-            break;
-          case 2: //+-+ (theta, 2*pi-phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = -sin(pm * (j + 0.5) * dphi);
-            psi4RR = Rpsi4->SoA[1] * shellf[InList * n];
-            psi4II = Ipsi4->SoA[1] * shellf[InList * n + 1];
-            break;
-          case 3: //+-- (pi-theta, 2*pi-phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = -sin(pm * (j + 0.5) * dphi);
-            psi4RR = Rpsi4->SoA[2] * Rpsi4->SoA[1] * shellf[InList * n];
-            psi4II = Ipsi4->SoA[2] * Ipsi4->SoA[1] * shellf[InList * n + 1];
-            break;
-          case 4: //-++ (theta, pi-phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
-            psi4RR = Rpsi4->SoA[0] * shellf[InList * n];
-            psi4II = Ipsi4->SoA[0] * shellf[InList * n + 1];
-            break;
-          case 5: //-+- (pi-theta, pi-phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
-            psi4RR = Rpsi4->SoA[2] * Rpsi4->SoA[0] * shellf[InList * n];
-            psi4II = Ipsi4->SoA[2] * Ipsi4->SoA[0] * shellf[InList * n + 1];
-            break;
-          case 6: //--+ (theta, pi+phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
-            psi4RR = Rpsi4->SoA[1] * Rpsi4->SoA[0] * shellf[InList * n];
-            psi4II = Ipsi4->SoA[1] * Ipsi4->SoA[0] * shellf[InList * n + 1];
-            break;
-          case 7: //--- (pi-theta, pi+phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
-            psi4RR = Rpsi4->SoA[2] * Rpsi4->SoA[1] * Rpsi4->SoA[0] * shellf[InList * n];
-            psi4II = Ipsi4->SoA[2] * Ipsi4->SoA[1] * Ipsi4->SoA[0] * shellf[InList * n + 1];
-          }
-
-          thetap = sqrt((2 * pl + 1.0) / 4.0 / PI) * misc::Wigner_d_function(pl, pm, spinw, costheta); // note the variation from -2 to 2
-#ifdef GaussInt
-          // wtcostheta is even function respect costheta
-          RP_out[countlm] = RP_out[countlm] + thetap * (psi4RR * cosmphi + psi4II * sinmphi) * wtcostheta[i];
-          IP_out[countlm] = IP_out[countlm] + thetap * (psi4II * cosmphi - psi4RR * sinmphi) * wtcostheta[i];
-#else
-          RP_out[countlm] = RP_out[countlm] + thetap * (psi4RR * cosmphi + psi4II * sinmphi);
-          IP_out[countlm] = IP_out[countlm] + thetap * (psi4II * cosmphi - psi4RR * sinmphi);
-#endif
-        }
-        countlm++; // no sanity check for countlm and NN which should be noted in the input parameters
-      }
-  }
-
-  for (int ii = 0; ii < NN; ii++)
-  {
-#ifdef GaussInt
-    RP_out[ii] = RP_out[ii] * rex * dphi;
-    IP_out[ii] = IP_out[ii] * rex * dphi;
-#else
-    RP_out[ii] = RP_out[ii] * rex * dphi * dcostheta;
-    IP_out[ii] = IP_out[ii] * rex * dphi * dcostheta;
-#endif
-  }
-  //|------+  Communicate and sum the results from each processor.
-
-  {
-    double *RPIP_out = new double[2 * NN];
-    double *RPIP = new double[2 * NN];
-    memcpy(RPIP_out, RP_out, NN * sizeof(double));
-    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
-    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, Comm_here);
-    memcpy(RP, RPIP, NN * sizeof(double));
-    memcpy(IP, RPIP + NN, NN * sizeof(double));
-    delete[] RPIP_out;
-    delete[] RPIP;
-  }
-
-  //|------= Free memory.
-
-  delete[] pox[0];
-  delete[] pox[1];
-  delete[] pox[2];
-  delete[] shellf;
-  delete[] RP_out;
-  delete[] IP_out;
-  DG_List->clearList();
-}
-//|----------------------------------------------------------------
-//  for shell patch
-//|----------------------------------------------------------------
-void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH, var *Rpsi4, var *Ipsi4,
-                                 int spinw, int maxl, int NN, double *RP, double *IP,
-                                 monitor *Monitor) // NN is the length of RP and IP
-{
-  const int InList = 2;
-
-  MyList<var> *DG_List = new MyList<var>(Rpsi4);
-  DG_List->insert(Ipsi4);
-
-  int n;
-  double *pox[3];
-  for (int i = 0; i < 3; i++)
-    pox[i] = new double[n_tot];
-  for (n = 0; n < n_tot; n++)
-  {
-    pox[0][n] = rex * nx_g[n];
-    pox[1][n] = rex * ny_g[n];
-    pox[2][n] = rex * nz_g[n];
-  }
-
-  double *shellf;
-  shellf = new double[n_tot * InList];
-
-  GH->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry);
-
-  int mp, Lp, Nmin, Nmax;
-
-  mp = n_tot / cpusize;
-  Lp = n_tot - cpusize * mp;
-
-  if (Lp > myrank)
-  {
-    Nmin = myrank * mp + myrank;
-    Nmax = Nmin + mp;
-  }
-  else
-  {
-    Nmin = myrank * mp + Lp;
-    Nmax = Nmin + mp - 1;
-  }
-
-  //|~~~~~> Integrate the dot product of Dphi with the surface normal.
-
-  double *RP_out, *IP_out;
-  RP_out = new double[NN];
-  IP_out = new double[NN];
-
-  for (int ii = 0; ii < NN; ii++)
-  {
-    RP_out[ii] = 0;
-    IP_out[ii] = 0;
-  }
-  // theta part
-  double costheta, thetap;
-  double cosmphi, sinmphi;
-
-  int i, j;
-  int lpsy = 0;
-  if (Symmetry == 0)
-    lpsy = 1;
-  else if (Symmetry == 1)
-    lpsy = 2;
-  else if (Symmetry == 2)
-    lpsy = 8;
-
-  double psi4RR, psi4II;
-  for (n = Nmin; n <= Nmax; n++)
-  {
-    //       need round off always
-    i = int(n / N_phi); // int(1.723) = 1, int(-1.732) = -1
-    j = n - i * N_phi;
-
-    int countlm = 0;
-    for (int pl = spinw; pl < maxl + 1; pl++)
-      for (int pm = -pl; pm < pl + 1; pm++)
-      {
-        for (int lp = 0; lp < lpsy; lp++)
-        {
-          switch (lp)
-          {
-          case 0: //+++ (theta, phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = sin(pm * (j + 0.5) * dphi);
-            psi4RR = shellf[InList * n];
-            psi4II = shellf[InList * n + 1];
-            break;
-          case 1: //++- (pi-theta, phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = sin(pm * (j + 0.5) * dphi);
-            psi4RR = Rpsi4->SoA[2] * shellf[InList * n];
-            psi4II = Ipsi4->SoA[2] * shellf[InList * n + 1];
-            break;
-          case 2: //+-+ (theta, 2*pi-phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = -sin(pm * (j + 0.5) * dphi);
-            psi4RR = Rpsi4->SoA[1] * shellf[InList * n];
-            psi4II = Ipsi4->SoA[1] * shellf[InList * n + 1];
-            break;
-          case 3: //+-- (pi-theta, 2*pi-phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = -sin(pm * (j + 0.5) * dphi);
-            psi4RR = Rpsi4->SoA[2] * Rpsi4->SoA[1] * shellf[InList * n];
-            psi4II = Ipsi4->SoA[2] * Ipsi4->SoA[1] * shellf[InList * n + 1];
-            break;
-          case 4: //-++ (theta, pi-phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
-            psi4RR = Rpsi4->SoA[0] * shellf[InList * n];
-            psi4II = Ipsi4->SoA[0] * shellf[InList * n + 1];
-            break;
-          case 5: //-+- (pi-theta, pi-phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
-            psi4RR = Rpsi4->SoA[2] * Rpsi4->SoA[0] * shellf[InList * n];
-            psi4II = Ipsi4->SoA[2] * Ipsi4->SoA[0] * shellf[InList * n + 1];
-            break;
-          case 6: //--+ (theta, pi+phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
-            psi4RR = Rpsi4->SoA[1] * Rpsi4->SoA[0] * shellf[InList * n];
-            psi4II = Ipsi4->SoA[1] * Ipsi4->SoA[0] * shellf[InList * n + 1];
-            break;
-          case 7: //--- (pi-theta, pi+phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
-            psi4RR = Rpsi4->SoA[2] * Rpsi4->SoA[1] * Rpsi4->SoA[0] * shellf[InList * n];
-            psi4II = Ipsi4->SoA[2] * Ipsi4->SoA[1] * Ipsi4->SoA[0] * shellf[InList * n + 1];
-          }
-
-          thetap = sqrt((2 * pl + 1.0) / 4.0 / PI) * misc::Wigner_d_function(pl, pm, spinw, costheta); // note the variation from -2 to 2
-#ifdef GaussInt
-          // wtcostheta is even function respect costheta
-          RP_out[countlm] = RP_out[countlm] + thetap * (psi4RR * cosmphi + psi4II * sinmphi) * wtcostheta[i];
-          IP_out[countlm] = IP_out[countlm] + thetap * (psi4II * cosmphi - psi4RR * sinmphi) * wtcostheta[i];
-#else
-          RP_out[countlm] = RP_out[countlm] + thetap * (psi4RR * cosmphi + psi4II * sinmphi);
-          IP_out[countlm] = IP_out[countlm] + thetap * (psi4II * cosmphi - psi4RR * sinmphi);
-#endif
-        }
-        countlm++; // no sanity check for countlm and NN which should be noted in the input parameters
-      }
-  }
-
-  for (int ii = 0; ii < NN; ii++)
-  {
-#ifdef GaussInt
-    RP_out[ii] = RP_out[ii] * rex * dphi;
-    IP_out[ii] = IP_out[ii] * rex * dphi;
-#else
-    RP_out[ii] = RP_out[ii] * rex * dphi * dcostheta;
-    IP_out[ii] = IP_out[ii] * rex * dphi * dcostheta;
-#endif
-  }
-  //|------+  Communicate and sum the results from each processor.
-
-  {
-    double *RPIP_out = new double[2 * NN];
-    double *RPIP = new double[2 * NN];
-    memcpy(RPIP_out, RP_out, NN * sizeof(double));
-    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
-    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    memcpy(RP, RPIP, NN * sizeof(double));
-    memcpy(IP, RPIP + NN, NN * sizeof(double));
-    delete[] RPIP_out;
-    delete[] RPIP;
-  }
-
-  //|------= Free memory.
-
-  delete[] pox[0];
-  delete[] pox[1];
-  delete[] pox[2];
-  delete[] shellf;
-  delete[] RP_out;
-  delete[] IP_out;
-  DG_List->clearList();
-}
-//|----------------------------------------------------------------
-//  for shell patch
-//  for EM wave specially symmetric case
-//|----------------------------------------------------------------
-void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH,
-                                 var *Ex, var *Ey, var *Ez, var *Bx, var *By, var *Bz,
-                                 var *chi, var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
-                                 int spinw, int maxl, int NN, double *RP, double *IP,
-                                 monitor *Monitor) // NN is the length of RP and IP
-{
-  const int InList = 13;
-
-  MyList<var> *DG_List = new MyList<var>(Ex);
-  DG_List->insert(Ey);
-  DG_List->insert(Ez);
-  DG_List->insert(Bx);
-  DG_List->insert(By);
-  DG_List->insert(Bz);
-  DG_List->insert(chi);
-  DG_List->insert(gxx);
-  DG_List->insert(gxy);
-  DG_List->insert(gxz);
-  DG_List->insert(gyy);
-  DG_List->insert(gyz);
-  DG_List->insert(gzz);
-
-  int n;
-  double *pox[3];
-  for (int i = 0; i < 3; i++)
-    pox[i] = new double[n_tot];
-  for (n = 0; n < n_tot; n++)
-  {
-    pox[0][n] = rex * nx_g[n];
-    pox[1][n] = rex * ny_g[n];
-    pox[2][n] = rex * nz_g[n];
-  }
-
-  double *shellf;
-  shellf = new double[n_tot * InList];
-
-  GH->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry);
-
-  int mp, Lp, Nmin, Nmax;
-
-  mp = n_tot / cpusize;
-  Lp = n_tot - cpusize * mp;
-
-  if (Lp > myrank)
-  {
-    Nmin = myrank * mp + myrank;
-    Nmax = Nmin + mp;
-  }
-  else
-  {
-    Nmin = myrank * mp + Lp;
-    Nmax = Nmin + mp - 1;
-  }
-
-  //|~~~~~> Integrate the dot product of Dphi with the surface normal.
-
-  double *RP_out, *IP_out;
-  RP_out = new double[NN];
-  IP_out = new double[NN];
-
-  for (int ii = 0; ii < NN; ii++)
-  {
-    RP_out[ii] = 0;
-    IP_out[ii] = 0;
-  }
-  // theta part
-  double costheta, thetap;
-  double cosmphi, sinmphi;
-
-  int i, j;
-  int lpsy = 0;
-  if (Symmetry == 0)
-    lpsy = 1;
-  else if (Symmetry == 1)
-    lpsy = 2;
-  else if (Symmetry == 2)
-    lpsy = 8;
-
-  double psi4RR, psi4II;
-  double px, py, pz;
-  double pEx, pEy, pEz, pBx, pBy, pBz;
-  double pchi, pgxx, pgxy, pgxz, pgyy, pgyz, pgzz;
-  for (n = Nmin; n <= Nmax; n++)
-  {
-    //       need round off always
-    i = int(n / N_phi); // int(1.723) = 1, int(-1.732) = -1
-    j = n - i * N_phi;
-
-    int countlm = 0;
-    for (int pl = spinw; pl < maxl + 1; pl++)
-      for (int pm = -pl; pm < pl + 1; pm++)
-      {
-        for (int lp = 0; lp < lpsy; lp++)
-        {
-          px = pox[0][n];
-          py = pox[1][n];
-          pz = pox[2][n];
-          pEx = shellf[InList * n];
-          pEy = shellf[InList * n + 1];
-          pEz = shellf[InList * n + 2];
-          pBx = shellf[InList * n + 3];
-          pBy = shellf[InList * n + 4];
-          pBz = shellf[InList * n + 5];
-          pchi = shellf[InList * n + 6];
-          pgxx = shellf[InList * n + 7];
-          pgxy = shellf[InList * n + 8];
-          pgxz = shellf[InList * n + 9];
-          pgyy = shellf[InList * n + 10];
-          pgyz = shellf[InList * n + 11];
-          pgzz = shellf[InList * n + 12];
-          switch (lp)
-          {
-          case 0: //+++ (theta, phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = sin(pm * (j + 0.5) * dphi);
-            break;
-          case 1: //++- (pi-theta, phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = sin(pm * (j + 0.5) * dphi);
-            pz = -pz;
-            pEz = -pEz;
-            pBx = -pBx;
-            pBy = -pBy;
-            pgxz = -pgxz;
-            pgyz = -pgyz;
-            break;
-          case 2: //+-+ (theta, 2*pi-phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = -sin(pm * (j + 0.5) * dphi);
-            py = -py;
-            pEy = -pEy;
-            pBx = -pBx;
-            pBz = -pBz;
-            pgxy = -pgxy;
-            pgyz = -pgyz;
-            break;
-          case 3: //+-- (pi-theta, 2*pi-phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = -sin(pm * (j + 0.5) * dphi);
-            py = -py;
-            pz = -pz;
-            pEz = -pEz;
-            pBz = -pBz;
-            pgxz = -pgxz;
-            pEy = -pEy;
-            pBy = -pBy;
-            pgxy = -pgxy;
-            break;
-          case 4: //-++ (theta, pi-phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
-            px = -px;
-            pEx = -pEx;
-            pBy = -pBy;
-            pBz = -pBz;
-            pgxy = -pgxy;
-            pgxz = -pgxz;
-            break;
-          case 5: //-+- (pi-theta, pi-phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
-            pz = -pz;
-            px = -px;
-            pEz = -pEz;
-            pBz = -pBz;
-            pgyz = -pgyz;
-            pEx = -pEx;
-            pBx = -pBx;
-            pgxy = -pgxy;
-            break;
-          case 6: //--+ (theta, pi+phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
-            px = -px;
-            py = -py;
-            pEx = -pEx;
-            pBx = -pBx;
-            pgxz = -pgxz;
-            pEy = -pEy;
-            pBy = -pBy;
-            pgyz = -pgyz;
-            break;
-          case 7: //--- (pi-theta, pi+phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
-            px = -px;
-            py = -py;
-            pz = -pz;
-            pEx = -pEx;
-            pEy = -pEy;
-            pEz = -pEz;
-          }
-
-          f_getnpem2_point(px, py, pz, pchi, pgxx, pgxy, pgxz, pgyy, pgyz, pgzz, pEx, pEy, pEz, pBx, pBy, pBz,
-                           psi4RR, psi4II);
-          thetap = sqrt((2 * pl + 1.0) / 4.0 / PI) * misc::Wigner_d_function(pl, pm, spinw, costheta); // note the variation from -2 to 2
-
-          //	 find back the one
-          pchi = pchi + 1;
-#ifdef GaussInt
-          // wtcostheta is even function respect costheta
-          RP_out[countlm] = RP_out[countlm] + thetap / pchi / pchi * (psi4RR * cosmphi + psi4II * sinmphi) * wtcostheta[i];
-          IP_out[countlm] = IP_out[countlm] + thetap / pchi / pchi * (psi4II * cosmphi - psi4RR * sinmphi) * wtcostheta[i];
-#else
-          RP_out[countlm] = RP_out[countlm] + thetap / pchi / pchi * (psi4RR * cosmphi + psi4II * sinmphi);
-          IP_out[countlm] = IP_out[countlm] + thetap / pchi / pchi * (psi4II * cosmphi - psi4RR * sinmphi);
-#endif
-        }
-        countlm++; // no sanity check for countlm and NN which should be noted in the input parameters
-      }
-  }
-
-  for (int ii = 0; ii < NN; ii++)
-  {
-#ifdef GaussInt
-    RP_out[ii] = RP_out[ii] * rex * dphi;
-    IP_out[ii] = IP_out[ii] * rex * dphi;
-#else
-    RP_out[ii] = RP_out[ii] * rex * dphi * dcostheta;
-    IP_out[ii] = IP_out[ii] * rex * dphi * dcostheta;
-#endif
-  }
-  //|------+  Communicate and sum the results from each processor.
-
-  {
-    double *RPIP_out = new double[2 * NN];
-    double *RPIP = new double[2 * NN];
-    memcpy(RPIP_out, RP_out, NN * sizeof(double));
-    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
-    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    memcpy(RP, RPIP, NN * sizeof(double));
-    memcpy(IP, RPIP + NN, NN * sizeof(double));
-    delete[] RPIP_out;
-    delete[] RPIP;
-  }
-
-  //|------= Free memory.
-
-  delete[] pox[0];
-  delete[] pox[1];
-  delete[] pox[2];
-  delete[] shellf;
-  delete[] RP_out;
-  delete[] IP_out;
-  DG_List->clearList();
-}
-//|----------------------------------------------------------------
-//  for shell patch
-//  for EM wave specially symmetric case
-//  unify for phi1 and phi2
-//|----------------------------------------------------------------
-void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH,
-                                 var *Ex, var *Ey, var *Ez, var *Bx, var *By, var *Bz,
-                                 var *chi, var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
-                                 int spinw, int maxl, int NN, double *RP, double *IP,
-                                 monitor *Monitor,
-                                 void (*funcs)(double &, double &, double &,
-                                               double &, double &, double &, double &, double &, double &, double &,
-                                               double &, double &, double &, double &, double &, double &,
-                                               double &, double &)) // NN is the length of RP and IP
-{
-  const int InList = 13;
-
-  MyList<var> *DG_List = new MyList<var>(Ex);
-  DG_List->insert(Ey);
-  DG_List->insert(Ez);
-  DG_List->insert(Bx);
-  DG_List->insert(By);
-  DG_List->insert(Bz);
-  DG_List->insert(chi);
-  DG_List->insert(gxx);
-  DG_List->insert(gxy);
-  DG_List->insert(gxz);
-  DG_List->insert(gyy);
-  DG_List->insert(gyz);
-  DG_List->insert(gzz);
-
-  int n;
-  double *pox[3];
-  for (int i = 0; i < 3; i++)
-    pox[i] = new double[n_tot];
-  for (n = 0; n < n_tot; n++)
-  {
-    pox[0][n] = rex * nx_g[n];
-    pox[1][n] = rex * ny_g[n];
-    pox[2][n] = rex * nz_g[n];
-  }
-
-  double *shellf;
-  shellf = new double[n_tot * InList];
-
-  GH->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry);
-
-  double *RP_out, *IP_out;
-  RP_out = new double[NN];
-  IP_out = new double[NN];
-
-  for (int ii = 0; ii < NN; ii++)
-  {
-    RP_out[ii] = 0;
-    IP_out[ii] = 0;
-  }
-
-#if 0
-// for debug    
-  if(myrank==0)
-  {
-    double costheta, thetap;
-    double cosmphi,sinmphi;
-
-    int i,j;
-    int lpsy=0;
-         if( Symmetry == 0 )     lpsy=1;
-    else if( Symmetry == 1 )     lpsy=2;
-    else if( Symmetry == 2 )     lpsy=8;
-
-    double psi4RR,psi4II;
-    double px,py,pz;
-    double pEx,pEy,pEz,pBx,pBy,pBz;
-    double pchi,pgxx,pgxy,pgxz,pgyy,pgyz,pgzz;
-    for( n = 0; n <= n_tot-1; n++) 
-     {
-//       need round off always	     
-        i = int(n/N_phi); // int(1.723) = 1, int(-1.732) = -1
-        j = n - i * N_phi;
-        
-	for(int lp=0;lp<lpsy;lp++)
-	{
-         px = pox[0][n];		 
-         py = pox[1][n];		 
-         pz = pox[2][n];		 
-	 pEx = shellf[InList*n  ];
-	 pEy = shellf[InList*n+1];
-	 pEz = shellf[InList*n+2];
-	 pBx = shellf[InList*n+3];
-	 pBy = shellf[InList*n+4];
-	 pBz = shellf[InList*n+5];
-	 pchi = shellf[InList*n+6];
-	 pgxx = shellf[InList*n+7];
-	 pgxy = shellf[InList*n+8];
-	 pgxz = shellf[InList*n+9];
-	 pgyy = shellf[InList*n+10];
-	 pgyz = shellf[InList*n+11];
-	 pgzz = shellf[InList*n+12];
- 	 switch(lp)
-	 {
-	  case 1:  //++- (pi-theta, phi)
-	  pz = -pz;
-	  pEz = -pEz;
-	  pBx = -pBx;
-	  pBy = -pBy;
-	  pgxz = -pgxz;
-	  pgyz = -pgyz;
-	  break;
-	  case 2:  //+-+ (theta, 2*pi-phi)
-	  py = -py;
-	  pEy = -pEy;
-	  pBx = -pBx;
-	  pBz = -pBz;
-	  pgxy = -pgxy;
-	  pgyz = -pgyz;
-	  break;
-	  case 3:  //+-- (pi-theta, 2*pi-phi)
-	  py = -py;
-	  pz = -pz;
-	  pEz = -pEz;
-	  pBz = -pBz;;
-	  pgxz = -pgxz;
-	  pEy = -pEy;
-	  pBy = -pBy;
-	  pgxy = -pgxy;
-	  break;
-	  case 4:  //-++ (theta, pi-phi)
-	  px = -px;
-	  pEx = -pEx;
-	  pBy = -pBy;
-	  pBz = -pBz;
-	  pgxy = -pgxy;
-	  pgxz = -pgxz;
-	  break;
-	  case 5:  //-+- (pi-theta, pi-phi)
-	  pz = -pz;
-	  px = -px;
-	  pEz = -pEz;
-	  pBz = -pBz;
-	  pgyz = -pgyz;
-	  pEx = -pEx;
-	  pBx = -pBx;
-	  pgxy = -pgxy;
-	  break;
-	  case 6:  //--+ (theta, pi+phi)
-	  px = -px;
-	  py = -py;
-	  pEx = -pEx;
-	  pBx = -pBx;
-	  pgxz = -pgxz;
-	  pEy = -pEy;
-	  pBy = -pBy;
-	  pgyz = -pgyz;
-	  break;
-	  case 7:  //--- (pi-theta, pi+phi)
-	  px = -px;
-	  py = -py;
-	  pz = -pz;
-	  pEx = -pEx;
-	  pEy = -pEy;
-	  pEz = -pEz;
-	 }
-	  
-	 funcs(px,py,pz,pchi,pgxx,pgxy,pgxz,pgyy,pgyz,pgzz,pEx,pEy,pEz,pBx,pBy,pBz,
-			  psi4RR,psi4II);
-//	 if(n==0 || n==N_phi/2-1 || n==N_phi/2 || n==N_phi-1 ||
-//	    n==N_phi*(N_theta-1)+0 || n==N_phi*(N_theta-1)+N_phi/2-1 || n==N_phi*(N_theta-1)+N_phi/2 || n==N_phi*(N_theta-1)+N_phi-1)
-//	 cout<<px<<","<<py<<","<<pz<<","<<pchi<<","<<pgxx<<","<<pgxy<<","<<pgxz<<","<<pgyy<<","<<pgyz<<","<<pgzz<<","<<pEx<<","
-//	     <<pEy<<","<<pEz<<","<<pBx<<","<<pBy<<","<<pBz<<","<<psi4RR<<","<<psi4II<<endl<<endl;
-
-// find back the one
-        pchi = pchi+1;
-
-	int countlm=0;
-	for(int pl=spinw;pl<maxl+1;pl++)
-          for(int pm=-pl;pm<pl+1;pm++)
-	  {
- 	 switch(lp)
-	 {
-	  case 0:  //+++ (theta, phi)
-          costheta = arcostheta[i];
-	  cosmphi = cos(pm * (j+0.5) * dphi);
-	  sinmphi = sin(pm * (j+0.5) * dphi);
-	  break;
-	  case 1:  //++- (pi-theta, phi)
-          costheta = -arcostheta[i];
-	  cosmphi = cos(pm * (j+0.5) * dphi);
-	  sinmphi = sin(pm * (j+0.5) * dphi);
-	  break;
-	  case 2:  //+-+ (theta, 2*pi-phi)
-          costheta = arcostheta[i];
-	  cosmphi = cos(pm * (j+0.5) * dphi);
-	  sinmphi =-sin(pm * (j+0.5) * dphi);
-	  break;
-	  case 3:  //+-- (pi-theta, 2*pi-phi)
-          costheta = -arcostheta[i];
-	  cosmphi = cos(pm * (j+0.5) * dphi);
-	  sinmphi =-sin(pm * (j+0.5) * dphi);
-	  break;
-	  case 4:  //-++ (theta, pi-phi)
-          costheta = arcostheta[i];
-	  cosmphi = cos(pm * (PI - (j+0.5) * dphi));
-	  sinmphi = sin(pm * (PI - (j+0.5) * dphi));
-	  break;
-	  case 5:  //-+- (pi-theta, pi-phi)
-          costheta = -arcostheta[i];
- 	  cosmphi = cos(pm * (PI - (j+0.5) * dphi));
-	  sinmphi = sin(pm * (PI - (j+0.5) * dphi));
-	  break;
-	  case 6:  //--+ (theta, pi+phi)
-          costheta = arcostheta[i];
-	  cosmphi = cos(pm * (PI + (j+0.5) * dphi));
-	  sinmphi = sin(pm * (PI + (j+0.5) * dphi));
-	  break;
-	  case 7:  //--- (pi-theta, pi+phi)
-          costheta = -arcostheta[i];
-	  cosmphi = cos(pm * (PI + (j+0.5) * dphi));
-	  sinmphi = sin(pm * (PI + (j+0.5) * dphi));
-	 }
-   	    thetap = sqrt((2*pl+1.0)/4.0/PI)*misc::Wigner_d_function(pl,pm,spinw,costheta); //note the variation from -2 to 2
-
-#ifdef GaussInt
-// wtcostheta is even function respect costheta
-            RP_out[countlm] = RP_out[countlm] + thetap/pchi/pchi * (psi4RR * cosmphi + psi4II * sinmphi)*wtcostheta[i];
-  	    IP_out[countlm] = IP_out[countlm] + thetap/pchi/pchi * (psi4II * cosmphi - psi4RR * sinmphi)*wtcostheta[i];
-	    if(pl==2 && pm==0) cout<<countlm+1<<","<<RP_out[countlm] * rex * dphi<<endl;
-#else	 
-            RP_out[countlm] = RP_out[countlm] + thetap/pchi/pchi * (psi4RR * cosmphi + psi4II * sinmphi);
-  	    IP_out[countlm] = IP_out[countlm] + thetap/pchi/pchi * (psi4II * cosmphi - psi4RR * sinmphi);
-#endif	 
-  	    countlm++;  //no sanity check for countlm and NN which should be noted in the input parameters
-	  }
-	}
-//        if(Symmetry == 2) MPI_Abort(MPI_COMM_WORLD,1);
-     }
-     MPI_Abort(MPI_COMM_WORLD,1);
-  }
-#else
-  int mp, Lp, Nmin, Nmax;
-
-  mp = n_tot / cpusize;
-  Lp = n_tot - cpusize * mp;
-
-  if (Lp > myrank)
-  {
-    Nmin = myrank * mp + myrank;
-    Nmax = Nmin + mp;
-  }
-  else
-  {
-    Nmin = myrank * mp + Lp;
-    Nmax = Nmin + mp - 1;
-  }
-
-  // theta part
-  double costheta, thetap;
-  double cosmphi, sinmphi;
-
-  int i, j;
-  int lpsy = 0;
-  if (Symmetry == 0)
-    lpsy = 1;
-  else if (Symmetry == 1)
-    lpsy = 2;
-  else if (Symmetry == 2)
-    lpsy = 8;
-
-  double psi4RR, psi4II;
-  double px, py, pz;
-  double pEx, pEy, pEz, pBx, pBy, pBz;
-  double pchi, pgxx, pgxy, pgxz, pgyy, pgyz, pgzz;
-  for (n = Nmin; n <= Nmax; n++)
-  {
-    //       need round off always
-    i = int(n / N_phi); // int(1.723) = 1, int(-1.732) = -1
-    j = n - i * N_phi;
-
-    int countlm = 0;
-    for (int pl = spinw; pl < maxl + 1; pl++)
-      for (int pm = -pl; pm < pl + 1; pm++)
-      {
-        for (int lp = 0; lp < lpsy; lp++)
-        {
-          px = pox[0][n];
-          py = pox[1][n];
-          pz = pox[2][n];
-          pEx = shellf[InList * n];
-          pEy = shellf[InList * n + 1];
-          pEz = shellf[InList * n + 2];
-          pBx = shellf[InList * n + 3];
-          pBy = shellf[InList * n + 4];
-          pBz = shellf[InList * n + 5];
-          pchi = shellf[InList * n + 6];
-          pgxx = shellf[InList * n + 7];
-          pgxy = shellf[InList * n + 8];
-          pgxz = shellf[InList * n + 9];
-          pgyy = shellf[InList * n + 10];
-          pgyz = shellf[InList * n + 11];
-          pgzz = shellf[InList * n + 12];
-          switch (lp)
-          {
-          case 0: //+++ (theta, phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = sin(pm * (j + 0.5) * dphi);
-            break;
-          case 1: //++- (pi-theta, phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = sin(pm * (j + 0.5) * dphi);
-            pz = -pz;
-            pEz = -pEz;
-            pBx = -pBx;
-            pBy = -pBy;
-            pgxz = -pgxz;
-            pgyz = -pgyz;
-            break;
-          case 2: //+-+ (theta, 2*pi-phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = -sin(pm * (j + 0.5) * dphi);
-            py = -py;
-            pEy = -pEy;
-            pBx = -pBx;
-            pBz = -pBz;
-            pgxy = -pgxy;
-            pgyz = -pgyz;
-            break;
-          case 3: //+-- (pi-theta, 2*pi-phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = -sin(pm * (j + 0.5) * dphi);
-            py = -py;
-            pz = -pz;
-            pEz = -pEz;
-            pBz = -pBz;
-            pgxz = -pgxz;
-            pEy = -pEy;
-            pBy = -pBy;
-            pgxy = -pgxy;
-            break;
-          case 4: //-++ (theta, pi-phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
-            px = -px;
-            pEx = -pEx;
-            pBy = -pBy;
-            pBz = -pBz;
-            pgxy = -pgxy;
-            pgxz = -pgxz;
-            break;
-          case 5: //-+- (pi-theta, pi-phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
-            pz = -pz;
-            px = -px;
-            pEz = -pEz;
-            pBz = -pBz;
-            pgyz = -pgyz;
-            pEx = -pEx;
-            pBx = -pBx;
-            pgxy = -pgxy;
-            break;
-          case 6: //--+ (theta, pi+phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
-            px = -px;
-            py = -py;
-            pEx = -pEx;
-            pBx = -pBx;
-            pgxz = -pgxz;
-            pEy = -pEy;
-            pBy = -pBy;
-            pgyz = -pgyz;
-            break;
-          case 7: //--- (pi-theta, pi+phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
-            px = -px;
-            py = -py;
-            pz = -pz;
-            pEx = -pEx;
-            pEy = -pEy;
-            pEz = -pEz;
-          }
-
-          funcs(px, py, pz, pchi, pgxx, pgxy, pgxz, pgyy, pgyz, pgzz, pEx, pEy, pEz, pBx, pBy, pBz,
-                psi4RR, psi4II);
-          thetap = sqrt((2 * pl + 1.0) / 4.0 / PI) * misc::Wigner_d_function(pl, pm, spinw, costheta); // note the variation from -2 to 2
-
-          //	 find back the one
-          pchi = pchi + 1;
-#ifdef GaussInt
-          // wtcostheta is even function respect costheta
-          RP_out[countlm] = RP_out[countlm] + thetap / pchi / pchi * (psi4RR * cosmphi + psi4II * sinmphi) * wtcostheta[i];
-          IP_out[countlm] = IP_out[countlm] + thetap / pchi / pchi * (psi4II * cosmphi - psi4RR * sinmphi) * wtcostheta[i];
-#else
-          RP_out[countlm] = RP_out[countlm] + thetap / pchi / pchi * (psi4RR * cosmphi + psi4II * sinmphi);
-          IP_out[countlm] = IP_out[countlm] + thetap / pchi / pchi * (psi4II * cosmphi - psi4RR * sinmphi);
-#endif
-        }
-        countlm++; // no sanity check for countlm and NN which should be noted in the input parameters
-      }
-  }
-#endif
-
-  for (int ii = 0; ii < NN; ii++)
-  {
-#ifdef GaussInt
-    RP_out[ii] = RP_out[ii] * rex * dphi;
-    IP_out[ii] = IP_out[ii] * rex * dphi;
-#else
-    RP_out[ii] = RP_out[ii] * rex * dphi * dcostheta;
-    IP_out[ii] = IP_out[ii] * rex * dphi * dcostheta;
-#endif
-  }
-  //|------+  Communicate and sum the results from each processor.
-
-  {
-    double *RPIP_out = new double[2 * NN];
-    double *RPIP = new double[2 * NN];
-    memcpy(RPIP_out, RP_out, NN * sizeof(double));
-    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
-    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    memcpy(RP, RPIP, NN * sizeof(double));
-    memcpy(IP, RPIP + NN, NN * sizeof(double));
-    delete[] RPIP_out;
-    delete[] RPIP;
-  }
-
-  //|------= Free memory.
-
-  delete[] pox[0];
-  delete[] pox[1];
-  delete[] pox[2];
-  delete[] shellf;
-  delete[] RP_out;
-  delete[] IP_out;
-  DG_List->clearList();
-}
-//|----------------------------------------------------------------
-//  for box
-//  for EM wave specially symmetric case
-//  unify for phi1 and phi2
-//|----------------------------------------------------------------
-void surface_integral::surf_Wave(double rex, int lev, cgh *GH,
-                                 var *Ex, var *Ey, var *Ez, var *Bx, var *By, var *Bz,
-                                 var *chi, var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
-                                 int spinw, int maxl, int NN, double *RP, double *IP,
-                                 monitor *Monitor,
-                                 void (*funcs)(double &, double &, double &,
-                                               double &, double &, double &, double &, double &, double &, double &,
-                                               double &, double &, double &, double &, double &, double &,
-                                               double &, double &)) // NN is the length of RP and IP
-{
-  const int InList = 13;
-
-  MyList<var> *DG_List = new MyList<var>(Ex);
-  DG_List->insert(Ey);
-  DG_List->insert(Ez);
-  DG_List->insert(Bx);
-  DG_List->insert(By);
-  DG_List->insert(Bz);
-  DG_List->insert(chi);
-  DG_List->insert(gxx);
-  DG_List->insert(gxy);
-  DG_List->insert(gxz);
-  DG_List->insert(gyy);
-  DG_List->insert(gyz);
-  DG_List->insert(gzz);
-
-  int n;
-  double *pox[3];
-  for (int i = 0; i < 3; i++)
-    pox[i] = new double[n_tot];
-  for (n = 0; n < n_tot; n++)
-  {
-    pox[0][n] = rex * nx_g[n];
-    pox[1][n] = rex * ny_g[n];
-    pox[2][n] = rex * nz_g[n];
-  }
-
-  double *shellf;
-  shellf = new double[n_tot * InList];
-
-  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry);
-
-  double *RP_out, *IP_out;
-  RP_out = new double[NN];
-  IP_out = new double[NN];
-
-  for (int ii = 0; ii < NN; ii++)
-  {
-    RP_out[ii] = 0;
-    IP_out[ii] = 0;
-  }
-
-#if 0
-// for debug    
-  if(myrank==0)
-  {
-    double costheta, thetap;
-    double cosmphi,sinmphi;
-
-    int i,j;
-    int lpsy=0;
-         if( Symmetry == 0 )     lpsy=1;
-    else if( Symmetry == 1 )     lpsy=2;
-    else if( Symmetry == 2 )     lpsy=8;
-
-    double psi4RR,psi4II;
-    double px,py,pz;
-    double pEx,pEy,pEz,pBx,pBy,pBz;
-    double pchi,pgxx,pgxy,pgxz,pgyy,pgyz,pgzz;
-    for( n = 0; n <= n_tot-1; n++) 
-     {
-//       need round off always	     
-        i = int(n/N_phi); // int(1.723) = 1, int(-1.732) = -1
-        j = n - i * N_phi;
-        
-	for(int lp=0;lp<lpsy;lp++)
-	{
-         px = pox[0][n];		 
-         py = pox[1][n];		 
-         pz = pox[2][n];		 
-	 pEx = shellf[InList*n  ];
-	 pEy = shellf[InList*n+1];
-	 pEz = shellf[InList*n+2];
-	 pBx = shellf[InList*n+3];
-	 pBy = shellf[InList*n+4];
-	 pBz = shellf[InList*n+5];
-	 pchi = shellf[InList*n+6];
-	 pgxx = shellf[InList*n+7];
-	 pgxy = shellf[InList*n+8];
-	 pgxz = shellf[InList*n+9];
-	 pgyy = shellf[InList*n+10];
-	 pgyz = shellf[InList*n+11];
-	 pgzz = shellf[InList*n+12];
- 	 switch(lp)
-	 {
-	  case 1:  //++- (pi-theta, phi)
-	  pz = -pz;
-	  pEz = -pEz;
-	  pBx = -pBx;
-	  pBy = -pBy;
-	  pgxz = -pgxz;
-	  pgyz = -pgyz;
-	  break;
-	  case 2:  //+-+ (theta, 2*pi-phi)
-	  py = -py;
-	  pEy = -pEy;
-	  pBx = -pBx;
-	  pBz = -pBz;
-	  pgxy = -pgxy;
-	  pgyz = -pgyz;
-	  break;
-	  case 3:  //+-- (pi-theta, 2*pi-phi)
-	  py = -py;
-	  pz = -pz;
-	  pEz = -pEz;
-	  pBz = -pBz;;
-	  pgxz = -pgxz;
-	  pEy = -pEy;
-	  pBy = -pBy;
-	  pgxy = -pgxy;
-	  break;
-	  case 4:  //-++ (theta, pi-phi)
-	  px = -px;
-	  pEx = -pEx;
-	  pBy = -pBy;
-	  pBz = -pBz;
-	  pgxy = -pgxy;
-	  pgxz = -pgxz;
-	  break;
-	  case 5:  //-+- (pi-theta, pi-phi)
-	  pz = -pz;
-	  px = -px;
-	  pEz = -pEz;
-	  pBz = -pBz;
-	  pgyz = -pgyz;
-	  pEx = -pEx;
-	  pBx = -pBx;
-	  pgxy = -pgxy;
-	  break;
-	  case 6:  //--+ (theta, pi+phi)
-	  px = -px;
-	  py = -py;
-	  pEx = -pEx;
-	  pBx = -pBx;
-	  pgxz = -pgxz;
-	  pEy = -pEy;
-	  pBy = -pBy;
-	  pgyz = -pgyz;
-	  break;
-	  case 7:  //--- (pi-theta, pi+phi)
-	  px = -px;
-	  py = -py;
-	  pz = -pz;
-	  pEx = -pEx;
-	  pEy = -pEy;
-	  pEz = -pEz;
-	 }
-	  
-	 funcs(px,py,pz,pchi,pgxx,pgxy,pgxz,pgyy,pgyz,pgzz,pEx,pEy,pEz,pBx,pBy,pBz,
-			  psi4RR,psi4II);
-//	 if(n==0 || n==N_phi/2-1 || n==N_phi/2 || n==N_phi-1 ||
-//	    n==N_phi*(N_theta-1)+0 || n==N_phi*(N_theta-1)+N_phi/2-1 || n==N_phi*(N_theta-1)+N_phi/2 || n==N_phi*(N_theta-1)+N_phi-1)
-//	 cout<<px<<","<<py<<","<<pz<<","<<pchi<<","<<pgxx<<","<<pgxy<<","<<pgxz<<","<<pgyy<<","<<pgyz<<","<<pgzz<<","<<pEx<<","
-//	     <<pEy<<","<<pEz<<","<<pBx<<","<<pBy<<","<<pBz<<","<<psi4RR<<","<<psi4II<<endl<<endl;
-
-// find back the one
-        pchi = pchi+1;
-
-	int countlm=0;
-	for(int pl=spinw;pl<maxl+1;pl++)
-          for(int pm=-pl;pm<pl+1;pm++)
-	  {
- 	 switch(lp)
-	 {
-	  case 0:  //+++ (theta, phi)
-          costheta = arcostheta[i];
-	  cosmphi = cos(pm * (j+0.5) * dphi);
-	  sinmphi = sin(pm * (j+0.5) * dphi);
-	  break;
-	  case 1:  //++- (pi-theta, phi)
-          costheta = -arcostheta[i];
-	  cosmphi = cos(pm * (j+0.5) * dphi);
-	  sinmphi = sin(pm * (j+0.5) * dphi);
-	  break;
-	  case 2:  //+-+ (theta, 2*pi-phi)
-          costheta = arcostheta[i];
-	  cosmphi = cos(pm * (j+0.5) * dphi);
-	  sinmphi =-sin(pm * (j+0.5) * dphi);
-	  break;
-	  case 3:  //+-- (pi-theta, 2*pi-phi)
-          costheta = -arcostheta[i];
-	  cosmphi = cos(pm * (j+0.5) * dphi);
-	  sinmphi =-sin(pm * (j+0.5) * dphi);
-	  break;
-	  case 4:  //-++ (theta, pi-phi)
-          costheta = arcostheta[i];
-	  cosmphi = cos(pm * (PI - (j+0.5) * dphi));
-	  sinmphi = sin(pm * (PI - (j+0.5) * dphi));
-	  break;
-	  case 5:  //-+- (pi-theta, pi-phi)
-          costheta = -arcostheta[i];
- 	  cosmphi = cos(pm * (PI - (j+0.5) * dphi));
-	  sinmphi = sin(pm * (PI - (j+0.5) * dphi));
-	  break;
-	  case 6:  //--+ (theta, pi+phi)
-          costheta = arcostheta[i];
-	  cosmphi = cos(pm * (PI + (j+0.5) * dphi));
-	  sinmphi = sin(pm * (PI + (j+0.5) * dphi));
-	  break;
-	  case 7:  //--- (pi-theta, pi+phi)
-          costheta = -arcostheta[i];
-	  cosmphi = cos(pm * (PI + (j+0.5) * dphi));
-	  sinmphi = sin(pm * (PI + (j+0.5) * dphi));
-	 }
-   	    thetap = sqrt((2*pl+1.0)/4.0/PI)*misc::Wigner_d_function(pl,pm,spinw,costheta); //note the variation from -2 to 2
-
-#ifdef GaussInt
-// wtcostheta is even function respect costheta
-            RP_out[countlm] = RP_out[countlm] + thetap/pchi/pchi * (psi4RR * cosmphi + psi4II * sinmphi)*wtcostheta[i];
-  	    IP_out[countlm] = IP_out[countlm] + thetap/pchi/pchi * (psi4II * cosmphi - psi4RR * sinmphi)*wtcostheta[i];
-	    if(pl==2 && pm==0) cout<<countlm+1<<","<<RP_out[countlm] * rex * dphi<<endl;
-#else	 
-            RP_out[countlm] = RP_out[countlm] + thetap/pchi/pchi * (psi4RR * cosmphi + psi4II * sinmphi);
-  	    IP_out[countlm] = IP_out[countlm] + thetap/pchi/pchi * (psi4II * cosmphi - psi4RR * sinmphi);
-#endif	 
-  	    countlm++;  //no sanity check for countlm and NN which should be noted in the input parameters
-	  }
-	}
-//        if(Symmetry == 2) MPI_Abort(MPI_COMM_WORLD,1);
-     }
-     MPI_Abort(MPI_COMM_WORLD,1);
-  }
-#else
-  int mp, Lp, Nmin, Nmax;
-
-  mp = n_tot / cpusize;
-  Lp = n_tot - cpusize * mp;
-
-  if (Lp > myrank)
-  {
-    Nmin = myrank * mp + myrank;
-    Nmax = Nmin + mp;
-  }
-  else
-  {
-    Nmin = myrank * mp + Lp;
-    Nmax = Nmin + mp - 1;
-  }
-
-  // theta part
-  double costheta, thetap;
-  double cosmphi, sinmphi;
-
-  int i, j;
-  int lpsy = 0;
-  if (Symmetry == 0)
-    lpsy = 1;
-  else if (Symmetry == 1)
-    lpsy = 2;
-  else if (Symmetry == 2)
-    lpsy = 8;
-
-  double psi4RR, psi4II;
-  double px, py, pz;
-  double pEx, pEy, pEz, pBx, pBy, pBz;
-  double pchi, pgxx, pgxy, pgxz, pgyy, pgyz, pgzz;
-  for (n = Nmin; n <= Nmax; n++)
-  {
-    //       need round off always
-    i = int(n / N_phi); // int(1.723) = 1, int(-1.732) = -1
-    j = n - i * N_phi;
-
-    int countlm = 0;
-    for (int pl = spinw; pl < maxl + 1; pl++)
-      for (int pm = -pl; pm < pl + 1; pm++)
-      {
-        for (int lp = 0; lp < lpsy; lp++)
-        {
-          px = pox[0][n];
-          py = pox[1][n];
-          pz = pox[2][n];
-          pEx = shellf[InList * n];
-          pEy = shellf[InList * n + 1];
-          pEz = shellf[InList * n + 2];
-          pBx = shellf[InList * n + 3];
-          pBy = shellf[InList * n + 4];
-          pBz = shellf[InList * n + 5];
-          pchi = shellf[InList * n + 6];
-          pgxx = shellf[InList * n + 7];
-          pgxy = shellf[InList * n + 8];
-          pgxz = shellf[InList * n + 9];
-          pgyy = shellf[InList * n + 10];
-          pgyz = shellf[InList * n + 11];
-          pgzz = shellf[InList * n + 12];
-          switch (lp)
-          {
-          case 0: //+++ (theta, phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = sin(pm * (j + 0.5) * dphi);
-            break;
-          case 1: //++- (pi-theta, phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = sin(pm * (j + 0.5) * dphi);
-            pz = -pz;
-            pEz = -pEz;
-            pBx = -pBx;
-            pBy = -pBy;
-            pgxz = -pgxz;
-            pgyz = -pgyz;
-            break;
-          case 2: //+-+ (theta, 2*pi-phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = -sin(pm * (j + 0.5) * dphi);
-            py = -py;
-            pEy = -pEy;
-            pBx = -pBx;
-            pBz = -pBz;
-            pgxy = -pgxy;
-            pgyz = -pgyz;
-            break;
-          case 3: //+-- (pi-theta, 2*pi-phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = -sin(pm * (j + 0.5) * dphi);
-            py = -py;
-            pz = -pz;
-            pEz = -pEz;
-            pBz = -pBz;
-            pgxz = -pgxz;
-            pEy = -pEy;
-            pBy = -pBy;
-            pgxy = -pgxy;
-            break;
-          case 4: //-++ (theta, pi-phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
-            px = -px;
-            pEx = -pEx;
-            pBy = -pBy;
-            pBz = -pBz;
-            pgxy = -pgxy;
-            pgxz = -pgxz;
-            break;
-          case 5: //-+- (pi-theta, pi-phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
-            pz = -pz;
-            px = -px;
-            pEz = -pEz;
-            pBz = -pBz;
-            pgyz = -pgyz;
-            pEx = -pEx;
-            pBx = -pBx;
-            pgxy = -pgxy;
-            break;
-          case 6: //--+ (theta, pi+phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
-            px = -px;
-            py = -py;
-            pEx = -pEx;
-            pBx = -pBx;
-            pgxz = -pgxz;
-            pEy = -pEy;
-            pBy = -pBy;
-            pgyz = -pgyz;
-            break;
-          case 7: //--- (pi-theta, pi+phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
-            px = -px;
-            py = -py;
-            pz = -pz;
-            pEx = -pEx;
-            pEy = -pEy;
-            pEz = -pEz;
-          }
-
-          funcs(px, py, pz, pchi, pgxx, pgxy, pgxz, pgyy, pgyz, pgzz, pEx, pEy, pEz, pBx, pBy, pBz,
-                psi4RR, psi4II);
-          thetap = sqrt((2 * pl + 1.0) / 4.0 / PI) * misc::Wigner_d_function(pl, pm, spinw, costheta); // note the variation from -2 to 2
-
-          //	 find back the one
-          pchi = pchi + 1;
-#ifdef GaussInt
-          // wtcostheta is even function respect costheta
-          RP_out[countlm] = RP_out[countlm] + thetap / pchi / pchi * (psi4RR * cosmphi + psi4II * sinmphi) * wtcostheta[i];
-          IP_out[countlm] = IP_out[countlm] + thetap / pchi / pchi * (psi4II * cosmphi - psi4RR * sinmphi) * wtcostheta[i];
-#else
-          RP_out[countlm] = RP_out[countlm] + thetap / pchi / pchi * (psi4RR * cosmphi + psi4II * sinmphi);
-          IP_out[countlm] = IP_out[countlm] + thetap / pchi / pchi * (psi4II * cosmphi - psi4RR * sinmphi);
-#endif
-        }
-        countlm++; // no sanity check for countlm and NN which should be noted in the input parameters
-      }
-  }
-#endif
-
-  for (int ii = 0; ii < NN; ii++)
-  {
-#ifdef GaussInt
-    RP_out[ii] = RP_out[ii] * rex * dphi;
-    IP_out[ii] = IP_out[ii] * rex * dphi;
-#else
-    RP_out[ii] = RP_out[ii] * rex * dphi * dcostheta;
-    IP_out[ii] = IP_out[ii] * rex * dphi * dcostheta;
-#endif
-  }
-  //|------+  Communicate and sum the results from each processor.
-
-  {
-    double *RPIP_out = new double[2 * NN];
-    double *RPIP = new double[2 * NN];
-    memcpy(RPIP_out, RP_out, NN * sizeof(double));
-    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
-    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    memcpy(RP, RPIP, NN * sizeof(double));
-    memcpy(IP, RPIP + NN, NN * sizeof(double));
-    delete[] RPIP_out;
-    delete[] RPIP;
-  }
-
-  //|------= Free memory.
-
-  delete[] pox[0];
-  delete[] pox[1];
-  delete[] pox[2];
-  delete[] shellf;
-  delete[] RP_out;
-  delete[] IP_out;
-  DG_List->clearList();
-}
-//|----------------------------------------------------------------
-//  for null shell patch2
-//|----------------------------------------------------------------
-// rex is x instead of r
-void surface_integral::surf_Wave(double rex, int lev, NullShellPatch2 *GH, var *Rpsi4, var *Ipsi4,
-                                 int spinw, int maxl, int NN, double *RP, double *IP,
-                                 monitor *Monitor) // NN is the length of RP and IP
-// spinw 0 for scalar; 1 for electricmagnetic wave; 2 for gravitaitonal wave
-// we always assume spinw >= 0
-{
-  const int InList = 2;
-
-  MyList<var> *DG_List = new MyList<var>(Rpsi4);
-  DG_List->insert(Ipsi4);
-
-  int n;
-  // since we used x instead of r, these global coordinates are fake
-  double *pox[3];
-  for (int i = 0; i < 3; i++)
-    pox[i] = new double[n_tot];
-  for (n = 0; n < n_tot; n++)
-  {
-    pox[0][n] = rex * nx_g[n];
-    pox[1][n] = rex * ny_g[n];
-    pox[2][n] = rex * nz_g[n];
-  }
-
-  double *shellf;
-  shellf = new double[n_tot * InList];
-
-  GH->Interp_Points_2D(DG_List, n_tot, pox, shellf, Symmetry);
-
-  int mp, Lp, Nmin, Nmax;
-
-  mp = n_tot / cpusize;
-  Lp = n_tot - cpusize * mp;
-
-  if (Lp > myrank)
-  {
-    Nmin = myrank * mp + myrank;
-    Nmax = Nmin + mp;
-  }
-  else
-  {
-    Nmin = myrank * mp + Lp;
-    Nmax = Nmin + mp - 1;
-  }
-
-  //|~~~~~> Integrate the dot product of Dphi with the surface normal.
-
-  double *RP_out, *IP_out;
-  RP_out = new double[NN];
-  IP_out = new double[NN];
-
-  for (int ii = 0; ii < NN; ii++)
-  {
-    RP_out[ii] = 0;
-    IP_out[ii] = 0;
-  }
-  // theta part
-  double costheta, thetap;
-  double cosmphi, sinmphi;
-
-  int i, j;
-  int lpsy = 0;
-  if (Symmetry == 0)
-    lpsy = 1;
-  else if (Symmetry == 1)
-    lpsy = 2;
-  else if (Symmetry == 2)
-    lpsy = 8;
-
-  double psi4RR, psi4II;
-  for (n = Nmin; n <= Nmax; n++)
-  {
-    //       need round off always
-    i = int(n / N_phi); // int(1.723) = 1, int(-1.732) = -1
-    j = n - i * N_phi;
-
-    int countlm = 0;
-    for (int pl = spinw; pl < maxl + 1; pl++)
-      for (int pm = -pl; pm < pl + 1; pm++)
-      {
-        for (int lp = 0; lp < lpsy; lp++)
-        {
-          switch (lp)
-          {
-          case 0: //+++ (theta, phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = sin(pm * (j + 0.5) * dphi);
-            psi4RR = shellf[InList * n];
-            psi4II = shellf[InList * n + 1];
-            break;
-          case 1: //++- (pi-theta, phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = sin(pm * (j + 0.5) * dphi);
-            psi4RR = shellf[InList * n];
-            psi4II = -shellf[InList * n + 1];
-            break;
-          case 2: //+-+ (theta, 2*pi-phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = -sin(pm * (j + 0.5) * dphi);
-            psi4RR = shellf[InList * n];
-            psi4II = -shellf[InList * n + 1];
-            break;
-          case 3: //+-- (pi-theta, 2*pi-phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = -sin(pm * (j + 0.5) * dphi);
-            psi4RR = shellf[InList * n];
-            psi4II = shellf[InList * n + 1];
-            break;
-          case 4: //-++ (theta, pi-phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
-            psi4RR = shellf[InList * n];
-            psi4II = -shellf[InList * n + 1];
-            break;
-          case 5: //-+- (pi-theta, pi-phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
-            psi4RR = shellf[InList * n];
-            psi4II = shellf[InList * n + 1];
-            break;
-          case 6: //--+ (theta, pi+phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
-            psi4RR = shellf[InList * n];
-            psi4II = shellf[InList * n + 1];
-            break;
-          case 7: //--- (pi-theta, pi+phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
-            psi4RR = shellf[InList * n];
-            psi4II = -shellf[InList * n + 1];
-          }
-
-          thetap = sqrt((2 * pl + 1.0) / 4.0 / PI) * misc::Wigner_d_function(pl, pm, spinw, costheta); // note the variation from -2 to 2
-                                                                                                       // based on Eq.(41) of PRD 77, 024027 (2008)
-#ifdef GaussInt
-          // wtcostheta is even function respect costheta
-          RP_out[countlm] = RP_out[countlm] + thetap * (psi4RR * cosmphi + psi4II * sinmphi) * wtcostheta[i];
-          IP_out[countlm] = IP_out[countlm] + thetap * (psi4II * cosmphi - psi4RR * sinmphi) * wtcostheta[i];
-#else
-          RP_out[countlm] = RP_out[countlm] + thetap * (psi4RR * cosmphi + psi4II * sinmphi); // + is because \bar of \bar{Y^s_lm} in Eq.(40)
-                                                                                              // of PRD 77, 024027 (2008)
-          IP_out[countlm] = IP_out[countlm] + thetap * (psi4II * cosmphi - psi4RR * sinmphi);
-#endif
-        }
-        countlm++; // no sanity check for countlm and NN which should be noted in the input parameters
-      }
-  }
-
-  for (int ii = 0; ii < NN; ii++)
-  {
-// do not need multiply with rex for null shell
-#ifdef GaussInt
-    RP_out[ii] = RP_out[ii] * dphi;
-    IP_out[ii] = IP_out[ii] * dphi;
-#else
-    RP_out[ii] = RP_out[ii] * dphi * dcostheta;
-    IP_out[ii] = IP_out[ii] * dphi * dcostheta;
-#endif
-  }
-  //|------+  Communicate and sum the results from each processor.
-
-  {
-    double *RPIP_out = new double[2 * NN];
-    double *RPIP = new double[2 * NN];
-    memcpy(RPIP_out, RP_out, NN * sizeof(double));
-    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
-    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    memcpy(RP, RPIP, NN * sizeof(double));
-    memcpy(IP, RPIP + NN, NN * sizeof(double));
-    delete[] RPIP_out;
-    delete[] RPIP;
-  }
-
-  //|------= Free memory.
-
-  delete[] pox[0];
-  delete[] pox[1];
-  delete[] pox[2];
-  delete[] shellf;
-  delete[] RP_out;
-  delete[] IP_out;
-  DG_List->clearList();
-}
-//|----------------------------------------------------------------
-//  for null shell patch
-//|----------------------------------------------------------------
-// rex is x instead of r
-void surface_integral::surf_Wave(double rex, int lev, NullShellPatch *GH, var *Rpsi4, var *Ipsi4,
-                                 int spinw, int maxl, int NN, double *RP, double *IP,
-                                 monitor *Monitor) // NN is the length of RP and IP
-// spinw 0 for scalar; 1 for electricmagnetic wave; 2 for gravitaitonal wave
-// we always assume spinw >= 0
-{
-  const int InList = 2;
-
-  MyList<var> *DG_List = new MyList<var>(Rpsi4);
-  DG_List->insert(Ipsi4);
-
-  int n;
-  // since we used x instead of r, these global coordinates are fake
-  double *pox[3];
-  for (int i = 0; i < 3; i++)
-    pox[i] = new double[n_tot];
-  for (n = 0; n < n_tot; n++)
-  {
-    pox[0][n] = rex * nx_g[n];
-    pox[1][n] = rex * ny_g[n];
-    pox[2][n] = rex * nz_g[n];
-  }
-
-  double *shellf;
-  shellf = new double[n_tot * InList];
-
-  GH->Interp_Points_2D(DG_List, n_tot, pox, shellf, Symmetry);
-
-  int mp, Lp, Nmin, Nmax;
-
-  mp = n_tot / cpusize;
-  Lp = n_tot - cpusize * mp;
-
-  if (Lp > myrank)
-  {
-    Nmin = myrank * mp + myrank;
-    Nmax = Nmin + mp;
-  }
-  else
-  {
-    Nmin = myrank * mp + Lp;
-    Nmax = Nmin + mp - 1;
-  }
-
-  //|~~~~~> Integrate the dot product of Dphi with the surface normal.
-
-  double *RP_out, *IP_out;
-  RP_out = new double[NN];
-  IP_out = new double[NN];
-
-  for (int ii = 0; ii < NN; ii++)
-  {
-    RP_out[ii] = 0;
-    IP_out[ii] = 0;
-  }
-  // theta part
-  double costheta, thetap;
-  double cosmphi, sinmphi;
-
-  int i, j;
-  int lpsy = 0;
-  if (Symmetry == 0)
-    lpsy = 1;
-  else if (Symmetry == 1)
-    lpsy = 2;
-  else if (Symmetry == 2)
-    lpsy = 8;
-
-  double psi4RR, psi4II;
-  for (n = Nmin; n <= Nmax; n++)
-  {
-    //       need round off always
-    i = int(n / N_phi); // int(1.723) = 1, int(-1.732) = -1
-    j = n - i * N_phi;
-
-    int countlm = 0;
-    for (int pl = spinw; pl < maxl + 1; pl++)
-      for (int pm = -pl; pm < pl + 1; pm++)
-      {
-        for (int lp = 0; lp < lpsy; lp++)
-        {
-          switch (lp)
-          {
-          case 0: //+++ (theta, phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = sin(pm * (j + 0.5) * dphi);
-            psi4RR = shellf[InList * n];
-            psi4II = shellf[InList * n + 1];
-            break;
-          case 1: //++- (pi-theta, phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = sin(pm * (j + 0.5) * dphi);
-            psi4RR = shellf[InList * n];
-            psi4II = -shellf[InList * n + 1];
-            break;
-          case 2: //+-+ (theta, 2*pi-phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = -sin(pm * (j + 0.5) * dphi);
-            psi4RR = shellf[InList * n];
-            psi4II = -shellf[InList * n + 1];
-            break;
-          case 3: //+-- (pi-theta, 2*pi-phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = -sin(pm * (j + 0.5) * dphi);
-            psi4RR = shellf[InList * n];
-            psi4II = shellf[InList * n + 1];
-            break;
-          case 4: //-++ (theta, pi-phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
-            psi4RR = shellf[InList * n];
-            psi4II = -shellf[InList * n + 1];
-            break;
-          case 5: //-+- (pi-theta, pi-phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
-            psi4RR = shellf[InList * n];
-            psi4II = shellf[InList * n + 1];
-            break;
-          case 6: //--+ (theta, pi+phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
-            psi4RR = shellf[InList * n];
-            psi4II = shellf[InList * n + 1];
-            break;
-          case 7: //--- (pi-theta, pi+phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
-            psi4RR = shellf[InList * n];
-            psi4II = -shellf[InList * n + 1];
-          }
-
-          thetap = sqrt((2 * pl + 1.0) / 4.0 / PI) * misc::Wigner_d_function(pl, pm, spinw, costheta); // note the variation from -2 to 2
-                                                                                                       // based on Eq.(41) of PRD 77, 024027 (2008)
-#ifdef GaussInt
-          // wtcostheta is even function respect costheta
-          RP_out[countlm] = RP_out[countlm] + thetap * (psi4RR * cosmphi + psi4II * sinmphi) * wtcostheta[i];
-          IP_out[countlm] = IP_out[countlm] + thetap * (psi4II * cosmphi - psi4RR * sinmphi) * wtcostheta[i];
-#else
-          RP_out[countlm] = RP_out[countlm] + thetap * (psi4RR * cosmphi + psi4II * sinmphi); // + is because \bar of \bar{Y^s_lm} in Eq.(40)
-                                                                                              // of PRD 77, 024027 (2008)
-          IP_out[countlm] = IP_out[countlm] + thetap * (psi4II * cosmphi - psi4RR * sinmphi);
-#endif
-        }
-        countlm++; // no sanity check for countlm and NN which should be noted in the input parameters
-      }
-  }
-
-  for (int ii = 0; ii < NN; ii++)
-  {
-// do not need multiply with rex for null shell
-#ifdef GaussInt
-    RP_out[ii] = RP_out[ii] * dphi;
-    IP_out[ii] = IP_out[ii] * dphi;
-#else
-    RP_out[ii] = RP_out[ii] * dphi * dcostheta;
-    IP_out[ii] = IP_out[ii] * dphi * dcostheta;
-#endif
-  }
-  //|------+  Communicate and sum the results from each processor.
-
-  {
-    double *RPIP_out = new double[2 * NN];
-    double *RPIP = new double[2 * NN];
-    memcpy(RPIP_out, RP_out, NN * sizeof(double));
-    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
-    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    memcpy(RP, RPIP, NN * sizeof(double));
-    memcpy(IP, RPIP + NN, NN * sizeof(double));
-    delete[] RPIP_out;
-    delete[] RPIP;
-  }
-
-  //|------= Free memory.
-
-  delete[] pox[0];
-  delete[] pox[1];
-  delete[] pox[2];
-  delete[] shellf;
-  delete[] RP_out;
-  delete[] IP_out;
-  DG_List->clearList();
-}
-//|----------------------------------------------------
-//|
-//| ADM mass, linear momentum and angular momentum
-//|
-//|----------------------------------------------------
-void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var *trK,
-                                     var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
-                                     var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
-                                     var *Gmx, var *Gmy, var *Gmz,
-                                     var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs, // temparay memory for mass^i
-                                     double *Rout, monitor *Monitor)
-{
-  if (myrank == 0 && GH->grids[lev] != 1)
-    if (Monitor && Monitor->outfile)
-      Monitor->outfile << "WARNING: surface integral on multipatches" << endl;
-    else
-      cout << "WARNING: surface integral on multipatches" << endl;
-
-  double mass, px, py, pz, sx, sy, sz;
-
-  MyList<Patch> *Pp = GH->PatL[lev];
-  while (Pp)
-  {
-    MyList<Block> *BP = Pp->data->blb;
-    while (BP)
-    {
-      Block *cg = BP->data;
-      if (myrank == cg->rank)
-      {
-        f_admmass_bssn(cg->shape, cg->X[0], cg->X[1], cg->X[2],
-                       cg->fgfs[chi->sgfn], cg->fgfs[trK->sgfn],
-                       cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
-                       cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn],
-                       cg->fgfs[Gmx->sgfn], cg->fgfs[Gmy->sgfn], cg->fgfs[Gmz->sgfn],
-                       cg->fgfs[Sfx_rhs->sgfn], cg->fgfs[Sfy_rhs->sgfn], cg->fgfs[Sfz_rhs->sgfn],
-                       Symmetry);
-      }
-      if (BP == Pp->data->ble)
-        break;
-      BP = BP->next;
-    }
-    Pp = Pp->next;
-  }
-
-  const int InList = 17;
-
-  MyList<var> *DG_List = new MyList<var>(Sfx_rhs);
-  DG_List->insert(Sfy_rhs);
-  DG_List->insert(Sfz_rhs);
-  DG_List->insert(chi);
-  DG_List->insert(trK);
-  DG_List->insert(gxx);
-  DG_List->insert(gxy);
-  DG_List->insert(gxz);
-  DG_List->insert(gyy);
-  DG_List->insert(gyz);
-  DG_List->insert(gzz);
-  DG_List->insert(Axx);
-  DG_List->insert(Axy);
-  DG_List->insert(Axz);
-  DG_List->insert(Ayy);
-  DG_List->insert(Ayz);
-  DG_List->insert(Azz);
-
-  int n;
-  double *pox[3];
-  for (int i = 0; i < 3; i++)
-    pox[i] = new double[n_tot];
-  for (n = 0; n < n_tot; n++)
-  {
-    pox[0][n] = rex * nx_g[n];
-    pox[1][n] = rex * ny_g[n];
-    pox[2][n] = rex * nz_g[n];
-  }
-
-  int mp, Lp, Nmin, Nmax;
-  mp = n_tot / cpusize;
-  Lp = n_tot - cpusize * mp;
-  if (Lp > myrank)
-  {
-    Nmin = myrank * mp + myrank;
-    Nmax = Nmin + mp;
-  }
-  else
-  {
-    Nmin = myrank * mp + Lp;
-    Nmax = Nmin + mp - 1;
-  }
-
-  double *shellf;
-  shellf = new double[n_tot * InList];
-
-  // we have assumed there is only one box on this level,
-  // so we do not need loop boxes
-  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);
-
-  double Mass_out = 0;
-  double ang_outx, ang_outy, ang_outz;
-  double p_outx, p_outy, p_outz;
-  ang_outx = ang_outy = ang_outz = 0.0;
-  p_outx = p_outy = p_outz = 0.0;
-  const double f1o8 = 0.125;
-
-  double Chi, Psi;
-  double Gxx, Gxy, Gxz, Gyy, Gyz, Gzz;
-  double gupxx, gupxy, gupxz, gupyy, gupyz, gupzz;
-  double TRK, axx, axy, axz, ayy, ayz, azz;
-  double aupxx, aupxy, aupxz, aupyx, aupyy, aupyz, aupzx, aupzy, aupzz;
-  int i;
-  for (n = Nmin; n <= Nmax; n++)
-  {
-    //       need round off always
-    i = int(n / N_phi); // int(1.723) = 1, int(-1.732) = -1
-
-    Chi = shellf[InList * n + 3]; // chi in fact
-    TRK = shellf[InList * n + 4];
-    Gxx = shellf[InList * n + 5] + 1.0;
-    Gxy = shellf[InList * n + 6];
-    Gxz = shellf[InList * n + 7];
-    Gyy = shellf[InList * n + 8] + 1.0;
-    Gyz = shellf[InList * n + 9];
-    Gzz = shellf[InList * n + 10] + 1.0;
-    axx = shellf[InList * n + 11];
-    axy = shellf[InList * n + 12];
-    axz = shellf[InList * n + 13];
-    ayy = shellf[InList * n + 14];
-    ayz = shellf[InList * n + 15];
-    azz = shellf[InList * n + 16];
-
-    Chi = 1.0 / (1.0 + Chi); // exp(4*phi)
-    Psi = Chi * sqrt(Chi);   // Psi^6
-
-// Chi^2 corresponds to metric determinant
-// but this factor has been considered in f_admmass_bssn
-#ifdef GaussInt
-    // wtcostheta is even function respect costheta
-    Mass_out = Mass_out + (shellf[InList * n] * nx_g[n] + shellf[InList * n + 1] * ny_g[n] + shellf[InList * n + 2] * nz_g[n]) * wtcostheta[i];
-#else
-    Mass_out = Mass_out + (shellf[InList * n] * nx_g[n] + shellf[InList * n + 1] * ny_g[n] + shellf[InList * n + 2] * nz_g[n]);
-#endif
-
-    gupzz = Gxx * Gyy * Gzz + Gxy * Gyz * Gxz + Gxz * Gxy * Gyz -
-            Gxz * Gyy * Gxz - Gxy * Gxy * Gzz - Gxx * Gyz * Gyz;
-    gupxx = (Gyy * Gzz - Gyz * Gyz) / gupzz;
-    gupxy = -(Gxy * Gzz - Gyz * Gxz) / gupzz;
-    gupxz = (Gxy * Gyz - Gyy * Gxz) / gupzz;
-    gupyy = (Gxx * Gzz - Gxz * Gxz) / gupzz;
-    gupyz = -(Gxx * Gyz - Gxy * Gxz) / gupzz;
-    gupzz = (Gxx * Gyy - Gxy * Gxy) / gupzz;
-
-    aupxx = gupxx * axx + gupxy * axy + gupxz * axz;
-    aupxy = gupxx * axy + gupxy * ayy + gupxz * ayz;
-    aupxz = gupxx * axz + gupxy * ayz + gupxz * azz;
-    aupyx = gupxy * axx + gupyy * axy + gupyz * axz;
-    aupyy = gupxy * axy + gupyy * ayy + gupyz * ayz;
-    aupyz = gupxy * axz + gupyy * ayz + gupyz * azz;
-    aupzx = gupxz * axx + gupyz * axy + gupzz * axz;
-    aupzy = gupxz * axy + gupyz * ayy + gupzz * ayz;
-    aupzz = gupxz * axz + gupyz * ayz + gupzz * azz;
-    if (Symmetry == 0)
-    {
-#ifdef GaussInt
-      // wtcostheta is even function respect costheta
-      //  1/8\pi \int \psi^6 (y A^m_z - zA^m_y) dS_m
-      ang_outx = ang_outx + f1o8 * Psi * (nx_g[n] * (pox[1][n] * aupxz - pox[2][n] * aupxy) + ny_g[n] * (pox[1][n] * aupyz - pox[2][n] * aupyy) + nz_g[n] * (pox[1][n] * aupzz - pox[2][n] * aupzy)) * wtcostheta[i];
-      //  1/8\pi \int \psi^6 (z A^m_x - xA^m_z) dS_m
-      ang_outy = ang_outy + f1o8 * Psi * (nx_g[n] * (pox[2][n] * aupxx - pox[0][n] * aupxz) + ny_g[n] * (pox[2][n] * aupyx - pox[0][n] * aupyz) + nz_g[n] * (pox[2][n] * aupzx - pox[0][n] * aupzz)) * wtcostheta[i];
-      // 1/8\pi \int \psi^6 (x A^m_y - yA^m_x) dS_m
-      ang_outz = ang_outz + f1o8 * Psi * (nx_g[n] * (pox[0][n] * aupxy - pox[1][n] * aupxx) + ny_g[n] * (pox[0][n] * aupyy - pox[1][n] * aupyx) + nz_g[n] * (pox[0][n] * aupzy - pox[1][n] * aupzx)) * wtcostheta[i];
-#else
-      //  1/8\pi \int \psi^6 (y A^m_z - zA^m_y) dS_m
-      ang_outx = ang_outx + f1o8 * Psi * (nx_g[n] * (pox[1][n] * aupxz - pox[2][n] * aupxy) + ny_g[n] * (pox[1][n] * aupyz - pox[2][n] * aupyy) + nz_g[n] * (pox[1][n] * aupzz - pox[2][n] * aupzy));
-      //  1/8\pi \int \psi^6 (z A^m_x - xA^m_z) dS_m
-      ang_outy = ang_outy + f1o8 * Psi * (nx_g[n] * (pox[2][n] * aupxx - pox[0][n] * aupxz) + ny_g[n] * (pox[2][n] * aupyx - pox[0][n] * aupyz) + nz_g[n] * (pox[2][n] * aupzx - pox[0][n] * aupzz));
-      // 1/8\pi \int \psi^6 (x A^m_y - yA^m_x) dS_m
-      ang_outz = ang_outz + f1o8 * Psi * (nx_g[n] * (pox[0][n] * aupxy - pox[1][n] * aupxx) + ny_g[n] * (pox[0][n] * aupyy - pox[1][n] * aupyx) + nz_g[n] * (pox[0][n] * aupzy - pox[1][n] * aupzx));
-#endif
-    }
-    else if (Symmetry == 1)
-    {
-#ifdef GaussInt
-      ang_outz = ang_outz + f1o8 * Psi * (nx_g[n] * (pox[0][n] * aupxy - pox[1][n] * aupxx) + ny_g[n] * (pox[0][n] * aupyy - pox[1][n] * aupyx) + nz_g[n] * (pox[0][n] * aupzy - pox[1][n] * aupzx)) * wtcostheta[i];
-#else
-      ang_outz = ang_outz + f1o8 * Psi * (nx_g[n] * (pox[0][n] * aupxy - pox[1][n] * aupxx) + ny_g[n] * (pox[0][n] * aupyy - pox[1][n] * aupyx) + nz_g[n] * (pox[0][n] * aupzy - pox[1][n] * aupzx));
-#endif
-    }
-
-    axx = Chi * (axx + Gxx * TRK / 3.0);
-    axy = Chi * (axy + Gxy * TRK / 3.0);
-    axz = Chi * (axz + Gxz * TRK / 3.0);
-    ayy = Chi * (ayy + Gyy * TRK / 3.0);
-    ayz = Chi * (ayz + Gyz * TRK / 3.0);
-    azz = Chi * (azz + Gzz * TRK / 3.0);
-
-    axx = axx - TRK;
-    ayy = ayy - TRK;
-    azz = azz - TRK;
-
-    // 1/8\pi \int \psi^6 (K_mi - \delta_mi trK) dS^m: lower index linear momentum
-    if (Symmetry == 0)
-    {
-#ifdef GaussInt
-      p_outx = p_outx + f1o8 * Psi * (nx_g[n] * axx + ny_g[n] * axy + nz_g[n] * axz) * wtcostheta[i];
-      p_outy = p_outy + f1o8 * Psi * (nx_g[n] * axy + ny_g[n] * ayy + nz_g[n] * ayz) * wtcostheta[i];
-      p_outz = p_outz + f1o8 * Psi * (nx_g[n] * axz + ny_g[n] * ayz + nz_g[n] * azz) * wtcostheta[i];
-#else
-      p_outx = p_outx + f1o8 * Psi * (nx_g[n] * axx + ny_g[n] * axy + nz_g[n] * axz);
-      p_outy = p_outy + f1o8 * Psi * (nx_g[n] * axy + ny_g[n] * ayy + nz_g[n] * ayz);
-      p_outz = p_outz + f1o8 * Psi * (nx_g[n] * axz + ny_g[n] * ayz + nz_g[n] * azz);
-#endif
-    }
-    else if (Symmetry == 1)
-    {
-#ifdef GaussInt
-      p_outx = p_outx + f1o8 * Psi * (nx_g[n] * axx + ny_g[n] * axy + nz_g[n] * axz) * wtcostheta[i];
-      p_outy = p_outy + f1o8 * Psi * (nx_g[n] * axy + ny_g[n] * ayy + nz_g[n] * ayz) * wtcostheta[i];
-#else
-      p_outx = p_outx + f1o8 * Psi * (nx_g[n] * axx + ny_g[n] * axy + nz_g[n] * axz);
-      p_outy = p_outy + f1o8 * Psi * (nx_g[n] * axy + ny_g[n] * ayy + nz_g[n] * ayz);
-#endif
-    }
-  }
-
-  {
-    double scalar_out[7] = {Mass_out, ang_outx, ang_outy, ang_outz, p_outx, p_outy, p_outz};
-    double scalar_in[7];
-    MPI_Allreduce(scalar_out, scalar_in, 7, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    mass = scalar_in[0]; sx = scalar_in[1]; sy = scalar_in[2]; sz = scalar_in[3];
-    px = scalar_in[4]; py = scalar_in[5]; pz = scalar_in[6];
-  }
-
-#ifdef GaussInt
-  mass = mass * rex * rex * dphi * factor;
-
-  sx = sx * rex * rex * dphi * (1.0 / PI) * factor;
-  sy = sy * rex * rex * dphi * (1.0 / PI) * factor;
-  sz = sz * rex * rex * dphi * (1.0 / PI) * factor;
-
-  px = px * rex * rex * dphi * (1.0 / PI) * factor;
-  py = py * rex * rex * dphi * (1.0 / PI) * factor;
-  pz = pz * rex * rex * dphi * (1.0 / PI) * factor;
-#else
-  mass = mass * rex * rex * dphi * dcostheta * factor;
-
-  sx = sx * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
-  sy = sy * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
-  sz = sz * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
-
-  px = px * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
-  py = py * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
-  pz = pz * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
-#endif
-
-  Rout[0] = mass;
-  Rout[1] = px;
-  Rout[2] = py;
-  Rout[3] = pz;
-  Rout[4] = sx;
-  Rout[5] = sy;
-  Rout[6] = sz;
-
-  delete[] pox[0];
-  delete[] pox[1];
-  delete[] pox[2];
-  delete[] shellf;
-  DG_List->clearList();
-}
-void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var *trK,
-                                     var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
-                                     var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
-                                     var *Gmx, var *Gmy, var *Gmz,
-                                     var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs, // temparay memory for mass^i
-                                     double *Rout, monitor *Monitor, MPI_Comm Comm_here)
-{
-  int lmyrank;
-  MPI_Comm_rank(Comm_here, &lmyrank);
-  if (lmyrank == 0 && GH->grids[lev] != 1)
-    if (Monitor && Monitor->outfile)
-      Monitor->outfile << "WARNING: surface integral on multipatches" << endl;
-    else
-      cout << "WARNING: surface integral on multipatches" << endl;
-
-  double mass, px, py, pz, sx, sy, sz;
-
-  MyList<Patch> *Pp = GH->PatL[lev];
-  while (Pp)
-  {
-    MyList<Block> *BP = Pp->data->blb;
-    while (BP)
-    {
-      Block *cg = BP->data;
-      if (myrank == cg->rank)
-      {
-        f_admmass_bssn(cg->shape, cg->X[0], cg->X[1], cg->X[2],
-                       cg->fgfs[chi->sgfn], cg->fgfs[trK->sgfn],
-                       cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
-                       cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn],
-                       cg->fgfs[Gmx->sgfn], cg->fgfs[Gmy->sgfn], cg->fgfs[Gmz->sgfn],
-                       cg->fgfs[Sfx_rhs->sgfn], cg->fgfs[Sfy_rhs->sgfn], cg->fgfs[Sfz_rhs->sgfn],
-                       Symmetry);
-      }
-      if (BP == Pp->data->ble)
-        break;
-      BP = BP->next;
-    }
-    Pp = Pp->next;
-  }
-
-  const int InList = 17;
-
-  MyList<var> *DG_List = new MyList<var>(Sfx_rhs);
-  DG_List->insert(Sfy_rhs);
-  DG_List->insert(Sfz_rhs);
-  DG_List->insert(chi);
-  DG_List->insert(trK);
-  DG_List->insert(gxx);
-  DG_List->insert(gxy);
-  DG_List->insert(gxz);
-  DG_List->insert(gyy);
-  DG_List->insert(gyz);
-  DG_List->insert(gzz);
-  DG_List->insert(Axx);
-  DG_List->insert(Axy);
-  DG_List->insert(Axz);
-  DG_List->insert(Ayy);
-  DG_List->insert(Ayz);
-  DG_List->insert(Azz);
-
-  int n;
-  double *pox[3];
-  for (int i = 0; i < 3; i++)
-    pox[i] = new double[n_tot];
-  for (n = 0; n < n_tot; n++)
-  {
-    pox[0][n] = rex * nx_g[n];
-    pox[1][n] = rex * ny_g[n];
-    pox[2][n] = rex * nz_g[n];
-  }
-
-  double *shellf;
-  shellf = new double[n_tot * InList];
-
-  // we have assumed there is only one box on this level,
-  // so we do not need loop boxes
-  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Comm_here);
-
-  double Mass_out = 0;
-  double ang_outx, ang_outy, ang_outz;
-  double p_outx, p_outy, p_outz;
-  ang_outx = ang_outy = ang_outz = 0.0;
-  p_outx = p_outy = p_outz = 0.0;
-  const double f1o8 = 0.125;
-
-  int mp, Lp, Nmin, Nmax;
-
-  int cpusize_here;
-  MPI_Comm_size(Comm_here, &cpusize_here);
-
-  mp = n_tot / cpusize_here;
-  Lp = n_tot - cpusize_here * mp;
-
-  if (Lp > lmyrank)
-  {
-    Nmin = lmyrank * mp + lmyrank;
-    Nmax = Nmin + mp;
-  }
-  else
-  {
-    Nmin = lmyrank * mp + Lp;
-    Nmax = Nmin + mp - 1;
-  }
-
-  double Chi, Psi;
-  double Gxx, Gxy, Gxz, Gyy, Gyz, Gzz;
-  double gupxx, gupxy, gupxz, gupyy, gupyz, gupzz;
-  double TRK, axx, axy, axz, ayy, ayz, azz;
-  double aupxx, aupxy, aupxz, aupyx, aupyy, aupyz, aupzx, aupzy, aupzz;
-  int i;
-  for (n = Nmin; n <= Nmax; n++)
-  {
-    //       need round off always
-    i = int(n / N_phi); // int(1.723) = 1, int(-1.732) = -1
-
-    Chi = shellf[InList * n + 3]; // chi in fact
-    TRK = shellf[InList * n + 4];
-    Gxx = shellf[InList * n + 5] + 1.0;
-    Gxy = shellf[InList * n + 6];
-    Gxz = shellf[InList * n + 7];
-    Gyy = shellf[InList * n + 8] + 1.0;
-    Gyz = shellf[InList * n + 9];
-    Gzz = shellf[InList * n + 10] + 1.0;
-    axx = shellf[InList * n + 11];
-    axy = shellf[InList * n + 12];
-    axz = shellf[InList * n + 13];
-    ayy = shellf[InList * n + 14];
-    ayz = shellf[InList * n + 15];
-    azz = shellf[InList * n + 16];
-
-    Chi = 1.0 / (1.0 + Chi); // exp(4*phi)
-    Psi = Chi * sqrt(Chi);   // Psi^6
-
-// Chi^2 corresponds to metric determinant
-// but this factor has been considered in f_admmass_bssn
-#ifdef GaussInt
-    // wtcostheta is even function respect costheta
-    Mass_out = Mass_out + (shellf[InList * n] * nx_g[n] + shellf[InList * n + 1] * ny_g[n] + shellf[InList * n + 2] * nz_g[n]) * wtcostheta[i];
-#else
-    Mass_out = Mass_out + (shellf[InList * n] * nx_g[n] + shellf[InList * n + 1] * ny_g[n] + shellf[InList * n + 2] * nz_g[n]);
-#endif
-
-    gupzz = Gxx * Gyy * Gzz + Gxy * Gyz * Gxz + Gxz * Gxy * Gyz -
-            Gxz * Gyy * Gxz - Gxy * Gxy * Gzz - Gxx * Gyz * Gyz;
-    gupxx = (Gyy * Gzz - Gyz * Gyz) / gupzz;
-    gupxy = -(Gxy * Gzz - Gyz * Gxz) / gupzz;
-    gupxz = (Gxy * Gyz - Gyy * Gxz) / gupzz;
-    gupyy = (Gxx * Gzz - Gxz * Gxz) / gupzz;
-    gupyz = -(Gxx * Gyz - Gxy * Gxz) / gupzz;
-    gupzz = (Gxx * Gyy - Gxy * Gxy) / gupzz;
-
-    aupxx = gupxx * axx + gupxy * axy + gupxz * axz;
-    aupxy = gupxx * axy + gupxy * ayy + gupxz * ayz;
-    aupxz = gupxx * axz + gupxy * ayz + gupxz * azz;
-    aupyx = gupxy * axx + gupyy * axy + gupyz * axz;
-    aupyy = gupxy * axy + gupyy * ayy + gupyz * ayz;
-    aupyz = gupxy * axz + gupyy * ayz + gupyz * azz;
-    aupzx = gupxz * axx + gupyz * axy + gupzz * axz;
-    aupzy = gupxz * axy + gupyz * ayy + gupzz * ayz;
-    aupzz = gupxz * axz + gupyz * ayz + gupzz * azz;
-    if (Symmetry == 0)
-    {
-#ifdef GaussInt
-      // wtcostheta is even function respect costheta
-      //  1/8\pi \int \psi^6 (y A^m_z - zA^m_y) dS_m
-      ang_outx = ang_outx + f1o8 * Psi * (nx_g[n] * (pox[1][n] * aupxz - pox[2][n] * aupxy) + ny_g[n] * (pox[1][n] * aupyz - pox[2][n] * aupyy) + nz_g[n] * (pox[1][n] * aupzz - pox[2][n] * aupzy)) * wtcostheta[i];
-      //  1/8\pi \int \psi^6 (z A^m_x - xA^m_z) dS_m
-      ang_outy = ang_outy + f1o8 * Psi * (nx_g[n] * (pox[2][n] * aupxx - pox[0][n] * aupxz) + ny_g[n] * (pox[2][n] * aupyx - pox[0][n] * aupyz) + nz_g[n] * (pox[2][n] * aupzx - pox[0][n] * aupzz)) * wtcostheta[i];
-      // 1/8\pi \int \psi^6 (x A^m_y - yA^m_x) dS_m
-      ang_outz = ang_outz + f1o8 * Psi * (nx_g[n] * (pox[0][n] * aupxy - pox[1][n] * aupxx) + ny_g[n] * (pox[0][n] * aupyy - pox[1][n] * aupyx) + nz_g[n] * (pox[0][n] * aupzy - pox[1][n] * aupzx)) * wtcostheta[i];
-#else
-      //  1/8\pi \int \psi^6 (y A^m_z - zA^m_y) dS_m
-      ang_outx = ang_outx + f1o8 * Psi * (nx_g[n] * (pox[1][n] * aupxz - pox[2][n] * aupxy) + ny_g[n] * (pox[1][n] * aupyz - pox[2][n] * aupyy) + nz_g[n] * (pox[1][n] * aupzz - pox[2][n] * aupzy));
-      //  1/8\pi \int \psi^6 (z A^m_x - xA^m_z) dS_m
-      ang_outy = ang_outy + f1o8 * Psi * (nx_g[n] * (pox[2][n] * aupxx - pox[0][n] * aupxz) + ny_g[n] * (pox[2][n] * aupyx - pox[0][n] * aupyz) + nz_g[n] * (pox[2][n] * aupzx - pox[0][n] * aupzz));
-      // 1/8\pi \int \psi^6 (x A^m_y - yA^m_x) dS_m
-      ang_outz = ang_outz + f1o8 * Psi * (nx_g[n] * (pox[0][n] * aupxy - pox[1][n] * aupxx) + ny_g[n] * (pox[0][n] * aupyy - pox[1][n] * aupyx) + nz_g[n] * (pox[0][n] * aupzy - pox[1][n] * aupzx));
-#endif
-    }
-    else if (Symmetry == 1)
-    {
-#ifdef GaussInt
-      ang_outz = ang_outz + f1o8 * Psi * (nx_g[n] * (pox[0][n] * aupxy - pox[1][n] * aupxx) + ny_g[n] * (pox[0][n] * aupyy - pox[1][n] * aupyx) + nz_g[n] * (pox[0][n] * aupzy - pox[1][n] * aupzx)) * wtcostheta[i];
-#else
-      ang_outz = ang_outz + f1o8 * Psi * (nx_g[n] * (pox[0][n] * aupxy - pox[1][n] * aupxx) + ny_g[n] * (pox[0][n] * aupyy - pox[1][n] * aupyx) + nz_g[n] * (pox[0][n] * aupzy - pox[1][n] * aupzx));
-#endif
-    }
-
-    axx = Chi * (axx + Gxx * TRK / 3.0);
-    axy = Chi * (axy + Gxy * TRK / 3.0);
-    axz = Chi * (axz + Gxz * TRK / 3.0);
-    ayy = Chi * (ayy + Gyy * TRK / 3.0);
-    ayz = Chi * (ayz + Gyz * TRK / 3.0);
-    azz = Chi * (azz + Gzz * TRK / 3.0);
-
-    axx = axx - TRK;
-    ayy = ayy - TRK;
-    azz = azz - TRK;
-
-    // 1/8\pi \int \psi^6 (K_mi - \delta_mi trK) dS^m: lower index linear momentum
-    if (Symmetry == 0)
-    {
-#ifdef GaussInt
-      p_outx = p_outx + f1o8 * Psi * (nx_g[n] * axx + ny_g[n] * axy + nz_g[n] * axz) * wtcostheta[i];
-      p_outy = p_outy + f1o8 * Psi * (nx_g[n] * axy + ny_g[n] * ayy + nz_g[n] * ayz) * wtcostheta[i];
-      p_outz = p_outz + f1o8 * Psi * (nx_g[n] * axz + ny_g[n] * ayz + nz_g[n] * azz) * wtcostheta[i];
-#else
-      p_outx = p_outx + f1o8 * Psi * (nx_g[n] * axx + ny_g[n] * axy + nz_g[n] * axz);
-      p_outy = p_outy + f1o8 * Psi * (nx_g[n] * axy + ny_g[n] * ayy + nz_g[n] * ayz);
-      p_outz = p_outz + f1o8 * Psi * (nx_g[n] * axz + ny_g[n] * ayz + nz_g[n] * azz);
-#endif
-    }
-    else if (Symmetry == 1)
-    {
-#ifdef GaussInt
-      p_outx = p_outx + f1o8 * Psi * (nx_g[n] * axx + ny_g[n] * axy + nz_g[n] * axz) * wtcostheta[i];
-      p_outy = p_outy + f1o8 * Psi * (nx_g[n] * axy + ny_g[n] * ayy + nz_g[n] * ayz) * wtcostheta[i];
-#else
-      p_outx = p_outx + f1o8 * Psi * (nx_g[n] * axx + ny_g[n] * axy + nz_g[n] * axz);
-      p_outy = p_outy + f1o8 * Psi * (nx_g[n] * axy + ny_g[n] * ayy + nz_g[n] * ayz);
-#endif
-    }
-  }
-
-  {
-    double scalar_out[7] = {Mass_out, ang_outx, ang_outy, ang_outz, p_outx, p_outy, p_outz};
-    double scalar_in[7];
-    MPI_Allreduce(scalar_out, scalar_in, 7, MPI_DOUBLE, MPI_SUM, Comm_here);
-    mass = scalar_in[0]; sx = scalar_in[1]; sy = scalar_in[2]; sz = scalar_in[3];
-    px = scalar_in[4]; py = scalar_in[5]; pz = scalar_in[6];
-  }
-
-#ifdef GaussInt
-  mass = mass * rex * rex * dphi * factor;
-
-  sx = sx * rex * rex * dphi * (1.0 / PI) * factor;
-  sy = sy * rex * rex * dphi * (1.0 / PI) * factor;
-  sz = sz * rex * rex * dphi * (1.0 / PI) * factor;
-
-  px = px * rex * rex * dphi * (1.0 / PI) * factor;
-  py = py * rex * rex * dphi * (1.0 / PI) * factor;
-  pz = pz * rex * rex * dphi * (1.0 / PI) * factor;
-#else
-  mass = mass * rex * rex * dphi * dcostheta * factor;
-
-  sx = sx * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
-  sy = sy * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
-  sz = sz * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
-
-  px = px * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
-  py = py * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
-  pz = pz * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
-#endif
-
-  Rout[0] = mass;
-  Rout[1] = px;
-  Rout[2] = py;
-  Rout[3] = pz;
-  Rout[4] = sx;
-  Rout[5] = sy;
-  Rout[6] = sz;
-
-  delete[] pox[0];
-  delete[] pox[1];
-  delete[] pox[2];
-  delete[] shellf;
-  DG_List->clearList();
-}
-//|----------------------------------------------------------------
-//  for shell patch
-//|----------------------------------------------------------------
-void surface_integral::surf_MassPAng(double rex, int lev, ShellPatch *GH, var *chi, var *trK,
-                                     var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
-                                     var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
-                                     var *Gmx, var *Gmy, var *Gmz,
-                                     var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs, // temparay memory for mass^i
-                                     double *Rout, monitor *Monitor)
-{
-  if (lev != 0)
-  {
-    if (myrank == 0)
-    {
-      if (Monitor && Monitor->outfile)
-        Monitor->outfile << "WARNING: shell surface integral not on level 0" << endl;
-      else
-        cout << "WARNING: shell surface integral not on level 0" << endl;
-    }
-    return;
-  }
-
-  double mass, px, py, pz, sx, sy, sz;
-
-  MyList<ss_patch> *Pp = GH->PatL;
-  while (Pp)
-  {
-    MyList<Block> *BL = Pp->data->blb;
-    int fngfs = Pp->data->fngfs;
-    while (BL)
-    {
-      Block *cg = BL->data;
-      if (myrank == cg->rank)
-      {
-        f_admmass_bssn_ss(cg->shape, cg->X[0], cg->X[1], cg->X[2],
-                          cg->fgfs[fngfs + ShellPatch::gx], cg->fgfs[fngfs + ShellPatch::gy], cg->fgfs[fngfs + ShellPatch::gz],
-                          cg->fgfs[fngfs + ShellPatch::drhodx], cg->fgfs[fngfs + ShellPatch::drhody], cg->fgfs[fngfs + ShellPatch::drhodz],
-                          cg->fgfs[fngfs + ShellPatch::dsigmadx], cg->fgfs[fngfs + ShellPatch::dsigmady], cg->fgfs[fngfs + ShellPatch::dsigmadz],
-                          cg->fgfs[fngfs + ShellPatch::dRdx], cg->fgfs[fngfs + ShellPatch::dRdy], cg->fgfs[fngfs + ShellPatch::dRdz],
-                          cg->fgfs[fngfs + ShellPatch::drhodxx], cg->fgfs[fngfs + ShellPatch::drhodxy], cg->fgfs[fngfs + ShellPatch::drhodxz],
-                          cg->fgfs[fngfs + ShellPatch::drhodyy], cg->fgfs[fngfs + ShellPatch::drhodyz], cg->fgfs[fngfs + ShellPatch::drhodzz],
-                          cg->fgfs[fngfs + ShellPatch::dsigmadxx], cg->fgfs[fngfs + ShellPatch::dsigmadxy], cg->fgfs[fngfs + ShellPatch::dsigmadxz],
-                          cg->fgfs[fngfs + ShellPatch::dsigmadyy], cg->fgfs[fngfs + ShellPatch::dsigmadyz], cg->fgfs[fngfs + ShellPatch::dsigmadzz],
-                          cg->fgfs[fngfs + ShellPatch::dRdxx], cg->fgfs[fngfs + ShellPatch::dRdxy], cg->fgfs[fngfs + ShellPatch::dRdxz],
-                          cg->fgfs[fngfs + ShellPatch::dRdyy], cg->fgfs[fngfs + ShellPatch::dRdyz], cg->fgfs[fngfs + ShellPatch::dRdzz],
-                          cg->fgfs[chi->sgfn], cg->fgfs[trK->sgfn],
-                          cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
-                          cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn],
-                          cg->fgfs[Gmx->sgfn], cg->fgfs[Gmy->sgfn], cg->fgfs[Gmz->sgfn],
-                          cg->fgfs[Sfx_rhs->sgfn], cg->fgfs[Sfy_rhs->sgfn], cg->fgfs[Sfz_rhs->sgfn],
-                          Symmetry, Pp->data->sst);
-      }
-      if (BL == Pp->data->ble)
-        break;
-      BL = BL->next;
-    }
-    Pp = Pp->next;
-  }
-
-  const int InList = 17;
-
-  MyList<var> *DG_List = new MyList<var>(Sfx_rhs);
-  DG_List->insert(Sfy_rhs);
-  DG_List->insert(Sfz_rhs);
-  DG_List->insert(chi);
-  DG_List->insert(trK);
-  DG_List->insert(gxx);
-  DG_List->insert(gxy);
-  DG_List->insert(gxz);
-  DG_List->insert(gyy);
-  DG_List->insert(gyz);
-  DG_List->insert(gzz);
-  DG_List->insert(Axx);
-  DG_List->insert(Axy);
-  DG_List->insert(Axz);
-  DG_List->insert(Ayy);
-  DG_List->insert(Ayz);
-  DG_List->insert(Azz);
-
-  int n;
-  double *pox[3];
-  for (int i = 0; i < 3; i++)
-    pox[i] = new double[n_tot];
-  for (n = 0; n < n_tot; n++)
-  {
-    pox[0][n] = rex * nx_g[n];
-    pox[1][n] = rex * ny_g[n];
-    pox[2][n] = rex * nz_g[n];
-  }
-
-  double *shellf;
-  shellf = new double[n_tot * InList];
-
-  // we have assumed there is only one box on this level,
-  // so we do not need loop boxes
-  GH->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry);
-
-  double Mass_out = 0;
-  double ang_outx, ang_outy, ang_outz;
-  double p_outx, p_outy, p_outz;
-  ang_outx = ang_outy = ang_outz = 0.0;
-  p_outx = p_outy = p_outz = 0.0;
-  const double f1o8 = 0.125;
-
-  int mp, Lp, Nmin, Nmax;
-
-  mp = n_tot / cpusize;
-  Lp = n_tot - cpusize * mp;
-
-  if (Lp > myrank)
-  {
-    Nmin = myrank * mp + myrank;
-    Nmax = Nmin + mp;
-  }
-  else
-  {
-    Nmin = myrank * mp + Lp;
-    Nmax = Nmin + mp - 1;
-  }
-
-  double Chi, Psi;
-  double Gxx, Gxy, Gxz, Gyy, Gyz, Gzz;
-  double gupxx, gupxy, gupxz, gupyy, gupyz, gupzz;
-  double TRK, axx, axy, axz, ayy, ayz, azz;
-  double aupxx, aupxy, aupxz, aupyx, aupyy, aupyz, aupzx, aupzy, aupzz;
-  int i;
-  for (n = Nmin; n <= Nmax; n++)
-  {
-    //       need round off always
-    i = int(n / N_phi); // int(1.723) = 1, int(-1.732) = -1
-
-    Chi = shellf[InList * n + 3]; // chi in fact
-    TRK = shellf[InList * n + 4];
-    Gxx = shellf[InList * n + 5] + 1.0;
-    Gxy = shellf[InList * n + 6];
-    Gxz = shellf[InList * n + 7];
-    Gyy = shellf[InList * n + 8] + 1.0;
-    Gyz = shellf[InList * n + 9];
-    Gzz = shellf[InList * n + 10] + 1.0;
-    axx = shellf[InList * n + 11];
-    axy = shellf[InList * n + 12];
-    axz = shellf[InList * n + 13];
-    ayy = shellf[InList * n + 14];
-    ayz = shellf[InList * n + 15];
-    azz = shellf[InList * n + 16];
-
-    Chi = 1.0 / (1.0 + Chi); // exp(4*phi)
-    Psi = Chi * sqrt(Chi);   // Psi^6
-// Chi^2 corresponds to metric determinant
-// but this factor has been considered in f_admmass_bssn
-#ifdef GaussInt
-    // wtcostheta is even function respect costheta
-    Mass_out = Mass_out + (shellf[InList * n] * nx_g[n] + shellf[InList * n + 1] * ny_g[n] + shellf[InList * n + 2] * nz_g[n]) * wtcostheta[i];
-#else
-    Mass_out = Mass_out + (shellf[InList * n] * nx_g[n] + shellf[InList * n + 1] * ny_g[n] + shellf[InList * n + 2] * nz_g[n]);
-#endif
-
-    gupzz = Gxx * Gyy * Gzz + Gxy * Gyz * Gxz + Gxz * Gxy * Gyz -
-            Gxz * Gyy * Gxz - Gxy * Gxy * Gzz - Gxx * Gyz * Gyz;
-    gupxx = (Gyy * Gzz - Gyz * Gyz) / gupzz;
-    gupxy = -(Gxy * Gzz - Gyz * Gxz) / gupzz;
-    gupxz = (Gxy * Gyz - Gyy * Gxz) / gupzz;
-    gupyy = (Gxx * Gzz - Gxz * Gxz) / gupzz;
-    gupyz = -(Gxx * Gyz - Gxy * Gxz) / gupzz;
-    gupzz = (Gxx * Gyy - Gxy * Gxy) / gupzz;
-
-    aupxx = gupxx * axx + gupxy * axy + gupxz * axz;
-    aupxy = gupxx * axy + gupxy * ayy + gupxz * ayz;
-    aupxz = gupxx * axz + gupxy * ayz + gupxz * azz;
-    aupyx = gupxy * axx + gupyy * axy + gupyz * axz;
-    aupyy = gupxy * axy + gupyy * ayy + gupyz * ayz;
-    aupyz = gupxy * axz + gupyy * ayz + gupyz * azz;
-    aupzx = gupxz * axx + gupyz * axy + gupzz * axz;
-    aupzy = gupxz * axy + gupyz * ayy + gupzz * ayz;
-    aupzz = gupxz * axz + gupyz * ayz + gupzz * azz;
-    if (Symmetry == 0)
-    {
-#ifdef GaussInt
-      // wtcostheta is even function respect costheta
-      //  1/8\pi \int \psi^6 (y A^m_z - zA^m_y) dS_m
-      ang_outx = ang_outx + f1o8 * Psi * (nx_g[n] * (pox[1][n] * aupxz - pox[2][n] * aupxy) + ny_g[n] * (pox[1][n] * aupyz - pox[2][n] * aupyy) + nz_g[n] * (pox[1][n] * aupzz - pox[2][n] * aupzy)) * wtcostheta[i];
-      //  1/8\pi \int \psi^6 (z A^m_x - xA^m_z) dS_m
-      ang_outy = ang_outy + f1o8 * Psi * (nx_g[n] * (pox[2][n] * aupxx - pox[0][n] * aupxz) + ny_g[n] * (pox[2][n] * aupyx - pox[0][n] * aupyz) + nz_g[n] * (pox[2][n] * aupzx - pox[0][n] * aupzz)) * wtcostheta[i];
-      // 1/8\pi \int \psi^6 (x A^m_y - yA^m_x) dS_m
-      ang_outz = ang_outz + f1o8 * Psi * (nx_g[n] * (pox[0][n] * aupxy - pox[1][n] * aupxx) + ny_g[n] * (pox[0][n] * aupyy - pox[1][n] * aupyx) + nz_g[n] * (pox[0][n] * aupzy - pox[1][n] * aupzx)) * wtcostheta[i];
-#else
-      //  1/8\pi \int \psi^6 (y A^m_z - zA^m_y) dS_m
-      ang_outx = ang_outx + f1o8 * Psi * (nx_g[n] * (pox[1][n] * aupxz - pox[2][n] * aupxy) + ny_g[n] * (pox[1][n] * aupyz - pox[2][n] * aupyy) + nz_g[n] * (pox[1][n] * aupzz - pox[2][n] * aupzy));
-      //  1/8\pi \int \psi^6 (z A^m_x - xA^m_z) dS_m
-      ang_outy = ang_outy + f1o8 * Psi * (nx_g[n] * (pox[2][n] * aupxx - pox[0][n] * aupxz) + ny_g[n] * (pox[2][n] * aupyx - pox[0][n] * aupyz) + nz_g[n] * (pox[2][n] * aupzx - pox[0][n] * aupzz));
-      // 1/8\pi \int \psi^6 (x A^m_y - yA^m_x) dS_m
-      ang_outz = ang_outz + f1o8 * Psi * (nx_g[n] * (pox[0][n] * aupxy - pox[1][n] * aupxx) + ny_g[n] * (pox[0][n] * aupyy - pox[1][n] * aupyx) + nz_g[n] * (pox[0][n] * aupzy - pox[1][n] * aupzx));
-#endif
-    }
-    else if (Symmetry == 1)
-    {
-#ifdef GaussInt
-      ang_outz = ang_outz + f1o8 * Psi * (nx_g[n] * (pox[0][n] * aupxy - pox[1][n] * aupxx) + ny_g[n] * (pox[0][n] * aupyy - pox[1][n] * aupyx) + nz_g[n] * (pox[0][n] * aupzy - pox[1][n] * aupzx)) * wtcostheta[i];
-#else
-      ang_outz = ang_outz + f1o8 * Psi * (nx_g[n] * (pox[0][n] * aupxy - pox[1][n] * aupxx) + ny_g[n] * (pox[0][n] * aupyy - pox[1][n] * aupyx) + nz_g[n] * (pox[0][n] * aupzy - pox[1][n] * aupzx));
-#endif
-    }
-
-    axx = Chi * (axx + Gxx * TRK / 3.0);
-    axy = Chi * (axy + Gxy * TRK / 3.0);
-    axz = Chi * (axz + Gxz * TRK / 3.0);
-    ayy = Chi * (ayy + Gyy * TRK / 3.0);
-    ayz = Chi * (ayz + Gyz * TRK / 3.0);
-    azz = Chi * (azz + Gzz * TRK / 3.0);
-
-    axx = axx - TRK;
-    ayy = ayy - TRK;
-    azz = azz - TRK;
-
-    // 1/8\pi \int \psi^6 (K_mi - \delta_mi trK) dS^m: lower index linear momentum
-    if (Symmetry == 0)
-    {
-#ifdef GaussInt
-      p_outx = p_outx + f1o8 * Psi * (nx_g[n] * axx + ny_g[n] * axy + nz_g[n] * axz) * wtcostheta[i];
-      p_outy = p_outy + f1o8 * Psi * (nx_g[n] * axy + ny_g[n] * ayy + nz_g[n] * ayz) * wtcostheta[i];
-      p_outz = p_outz + f1o8 * Psi * (nx_g[n] * axz + ny_g[n] * ayz + nz_g[n] * azz) * wtcostheta[i];
-#else
-      p_outx = p_outx + f1o8 * Psi * (nx_g[n] * axx + ny_g[n] * axy + nz_g[n] * axz);
-      p_outy = p_outy + f1o8 * Psi * (nx_g[n] * axy + ny_g[n] * ayy + nz_g[n] * ayz);
-      p_outz = p_outz + f1o8 * Psi * (nx_g[n] * axz + ny_g[n] * ayz + nz_g[n] * azz);
-#endif
-    }
-    else if (Symmetry == 1)
-    {
-#ifdef GaussInt
-      p_outx = p_outx + f1o8 * Psi * (nx_g[n] * axx + ny_g[n] * axy + nz_g[n] * axz) * wtcostheta[i];
-      p_outy = p_outy + f1o8 * Psi * (nx_g[n] * axy + ny_g[n] * ayy + nz_g[n] * ayz) * wtcostheta[i];
-#else
-      p_outx = p_outx + f1o8 * Psi * (nx_g[n] * axx + ny_g[n] * axy + nz_g[n] * axz);
-      p_outy = p_outy + f1o8 * Psi * (nx_g[n] * axy + ny_g[n] * ayy + nz_g[n] * ayz);
-#endif
-    }
-  }
-
-  {
-    double scalar_out[7] = {Mass_out, ang_outx, ang_outy, ang_outz, p_outx, p_outy, p_outz};
-    double scalar_in[7];
-    MPI_Allreduce(scalar_out, scalar_in, 7, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    mass = scalar_in[0]; sx = scalar_in[1]; sy = scalar_in[2]; sz = scalar_in[3];
-    px = scalar_in[4]; py = scalar_in[5]; pz = scalar_in[6];
-  }
-
-#ifdef GaussInt
-  mass = mass * rex * rex * dphi * factor;
-
-  sx = sx * rex * rex * dphi * (1.0 / PI) * factor;
-  sy = sy * rex * rex * dphi * (1.0 / PI) * factor;
-  sz = sz * rex * rex * dphi * (1.0 / PI) * factor;
-
-  px = px * rex * rex * dphi * (1.0 / PI) * factor;
-  py = py * rex * rex * dphi * (1.0 / PI) * factor;
-  pz = pz * rex * rex * dphi * (1.0 / PI) * factor;
-#else
-  mass = mass * rex * rex * dphi * dcostheta * factor;
-
-  sx = sx * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
-  sy = sy * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
-  sz = sz * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
-
-  px = px * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
-  py = py * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
-  pz = pz * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
-#endif
-
-  Rout[0] = mass;
-  Rout[1] = px;
-  Rout[2] = py;
-  Rout[3] = pz;
-  Rout[4] = sx;
-  Rout[5] = sy;
-  Rout[6] = sz;
-
-  delete[] pox[0];
-  delete[] pox[1];
-  delete[] pox[2];
-  delete[] shellf;
-  DG_List->clearList();
-}
-//|----------------------------------------------------------------
-//  do not discriminate box and shell
-//  for Gravitational wave specially symmetric case
-//|----------------------------------------------------------------
-void surface_integral::surf_Wave(double rex, cgh *GH, ShellPatch *SH,
-                                 var *chi, var *trK,
-                                 var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
-                                 var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
-                                 var *chix, var *chiy, var *chiz,
-                                 var *trKx, var *trKy, var *trKz,
-                                 var *Axxx, var *Axxy, var *Axxz,
-                                 var *Axyx, var *Axyy, var *Axyz,
-                                 var *Axzx, var *Axzy, var *Axzz,
-                                 var *Ayyx, var *Ayyy, var *Ayyz,
-                                 var *Ayzx, var *Ayzy, var *Ayzz,
-                                 var *Azzx, var *Azzy, var *Azzz,
-                                 var *Gamxxx, var *Gamxxy, var *Gamxxz, var *Gamxyy, var *Gamxyz, var *Gamxzz,
-                                 var *Gamyxx, var *Gamyxy, var *Gamyxz, var *Gamyyy, var *Gamyyz, var *Gamyzz,
-                                 var *Gamzxx, var *Gamzxy, var *Gamzxz, var *Gamzyy, var *Gamzyz, var *Gamzzz,
-                                 var *Rxx, var *Rxy, var *Rxz, var *Ryy, var *Ryz, var *Rzz,
-                                 int spinw, int maxl, int NN, double *RP, double *IP,
-                                 monitor *Monitor) // NN is the length of RP and IP
-{
-  const int InList = 62;
-
-  MyList<var> *DG_List = new MyList<var>(chi);
-  DG_List->insert(trK);
-  DG_List->insert(gxx);
-  DG_List->insert(gxy);
-  DG_List->insert(gxz);
-  DG_List->insert(gyy);
-  DG_List->insert(gyz);
-  DG_List->insert(gzz);
-  DG_List->insert(Axx);
-  DG_List->insert(Axy);
-  DG_List->insert(Axz);
-  DG_List->insert(Ayy);
-  DG_List->insert(Ayz);
-  DG_List->insert(Azz);
-  DG_List->insert(chix);
-  DG_List->insert(chiy);
-  DG_List->insert(chiz);
-  DG_List->insert(trKx);
-  DG_List->insert(trKy);
-  DG_List->insert(trKz);
-  DG_List->insert(Axxx);
-  DG_List->insert(Axxy);
-  DG_List->insert(Axxz);
-  DG_List->insert(Axyx);
-  DG_List->insert(Axyy);
-  DG_List->insert(Axyz);
-  DG_List->insert(Axzx);
-  DG_List->insert(Axzy);
-  DG_List->insert(Axzz);
-  DG_List->insert(Ayyx);
-  DG_List->insert(Ayyy);
-  DG_List->insert(Ayyz);
-  DG_List->insert(Ayzx);
-  DG_List->insert(Ayzy);
-  DG_List->insert(Ayzz);
-  DG_List->insert(Azzx);
-  DG_List->insert(Azzy);
-  DG_List->insert(Azzz);
-  DG_List->insert(Gamxxx);
-  DG_List->insert(Gamxxy);
-  DG_List->insert(Gamxxz);
-  DG_List->insert(Gamxyy);
-  DG_List->insert(Gamxyz);
-  DG_List->insert(Gamxzz);
-  DG_List->insert(Gamyxx);
-  DG_List->insert(Gamyxy);
-  DG_List->insert(Gamyxz);
-  DG_List->insert(Gamyyy);
-  DG_List->insert(Gamyyz);
-  DG_List->insert(Gamyzz);
-  DG_List->insert(Gamzxx);
-  DG_List->insert(Gamzxy);
-  DG_List->insert(Gamzxz);
-  DG_List->insert(Gamzyy);
-  DG_List->insert(Gamzyz);
-  DG_List->insert(Gamzzz);
-  DG_List->insert(Rxx);
-  DG_List->insert(Rxy);
-  DG_List->insert(Rxz);
-  DG_List->insert(Ryy);
-  DG_List->insert(Ryz);
-  DG_List->insert(Rzz);
-
-  int n;
-  double *pox[3];
-  for (int i = 0; i < 3; i++)
-    pox[i] = new double[n_tot];
-  for (n = 0; n < n_tot; n++)
-  {
-    pox[0][n] = rex * nx_g[n];
-    pox[1][n] = rex * ny_g[n];
-    pox[2][n] = rex * nz_g[n];
-  }
-
-  double *shellf;
-  shellf = new double[n_tot * InList];
-
-  SR_Interp_Points(DG_List, GH, SH, n_tot, pox, shellf);
-
-  double *RP_out, *IP_out;
-  RP_out = new double[NN];
-  IP_out = new double[NN];
-
-  for (int ii = 0; ii < NN; ii++)
-  {
-    RP_out[ii] = 0;
-    IP_out[ii] = 0;
-  }
-
-  int mp, Lp, Nmin, Nmax;
-
-  mp = n_tot / cpusize;
-  Lp = n_tot - cpusize * mp;
-
-  if (Lp > myrank)
-  {
-    Nmin = myrank * mp + myrank;
-    Nmax = Nmin + mp;
-  }
-  else
-  {
-    Nmin = myrank * mp + Lp;
-    Nmax = Nmin + mp - 1;
-  }
-
-  // theta part
-  double costheta, thetap;
-  double cosmphi, sinmphi;
-
-  int i, j;
-  int lpsy = 0;
-  if (Symmetry == 0)
-    lpsy = 1;
-  else if (Symmetry == 1)
-    lpsy = 2;
-  else if (Symmetry == 2)
-    lpsy = 8;
-
-  double psi4RR, psi4II;
-  double px, py, pz;
-  double pchi, ptrK, pgxx, pgxy, pgxz, pgyy, pgyz, pgzz;
-  double pAxx, pAxy, pAxz, pAyy, pAyz, pAzz;
-  double pchix, pchiy, pchiz;
-  double ptrKx, ptrKy, ptrKz;
-  double pAxxx, pAxxy, pAxxz;
-  double pAxyx, pAxyy, pAxyz;
-  double pAxzx, pAxzy, pAxzz;
-  double pAyyx, pAyyy, pAyyz;
-  double pAyzx, pAyzy, pAyzz;
-  double pAzzx, pAzzy, pAzzz;
-  double pGamxxx, pGamxxy, pGamxxz, pGamxyy, pGamxyz, pGamxzz;
-  double pGamyxx, pGamyxy, pGamyxz, pGamyyy, pGamyyz, pGamyzz;
-  double pGamzxx, pGamzxy, pGamzxz, pGamzyy, pGamzyz, pGamzzz;
-  double pRxx, pRxy, pRxz, pRyy, pRyz, pRzz;
-  for (n = Nmin; n <= Nmax; n++)
-  {
-    //       need round off always
-    i = int(n / N_phi); // int(1.723) = 1, int(-1.732) = -1
-    j = n - i * N_phi;
-
-    int countlm = 0;
-    for (int pl = spinw; pl < maxl + 1; pl++)
-      for (int pm = -pl; pm < pl + 1; pm++)
-      {
-        for (int lp = 0; lp < lpsy; lp++)
-        {
-          px = pox[0][n];
-          py = pox[1][n];
-          pz = pox[2][n];
-          pchi = shellf[InList * n];
-          ptrK = shellf[InList * n + 1];
-          pgxx = shellf[InList * n + 2];
-          pgxy = shellf[InList * n + 3];
-          pgxz = shellf[InList * n + 4];
-          pgyy = shellf[InList * n + 5];
-          pgyz = shellf[InList * n + 6];
-          pgzz = shellf[InList * n + 7];
-          pAxx = shellf[InList * n + 8];
-          pAxy = shellf[InList * n + 9];
-          pAxz = shellf[InList * n + 10];
-          pAyy = shellf[InList * n + 11];
-          pAyz = shellf[InList * n + 12];
-          pAzz = shellf[InList * n + 13];
-          pchix = shellf[InList * n + 14];
-          pchiy = shellf[InList * n + 15];
-          pchiz = shellf[InList * n + 16];
-          ptrKx = shellf[InList * n + 17];
-          ptrKy = shellf[InList * n + 18];
-          ptrKz = shellf[InList * n + 19];
-          pAxxx = shellf[InList * n + 20];
-          pAxxy = shellf[InList * n + 21];
-          pAxxz = shellf[InList * n + 22];
-          pAxyx = shellf[InList * n + 23];
-          pAxyy = shellf[InList * n + 24];
-          pAxyz = shellf[InList * n + 25];
-          pAxzx = shellf[InList * n + 26];
-          pAxzy = shellf[InList * n + 27];
-          pAxzz = shellf[InList * n + 28];
-          pAyyx = shellf[InList * n + 29];
-          pAyyy = shellf[InList * n + 30];
-          pAyyz = shellf[InList * n + 31];
-          pAyzx = shellf[InList * n + 32];
-          pAyzy = shellf[InList * n + 33];
-          pAyzz = shellf[InList * n + 34];
-          pAzzx = shellf[InList * n + 35];
-          pAzzy = shellf[InList * n + 36];
-          pAzzz = shellf[InList * n + 37];
-          pGamxxx = shellf[InList * n + 38];
-          pGamxxy = shellf[InList * n + 39];
-          pGamxxz = shellf[InList * n + 40];
-          pGamxyy = shellf[InList * n + 41];
-          pGamxyz = shellf[InList * n + 42];
-          pGamxzz = shellf[InList * n + 43];
-          pGamyxx = shellf[InList * n + 44];
-          pGamyxy = shellf[InList * n + 45];
-          pGamyxz = shellf[InList * n + 46];
-          pGamyyy = shellf[InList * n + 47];
-          pGamyyz = shellf[InList * n + 48];
-          pGamyzz = shellf[InList * n + 49];
-          pGamzxx = shellf[InList * n + 50];
-          pGamzxy = shellf[InList * n + 51];
-          pGamzxz = shellf[InList * n + 52];
-          pGamzyy = shellf[InList * n + 53];
-          pGamzyz = shellf[InList * n + 54];
-          pGamzzz = shellf[InList * n + 55];
-          pRxx = shellf[InList * n + 56];
-          pRxy = shellf[InList * n + 57];
-          pRxz = shellf[InList * n + 58];
-          pRyy = shellf[InList * n + 59];
-          pRyz = shellf[InList * n + 60];
-          pRzz = shellf[InList * n + 61];
-          switch (lp)
-          {
-          case 0: //+++ (theta, phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = sin(pm * (j + 0.5) * dphi);
-            break;
-          case 1: //++- (pi-theta, phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = sin(pm * (j + 0.5) * dphi);
-            pz = -pz;
-            pgxz = -pgxz;
-            pgyz = -pgyz;
-            pAxz = -pAxz;
-            pAyz = -pAyz;
-            pchiz = -pchiz;
-            ptrKz = -ptrKz;
-            pAxxz = -pAxxz;
-            pAxyz = -pAxyz;
-            pAxzx = -pAxzx;
-            pAxzy = -pAxzy;
-            pAyyz = -pAyyz;
-            pAyzx = -pAyzx;
-            pAyzy = -pAyzy;
-            pAzzz = -pAzzz;
-            pGamxxz = -pGamxxz;
-            pGamxyz = -pGamxyz;
-            pGamyxz = -pGamyxz;
-            pGamyyz = -pGamyyz;
-            pGamzxx = -pGamzxx;
-            pGamzxy = -pGamzxy;
-            pGamzyy = -pGamzyy;
-            pGamzzz = -pGamzzz;
-            pRxz = -pRxz;
-            pRyz = -pRyz;
-            break;
-          case 2: //+-+ (theta, 2*pi-phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = -sin(pm * (j + 0.5) * dphi);
-            py = -py;
-            pgxy = -pgxy;
-            pgyz = -pgyz;
-            pAxy = -pAxy;
-            pAyz = -pAyz;
-            pchiy = -pchiy;
-            ptrKy = -ptrKy;
-            pAxxy = -pAxxy;
-            pAxyx = -pAxyx;
-            pAxyz = -pAxyz;
-            pAxzy = -pAxzy;
-            pAyyy = -pAyyy;
-            pAyzx = -pAyzx;
-            pAyzz = -pAyzz;
-            pAzzy = -pAzzy;
-            pGamxxy = -pGamxxy;
-            pGamxyz = -pGamxyz;
-            pGamyxx = -pGamyxx;
-            pGamyxz = -pGamyxz;
-            pGamyyy = -pGamyyy;
-            pGamyzz = -pGamyzz;
-            pGamzxy = -pGamzxy;
-            pGamzyz = -pGamzyz;
-            pRxy = -pRxy;
-            pRyz = -pRyz;
-            break;
-          case 3: //+-- (pi-theta, 2*pi-phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (j + 0.5) * dphi);
-            sinmphi = -sin(pm * (j + 0.5) * dphi);
-            py = -py;
-            pz = -pz;
-            pgxy = -pgxy;
-            pgxz = -pgxz;
-            pAxy = -pAxy;
-            pAxz = -pAxz;
-            pchiy = -pchiy;
-            pchiz = -pchiz;
-            ptrKy = -ptrKy;
-            ptrKz = -ptrKz;
-            pAxxy = -pAxxy;
-            pAxxz = -pAxxz;
-            pAxyx = -pAxyx;
-            pAxzx = -pAxzx;
-            pAyyy = -pAyyy;
-            pAyyz = -pAyyz;
-            pAyzy = -pAyzy;
-            pAyzz = -pAyzz;
-            pAzzy = -pAzzy;
-            pAzzz = -pAzzz;
-            pGamxxy = -pGamxxy;
-            pGamxxz = -pGamxxz;
-            pGamyxx = -pGamyxx;
-            pGamyyy = -pGamyyy;
-            pGamyyz = -pGamyyz;
-            pGamyzz = -pGamyzz;
-            pGamzxx = -pGamzxx;
-            pGamzyy = -pGamzyy;
-            pGamzyz = -pGamzyz;
-            pGamzzz = -pGamzzz;
-            pRxy = -pRxy;
-            pRxz = -pRxz;
-            break;
-          case 4: //-++ (theta, pi-phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
-            px = -px;
-            pgxy = -pgxy;
-            pgxz = -pgxz;
-            pAxy = -pAxy;
-            pAxz = -pAxz;
-            pchix = -pchix;
-            ptrKx = -ptrKx;
-            pAxxx = -pAxxx;
-            pAxyy = -pAxyy;
-            pAxyz = -pAxyz;
-            pAxzy = -pAxzy;
-            pAxzz = -pAxzz;
-            pAyyx = -pAyyx;
-            pAyzx = -pAyzx;
-            pAzzx = -pAzzx;
-            pGamxxx = -pGamxxx;
-            pGamxyy = -pGamxyy;
-            pGamxyz = -pGamxyz;
-            pGamxzz = -pGamxzz;
-            pGamyxy = -pGamyxy;
-            pGamyxz = -pGamyxz;
-            pGamzxy = -pGamzxy;
-            pGamzxz = -pGamzxz;
-            pRxy = -pRxy;
-            pRxz = -pRxz;
-            break;
-          case 5: //-+- (pi-theta, pi-phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
-            px = -px;
-            pz = -pz;
-            pgxy = -pgxy;
-            pgyz = -pgyz;
-            pAxy = -pAxy;
-            pAyz = -pAyz;
-            pchix = -pchix;
-            pchiz = -pchiz;
-            ptrKx = -ptrKx;
-            ptrKz = -ptrKz;
-            pAxxx = -pAxxx;
-            pAxxz = -pAxxz;
-            pAxyy = -pAxyy;
-            pAxzx = -pAxzx;
-            pAxzz = -pAxzz;
-            pAyyx = -pAyyx;
-            pAyyz = -pAyyz;
-            pAyzy = -pAyzy;
-            pAzzx = -pAzzx;
-            pAzzz = -pAzzz;
-            pGamxxx = -pGamxxx;
-            pGamxxz = -pGamxxz;
-            pGamxyy = -pGamxyy;
-            pGamxzz = -pGamxzz;
-            pGamyxy = -pGamyxy;
-            pGamyyz = -pGamyyz;
-            pGamzxx = -pGamzxx;
-            pGamzxz = -pGamzxz;
-            pGamzyy = -pGamzyy;
-            pGamzzz = -pGamzzz;
-            pRxy = -pRxy;
-            pRyz = -pRyz;
-            break;
-          case 6: //--+ (theta, pi+phi)
-            costheta = arcostheta[i];
-            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
-            px = -px;
-            py = -py;
-            pgxz = -pgxz;
-            pgyz = -pgyz;
-            pAxz = -pAxz;
-            pAyz = -pAyz;
-            pchix = -pchix;
-            pchiy = -pchiy;
-            ptrKx = -ptrKx;
-            ptrKy = -ptrKy;
-            pAxxx = -pAxxx;
-            pAxxy = -pAxxy;
-            pAxyx = -pAxyx;
-            pAxyy = -pAxyy;
-            pAxzz = -pAxzz;
-            pAyyx = -pAyyx;
-            pAyyy = -pAyyy;
-            pAyzz = -pAyzz;
-            pAzzx = -pAzzx;
-            pAzzy = -pAzzy;
-            pGamxxx = -pGamxxx;
-            pGamxxy = -pGamxxy;
-            pGamxyy = -pGamxyy;
-            pGamxzz = -pGamxzz;
-            pGamyxx = -pGamyxx;
-            pGamyxy = -pGamyxy;
-            pGamyyy = -pGamyyy;
-            pGamyzz = -pGamyzz;
-            pGamzxz = -pGamzxz;
-            pGamzyz = -pGamzyz;
-            pRxz = -pRxz;
-            pRyz = -pRyz;
-            break;
-          case 7: //--- (pi-theta, pi+phi)
-            costheta = -arcostheta[i];
-            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
-            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
-            px = -px;
-            py = -py;
-            pz = -pz;
-            pchix = -pchix;
-            pchiy = -pchiy;
-            pchiz = -pchiz;
-            ptrKx = -ptrKx;
-            ptrKy = -ptrKy;
-            ptrKz = -ptrKz;
-            pAxxx = -pAxxx;
-            pAxxy = -pAxxy;
-            pAxxz = -pAxxz;
-            pAxyx = -pAxyx;
-            pAxyy = -pAxyy;
-            pAxyz = -pAxyz;
-            pAxzx = -pAxzx;
-            pAxzy = -pAxzy;
-            pAxzz = -pAxzz;
-            pAyyx = -pAyyx;
-            pAyyy = -pAyyy;
-            pAyyz = -pAyyz;
-            pAyzx = -pAyzx;
-            pAyzy = -pAyzy;
-            pAyzz = -pAyzz;
-            pAzzx = -pAzzx;
-            pAzzy = -pAzzy;
-            pAzzz = -pAzzz;
-            pGamxxx = -pGamxxx;
-            pGamxxy = -pGamxxy;
-            pGamxxz = -pGamxxz;
-            pGamxyy = -pGamxyy;
-            pGamxyz = -pGamxyz;
-            pGamxzz = -pGamxzz;
-            pGamyxx = -pGamyxx;
-            pGamyxy = -pGamyxy;
-            pGamyxz = -pGamyxz;
-            pGamyyy = -pGamyyy;
-            pGamyyz = -pGamyyz;
-            pGamyzz = -pGamyzz;
-            pGamzxx = -pGamzxx;
-            pGamzxy = -pGamzxy;
-            pGamzxz = -pGamzxz;
-            pGamzyy = -pGamzyy;
-            pGamzyz = -pGamzyz;
-            pGamzzz = -pGamzzz;
-          }
-
-          f_getnp4_point(px, py, pz, pchi, ptrK,
-                         pgxx, pgxy, pgxz, pgyy, pgyz, pgzz,
-                         pAxx, pAxy, pAxz, pAyy, pAyz, pAzz,
-                         pchix, pchiy, pchiz,
-                         ptrKx, ptrKy, ptrKz,
-                         pAxxx, pAxxy, pAxxz,
-                         pAxyx, pAxyy, pAxyz,
-                         pAxzx, pAxzy, pAxzz,
-                         pAyyx, pAyyy, pAyyz,
-                         pAyzx, pAyzy, pAyzz,
-                         pAzzx, pAzzy, pAzzz,
-                         pGamxxx, pGamxxy, pGamxxz, pGamxyy, pGamxyz, pGamxzz,
-                         pGamyxx, pGamyxy, pGamyxz, pGamyyy, pGamyyz, pGamyzz,
-                         pGamzxx, pGamzxy, pGamzxz, pGamzyy, pGamzyz, pGamzzz,
-                         pRxx, pRxy, pRxz, pRyy, pRyz, pRzz,
-                         psi4RR, psi4II);
-
-          thetap = sqrt((2 * pl + 1.0) / 4.0 / PI) * misc::Wigner_d_function(pl, pm, spinw, costheta); // note the variation from -2 to 2
-
-          //	 find back the one
-          pchi = pchi + 1;
-#ifdef GaussInt
-          // wtcostheta is even function respect costheta
-          RP_out[countlm] = RP_out[countlm] + thetap / pchi / pchi * (psi4RR * cosmphi + psi4II * sinmphi) * wtcostheta[i];
-          IP_out[countlm] = IP_out[countlm] + thetap / pchi / pchi * (psi4II * cosmphi - psi4RR * sinmphi) * wtcostheta[i];
-#else
-          RP_out[countlm] = RP_out[countlm] + thetap / pchi / pchi * (psi4RR * cosmphi + psi4II * sinmphi);
-          IP_out[countlm] = IP_out[countlm] + thetap / pchi / pchi * (psi4II * cosmphi - psi4RR * sinmphi);
-#endif
-        }
-        countlm++; // no sanity check for countlm and NN which should be noted in the input parameters
-      }
-  }
-
-  for (int ii = 0; ii < NN; ii++)
-  {
-#ifdef GaussInt
-    RP_out[ii] = RP_out[ii] * rex * dphi;
-    IP_out[ii] = IP_out[ii] * rex * dphi;
-#else
-    RP_out[ii] = RP_out[ii] * rex * dphi * dcostheta;
-    IP_out[ii] = IP_out[ii] * rex * dphi * dcostheta;
-#endif
-  }
-  //|------+  Communicate and sum the results from each processor.
-
-  {
-    double *RPIP_out = new double[2 * NN];
-    double *RPIP = new double[2 * NN];
-    memcpy(RPIP_out, RP_out, NN * sizeof(double));
-    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
-    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    memcpy(RP, RPIP, NN * sizeof(double));
-    memcpy(IP, RPIP + NN, NN * sizeof(double));
-    delete[] RPIP_out;
-    delete[] RPIP;
-  }
-
-  //|------= Free memory.
-
-  delete[] pox[0];
-  delete[] pox[1];
-  delete[] pox[2];
-  delete[] shellf;
-  delete[] RP_out;
-  delete[] IP_out;
-  DG_List->clearList();
-}
-//|----------------------------------------------------------------
-//  do not discriminate box and shell
-//|----------------------------------------------------------------
-bool surface_integral::SR_Interp_Points(MyList<var> *VarList, cgh *GH, ShellPatch *SH,
-                                        int NN, double **XX, double *Shellf)
-{
-  MyList<var> *varl;
-  int num_var = 0;
-  varl = VarList;
-  while (varl)
-  {
-    num_var++;
-    varl = varl->next;
-  }
-
-  double pox[3];
-  for (int i = 0; i < NN; i++)
-  {
-    for (int j = 0; j < 3; j++)
-      pox[j] = XX[j][i];
-    int lev = GH->levels - 1;
-    bool notfound = true;
-
-    while (notfound)
-    {
-      if (lev < 0)
-      {
-        if (SH)
-        {
-          if (SH->Interp_One_Point(VarList, pox, Shellf + i * num_var, Symmetry))
-          {
-            return true;
-          }
-          if (myrank == 0)
-            cout << "surface_integral::SR_Interp_Points point (" << pox[0] << "," << pox[1] << "," << pox[2] << ") is out of cgh and shell domain!" << endl;
-        }
-        else
-        {
-          if (myrank == 0)
-            cout << "surface_integral::SR_Interp_Points: point (" << pox[0] << "," << pox[1] << "," << pox[2] << ") is out of cgh domain!" << endl;
-        }
-        return false;
-      }
-      MyList<Patch> *Pp = GH->PatL[lev];
-      while (Pp)
-      {
-        if (Pp->data->Interp_ONE_Point(VarList, pox, Shellf + i * num_var, Symmetry))
-        {
-          notfound = false;
-          break;
-        }
-        Pp = Pp->next;
-      }
-      lev--;
-    }
-  }
-  return true;
-}
+
+//----------------------------------------------------------------
+// Using Gauss-Legendre quadrature in theta direction
+// and   trapezoidal rule in phi direction (from the second Euler-Maclaurin summation formula, we can see that
+// this method gives exponential convergence for periodic functions)
+//----------------------------------------------------------------
+#ifdef newc
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <strstream>
+#include <cmath>
+#include <map>
+#include <vector>
+#include <algorithm>
+using namespace std;
+#else
+#include <iostream.h>
+#include <iomanip.h>
+#include <fstream.h>
+#include <string.h>
+#include <math.h>
+#include <map.h>
+#endif
+#include <mpi.h>
+
+#include "misc.h"
+#include "cgh.h"
+#include "Parallel.h"
+#include "surface_integral.h"
+#include "fadmquantites_bssn.h"
+#include "getnpem2.h"
+#include "getnp4.h"
+#include "parameters.h"
+
+#define PI M_PI
+//|============================================================================
+//| Constructor: reads the quarter-sphere point count N from the parameter file
+//| and tabulates the unit normals (nx_g, ny_g, nz_g) of the N_theta x N_phi grid
+//|============================================================================
+surface_integral::surface_integral(int iSymmetry) : Symmetry(iSymmetry)
+{
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
+  int N = 40; // default when the parameter file has no SurfaceIntegral entry
+  // read parameter from file
+  {
+    string str, sgrp, skey, sval;
+    int sind;
+    // a std::string cannot overflow, however long the configured path is
+    string pname;
+    {
+      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
+      if (iter != parameters::str_par.end())
+      {
+        pname = iter->second;
+      }
+      else
+      {
+        cout << "Error inputpar" << endl;
+        // report failure with a non-zero status, like the other error paths here
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+    }
+    ifstream inf(pname.c_str(), ifstream::in);
+    if (!inf.good() && myrank == 0)
+    {
+      cout << "Can not open parameter file " << pname << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    for (int i = 1; inf.good(); i++)
+    {
+      // std::getline has no line-length cap, so a long line cannot end the scan early
+      getline(inf, str);
+
+      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
+      if (status == -1)
+      {
+        cout << "error reading parameter file " << pname << " in line " << i << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+      else if (status == 0)
+        continue;
+
+      if (sgrp == "SurfaceIntegral")
+      {
+        if (skey == "number of points for quarter sphere")
+          N = atoi(sval.c_str());
+      }
+    }
+    inf.close();
+  }
+  //|-----number of points for whole [0,pi] x [0,2pi]
+  N_phi   = 4 * N;   // for simplicity, we require this number must be 4*N
+  N_theta = 2 * N;   //                                                2*N
+
+  if (myrank == 0)
+  {
+    cout << "-----------------------------------------------------------------------" << endl;
+#ifdef GaussInt
+    cout << " spherical integration for wave form extraction with Gauss method      " << endl;
+#else
+    cout << " spherical integration for wave form extraction with mid point method  " << endl;
+#endif
+    cout << " N_phi   = " << N_phi   << endl;
+    cout << " N_theta = " << N_theta << endl;
+    cout << "-----------------------------------------------------------------------" << endl;
+  }
+
+#ifdef GaussInt
+  //  weight function cover all of [0,pi]
+  arcostheta = new double[N_theta];
+  wtcostheta = new double[N_theta];
+
+  // note: theta in [0,pi/2], upper half sphere, corresponds to 0 < costheta < 1
+  misc::gaulegf(-1.0, 1.0, arcostheta, wtcostheta, N_theta);
+  // due to symmetry, I need first half array corresponds to upper sphere, note these two arrays must match each other
+  misc::inversearray(arcostheta, N_theta);
+  misc::inversearray(wtcostheta, N_theta);
+#endif
+
+  if (Symmetry == 2)
+  {
+    N_phi = N_phi / 4;
+    N_theta = N_theta / 2;
+    dphi = PI / (2.0 * N_phi);
+    dcostheta = 1.0 / N_theta;
+    factor = 8;
+  }
+  else if (Symmetry == 1)
+  {
+    N_theta = N_theta / 2;
+    dphi = 2.0 * PI / N_phi;
+    dcostheta = 1.0 / N_theta;
+    factor = 2;
+  }
+  else if (Symmetry == 0)
+  {
+    dphi = 2.0 * PI / N_phi;
+    dcostheta = 2.0 / N_theta;
+    factor = 1;
+  }
+  else if (myrank == 0)
+  {
+    cout << "surface_integral::surface_integral: not supported Symmetry setting!" << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+
+#ifndef GaussInt
+  //  midpoint abscissas in cos(theta), filled in the loop below
+  arcostheta = new double[N_theta];
+#endif
+  n_tot = N_theta * N_phi;
+  nx_g = new double[n_tot];
+  ny_g = new double[n_tot];
+  nz_g = new double[n_tot];
+
+  int n = 0;
+  double costheta, sintheta, ph;
+
+  for (int i = 0; i < N_theta; ++i)
+  {
+#ifndef GaussInt
+    arcostheta[i] = 1.0 - (i + 0.5) * dcostheta;
+#endif
+    costheta = arcostheta[i];
+    sintheta = sqrt(1.0 - costheta * costheta);
+
+    for (int j = 0; j < N_phi; ++j)
+    {
+      ph = (j + 0.5) * dphi;
+      // normal vector respect to the constant R sphere
+      nx_g[n] = sintheta * cos(ph);
+      ny_g[n] = sintheta * sin(ph);
+      nz_g[n] = costheta;
+      n++;
+    }
+  }
+}
+
+//|============================================================================
+//| Destructor: frees every array allocated with new[] in the constructor
+//|============================================================================
+surface_integral::~surface_integral()
+{
+#ifdef GaussInt
+  delete[] wtcostheta;
+#endif
+  delete[] arcostheta;
+  delete[] nz_g;
+  delete[] ny_g;
+  delete[] nx_g;
+}
+//|----------------------------------------------------------------
+//  spin weighted spinw modes of psi4 on the sphere r = rex (level lev), summed over MPI_COMM_WORLD
+//  l takes from spinw to maxl; m takes from -l to l; only the first NN modes are stored in RP / IP
+//|----------------------------------------------------------------
+void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,
+                                 int spinw, int maxl, int NN, double *RP, double *IP,
+                                 monitor *Monitor) // NN is the length of RP and IP
+{
+  if (myrank == 0 && GH->grids[lev] != 1)
+  {
+    if (Monitor->outfile)
+      Monitor->outfile << "WARNING: surface integral on multipatches" << endl;
+    else
+      cout << "WARNING: surface integral on multipatches" << endl;
+  }
+
+  const int InList = 2;
+
+  MyList<var> *DG_List = new MyList<var>(Rpsi4);
+  DG_List->insert(Ipsi4);
+
+  int n;
+  double *pox[3];
+  for (int i = 0; i < 3; i++)
+    pox[i] = new double[n_tot];
+  for (n = 0; n < n_tot; n++)
+  {
+    pox[0][n] = rex * nx_g[n];
+    pox[1][n] = rex * ny_g[n];
+    pox[2][n] = rex * nz_g[n];
+  }
+  // contiguous index range [Nmin, Nmax] of this rank; the first Lp ranks take one extra point
+  int mp, Lp, Nmin, Nmax;
+  mp = n_tot / cpusize;
+  Lp = n_tot - cpusize * mp;
+  if (Lp > myrank)
+  {
+    Nmin = myrank * mp + myrank;
+    Nmax = Nmin + mp;
+  }
+  else
+  {
+    Nmin = myrank * mp + Lp;
+    Nmax = Nmin + mp - 1;
+  }
+
+  double *shellf = new double[n_tot * InList];
+
+  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);
+  double *RP_out = new double[NN];
+  double *IP_out = new double[NN];
+
+  for (int ii = 0; ii < NN; ii++)
+  {
+    RP_out[ii] = 0;
+    IP_out[ii] = 0;
+  }
+  // theta part
+  double costheta, thetap;
+  double cosmphi, sinmphi;
+  // lpsy = number of mirror images each stored point stands for under Symmetry
+  int i, j;
+  int lpsy = 0;
+  if (Symmetry == 0)
+    lpsy = 1;
+  else if (Symmetry == 1)
+    lpsy = 2;
+  else if (Symmetry == 2)
+    lpsy = 8;
+  // SoA[0..2] presumably hold the parity sign of a variable under x/y/z reflection - TODO confirm
+  double psi4RR, psi4II;
+  for (n = Nmin; n <= Nmax; n++)
+  {
+    // n is row-major over (theta, phi): i = theta index, j = phi index
+    i = int(n / N_phi); // integer division truncates
+    j = n - i * N_phi;
+
+    int countlm = 0;
+    for (int pl = spinw; pl < maxl + 1; pl++)
+      for (int pm = -pl; pm < pl + 1 && countlm < NN; pm++)
+      {
+        for (int lp = 0; lp < lpsy; lp++)
+        {
+          switch (lp)
+          {
+          case 0: //+++ (theta, phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = sin(pm * (j + 0.5) * dphi);
+            psi4RR = shellf[InList * n];
+            psi4II = shellf[InList * n + 1];
+            break;
+          case 1: //++- (pi-theta, phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = sin(pm * (j + 0.5) * dphi);
+            psi4RR = Rpsi4->SoA[2] * shellf[InList * n];
+            psi4II = Ipsi4->SoA[2] * shellf[InList * n + 1];
+            break;
+          case 2: //+-+ (theta, 2*pi-phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = -sin(pm * (j + 0.5) * dphi);
+            psi4RR = Rpsi4->SoA[1] * shellf[InList * n];
+            psi4II = Ipsi4->SoA[1] * shellf[InList * n + 1];
+            break;
+          case 3: //+-- (pi-theta, 2*pi-phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = -sin(pm * (j + 0.5) * dphi);
+            psi4RR = Rpsi4->SoA[2] * Rpsi4->SoA[1] * shellf[InList * n];
+            psi4II = Ipsi4->SoA[2] * Ipsi4->SoA[1] * shellf[InList * n + 1];
+            break;
+          case 4: //-++ (theta, pi-phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
+            psi4RR = Rpsi4->SoA[0] * shellf[InList * n];
+            psi4II = Ipsi4->SoA[0] * shellf[InList * n + 1];
+            break;
+          case 5: //-+- (pi-theta, pi-phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
+            psi4RR = Rpsi4->SoA[2] * Rpsi4->SoA[0] * shellf[InList * n];
+            psi4II = Ipsi4->SoA[2] * Ipsi4->SoA[0] * shellf[InList * n + 1];
+            break;
+          case 6: //--+ (theta, pi+phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
+            psi4RR = Rpsi4->SoA[1] * Rpsi4->SoA[0] * shellf[InList * n];
+            psi4II = Ipsi4->SoA[1] * Ipsi4->SoA[0] * shellf[InList * n + 1];
+            break;
+          case 7: //--- (pi-theta, pi+phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
+            psi4RR = Rpsi4->SoA[2] * Rpsi4->SoA[1] * Rpsi4->SoA[0] * shellf[InList * n];
+            psi4II = Ipsi4->SoA[2] * Ipsi4->SoA[1] * Ipsi4->SoA[0] * shellf[InList * n + 1];
+          }
+
+          thetap = sqrt((2 * pl + 1.0) / 4.0 / PI) * misc::Wigner_d_function(pl, pm, spinw, costheta); // note the variation from -2 to 2
+#ifdef GaussInt
+          // wtcostheta is even function respect costheta
+          RP_out[countlm] = RP_out[countlm] + thetap * (psi4RR * cosmphi + psi4II * sinmphi) * wtcostheta[i];
+          IP_out[countlm] = IP_out[countlm] + thetap * (psi4II * cosmphi - psi4RR * sinmphi) * wtcostheta[i];
+#else
+          RP_out[countlm] = RP_out[countlm] + thetap * (psi4RR * cosmphi + psi4II * sinmphi);
+          IP_out[countlm] = IP_out[countlm] + thetap * (psi4II * cosmphi - psi4RR * sinmphi);
+#endif
+        }
+        countlm++; // mode index; the pm loop guard keeps it below NN, the length of RP_out / IP_out
+      }
+  }
+
+  for (int ii = 0; ii < NN; ii++)
+  {
+#ifdef GaussInt
+    RP_out[ii] = RP_out[ii] * rex * dphi;
+    IP_out[ii] = IP_out[ii] * rex * dphi;
+#else
+    RP_out[ii] = RP_out[ii] * rex * dphi * dcostheta;
+    IP_out[ii] = IP_out[ii] * rex * dphi * dcostheta;
+#endif
+  }
+  //|------+  Communicate and sum the results from each processor.
+
+  {
+    double *RPIP_out = new double[2 * NN];
+    double *RPIP = new double[2 * NN];
+    memcpy(RPIP_out, RP_out, NN * sizeof(double));
+    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
+    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    memcpy(RP, RPIP, NN * sizeof(double));
+    memcpy(IP, RPIP + NN, NN * sizeof(double));
+    delete[] RPIP_out;
+    delete[] RPIP;
+  }
+
+  //|------= Free memory.
+
+  delete[] pox[0];
+  delete[] pox[1];
+  delete[] pox[2];
+  delete[] shellf;
+  delete[] RP_out;
+  delete[] IP_out;
+  DG_List->clearList();
+}
+//  spin weighted spinw modes of psi4 on the sphere r = rex (level lev); the sample points are
+//  divided among, and the partial sums reduced over, the ranks of Comm_here
+//  l takes from spinw to maxl; m takes from -l to l; only the first NN modes are stored in RP / IP
+void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,
+                                 int spinw, int maxl, int NN, double *RP, double *IP,
+                                 monitor *Monitor, MPI_Comm Comm_here) // NN is the length of RP and IP
+{
+  //   misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"start surface_integral::surf_Wave");
+
+  int lmyrank;
+  MPI_Comm_rank(Comm_here, &lmyrank);
+  if (lmyrank == 0 && GH->grids[lev] != 1)
+  {
+    if (Monitor->outfile)
+      Monitor->outfile << "WARNING: surface integral on multipatches" << endl;
+    else
+      cout << "WARNING: surface integral on multipatches" << endl;
+  }
+
+  const int InList = 2;
+
+  MyList<var> *DG_List = new MyList<var>(Rpsi4);
+  DG_List->insert(Ipsi4);
+
+  int n;
+  double *pox[3];
+  for (int i = 0; i < 3; i++)
+    pox[i] = new double[n_tot];
+  for (n = 0; n < n_tot; n++)
+  {
+    pox[0][n] = rex * nx_g[n];
+    pox[1][n] = rex * ny_g[n];
+    pox[2][n] = rex * nz_g[n];
+  }
+
+  double *shellf = new double[n_tot * InList];
+
+  //    misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Interp_Points");
+
+  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Comm_here);
+
+  //    misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Interp_Points");
+
+  int cpusize_here;
+  MPI_Comm_size(Comm_here, &cpusize_here);
+
+  // contiguous index range [Nmin, Nmax] of this rank; the first Lp ranks take one extra point
+  int mp, Lp, Nmin, Nmax;
+  mp = n_tot / cpusize_here;
+  Lp = n_tot - cpusize_here * mp;
+  if (Lp > lmyrank)
+  {
+    Nmin = lmyrank * mp + lmyrank;
+    Nmax = Nmin + mp;
+  }
+  else
+  {
+    Nmin = lmyrank * mp + Lp;
+    Nmax = Nmin + mp - 1;
+  }
+  //|~~~~~> Accumulate psi4 * exp(-i m phi) * thetap for every (l,m) mode.
+  double *RP_out = new double[NN];
+  double *IP_out = new double[NN];
+
+  for (int ii = 0; ii < NN; ii++)
+  {
+    RP_out[ii] = 0;
+    IP_out[ii] = 0;
+  }
+  // theta part
+  double costheta, thetap;
+  double cosmphi, sinmphi;
+  // lpsy = number of mirror images each stored point stands for under Symmetry
+  int i, j;
+  int lpsy = 0;
+  if (Symmetry == 0)
+    lpsy = 1;
+  else if (Symmetry == 1)
+    lpsy = 2;
+  else if (Symmetry == 2)
+    lpsy = 8;
+  // SoA[0..2] presumably hold the parity sign of a variable under x/y/z reflection - TODO confirm
+  double psi4RR, psi4II;
+  for (n = Nmin; n <= Nmax; n++)
+  {
+    // n is row-major over (theta, phi): i = theta index, j = phi index
+    i = int(n / N_phi); // integer division truncates
+    j = n - i * N_phi;
+
+    int countlm = 0;
+    for (int pl = spinw; pl < maxl + 1; pl++)
+      for (int pm = -pl; pm < pl + 1 && countlm < NN; pm++)
+      {
+        for (int lp = 0; lp < lpsy; lp++)
+        {
+          switch (lp)
+          {
+          case 0: //+++ (theta, phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = sin(pm * (j + 0.5) * dphi);
+            psi4RR = shellf[InList * n];
+            psi4II = shellf[InList * n + 1];
+            break;
+          case 1: //++- (pi-theta, phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = sin(pm * (j + 0.5) * dphi);
+            psi4RR = Rpsi4->SoA[2] * shellf[InList * n];
+            psi4II = Ipsi4->SoA[2] * shellf[InList * n + 1];
+            break;
+          case 2: //+-+ (theta, 2*pi-phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = -sin(pm * (j + 0.5) * dphi);
+            psi4RR = Rpsi4->SoA[1] * shellf[InList * n];
+            psi4II = Ipsi4->SoA[1] * shellf[InList * n + 1];
+            break;
+          case 3: //+-- (pi-theta, 2*pi-phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = -sin(pm * (j + 0.5) * dphi);
+            psi4RR = Rpsi4->SoA[2] * Rpsi4->SoA[1] * shellf[InList * n];
+            psi4II = Ipsi4->SoA[2] * Ipsi4->SoA[1] * shellf[InList * n + 1];
+            break;
+          case 4: //-++ (theta, pi-phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
+            psi4RR = Rpsi4->SoA[0] * shellf[InList * n];
+            psi4II = Ipsi4->SoA[0] * shellf[InList * n + 1];
+            break;
+          case 5: //-+- (pi-theta, pi-phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
+            psi4RR = Rpsi4->SoA[2] * Rpsi4->SoA[0] * shellf[InList * n];
+            psi4II = Ipsi4->SoA[2] * Ipsi4->SoA[0] * shellf[InList * n + 1];
+            break;
+          case 6: //--+ (theta, pi+phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
+            psi4RR = Rpsi4->SoA[1] * Rpsi4->SoA[0] * shellf[InList * n];
+            psi4II = Ipsi4->SoA[1] * Ipsi4->SoA[0] * shellf[InList * n + 1];
+            break;
+          case 7: //--- (pi-theta, pi+phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
+            psi4RR = Rpsi4->SoA[2] * Rpsi4->SoA[1] * Rpsi4->SoA[0] * shellf[InList * n];
+            psi4II = Ipsi4->SoA[2] * Ipsi4->SoA[1] * Ipsi4->SoA[0] * shellf[InList * n + 1];
+          }
+
+          thetap = sqrt((2 * pl + 1.0) / 4.0 / PI) * misc::Wigner_d_function(pl, pm, spinw, costheta); // note the variation from -2 to 2
+#ifdef GaussInt
+          // wtcostheta is even function respect costheta
+          RP_out[countlm] = RP_out[countlm] + thetap * (psi4RR * cosmphi + psi4II * sinmphi) * wtcostheta[i];
+          IP_out[countlm] = IP_out[countlm] + thetap * (psi4II * cosmphi - psi4RR * sinmphi) * wtcostheta[i];
+#else
+          RP_out[countlm] = RP_out[countlm] + thetap * (psi4RR * cosmphi + psi4II * sinmphi);
+          IP_out[countlm] = IP_out[countlm] + thetap * (psi4II * cosmphi - psi4RR * sinmphi);
+#endif
+        }
+        countlm++; // mode index; the pm loop guard keeps it below NN, the length of RP_out / IP_out
+      }
+  }
+
+  for (int ii = 0; ii < NN; ii++)
+  {
+#ifdef GaussInt
+    RP_out[ii] = RP_out[ii] * rex * dphi;
+    IP_out[ii] = IP_out[ii] * rex * dphi;
+#else
+    RP_out[ii] = RP_out[ii] * rex * dphi * dcostheta;
+    IP_out[ii] = IP_out[ii] * rex * dphi * dcostheta;
+#endif
+  }
+  //|------+  Communicate and sum the results from each processor.
+
+  {
+    double *RPIP_out = new double[2 * NN];
+    double *RPIP = new double[2 * NN];
+    memcpy(RPIP_out, RP_out, NN * sizeof(double));
+    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
+    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, Comm_here);
+    memcpy(RP, RPIP, NN * sizeof(double));
+    memcpy(IP, RPIP + NN, NN * sizeof(double));
+    delete[] RPIP_out;
+    delete[] RPIP;
+  }
+
+  //|------= Free memory.
+
+  delete[] pox[0];
+  delete[] pox[1];
+  delete[] pox[2];
+  delete[] shellf;
+  delete[] RP_out;
+  delete[] IP_out;
+  DG_List->clearList();
+}
+//|----------------------------------------------------------------
+//  for shell patch: spin weighted spinw modes of psi4 on the sphere r = rex, summed over MPI_COMM_WORLD
+//  l takes from spinw to maxl; m takes from -l to l; only the first NN modes are stored in RP / IP
+//  (lev and Monitor are not used by this overload)
+//|----------------------------------------------------------------
+void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH, var *Rpsi4, var *Ipsi4,
+                                 int spinw, int maxl, int NN, double *RP, double *IP,
+                                 monitor *Monitor) // NN is the length of RP and IP
+{
+  const int InList = 2;
+
+  MyList<var> *DG_List = new MyList<var>(Rpsi4);
+  DG_List->insert(Ipsi4);
+
+  int n;
+  double *pox[3];
+  for (int i = 0; i < 3; i++)
+    pox[i] = new double[n_tot];
+  for (n = 0; n < n_tot; n++)
+  {
+    pox[0][n] = rex * nx_g[n];
+    pox[1][n] = rex * ny_g[n];
+    pox[2][n] = rex * nz_g[n];
+  }
+
+  double *shellf = new double[n_tot * InList];
+
+  GH->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry);
+
+  // contiguous index range [Nmin, Nmax] of this rank; the first Lp ranks take one extra point
+  int mp, Lp, Nmin, Nmax;
+  mp = n_tot / cpusize;
+  Lp = n_tot - cpusize * mp;
+
+  if (Lp > myrank)
+  {
+    Nmin = myrank * mp + myrank;
+    Nmax = Nmin + mp;
+  }
+  else
+  {
+    Nmin = myrank * mp + Lp;
+    Nmax = Nmin + mp - 1;
+  }
+
+  //|~~~~~> Accumulate psi4 * exp(-i m phi) * thetap for every (l,m) mode.
+
+  double *RP_out = new double[NN];
+  double *IP_out = new double[NN];
+
+  for (int ii = 0; ii < NN; ii++)
+  {
+    RP_out[ii] = 0;
+    IP_out[ii] = 0;
+  }
+  // theta part
+  double costheta, thetap;
+  double cosmphi, sinmphi;
+  // lpsy = number of mirror images each stored point stands for under Symmetry
+  int i, j;
+  int lpsy = 0;
+  if (Symmetry == 0)
+    lpsy = 1;
+  else if (Symmetry == 1)
+    lpsy = 2;
+  else if (Symmetry == 2)
+    lpsy = 8;
+  // SoA[0..2] presumably hold the parity sign of a variable under x/y/z reflection - TODO confirm
+  double psi4RR, psi4II;
+  for (n = Nmin; n <= Nmax; n++)
+  {
+    // n is row-major over (theta, phi): i = theta index, j = phi index
+    i = int(n / N_phi); // integer division truncates
+    j = n - i * N_phi;
+
+    int countlm = 0;
+    for (int pl = spinw; pl < maxl + 1; pl++)
+      for (int pm = -pl; pm < pl + 1 && countlm < NN; pm++)
+      {
+        for (int lp = 0; lp < lpsy; lp++)
+        {
+          switch (lp)
+          {
+          case 0: //+++ (theta, phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = sin(pm * (j + 0.5) * dphi);
+            psi4RR = shellf[InList * n];
+            psi4II = shellf[InList * n + 1];
+            break;
+          case 1: //++- (pi-theta, phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = sin(pm * (j + 0.5) * dphi);
+            psi4RR = Rpsi4->SoA[2] * shellf[InList * n];
+            psi4II = Ipsi4->SoA[2] * shellf[InList * n + 1];
+            break;
+          case 2: //+-+ (theta, 2*pi-phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = -sin(pm * (j + 0.5) * dphi);
+            psi4RR = Rpsi4->SoA[1] * shellf[InList * n];
+            psi4II = Ipsi4->SoA[1] * shellf[InList * n + 1];
+            break;
+          case 3: //+-- (pi-theta, 2*pi-phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = -sin(pm * (j + 0.5) * dphi);
+            psi4RR = Rpsi4->SoA[2] * Rpsi4->SoA[1] * shellf[InList * n];
+            psi4II = Ipsi4->SoA[2] * Ipsi4->SoA[1] * shellf[InList * n + 1];
+            break;
+          case 4: //-++ (theta, pi-phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
+            psi4RR = Rpsi4->SoA[0] * shellf[InList * n];
+            psi4II = Ipsi4->SoA[0] * shellf[InList * n + 1];
+            break;
+          case 5: //-+- (pi-theta, pi-phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
+            psi4RR = Rpsi4->SoA[2] * Rpsi4->SoA[0] * shellf[InList * n];
+            psi4II = Ipsi4->SoA[2] * Ipsi4->SoA[0] * shellf[InList * n + 1];
+            break;
+          case 6: //--+ (theta, pi+phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
+            psi4RR = Rpsi4->SoA[1] * Rpsi4->SoA[0] * shellf[InList * n];
+            psi4II = Ipsi4->SoA[1] * Ipsi4->SoA[0] * shellf[InList * n + 1];
+            break;
+          case 7: //--- (pi-theta, pi+phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
+            psi4RR = Rpsi4->SoA[2] * Rpsi4->SoA[1] * Rpsi4->SoA[0] * shellf[InList * n];
+            psi4II = Ipsi4->SoA[2] * Ipsi4->SoA[1] * Ipsi4->SoA[0] * shellf[InList * n + 1];
+          }
+
+          thetap = sqrt((2 * pl + 1.0) / 4.0 / PI) * misc::Wigner_d_function(pl, pm, spinw, costheta); // note the variation from -2 to 2
+#ifdef GaussInt
+          // wtcostheta is even function respect costheta
+          RP_out[countlm] = RP_out[countlm] + thetap * (psi4RR * cosmphi + psi4II * sinmphi) * wtcostheta[i];
+          IP_out[countlm] = IP_out[countlm] + thetap * (psi4II * cosmphi - psi4RR * sinmphi) * wtcostheta[i];
+#else
+          RP_out[countlm] = RP_out[countlm] + thetap * (psi4RR * cosmphi + psi4II * sinmphi);
+          IP_out[countlm] = IP_out[countlm] + thetap * (psi4II * cosmphi - psi4RR * sinmphi);
+#endif
+        }
+        countlm++; // mode index; the pm loop guard keeps it below NN, the length of RP_out / IP_out
+      }
+  }
+
+  for (int ii = 0; ii < NN; ii++)
+  {
+#ifdef GaussInt
+    RP_out[ii] = RP_out[ii] * rex * dphi;
+    IP_out[ii] = IP_out[ii] * rex * dphi;
+#else
+    RP_out[ii] = RP_out[ii] * rex * dphi * dcostheta;
+    IP_out[ii] = IP_out[ii] * rex * dphi * dcostheta;
+#endif
+  }
+  //|------+  Communicate and sum the results from each processor.
+
+  {
+    double *RPIP_out = new double[2 * NN];
+    double *RPIP = new double[2 * NN];
+    memcpy(RPIP_out, RP_out, NN * sizeof(double));
+    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
+    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    memcpy(RP, RPIP, NN * sizeof(double));
+    memcpy(IP, RPIP + NN, NN * sizeof(double));
+    delete[] RPIP_out;
+    delete[] RPIP;
+  }
+
+  //|------= Free memory.
+
+  delete[] pox[0];
+  delete[] pox[1];
+  delete[] pox[2];
+  delete[] shellf;
+  delete[] RP_out;
+  delete[] IP_out;
+  DG_List->clearList();
+}
+//|----------------------------------------------------------------
+//  for shell patch
+//  for EM wave specially symmetric case
+//
+//  Evaluates f_getnpem2_point at the sample points rex*(nx_g,ny_g,nz_g)
+//  and projects its complex result (psi4RR + i*psi4II), divided by chi^2,
+//  onto sqrt((2l+1)/(4 pi)) * Wigner_d_function(l,m,spinw,cos(theta))
+//  times exp(-i*m*phi).  The sums are scaled by rex*dphi (times
+//  dcostheta when GaussInt is not defined) and added over all ranks.
+//
+//  rex     : factor applied to (nx_g,ny_g,nz_g) to get the sample
+//            coordinates (presumably the extraction radius - confirm);
+//            it also multiplies the final sums
+//  lev     : not referenced in this overload
+//  GH      : shell patch whose Interp_Points fills the point data
+//  Ex..gzz : the 13 variables handed to Interp_Points, in this order
+//  spinw   : first l of the mode loop and third argument of
+//            misc::Wigner_d_function
+//  maxl    : last l of the mode loop
+//  NN      : length of RP and IP; must be at least the number of (l,m)
+//            pairs with spinw <= l <= maxl, -l <= m <= l, otherwise the
+//            run is aborted
+//  RP, IP  : output, real and imaginary part per mode; modes are stored
+//            with l ascending and, for each l, m ascending from -l
+//  Monitor : not referenced in this overload
+//|----------------------------------------------------------------
+void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH,
+                                 var *Ex, var *Ey, var *Ez, var *Bx, var *By, var *Bz,
+                                 var *chi, var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
+                                 int spinw, int maxl, int NN, double *RP, double *IP,
+                                 monitor *Monitor) // NN is the length of RP and IP
+{
+  const int InList = 13;
+
+  // RP_out/IP_out are indexed by a running (l,m) counter in the main loop.
+  // Count the modes with the same loop bounds and stop the run if NN cannot
+  // hold them, instead of writing past the end of the buffers.
+  {
+    int nmodes = 0;
+    for (int pl = spinw; pl < maxl + 1; pl++)
+      for (int pm = -pl; pm < pl + 1; pm++)
+        nmodes++;
+    if (nmodes > NN)
+    {
+      if (myrank == 0)
+        cout << "surface_integral::surf_Wave: NN = " << NN
+             << " is smaller than the number of (l,m) modes = " << nmodes
+             << " for spinw = " << spinw << ", maxl = " << maxl << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+  }
+
+  // Variable list handed to Interp_Points; the order fixes the layout of shellf.
+  MyList<var> *DG_List = new MyList<var>(Ex);
+  DG_List->insert(Ey);
+  DG_List->insert(Ez);
+  DG_List->insert(Bx);
+  DG_List->insert(By);
+  DG_List->insert(Bz);
+  DG_List->insert(chi);
+  DG_List->insert(gxx);
+  DG_List->insert(gxy);
+  DG_List->insert(gxz);
+  DG_List->insert(gyy);
+  DG_List->insert(gyz);
+  DG_List->insert(gzz);
+
+  // Cartesian coordinates of all n_tot sample points.
+  int n;
+  double *pox[3];
+  for (int i = 0; i < 3; i++)
+    pox[i] = new double[n_tot];
+  for (n = 0; n < n_tot; n++)
+  {
+    pox[0][n] = rex * nx_g[n];
+    pox[1][n] = rex * ny_g[n];
+    pox[2][n] = rex * nz_g[n];
+  }
+
+  // shellf is read below as InList consecutive values per point
+  // (presumably the interpolated variables in DG_List order - confirm
+  // against ShellPatch::Interp_Points).
+  double *shellf;
+  shellf = new double[n_tot * InList];
+
+  GH->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry);
+
+  // Split the n_tot points into cpusize contiguous index ranges
+  // [Nmin, Nmax]; the first Lp ranks take one extra point.
+  int mp, Lp, Nmin, Nmax;
+
+  mp = n_tot / cpusize;
+  Lp = n_tot - cpusize * mp;
+
+  if (Lp > myrank)
+  {
+    Nmin = myrank * mp + myrank;
+    Nmax = Nmin + mp;
+  }
+  else
+  {
+    Nmin = myrank * mp + Lp;
+    Nmax = Nmin + mp - 1;
+  }
+
+  //|~~~~~> Integrate the dot product of Dphi with the surface normal.
+
+  // Per-rank partial sums, one entry per (l,m) mode.
+  double *RP_out, *IP_out;
+  RP_out = new double[NN];
+  IP_out = new double[NN];
+
+  for (int ii = 0; ii < NN; ii++)
+  {
+    RP_out[ii] = 0;
+    IP_out[ii] = 0;
+  }
+  // theta part
+  double costheta, thetap;
+  double cosmphi, sinmphi;
+
+  int i, j;
+  // lpsy: number of mirror images evaluated per stored point
+  // (1, 2 or 8 for Symmetry 0, 1 or 2; 0 for any other value).
+  int lpsy = 0;
+  if (Symmetry == 0)
+    lpsy = 1;
+  else if (Symmetry == 1)
+    lpsy = 2;
+  else if (Symmetry == 2)
+    lpsy = 8;
+
+  double psi4RR, psi4II;
+  double px, py, pz;
+  double pEx, pEy, pEz, pBx, pBy, pBz;
+  double pchi, pgxx, pgxy, pgxz, pgyy, pgyz, pgzz;
+  for (n = Nmin; n <= Nmax; n++)
+  {
+    //       need round off always
+    // i is the theta index, j the phi index of point n
+    i = int(n / N_phi); // int(1.723) = 1, int(-1.732) = -1
+    j = n - i * N_phi;
+
+    int countlm = 0;
+    for (int pl = spinw; pl < maxl + 1; pl++)
+      for (int pm = -pl; pm < pl + 1; pm++)
+      {
+        for (int lp = 0; lp < lpsy; lp++)
+        {
+          // Reload the stored values; each case below flips the signs
+          // belonging to its mirror image.
+          // NOTE(review): these values and the f_getnpem2_point call depend
+          // only on (n, lp), not on (pl, pm); hoisting looks possible if
+          // the routine has no side effects - confirm before changing.
+          px = pox[0][n];
+          py = pox[1][n];
+          pz = pox[2][n];
+          pEx = shellf[InList * n];
+          pEy = shellf[InList * n + 1];
+          pEz = shellf[InList * n + 2];
+          pBx = shellf[InList * n + 3];
+          pBy = shellf[InList * n + 4];
+          pBz = shellf[InList * n + 5];
+          pchi = shellf[InList * n + 6];
+          pgxx = shellf[InList * n + 7];
+          pgxy = shellf[InList * n + 8];
+          pgxz = shellf[InList * n + 9];
+          pgyy = shellf[InList * n + 10];
+          pgyz = shellf[InList * n + 11];
+          pgzz = shellf[InList * n + 12];
+          switch (lp)
+          {
+          case 0: //+++ (theta, phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = sin(pm * (j + 0.5) * dphi);
+            break;
+          case 1: //++- (pi-theta, phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = sin(pm * (j + 0.5) * dphi);
+            pz = -pz;
+            pEz = -pEz;
+            pBx = -pBx;
+            pBy = -pBy;
+            pgxz = -pgxz;
+            pgyz = -pgyz;
+            break;
+          case 2: //+-+ (theta, 2*pi-phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = -sin(pm * (j + 0.5) * dphi);
+            py = -py;
+            pEy = -pEy;
+            pBx = -pBx;
+            pBz = -pBz;
+            pgxy = -pgxy;
+            pgyz = -pgyz;
+            break;
+          case 3: //+-- (pi-theta, 2*pi-phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = -sin(pm * (j + 0.5) * dphi);
+            py = -py;
+            pz = -pz;
+            pEz = -pEz;
+            pBz = -pBz;
+            pgxz = -pgxz;
+            pEy = -pEy;
+            pBy = -pBy;
+            pgxy = -pgxy;
+            break;
+          case 4: //-++ (theta, pi-phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
+            px = -px;
+            pEx = -pEx;
+            pBy = -pBy;
+            pBz = -pBz;
+            pgxy = -pgxy;
+            pgxz = -pgxz;
+            break;
+          case 5: //-+- (pi-theta, pi-phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
+            pz = -pz;
+            px = -px;
+            pEz = -pEz;
+            pBz = -pBz;
+            pgyz = -pgyz;
+            pEx = -pEx;
+            pBx = -pBx;
+            pgxy = -pgxy;
+            break;
+          case 6: //--+ (theta, pi+phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
+            px = -px;
+            py = -py;
+            pEx = -pEx;
+            pBx = -pBx;
+            pgxz = -pgxz;
+            pEy = -pEy;
+            pBy = -pBy;
+            pgyz = -pgyz;
+            break;
+          case 7: //--- (pi-theta, pi+phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
+            px = -px;
+            py = -py;
+            pz = -pz;
+            pEx = -pEx;
+            pEy = -pEy;
+            pEz = -pEz;
+          }
+
+          f_getnpem2_point(px, py, pz, pchi, pgxx, pgxy, pgxz, pgyy, pgyz, pgzz, pEx, pEy, pEz, pBx, pBy, pBz,
+                           psi4RR, psi4II);
+          thetap = sqrt((2 * pl + 1.0) / 4.0 / PI) * misc::Wigner_d_function(pl, pm, spinw, costheta); // note the variation from -2 to 2
+
+          //	 find back the one
+          pchi = pchi + 1;
+          // (psi4RR + i*psi4II) * (cosmphi - i*sinmphi), weighted by thetap / pchi^2
+#ifdef GaussInt
+          // wtcostheta is even function respect costheta
+          RP_out[countlm] = RP_out[countlm] + thetap / pchi / pchi * (psi4RR * cosmphi + psi4II * sinmphi) * wtcostheta[i];
+          IP_out[countlm] = IP_out[countlm] + thetap / pchi / pchi * (psi4II * cosmphi - psi4RR * sinmphi) * wtcostheta[i];
+#else
+          RP_out[countlm] = RP_out[countlm] + thetap / pchi / pchi * (psi4RR * cosmphi + psi4II * sinmphi);
+          IP_out[countlm] = IP_out[countlm] + thetap / pchi / pchi * (psi4II * cosmphi - psi4RR * sinmphi);
+#endif
+        }
+        countlm++; // stays below NN: the mode count was checked at the top of the function
+      }
+  }
+
+  // Apply the common factors of the quadrature.
+  for (int ii = 0; ii < NN; ii++)
+  {
+#ifdef GaussInt
+    RP_out[ii] = RP_out[ii] * rex * dphi;
+    IP_out[ii] = IP_out[ii] * rex * dphi;
+#else
+    RP_out[ii] = RP_out[ii] * rex * dphi * dcostheta;
+    IP_out[ii] = IP_out[ii] * rex * dphi * dcostheta;
+#endif
+  }
+  //|------+  Communicate and sum the results from each processor.
+
+  // Real and imaginary parts are packed into one buffer so that a single
+  // MPI_Allreduce call sums both.
+  {
+    double *RPIP_out = new double[2 * NN];
+    double *RPIP = new double[2 * NN];
+    memcpy(RPIP_out, RP_out, NN * sizeof(double));
+    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
+    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    memcpy(RP, RPIP, NN * sizeof(double));
+    memcpy(IP, RPIP + NN, NN * sizeof(double));
+    delete[] RPIP_out;
+    delete[] RPIP;
+  }
+
+  //|------= Free memory.
+
+  delete[] pox[0];
+  delete[] pox[1];
+  delete[] pox[2];
+  delete[] shellf;
+  delete[] RP_out;
+  delete[] IP_out;
+  DG_List->clearList();
+}
+//|----------------------------------------------------------------
+//  for shell patch
+//  for EM wave specially symmetric case
+//  unify for phi1 and phi2
+//
+//  Evaluates the caller-supplied point routine funcs at the sample
+//  points rex*(nx_g,ny_g,nz_g) and projects its complex result
+//  (psi4RR + i*psi4II), divided by chi^2, onto
+//  sqrt((2l+1)/(4 pi)) * Wigner_d_function(l,m,spinw,cos(theta))
+//  times exp(-i*m*phi).  The sums are scaled by rex*dphi (times
+//  dcostheta when GaussInt is not defined) and added over all ranks.
+//
+//  rex     : factor applied to (nx_g,ny_g,nz_g) to get the sample
+//            coordinates (presumably the extraction radius - confirm);
+//            it also multiplies the final sums
+//  lev     : not referenced in this overload
+//  GH      : shell patch whose Interp_Points fills the point data
+//  Ex..gzz : the 13 variables handed to Interp_Points, in this order
+//  spinw   : first l of the mode loop and third argument of
+//            misc::Wigner_d_function
+//  maxl    : last l of the mode loop
+//  NN      : length of RP and IP; must be at least the number of (l,m)
+//            pairs with spinw <= l <= maxl, -l <= m <= l, otherwise the
+//            run is aborted
+//  RP, IP  : output, real and imaginary part per mode; modes are stored
+//            with l ascending and, for each l, m ascending from -l
+//  Monitor : not referenced in this overload
+//  funcs   : point routine called as
+//            funcs(x,y,z, chi,gxx,gxy,gxz,gyy,gyz,gzz, Ex,Ey,Ez, Bx,By,Bz,
+//                  outRe,outIm); the last two arguments receive the result
+//|----------------------------------------------------------------
+void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH,
+                                 var *Ex, var *Ey, var *Ez, var *Bx, var *By, var *Bz,
+                                 var *chi, var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
+                                 int spinw, int maxl, int NN, double *RP, double *IP,
+                                 monitor *Monitor,
+                                 void (*funcs)(double &, double &, double &,
+                                               double &, double &, double &, double &, double &, double &, double &,
+                                               double &, double &, double &, double &, double &, double &,
+                                               double &, double &)) // NN is the length of RP and IP
+{
+  const int InList = 13;
+
+  // RP_out/IP_out are indexed by a running (l,m) counter in the main loop.
+  // Count the modes with the same loop bounds and stop the run if NN cannot
+  // hold them, instead of writing past the end of the buffers.
+  {
+    int nmodes = 0;
+    for (int pl = spinw; pl < maxl + 1; pl++)
+      for (int pm = -pl; pm < pl + 1; pm++)
+        nmodes++;
+    if (nmodes > NN)
+    {
+      if (myrank == 0)
+        cout << "surface_integral::surf_Wave: NN = " << NN
+             << " is smaller than the number of (l,m) modes = " << nmodes
+             << " for spinw = " << spinw << ", maxl = " << maxl << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+  }
+
+  // Variable list handed to Interp_Points; the order fixes the layout of shellf.
+  MyList<var> *DG_List = new MyList<var>(Ex);
+  DG_List->insert(Ey);
+  DG_List->insert(Ez);
+  DG_List->insert(Bx);
+  DG_List->insert(By);
+  DG_List->insert(Bz);
+  DG_List->insert(chi);
+  DG_List->insert(gxx);
+  DG_List->insert(gxy);
+  DG_List->insert(gxz);
+  DG_List->insert(gyy);
+  DG_List->insert(gyz);
+  DG_List->insert(gzz);
+
+  // Cartesian coordinates of all n_tot sample points.
+  int n;
+  double *pox[3];
+  for (int i = 0; i < 3; i++)
+    pox[i] = new double[n_tot];
+  for (n = 0; n < n_tot; n++)
+  {
+    pox[0][n] = rex * nx_g[n];
+    pox[1][n] = rex * ny_g[n];
+    pox[2][n] = rex * nz_g[n];
+  }
+
+  // shellf is read below as InList consecutive values per point
+  // (presumably the interpolated variables in DG_List order - confirm
+  // against ShellPatch::Interp_Points).
+  double *shellf;
+  shellf = new double[n_tot * InList];
+
+  GH->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry);
+
+  // Per-rank partial sums, one entry per (l,m) mode.
+  double *RP_out, *IP_out;
+  RP_out = new double[NN];
+  IP_out = new double[NN];
+
+  for (int ii = 0; ii < NN; ii++)
+  {
+    RP_out[ii] = 0;
+    IP_out[ii] = 0;
+  }
+
+  // The "#if 0" branch is a disabled serial debug variant that runs on rank 0
+  // only and ends in MPI_Abort; the "#else" branch is the code that is compiled.
+#if 0
+// for debug    
+  if(myrank==0)
+  {
+    double costheta, thetap;
+    double cosmphi,sinmphi;
+
+    int i,j;
+    int lpsy=0;
+         if( Symmetry == 0 )     lpsy=1;
+    else if( Symmetry == 1 )     lpsy=2;
+    else if( Symmetry == 2 )     lpsy=8;
+
+    double psi4RR,psi4II;
+    double px,py,pz;
+    double pEx,pEy,pEz,pBx,pBy,pBz;
+    double pchi,pgxx,pgxy,pgxz,pgyy,pgyz,pgzz;
+    for( n = 0; n <= n_tot-1; n++) 
+     {
+//       need round off always	     
+        i = int(n/N_phi); // int(1.723) = 1, int(-1.732) = -1
+        j = n - i * N_phi;
+        
+	for(int lp=0;lp<lpsy;lp++)
+	{
+         px = pox[0][n];		 
+         py = pox[1][n];		 
+         pz = pox[2][n];		 
+	 pEx = shellf[InList*n  ];
+	 pEy = shellf[InList*n+1];
+	 pEz = shellf[InList*n+2];
+	 pBx = shellf[InList*n+3];
+	 pBy = shellf[InList*n+4];
+	 pBz = shellf[InList*n+5];
+	 pchi = shellf[InList*n+6];
+	 pgxx = shellf[InList*n+7];
+	 pgxy = shellf[InList*n+8];
+	 pgxz = shellf[InList*n+9];
+	 pgyy = shellf[InList*n+10];
+	 pgyz = shellf[InList*n+11];
+	 pgzz = shellf[InList*n+12];
+ 	 switch(lp)
+	 {
+	  case 1:  //++- (pi-theta, phi)
+	  pz = -pz;
+	  pEz = -pEz;
+	  pBx = -pBx;
+	  pBy = -pBy;
+	  pgxz = -pgxz;
+	  pgyz = -pgyz;
+	  break;
+	  case 2:  //+-+ (theta, 2*pi-phi)
+	  py = -py;
+	  pEy = -pEy;
+	  pBx = -pBx;
+	  pBz = -pBz;
+	  pgxy = -pgxy;
+	  pgyz = -pgyz;
+	  break;
+	  case 3:  //+-- (pi-theta, 2*pi-phi)
+	  py = -py;
+	  pz = -pz;
+	  pEz = -pEz;
+	  pBz = -pBz;;
+	  pgxz = -pgxz;
+	  pEy = -pEy;
+	  pBy = -pBy;
+	  pgxy = -pgxy;
+	  break;
+	  case 4:  //-++ (theta, pi-phi)
+	  px = -px;
+	  pEx = -pEx;
+	  pBy = -pBy;
+	  pBz = -pBz;
+	  pgxy = -pgxy;
+	  pgxz = -pgxz;
+	  break;
+	  case 5:  //-+- (pi-theta, pi-phi)
+	  pz = -pz;
+	  px = -px;
+	  pEz = -pEz;
+	  pBz = -pBz;
+	  pgyz = -pgyz;
+	  pEx = -pEx;
+	  pBx = -pBx;
+	  pgxy = -pgxy;
+	  break;
+	  case 6:  //--+ (theta, pi+phi)
+	  px = -px;
+	  py = -py;
+	  pEx = -pEx;
+	  pBx = -pBx;
+	  pgxz = -pgxz;
+	  pEy = -pEy;
+	  pBy = -pBy;
+	  pgyz = -pgyz;
+	  break;
+	  case 7:  //--- (pi-theta, pi+phi)
+	  px = -px;
+	  py = -py;
+	  pz = -pz;
+	  pEx = -pEx;
+	  pEy = -pEy;
+	  pEz = -pEz;
+	 }
+	  
+	 funcs(px,py,pz,pchi,pgxx,pgxy,pgxz,pgyy,pgyz,pgzz,pEx,pEy,pEz,pBx,pBy,pBz,
+			  psi4RR,psi4II);
+//	 if(n==0 || n==N_phi/2-1 || n==N_phi/2 || n==N_phi-1 ||
+//	    n==N_phi*(N_theta-1)+0 || n==N_phi*(N_theta-1)+N_phi/2-1 || n==N_phi*(N_theta-1)+N_phi/2 || n==N_phi*(N_theta-1)+N_phi-1)
+//	 cout<<px<<","<<py<<","<<pz<<","<<pchi<<","<<pgxx<<","<<pgxy<<","<<pgxz<<","<<pgyy<<","<<pgyz<<","<<pgzz<<","<<pEx<<","
+//	     <<pEy<<","<<pEz<<","<<pBx<<","<<pBy<<","<<pBz<<","<<psi4RR<<","<<psi4II<<endl<<endl;
+
+// find back the one
+        pchi = pchi+1;
+
+	int countlm=0;
+	for(int pl=spinw;pl<maxl+1;pl++)
+          for(int pm=-pl;pm<pl+1;pm++)
+	  {
+ 	 switch(lp)
+	 {
+	  case 0:  //+++ (theta, phi)
+          costheta = arcostheta[i];
+	  cosmphi = cos(pm * (j+0.5) * dphi);
+	  sinmphi = sin(pm * (j+0.5) * dphi);
+	  break;
+	  case 1:  //++- (pi-theta, phi)
+          costheta = -arcostheta[i];
+	  cosmphi = cos(pm * (j+0.5) * dphi);
+	  sinmphi = sin(pm * (j+0.5) * dphi);
+	  break;
+	  case 2:  //+-+ (theta, 2*pi-phi)
+          costheta = arcostheta[i];
+	  cosmphi = cos(pm * (j+0.5) * dphi);
+	  sinmphi =-sin(pm * (j+0.5) * dphi);
+	  break;
+	  case 3:  //+-- (pi-theta, 2*pi-phi)
+          costheta = -arcostheta[i];
+	  cosmphi = cos(pm * (j+0.5) * dphi);
+	  sinmphi =-sin(pm * (j+0.5) * dphi);
+	  break;
+	  case 4:  //-++ (theta, pi-phi)
+          costheta = arcostheta[i];
+	  cosmphi = cos(pm * (PI - (j+0.5) * dphi));
+	  sinmphi = sin(pm * (PI - (j+0.5) * dphi));
+	  break;
+	  case 5:  //-+- (pi-theta, pi-phi)
+          costheta = -arcostheta[i];
+ 	  cosmphi = cos(pm * (PI - (j+0.5) * dphi));
+	  sinmphi = sin(pm * (PI - (j+0.5) * dphi));
+	  break;
+	  case 6:  //--+ (theta, pi+phi)
+          costheta = arcostheta[i];
+	  cosmphi = cos(pm * (PI + (j+0.5) * dphi));
+	  sinmphi = sin(pm * (PI + (j+0.5) * dphi));
+	  break;
+	  case 7:  //--- (pi-theta, pi+phi)
+          costheta = -arcostheta[i];
+	  cosmphi = cos(pm * (PI + (j+0.5) * dphi));
+	  sinmphi = sin(pm * (PI + (j+0.5) * dphi));
+	 }
+   	    thetap = sqrt((2*pl+1.0)/4.0/PI)*misc::Wigner_d_function(pl,pm,spinw,costheta); //note the variation from -2 to 2
+
+#ifdef GaussInt
+// wtcostheta is even function respect costheta
+            RP_out[countlm] = RP_out[countlm] + thetap/pchi/pchi * (psi4RR * cosmphi + psi4II * sinmphi)*wtcostheta[i];
+  	    IP_out[countlm] = IP_out[countlm] + thetap/pchi/pchi * (psi4II * cosmphi - psi4RR * sinmphi)*wtcostheta[i];
+	    if(pl==2 && pm==0) cout<<countlm+1<<","<<RP_out[countlm] * rex * dphi<<endl;
+#else	 
+            RP_out[countlm] = RP_out[countlm] + thetap/pchi/pchi * (psi4RR * cosmphi + psi4II * sinmphi);
+  	    IP_out[countlm] = IP_out[countlm] + thetap/pchi/pchi * (psi4II * cosmphi - psi4RR * sinmphi);
+#endif	 
+  	    countlm++;  //no sanity check for countlm and NN which should be noted in the input parameters
+	  }
+	}
+//        if(Symmetry == 2) MPI_Abort(MPI_COMM_WORLD,1);
+     }
+     MPI_Abort(MPI_COMM_WORLD,1);
+  }
+#else
+  // Split the n_tot points into cpusize contiguous index ranges
+  // [Nmin, Nmax]; the first Lp ranks take one extra point.
+  int mp, Lp, Nmin, Nmax;
+
+  mp = n_tot / cpusize;
+  Lp = n_tot - cpusize * mp;
+
+  if (Lp > myrank)
+  {
+    Nmin = myrank * mp + myrank;
+    Nmax = Nmin + mp;
+  }
+  else
+  {
+    Nmin = myrank * mp + Lp;
+    Nmax = Nmin + mp - 1;
+  }
+
+  // theta part
+  double costheta, thetap;
+  double cosmphi, sinmphi;
+
+  int i, j;
+  // lpsy: number of mirror images evaluated per stored point
+  // (1, 2 or 8 for Symmetry 0, 1 or 2; 0 for any other value).
+  int lpsy = 0;
+  if (Symmetry == 0)
+    lpsy = 1;
+  else if (Symmetry == 1)
+    lpsy = 2;
+  else if (Symmetry == 2)
+    lpsy = 8;
+
+  double psi4RR, psi4II;
+  double px, py, pz;
+  double pEx, pEy, pEz, pBx, pBy, pBz;
+  double pchi, pgxx, pgxy, pgxz, pgyy, pgyz, pgzz;
+  for (n = Nmin; n <= Nmax; n++)
+  {
+    //       need round off always
+    // i is the theta index, j the phi index of point n
+    i = int(n / N_phi); // int(1.723) = 1, int(-1.732) = -1
+    j = n - i * N_phi;
+
+    int countlm = 0;
+    for (int pl = spinw; pl < maxl + 1; pl++)
+      for (int pm = -pl; pm < pl + 1; pm++)
+      {
+        for (int lp = 0; lp < lpsy; lp++)
+        {
+          // Reload the stored values; each case below flips the signs
+          // belonging to its mirror image.
+          // NOTE(review): these values and the funcs call depend only on
+          // (n, lp), not on (pl, pm); the disabled debug branch above calls
+          // funcs once per (n, lp).  Hoisting looks possible if funcs has
+          // no side effects - confirm before changing.
+          px = pox[0][n];
+          py = pox[1][n];
+          pz = pox[2][n];
+          pEx = shellf[InList * n];
+          pEy = shellf[InList * n + 1];
+          pEz = shellf[InList * n + 2];
+          pBx = shellf[InList * n + 3];
+          pBy = shellf[InList * n + 4];
+          pBz = shellf[InList * n + 5];
+          pchi = shellf[InList * n + 6];
+          pgxx = shellf[InList * n + 7];
+          pgxy = shellf[InList * n + 8];
+          pgxz = shellf[InList * n + 9];
+          pgyy = shellf[InList * n + 10];
+          pgyz = shellf[InList * n + 11];
+          pgzz = shellf[InList * n + 12];
+          switch (lp)
+          {
+          case 0: //+++ (theta, phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = sin(pm * (j + 0.5) * dphi);
+            break;
+          case 1: //++- (pi-theta, phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = sin(pm * (j + 0.5) * dphi);
+            pz = -pz;
+            pEz = -pEz;
+            pBx = -pBx;
+            pBy = -pBy;
+            pgxz = -pgxz;
+            pgyz = -pgyz;
+            break;
+          case 2: //+-+ (theta, 2*pi-phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = -sin(pm * (j + 0.5) * dphi);
+            py = -py;
+            pEy = -pEy;
+            pBx = -pBx;
+            pBz = -pBz;
+            pgxy = -pgxy;
+            pgyz = -pgyz;
+            break;
+          case 3: //+-- (pi-theta, 2*pi-phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = -sin(pm * (j + 0.5) * dphi);
+            py = -py;
+            pz = -pz;
+            pEz = -pEz;
+            pBz = -pBz;
+            pgxz = -pgxz;
+            pEy = -pEy;
+            pBy = -pBy;
+            pgxy = -pgxy;
+            break;
+          case 4: //-++ (theta, pi-phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
+            px = -px;
+            pEx = -pEx;
+            pBy = -pBy;
+            pBz = -pBz;
+            pgxy = -pgxy;
+            pgxz = -pgxz;
+            break;
+          case 5: //-+- (pi-theta, pi-phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
+            pz = -pz;
+            px = -px;
+            pEz = -pEz;
+            pBz = -pBz;
+            pgyz = -pgyz;
+            pEx = -pEx;
+            pBx = -pBx;
+            pgxy = -pgxy;
+            break;
+          case 6: //--+ (theta, pi+phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
+            px = -px;
+            py = -py;
+            pEx = -pEx;
+            pBx = -pBx;
+            pgxz = -pgxz;
+            pEy = -pEy;
+            pBy = -pBy;
+            pgyz = -pgyz;
+            break;
+          case 7: //--- (pi-theta, pi+phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
+            px = -px;
+            py = -py;
+            pz = -pz;
+            pEx = -pEx;
+            pEy = -pEy;
+            pEz = -pEz;
+          }
+
+          funcs(px, py, pz, pchi, pgxx, pgxy, pgxz, pgyy, pgyz, pgzz, pEx, pEy, pEz, pBx, pBy, pBz,
+                psi4RR, psi4II);
+          thetap = sqrt((2 * pl + 1.0) / 4.0 / PI) * misc::Wigner_d_function(pl, pm, spinw, costheta); // note the variation from -2 to 2
+
+          //	 find back the one
+          pchi = pchi + 1;
+          // (psi4RR + i*psi4II) * (cosmphi - i*sinmphi), weighted by thetap / pchi^2
+#ifdef GaussInt
+          // wtcostheta is even function respect costheta
+          RP_out[countlm] = RP_out[countlm] + thetap / pchi / pchi * (psi4RR * cosmphi + psi4II * sinmphi) * wtcostheta[i];
+          IP_out[countlm] = IP_out[countlm] + thetap / pchi / pchi * (psi4II * cosmphi - psi4RR * sinmphi) * wtcostheta[i];
+#else
+          RP_out[countlm] = RP_out[countlm] + thetap / pchi / pchi * (psi4RR * cosmphi + psi4II * sinmphi);
+          IP_out[countlm] = IP_out[countlm] + thetap / pchi / pchi * (psi4II * cosmphi - psi4RR * sinmphi);
+#endif
+        }
+        countlm++; // stays below NN: the mode count was checked at the top of the function
+      }
+  }
+#endif
+
+  // Apply the common factors of the quadrature.
+  for (int ii = 0; ii < NN; ii++)
+  {
+#ifdef GaussInt
+    RP_out[ii] = RP_out[ii] * rex * dphi;
+    IP_out[ii] = IP_out[ii] * rex * dphi;
+#else
+    RP_out[ii] = RP_out[ii] * rex * dphi * dcostheta;
+    IP_out[ii] = IP_out[ii] * rex * dphi * dcostheta;
+#endif
+  }
+  //|------+  Communicate and sum the results from each processor.
+
+  // Real and imaginary parts are packed into one buffer so that a single
+  // MPI_Allreduce call sums both.
+  {
+    double *RPIP_out = new double[2 * NN];
+    double *RPIP = new double[2 * NN];
+    memcpy(RPIP_out, RP_out, NN * sizeof(double));
+    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
+    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    memcpy(RP, RPIP, NN * sizeof(double));
+    memcpy(IP, RPIP + NN, NN * sizeof(double));
+    delete[] RPIP_out;
+    delete[] RPIP;
+  }
+
+  //|------= Free memory.
+
+  delete[] pox[0];
+  delete[] pox[1];
+  delete[] pox[2];
+  delete[] shellf;
+  delete[] RP_out;
+  delete[] IP_out;
+  DG_List->clearList();
+}
+//|----------------------------------------------------------------
+//  for box
+//  for EM wave specially symmetric case
+//  unify for phi1 and phi2
+//|----------------------------------------------------------------
+void surface_integral::surf_Wave(double rex, int lev, cgh *GH,
+                                 var *Ex, var *Ey, var *Ez, var *Bx, var *By, var *Bz,
+                                 var *chi, var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
+                                 int spinw, int maxl, int NN, double *RP, double *IP,
+                                 monitor *Monitor,
+                                 void (*funcs)(double &, double &, double &,
+                                               double &, double &, double &, double &, double &, double &, double &,
+                                               double &, double &, double &, double &, double &, double &,
+                                               double &, double &)) // NN is the length of RP and IP
+{
+  const int InList = 13;
+
+  MyList<var> *DG_List = new MyList<var>(Ex);
+  DG_List->insert(Ey);
+  DG_List->insert(Ez);
+  DG_List->insert(Bx);
+  DG_List->insert(By);
+  DG_List->insert(Bz);
+  DG_List->insert(chi);
+  DG_List->insert(gxx);
+  DG_List->insert(gxy);
+  DG_List->insert(gxz);
+  DG_List->insert(gyy);
+  DG_List->insert(gyz);
+  DG_List->insert(gzz);
+
+  int n;
+  double *pox[3];
+  for (int i = 0; i < 3; i++)
+    pox[i] = new double[n_tot];
+  for (n = 0; n < n_tot; n++)
+  {
+    pox[0][n] = rex * nx_g[n];
+    pox[1][n] = rex * ny_g[n];
+    pox[2][n] = rex * nz_g[n];
+  }
+
+  double *shellf;
+  shellf = new double[n_tot * InList];
+
+  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry);
+
+  double *RP_out, *IP_out;
+  RP_out = new double[NN];
+  IP_out = new double[NN];
+
+  for (int ii = 0; ii < NN; ii++)
+  {
+    RP_out[ii] = 0;
+    IP_out[ii] = 0;
+  }
+
+#if 0
+// for debug    
+  if(myrank==0)
+  {
+    double costheta, thetap;
+    double cosmphi,sinmphi;
+
+    int i,j;
+    int lpsy=0;
+         if( Symmetry == 0 )     lpsy=1;
+    else if( Symmetry == 1 )     lpsy=2;
+    else if( Symmetry == 2 )     lpsy=8;
+
+    double psi4RR,psi4II;
+    double px,py,pz;
+    double pEx,pEy,pEz,pBx,pBy,pBz;
+    double pchi,pgxx,pgxy,pgxz,pgyy,pgyz,pgzz;
+    for( n = 0; n <= n_tot-1; n++) 
+     {
+//       need round off always	     
+        i = int(n/N_phi); // int(1.723) = 1, int(-1.732) = -1
+        j = n - i * N_phi;
+        
+	for(int lp=0;lp<lpsy;lp++)
+	{
+         px = pox[0][n];		 
+         py = pox[1][n];		 
+         pz = pox[2][n];		 
+	 pEx = shellf[InList*n  ];
+	 pEy = shellf[InList*n+1];
+	 pEz = shellf[InList*n+2];
+	 pBx = shellf[InList*n+3];
+	 pBy = shellf[InList*n+4];
+	 pBz = shellf[InList*n+5];
+	 pchi = shellf[InList*n+6];
+	 pgxx = shellf[InList*n+7];
+	 pgxy = shellf[InList*n+8];
+	 pgxz = shellf[InList*n+9];
+	 pgyy = shellf[InList*n+10];
+	 pgyz = shellf[InList*n+11];
+	 pgzz = shellf[InList*n+12];
+ 	 switch(lp)
+	 {
+	  case 1:  //++- (pi-theta, phi)
+	  pz = -pz;
+	  pEz = -pEz;
+	  pBx = -pBx;
+	  pBy = -pBy;
+	  pgxz = -pgxz;
+	  pgyz = -pgyz;
+	  break;
+	  case 2:  //+-+ (theta, 2*pi-phi)
+	  py = -py;
+	  pEy = -pEy;
+	  pBx = -pBx;
+	  pBz = -pBz;
+	  pgxy = -pgxy;
+	  pgyz = -pgyz;
+	  break;
+	  case 3:  //+-- (pi-theta, 2*pi-phi)
+	  py = -py;
+	  pz = -pz;
+	  pEz = -pEz;
+	  pBz = -pBz;;
+	  pgxz = -pgxz;
+	  pEy = -pEy;
+	  pBy = -pBy;
+	  pgxy = -pgxy;
+	  break;
+	  case 4:  //-++ (theta, pi-phi)
+	  px = -px;
+	  pEx = -pEx;
+	  pBy = -pBy;
+	  pBz = -pBz;
+	  pgxy = -pgxy;
+	  pgxz = -pgxz;
+	  break;
+	  case 5:  //-+- (pi-theta, pi-phi)
+	  pz = -pz;
+	  px = -px;
+	  pEz = -pEz;
+	  pBz = -pBz;
+	  pgyz = -pgyz;
+	  pEx = -pEx;
+	  pBx = -pBx;
+	  pgxy = -pgxy;
+	  break;
+	  case 6:  //--+ (theta, pi+phi)
+	  px = -px;
+	  py = -py;
+	  pEx = -pEx;
+	  pBx = -pBx;
+	  pgxz = -pgxz;
+	  pEy = -pEy;
+	  pBy = -pBy;
+	  pgyz = -pgyz;
+	  break;
+	  case 7:  //--- (pi-theta, pi+phi)
+	  px = -px;
+	  py = -py;
+	  pz = -pz;
+	  pEx = -pEx;
+	  pEy = -pEy;
+	  pEz = -pEz;
+	 }
+	  
+	 funcs(px,py,pz,pchi,pgxx,pgxy,pgxz,pgyy,pgyz,pgzz,pEx,pEy,pEz,pBx,pBy,pBz,
+			  psi4RR,psi4II);
+//	 if(n==0 || n==N_phi/2-1 || n==N_phi/2 || n==N_phi-1 ||
+//	    n==N_phi*(N_theta-1)+0 || n==N_phi*(N_theta-1)+N_phi/2-1 || n==N_phi*(N_theta-1)+N_phi/2 || n==N_phi*(N_theta-1)+N_phi-1)
+//	 cout<<px<<","<<py<<","<<pz<<","<<pchi<<","<<pgxx<<","<<pgxy<<","<<pgxz<<","<<pgyy<<","<<pgyz<<","<<pgzz<<","<<pEx<<","
+//	     <<pEy<<","<<pEz<<","<<pBx<<","<<pBy<<","<<pBz<<","<<psi4RR<<","<<psi4II<<endl<<endl;
+
+// find back the one
+        pchi = pchi+1;
+
+	int countlm=0;
+	for(int pl=spinw;pl<maxl+1;pl++)
+          for(int pm=-pl;pm<pl+1;pm++)
+	  {
+ 	 switch(lp)
+	 {
+	  case 0:  //+++ (theta, phi)
+          costheta = arcostheta[i];
+	  cosmphi = cos(pm * (j+0.5) * dphi);
+	  sinmphi = sin(pm * (j+0.5) * dphi);
+	  break;
+	  case 1:  //++- (pi-theta, phi)
+          costheta = -arcostheta[i];
+	  cosmphi = cos(pm * (j+0.5) * dphi);
+	  sinmphi = sin(pm * (j+0.5) * dphi);
+	  break;
+	  case 2:  //+-+ (theta, 2*pi-phi)
+          costheta = arcostheta[i];
+	  cosmphi = cos(pm * (j+0.5) * dphi);
+	  sinmphi =-sin(pm * (j+0.5) * dphi);
+	  break;
+	  case 3:  //+-- (pi-theta, 2*pi-phi)
+          costheta = -arcostheta[i];
+	  cosmphi = cos(pm * (j+0.5) * dphi);
+	  sinmphi =-sin(pm * (j+0.5) * dphi);
+	  break;
+	  case 4:  //-++ (theta, pi-phi)
+          costheta = arcostheta[i];
+	  cosmphi = cos(pm * (PI - (j+0.5) * dphi));
+	  sinmphi = sin(pm * (PI - (j+0.5) * dphi));
+	  break;
+	  case 5:  //-+- (pi-theta, pi-phi)
+          costheta = -arcostheta[i];
+ 	  cosmphi = cos(pm * (PI - (j+0.5) * dphi));
+	  sinmphi = sin(pm * (PI - (j+0.5) * dphi));
+	  break;
+	  case 6:  //--+ (theta, pi+phi)
+          costheta = arcostheta[i];
+	  cosmphi = cos(pm * (PI + (j+0.5) * dphi));
+	  sinmphi = sin(pm * (PI + (j+0.5) * dphi));
+	  break;
+	  case 7:  //--- (pi-theta, pi+phi)
+          costheta = -arcostheta[i];
+	  cosmphi = cos(pm * (PI + (j+0.5) * dphi));
+	  sinmphi = sin(pm * (PI + (j+0.5) * dphi));
+	 }
+   	    thetap = sqrt((2*pl+1.0)/4.0/PI)*misc::Wigner_d_function(pl,pm,spinw,costheta); //note the variation from -2 to 2
+
+#ifdef GaussInt
+// wtcostheta is even function respect costheta
+            RP_out[countlm] = RP_out[countlm] + thetap/pchi/pchi * (psi4RR * cosmphi + psi4II * sinmphi)*wtcostheta[i];
+  	    IP_out[countlm] = IP_out[countlm] + thetap/pchi/pchi * (psi4II * cosmphi - psi4RR * sinmphi)*wtcostheta[i];
+	    if(pl==2 && pm==0) cout<<countlm+1<<","<<RP_out[countlm] * rex * dphi<<endl;
+#else	 
+            RP_out[countlm] = RP_out[countlm] + thetap/pchi/pchi * (psi4RR * cosmphi + psi4II * sinmphi);
+  	    IP_out[countlm] = IP_out[countlm] + thetap/pchi/pchi * (psi4II * cosmphi - psi4RR * sinmphi);
+#endif	 
+  	    countlm++;  //no sanity check for countlm and NN which should be noted in the input parameters
+	  }
+	}
+//        if(Symmetry == 2) MPI_Abort(MPI_COMM_WORLD,1);
+     }
+     MPI_Abort(MPI_COMM_WORLD,1);
+  }
+#else
+  int mp, Lp, Nmin, Nmax;
+
+  mp = n_tot / cpusize;
+  Lp = n_tot - cpusize * mp;
+
+  if (Lp > myrank)
+  {
+    Nmin = myrank * mp + myrank;
+    Nmax = Nmin + mp;
+  }
+  else
+  {
+    Nmin = myrank * mp + Lp;
+    Nmax = Nmin + mp - 1;
+  }
+
+  // theta part
+  double costheta, thetap;
+  double cosmphi, sinmphi;
+
+  int i, j;
+  int lpsy = 0;
+  if (Symmetry == 0)
+    lpsy = 1;
+  else if (Symmetry == 1)
+    lpsy = 2;
+  else if (Symmetry == 2)
+    lpsy = 8;
+
+  double psi4RR, psi4II;
+  double px, py, pz;
+  double pEx, pEy, pEz, pBx, pBy, pBz;
+  double pchi, pgxx, pgxy, pgxz, pgyy, pgyz, pgzz;
+  for (n = Nmin; n <= Nmax; n++)
+  {
+    //       need round off always
+    i = int(n / N_phi); // int(1.723) = 1, int(-1.732) = -1
+    j = n - i * N_phi;
+
+    int countlm = 0;
+    for (int pl = spinw; pl < maxl + 1; pl++)
+      for (int pm = -pl; pm < pl + 1; pm++)
+      {
+        for (int lp = 0; lp < lpsy; lp++)
+        {
+          px = pox[0][n];
+          py = pox[1][n];
+          pz = pox[2][n];
+          pEx = shellf[InList * n];
+          pEy = shellf[InList * n + 1];
+          pEz = shellf[InList * n + 2];
+          pBx = shellf[InList * n + 3];
+          pBy = shellf[InList * n + 4];
+          pBz = shellf[InList * n + 5];
+          pchi = shellf[InList * n + 6];
+          pgxx = shellf[InList * n + 7];
+          pgxy = shellf[InList * n + 8];
+          pgxz = shellf[InList * n + 9];
+          pgyy = shellf[InList * n + 10];
+          pgyz = shellf[InList * n + 11];
+          pgzz = shellf[InList * n + 12];
+          switch (lp)
+          {
+          case 0: //+++ (theta, phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = sin(pm * (j + 0.5) * dphi);
+            break;
+          case 1: //++- (pi-theta, phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = sin(pm * (j + 0.5) * dphi);
+            pz = -pz;
+            pEz = -pEz;
+            pBx = -pBx;
+            pBy = -pBy;
+            pgxz = -pgxz;
+            pgyz = -pgyz;
+            break;
+          case 2: //+-+ (theta, 2*pi-phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = -sin(pm * (j + 0.5) * dphi);
+            py = -py;
+            pEy = -pEy;
+            pBx = -pBx;
+            pBz = -pBz;
+            pgxy = -pgxy;
+            pgyz = -pgyz;
+            break;
+          case 3: //+-- (pi-theta, 2*pi-phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = -sin(pm * (j + 0.5) * dphi);
+            py = -py;
+            pz = -pz;
+            pEz = -pEz;
+            pBz = -pBz;
+            pgxz = -pgxz;
+            pEy = -pEy;
+            pBy = -pBy;
+            pgxy = -pgxy;
+            break;
+          case 4: //-++ (theta, pi-phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
+            px = -px;
+            pEx = -pEx;
+            pBy = -pBy;
+            pBz = -pBz;
+            pgxy = -pgxy;
+            pgxz = -pgxz;
+            break;
+          case 5: //-+- (pi-theta, pi-phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
+            pz = -pz;
+            px = -px;
+            pEz = -pEz;
+            pBz = -pBz;
+            pgyz = -pgyz;
+            pEx = -pEx;
+            pBx = -pBx;
+            pgxy = -pgxy;
+            break;
+          case 6: //--+ (theta, pi+phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
+            px = -px;
+            py = -py;
+            pEx = -pEx;
+            pBx = -pBx;
+            pgxz = -pgxz;
+            pEy = -pEy;
+            pBy = -pBy;
+            pgyz = -pgyz;
+            break;
+          case 7: //--- (pi-theta, pi+phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
+            px = -px;
+            py = -py;
+            pz = -pz;
+            pEx = -pEx;
+            pEy = -pEy;
+            pEz = -pEz;
+          }
+
+          funcs(px, py, pz, pchi, pgxx, pgxy, pgxz, pgyy, pgyz, pgzz, pEx, pEy, pEz, pBx, pBy, pBz,
+                psi4RR, psi4II);
+          thetap = sqrt((2 * pl + 1.0) / 4.0 / PI) * misc::Wigner_d_function(pl, pm, spinw, costheta); // note the variation from -2 to 2
+
+          //	 find back the one
+          pchi = pchi + 1;
+#ifdef GaussInt
+          // wtcostheta is even function respect costheta
+          RP_out[countlm] = RP_out[countlm] + thetap / pchi / pchi * (psi4RR * cosmphi + psi4II * sinmphi) * wtcostheta[i];
+          IP_out[countlm] = IP_out[countlm] + thetap / pchi / pchi * (psi4II * cosmphi - psi4RR * sinmphi) * wtcostheta[i];
+#else
+          RP_out[countlm] = RP_out[countlm] + thetap / pchi / pchi * (psi4RR * cosmphi + psi4II * sinmphi);
+          IP_out[countlm] = IP_out[countlm] + thetap / pchi / pchi * (psi4II * cosmphi - psi4RR * sinmphi);
+#endif
+        }
+        countlm++; // no sanity check for countlm and NN which should be noted in the input parameters
+      }
+  }
+#endif
+
+  for (int ii = 0; ii < NN; ii++)
+  {
+#ifdef GaussInt
+    RP_out[ii] = RP_out[ii] * rex * dphi;
+    IP_out[ii] = IP_out[ii] * rex * dphi;
+#else
+    RP_out[ii] = RP_out[ii] * rex * dphi * dcostheta;
+    IP_out[ii] = IP_out[ii] * rex * dphi * dcostheta;
+#endif
+  }
+  //|------+  Communicate and sum the results from each processor.
+
+  {
+    double *RPIP_out = new double[2 * NN];
+    double *RPIP = new double[2 * NN];
+    memcpy(RPIP_out, RP_out, NN * sizeof(double));
+    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
+    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    memcpy(RP, RPIP, NN * sizeof(double));
+    memcpy(IP, RPIP + NN, NN * sizeof(double));
+    delete[] RPIP_out;
+    delete[] RPIP;
+  }
+
+  //|------= Free memory.
+
+  delete[] pox[0];
+  delete[] pox[1];
+  delete[] pox[2];
+  delete[] shellf;
+  delete[] RP_out;
+  delete[] IP_out;
+  DG_List->clearList();
+}
+//|----------------------------------------------------------------
+//  surf_Wave for null shell patch2: mode sums of (Rpsi4, Ipsi4) for l = spinw..maxl, m = -l..l, written to RP/IP
+//|----------------------------------------------------------------
+// rex is x instead of r; lev and Monitor are not referenced in this overload
+void surface_integral::surf_Wave(double rex, int lev, NullShellPatch2 *GH, var *Rpsi4, var *Ipsi4,
+                                 int spinw, int maxl, int NN, double *RP, double *IP,
+                                 monitor *Monitor) // NN is the length of RP and IP (outputs, summed over all ranks)
+// spinw 0 for scalar; 1 for electromagnetic wave; 2 for gravitational wave
+// we always assume spinw >= 0
+{
+  const int InList = 2; // values per surface point in shellf: slot 0 is read as Rpsi4, slot 1 as Ipsi4
+
+  MyList<var> *DG_List = new MyList<var>(Rpsi4);
+  DG_List->insert(Ipsi4);
+
+  int n;
+  // since we used x instead of r, these global coordinates are fake
+  double *pox[3];
+  for (int i = 0; i < 3; i++)
+    pox[i] = new double[n_tot];
+  for (n = 0; n < n_tot; n++)
+  {
+    pox[0][n] = rex * nx_g[n];
+    pox[1][n] = rex * ny_g[n];
+    pox[2][n] = rex * nz_g[n];
+  }
+
+  double *shellf;
+  shellf = new double[n_tot * InList];
+
+  GH->Interp_Points_2D(DG_List, n_tot, pox, shellf, Symmetry); // presumably fills shellf with InList values per point -- TODO confirm
+
+  int mp, Lp, Nmin, Nmax;
+
+  mp = n_tot / cpusize;      // base number of points handled by each rank
+  Lp = n_tot - cpusize * mp; // remainder: ranks with myrank < Lp take one extra point
+
+  if (Lp > myrank)
+  {
+    Nmin = myrank * mp + myrank;
+    Nmax = Nmin + mp; // inclusive upper bound: mp + 1 points
+  }
+  else
+  {
+    Nmin = myrank * mp + Lp;
+    Nmax = Nmin + mp - 1; // inclusive upper bound: mp points
+  }
+
+  //|~~~~~> Accumulate this rank's partial (l,m) sums over its own points Nmin..Nmax.
+
+  double *RP_out, *IP_out;
+  RP_out = new double[NN];
+  IP_out = new double[NN];
+
+  for (int ii = 0; ii < NN; ii++)
+  {
+    RP_out[ii] = 0;
+    IP_out[ii] = 0;
+  }
+  // theta part
+  double costheta, thetap;
+  double cosmphi, sinmphi;
+
+  int i, j;
+  int lpsy = 0; // number of symmetry images summed per point; stays 0 (nothing accumulated) unless Symmetry is 0, 1 or 2
+  if (Symmetry == 0)
+    lpsy = 1;
+  else if (Symmetry == 1)
+    lpsy = 2;
+  else if (Symmetry == 2)
+    lpsy = 8;
+
+  double psi4RR, psi4II;
+  for (n = Nmin; n <= Nmax; n++)
+  {
+    //       need round off always
+    i = int(n / N_phi); // int(1.723) = 1, int(-1.732) = -1
+    j = n - i * N_phi; // i indexes arcostheta/wtcostheta, j is the phi index used in (j + 0.5) * dphi
+
+    int countlm = 0; // running (l,m) slot in RP_out/IP_out
+    for (int pl = spinw; pl < maxl + 1; pl++)
+      for (int pm = -pl; pm < pl + 1; pm++)
+      {
+        for (int lp = 0; lp < lpsy; lp++)
+        {
+          switch (lp) // each case sets the image's angles and the sign applied to the imaginary part
+          {
+          case 0: //+++ (theta, phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = sin(pm * (j + 0.5) * dphi);
+            psi4RR = shellf[InList * n];
+            psi4II = shellf[InList * n + 1];
+            break;
+          case 1: //++- (pi-theta, phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = sin(pm * (j + 0.5) * dphi);
+            psi4RR = shellf[InList * n];
+            psi4II = -shellf[InList * n + 1];
+            break;
+          case 2: //+-+ (theta, 2*pi-phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = -sin(pm * (j + 0.5) * dphi);
+            psi4RR = shellf[InList * n];
+            psi4II = -shellf[InList * n + 1];
+            break;
+          case 3: //+-- (pi-theta, 2*pi-phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = -sin(pm * (j + 0.5) * dphi);
+            psi4RR = shellf[InList * n];
+            psi4II = shellf[InList * n + 1];
+            break;
+          case 4: //-++ (theta, pi-phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
+            psi4RR = shellf[InList * n];
+            psi4II = -shellf[InList * n + 1];
+            break;
+          case 5: //-+- (pi-theta, pi-phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
+            psi4RR = shellf[InList * n];
+            psi4II = shellf[InList * n + 1];
+            break;
+          case 6: //--+ (theta, pi+phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
+            psi4RR = shellf[InList * n];
+            psi4II = shellf[InList * n + 1];
+            break;
+          case 7: //--- (pi-theta, pi+phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
+            psi4RR = shellf[InList * n];
+            psi4II = -shellf[InList * n + 1];
+          }
+
+          thetap = sqrt((2 * pl + 1.0) / 4.0 / PI) * misc::Wigner_d_function(pl, pm, spinw, costheta); // note the variation from -2 to 2
+                                                                                                       // based on Eq.(41) of PRD 77, 024027 (2008)
+#ifdef GaussInt
+          // wtcostheta is an even function of costheta, so the same weight serves the mirrored images
+          RP_out[countlm] = RP_out[countlm] + thetap * (psi4RR * cosmphi + psi4II * sinmphi) * wtcostheta[i];
+          IP_out[countlm] = IP_out[countlm] + thetap * (psi4II * cosmphi - psi4RR * sinmphi) * wtcostheta[i];
+#else
+          RP_out[countlm] = RP_out[countlm] + thetap * (psi4RR * cosmphi + psi4II * sinmphi); // + is because \bar of \bar{Y^s_lm} in Eq.(40)
+                                                                                              // of PRD 77, 024027 (2008)
+          IP_out[countlm] = IP_out[countlm] + thetap * (psi4II * cosmphi - psi4RR * sinmphi);
+#endif
+        }
+        countlm++; // NOTE: never checked against NN; the caller must pass NN >= number of (l,m) pairs or RP_out/IP_out are overrun
+      }
+  }
+
+  for (int ii = 0; ii < NN; ii++)
+  {
+// do not need multiply with rex for null shell
+#ifdef GaussInt
+    RP_out[ii] = RP_out[ii] * dphi;
+    IP_out[ii] = IP_out[ii] * dphi;
+#else
+    RP_out[ii] = RP_out[ii] * dphi * dcostheta;
+    IP_out[ii] = IP_out[ii] * dphi * dcostheta;
+#endif
+  }
+  //|------+  Communicate and sum the results from each processor.
+
+  {
+    double *RPIP_out = new double[2 * NN]; // packed send buffer: real parts first, then imaginary parts
+    double *RPIP = new double[2 * NN];
+    memcpy(RPIP_out, RP_out, NN * sizeof(double));
+    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
+    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); // one reduction for both parts; every rank gets the sums
+    memcpy(RP, RPIP, NN * sizeof(double));
+    memcpy(IP, RPIP + NN, NN * sizeof(double));
+    delete[] RPIP_out;
+    delete[] RPIP;
+  }
+
+  //|------= Free memory.
+
+  delete[] pox[0];
+  delete[] pox[1];
+  delete[] pox[2];
+  delete[] shellf;
+  delete[] RP_out;
+  delete[] IP_out;
+  DG_List->clearList(); // NOTE(review): DG_List is never passed to delete here; confirm clearList() also releases the head node
+}
+//|----------------------------------------------------------------
+//  for null shell patch
+//|----------------------------------------------------------------
+// rex is x instead of r
+void surface_integral::surf_Wave(double rex, int lev, NullShellPatch *GH, var *Rpsi4, var *Ipsi4,
+                                 int spinw, int maxl, int NN, double *RP, double *IP,
+                                 monitor *Monitor) // NN is the length of RP and IP (outputs, summed over all ranks)
+// Accumulates (Rpsi4, Ipsi4) mode sums for l = spinw..maxl, m = -l..l; lev and Monitor are not referenced here.
+// spinw 0 for scalar; 1 for electromagnetic wave; 2 for gravitational wave; we always assume spinw >= 0
+{
+  const int InList = 2; // values per surface point in shellf: slot 0 is read as Rpsi4, slot 1 as Ipsi4
+
+  MyList<var> *DG_List = new MyList<var>(Rpsi4);
+  DG_List->insert(Ipsi4);
+
+  int n;
+  // since we used x instead of r, these global coordinates are fake
+  double *pox[3];
+  for (int i = 0; i < 3; i++)
+    pox[i] = new double[n_tot];
+  for (n = 0; n < n_tot; n++)
+  {
+    pox[0][n] = rex * nx_g[n];
+    pox[1][n] = rex * ny_g[n];
+    pox[2][n] = rex * nz_g[n];
+  }
+
+  double *shellf;
+  shellf = new double[n_tot * InList];
+
+  GH->Interp_Points_2D(DG_List, n_tot, pox, shellf, Symmetry); // presumably fills shellf with InList values per point -- TODO confirm
+
+  int mp, Lp, Nmin, Nmax;
+
+  mp = n_tot / cpusize;      // base number of points handled by each rank
+  Lp = n_tot - cpusize * mp; // remainder: ranks with myrank < Lp take one extra point
+
+  if (Lp > myrank)
+  {
+    Nmin = myrank * mp + myrank;
+    Nmax = Nmin + mp; // inclusive upper bound: mp + 1 points
+  }
+  else
+  {
+    Nmin = myrank * mp + Lp;
+    Nmax = Nmin + mp - 1; // inclusive upper bound: mp points
+  }
+
+  //|~~~~~> Accumulate this rank's partial (l,m) sums over its own points Nmin..Nmax.
+
+  double *RP_out, *IP_out;
+  RP_out = new double[NN];
+  IP_out = new double[NN];
+
+  for (int ii = 0; ii < NN; ii++)
+  {
+    RP_out[ii] = 0;
+    IP_out[ii] = 0;
+  }
+  // theta part
+  double costheta, thetap;
+  double cosmphi, sinmphi;
+
+  int i, j;
+  int lpsy = 0; // number of symmetry images summed per point; stays 0 (nothing accumulated) unless Symmetry is 0, 1 or 2
+  if (Symmetry == 0)
+    lpsy = 1;
+  else if (Symmetry == 1)
+    lpsy = 2;
+  else if (Symmetry == 2)
+    lpsy = 8;
+
+  double psi4RR, psi4II;
+  for (n = Nmin; n <= Nmax; n++)
+  {
+    //       need round off always
+    i = int(n / N_phi); // int(1.723) = 1, int(-1.732) = -1
+    j = n - i * N_phi; // i indexes arcostheta/wtcostheta, j is the phi index used in (j + 0.5) * dphi
+
+    int countlm = 0; // running (l,m) slot in RP_out/IP_out
+    for (int pl = spinw; pl < maxl + 1; pl++)
+      for (int pm = -pl; pm < pl + 1; pm++)
+      {
+        for (int lp = 0; lp < lpsy; lp++)
+        {
+          switch (lp) // each case sets the image's angles and the sign applied to the imaginary part
+          {
+          case 0: //+++ (theta, phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = sin(pm * (j + 0.5) * dphi);
+            psi4RR = shellf[InList * n];
+            psi4II = shellf[InList * n + 1];
+            break;
+          case 1: //++- (pi-theta, phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = sin(pm * (j + 0.5) * dphi);
+            psi4RR = shellf[InList * n];
+            psi4II = -shellf[InList * n + 1];
+            break;
+          case 2: //+-+ (theta, 2*pi-phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = -sin(pm * (j + 0.5) * dphi);
+            psi4RR = shellf[InList * n];
+            psi4II = -shellf[InList * n + 1];
+            break;
+          case 3: //+-- (pi-theta, 2*pi-phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = -sin(pm * (j + 0.5) * dphi);
+            psi4RR = shellf[InList * n];
+            psi4II = shellf[InList * n + 1];
+            break;
+          case 4: //-++ (theta, pi-phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
+            psi4RR = shellf[InList * n];
+            psi4II = -shellf[InList * n + 1];
+            break;
+          case 5: //-+- (pi-theta, pi-phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
+            psi4RR = shellf[InList * n];
+            psi4II = shellf[InList * n + 1];
+            break;
+          case 6: //--+ (theta, pi+phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
+            psi4RR = shellf[InList * n];
+            psi4II = shellf[InList * n + 1];
+            break;
+          case 7: //--- (pi-theta, pi+phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
+            psi4RR = shellf[InList * n];
+            psi4II = -shellf[InList * n + 1];
+          }
+
+          thetap = sqrt((2 * pl + 1.0) / 4.0 / PI) * misc::Wigner_d_function(pl, pm, spinw, costheta); // note the variation from -2 to 2
+                                                                                                       // based on Eq.(41) of PRD 77, 024027 (2008)
+#ifdef GaussInt
+          // wtcostheta is an even function of costheta, so the same weight serves the mirrored images
+          RP_out[countlm] = RP_out[countlm] + thetap * (psi4RR * cosmphi + psi4II * sinmphi) * wtcostheta[i];
+          IP_out[countlm] = IP_out[countlm] + thetap * (psi4II * cosmphi - psi4RR * sinmphi) * wtcostheta[i];
+#else
+          RP_out[countlm] = RP_out[countlm] + thetap * (psi4RR * cosmphi + psi4II * sinmphi); // + is because \bar of \bar{Y^s_lm} in Eq.(40)
+                                                                                              // of PRD 77, 024027 (2008)
+          IP_out[countlm] = IP_out[countlm] + thetap * (psi4II * cosmphi - psi4RR * sinmphi);
+#endif
+        }
+        countlm++; // NOTE: never checked against NN; the caller must pass NN >= number of (l,m) pairs or RP_out/IP_out are overrun
+      }
+  }
+
+  for (int ii = 0; ii < NN; ii++)
+  {
+// do not need multiply with rex for null shell
+#ifdef GaussInt
+    RP_out[ii] = RP_out[ii] * dphi;
+    IP_out[ii] = IP_out[ii] * dphi;
+#else
+    RP_out[ii] = RP_out[ii] * dphi * dcostheta;
+    IP_out[ii] = IP_out[ii] * dphi * dcostheta;
+#endif
+  }
+  //|------+  Communicate and sum the results from each processor.
+
+  {
+    double *RPIP_out = new double[2 * NN]; // packed send buffer: real parts first, then imaginary parts
+    double *RPIP = new double[2 * NN];
+    memcpy(RPIP_out, RP_out, NN * sizeof(double));
+    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
+    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); // one reduction for both parts; every rank gets the sums
+    memcpy(RP, RPIP, NN * sizeof(double));
+    memcpy(IP, RPIP + NN, NN * sizeof(double));
+    delete[] RPIP_out;
+    delete[] RPIP;
+  }
+
+  //|------= Free memory.
+
+  delete[] pox[0];
+  delete[] pox[1];
+  delete[] pox[2];
+  delete[] shellf;
+  delete[] RP_out;
+  delete[] IP_out;
+  DG_List->clearList(); // NOTE(review): DG_List is never passed to delete here; confirm clearList() also releases the head node
+}
+//|----------------------------------------------------
+//| surf_MassPAng: surface sums on the sphere of radius rex at level lev, reduced over MPI_COMM_WORLD
+//| ADM mass, linear momentum and angular momentum
+//| Output: Rout[0] = mass, Rout[1..3] = (px, py, pz), Rout[4..6] = (sx, sy, sz)
+//|----------------------------------------------------
+void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var *trK,
+                                     var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
+                                     var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
+                                     var *Gmx, var *Gmy, var *Gmz,
+                                     var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs, // temporary memory for mass^i
+                                     double *Rout, monitor *Monitor)
+{
+  if (myrank == 0 && GH->grids[lev] != 1)
+    if (Monitor && Monitor->outfile)
+      Monitor->outfile << "WARNING: surface integral on multipatches" << endl;
+    else // binds to the inner if: no usable Monitor output, so the warning goes to cout
+      cout << "WARNING: surface integral on multipatches" << endl;
+
+  double mass, px, py, pz, sx, sy, sz;
+
+  MyList<Patch> *Pp = GH->PatL[lev]; // walk every patch of this level and every block inside it
+  while (Pp)
+  {
+    MyList<Block> *BP = Pp->data->blb;
+    while (BP)
+    {
+      Block *cg = BP->data;
+      if (myrank == cg->rank) // only blocks whose rank matches this process are evaluated here
+      {
+        f_admmass_bssn(cg->shape, cg->X[0], cg->X[1], cg->X[2],
+                       cg->fgfs[chi->sgfn], cg->fgfs[trK->sgfn],
+                       cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
+                       cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn],
+                       cg->fgfs[Gmx->sgfn], cg->fgfs[Gmy->sgfn], cg->fgfs[Gmz->sgfn],
+                       cg->fgfs[Sfx_rhs->sgfn], cg->fgfs[Sfy_rhs->sgfn], cg->fgfs[Sfz_rhs->sgfn],
+                       Symmetry); // presumably writes the mass integrand vector into Sf{x,y,z}_rhs -- TODO confirm
+      }
+      if (BP == Pp->data->ble) // stop after the block ble of this patch
+        break;
+      BP = BP->next;
+    }
+    Pp = Pp->next;
+  }
+
+  const int InList = 17; // values per point in shellf, read below as: Sf{x,y,z}_rhs (0-2), chi (3), trK (4), g (5-10), A (11-16)
+
+  MyList<var> *DG_List = new MyList<var>(Sfx_rhs);
+  DG_List->insert(Sfy_rhs);
+  DG_List->insert(Sfz_rhs);
+  DG_List->insert(chi);
+  DG_List->insert(trK);
+  DG_List->insert(gxx);
+  DG_List->insert(gxy);
+  DG_List->insert(gxz);
+  DG_List->insert(gyy);
+  DG_List->insert(gyz);
+  DG_List->insert(gzz);
+  DG_List->insert(Axx);
+  DG_List->insert(Axy);
+  DG_List->insert(Axz);
+  DG_List->insert(Ayy);
+  DG_List->insert(Ayz);
+  DG_List->insert(Azz);
+
+  int n;
+  double *pox[3];
+  for (int i = 0; i < 3; i++)
+    pox[i] = new double[n_tot];
+  for (n = 0; n < n_tot; n++)
+  {
+    pox[0][n] = rex * nx_g[n];
+    pox[1][n] = rex * ny_g[n];
+    pox[2][n] = rex * nz_g[n];
+  }
+
+  int mp, Lp, Nmin, Nmax;
+  mp = n_tot / cpusize;      // base number of points handled by each rank
+  Lp = n_tot - cpusize * mp; // remainder: ranks with myrank < Lp take one extra point
+  if (Lp > myrank)
+  {
+    Nmin = myrank * mp + myrank;
+    Nmax = Nmin + mp; // inclusive upper bound: mp + 1 points
+  }
+  else
+  {
+    Nmin = myrank * mp + Lp;
+    Nmax = Nmin + mp - 1; // inclusive upper bound: mp points
+  }
+
+  double *shellf;
+  shellf = new double[n_tot * InList];
+
+  // we have assumed there is only one box on this level,
+  // so we do not need loop boxes
+  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax); // NOTE(review): presumably fills only points Nmin..Nmax (the only ones read below) -- confirm
+
+  double Mass_out = 0;
+  double ang_outx, ang_outy, ang_outz;
+  double p_outx, p_outy, p_outz;
+  ang_outx = ang_outy = ang_outz = 0.0;
+  p_outx = p_outy = p_outz = 0.0;
+  const double f1o8 = 0.125;
+
+  double Chi, Psi;
+  double Gxx, Gxy, Gxz, Gyy, Gyz, Gzz;
+  double gupxx, gupxy, gupxz, gupyy, gupyz, gupzz;
+  double TRK, axx, axy, axz, ayy, ayz, azz;
+  double aupxx, aupxy, aupxz, aupyx, aupyy, aupyz, aupzx, aupzy, aupzz;
+  int i;
+  for (n = Nmin; n <= Nmax; n++)
+  {
+    //       need round off always
+    i = int(n / N_phi); // int(1.723) = 1, int(-1.732) = -1
+
+    Chi = shellf[InList * n + 3]; // chi in fact
+    TRK = shellf[InList * n + 4];
+    Gxx = shellf[InList * n + 5] + 1.0; // diagonal metric components are read with 1.0 added
+    Gxy = shellf[InList * n + 6];
+    Gxz = shellf[InList * n + 7];
+    Gyy = shellf[InList * n + 8] + 1.0;
+    Gyz = shellf[InList * n + 9];
+    Gzz = shellf[InList * n + 10] + 1.0;
+    axx = shellf[InList * n + 11];
+    axy = shellf[InList * n + 12];
+    axz = shellf[InList * n + 13];
+    ayy = shellf[InList * n + 14];
+    ayz = shellf[InList * n + 15];
+    azz = shellf[InList * n + 16];
+
+    Chi = 1.0 / (1.0 + Chi); // exp(4*phi)
+    Psi = Chi * sqrt(Chi);   // Psi^6
+
+// Chi^2 corresponds to metric determinant
+// but this factor has been considered in f_admmass_bssn
+#ifdef GaussInt
+    // wtcostheta is an even function of costheta
+    Mass_out = Mass_out + (shellf[InList * n] * nx_g[n] + shellf[InList * n + 1] * ny_g[n] + shellf[InList * n + 2] * nz_g[n]) * wtcostheta[i];
+#else
+    Mass_out = Mass_out + (shellf[InList * n] * nx_g[n] + shellf[InList * n + 1] * ny_g[n] + shellf[InList * n + 2] * nz_g[n]);
+#endif
+
+    gupzz = Gxx * Gyy * Gzz + Gxy * Gyz * Gxz + Gxz * Gxy * Gyz -
+            Gxz * Gyy * Gxz - Gxy * Gxy * Gzz - Gxx * Gyz * Gyz; // gupzz temporarily holds det(G), the divisor for the inverse below
+    gupxx = (Gyy * Gzz - Gyz * Gyz) / gupzz;
+    gupxy = -(Gxy * Gzz - Gyz * Gxz) / gupzz;
+    gupxz = (Gxy * Gyz - Gyy * Gxz) / gupzz;
+    gupyy = (Gxx * Gzz - Gxz * Gxz) / gupzz;
+    gupyz = -(Gxx * Gyz - Gxy * Gxz) / gupzz;
+    gupzz = (Gxx * Gyy - Gxy * Gxy) / gupzz; // must be assigned last: it overwrites the determinant used above
+
+    aupxx = gupxx * axx + gupxy * axy + gupxz * axz; // aup<i><j> = sum_k gup<i><k> * a<k><j> (first index raised)
+    aupxy = gupxx * axy + gupxy * ayy + gupxz * ayz;
+    aupxz = gupxx * axz + gupxy * ayz + gupxz * azz;
+    aupyx = gupxy * axx + gupyy * axy + gupyz * axz;
+    aupyy = gupxy * axy + gupyy * ayy + gupyz * ayz;
+    aupyz = gupxy * axz + gupyy * ayz + gupyz * azz;
+    aupzx = gupxz * axx + gupyz * axy + gupzz * axz;
+    aupzy = gupxz * axy + gupyz * ayy + gupzz * ayz;
+    aupzz = gupxz * axz + gupyz * ayz + gupzz * azz;
+    if (Symmetry == 0)
+    {
+#ifdef GaussInt
+      // wtcostheta is an even function of costheta
+      //  1/8\pi \int \psi^6 (y A^m_z - zA^m_y) dS_m
+      ang_outx = ang_outx + f1o8 * Psi * (nx_g[n] * (pox[1][n] * aupxz - pox[2][n] * aupxy) + ny_g[n] * (pox[1][n] * aupyz - pox[2][n] * aupyy) + nz_g[n] * (pox[1][n] * aupzz - pox[2][n] * aupzy)) * wtcostheta[i];
+      //  1/8\pi \int \psi^6 (z A^m_x - xA^m_z) dS_m
+      ang_outy = ang_outy + f1o8 * Psi * (nx_g[n] * (pox[2][n] * aupxx - pox[0][n] * aupxz) + ny_g[n] * (pox[2][n] * aupyx - pox[0][n] * aupyz) + nz_g[n] * (pox[2][n] * aupzx - pox[0][n] * aupzz)) * wtcostheta[i];
+      // 1/8\pi \int \psi^6 (x A^m_y - yA^m_x) dS_m
+      ang_outz = ang_outz + f1o8 * Psi * (nx_g[n] * (pox[0][n] * aupxy - pox[1][n] * aupxx) + ny_g[n] * (pox[0][n] * aupyy - pox[1][n] * aupyx) + nz_g[n] * (pox[0][n] * aupzy - pox[1][n] * aupzx)) * wtcostheta[i];
+#else
+      //  1/8\pi \int \psi^6 (y A^m_z - zA^m_y) dS_m
+      ang_outx = ang_outx + f1o8 * Psi * (nx_g[n] * (pox[1][n] * aupxz - pox[2][n] * aupxy) + ny_g[n] * (pox[1][n] * aupyz - pox[2][n] * aupyy) + nz_g[n] * (pox[1][n] * aupzz - pox[2][n] * aupzy));
+      //  1/8\pi \int \psi^6 (z A^m_x - xA^m_z) dS_m
+      ang_outy = ang_outy + f1o8 * Psi * (nx_g[n] * (pox[2][n] * aupxx - pox[0][n] * aupxz) + ny_g[n] * (pox[2][n] * aupyx - pox[0][n] * aupyz) + nz_g[n] * (pox[2][n] * aupzx - pox[0][n] * aupzz));
+      // 1/8\pi \int \psi^6 (x A^m_y - yA^m_x) dS_m
+      ang_outz = ang_outz + f1o8 * Psi * (nx_g[n] * (pox[0][n] * aupxy - pox[1][n] * aupxx) + ny_g[n] * (pox[0][n] * aupyy - pox[1][n] * aupyx) + nz_g[n] * (pox[0][n] * aupzy - pox[1][n] * aupzx));
+#endif
+    }
+    else if (Symmetry == 1) // only the z component is accumulated here; for any other Symmetry value all three stay 0
+    {
+#ifdef GaussInt
+      ang_outz = ang_outz + f1o8 * Psi * (nx_g[n] * (pox[0][n] * aupxy - pox[1][n] * aupxx) + ny_g[n] * (pox[0][n] * aupyy - pox[1][n] * aupyx) + nz_g[n] * (pox[0][n] * aupzy - pox[1][n] * aupzx)) * wtcostheta[i];
+#else
+      ang_outz = ang_outz + f1o8 * Psi * (nx_g[n] * (pox[0][n] * aupxy - pox[1][n] * aupxx) + ny_g[n] * (pox[0][n] * aupyy - pox[1][n] * aupyx) + nz_g[n] * (pox[0][n] * aupzy - pox[1][n] * aupzx));
+#endif
+    }
+
+    axx = Chi * (axx + Gxx * TRK / 3.0); // from here a<ij> is reused for Chi * (a_ij + G_ij * trK / 3), presumably K_ij
+    axy = Chi * (axy + Gxy * TRK / 3.0);
+    axz = Chi * (axz + Gxz * TRK / 3.0);
+    ayy = Chi * (ayy + Gyy * TRK / 3.0);
+    ayz = Chi * (ayz + Gyz * TRK / 3.0);
+    azz = Chi * (azz + Gzz * TRK / 3.0);
+
+    axx = axx - TRK; // subtract trK on the diagonal only (the \delta_mi trK term below)
+    ayy = ayy - TRK;
+    azz = azz - TRK;
+
+    // 1/8\pi \int \psi^6 (K_mi - \delta_mi trK) dS^m: lower index linear momentum
+    if (Symmetry == 0)
+    {
+#ifdef GaussInt
+      p_outx = p_outx + f1o8 * Psi * (nx_g[n] * axx + ny_g[n] * axy + nz_g[n] * axz) * wtcostheta[i];
+      p_outy = p_outy + f1o8 * Psi * (nx_g[n] * axy + ny_g[n] * ayy + nz_g[n] * ayz) * wtcostheta[i];
+      p_outz = p_outz + f1o8 * Psi * (nx_g[n] * axz + ny_g[n] * ayz + nz_g[n] * azz) * wtcostheta[i];
+#else
+      p_outx = p_outx + f1o8 * Psi * (nx_g[n] * axx + ny_g[n] * axy + nz_g[n] * axz);
+      p_outy = p_outy + f1o8 * Psi * (nx_g[n] * axy + ny_g[n] * ayy + nz_g[n] * ayz);
+      p_outz = p_outz + f1o8 * Psi * (nx_g[n] * axz + ny_g[n] * ayz + nz_g[n] * azz);
+#endif
+    }
+    else if (Symmetry == 1) // only x and y components are accumulated here; for any other Symmetry value all three stay 0
+    {
+#ifdef GaussInt
+      p_outx = p_outx + f1o8 * Psi * (nx_g[n] * axx + ny_g[n] * axy + nz_g[n] * axz) * wtcostheta[i];
+      p_outy = p_outy + f1o8 * Psi * (nx_g[n] * axy + ny_g[n] * ayy + nz_g[n] * ayz) * wtcostheta[i];
+#else
+      p_outx = p_outx + f1o8 * Psi * (nx_g[n] * axx + ny_g[n] * axy + nz_g[n] * axz);
+      p_outy = p_outy + f1o8 * Psi * (nx_g[n] * axy + ny_g[n] * ayy + nz_g[n] * ayz);
+#endif
+    }
+  }
+
+  {
+    double scalar_out[7] = {Mass_out, ang_outx, ang_outy, ang_outz, p_outx, p_outy, p_outz}; // packed as mass, angular (3), linear (3)
+    double scalar_in[7];
+    MPI_Allreduce(scalar_out, scalar_in, 7, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); // one reduction for all seven partial sums
+    mass = scalar_in[0]; sx = scalar_in[1]; sy = scalar_in[2]; sz = scalar_in[3];
+    px = scalar_in[4]; py = scalar_in[5]; pz = scalar_in[6];
+  }
+
+#ifdef GaussInt
+  mass = mass * rex * rex * dphi * factor; // factor is defined outside this function
+
+  sx = sx * rex * rex * dphi * (1.0 / PI) * factor;
+  sy = sy * rex * rex * dphi * (1.0 / PI) * factor;
+  sz = sz * rex * rex * dphi * (1.0 / PI) * factor;
+
+  px = px * rex * rex * dphi * (1.0 / PI) * factor;
+  py = py * rex * rex * dphi * (1.0 / PI) * factor;
+  pz = pz * rex * rex * dphi * (1.0 / PI) * factor;
+#else
+  mass = mass * rex * rex * dphi * dcostheta * factor; // factor is defined outside this function
+
+  sx = sx * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
+  sy = sy * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
+  sz = sz * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
+
+  px = px * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
+  py = py * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
+  pz = pz * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
+#endif
+
+  Rout[0] = mass; // output order is mass, linear momentum, angular momentum (differs from the scalar_out packing order)
+  Rout[1] = px;
+  Rout[2] = py;
+  Rout[3] = pz;
+  Rout[4] = sx;
+  Rout[5] = sy;
+  Rout[6] = sz;
+
+  delete[] pox[0];
+  delete[] pox[1];
+  delete[] pox[2];
+  delete[] shellf;
+  DG_List->clearList(); // NOTE(review): DG_List is never passed to delete here; confirm clearList() also releases the head node
+}
+void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var *trK,
+                                     var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
+                                     var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
+                                     var *Gmx, var *Gmy, var *Gmz,
+                                     var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs, // temparay memory for mass^i
+                                     double *Rout, monitor *Monitor, MPI_Comm Comm_here)
+{
+  int lmyrank;
+  MPI_Comm_rank(Comm_here, &lmyrank);
+  if (lmyrank == 0 && GH->grids[lev] != 1)
+    if (Monitor && Monitor->outfile)
+      Monitor->outfile << "WARNING: surface integral on multipatches" << endl;
+    else
+      cout << "WARNING: surface integral on multipatches" << endl;
+
+  double mass, px, py, pz, sx, sy, sz;
+
+  MyList<Patch> *Pp = GH->PatL[lev];
+  while (Pp)
+  {
+    MyList<Block> *BP = Pp->data->blb;
+    while (BP)
+    {
+      Block *cg = BP->data;
+      if (myrank == cg->rank)
+      {
+        f_admmass_bssn(cg->shape, cg->X[0], cg->X[1], cg->X[2],
+                       cg->fgfs[chi->sgfn], cg->fgfs[trK->sgfn],
+                       cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
+                       cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn],
+                       cg->fgfs[Gmx->sgfn], cg->fgfs[Gmy->sgfn], cg->fgfs[Gmz->sgfn],
+                       cg->fgfs[Sfx_rhs->sgfn], cg->fgfs[Sfy_rhs->sgfn], cg->fgfs[Sfz_rhs->sgfn],
+                       Symmetry);
+      }
+      if (BP == Pp->data->ble)
+        break;
+      BP = BP->next;
+    }
+    Pp = Pp->next;
+  }
+
+  const int InList = 17;
+
+  MyList<var> *DG_List = new MyList<var>(Sfx_rhs);
+  DG_List->insert(Sfy_rhs);
+  DG_List->insert(Sfz_rhs);
+  DG_List->insert(chi);
+  DG_List->insert(trK);
+  DG_List->insert(gxx);
+  DG_List->insert(gxy);
+  DG_List->insert(gxz);
+  DG_List->insert(gyy);
+  DG_List->insert(gyz);
+  DG_List->insert(gzz);
+  DG_List->insert(Axx);
+  DG_List->insert(Axy);
+  DG_List->insert(Axz);
+  DG_List->insert(Ayy);
+  DG_List->insert(Ayz);
+  DG_List->insert(Azz);
+
+  int n;
+  double *pox[3];
+  for (int i = 0; i < 3; i++)
+    pox[i] = new double[n_tot];
+  for (n = 0; n < n_tot; n++)
+  {
+    pox[0][n] = rex * nx_g[n];
+    pox[1][n] = rex * ny_g[n];
+    pox[2][n] = rex * nz_g[n];
+  }
+
+  double *shellf;
+  shellf = new double[n_tot * InList];
+
+  // we have assumed there is only one box on this level,
+  // so we do not need loop boxes
+  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Comm_here);
+
+  double Mass_out = 0;
+  double ang_outx, ang_outy, ang_outz;
+  double p_outx, p_outy, p_outz;
+  ang_outx = ang_outy = ang_outz = 0.0;
+  p_outx = p_outy = p_outz = 0.0;
+  const double f1o8 = 0.125;
+
+  int mp, Lp, Nmin, Nmax;
+
+  int cpusize_here;
+  MPI_Comm_size(Comm_here, &cpusize_here);
+
+  mp = n_tot / cpusize_here;
+  Lp = n_tot - cpusize_here * mp;
+
+  if (Lp > lmyrank)
+  {
+    Nmin = lmyrank * mp + lmyrank;
+    Nmax = Nmin + mp;
+  }
+  else
+  {
+    Nmin = lmyrank * mp + Lp;
+    Nmax = Nmin + mp - 1;
+  }
+
+  double Chi, Psi;
+  double Gxx, Gxy, Gxz, Gyy, Gyz, Gzz;
+  double gupxx, gupxy, gupxz, gupyy, gupyz, gupzz;
+  double TRK, axx, axy, axz, ayy, ayz, azz;
+  double aupxx, aupxy, aupxz, aupyx, aupyy, aupyz, aupzx, aupzy, aupzz;
+  int i;
+  for (n = Nmin; n <= Nmax; n++)
+  {
+    //       need round off always
+    i = int(n / N_phi); // int(1.723) = 1, int(-1.732) = -1
+
+    Chi = shellf[InList * n + 3]; // chi in fact
+    TRK = shellf[InList * n + 4];
+    Gxx = shellf[InList * n + 5] + 1.0;
+    Gxy = shellf[InList * n + 6];
+    Gxz = shellf[InList * n + 7];
+    Gyy = shellf[InList * n + 8] + 1.0;
+    Gyz = shellf[InList * n + 9];
+    Gzz = shellf[InList * n + 10] + 1.0;
+    axx = shellf[InList * n + 11];
+    axy = shellf[InList * n + 12];
+    axz = shellf[InList * n + 13];
+    ayy = shellf[InList * n + 14];
+    ayz = shellf[InList * n + 15];
+    azz = shellf[InList * n + 16];
+
+    Chi = 1.0 / (1.0 + Chi); // exp(4*phi)
+    Psi = Chi * sqrt(Chi);   // Psi^6
+
+// Chi^2 corresponds to metric determinant
+// but this factor has been considered in f_admmass_bssn
+#ifdef GaussInt
+    // wtcostheta is even function respect costheta
+    Mass_out = Mass_out + (shellf[InList * n] * nx_g[n] + shellf[InList * n + 1] * ny_g[n] + shellf[InList * n + 2] * nz_g[n]) * wtcostheta[i];
+#else
+    Mass_out = Mass_out + (shellf[InList * n] * nx_g[n] + shellf[InList * n + 1] * ny_g[n] + shellf[InList * n + 2] * nz_g[n]);
+#endif
+
+    gupzz = Gxx * Gyy * Gzz + Gxy * Gyz * Gxz + Gxz * Gxy * Gyz -
+            Gxz * Gyy * Gxz - Gxy * Gxy * Gzz - Gxx * Gyz * Gyz;
+    gupxx = (Gyy * Gzz - Gyz * Gyz) / gupzz;
+    gupxy = -(Gxy * Gzz - Gyz * Gxz) / gupzz;
+    gupxz = (Gxy * Gyz - Gyy * Gxz) / gupzz;
+    gupyy = (Gxx * Gzz - Gxz * Gxz) / gupzz;
+    gupyz = -(Gxx * Gyz - Gxy * Gxz) / gupzz;
+    gupzz = (Gxx * Gyy - Gxy * Gxy) / gupzz;
+
+    aupxx = gupxx * axx + gupxy * axy + gupxz * axz;
+    aupxy = gupxx * axy + gupxy * ayy + gupxz * ayz;
+    aupxz = gupxx * axz + gupxy * ayz + gupxz * azz;
+    aupyx = gupxy * axx + gupyy * axy + gupyz * axz;
+    aupyy = gupxy * axy + gupyy * ayy + gupyz * ayz;
+    aupyz = gupxy * axz + gupyy * ayz + gupyz * azz;
+    aupzx = gupxz * axx + gupyz * axy + gupzz * axz;
+    aupzy = gupxz * axy + gupyz * ayy + gupzz * ayz;
+    aupzz = gupxz * axz + gupyz * ayz + gupzz * azz;
+    if (Symmetry == 0)
+    {
+#ifdef GaussInt
+      // wtcostheta is even function respect costheta
+      //  1/8\pi \int \psi^6 (y A^m_z - zA^m_y) dS_m
+      ang_outx = ang_outx + f1o8 * Psi * (nx_g[n] * (pox[1][n] * aupxz - pox[2][n] * aupxy) + ny_g[n] * (pox[1][n] * aupyz - pox[2][n] * aupyy) + nz_g[n] * (pox[1][n] * aupzz - pox[2][n] * aupzy)) * wtcostheta[i];
+      //  1/8\pi \int \psi^6 (z A^m_x - xA^m_z) dS_m
+      ang_outy = ang_outy + f1o8 * Psi * (nx_g[n] * (pox[2][n] * aupxx - pox[0][n] * aupxz) + ny_g[n] * (pox[2][n] * aupyx - pox[0][n] * aupyz) + nz_g[n] * (pox[2][n] * aupzx - pox[0][n] * aupzz)) * wtcostheta[i];
+      // 1/8\pi \int \psi^6 (x A^m_y - yA^m_x) dS_m
+      ang_outz = ang_outz + f1o8 * Psi * (nx_g[n] * (pox[0][n] * aupxy - pox[1][n] * aupxx) + ny_g[n] * (pox[0][n] * aupyy - pox[1][n] * aupyx) + nz_g[n] * (pox[0][n] * aupzy - pox[1][n] * aupzx)) * wtcostheta[i];
+#else
+      //  1/8\pi \int \psi^6 (y A^m_z - zA^m_y) dS_m
+      ang_outx = ang_outx + f1o8 * Psi * (nx_g[n] * (pox[1][n] * aupxz - pox[2][n] * aupxy) + ny_g[n] * (pox[1][n] * aupyz - pox[2][n] * aupyy) + nz_g[n] * (pox[1][n] * aupzz - pox[2][n] * aupzy));
+      //  1/8\pi \int \psi^6 (z A^m_x - xA^m_z) dS_m
+      ang_outy = ang_outy + f1o8 * Psi * (nx_g[n] * (pox[2][n] * aupxx - pox[0][n] * aupxz) + ny_g[n] * (pox[2][n] * aupyx - pox[0][n] * aupyz) + nz_g[n] * (pox[2][n] * aupzx - pox[0][n] * aupzz));
+      // 1/8\pi \int \psi^6 (x A^m_y - yA^m_x) dS_m
+      ang_outz = ang_outz + f1o8 * Psi * (nx_g[n] * (pox[0][n] * aupxy - pox[1][n] * aupxx) + ny_g[n] * (pox[0][n] * aupyy - pox[1][n] * aupyx) + nz_g[n] * (pox[0][n] * aupzy - pox[1][n] * aupzx));
+#endif
+    }
+    else if (Symmetry == 1)
+    {
+#ifdef GaussInt
+      ang_outz = ang_outz + f1o8 * Psi * (nx_g[n] * (pox[0][n] * aupxy - pox[1][n] * aupxx) + ny_g[n] * (pox[0][n] * aupyy - pox[1][n] * aupyx) + nz_g[n] * (pox[0][n] * aupzy - pox[1][n] * aupzx)) * wtcostheta[i];
+#else
+      ang_outz = ang_outz + f1o8 * Psi * (nx_g[n] * (pox[0][n] * aupxy - pox[1][n] * aupxx) + ny_g[n] * (pox[0][n] * aupyy - pox[1][n] * aupyx) + nz_g[n] * (pox[0][n] * aupzy - pox[1][n] * aupzx));
+#endif
+    }
+
+    axx = Chi * (axx + Gxx * TRK / 3.0);
+    axy = Chi * (axy + Gxy * TRK / 3.0);
+    axz = Chi * (axz + Gxz * TRK / 3.0);
+    ayy = Chi * (ayy + Gyy * TRK / 3.0);
+    ayz = Chi * (ayz + Gyz * TRK / 3.0);
+    azz = Chi * (azz + Gzz * TRK / 3.0);
+
+    axx = axx - TRK;
+    ayy = ayy - TRK;
+    azz = azz - TRK;
+
+    // 1/8\pi \int \psi^6 (K_mi - \delta_mi trK) dS^m: lower index linear momentum
+    if (Symmetry == 0)
+    {
+#ifdef GaussInt
+      p_outx = p_outx + f1o8 * Psi * (nx_g[n] * axx + ny_g[n] * axy + nz_g[n] * axz) * wtcostheta[i];
+      p_outy = p_outy + f1o8 * Psi * (nx_g[n] * axy + ny_g[n] * ayy + nz_g[n] * ayz) * wtcostheta[i];
+      p_outz = p_outz + f1o8 * Psi * (nx_g[n] * axz + ny_g[n] * ayz + nz_g[n] * azz) * wtcostheta[i];
+#else
+      p_outx = p_outx + f1o8 * Psi * (nx_g[n] * axx + ny_g[n] * axy + nz_g[n] * axz);
+      p_outy = p_outy + f1o8 * Psi * (nx_g[n] * axy + ny_g[n] * ayy + nz_g[n] * ayz);
+      p_outz = p_outz + f1o8 * Psi * (nx_g[n] * axz + ny_g[n] * ayz + nz_g[n] * azz);
+#endif
+    }
+    else if (Symmetry == 1)
+    {
+#ifdef GaussInt
+      p_outx = p_outx + f1o8 * Psi * (nx_g[n] * axx + ny_g[n] * axy + nz_g[n] * axz) * wtcostheta[i];
+      p_outy = p_outy + f1o8 * Psi * (nx_g[n] * axy + ny_g[n] * ayy + nz_g[n] * ayz) * wtcostheta[i];
+#else
+      p_outx = p_outx + f1o8 * Psi * (nx_g[n] * axx + ny_g[n] * axy + nz_g[n] * axz);
+      p_outy = p_outy + f1o8 * Psi * (nx_g[n] * axy + ny_g[n] * ayy + nz_g[n] * ayz);
+#endif
+    }
+  }
+
+  {
+    double scalar_out[7] = {Mass_out, ang_outx, ang_outy, ang_outz, p_outx, p_outy, p_outz};
+    double scalar_in[7];
+    MPI_Allreduce(scalar_out, scalar_in, 7, MPI_DOUBLE, MPI_SUM, Comm_here);
+    mass = scalar_in[0]; sx = scalar_in[1]; sy = scalar_in[2]; sz = scalar_in[3];
+    px = scalar_in[4]; py = scalar_in[5]; pz = scalar_in[6];
+  }
+
+#ifdef GaussInt
+  mass = mass * rex * rex * dphi * factor;
+
+  sx = sx * rex * rex * dphi * (1.0 / PI) * factor;
+  sy = sy * rex * rex * dphi * (1.0 / PI) * factor;
+  sz = sz * rex * rex * dphi * (1.0 / PI) * factor;
+
+  px = px * rex * rex * dphi * (1.0 / PI) * factor;
+  py = py * rex * rex * dphi * (1.0 / PI) * factor;
+  pz = pz * rex * rex * dphi * (1.0 / PI) * factor;
+#else
+  mass = mass * rex * rex * dphi * dcostheta * factor;
+
+  sx = sx * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
+  sy = sy * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
+  sz = sz * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
+
+  px = px * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
+  py = py * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
+  pz = pz * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
+#endif
+
+  Rout[0] = mass;
+  Rout[1] = px;
+  Rout[2] = py;
+  Rout[3] = pz;
+  Rout[4] = sx;
+  Rout[5] = sy;
+  Rout[6] = sz;
+
+  delete[] pox[0];
+  delete[] pox[1];
+  delete[] pox[2];
+  delete[] shellf;
+  DG_List->clearList();
+}
+//|----------------------------------------------------------------
+//  mass, linear momentum and angular momentum surface integrals: shell patch version
+//|----------------------------------------------------------------
+void surface_integral::surf_MassPAng(double rex, int lev, ShellPatch *GH, var *chi, var *trK, // rex scales the nx_g/ny_g/nz_g directions into the sample points
+                                     var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz, // 1.0 is added to the interpolated gxx, gyy, gzz below
+                                     var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
+                                     var *Gmx, var *Gmy, var *Gmz, // only handed to f_admmass_bssn_ss, not interpolated here
+                                     var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs, // temporary memory for mass^i
+                                     double *Rout, monitor *Monitor) // Rout[0..6] = mass, px, py, pz, sx, sy, sz; not written when lev != 0
+{
+  if (lev != 0) // only level 0 is handled: warn on rank 0 and return without touching Rout
+  {
+    if (myrank == 0)
+    {
+      if (Monitor && Monitor->outfile)
+        Monitor->outfile << "WARNING: shell surface integral not on level 0" << endl;
+      else
+        cout << "WARNING: shell surface integral not on level 0" << endl;
+    }
+    return;
+  }
+
+  double mass, px, py, pz, sx, sy, sz; // totals, assigned from the MPI_Allreduce result below
+  // Step 1: call f_admmass_bssn_ss on every block owned by this rank (presumably it fills Sf*_rhs with the mass integrand - TODO confirm)
+  MyList<ss_patch> *Pp = GH->PatL;
+  while (Pp)
+  {
+    MyList<Block> *BL = Pp->data->blb;
+    int fngfs = Pp->data->fngfs;
+    while (BL)
+    {
+      Block *cg = BL->data;
+      if (myrank == cg->rank)
+      {
+        f_admmass_bssn_ss(cg->shape, cg->X[0], cg->X[1], cg->X[2],
+                          cg->fgfs[fngfs + ShellPatch::gx], cg->fgfs[fngfs + ShellPatch::gy], cg->fgfs[fngfs + ShellPatch::gz],
+                          cg->fgfs[fngfs + ShellPatch::drhodx], cg->fgfs[fngfs + ShellPatch::drhody], cg->fgfs[fngfs + ShellPatch::drhodz],
+                          cg->fgfs[fngfs + ShellPatch::dsigmadx], cg->fgfs[fngfs + ShellPatch::dsigmady], cg->fgfs[fngfs + ShellPatch::dsigmadz],
+                          cg->fgfs[fngfs + ShellPatch::dRdx], cg->fgfs[fngfs + ShellPatch::dRdy], cg->fgfs[fngfs + ShellPatch::dRdz],
+                          cg->fgfs[fngfs + ShellPatch::drhodxx], cg->fgfs[fngfs + ShellPatch::drhodxy], cg->fgfs[fngfs + ShellPatch::drhodxz],
+                          cg->fgfs[fngfs + ShellPatch::drhodyy], cg->fgfs[fngfs + ShellPatch::drhodyz], cg->fgfs[fngfs + ShellPatch::drhodzz],
+                          cg->fgfs[fngfs + ShellPatch::dsigmadxx], cg->fgfs[fngfs + ShellPatch::dsigmadxy], cg->fgfs[fngfs + ShellPatch::dsigmadxz],
+                          cg->fgfs[fngfs + ShellPatch::dsigmadyy], cg->fgfs[fngfs + ShellPatch::dsigmadyz], cg->fgfs[fngfs + ShellPatch::dsigmadzz],
+                          cg->fgfs[fngfs + ShellPatch::dRdxx], cg->fgfs[fngfs + ShellPatch::dRdxy], cg->fgfs[fngfs + ShellPatch::dRdxz],
+                          cg->fgfs[fngfs + ShellPatch::dRdyy], cg->fgfs[fngfs + ShellPatch::dRdyz], cg->fgfs[fngfs + ShellPatch::dRdzz],
+                          cg->fgfs[chi->sgfn], cg->fgfs[trK->sgfn],
+                          cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
+                          cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn],
+                          cg->fgfs[Gmx->sgfn], cg->fgfs[Gmy->sgfn], cg->fgfs[Gmz->sgfn],
+                          cg->fgfs[Sfx_rhs->sgfn], cg->fgfs[Sfy_rhs->sgfn], cg->fgfs[Sfz_rhs->sgfn],
+                          Symmetry, Pp->data->sst);
+      }
+      if (BL == Pp->data->ble) // stop once the patch's end block (ble) has been processed
+        break;
+      BL = BL->next;
+    }
+    Pp = Pp->next;
+  }
+
+  const int InList = 17; // number of variables put in DG_List = values stored per sample point in shellf
+  // Step 2: variables to interpolate; shellf is read below with offsets 0-2 = Sf*_rhs, 3 = chi, 4 = trK, 5-10 = g, 11-16 = A (assumes insert() appends - TODO confirm)
+  MyList<var> *DG_List = new MyList<var>(Sfx_rhs);
+  DG_List->insert(Sfy_rhs);
+  DG_List->insert(Sfz_rhs);
+  DG_List->insert(chi);
+  DG_List->insert(trK);
+  DG_List->insert(gxx);
+  DG_List->insert(gxy);
+  DG_List->insert(gxz);
+  DG_List->insert(gyy);
+  DG_List->insert(gyz);
+  DG_List->insert(gzz);
+  DG_List->insert(Axx);
+  DG_List->insert(Axy);
+  DG_List->insert(Axz);
+  DG_List->insert(Ayy);
+  DG_List->insert(Ayz);
+  DG_List->insert(Azz);
+  // sample points: rex times the direction (nx_g, ny_g, nz_g), n_tot points in total
+  int n;
+  double *pox[3];
+  for (int i = 0; i < 3; i++)
+    pox[i] = new double[n_tot];
+  for (n = 0; n < n_tot; n++)
+  {
+    pox[0][n] = rex * nx_g[n];
+    pox[1][n] = rex * ny_g[n];
+    pox[2][n] = rex * nz_g[n];
+  }
+
+  double *shellf;
+  shellf = new double[n_tot * InList];
+
+  // interpolate the DG_List variables at all n_tot points into shellf (InList values per point);
+  // NOTE(review): the original comment here said "only one box on this level, so no loop over boxes" - confirm it applies to the shell patch
+  GH->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry);
+
+  double Mass_out = 0;
+  double ang_outx, ang_outy, ang_outz;
+  double p_outx, p_outy, p_outz;
+  ang_outx = ang_outy = ang_outz = 0.0;
+  p_outx = p_outy = p_outz = 0.0;
+  const double f1o8 = 0.125; // the 1/8 of the 1/(8 pi) prefactor; the 1/PI part is applied after the reduction
+
+  int mp, Lp, Nmin, Nmax;
+  // split the n_tot points over cpusize ranks: the first Lp ranks take mp + 1 points each, the others mp points
+  mp = n_tot / cpusize;
+  Lp = n_tot - cpusize * mp;
+
+  if (Lp > myrank)
+  {
+    Nmin = myrank * mp + myrank;
+    Nmax = Nmin + mp;
+  }
+  else
+  {
+    Nmin = myrank * mp + Lp;
+    Nmax = Nmin + mp - 1;
+  }
+
+  double Chi, Psi;
+  double Gxx, Gxy, Gxz, Gyy, Gyz, Gzz;
+  double gupxx, gupxy, gupxz, gupyy, gupyz, gupzz;
+  double TRK, axx, axy, axz, ayy, ayz, azz;
+  double aupxx, aupxy, aupxz, aupyx, aupyy, aupyz, aupzx, aupzy, aupzz;
+  int i;
+  for (n = Nmin; n <= Nmax; n++) // inclusive range; empty on ranks with myrank >= Lp when mp == 0
+  {
+    // i indexes wtcostheta and is only used when GaussInt is defined
+    i = int(n / N_phi); // truncates toward zero, e.g. int(1.723) = 1, int(-1.732) = -1
+
+    Chi = shellf[InList * n + 3]; // chi in fact
+    TRK = shellf[InList * n + 4];
+    Gxx = shellf[InList * n + 5] + 1.0;
+    Gxy = shellf[InList * n + 6];
+    Gxz = shellf[InList * n + 7];
+    Gyy = shellf[InList * n + 8] + 1.0;
+    Gyz = shellf[InList * n + 9];
+    Gzz = shellf[InList * n + 10] + 1.0;
+    axx = shellf[InList * n + 11];
+    axy = shellf[InList * n + 12];
+    axz = shellf[InList * n + 13];
+    ayy = shellf[InList * n + 14];
+    ayz = shellf[InList * n + 15];
+    azz = shellf[InList * n + 16];
+
+    Chi = 1.0 / (1.0 + Chi); // exp(4*phi)
+    Psi = Chi * sqrt(Chi);   // Psi^6
+// Chi^2 corresponds to the metric determinant,
+// but this factor is presumably already accounted for in f_admmass_bssn_ss (the original comment names f_admmass_bssn)
+#ifdef GaussInt
+    // wtcostheta is an even function with respect to costheta
+    Mass_out = Mass_out + (shellf[InList * n] * nx_g[n] + shellf[InList * n + 1] * ny_g[n] + shellf[InList * n + 2] * nz_g[n]) * wtcostheta[i];
+#else
+    Mass_out = Mass_out + (shellf[InList * n] * nx_g[n] + shellf[InList * n + 1] * ny_g[n] + shellf[InList * n + 2] * nz_g[n]);
+#endif
+    // invert the symmetric 3x3 matrix G by cofactors; gupzz temporarily holds the determinant until it is overwritten last
+    gupzz = Gxx * Gyy * Gzz + Gxy * Gyz * Gxz + Gxz * Gxy * Gyz -
+            Gxz * Gyy * Gxz - Gxy * Gxy * Gzz - Gxx * Gyz * Gyz;
+    gupxx = (Gyy * Gzz - Gyz * Gyz) / gupzz;
+    gupxy = -(Gxy * Gzz - Gyz * Gxz) / gupzz;
+    gupxz = (Gxy * Gyz - Gyy * Gxz) / gupzz;
+    gupyy = (Gxx * Gzz - Gxz * Gxz) / gupzz;
+    gupyz = -(Gxx * Gyz - Gxy * Gxz) / gupzz;
+    gupzz = (Gxx * Gyy - Gxy * Gxy) / gupzz;
+    // aup_ab = sum_c gup_ac * a_cb: the first index of a is contracted with the inverse matrix
+    aupxx = gupxx * axx + gupxy * axy + gupxz * axz;
+    aupxy = gupxx * axy + gupxy * ayy + gupxz * ayz;
+    aupxz = gupxx * axz + gupxy * ayz + gupxz * azz;
+    aupyx = gupxy * axx + gupyy * axy + gupyz * axz;
+    aupyy = gupxy * axy + gupyy * ayy + gupyz * ayz;
+    aupyz = gupxy * axz + gupyy * ayz + gupyz * azz;
+    aupzx = gupxz * axx + gupyz * axy + gupzz * axz;
+    aupzy = gupxz * axy + gupyz * ayy + gupzz * ayz;
+    aupzz = gupxz * axz + gupyz * ayz + gupzz * azz;
+    if (Symmetry == 0)
+    {
+#ifdef GaussInt
+      // wtcostheta is an even function with respect to costheta
+      //  1/8\pi \int \psi^6 (y A^m_z - zA^m_y) dS_m
+      ang_outx = ang_outx + f1o8 * Psi * (nx_g[n] * (pox[1][n] * aupxz - pox[2][n] * aupxy) + ny_g[n] * (pox[1][n] * aupyz - pox[2][n] * aupyy) + nz_g[n] * (pox[1][n] * aupzz - pox[2][n] * aupzy)) * wtcostheta[i];
+      //  1/8\pi \int \psi^6 (z A^m_x - xA^m_z) dS_m
+      ang_outy = ang_outy + f1o8 * Psi * (nx_g[n] * (pox[2][n] * aupxx - pox[0][n] * aupxz) + ny_g[n] * (pox[2][n] * aupyx - pox[0][n] * aupyz) + nz_g[n] * (pox[2][n] * aupzx - pox[0][n] * aupzz)) * wtcostheta[i];
+      // 1/8\pi \int \psi^6 (x A^m_y - yA^m_x) dS_m
+      ang_outz = ang_outz + f1o8 * Psi * (nx_g[n] * (pox[0][n] * aupxy - pox[1][n] * aupxx) + ny_g[n] * (pox[0][n] * aupyy - pox[1][n] * aupyx) + nz_g[n] * (pox[0][n] * aupzy - pox[1][n] * aupzx)) * wtcostheta[i];
+#else
+      //  1/8\pi \int \psi^6 (y A^m_z - zA^m_y) dS_m
+      ang_outx = ang_outx + f1o8 * Psi * (nx_g[n] * (pox[1][n] * aupxz - pox[2][n] * aupxy) + ny_g[n] * (pox[1][n] * aupyz - pox[2][n] * aupyy) + nz_g[n] * (pox[1][n] * aupzz - pox[2][n] * aupzy));
+      //  1/8\pi \int \psi^6 (z A^m_x - xA^m_z) dS_m
+      ang_outy = ang_outy + f1o8 * Psi * (nx_g[n] * (pox[2][n] * aupxx - pox[0][n] * aupxz) + ny_g[n] * (pox[2][n] * aupyx - pox[0][n] * aupyz) + nz_g[n] * (pox[2][n] * aupzx - pox[0][n] * aupzz));
+      // 1/8\pi \int \psi^6 (x A^m_y - yA^m_x) dS_m
+      ang_outz = ang_outz + f1o8 * Psi * (nx_g[n] * (pox[0][n] * aupxy - pox[1][n] * aupxx) + ny_g[n] * (pox[0][n] * aupyy - pox[1][n] * aupyx) + nz_g[n] * (pox[0][n] * aupzy - pox[1][n] * aupzx));
+#endif
+    }
+    else if (Symmetry == 1) // only the z component of the angular momentum is accumulated
+    {
+#ifdef GaussInt
+      ang_outz = ang_outz + f1o8 * Psi * (nx_g[n] * (pox[0][n] * aupxy - pox[1][n] * aupxx) + ny_g[n] * (pox[0][n] * aupyy - pox[1][n] * aupyx) + nz_g[n] * (pox[0][n] * aupzy - pox[1][n] * aupzx)) * wtcostheta[i];
+#else
+      ang_outz = ang_outz + f1o8 * Psi * (nx_g[n] * (pox[0][n] * aupxy - pox[1][n] * aupxx) + ny_g[n] * (pox[0][n] * aupyy - pox[1][n] * aupyx) + nz_g[n] * (pox[0][n] * aupzy - pox[1][n] * aupzx));
+#endif
+    }
+    // from here on a_ij is overwritten: a_ij <- Chi * (a_ij + G_ij * TRK / 3), then TRK is subtracted on the diagonal
+    axx = Chi * (axx + Gxx * TRK / 3.0);
+    axy = Chi * (axy + Gxy * TRK / 3.0);
+    axz = Chi * (axz + Gxz * TRK / 3.0);
+    ayy = Chi * (ayy + Gyy * TRK / 3.0);
+    ayz = Chi * (ayz + Gyz * TRK / 3.0);
+    azz = Chi * (azz + Gzz * TRK / 3.0);
+
+    axx = axx - TRK;
+    ayy = ayy - TRK;
+    azz = azz - TRK;
+
+    // 1/8\pi \int \psi^6 (K_mi - \delta_mi trK) dS^m: lower index linear momentum
+    if (Symmetry == 0)
+    {
+#ifdef GaussInt
+      p_outx = p_outx + f1o8 * Psi * (nx_g[n] * axx + ny_g[n] * axy + nz_g[n] * axz) * wtcostheta[i];
+      p_outy = p_outy + f1o8 * Psi * (nx_g[n] * axy + ny_g[n] * ayy + nz_g[n] * ayz) * wtcostheta[i];
+      p_outz = p_outz + f1o8 * Psi * (nx_g[n] * axz + ny_g[n] * ayz + nz_g[n] * azz) * wtcostheta[i];
+#else
+      p_outx = p_outx + f1o8 * Psi * (nx_g[n] * axx + ny_g[n] * axy + nz_g[n] * axz);
+      p_outy = p_outy + f1o8 * Psi * (nx_g[n] * axy + ny_g[n] * ayy + nz_g[n] * ayz);
+      p_outz = p_outz + f1o8 * Psi * (nx_g[n] * axz + ny_g[n] * ayz + nz_g[n] * azz);
+#endif
+    }
+    else if (Symmetry == 1) // only the x and y components of the linear momentum are accumulated
+    {
+#ifdef GaussInt
+      p_outx = p_outx + f1o8 * Psi * (nx_g[n] * axx + ny_g[n] * axy + nz_g[n] * axz) * wtcostheta[i];
+      p_outy = p_outy + f1o8 * Psi * (nx_g[n] * axy + ny_g[n] * ayy + nz_g[n] * ayz) * wtcostheta[i];
+#else
+      p_outx = p_outx + f1o8 * Psi * (nx_g[n] * axx + ny_g[n] * axy + nz_g[n] * axz);
+      p_outy = p_outy + f1o8 * Psi * (nx_g[n] * axy + ny_g[n] * ayy + nz_g[n] * ayz);
+#endif
+    }
+  }
+  // Step 3: sum the seven per-rank partial results over MPI_COMM_WORLD in a single reduction
+  {
+    double scalar_out[7] = {Mass_out, ang_outx, ang_outy, ang_outz, p_outx, p_outy, p_outz};
+    double scalar_in[7];
+    MPI_Allreduce(scalar_out, scalar_in, 7, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    mass = scalar_in[0]; sx = scalar_in[1]; sy = scalar_in[2]; sz = scalar_in[3];
+    px = scalar_in[4]; py = scalar_in[5]; pz = scalar_in[6];
+  }
+  // Step 4: common scale factors; without GaussInt the uniform dcostheta takes the place of the per-point wtcostheta weight
+#ifdef GaussInt
+  mass = mass * rex * rex * dphi * factor;
+
+  sx = sx * rex * rex * dphi * (1.0 / PI) * factor;
+  sy = sy * rex * rex * dphi * (1.0 / PI) * factor;
+  sz = sz * rex * rex * dphi * (1.0 / PI) * factor;
+
+  px = px * rex * rex * dphi * (1.0 / PI) * factor;
+  py = py * rex * rex * dphi * (1.0 / PI) * factor;
+  pz = pz * rex * rex * dphi * (1.0 / PI) * factor;
+#else
+  mass = mass * rex * rex * dphi * dcostheta * factor;
+
+  sx = sx * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
+  sy = sy * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
+  sz = sz * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
+
+  px = px * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
+  py = py * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
+  pz = pz * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
+#endif
+  // output order: mass, then linear momentum (px, py, pz), then angular momentum (sx, sy, sz)
+  Rout[0] = mass;
+  Rout[1] = px;
+  Rout[2] = py;
+  Rout[3] = pz;
+  Rout[4] = sx;
+  Rout[5] = sy;
+  Rout[6] = sz;
+
+  delete[] pox[0];
+  delete[] pox[1];
+  delete[] pox[2];
+  delete[] shellf;
+  DG_List->clearList(); // NOTE(review): presumably releases the list nodes allocated above, not the var objects - confirm in MyList
+}
+//|----------------------------------------------------------------
+//  does not discriminate between box and shell
+//  for gravitational waves, with special handling of the symmetric cases
+//|----------------------------------------------------------------
+void surface_integral::surf_Wave(double rex, cgh *GH, ShellPatch *SH,
+                                 var *chi, var *trK,
+                                 var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
+                                 var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
+                                 var *chix, var *chiy, var *chiz,
+                                 var *trKx, var *trKy, var *trKz,
+                                 var *Axxx, var *Axxy, var *Axxz,
+                                 var *Axyx, var *Axyy, var *Axyz,
+                                 var *Axzx, var *Axzy, var *Axzz,
+                                 var *Ayyx, var *Ayyy, var *Ayyz,
+                                 var *Ayzx, var *Ayzy, var *Ayzz,
+                                 var *Azzx, var *Azzy, var *Azzz,
+                                 var *Gamxxx, var *Gamxxy, var *Gamxxz, var *Gamxyy, var *Gamxyz, var *Gamxzz,
+                                 var *Gamyxx, var *Gamyxy, var *Gamyxz, var *Gamyyy, var *Gamyyz, var *Gamyzz,
+                                 var *Gamzxx, var *Gamzxy, var *Gamzxz, var *Gamzyy, var *Gamzyz, var *Gamzzz,
+                                 var *Rxx, var *Rxy, var *Rxz, var *Ryy, var *Ryz, var *Rzz,
+                                 int spinw, int maxl, int NN, double *RP, double *IP,
+                                 monitor *Monitor) // NN is the length of RP and IP
+{
+  const int InList = 62;
+
+  MyList<var> *DG_List = new MyList<var>(chi);
+  DG_List->insert(trK);
+  DG_List->insert(gxx);
+  DG_List->insert(gxy);
+  DG_List->insert(gxz);
+  DG_List->insert(gyy);
+  DG_List->insert(gyz);
+  DG_List->insert(gzz);
+  DG_List->insert(Axx);
+  DG_List->insert(Axy);
+  DG_List->insert(Axz);
+  DG_List->insert(Ayy);
+  DG_List->insert(Ayz);
+  DG_List->insert(Azz);
+  DG_List->insert(chix);
+  DG_List->insert(chiy);
+  DG_List->insert(chiz);
+  DG_List->insert(trKx);
+  DG_List->insert(trKy);
+  DG_List->insert(trKz);
+  DG_List->insert(Axxx);
+  DG_List->insert(Axxy);
+  DG_List->insert(Axxz);
+  DG_List->insert(Axyx);
+  DG_List->insert(Axyy);
+  DG_List->insert(Axyz);
+  DG_List->insert(Axzx);
+  DG_List->insert(Axzy);
+  DG_List->insert(Axzz);
+  DG_List->insert(Ayyx);
+  DG_List->insert(Ayyy);
+  DG_List->insert(Ayyz);
+  DG_List->insert(Ayzx);
+  DG_List->insert(Ayzy);
+  DG_List->insert(Ayzz);
+  DG_List->insert(Azzx);
+  DG_List->insert(Azzy);
+  DG_List->insert(Azzz);
+  DG_List->insert(Gamxxx);
+  DG_List->insert(Gamxxy);
+  DG_List->insert(Gamxxz);
+  DG_List->insert(Gamxyy);
+  DG_List->insert(Gamxyz);
+  DG_List->insert(Gamxzz);
+  DG_List->insert(Gamyxx);
+  DG_List->insert(Gamyxy);
+  DG_List->insert(Gamyxz);
+  DG_List->insert(Gamyyy);
+  DG_List->insert(Gamyyz);
+  DG_List->insert(Gamyzz);
+  DG_List->insert(Gamzxx);
+  DG_List->insert(Gamzxy);
+  DG_List->insert(Gamzxz);
+  DG_List->insert(Gamzyy);
+  DG_List->insert(Gamzyz);
+  DG_List->insert(Gamzzz);
+  DG_List->insert(Rxx);
+  DG_List->insert(Rxy);
+  DG_List->insert(Rxz);
+  DG_List->insert(Ryy);
+  DG_List->insert(Ryz);
+  DG_List->insert(Rzz);
+
+  int n;
+  double *pox[3];
+  for (int i = 0; i < 3; i++)
+    pox[i] = new double[n_tot];
+  for (n = 0; n < n_tot; n++)
+  {
+    pox[0][n] = rex * nx_g[n];
+    pox[1][n] = rex * ny_g[n];
+    pox[2][n] = rex * nz_g[n];
+  }
+
+  double *shellf;
+  shellf = new double[n_tot * InList];
+
+  SR_Interp_Points(DG_List, GH, SH, n_tot, pox, shellf);
+
+  double *RP_out, *IP_out;
+  RP_out = new double[NN];
+  IP_out = new double[NN];
+
+  for (int ii = 0; ii < NN; ii++)
+  {
+    RP_out[ii] = 0;
+    IP_out[ii] = 0;
+  }
+
+  int mp, Lp, Nmin, Nmax;
+
+  mp = n_tot / cpusize;
+  Lp = n_tot - cpusize * mp;
+
+  if (Lp > myrank)
+  {
+    Nmin = myrank * mp + myrank;
+    Nmax = Nmin + mp;
+  }
+  else
+  {
+    Nmin = myrank * mp + Lp;
+    Nmax = Nmin + mp - 1;
+  }
+
+  // theta part
+  double costheta, thetap;
+  double cosmphi, sinmphi;
+
+  int i, j;
+  int lpsy = 0;
+  if (Symmetry == 0)
+    lpsy = 1;
+  else if (Symmetry == 1)
+    lpsy = 2;
+  else if (Symmetry == 2)
+    lpsy = 8;
+
+  double psi4RR, psi4II;
+  double px, py, pz;
+  double pchi, ptrK, pgxx, pgxy, pgxz, pgyy, pgyz, pgzz;
+  double pAxx, pAxy, pAxz, pAyy, pAyz, pAzz;
+  double pchix, pchiy, pchiz;
+  double ptrKx, ptrKy, ptrKz;
+  double pAxxx, pAxxy, pAxxz;
+  double pAxyx, pAxyy, pAxyz;
+  double pAxzx, pAxzy, pAxzz;
+  double pAyyx, pAyyy, pAyyz;
+  double pAyzx, pAyzy, pAyzz;
+  double pAzzx, pAzzy, pAzzz;
+  double pGamxxx, pGamxxy, pGamxxz, pGamxyy, pGamxyz, pGamxzz;
+  double pGamyxx, pGamyxy, pGamyxz, pGamyyy, pGamyyz, pGamyzz;
+  double pGamzxx, pGamzxy, pGamzxz, pGamzyy, pGamzyz, pGamzzz;
+  double pRxx, pRxy, pRxz, pRyy, pRyz, pRzz;
+  for (n = Nmin; n <= Nmax; n++)
+  {
+    //       need round off always
+    i = int(n / N_phi); // int(1.723) = 1, int(-1.732) = -1
+    j = n - i * N_phi;
+
+    int countlm = 0;
+    for (int pl = spinw; pl < maxl + 1; pl++)
+      for (int pm = -pl; pm < pl + 1; pm++)
+      {
+        for (int lp = 0; lp < lpsy; lp++)
+        {
+          px = pox[0][n];
+          py = pox[1][n];
+          pz = pox[2][n];
+          pchi = shellf[InList * n];
+          ptrK = shellf[InList * n + 1];
+          pgxx = shellf[InList * n + 2];
+          pgxy = shellf[InList * n + 3];
+          pgxz = shellf[InList * n + 4];
+          pgyy = shellf[InList * n + 5];
+          pgyz = shellf[InList * n + 6];
+          pgzz = shellf[InList * n + 7];
+          pAxx = shellf[InList * n + 8];
+          pAxy = shellf[InList * n + 9];
+          pAxz = shellf[InList * n + 10];
+          pAyy = shellf[InList * n + 11];
+          pAyz = shellf[InList * n + 12];
+          pAzz = shellf[InList * n + 13];
+          pchix = shellf[InList * n + 14];
+          pchiy = shellf[InList * n + 15];
+          pchiz = shellf[InList * n + 16];
+          ptrKx = shellf[InList * n + 17];
+          ptrKy = shellf[InList * n + 18];
+          ptrKz = shellf[InList * n + 19];
+          pAxxx = shellf[InList * n + 20];
+          pAxxy = shellf[InList * n + 21];
+          pAxxz = shellf[InList * n + 22];
+          pAxyx = shellf[InList * n + 23];
+          pAxyy = shellf[InList * n + 24];
+          pAxyz = shellf[InList * n + 25];
+          pAxzx = shellf[InList * n + 26];
+          pAxzy = shellf[InList * n + 27];
+          pAxzz = shellf[InList * n + 28];
+          pAyyx = shellf[InList * n + 29];
+          pAyyy = shellf[InList * n + 30];
+          pAyyz = shellf[InList * n + 31];
+          pAyzx = shellf[InList * n + 32];
+          pAyzy = shellf[InList * n + 33];
+          pAyzz = shellf[InList * n + 34];
+          pAzzx = shellf[InList * n + 35];
+          pAzzy = shellf[InList * n + 36];
+          pAzzz = shellf[InList * n + 37];
+          pGamxxx = shellf[InList * n + 38];
+          pGamxxy = shellf[InList * n + 39];
+          pGamxxz = shellf[InList * n + 40];
+          pGamxyy = shellf[InList * n + 41];
+          pGamxyz = shellf[InList * n + 42];
+          pGamxzz = shellf[InList * n + 43];
+          pGamyxx = shellf[InList * n + 44];
+          pGamyxy = shellf[InList * n + 45];
+          pGamyxz = shellf[InList * n + 46];
+          pGamyyy = shellf[InList * n + 47];
+          pGamyyz = shellf[InList * n + 48];
+          pGamyzz = shellf[InList * n + 49];
+          pGamzxx = shellf[InList * n + 50];
+          pGamzxy = shellf[InList * n + 51];
+          pGamzxz = shellf[InList * n + 52];
+          pGamzyy = shellf[InList * n + 53];
+          pGamzyz = shellf[InList * n + 54];
+          pGamzzz = shellf[InList * n + 55];
+          pRxx = shellf[InList * n + 56];
+          pRxy = shellf[InList * n + 57];
+          pRxz = shellf[InList * n + 58];
+          pRyy = shellf[InList * n + 59];
+          pRyz = shellf[InList * n + 60];
+          pRzz = shellf[InList * n + 61];
+          switch (lp)
+          {
+          case 0: //+++ (theta, phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = sin(pm * (j + 0.5) * dphi);
+            break;
+          case 1: //++- (pi-theta, phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = sin(pm * (j + 0.5) * dphi);
+            pz = -pz;
+            pgxz = -pgxz;
+            pgyz = -pgyz;
+            pAxz = -pAxz;
+            pAyz = -pAyz;
+            pchiz = -pchiz;
+            ptrKz = -ptrKz;
+            pAxxz = -pAxxz;
+            pAxyz = -pAxyz;
+            pAxzx = -pAxzx;
+            pAxzy = -pAxzy;
+            pAyyz = -pAyyz;
+            pAyzx = -pAyzx;
+            pAyzy = -pAyzy;
+            pAzzz = -pAzzz;
+            pGamxxz = -pGamxxz;
+            pGamxyz = -pGamxyz;
+            pGamyxz = -pGamyxz;
+            pGamyyz = -pGamyyz;
+            pGamzxx = -pGamzxx;
+            pGamzxy = -pGamzxy;
+            pGamzyy = -pGamzyy;
+            pGamzzz = -pGamzzz;
+            pRxz = -pRxz;
+            pRyz = -pRyz;
+            break;
+          case 2: //+-+ (theta, 2*pi-phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = -sin(pm * (j + 0.5) * dphi);
+            py = -py;
+            pgxy = -pgxy;
+            pgyz = -pgyz;
+            pAxy = -pAxy;
+            pAyz = -pAyz;
+            pchiy = -pchiy;
+            ptrKy = -ptrKy;
+            pAxxy = -pAxxy;
+            pAxyx = -pAxyx;
+            pAxyz = -pAxyz;
+            pAxzy = -pAxzy;
+            pAyyy = -pAyyy;
+            pAyzx = -pAyzx;
+            pAyzz = -pAyzz;
+            pAzzy = -pAzzy;
+            pGamxxy = -pGamxxy;
+            pGamxyz = -pGamxyz;
+            pGamyxx = -pGamyxx;
+            pGamyxz = -pGamyxz;
+            pGamyyy = -pGamyyy;
+            pGamyzz = -pGamyzz;
+            pGamzxy = -pGamzxy;
+            pGamzyz = -pGamzyz;
+            pRxy = -pRxy;
+            pRyz = -pRyz;
+            break;
+          case 3: //+-- (pi-theta, 2*pi-phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (j + 0.5) * dphi);
+            sinmphi = -sin(pm * (j + 0.5) * dphi);
+            py = -py;
+            pz = -pz;
+            pgxy = -pgxy;
+            pgxz = -pgxz;
+            pAxy = -pAxy;
+            pAxz = -pAxz;
+            pchiy = -pchiy;
+            pchiz = -pchiz;
+            ptrKy = -ptrKy;
+            ptrKz = -ptrKz;
+            pAxxy = -pAxxy;
+            pAxxz = -pAxxz;
+            pAxyx = -pAxyx;
+            pAxzx = -pAxzx;
+            pAyyy = -pAyyy;
+            pAyyz = -pAyyz;
+            pAyzy = -pAyzy;
+            pAyzz = -pAyzz;
+            pAzzy = -pAzzy;
+            pAzzz = -pAzzz;
+            pGamxxy = -pGamxxy;
+            pGamxxz = -pGamxxz;
+            pGamyxx = -pGamyxx;
+            pGamyyy = -pGamyyy;
+            pGamyyz = -pGamyyz;
+            pGamyzz = -pGamyzz;
+            pGamzxx = -pGamzxx;
+            pGamzyy = -pGamzyy;
+            pGamzyz = -pGamzyz;
+            pGamzzz = -pGamzzz;
+            pRxy = -pRxy;
+            pRxz = -pRxz;
+            break;
+          case 4: //-++ (theta, pi-phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
+            px = -px;
+            pgxy = -pgxy;
+            pgxz = -pgxz;
+            pAxy = -pAxy;
+            pAxz = -pAxz;
+            pchix = -pchix;
+            ptrKx = -ptrKx;
+            pAxxx = -pAxxx;
+            pAxyy = -pAxyy;
+            pAxyz = -pAxyz;
+            pAxzy = -pAxzy;
+            pAxzz = -pAxzz;
+            pAyyx = -pAyyx;
+            pAyzx = -pAyzx;
+            pAzzx = -pAzzx;
+            pGamxxx = -pGamxxx;
+            pGamxyy = -pGamxyy;
+            pGamxyz = -pGamxyz;
+            pGamxzz = -pGamxzz;
+            pGamyxy = -pGamyxy;
+            pGamyxz = -pGamyxz;
+            pGamzxy = -pGamzxy;
+            pGamzxz = -pGamzxz;
+            pRxy = -pRxy;
+            pRxz = -pRxz;
+            break;
+          case 5: //-+- (pi-theta, pi-phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (PI - (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI - (j + 0.5) * dphi));
+            px = -px;
+            pz = -pz;
+            pgxy = -pgxy;
+            pgyz = -pgyz;
+            pAxy = -pAxy;
+            pAyz = -pAyz;
+            pchix = -pchix;
+            pchiz = -pchiz;
+            ptrKx = -ptrKx;
+            ptrKz = -ptrKz;
+            pAxxx = -pAxxx;
+            pAxxz = -pAxxz;
+            pAxyy = -pAxyy;
+            pAxzx = -pAxzx;
+            pAxzz = -pAxzz;
+            pAyyx = -pAyyx;
+            pAyyz = -pAyyz;
+            pAyzy = -pAyzy;
+            pAzzx = -pAzzx;
+            pAzzz = -pAzzz;
+            pGamxxx = -pGamxxx;
+            pGamxxz = -pGamxxz;
+            pGamxyy = -pGamxyy;
+            pGamxzz = -pGamxzz;
+            pGamyxy = -pGamyxy;
+            pGamyyz = -pGamyyz;
+            pGamzxx = -pGamzxx;
+            pGamzxz = -pGamzxz;
+            pGamzyy = -pGamzyy;
+            pGamzzz = -pGamzzz;
+            pRxy = -pRxy;
+            pRyz = -pRyz;
+            break;
+          case 6: //--+ (theta, pi+phi)
+            costheta = arcostheta[i];
+            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
+            px = -px;
+            py = -py;
+            pgxz = -pgxz;
+            pgyz = -pgyz;
+            pAxz = -pAxz;
+            pAyz = -pAyz;
+            pchix = -pchix;
+            pchiy = -pchiy;
+            ptrKx = -ptrKx;
+            ptrKy = -ptrKy;
+            pAxxx = -pAxxx;
+            pAxxy = -pAxxy;
+            pAxyx = -pAxyx;
+            pAxyy = -pAxyy;
+            pAxzz = -pAxzz;
+            pAyyx = -pAyyx;
+            pAyyy = -pAyyy;
+            pAyzz = -pAyzz;
+            pAzzx = -pAzzx;
+            pAzzy = -pAzzy;
+            pGamxxx = -pGamxxx;
+            pGamxxy = -pGamxxy;
+            pGamxyy = -pGamxyy;
+            pGamxzz = -pGamxzz;
+            pGamyxx = -pGamyxx;
+            pGamyxy = -pGamyxy;
+            pGamyyy = -pGamyyy;
+            pGamyzz = -pGamyzz;
+            pGamzxz = -pGamzxz;
+            pGamzyz = -pGamzyz;
+            pRxz = -pRxz;
+            pRyz = -pRyz;
+            break;
+          case 7: //--- (pi-theta, pi+phi)
+            costheta = -arcostheta[i];
+            cosmphi = cos(pm * (PI + (j + 0.5) * dphi));
+            sinmphi = sin(pm * (PI + (j + 0.5) * dphi));
+            px = -px;
+            py = -py;
+            pz = -pz;
+            pchix = -pchix;
+            pchiy = -pchiy;
+            pchiz = -pchiz;
+            ptrKx = -ptrKx;
+            ptrKy = -ptrKy;
+            ptrKz = -ptrKz;
+            pAxxx = -pAxxx;
+            pAxxy = -pAxxy;
+            pAxxz = -pAxxz;
+            pAxyx = -pAxyx;
+            pAxyy = -pAxyy;
+            pAxyz = -pAxyz;
+            pAxzx = -pAxzx;
+            pAxzy = -pAxzy;
+            pAxzz = -pAxzz;
+            pAyyx = -pAyyx;
+            pAyyy = -pAyyy;
+            pAyyz = -pAyyz;
+            pAyzx = -pAyzx;
+            pAyzy = -pAyzy;
+            pAyzz = -pAyzz;
+            pAzzx = -pAzzx;
+            pAzzy = -pAzzy;
+            pAzzz = -pAzzz;
+            pGamxxx = -pGamxxx;
+            pGamxxy = -pGamxxy;
+            pGamxxz = -pGamxxz;
+            pGamxyy = -pGamxyy;
+            pGamxyz = -pGamxyz;
+            pGamxzz = -pGamxzz;
+            pGamyxx = -pGamyxx;
+            pGamyxy = -pGamyxy;
+            pGamyxz = -pGamyxz;
+            pGamyyy = -pGamyyy;
+            pGamyyz = -pGamyyz;
+            pGamyzz = -pGamyzz;
+            pGamzxx = -pGamzxx;
+            pGamzxy = -pGamzxy;
+            pGamzxz = -pGamzxz;
+            pGamzyy = -pGamzyy;
+            pGamzyz = -pGamzyz;
+            pGamzzz = -pGamzzz;
+          }
+
+          f_getnp4_point(px, py, pz, pchi, ptrK,
+                         pgxx, pgxy, pgxz, pgyy, pgyz, pgzz,
+                         pAxx, pAxy, pAxz, pAyy, pAyz, pAzz,
+                         pchix, pchiy, pchiz,
+                         ptrKx, ptrKy, ptrKz,
+                         pAxxx, pAxxy, pAxxz,
+                         pAxyx, pAxyy, pAxyz,
+                         pAxzx, pAxzy, pAxzz,
+                         pAyyx, pAyyy, pAyyz,
+                         pAyzx, pAyzy, pAyzz,
+                         pAzzx, pAzzy, pAzzz,
+                         pGamxxx, pGamxxy, pGamxxz, pGamxyy, pGamxyz, pGamxzz,
+                         pGamyxx, pGamyxy, pGamyxz, pGamyyy, pGamyyz, pGamyzz,
+                         pGamzxx, pGamzxy, pGamzxz, pGamzyy, pGamzyz, pGamzzz,
+                         pRxx, pRxy, pRxz, pRyy, pRyz, pRzz,
+                         psi4RR, psi4II);
+
+          thetap = sqrt((2 * pl + 1.0) / 4.0 / PI) * misc::Wigner_d_function(pl, pm, spinw, costheta); // note the variation from -2 to 2
+
+          //	 find back the one
+          pchi = pchi + 1;
+#ifdef GaussInt
+          // wtcostheta is even function respect costheta
+          RP_out[countlm] = RP_out[countlm] + thetap / pchi / pchi * (psi4RR * cosmphi + psi4II * sinmphi) * wtcostheta[i];
+          IP_out[countlm] = IP_out[countlm] + thetap / pchi / pchi * (psi4II * cosmphi - psi4RR * sinmphi) * wtcostheta[i];
+#else
+          RP_out[countlm] = RP_out[countlm] + thetap / pchi / pchi * (psi4RR * cosmphi + psi4II * sinmphi);
+          IP_out[countlm] = IP_out[countlm] + thetap / pchi / pchi * (psi4II * cosmphi - psi4RR * sinmphi);
+#endif
+        }
+        countlm++; // no sanity check for countlm and NN which should be noted in the input parameters
+      }
+  }
+
+  for (int ii = 0; ii < NN; ii++)
+  {
+#ifdef GaussInt
+    RP_out[ii] = RP_out[ii] * rex * dphi;
+    IP_out[ii] = IP_out[ii] * rex * dphi;
+#else
+    RP_out[ii] = RP_out[ii] * rex * dphi * dcostheta;
+    IP_out[ii] = IP_out[ii] * rex * dphi * dcostheta;
+#endif
+  }
+  //|------+  Communicate and sum the results from each processor.
+
+  {
+    double *RPIP_out = new double[2 * NN];
+    double *RPIP = new double[2 * NN];
+    memcpy(RPIP_out, RP_out, NN * sizeof(double));
+    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
+    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    memcpy(RP, RPIP, NN * sizeof(double));
+    memcpy(IP, RPIP + NN, NN * sizeof(double));
+    delete[] RPIP_out;
+    delete[] RPIP;
+  }
+
+  //|------= Free memory.
+
+  delete[] pox[0];
+  delete[] pox[1];
+  delete[] pox[2];
+  delete[] shellf;
+  delete[] RP_out;
+  delete[] IP_out;
+  DG_List->clearList();
+}
+//|----------------------------------------------------------------
+//  do not discriminate box and shell
+//  Interpolate the variables of VarList at the NN points XX[0..2][i]. Each point is
+//  tried on the cgh patches from level GH->levels - 1 down to 0 and then, when SH is
+//  non-null, on the shell. Values of point i go to Shellf[i * num_var] onwards
+//  (num_var = length of VarList). Returns true if every point was interpolated,
+//  false at the first point found in neither domain (myrank 0 prints a message).
+//|----------------------------------------------------------------
+bool surface_integral::SR_Interp_Points(MyList<var> *VarList, cgh *GH, ShellPatch *SH,
+                                        int NN, double **XX, double *Shellf)
+{
+  int num_var = 0; // number of entries in VarList = values stored per point
+  for (MyList<var> *varl = VarList; varl; varl = varl->next)
+    num_var++;
+
+  double pox[3];
+  for (int i = 0; i < NN; i++)
+  {
+    for (int j = 0; j < 3; j++)
+      pox[j] = XX[j][i];
+    int lev = GH->levels - 1;
+    bool notfound = true;
+
+    while (notfound)
+    {
+      if (lev < 0) // every cgh level has been tried without success
+      {
+        if (SH)
+        {
+          // Found in the shell: leave the level search and continue with the next
+          // point. Returning here would leave points i+1 .. NN-1 uninterpolated.
+          if (SH->Interp_One_Point(VarList, pox, Shellf + i * num_var, Symmetry))
+            break;
+          if (myrank == 0)
+            cout << "surface_integral::SR_Interp_Points point (" << pox[0] << "," << pox[1] << "," << pox[2] << ") is out of cgh and shell domain!" << endl;
+        }
+        else
+        {
+          if (myrank == 0)
+            cout << "surface_integral::SR_Interp_Points: point (" << pox[0] << "," << pox[1] << "," << pox[2] << ") is out of cgh domain!" << endl;
+        }
+        return false;
+      }
+      MyList<Patch> *Pp = GH->PatL[lev];
+      while (Pp) // stop at the first patch of this level that accepts the point
+      {
+        if (Pp->data->Interp_ONE_Point(VarList, pox, Shellf + i * num_var, Symmetry))
+        {
+          notfound = false;
+          break;
+        }
+        Pp = Pp->next;
+      }
+      lev--;
+    }
+  }
+  return true;
+}