From 22c1e7168b21fdcca6bf105266acc508671ef92e Mon Sep 17 00:00:00 2001
From: CGH0S7 <776459475@qq.com>
Date: Wed, 29 Apr 2026 17:05:10 +0800
Subject: [PATCH] Optimize BSSN CUDA resident state and CUDA-aware MPI

---
 AMSS_NCKU_Input.py                |     2 +-
 AMSS_NCKU_source/Block.C          |   125 +-
 AMSS_NCKU_source/Block.h          |    13 +-
 AMSS_NCKU_source/Parallel.C       | 13737 ++++++++++++++--------------
 AMSS_NCKU_source/Parallel.h       |    59 +-
 AMSS_NCKU_source/bssn_rhs_cuda.cu |   188 +
 AMSS_NCKU_source/bssn_rhs_cuda.h  |    64 +-
 7 files changed, 7461 insertions(+), 6727 deletions(-)
diff --git a/AMSS_NCKU_Input.py b/AMSS_NCKU_Input.py
index 67e7c1c..73af547 100755
--- a/AMSS_NCKU_Input.py
+++ b/AMSS_NCKU_Input.py
@@ -16,7 +16,7 @@ import numpy
 File_directory   = "GW150914"                    ## output file directory
 Output_directory = "binary_output"               ## binary data file directory
                                                  ## The file directory name should not be too long
-MPI_processes    = 8                             ## number of mpi processes used in the simulation
+MPI_processes    = 2                             ## number of mpi processes used in the simulation
 
 GPU_Calculation  = "yes"                          ## Use GPU or not 
                                                  ## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
diff --git a/AMSS_NCKU_source/Block.C b/AMSS_NCKU_source/Block.C
index fcae198..e0fa4ab 100644
--- a/AMSS_NCKU_source/Block.C
+++ b/AMSS_NCKU_source/Block.C
@@ -6,14 +6,68 @@
 #include <cstdio>
 #include <string>
 #include <cmath>
-#include <new>
-using namespace std;
-
-#include "Block.h"
-#include "misc.h"
-
-Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fngfsi, int levi, const int cgpui) : rank(ranki), ingfs(ingfsi), fngfs(fngfsi), lev(levi), cgpu(cgpui)
-{
+#include <new>
+using namespace std;
+
+#include "Block.h"
+#include "misc.h"
+
+#if USE_CUDA_BSSN || USE_CUDA_Z4C
+#include <cuda_runtime_api.h>
+#endif
+
+namespace {
+
+// True when AMSS_CUDA_PIN_GRIDFUNCS parses (atoi) to nonzero; read once, then cached.
+bool cuda_pin_gridfuncs_enabled()
+{
+  static int cached_flag = -1; // -1: environment not read yet
+  if (cached_flag >= 0)
+    return cached_flag == 1;
+  const char *value = getenv("AMSS_CUDA_PIN_GRIDFUNCS");
+  cached_flag = (value != 0 && atoi(value) != 0) ? 1 : 0;
+  return cached_flag == 1;
+}
+
+// Allocates `count` doubles: cudaMallocHost when CUDA pinning is enabled and succeeds,
+// plain malloc otherwise. `pinned` is 1 only for a cudaMallocHost result (see free_gridfunc).
+double *alloc_gridfunc(size_t count, unsigned char &pinned)
+{
+  const size_t bytes = count * sizeof(double);
+  pinned = 0;
+#if USE_CUDA_BSSN || USE_CUDA_Z4C
+  double *host_ptr = 0;
+  if (cuda_pin_gridfuncs_enabled())
+  {
+    pinned = (cudaMallocHost((void **)&host_ptr, bytes) == cudaSuccess) ? 1 : 0;
+    if (pinned)
+      return host_ptr;
+    cudaGetLastError(); // also resets the recorded CUDA error before the malloc fallback
+  }
+#endif
+  return (double *)malloc(bytes);
+}
+
+// Frees an alloc_gridfunc buffer: null is ignored, pinned (CUDA builds) uses cudaFreeHost, else free().
+void free_gridfunc(double *ptr, unsigned char pinned)
+{
+  if (ptr == 0)
+    return;
+#if USE_CUDA_BSSN || USE_CUDA_Z4C
+  if (pinned != 0)
+    cudaFreeHost(ptr);
+  else
+    free(ptr);
+#else
+  (void)pinned; // the flag is only meaningful in CUDA builds
+  free(ptr);
+#endif
+}
+
+}
+
+Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fngfsi, int levi, const int cgpui) : rank(ranki), lev(levi), cgpu(cgpui), ingfs(ingfsi), fngfs(fngfsi), igfs(0), fgfs(0), fgfs_pinned(0)
+{
   for (int i = 0; i < dim; i++)
     X[i] = 0;
 
@@ -68,14 +122,15 @@ Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fng
 #endif
     }
 
-    int nn = shape[0] * shape[1] * shape[2];
-    fgfs = new double *[fngfs];
-    for (int i = 0; i < fngfs; i++)
-    {
-      fgfs[i] = (double *)malloc(sizeof(double) * nn);
-      if (!(fgfs[i]))
-      {
-        cout << "on node#" << rank << ", out of memory when constructing Block." << endl;
+    int nn = shape[0] * shape[1] * shape[2];
+    fgfs = new double *[fngfs];
+    fgfs_pinned = new unsigned char[fngfs];
+    for (int i = 0; i < fngfs; i++)
+    {
+      fgfs[i] = alloc_gridfunc((size_t)nn, fgfs_pinned[i]);
+      if (!(fgfs[i]))
+      {
+        cout << "on node#" << rank << ", out of memory when constructing Block." << endl;
         MPI_Abort(MPI_COMM_WORLD, 1);
       }
       memset(fgfs[i], 0, sizeof(double) * nn);
@@ -103,17 +158,19 @@ Block::~Block()
   {
     for (int i = 0; i < dim; i++)
       delete[] X[i];
-    for (int i = 0; i < ingfs; i++)
-      free(igfs[i]);
-    delete[] igfs;
-    for (int i = 0; i < fngfs; i++)
-      free(fgfs[i]);
-    delete[] fgfs;
-    X[0] = X[1] = X[2] = 0;
-    igfs = 0;
-    fgfs = 0;
-  }
-}
+    for (int i = 0; i < ingfs; i++)
+      free(igfs[i]);
+    delete[] igfs;
+    for (int i = 0; i < fngfs; i++)
+      free_gridfunc(fgfs[i], fgfs_pinned ? fgfs_pinned[i] : 0);
+    delete[] fgfs;
+    delete[] fgfs_pinned;
+    X[0] = X[1] = X[2] = 0;
+    igfs = 0;
+    fgfs = 0;
+    fgfs_pinned = 0;
+  }
+}
 void Block::checkBlock()
 {
   int myrank;
@@ -184,12 +241,14 @@ void Block::swapList(MyList<var> *VarList1, MyList<var> *VarList2, int myrank)
   if (rank == myrank)
   {
     MyList<var> *varl1 = VarList1, *varl2 = VarList2;
-    while (varl1 && varl2)
-    {
-      misc::swap<double *>(fgfs[varl1->data->sgfn], fgfs[varl2->data->sgfn]);
-      varl1 = varl1->next;
-      varl2 = varl2->next;
-    }
+    while (varl1 && varl2)
+    {
+      misc::swap<double *>(fgfs[varl1->data->sgfn], fgfs[varl2->data->sgfn]);
+      if (fgfs_pinned)
+        misc::swap<unsigned char>(fgfs_pinned[varl1->data->sgfn], fgfs_pinned[varl2->data->sgfn]);
+      varl1 = varl1->next;
+      varl2 = varl2->next;
+    }
     if (varl1 || varl2)
     {
       cout << "error in Block::swaplist, var lists does not match." << endl;
diff --git a/AMSS_NCKU_source/Block.h b/AMSS_NCKU_source/Block.h
index 28193fd..6c920ba 100644
--- a/AMSS_NCKU_source/Block.h
+++ b/AMSS_NCKU_source/Block.h
@@ -13,14 +13,15 @@ public:
    int shape[dim];
    double bbox[2 * dim];
    double *X[dim];
-   int rank; // where the real data locate in
-   int lev, cgpu;
-   int ingfs, fngfs;
-   int *(*igfs);
-   double *(*fgfs);
+   int rank; // where the real data locate in
+   int lev, cgpu;
+   int ingfs, fngfs;
+   int *(*igfs);
+   double *(*fgfs);
+   unsigned char *fgfs_pinned;
 
 public:
-   Block() {};
+   Block() : rank(0), lev(0), cgpu(0), ingfs(0), fngfs(0), igfs(0), fgfs(0), fgfs_pinned(0) {};
    Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fngfs, int levi, const int cgpui = 0);
 
    ~Block();
diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C
index 4df55a8..6760047 100644
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -1,5 +1,5 @@
-
-#include "Parallel.h"
+
+#include "Parallel.h"
 #include "fmisc.h"
 #include "prolongrestrict.h"
 #include "misc.h"
@@ -23,6 +23,10 @@
 
 namespace {
 
+#if USE_CUDA_BSSN || USE_CUDA_Z4C
+static thread_local bool s_cuda_aware_pack_active = false;
+#endif
+
 struct SyncProfileStats
 {
   long long start_calls;
@@ -268,3735 +272,3933 @@ bool cuda_direct_unpack_segment(double *buffer,
     sync_profile_stats().direct_unpack_sec += MPI_Wtime() - t0;
   return ok;
 }
+
+// True unless AMSS_CUDA_AWARE_MPI is set to a value atoi() reads as 0; cached after first call.
+bool cuda_aware_mpi_enabled()
+{
+  static int cached_flag = -1; // -1: environment not read yet
+  if (cached_flag >= 0)
+    return cached_flag == 1;
+  const char *value = getenv("AMSS_CUDA_AWARE_MPI");
+  cached_flag = (value == 0 || atoi(value) != 0) ? 1 : 0;
+  return cached_flag == 1;
+}
+
+// True when AMSS_CUDA_MPI_DIAG parses (atoi) to nonzero, or when sync profiling is enabled.
+// The environment variable is read on the first call only.
+bool cuda_mpi_diag_enabled()
+{
+  static int cached_flag = -1;
+  const char *value = (cached_flag < 0) ? getenv("AMSS_CUDA_MPI_DIAG") : 0;
+  if (cached_flag < 0)
+    cached_flag = (value != 0 && atoi(value) != 0) ? 1 : 0;
+  return cached_flag == 1 || sync_profile_enabled();
+}
+
+// Allocates `length` doubles with cudaMalloc. A non-positive `length` returns 0 without touching
+// CUDA; a cudaMalloc failure is reported on stderr and followed by MPI_Abort.
+double *alloc_device_comm_buffer(int length)
+{
+  if (length < 1)
+    return 0;
+  double *dev_buf = 0;
+  const cudaError_t status = cudaMalloc((void **)&dev_buf, (size_t)length * sizeof(double));
+  if (status == cudaSuccess)
+    return dev_buf;
+  fprintf(stderr, "Parallel: cudaMalloc failed for device comm buffer (%d doubles, err=%d)\n", length, (int)status);
+  MPI_Abort(MPI_COMM_WORLD, 1);
+  return dev_buf;
+}
+
+// cudaFree's `ptr` when it is non-null and then nulls the caller's pointer; null is a no-op.
+void free_device_comm_buffer(double *&ptr)
+{
+  if (ptr != 0)
+    cudaFree(ptr);
+  ptr = 0;
+}
+
+void ensure_device_comm_buffer(double **buffers, int *caps, int idx, int length) // grow slot `idx` so it holds at least `length` doubles
+{
+  if (length <= caps[idx])
+    return; // recorded capacity already suffices; buffer is left untouched
+  free_device_comm_buffer(buffers[idx]); // old buffer is released, its contents are not copied
+  buffers[idx] = alloc_device_comm_buffer(length);
+  if (!buffers[idx]) // alloc_device_comm_buffer returns 0 only for a non-positive length
+  {
+    fprintf(stderr, "Parallel: failed to allocate device communication buffer (%d doubles)\n", length);
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+  caps[idx] = length; // capacity is tracked in doubles, not bytes
+}
+
+// Device-side pack for one segment pair. Start indices come from cuda_seg_begin(dst, src->Bg,
+// axis) and the extent from dst->shape; true only when the batch pack call returns 0. Always
+// false without USE_CUDA_BSSN or when state_count is outside [1, BSSN_CUDA_STATE_COUNT].
+bool cuda_direct_pack_segment_to_device(double *buffer, const Parallel::gridseg *src,
+                                        const Parallel::gridseg *dst, int state_count)
+{
+#if USE_CUDA_BSSN
+  if (state_count < 1 || BSSN_CUDA_STATE_COUNT < state_count)
+    return false;
+  const double start_time = sync_profile_enabled() ? MPI_Wtime() : 0.0;
+  const int begin_i = cuda_seg_begin(dst, src->Bg, 0);
+  const int begin_j = cuda_seg_begin(dst, src->Bg, 1);
+  const int begin_k = cuda_seg_begin(dst, src->Bg, 2);
+  const bool packed = bssn_cuda_pack_state_batch_to_device_buffer(
+                          src->Bg, state_count, buffer, src->Bg->shape, begin_i, begin_j, begin_k,
+                          dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
+  if (sync_profile_enabled())
+    sync_profile_stats().direct_pack_sec += MPI_Wtime() - start_time;
+  return packed;
+#else
+  (void)buffer; (void)src; (void)dst; (void)state_count;
+  return false;
+#endif
+}
+
+// Hands `buffer` to bssn_cuda_unpack_state_batch_from_device_buffer for dst->Bg, with start indices
+// from cuda_seg_begin(dst, dst->Bg, axis) and extent dst->shape; true only when that call returns 0.
+bool cuda_direct_unpack_segment_from_device(double *buffer, const Parallel::gridseg *dst,
+                                            int state_count)
+{
+#if USE_CUDA_BSSN
+  if (state_count < 1 || BSSN_CUDA_STATE_COUNT < state_count)
+    return false;
+  const double start_time = sync_profile_enabled() ? MPI_Wtime() : 0.0;
+  const int begin_i = cuda_seg_begin(dst, dst->Bg, 0);
+  const int begin_j = cuda_seg_begin(dst, dst->Bg, 1);
+  const int begin_k = cuda_seg_begin(dst, dst->Bg, 2);
+  const bool unpacked = bssn_cuda_unpack_state_batch_from_device_buffer(
+                            dst->Bg, state_count, buffer, dst->Bg->shape, begin_i, begin_j, begin_k,
+                            dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
+  if (sync_profile_enabled())
+    sync_profile_stats().direct_unpack_sec += MPI_Wtime() - start_time;
+  return unpacked;
+#else
+  (void)buffer; (void)dst; (void)state_count;
+  return false;
+#endif
+}
+
+bool cuda_device_state_count_supported(int state_count) // true iff 1 <= state_count <= BSSN_CUDA_STATE_COUNT in USE_CUDA_BSSN builds
+{
+#if USE_CUDA_BSSN
+  return state_count > 0 && state_count <= BSSN_CUDA_STATE_COUNT;
+#else
+  (void)state_count; // parameter is unused in this configuration
+  return false; // built without USE_CUDA_BSSN: no count is supported
+#endif
+}
+
+// Returns true only if at least one src/dst pair is exchanged between myrank and rank_in in
+// direction `dir` and every such pair joins Blocks of equal `lev`; null segments give false.
+bool cuda_segments_same_level(MyList<Parallel::gridseg> *src, MyList<Parallel::gridseg> *dst,
+                              int rank_in, int dir, int myrank)
+{
+  bool has_work = false;
+  for (; src && dst; src = src->next, dst = dst->next)
+  {
+    // Validate first: the rank tests below dereference data->Bg, so checking afterwards is too late.
+    if (!src->data || !dst->data || !src->data->Bg || !dst->data->Bg)
+      return false;
+    const int src_rank = src->data->Bg->rank, dst_rank = dst->data->Bg->rank;
+    const bool sending = dir == PACK && dst_rank == rank_in && src_rank == myrank;
+    const bool receiving = dir == UNPACK && src_rank == rank_in && dst_rank == myrank;
+    if (!sending && !receiving)
+      continue;
+    has_work = true;
+    if (src->data->Bg->lev != dst->data->Bg->lev)
+      return false;
+  }
+  return has_work;
+}
+
+// Eligibility test for sending to peer `rank_in` through a device buffer. All of these must
+// hold: CUDA-aware MPI is enabled, state_count is supported, cuda_segments_same_level(...,
+// PACK, ...) passes, and cuda_can_direct_pack(src, dst, 1) accepts every pair whose source
+// Block lives on myrank and whose destination Block lives on rank_in.
+bool cuda_pack_to_device_eligible(MyList<Parallel::gridseg> *src, MyList<Parallel::gridseg> *dst,
+                                  int rank_in, int state_count, int myrank)
+{
+  if (!(cuda_aware_mpi_enabled() && cuda_device_state_count_supported(state_count)))
+    return false;
+  if (!cuda_segments_same_level(src, dst, rank_in, PACK, myrank))
+    return false;
+  for (; src && dst; src = src->next, dst = dst->next)
+  {
+    // Pairs that do not go from myrank to rank_in are not examined.
+    const bool outgoing = dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank;
+    if (outgoing && !cuda_can_direct_pack(src->data, dst->data, 1))
+      return false;
+  }
+  return true;
+}
+
+// Eligibility test for receiving from peer `rank_in` straight into a device buffer. All of
+// these must hold: CUDA-aware MPI is enabled, state_count is supported,
+// cuda_segments_same_level(..., UNPACK, ...) passes, and cuda_can_direct_unpack(dst, 1) accepts
+// every pair whose source Block lives on rank_in and whose destination Block lives on myrank.
+bool cuda_recv_to_device_eligible(MyList<Parallel::gridseg> *src, MyList<Parallel::gridseg> *dst,
+                                  int rank_in, int state_count, int myrank)
+{
+  if (!(cuda_aware_mpi_enabled() && cuda_device_state_count_supported(state_count)))
+    return false;
+  if (!cuda_segments_same_level(src, dst, rank_in, UNPACK, myrank))
+    return false;
+  for (; src && dst; src = src->next, dst = dst->next)
+  {
+    // Pairs that do not come from rank_in to myrank are not examined.
+    const bool incoming = src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank;
+    if (incoming && !cuda_can_direct_unpack(dst->data, 1))
+      return false;
+  }
+  return true;
+}
+
+// Wrapper around Parallel::data_packer that raises the thread-local flag
+// s_cuda_aware_pack_active for the duration of the call and clears it (to false, not to its
+// previous value) afterwards. Returns data_packer's result unchanged.
+// NOTE(review): the flag presumably marks `data` as a device pointer for data_packer -- confirm.
+int data_packer_with_device_buffer(double *data, MyList<Parallel::gridseg> *src,
+                                   MyList<Parallel::gridseg> *dst, int rank_in, int dir,
+                                   MyList<var> *VarLists, MyList<var> *VarListd, int Symmetry)
+{
+  s_cuda_aware_pack_active = true;
+  const int packed = Parallel::data_packer(data, src, dst, rank_in, dir,
+                                           VarLists, VarListd, Symmetry);
+  s_cuda_aware_pack_active = false;
+  return packed;
+}
 #endif
 
 } // namespace
 
 int Parallel::partition1(int &nx, int split_size, int min_width, int cpusize, int shape) // special for 1 diemnsion
 {
-  nx = Mymax(1, shape / min_width);
-  nx = Mymin(cpusize, nx);
-
-  return nx;
-}
-int Parallel::partition2(int *nxy, int split_size, int *min_width, int cpusize, int *shape) // special for 2 diemnsions
-{
-#define SEARCH_SIZE 5
-  int i, j, nx, ny;
-  int maxnx, maxny;
-  int mnx, mny;
-  int dn, hmin_width, cmin_width;
-  int cnx, cny;
-  double fx, fy;
-  int block_size;
-  int n;
-
-  block_size = shape[0] * shape[1];
-  n = Mymax(1, (block_size + split_size / 2) / split_size);
-
-  maxnx = Mymax(1, shape[0] / min_width[0]);
-  maxnx = Mymin(cpusize, maxnx);
-  maxny = Mymax(1, shape[1] / min_width[1]);
-  maxny = Mymin(cpusize, maxny);
-  fx = (double)shape[0] / (shape[0] + shape[1]);
-  fy = (double)shape[1] / (shape[0] + shape[1]);
-  nx = mnx = Mymax(1, Mymin(maxnx, (int)(sqrt(double(n)) * fx / fy)));
-  ny = mny = Mymax(1, Mymin(maxny, (int)(sqrt(double(n)) * fy / fx)));
-  dn = abs(n - nx * ny);
-  hmin_width = Mymin(shape[0] / nx, shape[1] / ny);
-  for (cny = Mymax(1, mny - SEARCH_SIZE); cny <= (Mymin(mny + SEARCH_SIZE, maxny)); cny++)
-    for (cnx = Mymax(1, mnx - SEARCH_SIZE); cnx <= (Mymin(mnx + SEARCH_SIZE, maxnx)); cnx++)
-    {
-      cmin_width = Mymin(shape[0] / cnx, shape[1] / cny);
-      if (dn > abs(n - cnx * cny) || (dn == abs(n - cnx * cny) && cmin_width > hmin_width))
-      {
-        dn = abs(n - cnx * cny);
-        nx = cnx;
-        ny = cny;
-        hmin_width = cmin_width;
-      }
-    }
-
-  nxy[0] = nx;
-  nxy[1] = ny;
-
-  return nx * ny;
-#undef SEARCH_SIZE
-}
-int Parallel::partition3(int *nxyz, int split_size, int *min_width, int cpusize, int *shape) // special for 3 diemnsions
-#if 1                                                                                        // algrithsm from Pretorius
-{
-//	cout<<split_size<<endl<<min_width[0]<<endl<<min_width[1]<<endl<<min_width[2]<<endl
-//            <<shape[0]<<endl<<shape[1]<<endl<<shape[2]<<endl<<cpusize<<endl;
-#define SEARCH_SIZE 5
-  int i, j, k, nx, ny, nz;
-  int maxnx, maxny, maxnz;
-  int mnx, mny, mnz;
-  int dn, hmin_width, cmin_width;
-  int cnx, cny, cnz;
-  double fx, fy, fz, max_fxfy, max_fxfz, max_fyfz;
-  int block_size;
-  int n;
-
-  block_size = shape[0] * shape[1] * shape[2];
-  n = Mymax(1, (block_size + split_size / 2) / split_size);
-
-  maxnx = Mymax(1, shape[0] / min_width[0]);
-  maxnx = Mymin(cpusize, maxnx);
-  maxny = Mymax(1, shape[1] / min_width[1]);
-  maxny = Mymin(cpusize, maxny);
-  maxnz = Mymax(1, shape[2] / min_width[2]);
-  maxnz = Mymin(cpusize, maxnz);
-  fx = (double)shape[0] / (shape[0] + shape[1] + shape[2]);
-  fy = (double)shape[1] / (shape[0] + shape[1] + shape[2]);
-  fz = (double)shape[2] / (shape[0] + shape[1] + shape[2]);
-  max_fxfy = Mymax(fx, fy);
-  max_fxfz = Mymax(fx, fz);
-  max_fyfz = Mymax(fy, fz);
-  nx = mnx = Mymax(1, Mymin(maxnx, (int)(pow(n, 1.0 / 3.0) * fx / max_fyfz)));
-  ny = mny = Mymax(1, Mymin(maxny, (int)(pow(n, 1.0 / 3.0) * fy / max_fxfz)));
-  nz = mnz = Mymax(1, Mymin(maxnz, (int)(pow(n, 1.0 / 3.0) * fz / max_fxfy)));
-  dn = abs(n - nx * ny * nz);
-  hmin_width = Mymin(shape[2] / nz, shape[1] / ny);
-  hmin_width = Mymin(hmin_width, shape[0] / nx);
-  for (cnz = Mymax(1, mnz - SEARCH_SIZE); cnz <= (Mymin(mnz + SEARCH_SIZE, maxnz)); cnz++)
-    for (cny = Mymax(1, mny - SEARCH_SIZE); cny <= (Mymin(mny + SEARCH_SIZE, maxny)); cny++)
-      for (cnx = Mymax(1, mnx - SEARCH_SIZE); cnx <= (Mymin(mnx + SEARCH_SIZE, maxnx)); cnx++)
-      {
-        cmin_width = Mymin(shape[2] / cnz, shape[1] / cny);
-        cmin_width = Mymin(cmin_width, shape[0] / cnx);
-        if (dn > abs(n - cnx * cny * cnz) || (dn == abs(n - cnx * cny * cnz) && cmin_width > hmin_width))
-        {
-          dn = abs(n - cnx * cny * cnz);
-          nx = cnx;
-          ny = cny;
-          nz = cnz;
-          hmin_width = cmin_width;
-        }
-      }
-
-  nxyz[0] = nx;
-  nxyz[1] = ny;
-  nxyz[2] = nz;
-
-  return nx * ny * nz;
-#undef SEARCH_SIZE
-}
-#elif 1 // Zhihui's idea one on 2013-09-25
-{
-  int nx, ny, nz;
-  int hmin_width;
-  hmin_width = Mymin(min_width[0], min_width[1]);
-  hmin_width = Mymin(hmin_width, min_width[2]);
-  nx = shape[0] / hmin_width;
-  if (nx * hmin_width < shape[0])
-    nx++;
-  ny = shape[1] / hmin_width;
-  if (ny * hmin_width < shape[1])
-    ny++;
-  nz = shape[2] / hmin_width;
-  if (nz * hmin_width < shape[2])
-    nz++;
-  while (nx * ny * nz > cpusize)
-  {
-    hmin_width++;
-    nx = shape[0] / hmin_width;
-    if (nx * hmin_width < shape[0])
-      nx++;
-    ny = shape[1] / hmin_width;
-    if (ny * hmin_width < shape[1])
-      ny++;
-    nz = shape[2] / hmin_width;
-    if (nz * hmin_width < shape[2])
-      nz++;
-  }
-
-  nxyz[0] = nx;
-  nxyz[1] = ny;
-  nxyz[2] = nz;
-
-  return nx * ny * nz;
-}
-#elif 1 // Zhihui's idea two on 2013-09-25
-{
-  int nx, ny, nz;
-  const int hmin_width = 8; // for example we use 8
-  nx = shape[0] / hmin_width;
-  if (nx * hmin_width < shape[0])
-    nx++;
-  ny = shape[1] / hmin_width;
-  if (ny * hmin_width < shape[1])
-    ny++;
-  nz = shape[2] / hmin_width;
-  if (nz * hmin_width < shape[2])
-    nz++;
-
-  nxyz[0] = nx;
-  nxyz[1] = ny;
-  nxyz[2] = nz;
-
-  return nx * ny * nz;
-}
-#endif
-// distribute the data to cprocessors
-#if (PSTR == 0)
-MyList<Block> *Parallel::distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
-                                    bool periodic, int nodes)
-{
-#ifdef USE_GPU_DIVIDE
-  double cpu_part, gpu_part;
-  map<string, double>::iterator iter;
-  iter = parameters::dou_par.find("cpu part");
-  if (iter != parameters::dou_par.end())
-  {
-    cpu_part = iter->second;
-  }
-  else
-  {
-    int myrank;
-    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-    // read parameter from file
-    const int LEN = 256;
-    char pline[LEN];
-    string str, sgrp, skey, sval;
-    int sind;
-    char pname[50];
-    {
-      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
-      if (iter != parameters::str_par.end())
-      {
-        strcpy(pname, (iter->second).c_str());
-      }
-      else
-      {
-        cout << "Error inputpar" << endl;
-        exit(0);
-      }
-    }
-    ifstream inf(pname, ifstream::in);
-    if (!inf.good() && myrank == 0)
-    {
-      cout << "Can not open parameter file " << pname << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-
-    for (int i = 1; inf.good(); i++)
-    {
-      inf.getline(pline, LEN);
-      str = pline;
-
-      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
-      if (status == -1)
-      {
-        cout << "error reading parameter file " << pname << " in line " << i << endl;
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-      else if (status == 0)
-        continue;
-
-      if (sgrp == "ABE")
-      {
-        if (skey == "cpu part")
-          cpu_part = atof(sval.c_str());
-      }
-    }
-    inf.close();
-
-    parameters::dou_par.insert(map<string, double>::value_type("cpu part", cpu_part));
-  }
-  iter = parameters::dou_par.find("gpu part");
-  if (iter != parameters::dou_par.end())
-  {
-    gpu_part = iter->second;
-  }
-  else
-  {
-    int myrank;
-    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-    // read parameter from file
-    const int LEN = 256;
-    char pline[LEN];
-    string str, sgrp, skey, sval;
-    int sind;
-    char pname[50];
-    {
-      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
-      if (iter != parameters::str_par.end())
-      {
-        strcpy(pname, (iter->second).c_str());
-      }
-      else
-      {
-        cout << "Error inputpar" << endl;
-        exit(0);
-      }
-    }
-    ifstream inf(pname, ifstream::in);
-    if (!inf.good() && myrank == 0)
-    {
-      cout << "Can not open parameter file " << pname << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-
-    for (int i = 1; inf.good(); i++)
-    {
-      inf.getline(pline, LEN);
-      str = pline;
-
-      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
-      if (status == -1)
-      {
-        cout << "error reading parameter file " << pname << " in line " << i << endl;
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-      else if (status == 0)
-        continue;
-
-      if (sgrp == "ABE")
-      {
-        if (skey == "gpu part")
-          gpu_part = atof(sval.c_str());
-      }
-    }
-    inf.close();
-
-    parameters::dou_par.insert(map<string, double>::value_type("gpu part", gpu_part));
-  }
-
-  if (nodes == 0)
-    nodes = cpusize / 2;
-#else
-  if (nodes == 0)
-    nodes = cpusize;
-#endif
-
-  if (dim != 3)
-  {
-    cout << "distrivute: now we only support 3-dimension" << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-
-  MyList<Block> *BlL = 0;
-
-  int split_size, min_size, block_size = 0;
-
-  int min_width = 2 * Mymax(ghost_width, buffer_width);
-  int nxyz[dim], mmin_width[dim], min_shape[dim];
-
-  MyList<Patch> *PLi = PatchLIST;
-  for (int i = 0; i < dim; i++)
-    min_shape[i] = PLi->data->shape[i];
-  int lev = PLi->data->lev;
-  PLi = PLi->next;
-  while (PLi)
-  {
-    Patch *PP = PLi->data;
-    for (int i = 0; i < dim; i++)
-      min_shape[i] = Mymin(min_shape[i], PP->shape[i]);
-    if (lev != PLi->data->lev)
-      cout << "Parallel::distribute CAUSTION: meet Patches for different level: " << lev << " and " << PLi->data->lev << endl;
-    PLi = PLi->next;
-  }
-
-  for (int i = 0; i < dim; i++)
-    mmin_width[i] = Mymin(min_width, min_shape[i]);
-
-  min_size = mmin_width[0];
-  for (int i = 1; i < dim; i++)
-    min_size = min_size * mmin_width[i];
-
-  PLi = PatchLIST;
-  while (PLi)
-  {
-    Patch *PP = PLi->data;
-    //    PP->checkPatch(true);
-    int bs = PP->shape[0];
-    for (int i = 1; i < dim; i++)
-      bs = bs * PP->shape[i];
-    block_size = block_size + bs;
-    PLi = PLi->next;
-  }
-  split_size = Mymax(min_size, block_size / nodes);
-  split_size = Mymax(1, split_size);
-
-  int n_rank = 0;
-  PLi = PatchLIST;
-  int reacpu = 0;
-  while (PLi)
-  {
-    Patch *PP = PLi->data;
-
-    reacpu += partition3(nxyz, split_size, mmin_width, nodes, PP->shape);
-
-    Block *ng0, *ng;
-    int shape_here[dim], ibbox_here[2 * dim];
-    double bbox_here[2 * dim], dd;
-
-    // ibbox : 0,...N-1
-    for (int i = 0; i < nxyz[0]; i++)
-      for (int j = 0; j < nxyz[1]; j++)
-        for (int k = 0; k < nxyz[2]; k++)
-        {
-          ibbox_here[0] = (PP->shape[0] * i) / nxyz[0];
-          ibbox_here[3] = (PP->shape[0] * (i + 1)) / nxyz[0] - 1;
-          ibbox_here[1] = (PP->shape[1] * j) / nxyz[1];
-          ibbox_here[4] = (PP->shape[1] * (j + 1)) / nxyz[1] - 1;
-          ibbox_here[2] = (PP->shape[2] * k) / nxyz[2];
-          ibbox_here[5] = (PP->shape[2] * (k + 1)) / nxyz[2] - 1;
-
-          if (periodic)
-          {
-            ibbox_here[0] = ibbox_here[0] - ghost_width;
-            ibbox_here[3] = ibbox_here[3] + ghost_width;
-            ibbox_here[1] = ibbox_here[1] - ghost_width;
-            ibbox_here[4] = ibbox_here[4] + ghost_width;
-            ibbox_here[2] = ibbox_here[2] - ghost_width;
-            ibbox_here[5] = ibbox_here[5] + ghost_width;
-          }
-          else
-          {
-            ibbox_here[0] = Mymax(0, ibbox_here[0] - ghost_width);
-            ibbox_here[3] = Mymin(PP->shape[0] - 1, ibbox_here[3] + ghost_width);
-            ibbox_here[1] = Mymax(0, ibbox_here[1] - ghost_width);
-            ibbox_here[4] = Mymin(PP->shape[1] - 1, ibbox_here[4] + ghost_width);
-            ibbox_here[2] = Mymax(0, ibbox_here[2] - ghost_width);
-            ibbox_here[5] = Mymin(PP->shape[2] - 1, ibbox_here[5] + ghost_width);
-          }
-
-          shape_here[0] = ibbox_here[3] - ibbox_here[0] + 1;
-          shape_here[1] = ibbox_here[4] - ibbox_here[1] + 1;
-          shape_here[2] = ibbox_here[5] - ibbox_here[2] + 1;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-          // 0--4, 5--10
-          dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1);
-          bbox_here[0] = PP->bbox[0] + ibbox_here[0] * dd;
-          bbox_here[3] = PP->bbox[0] + ibbox_here[3] * dd;
-
-          dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1);
-          bbox_here[1] = PP->bbox[1] + ibbox_here[1] * dd;
-          bbox_here[4] = PP->bbox[1] + ibbox_here[4] * dd;
-
-          dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1);
-          bbox_here[2] = PP->bbox[2] + ibbox_here[2] * dd;
-          bbox_here[5] = PP->bbox[2] + ibbox_here[5] * dd;
-#else
-#ifdef Cell
-          // 0--5, 5--10
-          dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0];
-          bbox_here[0] = PP->bbox[0] + (ibbox_here[0]) * dd;
-          bbox_here[3] = PP->bbox[0] + (ibbox_here[3] + 1) * dd;
-
-          dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1];
-          bbox_here[1] = PP->bbox[1] + (ibbox_here[1]) * dd;
-          bbox_here[4] = PP->bbox[1] + (ibbox_here[4] + 1) * dd;
-
-          dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2];
-          bbox_here[2] = PP->bbox[2] + (ibbox_here[2]) * dd;
-          bbox_here[5] = PP->bbox[2] + (ibbox_here[5] + 1) * dd;
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-
-#ifdef USE_GPU_DIVIDE
-          {
-            const int pices = 2;
-            double picef[pices];
-            picef[0] = cpu_part;
-            picef[1] = gpu_part;
-            int shape_res[dim * pices];
-            double bbox_res[2 * dim * pices];
-            misc::dividBlock(dim, shape_here, bbox_here, pices, picef, shape_res, bbox_res, min_width);
-            ng = ng0 = new Block(dim, shape_res, bbox_res, n_rank++, ingfsi, fngfsi, PP->lev, 0); // delete through KillBlocks
-
-            //	       if(n_rank==cpusize) {n_rank=0; cerr<<"place one!!"<<endl;}
-
-            //	       ng->checkBlock();
-            if (BlL)
-              BlL->insert(ng);
-            else
-              BlL = new MyList<Block>(ng); // delete through KillBlocks
-
-            for (int i = 1; i < pices; i++)
-            {
-              ng = new Block(dim, shape_res + i * dim, bbox_res + i * 2 * dim, n_rank++, ingfsi, fngfsi, PP->lev, i); // delete through KillBlocks
-              //	        if(n_rank==cpusize) {n_rank=0; cerr<<"place two!! "<<i<<endl;}
-              //	        ng->checkBlock();
-              BlL->insert(ng);
-            }
-          }
-#else
-          ng = ng0 = new Block(dim, shape_here, bbox_here, n_rank++, ingfsi, fngfsi, PP->lev);
-          //	    ng->checkBlock();
-          if (BlL)
-            BlL->insert(ng);
-          else
-            BlL = new MyList<Block>(ng); // delete through KillBlocks
-#endif
-          if (n_rank == cpusize)
-            n_rank = 0;
-
-          // set PP->blb
-          if (i == 0 && j == 0 && k == 0)
-          {
-            MyList<Block> *Bp = BlL;
-            while (Bp->data != ng0)
-              Bp = Bp->next; // ng0 is the first of the pices list
-            PP->blb = Bp;
-          }
-        }
-    // set PP->ble
-    {
-      MyList<Block> *Bp = BlL;
-      while (Bp->data != ng)
-        Bp = Bp->next; // ng is the last of the pices list
-      PP->ble = Bp;
-    }
-    PLi = PLi->next;
-  }
-  if (reacpu < nodes * 2 / 3)
-  {
-    int myrank;
-    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-    if (myrank == 0)
-      cout << "Parallel::distribute CAUSTION: level#" << lev << " uses essencially " << reacpu << " processors vs " << nodes << " nodes run, your scientific computation scale is not as large as you estimate." << endl;
-  }
-
-  return BlL;
-}
-
-#ifdef INTERP_LB_OPTIMIZE
-#include "interp_lb_profile_data.h"
-
-MyList<Block> *Parallel::distribute_optimize(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
-                                    bool periodic, int nodes)
-{
-#ifdef USE_GPU_DIVIDE
-  double cpu_part, gpu_part;
-  map<string, double>::iterator iter;
-  iter = parameters::dou_par.find("cpu part");
-  if (iter != parameters::dou_par.end())
-  {
-    cpu_part = iter->second;
-  }
-  else
-  {
-    int myrank;
-    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-    const int LEN = 256;
-    char pline[LEN];
-    string str, sgrp, skey, sval;
-    int sind;
-    char pname[50];
-    {
-      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
-      if (iter != parameters::str_par.end())
-        strcpy(pname, (iter->second).c_str());
-      else { cout << "Error inputpar" << endl; exit(0); }
-    }
-    ifstream inf(pname, ifstream::in);
-    if (!inf.good() && myrank == 0)
-    { cout << "Can not open parameter file " << pname << endl; MPI_Abort(MPI_COMM_WORLD, 1); }
-    for (int i = 1; inf.good(); i++)
-    {
-      inf.getline(pline, LEN); str = pline;
-      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
-      if (status == -1) { cout << "error reading parameter file " << pname << " in line " << i << endl; MPI_Abort(MPI_COMM_WORLD, 1); }
-      else if (status == 0) continue;
-      if (sgrp == "ABE") { if (skey == "cpu part") cpu_part = atof(sval.c_str()); }
-    }
-    inf.close();
-    parameters::dou_par.insert(map<string, double>::value_type("cpu part", cpu_part));
-  }
-  iter = parameters::dou_par.find("gpu part");
-  if (iter != parameters::dou_par.end())
-  {
-    gpu_part = iter->second;
-  }
-  else
-  {
-    int myrank;
-    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-    const int LEN = 256;
-    char pline[LEN];
-    string str, sgrp, skey, sval;
-    int sind;
-    char pname[50];
-    {
-      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
-      if (iter != parameters::str_par.end())
-        strcpy(pname, (iter->second).c_str());
-      else { cout << "Error inputpar" << endl; exit(0); }
-    }
-    ifstream inf(pname, ifstream::in);
-    if (!inf.good() && myrank == 0)
-    { cout << "Can not open parameter file " << pname << endl; MPI_Abort(MPI_COMM_WORLD, 1); }
-    for (int i = 1; inf.good(); i++)
-    {
-      inf.getline(pline, LEN); str = pline;
-      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
-      if (status == -1) { cout << "error reading parameter file " << pname << " in line " << i << endl; MPI_Abort(MPI_COMM_WORLD, 1); }
-      else if (status == 0) continue;
-      if (sgrp == "ABE") { if (skey == "gpu part") gpu_part = atof(sval.c_str()); }
-    }
-    inf.close();
-    parameters::dou_par.insert(map<string, double>::value_type("gpu part", gpu_part));
-  }
-  if (nodes == 0) nodes = cpusize / 2;
-#else
-  if (nodes == 0) nodes = cpusize;
-#endif
-
-  if (dim != 3)
-  {
-    cout << "distrivute: now we only support 3-dimension" << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-
-  MyList<Block> *BlL = 0;
-  int split_size, min_size, block_size = 0;
-  int min_width = 2 * Mymax(ghost_width, buffer_width);
-  int nxyz[dim], mmin_width[dim], min_shape[dim];
-
-  MyList<Patch> *PLi = PatchLIST;
-  for (int i = 0; i < dim; i++)
-    min_shape[i] = PLi->data->shape[i];
-  int lev = PLi->data->lev;
-  PLi = PLi->next;
-  while (PLi)
-  {
-    Patch *PP = PLi->data;
-    for (int i = 0; i < dim; i++)
-      min_shape[i] = Mymin(min_shape[i], PP->shape[i]);
-    if (lev != PLi->data->lev)
-      cout << "Parallel::distribute CAUSTION: meet Patches for different level: " << lev << " and " << PLi->data->lev << endl;
-    PLi = PLi->next;
-  }
-
-  for (int i = 0; i < dim; i++)
-    mmin_width[i] = Mymin(min_width, min_shape[i]);
-  min_size = mmin_width[0];
-  for (int i = 1; i < dim; i++)
-    min_size = min_size * mmin_width[i];
-
-  PLi = PatchLIST;
-  while (PLi)
-  {
-    Patch *PP = PLi->data;
-    int bs = PP->shape[0];
-    for (int i = 1; i < dim; i++)
-      bs = bs * PP->shape[i];
-    block_size = block_size + bs;
-    PLi = PLi->next;
-  }
-  split_size = Mymax(min_size, block_size / nodes);
-  split_size = Mymax(1, split_size);
-
-  int n_rank = 0;
-  PLi = PatchLIST;
-  int reacpu = 0;
-  int current_block_id = 0;
-  while (PLi) {
-    Block *ng0, *ng;
-    bool first_block_in_patch = true;
-    Patch *PP = PLi->data;
-    reacpu += partition3(nxyz, split_size, mmin_width, nodes, PP->shape);
-
-    for (int i = 0; i < nxyz[0]; i++)
-    for (int j = 0; j < nxyz[1]; j++)
-    for (int k = 0; k < nxyz[2]; k++)
-    {
-        int ibbox_here[6], shape_here[3];
-        double bbox_here[6], dd;
-        Block *current_ng_start = nullptr;
-
-        bool is_heavy = false;
-        int r_l = -1, r_r = -1;
-        if (cpusize == INTERP_LB_NPROCS) {
-          for (int si = 0; si < INTERP_LB_NUM_HEAVY; si++) {
-            if (current_block_id == interp_lb_splits[si][0]) {
-              is_heavy = true;
-              r_l = interp_lb_splits[si][1];
-              r_r = interp_lb_splits[si][2];
-              break;
-            }
-          }
-        }
-
-        if (is_heavy)
-        {
-            int ib0 = (PP->shape[0] * i) / nxyz[0];
-            int ib3 = (PP->shape[0] * (i + 1)) / nxyz[0] - 1;
-            int jb1 = (PP->shape[1] * j) / nxyz[1];
-            int jb4 = (PP->shape[1] * (j + 1)) / nxyz[1] - 1;
-            int kb2 = (PP->shape[2] * k) / nxyz[2];
-            int kb5 = (PP->shape[2] * (k + 1)) / nxyz[2] - 1;
-
-            Block *split_first_block = nullptr;
-            Block *split_last_block = nullptr;
-            splitHotspotBlock(BlL, dim, ib0, ib3, jb1, jb4, kb2, kb5,
-                              PP, r_l, r_r, ingfsi, fngfsi, periodic,
-                              split_first_block, split_last_block);
-
-            current_ng_start = split_first_block;
-            ng = split_last_block;
-        }
-        else
-        {
-            ibbox_here[0] = (PP->shape[0] * i) / nxyz[0];
-            ibbox_here[3] = (PP->shape[0] * (i + 1)) / nxyz[0] - 1;
-            ibbox_here[1] = (PP->shape[1] * j) / nxyz[1];
-            ibbox_here[4] = (PP->shape[1] * (j + 1)) / nxyz[1] - 1;
-            ibbox_here[2] = (PP->shape[2] * k) / nxyz[2];
-            ibbox_here[5] = (PP->shape[2] * (k + 1)) / nxyz[2] - 1;
-
-            if (periodic) {
-                for(int d=0; d<3; d++) {
-                    ibbox_here[d] -= ghost_width;
-                    ibbox_here[d+3] += ghost_width;
-                }
-            } else {
-                ibbox_here[0] = Mymax(0, ibbox_here[0] - ghost_width);
-                ibbox_here[3] = Mymin(PP->shape[0] - 1, ibbox_here[3] + ghost_width);
-                ibbox_here[1] = Mymax(0, ibbox_here[1] - ghost_width);
-                ibbox_here[4] = Mymin(PP->shape[1] - 1, ibbox_here[4] + ghost_width);
-                ibbox_here[2] = Mymax(0, ibbox_here[2] - ghost_width);
-                ibbox_here[5] = Mymin(PP->shape[2] - 1, ibbox_here[5] + ghost_width);
-            }
-
-            for(int d=0; d<3; d++) shape_here[d] = ibbox_here[d+3] - ibbox_here[d] + 1;
-
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-          dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1);
-          bbox_here[0] = PP->bbox[0] + ibbox_here[0] * dd;
-          bbox_here[3] = PP->bbox[0] + ibbox_here[3] * dd;
-          dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1);
-          bbox_here[1] = PP->bbox[1] + ibbox_here[1] * dd;
-          bbox_here[4] = PP->bbox[1] + ibbox_here[4] * dd;
-          dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1);
-          bbox_here[2] = PP->bbox[2] + ibbox_here[2] * dd;
-          bbox_here[5] = PP->bbox[2] + ibbox_here[5] * dd;
-#else
-#ifdef Cell
-          dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0];
-          bbox_here[0] = PP->bbox[0] + (ibbox_here[0]) * dd;
-          bbox_here[3] = PP->bbox[0] + (ibbox_here[3] + 1) * dd;
-          dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1];
-          bbox_here[1] = PP->bbox[1] + (ibbox_here[1]) * dd;
-          bbox_here[4] = PP->bbox[1] + (ibbox_here[4] + 1) * dd;
-          dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2];
-          bbox_here[2] = PP->bbox[2] + (ibbox_here[2]) * dd;
-          bbox_here[5] = PP->bbox[2] + (ibbox_here[5] + 1) * dd;
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-            ng = createMappedBlock(BlL, dim, shape_here, bbox_here,
-                                   current_block_id, ingfsi, fngfsi, PP->lev);
-            current_ng_start = ng;
-        }
-
-        if (first_block_in_patch) {
-            ng0 = current_ng_start;
-            MyList<Block> *Bp_start = BlL;
-            while (Bp_start && Bp_start->data != ng0) Bp_start = Bp_start->next;
-            PP->blb = Bp_start;
-            first_block_in_patch = false;
-        }
-
-        current_block_id++;
-    }
-
-    {
-      MyList<Block> *Bp_end = BlL;
-      while (Bp_end && Bp_end->data != ng) Bp_end = Bp_end->next;
-      PP->ble = Bp_end;
-    }
-
-    PLi = PLi->next;
-  }
-  if (reacpu < nodes * 2 / 3)
-  {
-    int myrank;
-    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-    if (myrank == 0)
-      cout << "Parallel::distribute CAUSTION: level#" << lev << " uses essencially " << reacpu << " processors vs " << nodes << " nodes run, your scientific computation scale is not as large as you estimate." << endl;
-  }
-
-  return BlL;
-}
-
-Block* Parallel::splitHotspotBlock(MyList<Block>* &BlL, int _dim,
-                                 int ib0_orig, int ib3_orig,
-                                 int jb1_orig, int jb4_orig,
-                                 int kb2_orig, int kb5_orig,
-                                 Patch* PP, int r_left, int r_right,
-                                 int ingfsi, int fngfsi, bool periodic,
-                                 Block* &split_first_block, Block* &split_last_block)
-{
-    int mid = (ib0_orig + ib3_orig) / 2;
-
-    int indices_L[6] = {ib0_orig, jb1_orig, kb2_orig, mid, jb4_orig, kb5_orig};
-    int indices_R[6] = {mid + 1, jb1_orig, kb2_orig, ib3_orig, jb4_orig, kb5_orig};
-
-    auto createSubBlock = [&](int* ib_raw, int target_rank) {
-        int ib_final[6];
-        int sh_here[3];
-        double bb_here[6], dd;
-
-        if (periodic) {
-            ib_final[0] = ib_raw[0] - ghost_width;
-            ib_final[3] = ib_raw[3] + ghost_width;
-            ib_final[1] = ib_raw[1] - ghost_width;
-            ib_final[4] = ib_raw[4] + ghost_width;
-            ib_final[2] = ib_raw[2] - ghost_width;
-            ib_final[5] = ib_raw[5] + ghost_width;
-        } else {
-            ib_final[0] = Mymax(0, ib_raw[0] - ghost_width);
-            ib_final[3] = Mymin(PP->shape[0] - 1, ib_raw[3] + ghost_width);
-            ib_final[1] = Mymax(0, ib_raw[1] - ghost_width);
-            ib_final[4] = Mymin(PP->shape[1] - 1, ib_raw[4] + ghost_width);
-            ib_final[2] = Mymax(0, ib_raw[2] - ghost_width);
-            ib_final[5] = Mymin(PP->shape[2] - 1, ib_raw[5] + ghost_width);
-        }
-
-        sh_here[0] = ib_final[3] - ib_final[0] + 1;
-        sh_here[1] = ib_final[4] - ib_final[1] + 1;
-        sh_here[2] = ib_final[5] - ib_final[2] + 1;
-
-#ifdef Vertex
-        dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1);
-        bb_here[0] = PP->bbox[0] + ib_final[0] * dd;
-        bb_here[3] = PP->bbox[0] + ib_final[3] * dd;
-        dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1);
-        bb_here[1] = PP->bbox[1] + ib_final[1] * dd;
-        bb_here[4] = PP->bbox[1] + ib_final[4] * dd;
-        dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1);
-        bb_here[2] = PP->bbox[2] + ib_final[2] * dd;
-        bb_here[5] = PP->bbox[2] + ib_final[5] * dd;
-#else
-#ifdef Cell
-        dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0];
-        bb_here[0] = PP->bbox[0] + ib_final[0] * dd;
-        bb_here[3] = PP->bbox[0] + (ib_final[3] + 1) * dd;
-        dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1];
-        bb_here[1] = PP->bbox[1] + ib_final[1] * dd;
-        bb_here[4] = PP->bbox[1] + (ib_final[4] + 1) * dd;
-        dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2];
-        bb_here[2] = PP->bbox[2] + ib_final[2] * dd;
-        bb_here[5] = PP->bbox[2] + (ib_final[5] + 1) * dd;
-#endif
-#endif
-
-        Block* Bg = new Block(dim, sh_here, bb_here, target_rank, ingfsi, fngfsi, PP->lev);
-        if (BlL) BlL->insert(Bg);
-        else     BlL = new MyList<Block>(Bg);
-
-        return Bg;
-    };
-
-    split_first_block = createSubBlock(indices_L, r_left);
-    split_last_block  = createSubBlock(indices_R, r_right);
-    return split_last_block;
-}
-
-Block* Parallel::createMappedBlock(MyList<Block>* &BlL, int _dim, int* shape, double* bbox,
-                                   int block_id, int ingfsi, int fngfsi, int lev)
-{
-    int target_rank = block_id;
-    if (INTERP_LB_NPROCS > 0) {
-      for (int ri = 0; ri < interp_lb_num_remaps; ri++) {
-        if (block_id == interp_lb_remaps[ri][0]) {
-          target_rank = interp_lb_remaps[ri][1];
-          break;
-        }
-      }
-    }
-
-    Block* ng = new Block(dim, shape, bbox, target_rank, ingfsi, fngfsi, lev);
-    if (BlL) BlL->insert(ng);
-    else     BlL = new MyList<Block>(ng);
-
-    return ng;
-}
-#else
-// When INTERP_LB_OPTIMIZE is not defined, distribute_optimize falls back to distribute
-MyList<Block> *Parallel::distribute_optimize(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
-                                    bool periodic, int nodes)
-{
-  return distribute(PatchLIST, cpusize, ingfsi, fngfsi, periodic, nodes);
-}
-Block* Parallel::splitHotspotBlock(MyList<Block>* &BlL, int _dim,
-                                 int ib0_orig, int ib3_orig,
-                                 int jb1_orig, int jb4_orig,
-                                 int kb2_orig, int kb5_orig,
-                                 Patch* PP, int r_left, int r_right,
-                                 int ingfsi, int fngfsi, bool periodic,
-                                 Block* &split_first_block, Block* &split_last_block)
-{ return nullptr; }
-Block* Parallel::createMappedBlock(MyList<Block>* &BlL, int _dim, int* shape, double* bbox,
-                                   int block_id, int ingfsi, int fngfsi, int lev)
-{ return nullptr; }
-#endif
-
-#elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
-MyList<Block> *Parallel::distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
-                                    bool periodic, int start_rank, int end_rank, int nodes)
-{
-#ifdef USE_GPU_DIVIDE
-  double cpu_part, gpu_part;
-  map<string, double>::iterator iter;
-  iter = parameters::dou_par.find("cpu part");
-  if (iter != parameters::dou_par.end())
-  {
-    cpu_part = iter->second;
-  }
-  else
-  {
-    int myrank;
-    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-    // read parameter from file
-    const int LEN = 256;
-    char pline[LEN];
-    string str, sgrp, skey, sval;
-    int sind;
-    char pname[50];
-    {
-      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
-      if (iter != parameters::str_par.end())
-      {
-        strcpy(pname, (iter->second).c_str());
-      }
-      else
-      {
-        cout << "Error inputpar" << endl;
-        exit(0);
-      }
-    }
-    ifstream inf(pname, ifstream::in);
-    if (!inf.good() && myrank == 0)
-    {
-      cout << "Can not open parameter file " << pname << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-
-    for (int i = 1; inf.good(); i++)
-    {
-      inf.getline(pline, LEN);
-      str = pline;
-
-      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
-      if (status == -1)
-      {
-        cout << "error reading parameter file " << pname << " in line " << i << endl;
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-      else if (status == 0)
-        continue;
-
-      if (sgrp == "ABE")
-      {
-        if (skey == "cpu part")
-          cpu_part = atof(sval.c_str());
-      }
-    }
-    inf.close();
-
-    parameters::dou_par.insert(map<string, double>::value_type("cpu part", cpu_part));
-  }
-  iter = parameters::dou_par.find("gpu part");
-  if (iter != parameters::dou_par.end())
-  {
-    gpu_part = iter->second;
-  }
-  else
-  {
-    int myrank;
-    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-    // read parameter from file
-    const int LEN = 256;
-    char pline[LEN];
-    string str, sgrp, skey, sval;
-    int sind;
-    char pname[50];
-    {
-      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
-      if (iter != parameters::str_par.end())
-      {
-        strcpy(pname, (iter->second).c_str());
-      }
-      else
-      {
-        cout << "Error inputpar" << endl;
-        exit(0);
-      }
-    }
-    ifstream inf(pname, ifstream::in);
-    if (!inf.good() && myrank == 0)
-    {
-      cout << "Can not open parameter file " << pname << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-
-    for (int i = 1; inf.good(); i++)
-    {
-      inf.getline(pline, LEN);
-      str = pline;
-
-      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
-      if (status == -1)
-      {
-        cout << "error reading parameter file " << pname << " in line " << i << endl;
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-      else if (status == 0)
-        continue;
-
-      if (sgrp == "ABE")
-      {
-        if (skey == "gpu part")
-          gpu_part = atof(sval.c_str());
-      }
-    }
-    inf.close();
-
-    parameters::dou_par.insert(map<string, double>::value_type("gpu part", gpu_part));
-  }
-
-  if (nodes == 0)
-    nodes = cpusize / 2;
-#else
-  if (nodes == 0)
-    nodes = cpusize;
-#endif
-
-  if (dim != 3)
-  {
-    cout << "distrivute: now we only support 3-dimension" << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-
-  MyList<Block> *BlL = 0;
-
-  int split_size, min_size, block_size = 0;
-
-  int min_width = 2 * Mymax(ghost_width, buffer_width);
-  int nxyz[dim], mmin_width[dim], min_shape[dim];
-
-  MyList<Patch> *PLi = PatchLIST;
-  for (int i = 0; i < dim; i++)
-    min_shape[i] = PLi->data->shape[i];
-  int lev = PLi->data->lev;
-  PLi = PLi->next;
-  while (PLi)
-  {
-    Patch *PP = PLi->data;
-    for (int i = 0; i < dim; i++)
-      min_shape[i] = Mymin(min_shape[i], PP->shape[i]);
-    if (lev != PLi->data->lev)
-      cout << "Parallel::distribute CAUSTION: meet Patches for different level: " << lev << " and " << PLi->data->lev << endl;
-    PLi = PLi->next;
-  }
-
-  for (int i = 0; i < dim; i++)
-    mmin_width[i] = Mymin(min_width, min_shape[i]);
-
-  min_size = mmin_width[0];
-  for (int i = 1; i < dim; i++)
-    min_size = min_size * mmin_width[i];
-
-  PLi = PatchLIST;
-  while (PLi)
-  {
-    Patch *PP = PLi->data;
-    //    PP->checkPatch(true);
-    int bs = PP->shape[0];
-    for (int i = 1; i < dim; i++)
-      bs = bs * PP->shape[i];
-    block_size = block_size + bs;
-    PLi = PLi->next;
-  }
-  split_size = Mymax(min_size, block_size / cpusize);
-  split_size = Mymax(1, split_size);
-
-  int n_rank = start_rank;
-  PLi = PatchLIST;
-  int reacpu = 0;
-  while (PLi)
-  {
-    Patch *PP = PLi->data;
-
-    reacpu += partition3(nxyz, split_size, mmin_width, cpusize, PP->shape);
-
-    Block *ng, *ng0;
-    int shape_here[dim], ibbox_here[2 * dim];
-    double bbox_here[2 * dim], dd;
-
-    // ibbox : 0,...N-1
-    for (int i = 0; i < nxyz[0]; i++)
-      for (int j = 0; j < nxyz[1]; j++)
-        for (int k = 0; k < nxyz[2]; k++)
-        {
-          ibbox_here[0] = (PP->shape[0] * i) / nxyz[0];
-          ibbox_here[3] = (PP->shape[0] * (i + 1)) / nxyz[0] - 1;
-          ibbox_here[1] = (PP->shape[1] * j) / nxyz[1];
-          ibbox_here[4] = (PP->shape[1] * (j + 1)) / nxyz[1] - 1;
-          ibbox_here[2] = (PP->shape[2] * k) / nxyz[2];
-          ibbox_here[5] = (PP->shape[2] * (k + 1)) / nxyz[2] - 1;
-
-          if (periodic)
-          {
-            ibbox_here[0] = ibbox_here[0] - ghost_width;
-            ibbox_here[3] = ibbox_here[3] + ghost_width;
-            ibbox_here[1] = ibbox_here[1] - ghost_width;
-            ibbox_here[4] = ibbox_here[4] + ghost_width;
-            ibbox_here[2] = ibbox_here[2] - ghost_width;
-            ibbox_here[5] = ibbox_here[5] + ghost_width;
-          }
-          else
-          {
-            ibbox_here[0] = Mymax(0, ibbox_here[0] - ghost_width);
-            ibbox_here[3] = Mymin(PP->shape[0] - 1, ibbox_here[3] + ghost_width);
-            ibbox_here[1] = Mymax(0, ibbox_here[1] - ghost_width);
-            ibbox_here[4] = Mymin(PP->shape[1] - 1, ibbox_here[4] + ghost_width);
-            ibbox_here[2] = Mymax(0, ibbox_here[2] - ghost_width);
-            ibbox_here[5] = Mymin(PP->shape[2] - 1, ibbox_here[5] + ghost_width);
-          }
-
-          shape_here[0] = ibbox_here[3] - ibbox_here[0] + 1;
-          shape_here[1] = ibbox_here[4] - ibbox_here[1] + 1;
-          shape_here[2] = ibbox_here[5] - ibbox_here[2] + 1;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-          // 0--4, 5--10
-          dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1);
-          bbox_here[0] = PP->bbox[0] + ibbox_here[0] * dd;
-          bbox_here[3] = PP->bbox[0] + ibbox_here[3] * dd;
-
-          dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1);
-          bbox_here[1] = PP->bbox[1] + ibbox_here[1] * dd;
-          bbox_here[4] = PP->bbox[1] + ibbox_here[4] * dd;
-
-          dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1);
-          bbox_here[2] = PP->bbox[2] + ibbox_here[2] * dd;
-          bbox_here[5] = PP->bbox[2] + ibbox_here[5] * dd;
-#else
-#ifdef Cell
-          // 0--5, 5--10
-          dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0];
-          bbox_here[0] = PP->bbox[0] + (ibbox_here[0]) * dd;
-          bbox_here[3] = PP->bbox[0] + (ibbox_here[3] + 1) * dd;
-
-          dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1];
-          bbox_here[1] = PP->bbox[1] + (ibbox_here[1]) * dd;
-          bbox_here[4] = PP->bbox[1] + (ibbox_here[4] + 1) * dd;
-
-          dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2];
-          bbox_here[2] = PP->bbox[2] + (ibbox_here[2]) * dd;
-          bbox_here[5] = PP->bbox[2] + (ibbox_here[5] + 1) * dd;
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-
-#ifdef USE_GPU_DIVIDE
-          {
-            const int pices = 2;
-            double picef[pices];
-            picef[0] = cpu_part;
-            picef[1] = gpu_part;
-            int shape_res[dim * pices];
-            double bbox_res[2 * dim * pices];
-            misc::dividBlock(dim, shape_here, bbox_here, pices, picef, shape_res, bbox_res, min_width);
-            ng = ng0 = new Block(dim, shape_res, bbox_res, n_rank++, ingfsi, fngfsi, PP->lev, 0); // delete through KillBlocks
-            //	       ng->checkBlock();
-            if (BlL)
-              BlL->insert(ng);
-            else
-              BlL = new MyList<Block>(ng); // delete through KillBlocks
-
-            for (int i = 1; i < pices; i++)
-            {
-              ng = new Block(dim, shape_res + i * dim, bbox_res + i * 2 * dim, n_rank++, ingfsi, fngfsi, PP->lev, i); // delete through KillBlocks
-              //	        ng->checkBlock();
-              BlL->insert(ng);
-            }
-          }
-#else
-          ng = ng0 = new Block(dim, shape_here, bbox_here, n_rank++, ingfsi, fngfsi, PP->lev); // delete through KillBlocks
-          //	    ng->checkBlock();
-          if (BlL)
-            BlL->insert(ng);
-          else
-            BlL = new MyList<Block>(ng); // delete through KillBlocks
-#endif
-
-          if (n_rank == end_rank + 1)
-            n_rank = start_rank;
-
-          // set PP->blb
-          if (i == 0 && j == 0 && k == 0)
-          {
-            MyList<Block> *Bp = BlL;
-            while (Bp->data != ng0)
-              Bp = Bp->next; // ng0 is the first of the pices list
-            PP->blb = Bp;
-          }
-        }
-    // set PP->ble
-    {
-      MyList<Block> *Bp = BlL;
-      while (Bp->data != ng)
-        Bp = Bp->next; // ng is the last of the pices list
-      PP->ble = Bp;
-    }
-    PLi = PLi->next;
-  }
-  if (reacpu < nodes * 2 / 3)
-  {
-    int myrank;
-    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-    if (myrank == start_rank)
-      cout << "Parallel::distribute CAUSTION: level#" << lev << " uses essencially " << reacpu << " processors vs " << nodes << " nodes run, your scientific computation scale is not as large as you estimate." << endl;
-  }
-
-  return BlL;
-}
-#endif
-void Parallel::setfunction(MyList<Block> *BlL, var *vn, double func(double x, double y, double z))
-{
-  while (BlL)
-  {
-    if (BlL->data->X[0])
-    {
-      int nn = BlL->data->shape[0] * BlL->data->shape[1] * BlL->data->shape[2];
-      double *p = BlL->data->fgfs[vn->sgfn];
-      for (int i = 0; i < nn; i++)
-      {
-        int ind[3];
-        getarrayindex(3, BlL->data->shape, ind, i);
-        p[i] = func(BlL->data->X[0][ind[0]], BlL->data->X[1][ind[1]], BlL->data->X[2][ind[2]]);
-      }
-    }
-    BlL = BlL->next;
-  }
-}
-// set function only for cpu rank
-void Parallel::setfunction(int rank, MyList<Block> *BlL, var *vn, double func(double x, double y, double z))
-{
-  while (BlL)
-  {
-    if (BlL->data->X[0] && BlL->data->rank == rank)
-    {
-      int nn = BlL->data->shape[0] * BlL->data->shape[1] * BlL->data->shape[2];
-      double *p = BlL->data->fgfs[vn->sgfn];
-      for (int i = 0; i < nn; i++)
-      {
-        int ind[3];
-        getarrayindex(3, BlL->data->shape, ind, i);
-        p[i] = func(BlL->data->X[0][ind[0]], BlL->data->X[1][ind[1]], BlL->data->X[2][ind[2]]);
-      }
-    }
-    BlL = BlL->next;
-  }
-}
-void Parallel::getarrayindex(int DIM, int *shape, int *index, int n)
-{
-  // we assume index has already memory space
-  int *mu;
-  mu = new int[DIM];
-  mu[0] = 1;
-  for (int i = 1; i < DIM; i++)
-    mu[i] = mu[i - 1] * shape[i - 1];
-  for (int i = DIM - 1; i >= 0; i--)
-  {
-    index[i] = n / mu[i];
-    n = n - index[i] * mu[i];
-  }
-
-  delete[] mu;
-}
-int Parallel::getarraylocation(int DIM, int *shape, int *index)
-{
-  int n, mu;
-  mu = shape[0];
-  n = index[0];
-  for (int i = 1; i < DIM; i++)
-  {
-    n = n + index[i] * mu;
-    mu = mu * shape[i];
-  }
-
-  return n;
-}
-void Parallel::copy(int DIM, double *llbout, double *uubout, int *Dshape, double *DD, double *llbin, double *uubin,
-                    int *shape, double *datain, double *llb, double *uub)
-{
-  // for 3 dimensional case, based on simple test, I found this is half slower than f90 code
-  int *illi, *iuui;
-  int *illo, *iuuo;
-  int *indi, *indo;
-  illi = new int[DIM];
-  iuui = new int[DIM];
-  illo = new int[DIM];
-  iuuo = new int[DIM];
-  indi = new int[DIM];
-  indo = new int[DIM];
-
-  int ial = 1;
-  for (int i = 0; i < DIM; i++)
-  {
-    double ho, hi;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-    ho = (uubout[i] - llbout[i]) / (Dshape[i] - 1);
-    hi = (uubin[i] - llbin[i]) / (shape[i] - 1);
-#else
-#ifdef Cell
-    ho = (uubout[i] - llbout[i]) / Dshape[i];
-    hi = (uubin[i] - llbin[i]) / shape[i];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-    illo[i] = int((llb[i] - llbout[i]) / ho);
-    iuuo[i] = Dshape[i] - 1 - int((uubout[i] - uub[i]) / ho);
-    illi[i] = int((llb[i] - llbin[i]) / hi);
-    iuui[i] = shape[i] - 1 - int((uubin[i] - uub[i]) / hi);
-
-    if (illo[i] > iuuo[i] || illi[i] > iuui[i] || illo[i] < 0 || illi[i] < 0 ||
-        iuui[i] >= shape[i] || iuuo[i] >= Dshape[i])
-    {
-      cout << "Parallel copy: in direction " << i << ":" << endl;
-      cout << "llb = " << llb[i] << ", uub = " << uub[i] << endl;
-      cout << " in data : il = " << illi[i] << ", iu = " << iuui[i] << endl;
-      cout << "bbox = (" << llbin[i] << "," << uubin[i] << ")" << endl;
-      cout << "shape = " << shape[i] << endl;
-      cout << "out data : il = " << illo[i] << ", iu = " << iuuo[i] << endl;
-      cout << "bbox = (" << llbout[i] << "," << uubout[i] << ")" << endl;
-      cout << "shape = " << Dshape[i] << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-    int ihi = iuui[i] - illi[i] + 1, iho = iuuo[i] - illo[i] + 1;
-    if (!(feq(ho, hi, ho / 2)) || ihi != iho)
-    {
-      cout << "Parallel copy: in direction " << i << ":" << endl;
-      cout << "Parallel copy: not the same grid structure." << endl;
-      cout << "hi = " << hi << ", bbox = (" << llbin[i] << "," << uubin[i] << "), shape = " << shape[i] << endl;
-      cout << "ho = " << ho << ", bbox = (" << llbout[i] << "," << uubout[i] << "), shape = " << Dshape[i] << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-    ial = ial * ihi;
-  }
-
-  for (int i = 0; i < DIM; i++)
-  {
-    indi[i] = illi[i];
-    indo[i] = illo[i];
-  }
-  /*
-  //check start index
-     for(int i=0;i<DIM;i++)
-     {
-       cout << "Parallel copy: in direction " <<i<<":"<< endl;
-       cout<<"start : indi = "<<indi[i]<<", indo = "<<indo[i]<<endl;
-     }
-  */
-  int NNi = 1, NNo = 1;
-  for (int i = 0; i < DIM; i++)
-  {
-    NNi = NNi * shape[i];
-    NNo = NNo * Dshape[i];
-  }
-  for (int i = 0; i < ial; i++)
-  {
-    int ni, no;
-    ni = getarraylocation(DIM, shape, indi);
-    no = getarraylocation(DIM, Dshape, indo);
-    if (no < 0 || no > NNo)
-    {
-      cout << "Parallel copy: no = " << no << " is out of array range (0," << NNo << ")." << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-    if (ni < 0 || ni > NNi)
-    {
-      cout << "Parallel copy: ni = " << ni << " is out of array range (0," << NNi << ")." << endl;
-      cout << "shape = (";
-      for (int j = 0; j < DIM; j++)
-      {
-        cout << shape[j];
-        if (j < DIM - 1)
-          cout << ",";
-        else
-          cout << ")" << endl;
-      }
-      cout << "ind = (";
-      for (int j = 0; j < DIM; j++)
-      {
-        cout << indi[j];
-        if (j < DIM - 1)
-          cout << ",";
-        else
-          cout << ")" << endl;
-      }
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-    DD[no] = datain[ni];
-
-    indi[0]++;
-    for (int j = 1; j < DIM; j++)
-    {
-      if (indi[j - 1] == iuui[j - 1] + 1)
-      {
-        indi[j - 1] = illi[j - 1];
-        indi[j]++;
-      } // carry 1 to next digital
-      else
-        break;
-    }
-    indo[0]++;
-    for (int j = 1; j < DIM; j++)
-    {
-      if (indo[j - 1] == iuuo[j - 1] + 1)
-      {
-        indo[j - 1] = illo[j - 1];
-        indo[j]++;
-      }
-      else
-        break;
-    }
-  }
-  /*
-  //check final index
-     for(int i=0;i<DIM;i++)
-     {
-       cout << "Parallel copy: in direction " <<i<<":"<< endl;
-       cout<<"final : indi = "<<indi[i]<<", indo = "<<indo[i]<<endl;
-     }
-  */
-  delete[] illi;
-  delete[] iuui;
-  delete[] illo;
-  delete[] iuuo;
-  delete[] indi;
-  delete[] indo;
-}
-void Parallel::writefile(double time, int nx, int ny, int nz, double xmin, double xmax, double ymin, double ymax,
-                         double zmin, double zmax, char *filename, double *data_out)
-{
-  ofstream outfile;
-  outfile.open(filename, ios::out | ios::trunc);
-  if (!outfile)
-  {
-    cout << "Can't open " << filename << " for output." << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-  outfile.write((char *)&time, sizeof(double));
-  outfile.write((char *)&nx, sizeof(int));
-  outfile.write((char *)&ny, sizeof(int));
-  outfile.write((char *)&nz, sizeof(int));
-  outfile.write((char *)&xmin, sizeof(double));
-  outfile.write((char *)&xmax, sizeof(double));
-  outfile.write((char *)&ymin, sizeof(double));
-  outfile.write((char *)&ymax, sizeof(double));
-  outfile.write((char *)&zmin, sizeof(double));
-  outfile.write((char *)&zmax, sizeof(double));
-  outfile.write((char *)data_out, nx * ny * nz * sizeof(double));
-  outfile.close();
-}
-void Parallel::writefile(double time, int nx, int ny, double xmin, double xmax, double ymin, double ymax,
-                         char *filename, double *datain)
-{
-  int i, j;
-  double *X, *Y;
-  X = new double[nx];
-  Y = new double[ny];
-  double dd;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-  dd = (xmax - xmin) / (nx - 1);
-  for (i = 0; i < nx; i++)
-    X[i] = xmin + i * dd;
-  dd = (ymax - ymin) / (ny - 1);
-  for (j = 0; j < ny; j++)
-    Y[j] = ymin + j * dd;
-#else
-#ifdef Cell
-  dd = (xmax - xmin) / nx;
-  for (i = 0; i < nx; i++)
-    X[i] = xmin + (i + 0.5) * dd;
-  dd = (ymax - ymin) / ny;
-  for (j = 0; j < ny; j++)
-    Y[j] = ymin + (j + 0.5) * dd;
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-  ofstream outfile;
-  outfile.open(filename, ios::out | ios::trunc);
-  if (!outfile)
-  {
-    cout << "Can't open " << filename << " for output." << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-  outfile << "# t = " << time << endl;
-  for (j = 0; j < ny; j++)
-  {
-    for (i = 0; i < nx; i++)
-    {
-      int ind1 = i + j * nx;
-      outfile << setw(10) << setprecision(10) << X[i] << " "
-              << setw(10) << setprecision(10) << Y[j] << " "
-              << setw(16) << setprecision(15) << datain[ind1]
-              << endl;
-    }
-    outfile << "\n"; /* blanck line for gnuplot */
-  }
-  outfile.close();
-
-  delete[] X;
-  delete[] Y;
-}
-void Parallel::Dump_CPU_Data(MyList<Block> *BlL, MyList<var> *DumpList, char *tag, double time, double dT)
-{
-  int myrank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-  // round at 4 and 5
-  int ncount = int(time / dT + 0.5);
-
-  MyList<Block> *Bp;
-  while (DumpList)
-  {
-    Bp = BlL;
-    int Bi = 0;
-    while (Bp)
-    {
-      Block *BP = Bp->data;
-      var *VP = DumpList->data;
-      if (BP->rank == myrank)
-      {
-
-        string out_dir;
-        map<string, string>::iterator iter;
-        iter = parameters::str_par.find("output dir");
-        if (iter != parameters::str_par.end())
-        {
-          out_dir = iter->second;
-        }
-        else
-        {
-          // read parameter from file
-          const int LEN = 256;
-          char pline[LEN];
-          string str, sgrp, skey, sval;
-          int sind;
-          char pname[50];
-          {
-            map<string, string>::iterator iter = parameters::str_par.find("inputpar");
-            if (iter != parameters::str_par.end())
-            {
-              strcpy(pname, (iter->second).c_str());
-            }
-            else
-            {
-              cout << "Error inputpar" << endl;
-              exit(0);
-            }
-          }
-          ifstream inf(pname, ifstream::in);
-          if (!inf.good())
-          {
-            cout << "Can not open parameter file " << pname << endl;
-            MPI_Abort(MPI_COMM_WORLD, 1);
-          }
-
-          for (int i = 1; inf.good(); i++)
-          {
-            inf.getline(pline, LEN);
-            str = pline;
-
-            int status = misc::parse_parts(str, sgrp, skey, sval, sind);
-            if (status == -1)
-            {
-              cout << "error reading parameter file " << pname << " in line " << i << endl;
-              MPI_Abort(MPI_COMM_WORLD, 1);
-            }
-            else if (status == 0)
-              continue;
-
-            if (sgrp == "ABE")
-            {
-              if (skey == "output dir")
-                out_dir = sval;
-            }
-          }
-          inf.close();
-
-          parameters::str_par.insert(map<string, string>::value_type("output dir", out_dir));
-        }
-
-        char filename[100];
-        if (tag)
-          sprintf(filename, "%s/%s_Lev%02d-%02d_%02d_%s_%05d.bin", out_dir.c_str(), tag, BP->lev, Bi, myrank, VP->name, ncount);
-        else
-          sprintf(filename, "%s/Lev%02d-%02d_%02d_%s_%05d.bin", out_dir.c_str(), BP->lev, Bi, myrank, VP->name, ncount);
-        writefile(time, BP->shape[0], BP->shape[1], BP->shape[2], BP->bbox[0], BP->bbox[3], BP->bbox[1], BP->bbox[4],
-                  BP->bbox[2], BP->bbox[5], filename, BP->fgfs[VP->sgfn]);
-        cout << "end of dump " << VP->name << " at time " << time << ", on node " << myrank << endl;
-      }
-      Bp = Bp->next;
-      Bi++;
-    }
-    DumpList = DumpList->next;
-  }
-}
-// Now we dump the data including buffer points
-void Parallel::Dump_Data(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT, int grd)
-{
-  int myrank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-  //   round at 4 and 5
-  int ncount = int(time / dT + 0.5);
-
-  MPI_Status sta;
-  int DIM = 3;
-  double llb[3], uub[3];
-  double DX, DY, DZ;
-
-  double *databuffer = 0;
-  if (myrank == 0)
-  {
-    databuffer = (double *)malloc(sizeof(double) * PP->shape[0] * PP->shape[1] * PP->shape[2]);
-    if (!databuffer)
-    {
-      cout << "Parallel::Dump_Data: out of memory when dumping data." << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-  }
-
-  while (DumpList)
-  {
-    var *VP = DumpList->data;
-
-    MyList<Block> *Bp = PP->blb;
-    while (Bp)
-    {
-      Block *BP = Bp->data;
-      if (BP->rank == 0 && myrank == 0)
-      {
-        DX = BP->getdX(0);
-        DY = BP->getdX(1);
-        DZ = BP->getdX(2);
-        llb[0] = (feq(BP->bbox[0], PP->bbox[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX;
-        llb[1] = (feq(BP->bbox[1], PP->bbox[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY;
-        llb[2] = (feq(BP->bbox[2], PP->bbox[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ;
-        uub[0] = (feq(BP->bbox[3], PP->bbox[3], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX;
-        uub[1] = (feq(BP->bbox[4], PP->bbox[4], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY;
-        uub[2] = (feq(BP->bbox[5], PP->bbox[5], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ;
-        f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, BP->fgfs[VP->sgfn], llb, uub);
-      }
-      else
-      {
-        int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]);
-        if (myrank == 0)
-        {
-          double *bufferhere = (double *)malloc(sizeof(double) * nnn);
-          if (!bufferhere)
-          {
-            cout << "on node#" << myrank << ", out of memory when dumping data." << endl;
-            MPI_Abort(MPI_COMM_WORLD, 1);
-          }
-          MPI_Recv(bufferhere, nnn, MPI_DOUBLE, BP->rank, 0, MPI_COMM_WORLD, &sta);
-          DX = BP->getdX(0);
-          DY = BP->getdX(1);
-          DZ = BP->getdX(2);
-          llb[0] = (feq(BP->bbox[0], PP->bbox[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX;
-          llb[1] = (feq(BP->bbox[1], PP->bbox[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY;
-          llb[2] = (feq(BP->bbox[2], PP->bbox[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ;
-          uub[0] = (feq(BP->bbox[3], PP->bbox[3], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX;
-          uub[1] = (feq(BP->bbox[4], PP->bbox[4], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY;
-          uub[2] = (feq(BP->bbox[5], PP->bbox[5], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ;
-          f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, bufferhere, llb, uub);
-          free(bufferhere);
-        }
-        else if (myrank == BP->rank)
-        {
-          MPI_Send(BP->fgfs[VP->sgfn], nnn, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
-        }
-      }
-      if (Bp == PP->ble)
-        break;
-      Bp = Bp->next;
-    }
-    if (myrank == 0)
-    {
-
-      string out_dir;
-      map<string, string>::iterator iter;
-      iter = parameters::str_par.find("output dir");
-      if (iter != parameters::str_par.end())
-      {
-        out_dir = iter->second;
-      }
-      else
-      {
-        // read parameter from file
-        const int LEN = 256;
-        char pline[LEN];
-        string str, sgrp, skey, sval;
-        int sind;
-        char pname[50];
-        {
-          map<string, string>::iterator iter = parameters::str_par.find("inputpar");
-          if (iter != parameters::str_par.end())
-          {
-            strcpy(pname, (iter->second).c_str());
-          }
-          else
-          {
-            cout << "Error inputpar" << endl;
-            exit(0);
-          }
-        }
-        ifstream inf(pname, ifstream::in);
-        if (!inf.good())
-        {
-          cout << "Can not open parameter file " << pname << endl;
-          MPI_Abort(MPI_COMM_WORLD, 1);
-        }
-
-        for (int i = 1; inf.good(); i++)
-        {
-          inf.getline(pline, LEN);
-          str = pline;
-
-          int status = misc::parse_parts(str, sgrp, skey, sval, sind);
-          if (status == -1)
-          {
-            cout << "error reading parameter file " << pname << " in line " << i << endl;
-            MPI_Abort(MPI_COMM_WORLD, 1);
-          }
-          else if (status == 0)
-            continue;
-
-          if (sgrp == "ABE")
-          {
-            if (skey == "output dir")
-              out_dir = sval;
-          }
-        }
-        inf.close();
-
-        parameters::str_par.insert(map<string, string>::value_type("output dir", out_dir));
-      }
-
-      char filename[100];
-      if (tag)
-        sprintf(filename, "%s/%s_Lev%02d-%02d_%s_%05d.bin", out_dir.c_str(), tag, PP->lev, grd, VP->name, ncount);
-      else
-        sprintf(filename, "%s/Lev%02d-%02d_%s_%05d.bin", out_dir.c_str(), PP->lev, grd, VP->name, ncount);
-
-      writefile(time, PP->shape[0], PP->shape[1], PP->shape[2], PP->bbox[0], PP->bbox[3], PP->bbox[1], PP->bbox[4],
-                PP->bbox[2], PP->bbox[5], filename, databuffer);
-    }
-    DumpList = DumpList->next;
-  }
-
-  if (myrank == 0)
-    free(databuffer);
-}
-void Parallel::Dump_Data(MyList<Patch> *PL, MyList<var> *DumpList, char *tag, double time, double dT)
-{
-  MyList<Patch> *Pp;
-  Pp = PL;
-  int grd = 0;
-  while (Pp)
-  {
-    Patch *PP = Pp->data;
-    Dump_Data(PP, DumpList, tag, time, dT, grd);
-    grd++;
-    Pp = Pp->next;
-  }
-}
-// collect the data including buffer points
-double *Parallel::Collect_Data(Patch *PP, var *VP)
-{
-  int myrank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
-  MPI_Status sta;
-  int DIM = 3;
-  double llb[3], uub[3];
-  double DX, DY, DZ;
-
-  double *databuffer = 0;
-  if (myrank == 0)
-  {
-    databuffer = (double *)malloc(sizeof(double) * PP->shape[0] * PP->shape[1] * PP->shape[2]);
-    if (!databuffer)
-    {
-      cout << "Parallel::Collect_Data: out of memory when dumping data." << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-  }
-
-  MyList<Block> *Bp = PP->blb;
-  while (Bp)
-  {
-    Block *BP = Bp->data;
-    if (BP->rank == 0 && myrank == 0)
-    {
-      DX = BP->getdX(0);
-      DY = BP->getdX(1);
-      DZ = BP->getdX(2);
-      llb[0] = (feq(BP->bbox[0], PP->bbox[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX;
-      llb[1] = (feq(BP->bbox[1], PP->bbox[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY;
-      llb[2] = (feq(BP->bbox[2], PP->bbox[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ;
-      uub[0] = (feq(BP->bbox[3], PP->bbox[3], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX;
-      uub[1] = (feq(BP->bbox[4], PP->bbox[4], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY;
-      uub[2] = (feq(BP->bbox[5], PP->bbox[5], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ;
-      f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, BP->fgfs[VP->sgfn], llb, uub);
-    }
-    else
-    {
-      int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]);
-      if (myrank == 0)
-      {
-        double *bufferhere = (double *)malloc(sizeof(double) * nnn);
-        if (!bufferhere)
-        {
-          cout << "on node#" << myrank << ", out of memory when dumping data." << endl;
-          MPI_Abort(MPI_COMM_WORLD, 1);
-        }
-        MPI_Recv(bufferhere, nnn, MPI_DOUBLE, BP->rank, 0, MPI_COMM_WORLD, &sta);
-        DX = BP->getdX(0);
-        DY = BP->getdX(1);
-        DZ = BP->getdX(2);
-        llb[0] = (feq(BP->bbox[0], PP->bbox[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX;
-        llb[1] = (feq(BP->bbox[1], PP->bbox[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY;
-        llb[2] = (feq(BP->bbox[2], PP->bbox[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ;
-        uub[0] = (feq(BP->bbox[3], PP->bbox[3], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX;
-        uub[1] = (feq(BP->bbox[4], PP->bbox[4], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY;
-        uub[2] = (feq(BP->bbox[5], PP->bbox[5], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ;
-        f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, bufferhere, llb, uub);
-        free(bufferhere);
-      }
-      else if (myrank == BP->rank)
-      {
-        MPI_Send(BP->fgfs[VP->sgfn], nnn, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
-      }
-    }
-    if (Bp == PP->ble)
-      break;
-    Bp = Bp->next;
-  }
-
-  return databuffer;
-}
-// Now we dump the data including buffer points
-// dump z = 0 plane
-void Parallel::d2Dump_Data(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT, int grd)
-{
-  int myrank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-  //   round at 4 and 5
-  int ncount = int(time / dT + 0.5);
-
-  MPI_Status sta;
-  int DIM = 3;
-  double llb[3], uub[3];
-  double DX, DY, DZ;
-
-  double *databuffer = 0, *databuffer2 = 0;
-  if (myrank == 0)
-  {
-    databuffer = (double *)malloc(sizeof(double) * PP->shape[0] * PP->shape[1] * PP->shape[2]);
-    databuffer2 = (double *)malloc(sizeof(double) * PP->shape[0] * PP->shape[1]);
-    if (!databuffer || !databuffer2)
-    {
-      cout << "Parallel::d2Dump_Data: out of memory when dumping data." << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-  }
-
-  while (DumpList)
-  {
-    var *VP = DumpList->data;
-
-    MyList<Block> *Bp = PP->blb;
-    while (Bp)
-    {
-      Block *BP = Bp->data;
-      if (BP->rank == 0 && myrank == 0)
-      {
-        DX = BP->getdX(0);
-        DY = BP->getdX(1);
-        DZ = BP->getdX(2);
-        llb[0] = (feq(BP->bbox[0], PP->bbox[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX;
-        llb[1] = (feq(BP->bbox[1], PP->bbox[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY;
-        llb[2] = (feq(BP->bbox[2], PP->bbox[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ;
-        uub[0] = (feq(BP->bbox[3], PP->bbox[3], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX;
-        uub[1] = (feq(BP->bbox[4], PP->bbox[4], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY;
-        uub[2] = (feq(BP->bbox[5], PP->bbox[5], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ;
-        f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, BP->fgfs[VP->sgfn], llb, uub);
-      }
-      else
-      {
-        int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]);
-        if (myrank == 0)
-        {
-          double *bufferhere = (double *)malloc(sizeof(double) * nnn);
-          if (!bufferhere)
-          {
-            cout << "on node#" << myrank << ", out of memory when dumping data." << endl;
-            MPI_Abort(MPI_COMM_WORLD, 1);
-          }
-          MPI_Recv(bufferhere, nnn, MPI_DOUBLE, BP->rank, 0, MPI_COMM_WORLD, &sta);
-          DX = BP->getdX(0);
-          DY = BP->getdX(1);
-          DZ = BP->getdX(2);
-          llb[0] = (feq(BP->bbox[0], PP->bbox[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX;
-          llb[1] = (feq(BP->bbox[1], PP->bbox[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY;
-          llb[2] = (feq(BP->bbox[2], PP->bbox[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ;
-          uub[0] = (feq(BP->bbox[3], PP->bbox[3], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX;
-          uub[1] = (feq(BP->bbox[4], PP->bbox[4], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY;
-          uub[2] = (feq(BP->bbox[5], PP->bbox[5], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ;
-          f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, bufferhere, llb, uub);
-          free(bufferhere);
-        }
-        else if (myrank == BP->rank)
-        {
-          MPI_Send(BP->fgfs[VP->sgfn], nnn, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
-        }
-      }
-      if (Bp == PP->ble)
-        break;
-      Bp = Bp->next;
-    }
-    if (myrank == 0)
-    {
-
-      string out_dir;
-      map<string, string>::iterator iter;
-      iter = parameters::str_par.find("output dir");
-      if (iter != parameters::str_par.end())
-      {
-        out_dir = iter->second;
-      }
-      else
-      {
-        // read parameter from file
-        const int LEN = 256;
-        char pline[LEN];
-        string str, sgrp, skey, sval;
-        int sind;
-        char pname[50];
-        {
-          map<string, string>::iterator iter = parameters::str_par.find("inputpar");
-          if (iter != parameters::str_par.end())
-          {
-            strcpy(pname, (iter->second).c_str());
-          }
-          else
-          {
-            cout << "Error inputpar" << endl;
-            exit(0);
-          }
-        }
-        ifstream inf(pname, ifstream::in);
-        if (!inf.good())
-        {
-          cout << "Can not open parameter file " << pname << endl;
-          MPI_Abort(MPI_COMM_WORLD, 1);
-        }
-
-        for (int i = 1; inf.good(); i++)
-        {
-          inf.getline(pline, LEN);
-          str = pline;
-
-          int status = misc::parse_parts(str, sgrp, skey, sval, sind);
-          if (status == -1)
-          {
-            cout << "error reading parameter file " << pname << " in line " << i << endl;
-            MPI_Abort(MPI_COMM_WORLD, 1);
-          }
-          else if (status == 0)
-            continue;
-
-          if (sgrp == "ABE")
-          {
-            if (skey == "output dir")
-              out_dir = sval;
-          }
-        }
-        inf.close();
-
-        parameters::str_par.insert(map<string, string>::value_type("output dir", out_dir));
-      }
-
-      char filename[100];
-      if (tag)
-        sprintf(filename, "%s/%s_2d_Lev%02d-%02d_%s_%05d.dat", out_dir.c_str(), tag, PP->lev, grd, VP->name, ncount);
-      else
-        sprintf(filename, "%s/2d_Lev%02d-%02d_%s_%05d.dat", out_dir.c_str(), PP->lev, grd, VP->name, ncount);
-
-      int gord = ghost_width;
-      f_d2dump(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, databuffer2, gord, VP->SoA);
-      writefile(time, PP->shape[0], PP->shape[1], PP->bbox[0], PP->bbox[3], PP->bbox[1], PP->bbox[4],
-                filename, databuffer2);
-    }
-    DumpList = DumpList->next;
-  }
-
-  if (myrank == 0)
-  {
-    free(databuffer);
-    free(databuffer2);
-  }
-}
-void Parallel::d2Dump_Data(MyList<Patch> *PL, MyList<var> *DumpList, char *tag, double time, double dT)
-{
-  MyList<Patch> *Pp;
-  Pp = PL;
-  int grd = 0;
-  while (Pp)
-  {
-    Patch *PP = Pp->data;
-    d2Dump_Data(PP, DumpList, tag, time, dT, grd);
-    grd++;
-    Pp = Pp->next;
-  }
-}
-// Now we dump the data including buffer points and ghost points of the given patch
-void Parallel::Dump_Data0(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT)
-{
-  int myrank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-  //   round at 4 and 5
-  int ncount = int(time / dT + 0.5);
-
-  MPI_Status sta;
-  int DIM = 3;
-  double llb[3], uub[3], tllb[3], tuub[3];
-  int tshape[3];
-  double DX, DY, DZ;
-
-  for (int i = 0; i < 3; i++)
-  {
-    double DX = PP->blb->data->getdX(i);
-    tshape[i] = PP->shape[i] + 2 * ghost_width;
-    tllb[i] = PP->bbox[i] - ghost_width * DX;
-    tuub[i] = PP->bbox[i + dim] + ghost_width * DX;
-  }
-
-  int NN = tshape[0] * tshape[1] * tshape[2];
-  double *databuffer = 0;
-  if (myrank == 0)
-  {
-    databuffer = (double *)malloc(sizeof(double) * NN);
-    if (!databuffer)
-    {
-      cout << "on node# " << myrank << ", out of memory when dumping data." << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-  }
-
-  while (DumpList)
-  {
-    var *VP = DumpList->data;
-    MyList<Block> *Bp = PP->blb;
-    while (Bp)
-    {
-      Block *BP = Bp->data;
-      if (BP->rank == 0 && myrank == 0)
-      {
-        DX = BP->getdX(0);
-        DY = BP->getdX(1);
-        DZ = BP->getdX(2);
-        llb[0] = (feq(BP->bbox[0], tllb[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX;
-        llb[1] = (feq(BP->bbox[1], tllb[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY;
-        llb[2] = (feq(BP->bbox[2], tllb[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ;
-        uub[0] = (feq(BP->bbox[3], tuub[0], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX;
-        uub[1] = (feq(BP->bbox[4], tuub[1], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY;
-        uub[2] = (feq(BP->bbox[5], tuub[2], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ;
-        f_copy(DIM, tllb, tuub, tshape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, BP->fgfs[VP->sgfn], llb, uub);
-      }
-      else
-      {
-        if (myrank == 0)
-        {
-          int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]);
-          double *bufferhere = (double *)malloc(sizeof(double) * nnn);
-          if (!bufferhere)
-          {
-            cout << "on node#" << myrank << ", out of memory when dumping data." << endl;
-            MPI_Abort(MPI_COMM_WORLD, 1);
-          }
-          MPI_Recv(bufferhere, nnn, MPI_DOUBLE, BP->rank, 0, MPI_COMM_WORLD, &sta);
-          DX = BP->getdX(0);
-          DY = BP->getdX(1);
-          DZ = BP->getdX(2);
-          llb[0] = (feq(BP->bbox[0], tllb[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX;
-          llb[1] = (feq(BP->bbox[1], tllb[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY;
-          llb[2] = (feq(BP->bbox[2], tllb[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ;
-          uub[0] = (feq(BP->bbox[3], tuub[0], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX;
-          uub[1] = (feq(BP->bbox[4], tuub[1], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY;
-          uub[2] = (feq(BP->bbox[5], tuub[2], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ;
-          f_copy(DIM, tllb, tuub, tshape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, bufferhere, llb, uub);
-          free(bufferhere);
-        }
-        else if (myrank == BP->rank)
-        {
-          int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]);
-          MPI_Send(BP->fgfs[VP->sgfn], nnn, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
-        }
-      }
-      if (Bp == PP->ble)
-        break;
-      Bp = Bp->next;
-    }
-    if (myrank == 0)
-    {
-
-      string out_dir;
-      map<string, string>::iterator iter;
-      iter = parameters::str_par.find("output dir");
-      if (iter != parameters::str_par.end())
-      {
-        out_dir = iter->second;
-      }
-      else
-      {
-        // read parameter from file
-        const int LEN = 256;
-        char pline[LEN];
-        string str, sgrp, skey, sval;
-        int sind;
-        char pname[50];
-        {
-          map<string, string>::iterator iter = parameters::str_par.find("inputpar");
-          if (iter != parameters::str_par.end())
-          {
-            strcpy(pname, (iter->second).c_str());
-          }
-          else
-          {
-            cout << "Error inputpar" << endl;
-            exit(0);
-          }
-        }
-        ifstream inf(pname, ifstream::in);
-        if (!inf.good())
-        {
-          cout << "Can not open parameter file " << pname << endl;
-          MPI_Abort(MPI_COMM_WORLD, 1);
-        }
-
-        for (int i = 1; inf.good(); i++)
-        {
-          inf.getline(pline, LEN);
-          str = pline;
-
-          int status = misc::parse_parts(str, sgrp, skey, sval, sind);
-          if (status == -1)
-          {
-            cout << "error reading parameter file " << pname << " in line " << i << endl;
-            MPI_Abort(MPI_COMM_WORLD, 1);
-          }
-          else if (status == 0)
-            continue;
-
-          if (sgrp == "ABE")
-          {
-            if (skey == "output dir")
-              out_dir = sval;
-          }
-        }
-        inf.close();
-
-        parameters::str_par.insert(map<string, string>::value_type("output dir", out_dir));
-      }
-
-      char filename[100];
-      if (tag)
-        sprintf(filename, "%s/%s_Lev%02d_%s_%05d.bin", out_dir.c_str(), tag, PP->lev, VP->name, ncount);
-      else
-        sprintf(filename, "%s/Lev%02d_%s_%05d.bin", out_dir.c_str(), PP->lev, VP->name, ncount);
-
-      writefile(time, tshape[0], tshape[1], tshape[2], tllb[0], tuub[0], tllb[1], tuub[2],
-                tllb[2], tuub[2], filename, databuffer);
-    }
-    DumpList = DumpList->next;
-  }
-
-  if (myrank == 0)
-    free(databuffer);
-}
-// Map point is much easier than maping data itself
-// But the main problem is about the points near the boundary
-// worst case is -ghost -ghost+1 .... 0 * ......
-double Parallel::global_interp(int DIM, int *ext, double **CoX, double *datain,
-                               double *poXb, int ordn, double *SoA, int Symmetry)
-{
-  if (DIM != 3)
-  {
-    cout << "Parallel::global_interp does not suport DIM = " << DIM << " for Symmetry." << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-  double resu;
-  double poX[3];
-  double asgn = 1;
-
-  for (int i = 0; i < 3; i++)
-    poX[i] = poXb[i];
-
-  switch (Symmetry)
-  {
-  case 2:
-    for (int i = 0; i < 3; i++)
-      if (poX[i] < 0)
-      {
-        poX[i] = -poX[i];
-        asgn = asgn * SoA[i];
-      }
-    break;
-  case 1:
-    if (poX[2] < 0)
-    {
-      poX[2] = -poX[2];
-      asgn = asgn * SoA[2];
-    }
-  }
-
-  int extb[3];
-
-  for (int i = 0; i < 3; i++)
-    extb[i] = ext[i];
-
-  switch (Symmetry)
-  {
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-  case 2:
-    if (poX[0] < (ghost_width - 1) * (CoX[0][1] - CoX[0][0]))
-      extb[0] = extb[0] + ghost_width - 1;
-    if (poX[1] < (ghost_width - 1) * (CoX[1][1] - CoX[1][0]))
-      extb[1] = extb[1] + ghost_width - 1;
-  case 1:
-    if (poX[2] < (ghost_width - 1) * (CoX[2][1] - CoX[2][0]))
-      extb[2] = extb[2] + ghost_width - 1;
-#else
-#ifdef Cell
-  case 2:
-    if (poX[0] < (ghost_width - 0.5) * (CoX[0][1] - CoX[0][0]))
-      extb[0] = extb[0] + ghost_width;
-    if (poX[1] < (ghost_width - 0.5) * (CoX[1][1] - CoX[1][0]))
-      extb[1] = extb[1] + ghost_width;
-  case 1:
-    if (poX[2] < (ghost_width - 0.5) * (CoX[2][1] - CoX[2][0]))
-      extb[2] = extb[2] + ghost_width;
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-  }
-
-  if (extb[0] > ext[0] || extb[1] > ext[1] || extb[2] > ext[2])
-  {
-    double *CoXb[3];
-    int Nb = extb[0] * extb[1] * extb[2];
-    double *datab;
-    datab = new double[Nb];
-    for (int i = 0; i < 3; i++)
-    {
-      CoXb[i] = new double[extb[i]];
-      double DH = CoX[i][1] - CoX[i][0];
-      if (extb[i] > ext[i])
-      {
-        if (CoX[i][0] > DH)
-        {
-          cout << "lower boundary[" << i << "] = " << CoX[i][0] << ", but SYmmetry = " << Symmetry << endl;
-          MPI_Abort(MPI_COMM_WORLD, 1);
-        }
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        for (int j = 0; j < ghost_width - 1; j++)
-          CoXb[i][j] = -CoX[i][ghost_width - 1 - j];
-        for (int j = ghost_width - 1; j < extb[i]; j++)
-          CoXb[i][j] = CoX[i][j - ghost_width + 1];
-#else
-#ifdef Cell
-        for (int j = 0; j < ghost_width; j++)
-          CoXb[i][j] = -CoX[i][ghost_width - 1 - j];
-        for (int j = ghost_width; j < extb[i]; j++)
-          CoXb[i][j] = CoX[i][j - ghost_width];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      }
-      else
-      {
-        for (int j = 0; j < extb[i]; j++)
-          CoXb[i][j] = CoX[i][j];
-      }
-    }
-
-    for (int i = 0; i < Nb; i++)
-    {
-      int ind[3], indb[3];
-      getarrayindex(3, extb, indb, i);
-      double sgn = 1;
-      for (int j = 0; j < 3; j++)
-      {
-        if (extb[j] > ext[j])
-        {
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-          if (indb[j] < ghost_width - 1)
-          {
-            ind[j] = ghost_width - 1 - indb[j];
-            sgn = sgn * SoA[j];
-          }
-          else
-          {
-            ind[j] = 1 + indb[j] - ghost_width;
-          }
-#else
-#ifdef Cell
-          if (indb[j] < ghost_width)
-          {
-            ind[j] = ghost_width - 1 - indb[j];
-            sgn = sgn * SoA[j];
-          }
-          else
-          {
-            ind[j] = indb[j] - ghost_width;
-          }
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-        }
-        else
-          ind[j] = indb[j];
-      }
-      int lon = getarraylocation(3, ext, ind);
-      datab[i] = datain[lon] * sgn;
-    }
-
-    resu = global_interp(DIM, extb, CoXb, datab, poX, ordn);
-
-    for (int i = 0; i < 3; i++)
-      delete[] CoXb[i];
-    delete[] datab;
-  }
-  else
-  {
-    resu = global_interp(DIM, ext, CoX, datain, poX, ordn);
-  }
-
-  return resu * asgn;
-}
-double Parallel::global_interp(int DIM, int *ext, double **CoX, double *datain,
-                               double *poX, int ordn)
-{
-  if (ordn > 2 * ghost_width)
-  {
-    cout << "Parallel::global_interp can not handle ordn = " << ordn << " > 2*ghost_width = " << 2 * ghost_width << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-
-  double *bbox, *datainbbox;
-  bbox = new double[2 * DIM];
-  datainbbox = new double[2 * DIM];
-
-  int *NN, *ind, *shape;
-  NN = new int[DIM];
-  ind = new int[DIM];
-  shape = new int[DIM];
-
-  for (int i = 0; i < DIM; i++)
-  {
-    ind[i] = int((poX[i] - CoX[i][0]) / (CoX[i][1] - CoX[i][0])) - ordn / 2 + 1;
-    // poX may exactly locate on the boundary (exclude ghost)
-    if (ind[i] == -1 && feq(poX[i], CoX[i][0], (CoX[i][1] - CoX[i][0]) / 2))
-      ind[i] = 0;
-    /*
-         if(ind[i] < 0)
-         {
-           cout<<"Parallel::global_interp error ind["<<i<<"] = "<<ind[i]<<endl;
-           cout<<"pox = "<<poX[i]<<", CoX[0] = "<<CoX[i][0]<<endl;
-           MPI_Abort(MPI_COMM_WORLD,1);
-         }
-    */
-    if (ind[i] == ext[i] - ordn + 1 && feq(poX[i], CoX[i][ext[i] - ordn / 2], (CoX[i][1] - CoX[i][0]) / 2))
-      ind[i] = ext[i] - ordn - 1;
-    /*
-         if(ind[i]+ordn-1 > ext[i]-1)
-         {
-           cout<<"Parallel::global_interp error ind["<<i<<"] = "<<ind[i]<<" + ordn ("<<ordn<<") > ext = "<<ext[i]<<endl;
-           cout<<"pox = "<<poX[i]<<", CoX[ind] = "<<CoX[i][ind[i]]<<", CoX = ("<<CoX[i][0]<<","<<CoX[i][ext[i]-1]<<")"<<endl;
-           MPI_Abort(MPI_COMM_WORLD,1);
-         }
-    */
-    bbox[i] = CoX[i][ind[i]];
-    bbox[DIM + i] = CoX[i][ind[i] + ordn - 1];
-    datainbbox[i] = CoX[i][0];
-    datainbbox[DIM + i] = CoX[i][ext[i] - 1];
-    shape[i] = ordn;
-  }
-
-  NN[DIM - 1] = ordn;
-  for (int i = DIM - 2; i >= 0; i--)
-    NN[i] = NN[i + 1] * ordn;
-
-  double *xpts, *funcvals;
-  xpts = new double[ordn];
-  funcvals = new double[ordn];
-  double *DDd, *DDd1, rr;
-
-  DDd = new double[NN[0]];
-
-  copy(DIM, bbox, bbox + DIM, shape, DDd, datainbbox, datainbbox + DIM, ext, datain, bbox, bbox + DIM);
-
-  for (int i = 0; i < DIM; i++)
-  {
-    for (int j = ind[i]; j < ind[i] + ordn; j++)
-    {
-      xpts[j - ind[i]] = CoX[i][j];
-    }
-
-    if (i < DIM - 1)
-    {
-      DDd1 = new double[NN[i + 1]];
-      for (int j = 0; j < NN[i + 1]; j++)
-      {
-        for (int k = 0; k < ordn; k++)
-          funcvals[k] = DDd[k + j * ordn];
-        DDd1[j] = Lagrangian_Int(poX[i], ordn, xpts, funcvals);
-      }
-      delete[] DDd;
-      DDd = DDd1;
-    }
-    else
-    {
-      for (int j = 0; j < ordn; j++)
-        funcvals[j] = DDd[j];
-      rr = Lagrangian_Int(poX[i], ordn, xpts, funcvals);
-      delete[] DDd1; // since DDd and DDd1 now point to the same stuff, we need delete after above int
-    }
-  }
-
-  delete[] NN;
-  delete[] ind;
-  delete[] xpts;
-  delete[] funcvals;
-  delete[] bbox;
-  delete[] datainbbox;
-  delete[] shape;
-
-  return rr;
-}
-double Parallel::Lagrangian_Int(double x, int npts, double *xpts, double *funcvals)
-{
-  double sum = 0;
-  for (int i = 0; i < npts; i++)
-  {
-    sum = sum + funcvals[i] * LagrangePoly(x, i, npts, xpts);
-  }
-  return sum;
-}
-double Parallel::LagrangePoly(double x, int pt, int npts, double *xpts)
-{
-  double h = 1;
-  int i;
-
-  for (i = 0; i < pt; i++)
-    h = h * (x - xpts[i]) / (xpts[pt] - xpts[i]);
-
-  for (i = pt + 1; i < npts; i++)
-    h = h * (x - xpts[i]) / (xpts[pt] - xpts[i]);
-
-  return h;
-}
-// collect all grid segments or blocks including ghost and buffer for given patch
-MyList<Parallel::gridseg> *Parallel::build_complete_gsl(Patch *Pat)
-{
-  MyList<Parallel::gridseg> *cgsl = 0, *gs;
-  MyList<Block> *BP = Pat->blb;
-  while (BP)
-  {
-    if (!cgsl)
-    {
-      cgsl = gs = new MyList<Parallel::gridseg>; // delete through destroyList();
-      gs->data = new Parallel::gridseg;
-    }
-    else
-    {
-      gs->next = new MyList<Parallel::gridseg>;
-      gs = gs->next;
-      gs->data = new Parallel::gridseg;
-    }
-
-    for (int i = 0; i < dim; i++)
-    {
-      gs->data->llb[i] = BP->data->bbox[i];
-      gs->data->uub[i] = BP->data->bbox[dim + i];
-      gs->data->shape[i] = BP->data->shape[i];
-    }
-    gs->data->Bg = BP->data;
-    gs->next = 0;
-
-    if (BP == Pat->ble)
-      break;
-    BP = BP->next;
-  }
-
-  return cgsl;
-}
-// collect all grid segments or blocks including ghost and buffer for given patch list
-MyList<Parallel::gridseg> *Parallel::build_complete_gsl(MyList<Patch> *PatL)
-{
-  MyList<Parallel::gridseg> *cgsl = 0, *gs;
-  while (PatL)
-  {
-    if (!cgsl)
-    {
-      cgsl = build_complete_gsl(PatL->data);
-      gs = cgsl;
-      while (gs->next)
-        gs = gs->next;
-    }
-    else
-    {
-      gs->next = build_complete_gsl(PatL->data);
-      gs = gs->next;
-      while (gs->next)
-        gs = gs->next;
-    }
-    PatL = PatL->next;
-  }
-
-  return cgsl;
-}
-// cellect the information of Patch list
-MyList<Parallel::gridseg> *Parallel::build_complete_gsl_virtual(MyList<Patch> *PatL)
-{
-  MyList<Parallel::gridseg> *cgsl = 0, *gs;
-  while (PatL)
-  {
-    if (cgsl)
-    {
-      gs->next = new MyList<Parallel::gridseg>;
-      gs = gs->next;
-      gs->data = new Parallel::gridseg;
-    }
-    else
-    {
-      cgsl = gs = new MyList<Parallel::gridseg>;
-      gs->data = new Parallel::gridseg;
-    }
-
-    for (int i = 0; i < dim; i++)
-    {
-      gs->data->llb[i] = PatL->data->bbox[i];
-      gs->data->uub[i] = PatL->data->bbox[dim + i];
-      gs->data->shape[i] = PatL->data->shape[i];
-    }
-    gs->data->Bg = 0;
-    gs->next = 0;
-
-    PatL = PatL->next;
-  }
-
-  return cgsl;
-}
-// cellect the information of Patch list without buffer points
-MyList<Parallel::gridseg> *Parallel::build_complete_gsl_virtual2(MyList<Patch> *PatL) // - buffer
-{
-  MyList<Parallel::gridseg> *cgsl = 0, *gs;
-  while (PatL)
-  {
-    if (cgsl)
-    {
-      gs->next = new MyList<Parallel::gridseg>;
-      gs = gs->next;
-      gs->data = new Parallel::gridseg;
-    }
-    else
-    {
-      cgsl = gs = new MyList<Parallel::gridseg>;
-      gs->data = new Parallel::gridseg;
-    }
-
-    for (int i = 0; i < dim; i++)
-    {
-      double DH = PatL->data->getdX(i);
-      gs->data->llb[i] = PatL->data->bbox[i] + PatL->data->lli[i] * DH;
-      gs->data->uub[i] = PatL->data->bbox[dim + i] - PatL->data->uui[i] * DH;
-      gs->data->shape[i] = PatL->data->shape[i] - PatL->data->lli[i] - PatL->data->uui[i];
-    }
-    gs->data->Bg = 0;
-    gs->next = 0;
-
-    PatL = PatL->next;
-  }
-
-  return cgsl;
-}
-// collect all grid segments or blocks without ghost for given patch, without extension
-MyList<Parallel::gridseg> *Parallel::build_bulk_gsl(Patch *Pat)
-{
-  MyList<Parallel::gridseg> *cgsl = 0, *gs;
-  MyList<Block> *BP = Pat->blb;
-  while (BP)
-  {
-    Block *bp = BP->data;
-    if (!cgsl)
-    {
-      cgsl = gs = new MyList<Parallel::gridseg>;
-      gs->data = new Parallel::gridseg;
-    }
-    else
-    {
-      gs->next = new MyList<Parallel::gridseg>;
-      gs = gs->next;
-      gs->data = new Parallel::gridseg;
-    }
-
-    for (int i = 0; i < dim; i++)
-    {
-      double DH = bp->getdX(i);
-      gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] : bp->bbox[dim + i] - ghost_width * DH;
-      gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] : bp->bbox[i] + ghost_width * DH;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-      gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
-#else
-#ifdef Cell
-      gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-    }
-    gs->data->Bg = BP->data;
-    gs->next = 0;
-
-    if (BP == Pat->ble)
-      break;
-    BP = BP->next;
-  }
-
-  return cgsl;
-}
-// bulk part for given Block within given patch, without extension
-MyList<Parallel::gridseg> *Parallel::build_bulk_gsl(Block *bp, Patch *Pat)
-{
-  MyList<Parallel::gridseg> *gs = 0;
-
-  gs = new MyList<Parallel::gridseg>;
-  gs->data = new Parallel::gridseg;
-
-  for (int i = 0; i < dim; i++)
-  {
-    double DH = bp->getdX(i);
-    gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] : bp->bbox[dim + i] - ghost_width * DH;
-    gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] : bp->bbox[i] + ghost_width * DH;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-    gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
-#else
-#ifdef Cell
-    gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-  }
-  gs->data->Bg = bp;
-  gs->next = 0;
-
-  return gs;
-}
-MyList<Parallel::gridseg> *Parallel::clone_gsl(MyList<Parallel::gridseg> *p, bool first_only)
-{
-  MyList<Parallel::gridseg> *np = 0, *q = 0, *pq = 0;
-
-  while (p)
-  {
-    q = new MyList<Parallel::gridseg>;
-    q->data = new Parallel::gridseg;
-    q->data->Bg = p->data->Bg;
-    for (int i = 0; i < dim; i++)
-    {
-      q->data->llb[i] = p->data->llb[i];
-      q->data->uub[i] = p->data->uub[i];
-      q->data->shape[i] = p->data->shape[i];
-    }
-    if (pq)
-      pq->next = q;
-    else
-      np = q;
-    if (first_only)
-    {
-      np->next = 0;
-      return np;
-    }
-    pq = q;
-    p = p->next;
-  }
-  return np;
-}
-MyList<Parallel::gridseg> *Parallel::gs_subtract(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B)
-{
-  if (!A)
-    return 0;
-  if (!B)
-    return clone_gsl(A, true);
-
-  double cut_plane[2 * dim], DH[dim];
-
-  for (int i = 0; i < dim; i++)
-  {
-    DH[i] = A->data->Bg->getdX(i);
-    if (B->data->Bg && !feq(DH[i], B->data->Bg->getdX(i), DH[i] / 2))
-    {
-      cout << "Parallel::gs_subtract meets different grid segment " << DH[i] << " vs " << B->data->Bg->getdX(i) << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-  }
-
-  MyList<Parallel::gridseg> *C = 0, *q;
-  for (int i = 0; i < dim; i++)
-  {
-    if (B->data->llb[i] > A->data->uub[i] || B->data->uub[i] < A->data->llb[i])
-      return clone_gsl(A, true);
-    cut_plane[i] = A->data->llb[i];
-    cut_plane[i + dim] = A->data->uub[i];
-  }
-
-  for (int i = 0; i < dim; i++)
-  {
-    cut_plane[i] = Mymax(A->data->llb[i], B->data->llb[i]);
-    if (cut_plane[i] - A->data->llb[i] > DH[i] / 2)
-    {
-      q = clone_gsl(A, true);
-      // prolong the list from head
-      if (C)
-        q->next = C;
-      C = q;
-      for (int j = 0; j < dim; j++)
-      {
-        if (i == j)
-        {
-          C->data->llb[i] = A->data->llb[i];
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-          C->data->uub[i] = Mymax(C->data->llb[i], cut_plane[i] - DH[i]);
-#else
-#ifdef Cell
-          C->data->uub[i] = Mymax(C->data->llb[i], cut_plane[i]);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-        }
-        else
-        {
-          C->data->llb[j] = cut_plane[j];
-          C->data->uub[j] = cut_plane[j + dim];
-        }
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4) + 1;
-#else
-#ifdef Cell
-        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      }
-    }
-
-    cut_plane[i + dim] = Mymin(A->data->uub[i], B->data->uub[i]);
-    if (A->data->uub[i] - cut_plane[i + dim] > DH[i] / 2)
-    {
-      q = clone_gsl(A, true);
-      if (C)
-        q->next = C;
-      C = q;
-      for (int j = 0; j < dim; j++)
-      {
-        if (i == j)
-        {
-          C->data->uub[i] = A->data->uub[i];
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-          C->data->llb[i] = Mymin(C->data->uub[i], cut_plane[i + dim] + DH[i]);
-#else
-#ifdef Cell
-          C->data->llb[i] = Mymin(C->data->uub[i], cut_plane[i + dim]);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-        }
-        else
-        {
-          C->data->llb[j] = cut_plane[j];
-          C->data->uub[j] = cut_plane[j + dim];
-        }
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4) + 1;
-#else
-#ifdef Cell
-        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      }
-    }
-  }
-  return C;
-}
-// stupid method
-/*
-MyList<Parallel::gridseg> *Parallel::gsl_subtract(MyList<Parallel::gridseg> *A,MyList<Parallel::gridseg> *B) //A subtract B but with A's information
-{
-// always make return and A, B distinct
-  if(!A) return 0;
-
-  if(!B) return clone_gsl(A,0);
-
-  MyList<Parallel::gridseg> *C=0,*C0,*C1,*Cc,*CC0,*gs;
-
-  while(A)
-  {
-     C0=gs_subtract(A,B);  // note C0 becomes a list after subtraction
-     C1=B->next;
-     while(C1)
-     {
-  CC0=C0;
-  Cc=0;
-  while(CC0)
-  {
-    gs=gs_subtract(CC0,C1);
-    if(Cc) Cc->catList(gs);
-    else   Cc=gs;
-    CC0=CC0->next;
-  }
-  if(C0) C0->destroyList();
-  C0=Cc;
-  C1=C1->next;
-     }
-     if(C) C->catList(C0);
-     else  C=C0;
-     A=A->next;
-  }
-
-  return C;
-}
-*/
-// more clever method
-MyList<Parallel::gridseg> *Parallel::gsl_subtract(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B) // A subtract B but with A's information
-{
-  // always make return and A, B distinct
-  if (!A)
-    return 0;
-
-  MyList<Parallel::gridseg> *C = 0, *C0, *C1;
-
-  C = clone_gsl(A, 0);
-
-  while (B)
-  {
-    C0 = 0;
-    C1 = C;
-    while (C1)
-    {
-      if (C0)
-        C0->catList(gs_subtract(C1, B));
-      else
-        C0 = gs_subtract(C1, B);
-      C1 = C1->next;
-    }
-    if (C)
-      C->destroyList();
-    else
-    {
-      if (C0)
-        C0->destroyList();
-      return 0;
-    }
-
-    C = C0;
-    B = B->next;
-  }
-
-  return C;
-}
-MyList<Parallel::gridseg> *Parallel::gs_and(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B)
-{
-  if (!A || !B)
-    return 0;
-
-  double llb[dim], uub[dim];
-  bool flag = false;
-  for (int i = 0; i < dim; i++)
-  {
-    llb[i] = Mymax(A->data->llb[i], B->data->llb[i]);
-    uub[i] = Mymin(A->data->uub[i], B->data->uub[i]);
-    if (llb[i] > uub[i])
-    {
-      flag = true;
-      break;
-    }
-  }
-  if (flag)
-    return 0;
-
-  MyList<Parallel::gridseg> *C;
-  C = clone_gsl(A, true);
-  for (int i = 0; i < dim; i++)
-  {
-    C->data->llb[i] = llb[i];
-    C->data->uub[i] = uub[i];
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-    C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / C->data->Bg->getdX(i) + 0.4) + 1;
-#else
-#ifdef Cell
-    C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / C->data->Bg->getdX(i) + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-  }
-
-  return C;
-}
-// overlap of A_i and (union of all j of B_j)
-MyList<Parallel::gridseg> *Parallel::gsl_and(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B) // A and B but with A's information
-{
-  MyList<Parallel::gridseg> *C = 0, *C1;
-
-  while (A)
-  {
-    C1 = B;
-    while (C1)
-    {
-      if (C)
-        C->catList(gs_and(A, C1));
-      else
-        C = gs_and(A, C1);
-      C1 = C1->next;
-    }
-    A = A->next;
-  }
-  return C;
-}
-// collect all ghost grid segments or blocks for given patch
-MyList<Parallel::gridseg> *Parallel::build_ghost_gsl(Patch *Pat)
-{
-  MyList<Parallel::gridseg> *cgsl = 0, *gs, *gsb;
-  MyList<Block> *BP = Pat->blb;
-  while (BP)
-  {
-    gs = new MyList<Parallel::gridseg>;
-    gs->data = new Parallel::gridseg;
-
-    for (int i = 0; i < dim; i++)
-    {
-      gs->data->llb[i] = BP->data->bbox[i];
-      gs->data->uub[i] = BP->data->bbox[dim + i];
-      gs->data->shape[i] = BP->data->shape[i];
-    }
-    gs->data->Bg = BP->data;
-    gs->next = 0;
-
-    gsb = build_bulk_gsl(BP->data, Pat);
-
-    if (!cgsl)
-      cgsl = gs_subtract(gs, gsb);
-    else
-      cgsl->catList(gs_subtract(gs, gsb));
-
-    gsb->destroyList();
-    gs->destroyList();
-
-    if (BP == Pat->ble)
-      break;
-    BP = BP->next;
-  }
-
-  return cgsl;
-}
-// collect all ghost grid segments or blocks for given patch list
-MyList<Parallel::gridseg> *Parallel::build_ghost_gsl(MyList<Patch> *PatL)
-{
-  MyList<Parallel::gridseg> *cgsl = 0, *gs;
-  while (PatL)
-  {
-    if (!cgsl)
-    {
-      cgsl = build_ghost_gsl(PatL->data);
-      gs = cgsl;
-      while (gs->next)
-        gs = gs->next;
-    }
-    else
-    {
-      gs->next = build_ghost_gsl(PatL->data);
-      gs = gs->next;
-      while (gs->next)
-        gs = gs->next;
-    }
-    PatL = PatL->next;
-  }
-
-  return cgsl;
-}
-// collect all grid segments or blocks without ghost for given patch
-// special for Sync usage, so we do not need consider missing points
-MyList<Parallel::gridseg> *Parallel::build_owned_gsl0(Patch *Pat, int rank_in)
-{
-  MyList<Parallel::gridseg> *cgsl = 0, *gs;
-  MyList<Block> *BP = Pat->blb;
-  while (BP)
-  {
-    Block *bp = BP->data;
-    if (bp->rank == rank_in)
-    {
-      if (!cgsl)
-      {
-        cgsl = gs = new MyList<Parallel::gridseg>;
-        gs->data = new Parallel::gridseg;
-      }
-      else
-      {
-        gs->next = new MyList<Parallel::gridseg>;
-        gs = gs->next;
-        gs->data = new Parallel::gridseg;
-      }
-
-      for (int i = 0; i < dim; i++)
-      {
-        double DH = bp->getdX(i);
-        gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] : bp->bbox[dim + i] - ghost_width * DH;
-        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] : bp->bbox[i] + ghost_width * DH;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
-#else
-#ifdef Cell
-        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      }
-      gs->data->Bg = BP->data;
-      gs->next = 0;
-    }
-
-    if (BP == Pat->ble)
-      break;
-    BP = BP->next;
-  }
-
-  return cgsl;
-}
-// collect all grid segments or blocks without ghost for given patch
-MyList<Parallel::gridseg> *Parallel::build_owned_gsl1(Patch *Pat, int rank_in)
-{
-  MyList<Parallel::gridseg> *cgsl = 0, *gs;
-  MyList<Block> *BP = Pat->blb;
-  while (BP)
-  {
-    Block *bp = BP->data;
-    if (bp->rank == rank_in)
-    {
-      if (!cgsl)
-      {
-        cgsl = gs = new MyList<Parallel::gridseg>;
-        gs->data = new Parallel::gridseg;
-      }
-      else
-      {
-        gs->next = new MyList<Parallel::gridseg>;
-        gs = gs->next;
-        gs->data = new Parallel::gridseg;
-      }
-
-      for (int i = 0; i < dim; i++)
-      {
-        double DH = bp->getdX(i);
-        gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] : bp->bbox[dim + i] - ghost_width * DH;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        // NOTE: our dividing structure is (exclude ghost)
-        // -1 0
-        //       1  2
-        // so (0,1) does not belong to any part for vertex structure, we always put it to right part, this is consistent to
-        // the fortran routine where we always take floor to get index
-        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] : bp->bbox[i] + (ghost_width - 1) * DH;
-        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
-#else
-#ifdef Cell
-        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] : bp->bbox[i] + ghost_width * DH;
-        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      }
-      gs->data->Bg = BP->data;
-      gs->next = 0;
-    }
-
-    if (BP == Pat->ble)
-      break;
-    BP = BP->next;
-  }
-
-  return cgsl;
-}
-// collect all grid segments or blocks without ghost nor buffer for given patch
-MyList<Parallel::gridseg> *Parallel::build_owned_gsl2(Patch *Pat, int rank_in)
-{
-  MyList<Parallel::gridseg> *cgsl = 0, *gs;
-  MyList<Block> *BP = Pat->blb;
-  while (BP)
-  {
-    Block *bp = BP->data;
-    if (bp->rank == rank_in)
-    {
-      if (!cgsl)
-      {
-        cgsl = gs = new MyList<Parallel::gridseg>;
-        gs->data = new Parallel::gridseg;
-      }
-      else
-      {
-        gs->next = new MyList<Parallel::gridseg>;
-        gs = gs->next;
-        gs->data = new Parallel::gridseg;
-      }
-
-      for (int i = 0; i < dim; i++)
-      {
-        double DH = bp->getdX(i);
-        gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] - Pat->uui[i] * DH : bp->bbox[dim + i] - ghost_width * DH;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        // NOTE: our dividing structure is (exclude ghost)
-        // -1 0
-        //       1  2
-        // so (0,1) does not belong to any part for vertex structure, we always put it to right part, this is consistent to
-        // the fortran routine where we always take floor to get index
-        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] + Pat->lli[i] * DH : bp->bbox[i] + (ghost_width - 1) * DH;
-        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
-#else
-#ifdef Cell
-        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] + Pat->lli[i] * DH : bp->bbox[i] + ghost_width * DH;
-        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      }
-      gs->data->Bg = BP->data;
-      gs->next = 0;
-    }
-
-    if (BP == Pat->ble)
-      break;
-    BP = BP->next;
-  }
-
-  return cgsl;
-}
-// collect all grid segments or blocks without ghost for given patch, and delete the ghost_width for interpolation consideration on the patch boundary
-MyList<Parallel::gridseg> *Parallel::build_owned_gsl3(Patch *Pat, int rank_in, int Symmetry)
-{
-  MyList<Parallel::gridseg> *cgsl = 0, *gs;
-  MyList<Block> *BP = Pat->blb;
-  while (BP)
-  {
-    Block *bp = BP->data;
-    if (bp->rank == rank_in)
-    {
-      if (!cgsl)
-      {
-        cgsl = gs = new MyList<Parallel::gridseg>;
-        gs->data = new Parallel::gridseg;
-      }
-      else
-      {
-        gs->next = new MyList<Parallel::gridseg>;
-        gs = gs->next;
-        gs->data = new Parallel::gridseg;
-      }
-
-      for (int i = 0; i < dim; i++)
-      {
-        double DH = bp->getdX(i);
-        gs->data->uub[i] = bp->bbox[dim + i] - ghost_width * DH;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        // NOTE: our dividing structure is (exclude ghost)
-        // -1 0
-        //       1  2
-        // so (0,1) does not belong to any part for vertex structure, we always put it to right part, this is consistent to
-        // the fortran routine where we always take floor to get index
-        gs->data->llb[i] = bp->bbox[i] + (ghost_width - 1) * DH;
-        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
-#else
-#ifdef Cell
-        gs->data->llb[i] = bp->bbox[i] + ghost_width * DH;
-        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      }
-      // Symmetry consideration
-      if (Symmetry > 0)
-      {
-        double DH = bp->getdX(2);
-        if (feq(bp->bbox[2], 0, DH / 2))
-        {
-          gs->data->llb[2] = bp->bbox[2];
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-          gs->data->shape[2] = int((gs->data->uub[2] - gs->data->llb[2]) / DH + 0.4) + 1;
-#else
-#ifdef Cell
-          gs->data->shape[2] = int((gs->data->uub[2] - gs->data->llb[2]) / DH + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-        }
-        if (Symmetry > 1)
-        {
-          for (int i = 0; i < 2; i++)
-          {
-            DH = bp->getdX(i);
-            if (feq(bp->bbox[i], 0, DH / 2))
-            {
-              gs->data->llb[i] = bp->bbox[i];
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-              gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
-#else
-#ifdef Cell
-              gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-            }
-          }
-        }
-      }
-
-      gs->data->Bg = BP->data;
-      gs->next = 0;
-    }
-
-    if (BP == Pat->ble)
-      break;
-    BP = BP->next;
-  }
-
-  return cgsl;
-}
-// collect all grid segments or blocks without ghost nor buffer for given patch,
-// and delete the ghost_width for interpolation consideration on the patch boundary
-MyList<Parallel::gridseg> *Parallel::build_owned_gsl4(Patch *Pat, int rank_in, int Symmetry)
-{
-  MyList<Parallel::gridseg> *cgsl = 0, *gs;
-  MyList<Block> *BP = Pat->blb;
-  while (BP)
-  {
-    Block *bp = BP->data;
-    if (bp->rank == rank_in)
-    {
-      if (!cgsl)
-      {
-        cgsl = gs = new MyList<Parallel::gridseg>;
-        gs->data = new Parallel::gridseg;
-      }
-      else
-      {
-        gs->next = new MyList<Parallel::gridseg>;
-        gs = gs->next;
-        gs->data = new Parallel::gridseg;
-      }
-
-      for (int i = 0; i < dim; i++)
-      {
-        double DH = bp->getdX(i);
-        gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] - Pat->uui[i] * DH : bp->bbox[dim + i];
-        gs->data->uub[i] -= ghost_width * DH;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        // NOTE: our dividing structure is (exclude ghost)
-        // -1 0
-        //       1  2
-        // so (0,1) does not belong to any part for vertex structure, we always put it to right part, this is consistent to
-        // the fortran routine where we always take floor to get index
-        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] + Pat->lli[i] * DH : bp->bbox[i];
-        gs->data->llb[i] += (ghost_width - 1) * DH;
-        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
-#else
-#ifdef Cell
-        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] + Pat->lli[i] * DH : bp->bbox[i];
-        gs->data->llb[i] += ghost_width * DH;
-        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      }
-      // Symmetry consideration
-      if (Symmetry > 0)
-      {
-        double DH = bp->getdX(2);
-        if (feq(bp->bbox[2], 0, DH / 2))
-        {
-          gs->data->llb[2] = bp->bbox[2];
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-          gs->data->shape[2] = int((gs->data->uub[2] - gs->data->llb[2]) / DH + 0.4) + 1;
-#else
-#ifdef Cell
-          gs->data->shape[2] = int((gs->data->uub[2] - gs->data->llb[2]) / DH + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-        }
-        if (Symmetry > 1)
-        {
-          for (int i = 0; i < 2; i++)
-          {
-            DH = bp->getdX(i);
-            if (feq(bp->bbox[i], 0, DH / 2))
-            {
-              gs->data->llb[i] = bp->bbox[i];
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-              gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
-#else
-#ifdef Cell
-              gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-            }
-          }
-        }
-      }
-
-      gs->data->Bg = BP->data;
-      gs->next = 0;
-    }
-
-    if (BP == Pat->ble)
-      break;
-    BP = BP->next;
-  }
-
-  return cgsl;
-}
-// collect all grid segments or blocks without ghost nor buffer for given patch, no extention
-MyList<Parallel::gridseg> *Parallel::build_owned_gsl5(Patch *Pat, int rank_in)
-{
-  MyList<Parallel::gridseg> *cgsl = 0, *gs;
-  MyList<Block> *BP = Pat->blb;
-  while (BP)
-  {
-    Block *bp = BP->data;
-    if (bp->rank == rank_in)
-    {
-      if (!cgsl)
-      {
-        cgsl = gs = new MyList<Parallel::gridseg>;
-        gs->data = new Parallel::gridseg;
-      }
-      else
-      {
-        gs->next = new MyList<Parallel::gridseg>;
-        gs = gs->next;
-        gs->data = new Parallel::gridseg;
-      }
-
-      for (int i = 0; i < dim; i++)
-      {
-        double DH = bp->getdX(i);
-        gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] - Pat->uui[i] * DH : bp->bbox[dim + i] - ghost_width * DH;
-        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] + Pat->lli[i] * DH : bp->bbox[i] + ghost_width * DH;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
-#else
-#ifdef Cell
-        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      }
-      gs->data->Bg = BP->data;
-      gs->next = 0;
-    }
-
-    if (BP == Pat->ble)
-      break;
-    BP = BP->next;
-  }
-
-  return cgsl;
-}
-// collect all grid segments or blocks without ghost for given patch list
-// stupid method
-/*
-MyList<Parallel::gridseg> *Parallel::build_owned_gsl(MyList<Patch> *PatL,int rank_in,int type,int Symmetry)
-{
-       MyList<Parallel::gridseg> *cgsl=0,*gs;
-       while(PatL)
-       {
-    if(!cgsl)
-    {
-            switch(type)
-      {
-         case 0:
-                  cgsl = build_owned_gsl0(PatL->data,rank_in);
-      break;
-         case 1:
-                  cgsl = build_owned_gsl1(PatL->data,rank_in);
-      break;
-         case 2:
-                  cgsl = build_owned_gsl2(PatL->data,rank_in);
-      break;
-         case 3:
-                  cgsl = build_owned_gsl3(PatL->data,rank_in,Symmetry);
-      break;
-         case 4:
-                  cgsl = build_owned_gsl4(PatL->data,rank_in,Symmetry);
-      break;
-         case 5:
-                  cgsl = build_owned_gsl5(PatL->data,rank_in);
-      break;
-               default:
-      cout<<"Parallel::build_owned_gsl : unknown type = "<<type<<endl;
-                  MPI_Abort(MPI_COMM_WORLD,1);
-      }
-       gs = cgsl;
-       while(gs && gs->next) gs = gs->next;
-    }
-    else
-    {
-       switch(type)
-      {
-         case 0:
-                  gs->next = build_owned_gsl0(PatL->data,rank_in);
-      break;
-         case 1:
-                  gs->next = build_owned_gsl1(PatL->data,rank_in);
-      break;
-         case 2:
-                  gs->next = build_owned_gsl2(PatL->data,rank_in);
-      break;
-         case 3:
-                  gs->next = build_owned_gsl3(PatL->data,rank_in,Symmetry);
-      break;
-         case 4:
-                  gs->next = build_owned_gsl4(PatL->data,rank_in,Symmetry);
-      break;
-         case 5:
-                  gs->next = build_owned_gsl5(PatL->data,rank_in);
-      break;
-               default:
-      cout<<"Parallel::build_owned_gsl : unknown type = "<<type<<endl;
-                  MPI_Abort(MPI_COMM_WORLD,1);
-      }
-       while(gs && gs->next) gs = gs->next;
-    }
-    PatL = PatL->next;
-       }
-
-       return cgsl;
-}
-*/
-// more clever method
-MyList<Parallel::gridseg> *Parallel::build_owned_gsl(MyList<Patch> *PatL, int rank_in, int type, int Symmetry)
-{
-  MyList<Parallel::gridseg> *cgsl = 0, *gs;
-  while (PatL)
-  {
-    switch (type)
-    {
-    case 0:
-      gs = build_owned_gsl0(PatL->data, rank_in);
-      break;
-    case 1:
-      gs = build_owned_gsl1(PatL->data, rank_in);
-      break;
-    case 2:
-      gs = build_owned_gsl2(PatL->data, rank_in);
-      break;
-    case 3:
-      gs = build_owned_gsl3(PatL->data, rank_in, Symmetry);
-      break;
-    case 4:
-      gs = build_owned_gsl4(PatL->data, rank_in, Symmetry);
-      break;
-    case 5:
-      gs = build_owned_gsl5(PatL->data, rank_in);
-      break;
-    default:
-      cout << "Parallel::build_owned_gsl : unknown type = " << type << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-    if (cgsl)
-      cgsl->catList(gs);
-    else
-      cgsl = gs;
-    PatL = PatL->next;
-  }
-
-  return cgsl;
-}
-// according to overlape to determine real grid segments
-void Parallel::build_gstl(MyList<Parallel::gridseg> *srci, MyList<Parallel::gridseg> *dsti,
-                          MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst)
-{
-  *out_src = *out_dst = 0;
-
-  if (!srci || !dsti)
-    return;
-
-  MyList<Parallel::gridseg> *s, *d;
-  MyList<Parallel::gridseg> *s2, *d2;
-
-  double llb[dim], uub[dim];
-
-  s = srci;
-  while (s)
-  {
-    Parallel::gridseg *sd = s->data;
-    d = dsti;
-    while (d)
-    {
-      Parallel::gridseg *dd = d->data;
-      bool flag = true;
-      for (int i = 0; i < dim; i++)
-      {
-        double SH = sd->Bg->getdX(i), DH = dd->Bg->getdX(i);
-        llb[i] = Mymax(sd->llb[i], dd->llb[i]);
-        uub[i] = Mymin(sd->uub[i], dd->uub[i]);
-        // make sure the region boundary is consistent to the grids
-        // here we only judge if the domain is empty, so do not need to adjust the align
-        double lb = llb[i], ub = uub[i];
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        // ---*---
-        // x-------x
-        //		if     (int(2*(sd->uub[i]-uub[i])/SH+0.4)%2 == 1) ub = uub[i]-SH/2;
-        //		else if(int(2*(dd->uub[i]-uub[i])/DH+0.4)%2 == 1) ub = uub[i]-DH/2;
-        //		if     (int(2*(llb[i]-sd->llb[i])/SH+0.4)%2 == 1) lb = llb[i]+SH/2;
-        //		else if(int(2*(llb[i]-dd->llb[i])/DH+0.4)%2 == 1) lb = llb[i]+DH/2;
-        if (lb > ub + Mymin(SH, DH) / 2)
-        {
-          flag = false;
-          break;
-        } // special for isolated point
-#else
-#ifdef Cell
-        // |------|
-        // |-------------|
-        //		if     (int(2*(sd->uub[i]-uub[i])/SH+0.4)%2 == 1) ub = uub[i]+SH/2;
-        //		else if(int(2*(dd->uub[i]-uub[i])/DH+0.4)%2 == 1) ub = uub[i]+DH/2;
-        //        |------|
-        // |-------------|
-        //		if     (int(2*(llb[i]-sd->llb[i])/SH+0.4)%2 == 1) lb = llb[i]-SH/2;
-        //		else if(int(2*(llb[i]-dd->llb[i])/DH+0.4)%2 == 1) lb = llb[i]-DH/2;
-        if (ub - lb < Mymin(SH, DH) / 2)
-        {
-          flag = false;
-          break;
-        } // even for isolated point, it has a cell belong to it
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      }
-
-      if (flag)
-      {
-        if (!(*out_src))
-        {
-          *out_src = s2 = new MyList<Parallel::gridseg>;
-          *out_dst = d2 = new MyList<Parallel::gridseg>;
-          s2->data = new Parallel::gridseg;
-          d2->data = new Parallel::gridseg;
-        }
-        else
-        {
-          s2->next = new MyList<Parallel::gridseg>;
-          s2 = s2->next;
-          d2->next = new MyList<Parallel::gridseg>;
-          d2 = d2->next;
-          s2->data = new Parallel::gridseg;
-          d2->data = new Parallel::gridseg;
-        }
-
-        for (int i = 0; i < dim; i++)
-        {
-          double SH = sd->Bg->getdX(i), DH = dd->Bg->getdX(i);
-          s2->data->llb[i] = d2->data->llb[i] = llb[i];
-          s2->data->uub[i] = d2->data->uub[i] = uub[i];
-// using float method to count point, we do not need following consideration (2012 nov 17)
-#if 1
-
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-          // old code distuinguish vertex and cell
-          //		   if     (int(2*(sd->uub[i]-uub[i])/SH+0.4)%2 == 1) s2->data->uub[i] = uub[i]-SH/2;
-          //		   else if(int(2*(dd->uub[i]-uub[i])/DH+0.4)%2 == 1) d2->data->uub[i] = uub[i]-DH/2;
-          //	           if     (int(2*(llb[i]-sd->llb[i])/SH+0.4)%2 == 1) s2->data->llb[i] = llb[i]+SH/2;
-          //		   else if(int(2*(llb[i]-dd->llb[i])/DH+0.4)%2 == 1) d2->data->llb[i] = llb[i]+DH/2;
-          // new code: here we concern much more about missing point, because overlaping domain has been gaureented above
-          if (int(2 * (sd->uub[i] - uub[i]) / SH + 0.4) % 2 == 1)
-            s2->data->uub[i] = uub[i] + SH / 2;
-          else if (int(2 * (dd->uub[i] - uub[i]) / DH + 0.4) % 2 == 1)
-            d2->data->uub[i] = uub[i] + DH / 2;
-          if (int(2 * (llb[i] - sd->llb[i]) / SH + 0.4) % 2 == 1)
-            s2->data->llb[i] = llb[i] - SH / 2;
-          else if (int(2 * (llb[i] - dd->llb[i]) / DH + 0.4) % 2 == 1)
-            d2->data->llb[i] = llb[i] - DH / 2;
-          s2->data->shape[i] = int((s2->data->uub[i] - s2->data->llb[i]) / SH + 0.4) + 1;
-          d2->data->shape[i] = int((d2->data->uub[i] - d2->data->llb[i]) / DH + 0.4) + 1;
-#else
-#ifdef Cell
-          if (int(2 * (sd->uub[i] - uub[i]) / SH + 0.4) % 2 == 1)
-            s2->data->uub[i] = uub[i] + SH / 2;
-          else if (int(2 * (dd->uub[i] - uub[i]) / DH + 0.4) % 2 == 1)
-            d2->data->uub[i] = uub[i] + DH / 2;
-          if (int(2 * (llb[i] - sd->llb[i]) / SH + 0.4) % 2 == 1)
-            s2->data->llb[i] = llb[i] - SH / 2;
-          else if (int(2 * (llb[i] - dd->llb[i]) / DH + 0.4) % 2 == 1)
-            d2->data->llb[i] = llb[i] - DH / 2;
-          s2->data->shape[i] = int((s2->data->uub[i] - s2->data->llb[i]) / SH + 0.4);
-          d2->data->shape[i] = int((d2->data->uub[i] - d2->data->llb[i]) / DH + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-
-#endif
-          s2->data->illb[i] = sd->illb[i];
-          d2->data->illb[i] = dd->illb[i];
-          s2->data->iuub[i] = sd->iuub[i];
-          d2->data->iuub[i] = dd->iuub[i];
-        }
-        s2->data->Bg = sd->Bg;
-        s2->next = 0;
-        d2->data->Bg = dd->Bg;
-        d2->next = 0;
-      }
-      d = d->next;
-    }
-    s = s->next;
-  }
-}
-//   PACK: prepare target data in 'data'
-// UNPACK: copy target data from 'data' to corresponding numerical grids
-int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<Parallel::gridseg> *dst, int rank_in, int dir,
-                          MyList<var> *VarLists /* source */, MyList<var> *VarListd /* target */, int Symmetry)
-{
-  int myrank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
-  int DIM = dim;
-
-  if (dir != PACK && dir != UNPACK)
-  {
-    cout << "error dir " << dir << " for data_packer " << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-
-  int size_out = 0;
-
-  if (!src || !dst)
-    return size_out;
-
+  nx = Mymax(1, shape / min_width);
+  nx = Mymin(cpusize, nx);
+
+  return nx;
+}
+// Choose an nx-by-ny processor layout for a 2D patch (special for 2 dimensions).
+// Writes the layout to nxy[0..1] and returns nx * ny.
+int Parallel::partition2(int *nxy, int split_size, int *min_width, int cpusize, int *shape)
+{
+#define SEARCH_SIZE 5
+  // target number of pieces: patch size over split_size, rounded to nearest, at least 1
+  const int cells = shape[0] * shape[1];
+  const int target = Mymax(1, (cells + split_size / 2) / split_size);
+
+  // per-direction upper limits: respect min_width and never exceed cpusize
+  int cap_x = Mymax(1, shape[0] / min_width[0]);
+  cap_x = Mymin(cpusize, cap_x);
+  int cap_y = Mymax(1, shape[1] / min_width[1]);
+  cap_y = Mymin(cpusize, cap_y);
+
+  // initial guess weighted by the aspect ratio of the patch
+  const double wx = (double)shape[0] / (shape[0] + shape[1]);
+  const double wy = (double)shape[1] / (shape[0] + shape[1]);
+  const int guess_x = Mymax(1, Mymin(cap_x, (int)(sqrt(double(target)) * wx / wy)));
+  const int guess_y = Mymax(1, Mymin(cap_y, (int)(sqrt(double(target)) * wy / wx)));
+
+  int best_x = guess_x, best_y = guess_y;
+  int best_miss = abs(target - best_x * best_y);
+  int best_width = Mymin(shape[0] / best_x, shape[1] / best_y);
+
+  // scan a window around the guess: smallest miss wins, ties go to the wider pieces
+  const int top_y = Mymin(guess_y + SEARCH_SIZE, cap_y), top_x = Mymin(guess_x + SEARCH_SIZE, cap_x);
+  for (int ty = Mymax(1, guess_y - SEARCH_SIZE); ty <= top_y; ty++)
+    for (int tx = Mymax(1, guess_x - SEARCH_SIZE); tx <= top_x; tx++)
+    {
+      const int miss = abs(target - tx * ty);
+      const int width = Mymin(shape[0] / tx, shape[1] / ty);
+      if (miss > best_miss || (miss == best_miss && width <= best_width))
+        continue; // not strictly better than the current choice
+      best_miss = miss;
+      best_x = tx;
+      best_y = ty;
+      best_width = width;
+    }
+  nxy[0] = best_x;
+  nxy[1] = best_y;
+  return best_x * best_y;
+#undef SEARCH_SIZE
+}
+// Choose an nx-by-ny-by-nz processor layout for a 3D patch (special for 3 dimensions).
+// Writes the layout to nxyz[0..2] and returns nx * ny * nz.
+int Parallel::partition3(int *nxyz, int split_size, int *min_width, int cpusize, int *shape)
+#if 1 // algorithm from Pretorius
+{
+#define SEARCH_SIZE 5
+  // target number of pieces: patch size over split_size, rounded to nearest, at least 1
+  const int cells = shape[0] * shape[1] * shape[2];
+  const int target = Mymax(1, (cells + split_size / 2) / split_size);
+
+  // per-direction upper limits: respect min_width and never exceed cpusize
+  int cap[3];
+  for (int a = 0; a < 3; a++)
+  {
+    cap[a] = Mymax(1, shape[a] / min_width[a]);
+    cap[a] = Mymin(cpusize, cap[a]);
+  }
+
+  // each direction's share of the summed extents
+  const double wx = (double)shape[0] / (shape[0] + shape[1] + shape[2]);
+  const double wy = (double)shape[1] / (shape[0] + shape[1] + shape[2]);
+  const double wz = (double)shape[2] / (shape[0] + shape[1] + shape[2]);
+  const double big_xy = Mymax(wx, wy);
+  const double big_xz = Mymax(wx, wz);
+  const double big_yz = Mymax(wy, wz);
+
+  // initial guess: cube root of the target, scaled by the share relative to the other two
+  const int guess_x = Mymax(1, Mymin(cap[0], (int)(pow(target, 1.0 / 3.0) * wx / big_yz)));
+  const int guess_y = Mymax(1, Mymin(cap[1], (int)(pow(target, 1.0 / 3.0) * wy / big_xz)));
+  const int guess_z = Mymax(1, Mymin(cap[2], (int)(pow(target, 1.0 / 3.0) * wz / big_xy)));
+
+  int best_x = guess_x, best_y = guess_y, best_z = guess_z;
+  int best_miss = abs(target - best_x * best_y * best_z);
+  int best_width = Mymin(shape[2] / best_z, shape[1] / best_y);
+  best_width = Mymin(best_width, shape[0] / best_x);
+
+  // scan a window around the guess: smallest miss wins, ties go to the wider pieces
+  for (int tz = Mymax(1, guess_z - SEARCH_SIZE); tz <= (Mymin(guess_z + SEARCH_SIZE, cap[2])); tz++)
+    for (int ty = Mymax(1, guess_y - SEARCH_SIZE); ty <= (Mymin(guess_y + SEARCH_SIZE, cap[1])); ty++)
+      for (int tx = Mymax(1, guess_x - SEARCH_SIZE); tx <= (Mymin(guess_x + SEARCH_SIZE, cap[0])); tx++)
+      {
+        const int miss = abs(target - tx * ty * tz);
+        int width = Mymin(shape[2] / tz, shape[1] / ty);
+        width = Mymin(width, shape[0] / tx);
+        if (miss > best_miss || (miss == best_miss && width <= best_width))
+          continue; // not strictly better than the current choice
+        best_miss = miss;
+        best_x = tx;
+        best_y = ty;
+        best_z = tz;
+        best_width = width;
+      }
+
+  nxyz[0] = best_x;
+  nxyz[1] = best_y;
+  nxyz[2] = best_z;
+  return best_x * best_y * best_z;
+#undef SEARCH_SIZE
+}
+#elif 1 // Zhihui's idea one on 2013-09-25 (not compiled while the branch above is enabled)
+{
+  int nx, ny, nz;
+  int hmin_width;
+  hmin_width = Mymin(min_width[0], min_width[1]);
+  hmin_width = Mymin(hmin_width, min_width[2]);
+  nx = shape[0] / hmin_width;
+  if (nx * hmin_width < shape[0])
+    nx++;
+  ny = shape[1] / hmin_width;
+  if (ny * hmin_width < shape[1])
+    ny++;
+  nz = shape[2] / hmin_width;
+  if (nz * hmin_width < shape[2])
+    nz++;
+  while (nx * ny * nz > cpusize)
+  {
+    hmin_width++;
+    nx = shape[0] / hmin_width;
+    if (nx * hmin_width < shape[0])
+      nx++;
+    ny = shape[1] / hmin_width;
+    if (ny * hmin_width < shape[1])
+      ny++;
+    nz = shape[2] / hmin_width;
+    if (nz * hmin_width < shape[2])
+      nz++;
+  }
+
+  nxyz[0] = nx;
+  nxyz[1] = ny;
+  nxyz[2] = nz;
+
+  return nx * ny * nz;
+}
+#elif 1 // Zhihui's idea two on 2013-09-25 (not compiled while the first branch is enabled)
+{
+  int nx, ny, nz;
+  const int hmin_width = 8; // for example we use 8
+  nx = shape[0] / hmin_width;
+  if (nx * hmin_width < shape[0])
+    nx++;
+  ny = shape[1] / hmin_width;
+  if (ny * hmin_width < shape[1])
+    ny++;
+  nz = shape[2] / hmin_width;
+  if (nz * hmin_width < shape[2])
+    nz++;
+
+  nxyz[0] = nx;
+  nxyz[1] = ny;
+  nxyz[2] = nz;
+
+  return nx * ny * nz;
+}
+#endif
+// distribute the data to the processors
+#if (PSTR == 0)
+// Split every patch of PatchLIST into blocks with partition3, pad each block with ghost_width
+// points (clamped to the patch unless periodic), and take owner ranks from a counter that wraps
+// to 0 when it reaches cpusize. Sets PP->blb / PP->ble of each patch to its first / last list
+// node and returns the head of the block list (blocks are deleted through KillBlocks).
+MyList<Block> *Parallel::distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
+                                    bool periodic, int nodes)
+{
+#ifdef USE_GPU_DIVIDE
+  double cpu_part, gpu_part; // NOTE(review): stays uninitialized when the key is in neither the map nor the file
+  map<string, double>::iterator iter;
+  iter = parameters::dou_par.find("cpu part");
+  if (iter != parameters::dou_par.end())
+    cpu_part = iter->second;
+  else
+  {
+    int myrank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+    // read parameter from file
+    const int LEN = 256;
+    char pline[LEN];
+    string str, sgrp, skey, sval;
+    int sind;
+    string pname; // a std::string, so a long path cannot overrun a fixed-size buffer
+    {
+      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
+      if (iter != parameters::str_par.end())
+      {
+        pname = iter->second;
+      }
+      else
+      {
+        cout << "Error inputpar" << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1); // fail with a non-zero status, like the other errors here
+      }
+    }
+    ifstream inf(pname.c_str(), ifstream::in);
+    if (!inf.good() && myrank == 0)
+    {
+      cout << "Can not open parameter file " << pname << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    for (int i = 1; inf.good(); i++)
+    {
+      inf.getline(pline, LEN);
+      str = pline;
+
+      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
+      if (status == -1)
+      {
+        cout << "error reading parameter file " << pname << " in line " << i << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+      else if (status == 0)
+        continue;
+
+      if (sgrp == "ABE")
+      {
+        if (skey == "cpu part")
+          cpu_part = atof(sval.c_str());
+      }
+    }
+    inf.close();
+
+    parameters::dou_par.insert(map<string, double>::value_type("cpu part", cpu_part));
+  }
+  iter = parameters::dou_par.find("gpu part");
+  if (iter != parameters::dou_par.end())
+    gpu_part = iter->second;
+  else
+  {
+    int myrank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+    // read parameter from file
+    const int LEN = 256;
+    char pline[LEN];
+    string str, sgrp, skey, sval;
+    int sind;
+    string pname; // a std::string, so a long path cannot overrun a fixed-size buffer
+    {
+      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
+      if (iter != parameters::str_par.end())
+      {
+        pname = iter->second;
+      }
+      else
+      {
+        cout << "Error inputpar" << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1); // fail with a non-zero status, like the other errors here
+      }
+    }
+    ifstream inf(pname.c_str(), ifstream::in);
+    if (!inf.good() && myrank == 0)
+    {
+      cout << "Can not open parameter file " << pname << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    for (int i = 1; inf.good(); i++)
+    {
+      inf.getline(pline, LEN);
+      str = pline;
+
+      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
+      if (status == -1)
+      {
+        cout << "error reading parameter file " << pname << " in line " << i << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+      else if (status == 0)
+        continue;
+
+      if (sgrp == "ABE")
+      {
+        if (skey == "gpu part")
+          gpu_part = atof(sval.c_str());
+      }
+    }
+    inf.close();
+
+    parameters::dou_par.insert(map<string, double>::value_type("gpu part", gpu_part));
+  }
+
+  if (nodes == 0)
+    nodes = cpusize / 2;
+#else
+  if (nodes == 0)
+    nodes = cpusize;
+#endif
+
+  if (dim != 3)
+  {
+    cout << "distrivute: now we only support 3-dimension" << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+
+  MyList<Block> *BlL = 0;
+
+  int split_size, min_size, block_size = 0;
+
+  int min_width = 2 * Mymax(ghost_width, buffer_width);
+  int nxyz[dim], mmin_width[dim], min_shape[dim];
+
+  MyList<Patch> *PLi = PatchLIST; // NOTE(review): dereferenced below without a null check
+  for (int i = 0; i < dim; i++)
+    min_shape[i] = PLi->data->shape[i];
+  int lev = PLi->data->lev;
+  PLi = PLi->next;
+  while (PLi)
+  {
+    Patch *PP = PLi->data;
+    for (int i = 0; i < dim; i++)
+      min_shape[i] = Mymin(min_shape[i], PP->shape[i]);
+    if (lev != PLi->data->lev)
+      cout << "Parallel::distribute CAUSTION: meet Patches for different level: " << lev << " and " << PLi->data->lev << endl;
+    PLi = PLi->next;
+  }
+
+  for (int i = 0; i < dim; i++)
+    mmin_width[i] = Mymin(min_width, min_shape[i]);
+
+  min_size = mmin_width[0];
+  for (int i = 1; i < dim; i++)
+    min_size = min_size * mmin_width[i];
+
+  PLi = PatchLIST;
+  while (PLi)
+  {
+    Patch *PP = PLi->data;
+    //    PP->checkPatch(true);
+    int bs = PP->shape[0];
+    for (int i = 1; i < dim; i++)
+      bs = bs * PP->shape[i];
+    block_size = block_size + bs;
+    PLi = PLi->next;
+  }
+  split_size = Mymax(min_size, block_size / nodes);
+  split_size = Mymax(1, split_size);
+
+  int n_rank = 0;
+  PLi = PatchLIST;
+  int reacpu = 0;
+  while (PLi)
+  {
+    Patch *PP = PLi->data;
+
+    reacpu += partition3(nxyz, split_size, mmin_width, nodes, PP->shape);
+
+    Block *ng0, *ng;
+    int shape_here[dim], ibbox_here[2 * dim];
+    double bbox_here[2 * dim], dd;
+
+    // ibbox : 0,...N-1
+    for (int i = 0; i < nxyz[0]; i++)
+      for (int j = 0; j < nxyz[1]; j++)
+        for (int k = 0; k < nxyz[2]; k++)
+        {
+          ibbox_here[0] = (PP->shape[0] * i) / nxyz[0];
+          ibbox_here[3] = (PP->shape[0] * (i + 1)) / nxyz[0] - 1;
+          ibbox_here[1] = (PP->shape[1] * j) / nxyz[1];
+          ibbox_here[4] = (PP->shape[1] * (j + 1)) / nxyz[1] - 1;
+          ibbox_here[2] = (PP->shape[2] * k) / nxyz[2];
+          ibbox_here[5] = (PP->shape[2] * (k + 1)) / nxyz[2] - 1;
+
+          if (periodic)
+          {
+            ibbox_here[0] = ibbox_here[0] - ghost_width;
+            ibbox_here[3] = ibbox_here[3] + ghost_width;
+            ibbox_here[1] = ibbox_here[1] - ghost_width;
+            ibbox_here[4] = ibbox_here[4] + ghost_width;
+            ibbox_here[2] = ibbox_here[2] - ghost_width;
+            ibbox_here[5] = ibbox_here[5] + ghost_width;
+          }
+          else
+          {
+            ibbox_here[0] = Mymax(0, ibbox_here[0] - ghost_width);
+            ibbox_here[3] = Mymin(PP->shape[0] - 1, ibbox_here[3] + ghost_width);
+            ibbox_here[1] = Mymax(0, ibbox_here[1] - ghost_width);
+            ibbox_here[4] = Mymin(PP->shape[1] - 1, ibbox_here[4] + ghost_width);
+            ibbox_here[2] = Mymax(0, ibbox_here[2] - ghost_width);
+            ibbox_here[5] = Mymin(PP->shape[2] - 1, ibbox_here[5] + ghost_width);
+          }
+
+          shape_here[0] = ibbox_here[3] - ibbox_here[0] + 1;
+          shape_here[1] = ibbox_here[4] - ibbox_here[1] + 1;
+          shape_here[2] = ibbox_here[5] - ibbox_here[2] + 1;
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+          // 0--4, 5--10
+          dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1);
+          bbox_here[0] = PP->bbox[0] + ibbox_here[0] * dd;
+          bbox_here[3] = PP->bbox[0] + ibbox_here[3] * dd;
+
+          dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1);
+          bbox_here[1] = PP->bbox[1] + ibbox_here[1] * dd;
+          bbox_here[4] = PP->bbox[1] + ibbox_here[4] * dd;
+
+          dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1);
+          bbox_here[2] = PP->bbox[2] + ibbox_here[2] * dd;
+          bbox_here[5] = PP->bbox[2] + ibbox_here[5] * dd;
+#else
+#ifdef Cell
+          // 0--5, 5--10
+          dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0];
+          bbox_here[0] = PP->bbox[0] + (ibbox_here[0]) * dd;
+          bbox_here[3] = PP->bbox[0] + (ibbox_here[3] + 1) * dd;
+
+          dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1];
+          bbox_here[1] = PP->bbox[1] + (ibbox_here[1]) * dd;
+          bbox_here[4] = PP->bbox[1] + (ibbox_here[4] + 1) * dd;
+
+          dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2];
+          bbox_here[2] = PP->bbox[2] + (ibbox_here[2]) * dd;
+          bbox_here[5] = PP->bbox[2] + (ibbox_here[5] + 1) * dd;
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+
+#ifdef USE_GPU_DIVIDE
+          {
+            const int pices = 2;
+            double picef[pices];
+            picef[0] = cpu_part;
+            picef[1] = gpu_part;
+            int shape_res[dim * pices];
+            double bbox_res[2 * dim * pices];
+            misc::dividBlock(dim, shape_here, bbox_here, pices, picef, shape_res, bbox_res, min_width);
+            ng = ng0 = new Block(dim, shape_res, bbox_res, n_rank++, ingfsi, fngfsi, PP->lev, 0); // delete through KillBlocks
+
+            //	       if(n_rank==cpusize) {n_rank=0; cerr<<"place one!!"<<endl;}
+
+            //	       ng->checkBlock();
+            if (BlL)
+              BlL->insert(ng);
+            else
+              BlL = new MyList<Block>(ng); // delete through KillBlocks
+
+            for (int i = 1; i < pices; i++)
+            {
+              ng = new Block(dim, shape_res + i * dim, bbox_res + i * 2 * dim, n_rank++, ingfsi, fngfsi, PP->lev, i); // delete through KillBlocks
+              //	        if(n_rank==cpusize) {n_rank=0; cerr<<"place two!! "<<i<<endl;}
+              //	        ng->checkBlock();
+              BlL->insert(ng);
+            }
+          }
+#else
+          ng = ng0 = new Block(dim, shape_here, bbox_here, n_rank++, ingfsi, fngfsi, PP->lev);
+          //	    ng->checkBlock();
+          if (BlL)
+            BlL->insert(ng);
+          else
+            BlL = new MyList<Block>(ng); // delete through KillBlocks
+#endif
+          if (n_rank == cpusize)
+            n_rank = 0;
+
+          // set PP->blb
+          if (i == 0 && j == 0 && k == 0)
+          {
+            MyList<Block> *Bp = BlL;
+            while (Bp->data != ng0)
+              Bp = Bp->next; // ng0 is the first of the pices list
+            PP->blb = Bp;
+          }
+        }
+    // set PP->ble
+    {
+      MyList<Block> *Bp = BlL;
+      while (Bp->data != ng)
+        Bp = Bp->next; // ng is the last of the pices list
+      PP->ble = Bp;
+    }
+    PLi = PLi->next;
+  }
+  if (reacpu < nodes * 2 / 3)
+  {
+    int myrank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+    if (myrank == 0)
+      cout << "Parallel::distribute CAUSTION: level#" << lev << " uses essencially " << reacpu << " processors vs " << nodes << " nodes run, your scientific computation scale is not as large as you estimate." << endl;
+  }
+
+  return BlL;
+}
+
+#ifdef INTERP_LB_OPTIMIZE
+#include "interp_lb_profile_data.h"
+
+// Variant of distribute that is compiled under INTERP_LB_OPTIMIZE. When cpusize equals
+// INTERP_LB_NPROCS, partition cells whose running id appears in interp_lb_splits are handed to
+// splitHotspotBlock with the two ranks from that table; every other cell is created by
+// createMappedBlock. Sets PP->blb / PP->ble per patch and returns the head of the block list.
+MyList<Block> *Parallel::distribute_optimize(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
+                                    bool periodic, int nodes)
+{
+#ifdef USE_GPU_DIVIDE
+  double cpu_part, gpu_part; // NOTE(review): stays uninitialized when the key is in neither the map nor the file
+  map<string, double>::iterator iter;
+  iter = parameters::dou_par.find("cpu part");
+  if (iter != parameters::dou_par.end())
+    cpu_part = iter->second;
+  else
+  {
+    int myrank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+    const int LEN = 256;
+    char pline[LEN];
+    string str, sgrp, skey, sval;
+    int sind;
+    string pname; // a std::string, so a long path cannot overrun a fixed-size buffer
+    {
+      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
+      if (iter != parameters::str_par.end())
+        pname = iter->second;
+      else { cout << "Error inputpar" << endl; MPI_Abort(MPI_COMM_WORLD, 1); } // non-zero status, like the other errors here
+    }
+    ifstream inf(pname.c_str(), ifstream::in);
+    if (!inf.good() && myrank == 0)
+    { cout << "Can not open parameter file " << pname << endl; MPI_Abort(MPI_COMM_WORLD, 1); }
+    for (int i = 1; inf.good(); i++)
+    {
+      inf.getline(pline, LEN); str = pline;
+      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
+      if (status == -1) { cout << "error reading parameter file " << pname << " in line " << i << endl; MPI_Abort(MPI_COMM_WORLD, 1); }
+      else if (status == 0) continue;
+      if (sgrp == "ABE") { if (skey == "cpu part") cpu_part = atof(sval.c_str()); }
+    }
+    inf.close();
+    parameters::dou_par.insert(map<string, double>::value_type("cpu part", cpu_part));
+  }
+  iter = parameters::dou_par.find("gpu part");
+  if (iter != parameters::dou_par.end())
+    gpu_part = iter->second;
+  else
+  {
+    int myrank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+    const int LEN = 256;
+    char pline[LEN];
+    string str, sgrp, skey, sval;
+    int sind;
+    string pname; // a std::string, so a long path cannot overrun a fixed-size buffer
+    {
+      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
+      if (iter != parameters::str_par.end())
+        pname = iter->second;
+      else { cout << "Error inputpar" << endl; MPI_Abort(MPI_COMM_WORLD, 1); } // non-zero status, like the other errors here
+    }
+    ifstream inf(pname.c_str(), ifstream::in);
+    if (!inf.good() && myrank == 0)
+    { cout << "Can not open parameter file " << pname << endl; MPI_Abort(MPI_COMM_WORLD, 1); }
+    for (int i = 1; inf.good(); i++)
+    {
+      inf.getline(pline, LEN); str = pline;
+      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
+      if (status == -1) { cout << "error reading parameter file " << pname << " in line " << i << endl; MPI_Abort(MPI_COMM_WORLD, 1); }
+      else if (status == 0) continue;
+      if (sgrp == "ABE") { if (skey == "gpu part") gpu_part = atof(sval.c_str()); }
+    }
+    inf.close();
+    parameters::dou_par.insert(map<string, double>::value_type("gpu part", gpu_part));
+  }
+  if (nodes == 0) nodes = cpusize / 2;
+#else
+  if (nodes == 0) nodes = cpusize;
+#endif
+
+  if (dim != 3)
+  {
+    cout << "distrivute: now we only support 3-dimension" << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+
+  MyList<Block> *BlL = 0;
+  int split_size, min_size, block_size = 0;
+  int min_width = 2 * Mymax(ghost_width, buffer_width);
+  int nxyz[dim], mmin_width[dim], min_shape[dim];
+
+  MyList<Patch> *PLi = PatchLIST; // NOTE(review): dereferenced below without a null check
+  for (int i = 0; i < dim; i++)
+    min_shape[i] = PLi->data->shape[i];
+  int lev = PLi->data->lev;
+  PLi = PLi->next;
+  while (PLi)
+  {
+    Patch *PP = PLi->data;
+    for (int i = 0; i < dim; i++)
+      min_shape[i] = Mymin(min_shape[i], PP->shape[i]);
+    if (lev != PLi->data->lev)
+      cout << "Parallel::distribute CAUSTION: meet Patches for different level: " << lev << " and " << PLi->data->lev << endl;
+    PLi = PLi->next;
+  }
+
+  for (int i = 0; i < dim; i++)
+    mmin_width[i] = Mymin(min_width, min_shape[i]);
+  min_size = mmin_width[0];
+  for (int i = 1; i < dim; i++)
+    min_size = min_size * mmin_width[i];
+
+  PLi = PatchLIST;
+  while (PLi)
+  {
+    Patch *PP = PLi->data;
+    int bs = PP->shape[0];
+    for (int i = 1; i < dim; i++)
+      bs = bs * PP->shape[i];
+    block_size = block_size + bs;
+    PLi = PLi->next;
+  }
+  split_size = Mymax(min_size, block_size / nodes);
+  split_size = Mymax(1, split_size);
+
+  PLi = PatchLIST;
+  int reacpu = 0;
+  // counts partition cells across all patches; it is the id looked up in interp_lb_splits
+  int current_block_id = 0;
+  while (PLi) {
+    Block *ng0, *ng;
+    bool first_block_in_patch = true;
+    Patch *PP = PLi->data;
+    reacpu += partition3(nxyz, split_size, mmin_width, nodes, PP->shape);
+
+    for (int i = 0; i < nxyz[0]; i++)
+    for (int j = 0; j < nxyz[1]; j++)
+    for (int k = 0; k < nxyz[2]; k++)
+    {
+        int ibbox_here[6], shape_here[3];
+        double bbox_here[6], dd;
+        Block *current_ng_start = nullptr;
+
+        bool is_heavy = false;
+        int r_l = -1, r_r = -1;
+        if (cpusize == INTERP_LB_NPROCS) {
+          for (int si = 0; si < INTERP_LB_NUM_HEAVY; si++) {
+            if (current_block_id == interp_lb_splits[si][0]) {
+              is_heavy = true;
+              r_l = interp_lb_splits[si][1];
+              r_r = interp_lb_splits[si][2];
+              break;
+            }
+          }
+        }
+
+        if (is_heavy)
+        {
+            int ib0 = (PP->shape[0] * i) / nxyz[0];
+            int ib3 = (PP->shape[0] * (i + 1)) / nxyz[0] - 1;
+            int jb1 = (PP->shape[1] * j) / nxyz[1];
+            int jb4 = (PP->shape[1] * (j + 1)) / nxyz[1] - 1;
+            int kb2 = (PP->shape[2] * k) / nxyz[2];
+            int kb5 = (PP->shape[2] * (k + 1)) / nxyz[2] - 1;
+
+            Block *split_first_block = nullptr;
+            Block *split_last_block = nullptr;
+            splitHotspotBlock(BlL, dim, ib0, ib3, jb1, jb4, kb2, kb5,
+                              PP, r_l, r_r, ingfsi, fngfsi, periodic,
+                              split_first_block, split_last_block);
+
+            current_ng_start = split_first_block;
+            ng = split_last_block;
+        }
+        else
+        {
+            ibbox_here[0] = (PP->shape[0] * i) / nxyz[0];
+            ibbox_here[3] = (PP->shape[0] * (i + 1)) / nxyz[0] - 1;
+            ibbox_here[1] = (PP->shape[1] * j) / nxyz[1];
+            ibbox_here[4] = (PP->shape[1] * (j + 1)) / nxyz[1] - 1;
+            ibbox_here[2] = (PP->shape[2] * k) / nxyz[2];
+            ibbox_here[5] = (PP->shape[2] * (k + 1)) / nxyz[2] - 1;
+
+            if (periodic) {
+                for(int d=0; d<3; d++) {
+                    ibbox_here[d] -= ghost_width;
+                    ibbox_here[d+3] += ghost_width;
+                }
+            } else {
+                ibbox_here[0] = Mymax(0, ibbox_here[0] - ghost_width);
+                ibbox_here[3] = Mymin(PP->shape[0] - 1, ibbox_here[3] + ghost_width);
+                ibbox_here[1] = Mymax(0, ibbox_here[1] - ghost_width);
+                ibbox_here[4] = Mymin(PP->shape[1] - 1, ibbox_here[4] + ghost_width);
+                ibbox_here[2] = Mymax(0, ibbox_here[2] - ghost_width);
+                ibbox_here[5] = Mymin(PP->shape[2] - 1, ibbox_here[5] + ghost_width);
+            }
+
+            for(int d=0; d<3; d++) shape_here[d] = ibbox_here[d+3] - ibbox_here[d] + 1;
+
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+          dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1);
+          bbox_here[0] = PP->bbox[0] + ibbox_here[0] * dd;
+          bbox_here[3] = PP->bbox[0] + ibbox_here[3] * dd;
+          dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1);
+          bbox_here[1] = PP->bbox[1] + ibbox_here[1] * dd;
+          bbox_here[4] = PP->bbox[1] + ibbox_here[4] * dd;
+          dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1);
+          bbox_here[2] = PP->bbox[2] + ibbox_here[2] * dd;
+          bbox_here[5] = PP->bbox[2] + ibbox_here[5] * dd;
+#else
+#ifdef Cell
+          dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0];
+          bbox_here[0] = PP->bbox[0] + (ibbox_here[0]) * dd;
+          bbox_here[3] = PP->bbox[0] + (ibbox_here[3] + 1) * dd;
+          dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1];
+          bbox_here[1] = PP->bbox[1] + (ibbox_here[1]) * dd;
+          bbox_here[4] = PP->bbox[1] + (ibbox_here[4] + 1) * dd;
+          dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2];
+          bbox_here[2] = PP->bbox[2] + (ibbox_here[2]) * dd;
+          bbox_here[5] = PP->bbox[2] + (ibbox_here[5] + 1) * dd;
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+            ng = createMappedBlock(BlL, dim, shape_here, bbox_here,
+                                   current_block_id, ingfsi, fngfsi, PP->lev);
+            current_ng_start = ng;
+        }
+
+        if (first_block_in_patch) {
+            ng0 = current_ng_start;
+            MyList<Block> *Bp_start = BlL;
+            while (Bp_start && Bp_start->data != ng0) Bp_start = Bp_start->next;
+            PP->blb = Bp_start;
+            first_block_in_patch = false;
+        }
+
+        current_block_id++;
+    }
+
+    {
+      MyList<Block> *Bp_end = BlL;
+      while (Bp_end && Bp_end->data != ng) Bp_end = Bp_end->next;
+      PP->ble = Bp_end;
+    }
+
+    PLi = PLi->next;
+  }
+  if (reacpu < nodes * 2 / 3)
+  {
+    int myrank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+    if (myrank == 0)
+      cout << "Parallel::distribute CAUSTION: level#" << lev << " uses essencially " << reacpu << " processors vs " << nodes << " nodes run, your scientific computation scale is not as large as you estimate." << endl;
+  }
+
+  return BlL;
+}
+
+// Split one sub-block's interior index range into a left and a right half along
+// the first axis and create one Block for each half.
+//   BlL                : block list the two new Blocks are appended to (created if null)
+//   _dim               : not used here; the Blocks are built with the `dim` visible in this scope
+//   ib0_orig..kb5_orig : interior index bounds (lower/upper) per axis, before ghost padding
+//   PP                 : patch providing shape, bbox and level
+//   r_left, r_right    : ranks given to the left and the right half
+//   periodic           : when true the ghost padding is not clamped to the patch extent
+//   split_first_block / split_last_block : receive the left / right Block
+// Returns the right-half Block (same pointer as split_last_block).
+Block* Parallel::splitHotspotBlock(MyList<Block>* &BlL, int _dim,
+                                 int ib0_orig, int ib3_orig,
+                                 int jb1_orig, int jb4_orig,
+                                 int kb2_orig, int kb5_orig,
+                                 Patch* PP, int r_left, int r_right,
+                                 int ingfsi, int fngfsi, bool periodic,
+                                 Block* &split_first_block, Block* &split_last_block)
+{
+    // Cut position on the first axis; the left half ends at `cut`, the right starts at `cut + 1`.
+    const int cut = (ib0_orig + ib3_orig) / 2;
+
+    // Layout of both arrays: {lo0, lo1, lo2, hi0, hi1, hi2}.
+    int left_half[6] = {ib0_orig, jb1_orig, kb2_orig, cut, jb4_orig, kb5_orig};
+    int right_half[6] = {cut + 1, jb1_orig, kb2_orig, ib3_orig, jb4_orig, kb5_orig};
+
+    // Builds one padded Block from an interior index box and appends it to BlL.
+    auto makeHalf = [&](int* interior, int owner) -> Block* {
+        int ext[6];
+        int npts[3];
+        double box[6], h;
+
+        // Grow the interior by ghost_width on both sides of every axis;
+        // non-periodic patches clamp the result to [0, shape - 1].
+        for (int ax = 0; ax < 3; ax++) {
+            int lo = interior[ax] - ghost_width;
+            int hi = interior[ax + 3] + ghost_width;
+            if (!periodic) {
+                lo = Mymax(0, lo);
+                hi = Mymin(PP->shape[ax] - 1, hi);
+            }
+            ext[ax] = lo;
+            ext[ax + 3] = hi;
+            npts[ax] = hi - lo + 1;
+        }
+
+#ifdef Vertex
+        // Vertex-centered grid: shape points span the bbox, spacing uses shape - 1.
+        for (int ax = 0; ax < 3; ax++) {
+            h = (PP->bbox[ax + 3] - PP->bbox[ax]) / (PP->shape[ax] - 1);
+            box[ax] = PP->bbox[ax] + ext[ax] * h;
+            box[ax + 3] = PP->bbox[ax] + ext[ax + 3] * h;
+        }
+#else
+#ifdef Cell
+        // Cell-centered grid: shape cells span the bbox, upper bound is the far cell face.
+        for (int ax = 0; ax < 3; ax++) {
+            h = (PP->bbox[ax + 3] - PP->bbox[ax]) / PP->shape[ax];
+            box[ax] = PP->bbox[ax] + ext[ax] * h;
+            box[ax + 3] = PP->bbox[ax] + (ext[ax + 3] + 1) * h;
+        }
+#endif
+#endif
+
+        Block* made = new Block(dim, npts, box, owner, ingfsi, fngfsi, PP->lev);
+        if (!BlL)
+            BlL = new MyList<Block>(made);
+        else
+            BlL->insert(made);
+
+        return made;
+    };
+
+    // Left half is created (and inserted) before the right half.
+    split_first_block = makeHalf(left_half, r_left);
+    split_last_block = makeHalf(right_half, r_right);
+    return split_last_block;
+}
+
+// Create one Block for the given shape/bbox and append it to BlL (BlL is created
+// when it is still null). The owning rank defaults to block_id; when
+// INTERP_LB_NPROCS > 0 the first entry of interp_lb_remaps whose first column
+// equals block_id supplies the rank from its second column instead.
+// `_dim` is not used; the Block is built with the `dim` visible in this scope.
+// Returns the newly created Block.
+Block* Parallel::createMappedBlock(MyList<Block>* &BlL, int _dim, int* shape, double* bbox,
+                                   int block_id, int ingfsi, int fngfsi, int lev)
+{
+    int owner = block_id;
+    if (INTERP_LB_NPROCS > 0) {
+      // Linear search for the first remap entry that matches this block id.
+      int ri = 0;
+      while (ri < interp_lb_num_remaps && interp_lb_remaps[ri][0] != block_id)
+        ri++;
+      if (ri < interp_lb_num_remaps)
+        owner = interp_lb_remaps[ri][1];
+    }
+
+    Block* made = new Block(dim, shape, bbox, owner, ingfsi, fngfsi, lev);
+    if (!BlL)
+      BlL = new MyList<Block>(made);
+    else
+      BlL->insert(made);
+
+    return made;
+}
+#else
+// When INTERP_LB_OPTIMIZE is not defined, distribute_optimize falls back to distribute
+MyList<Block> *Parallel::distribute_optimize(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
+                                    bool periodic, int nodes)
+{
+  // Without INTERP_LB_OPTIMIZE there is no remapping: forward every argument
+  // unchanged to the plain distribute() and hand back its block list.
+  MyList<Block> *plain_result = distribute(PatchLIST, cpusize, ingfsi, fngfsi, periodic, nodes);
+  return plain_result;
+}
+// Stub used when INTERP_LB_OPTIMIZE is not defined: creates nothing, leaves all
+// reference arguments untouched and always returns a null pointer.
+Block* Parallel::splitHotspotBlock(MyList<Block>* &BlL, int _dim,
+                                 int ib0_orig, int ib3_orig,
+                                 int jb1_orig, int jb4_orig,
+                                 int kb2_orig, int kb5_orig,
+                                 Patch* PP, int r_left, int r_right,
+                                 int ingfsi, int fngfsi, bool periodic,
+                                 Block* &split_first_block, Block* &split_last_block)
+{
+  Block *nothing = nullptr;
+  return nothing;
+}
+// Stub used when INTERP_LB_OPTIMIZE is not defined: creates nothing, does not
+// modify BlL and always returns a null pointer.
+Block* Parallel::createMappedBlock(MyList<Block>* &BlL, int _dim, int* shape, double* bbox,
+                                   int block_id, int ingfsi, int fngfsi, int lev)
+{
+  Block *nothing = nullptr;
+  return nothing;
+}
+#endif
+
+#elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
+// Split every patch of PatchLIST into sub-blocks and build the resulting block list.
+//   PatchLIST            : patches to split (dereferenced without a null check)
+//   cpusize              : divisor used to derive the target points-per-block (split_size)
+//   ingfsi, fngfsi       : forwarded unchanged to the Block constructor
+//   periodic             : when true, ghost padding is not clamped to the patch extent
+//   start_rank, end_rank : rank window; ranks are handed out in increasing order and
+//                          reset to start_rank once n_rank reaches end_rank + 1
+//   nodes                : only used for the utilisation warning at the end; 0 means
+//                          "derive from cpusize"
+// Side effects: sets PP->blb / PP->ble of every patch to the list nodes of its first /
+// last block. Returns the head of the block list (0 if no block was created).
+MyList<Block> *Parallel::distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
+                                    bool periodic, int start_rank, int end_rank, int nodes)
+{
+#ifdef USE_GPU_DIVIDE
+  // Fetch the "cpu part" / "gpu part" fractions: first from the parameters::dou_par
+  // cache, otherwise by scanning the parameter file for group "ABE" and caching the result.
+  double cpu_part, gpu_part;
+  map<string, double>::iterator iter;
+  iter = parameters::dou_par.find("cpu part");
+  if (iter != parameters::dou_par.end())
+  {
+    cpu_part = iter->second;
+  }
+  else
+  {
+    int myrank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+    // read parameter from file
+    const int LEN = 256;
+    char pline[LEN];
+    string str, sgrp, skey, sval;
+    int sind;
+    // NOTE(review): fixed 50-byte buffer filled with an unbounded strcpy below;
+    // a longer "inputpar" path would overflow it.
+    char pname[50];
+    {
+      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
+      if (iter != parameters::str_par.end())
+      {
+        strcpy(pname, (iter->second).c_str());
+      }
+      else
+      {
+        cout << "Error inputpar" << endl;
+        exit(0);
+      }
+    }
+    ifstream inf(pname, ifstream::in);
+    // Only rank 0 reports and aborts when the file cannot be opened.
+    if (!inf.good() && myrank == 0)
+    {
+      cout << "Can not open parameter file " << pname << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    for (int i = 1; inf.good(); i++)
+    {
+      inf.getline(pline, LEN);
+      str = pline;
+
+      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
+      if (status == -1)
+      {
+        cout << "error reading parameter file " << pname << " in line " << i << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+      else if (status == 0)
+        continue;
+
+      if (sgrp == "ABE")
+      {
+        if (skey == "cpu part")
+          cpu_part = atof(sval.c_str());
+      }
+    }
+    inf.close();
+
+    // NOTE(review): cpu_part has no initializer, so if the file contains no
+    // "cpu part" entry an indeterminate value is cached here.
+    parameters::dou_par.insert(map<string, double>::value_type("cpu part", cpu_part));
+  }
+  // Same lookup as above, for "gpu part".
+  iter = parameters::dou_par.find("gpu part");
+  if (iter != parameters::dou_par.end())
+  {
+    gpu_part = iter->second;
+  }
+  else
+  {
+    int myrank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+    // read parameter from file
+    const int LEN = 256;
+    char pline[LEN];
+    string str, sgrp, skey, sval;
+    int sind;
+    // NOTE(review): same fixed-size buffer / unbounded strcpy as for "cpu part".
+    char pname[50];
+    {
+      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
+      if (iter != parameters::str_par.end())
+      {
+        strcpy(pname, (iter->second).c_str());
+      }
+      else
+      {
+        cout << "Error inputpar" << endl;
+        exit(0);
+      }
+    }
+    ifstream inf(pname, ifstream::in);
+    if (!inf.good() && myrank == 0)
+    {
+      cout << "Can not open parameter file " << pname << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    for (int i = 1; inf.good(); i++)
+    {
+      inf.getline(pline, LEN);
+      str = pline;
+
+      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
+      if (status == -1)
+      {
+        cout << "error reading parameter file " << pname << " in line " << i << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+      else if (status == 0)
+        continue;
+
+      if (sgrp == "ABE")
+      {
+        if (skey == "gpu part")
+          gpu_part = atof(sval.c_str());
+      }
+    }
+    inf.close();
+
+    // NOTE(review): gpu_part is likewise indeterminate when the key is missing.
+    parameters::dou_par.insert(map<string, double>::value_type("gpu part", gpu_part));
+  }
+
+  // Each sub-block consumes two ranks in this mode (see below), hence cpusize / 2.
+  if (nodes == 0)
+    nodes = cpusize / 2;
+#else
+  if (nodes == 0)
+    nodes = cpusize;
+#endif
+
+  if (dim != 3)
+  {
+    cout << "distrivute: now we only support 3-dimension" << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+
+  MyList<Block> *BlL = 0;
+
+  int split_size, min_size, block_size = 0;
+
+  // Smallest allowed width of a sub-block along one axis.
+  int min_width = 2 * Mymax(ghost_width, buffer_width);
+  int nxyz[dim], mmin_width[dim], min_shape[dim];
+
+  // Pass 1: per-axis minimum shape over all patches; warn if the patches mix levels.
+  MyList<Patch> *PLi = PatchLIST;
+  for (int i = 0; i < dim; i++)
+    min_shape[i] = PLi->data->shape[i];
+  int lev = PLi->data->lev;
+  PLi = PLi->next;
+  while (PLi)
+  {
+    Patch *PP = PLi->data;
+    for (int i = 0; i < dim; i++)
+      min_shape[i] = Mymin(min_shape[i], PP->shape[i]);
+    if (lev != PLi->data->lev)
+      cout << "Parallel::distribute CAUSTION: meet Patches for different level: " << lev << " and " << PLi->data->lev << endl;
+    PLi = PLi->next;
+  }
+
+  // A patch thinner than min_width lowers the per-axis minimum accordingly.
+  for (int i = 0; i < dim; i++)
+    mmin_width[i] = Mymin(min_width, min_shape[i]);
+
+  // min_size: number of points of the smallest admissible sub-block.
+  min_size = mmin_width[0];
+  for (int i = 1; i < dim; i++)
+    min_size = min_size * mmin_width[i];
+
+  // Pass 2: total number of grid points over all patches.
+  PLi = PatchLIST;
+  while (PLi)
+  {
+    Patch *PP = PLi->data;
+    //    PP->checkPatch(true);
+    int bs = PP->shape[0];
+    for (int i = 1; i < dim; i++)
+      bs = bs * PP->shape[i];
+    block_size = block_size + bs;
+    PLi = PLi->next;
+  }
+  // Target points per sub-block: an even share, but never below min_size or 1.
+  split_size = Mymax(min_size, block_size / cpusize);
+  split_size = Mymax(1, split_size);
+
+  // Pass 3: cut each patch into nxyz[0] x nxyz[1] x nxyz[2] sub-blocks.
+  int n_rank = start_rank;
+  PLi = PatchLIST;
+  int reacpu = 0;
+  while (PLi)
+  {
+    Patch *PP = PLi->data;
+
+    // presumably partition3 fills nxyz with the per-axis split counts and returns
+    // how many pieces result -- verify against its definition
+    reacpu += partition3(nxyz, split_size, mmin_width, cpusize, PP->shape);
+
+    Block *ng, *ng0;
+    int shape_here[dim], ibbox_here[2 * dim];
+    double bbox_here[2 * dim], dd;
+
+    // ibbox : 0,...N-1
+    for (int i = 0; i < nxyz[0]; i++)
+      for (int j = 0; j < nxyz[1]; j++)
+        for (int k = 0; k < nxyz[2]; k++)
+        {
+          // Interior index range of sub-block (i,j,k); layout {lo0,lo1,lo2,hi0,hi1,hi2}.
+          ibbox_here[0] = (PP->shape[0] * i) / nxyz[0];
+          ibbox_here[3] = (PP->shape[0] * (i + 1)) / nxyz[0] - 1;
+          ibbox_here[1] = (PP->shape[1] * j) / nxyz[1];
+          ibbox_here[4] = (PP->shape[1] * (j + 1)) / nxyz[1] - 1;
+          ibbox_here[2] = (PP->shape[2] * k) / nxyz[2];
+          ibbox_here[5] = (PP->shape[2] * (k + 1)) / nxyz[2] - 1;
+
+          // Pad by ghost_width on every side; only non-periodic patches clamp
+          // the padded range to [0, shape - 1].
+          if (periodic)
+          {
+            ibbox_here[0] = ibbox_here[0] - ghost_width;
+            ibbox_here[3] = ibbox_here[3] + ghost_width;
+            ibbox_here[1] = ibbox_here[1] - ghost_width;
+            ibbox_here[4] = ibbox_here[4] + ghost_width;
+            ibbox_here[2] = ibbox_here[2] - ghost_width;
+            ibbox_here[5] = ibbox_here[5] + ghost_width;
+          }
+          else
+          {
+            ibbox_here[0] = Mymax(0, ibbox_here[0] - ghost_width);
+            ibbox_here[3] = Mymin(PP->shape[0] - 1, ibbox_here[3] + ghost_width);
+            ibbox_here[1] = Mymax(0, ibbox_here[1] - ghost_width);
+            ibbox_here[4] = Mymin(PP->shape[1] - 1, ibbox_here[4] + ghost_width);
+            ibbox_here[2] = Mymax(0, ibbox_here[2] - ghost_width);
+            ibbox_here[5] = Mymin(PP->shape[2] - 1, ibbox_here[5] + ghost_width);
+          }
+
+          shape_here[0] = ibbox_here[3] - ibbox_here[0] + 1;
+          shape_here[1] = ibbox_here[4] - ibbox_here[1] + 1;
+          shape_here[2] = ibbox_here[5] - ibbox_here[2] + 1;
+          // Convert the padded index range into physical bounds.
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+          // 0--4, 5--10
+          dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1);
+          bbox_here[0] = PP->bbox[0] + ibbox_here[0] * dd;
+          bbox_here[3] = PP->bbox[0] + ibbox_here[3] * dd;
+
+          dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1);
+          bbox_here[1] = PP->bbox[1] + ibbox_here[1] * dd;
+          bbox_here[4] = PP->bbox[1] + ibbox_here[4] * dd;
+
+          dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1);
+          bbox_here[2] = PP->bbox[2] + ibbox_here[2] * dd;
+          bbox_here[5] = PP->bbox[2] + ibbox_here[5] * dd;
+#else
+#ifdef Cell
+          // 0--5, 5--10
+          dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0];
+          bbox_here[0] = PP->bbox[0] + (ibbox_here[0]) * dd;
+          bbox_here[3] = PP->bbox[0] + (ibbox_here[3] + 1) * dd;
+
+          dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1];
+          bbox_here[1] = PP->bbox[1] + (ibbox_here[1]) * dd;
+          bbox_here[4] = PP->bbox[1] + (ibbox_here[4] + 1) * dd;
+
+          dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2];
+          bbox_here[2] = PP->bbox[2] + (ibbox_here[2]) * dd;
+          bbox_here[5] = PP->bbox[2] + (ibbox_here[5] + 1) * dd;
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+
+#ifdef USE_GPU_DIVIDE
+          {
+            // Divide the sub-block into two pieces weighted by cpu_part / gpu_part;
+            // each piece becomes its own Block on the next rank, with the piece
+            // index passed as the last constructor argument.
+            const int pices = 2;
+            double picef[pices];
+            picef[0] = cpu_part;
+            picef[1] = gpu_part;
+            int shape_res[dim * pices];
+            double bbox_res[2 * dim * pices];
+            misc::dividBlock(dim, shape_here, bbox_here, pices, picef, shape_res, bbox_res, min_width);
+            ng = ng0 = new Block(dim, shape_res, bbox_res, n_rank++, ingfsi, fngfsi, PP->lev, 0); // delete through KillBlocks
+            // ng->checkBlock();
+            if (BlL)
+              BlL->insert(ng);
+            else
+              BlL = new MyList<Block>(ng); // delete through KillBlocks
+
+            for (int i = 1; i < pices; i++)
+            {
+              ng = new Block(dim, shape_res + i * dim, bbox_res + i * 2 * dim, n_rank++, ingfsi, fngfsi, PP->lev, i); // delete through KillBlocks
+              // ng->checkBlock();
+              BlL->insert(ng);
+            }
+          }
+#else
+          ng = ng0 = new Block(dim, shape_here, bbox_here, n_rank++, ingfsi, fngfsi, PP->lev); // delete through KillBlocks
+          // ng->checkBlock();
+          if (BlL)
+            BlL->insert(ng);
+          else
+            BlL = new MyList<Block>(ng); // delete through KillBlocks
+#endif
+
+          // Wrap back to the first rank of the window.
+          // NOTE(review): this is an equality test; under USE_GPU_DIVIDE n_rank advances
+          // by two per sub-block, so a window with an odd number of ranks would step
+          // past end_rank + 1 without wrapping -- confirm callers always pass an even window.
+          if (n_rank == end_rank + 1)
+            n_rank = start_rank;
+
+          // set PP->blb
+          if (i == 0 && j == 0 && k == 0)
+          {
+            // Walk from the list head to find the node that holds ng0.
+            MyList<Block> *Bp = BlL;
+            while (Bp->data != ng0)
+              Bp = Bp->next; // ng0 is the first of the pices list
+            PP->blb = Bp;
+          }
+        }
+    // set PP->ble
+    {
+      // ng still points at the last Block created for this patch.
+      MyList<Block> *Bp = BlL;
+      while (Bp->data != ng)
+        Bp = Bp->next; // ng is the last of the pices list
+      PP->ble = Bp;
+    }
+    PLi = PLi->next;
+  }
+  // Warn (only on the rank equal to start_rank) when fewer than two thirds of
+  // `nodes` ended up with work.
+  if (reacpu < nodes * 2 / 3)
+  {
+    int myrank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+    if (myrank == start_rank)
+      cout << "Parallel::distribute CAUSTION: level#" << lev << " uses essencially " << reacpu << " processors vs " << nodes << " nodes run, your scientific computation scale is not as large as you estimate." << endl;
+  }
+
+  return BlL;
+}
+#endif
+// Fill grid function vn->sgfn of every block in BlL with func evaluated at the
+// block's own coordinates. Blocks whose X[0] is null are skipped.
+//   BlL  : block list to walk
+//   vn   : variable whose sgfn member selects the fgfs array to write
+//   func : scalar function of (x, y, z)
+// Storage order is first index fastest, i.e. point (i,j,k) lives at
+// i + shape[0] * (j + shape[1] * k). The nested loops visit points in exactly
+// that order, so no per-point index decomposition (and no per-point heap
+// allocation inside getarrayindex) is needed, and the point count is never
+// formed as a single int product.
+void Parallel::setfunction(MyList<Block> *BlL, var *vn, double func(double x, double y, double z))
+{
+  for (; BlL; BlL = BlL->next)
+  {
+    Block *blk = BlL->data;
+    if (!blk->X[0])
+      continue;
+
+    const int nx = blk->shape[0], ny = blk->shape[1], nz = blk->shape[2];
+    const double *xs = blk->X[0];
+    const double *ys = blk->X[1];
+    const double *zs = blk->X[2];
+    double *p = blk->fgfs[vn->sgfn];
+
+    size_t n = 0; // linear offset into p, advanced in storage order
+    for (int k = 0; k < nz; k++)
+      for (int j = 0; j < ny; j++)
+        for (int i = 0; i < nx; i++)
+          p[n++] = func(xs[i], ys[j], zs[k]);
+  }
+}
+// set function only for cpu rank
+// Same as the rank-less setfunction, but only blocks whose `rank` member equals
+// the given rank (and whose X[0] is non-null) are filled.
+//   rank : block rank to match
+//   BlL  : block list to walk
+//   vn   : variable whose sgfn member selects the fgfs array to write
+//   func : scalar function of (x, y, z)
+// Points are visited in storage order (first index fastest), which avoids the
+// per-point index decomposition and heap allocation of getarrayindex.
+void Parallel::setfunction(int rank, MyList<Block> *BlL, var *vn, double func(double x, double y, double z))
+{
+  for (; BlL; BlL = BlL->next)
+  {
+    Block *blk = BlL->data;
+    if (!blk->X[0] || blk->rank != rank)
+      continue;
+
+    const int nx = blk->shape[0], ny = blk->shape[1], nz = blk->shape[2];
+    const double *xs = blk->X[0];
+    const double *ys = blk->X[1];
+    const double *zs = blk->X[2];
+    double *p = blk->fgfs[vn->sgfn];
+
+    size_t n = 0; // linear offset into p, advanced in storage order
+    for (int k = 0; k < nz; k++)
+      for (int j = 0; j < ny; j++)
+        for (int i = 0; i < nx; i++)
+          p[n++] = func(xs[i], ys[j], zs[k]);
+  }
+}
+// Decompose linear offset n into a DIM-dimensional index (first index fastest).
+//   DIM   : number of dimensions; nothing is written when DIM <= 0
+//   shape : extent per dimension (shape[DIM - 1] is not needed)
+//   index : output array with room for DIM entries, supplied by the caller
+//   n     : linear offset, n = index[0] + shape[0] * (index[1] + shape[1] * ...)
+// The running stride is kept in a single integer instead of a temporary
+// heap-allocated array, so the function performs no allocation.
+void Parallel::getarrayindex(int DIM, int *shape, int *index, int n)
+{
+  if (DIM <= 0)
+    return;
+
+  // Stride of the slowest dimension: product of all lower extents.
+  int stride = 1;
+  for (int i = 0; i < DIM - 1; i++)
+    stride *= shape[i];
+
+  for (int i = DIM - 1; i >= 0; i--)
+  {
+    index[i] = n / stride;
+    n -= index[i] * stride;
+    // Step down to the stride of the next lower dimension (exact division).
+    if (i > 0)
+      stride /= shape[i - 1];
+  }
+}
+// Linear offset of a DIM-dimensional index, first index fastest:
+// index[0] + shape[0] * index[1] + shape[0] * shape[1] * index[2] + ...
+// shape[0] and index[0] are read unconditionally, so both arrays need at least
+// one entry.
+int Parallel::getarraylocation(int DIM, int *shape, int *index)
+{
+  int offset = index[0];
+  int stride = shape[0];
+  int axis = 1;
+  while (axis < DIM)
+  {
+    offset += index[axis] * stride;
+    stride *= shape[axis];
+    ++axis;
+  }
+
+  return offset;
+}
+// Copy the sub-region [llb, uub] from one regular grid array into another.
+//   DIM            : number of dimensions
+//   llbout, uubout : lower / upper physical bounds of the destination array
+//   Dshape, DD     : destination shape and data
+//   llbin, uubin   : lower / upper physical bounds of the source array
+//   shape, datain  : source shape and data
+//   llb, uub       : lower / upper physical bounds of the region to copy
+// Both arrays are indexed first-index-fastest (see getarraylocation). The two
+// grids must have matching spacing and the region must cover the same number of
+// points in both; any inconsistency prints diagnostics and calls MPI_Abort.
+void Parallel::copy(int DIM, double *llbout, double *uubout, int *Dshape, double *DD, double *llbin, double *uubin,
+                    int *shape, double *datain, double *llb, double *uub)
+{
+  // for 3 dimensional case, based on simple test, I found this is half slower than f90 code
+  // illX / iuuX: first / last index of the region; indX: running index.
+  // Suffix i = source ("in"), o = destination ("out").
+  int *illi, *iuui;
+  int *illo, *iuuo;
+  int *indi, *indo;
+  illi = new int[DIM];
+  iuui = new int[DIM];
+  illo = new int[DIM];
+  iuuo = new int[DIM];
+  indi = new int[DIM];
+  indo = new int[DIM];
+
+  int ial = 1; // total number of points to copy
+  for (int i = 0; i < DIM; i++)
+  {
+    double ho, hi; // grid spacing of destination / source along axis i
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+    ho = (uubout[i] - llbout[i]) / (Dshape[i] - 1);
+    hi = (uubin[i] - llbin[i]) / (shape[i] - 1);
+#else
+#ifdef Cell
+    ho = (uubout[i] - llbout[i]) / Dshape[i];
+    hi = (uubin[i] - llbin[i]) / shape[i];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+    // Physical bounds -> index bounds; the conversion truncates toward zero.
+    // NOTE(review): truncation (not rounding) is sensitive to floating-point
+    // round-off in the bounds -- confirm the callers' bounds are safe here.
+    illo[i] = int((llb[i] - llbout[i]) / ho);
+    iuuo[i] = Dshape[i] - 1 - int((uubout[i] - uub[i]) / ho);
+    illi[i] = int((llb[i] - llbin[i]) / hi);
+    iuui[i] = shape[i] - 1 - int((uubin[i] - uub[i]) / hi);
+
+    if (illo[i] > iuuo[i] || illi[i] > iuui[i] || illo[i] < 0 || illi[i] < 0 ||
+        iuui[i] >= shape[i] || iuuo[i] >= Dshape[i])
+    {
+      cout << "Parallel copy: in direction " << i << ":" << endl;
+      cout << "llb = " << llb[i] << ", uub = " << uub[i] << endl;
+      cout << " in data : il = " << illi[i] << ", iu = " << iuui[i] << endl;
+      cout << "bbox = (" << llbin[i] << "," << uubin[i] << ")" << endl;
+      cout << "shape = " << shape[i] << endl;
+      cout << "out data : il = " << illo[i] << ", iu = " << iuuo[i] << endl;
+      cout << "bbox = (" << llbout[i] << "," << uubout[i] << ")" << endl;
+      cout << "shape = " << Dshape[i] << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+    // Same spacing (within half a cell) and same point count on both sides.
+    int ihi = iuui[i] - illi[i] + 1, iho = iuuo[i] - illo[i] + 1;
+    if (!(feq(ho, hi, ho / 2)) || ihi != iho)
+    {
+      cout << "Parallel copy: in direction " << i << ":" << endl;
+      cout << "Parallel copy: not the same grid structure." << endl;
+      cout << "hi = " << hi << ", bbox = (" << llbin[i] << "," << uubin[i] << "), shape = " << shape[i] << endl;
+      cout << "ho = " << ho << ", bbox = (" << llbout[i] << "," << uubout[i] << "), shape = " << Dshape[i] << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+    ial = ial * ihi;
+  }
+
+  // Start both running indices at the lower corner of the region.
+  for (int i = 0; i < DIM; i++)
+  {
+    indi[i] = illi[i];
+    indo[i] = illo[i];
+  }
+
+  // Total sizes of both arrays, used for the range checks below.
+  int NNi = 1, NNo = 1;
+  for (int i = 0; i < DIM; i++)
+  {
+    NNi = NNi * shape[i];
+    NNo = NNo * Dshape[i];
+  }
+  for (int i = 0; i < ial; i++)
+  {
+    int ni, no;
+    ni = getarraylocation(DIM, shape, indi);
+    no = getarraylocation(DIM, Dshape, indo);
+    // Valid offsets are 0 .. NN-1, so an offset equal to NN is already out of range.
+    if (no < 0 || no >= NNo)
+    {
+      cout << "Parallel copy: no = " << no << " is out of array range (0," << NNo << ")." << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+    if (ni < 0 || ni >= NNi)
+    {
+      cout << "Parallel copy: ni = " << ni << " is out of array range (0," << NNi << ")." << endl;
+      cout << "shape = (";
+      for (int j = 0; j < DIM; j++)
+      {
+        cout << shape[j];
+        if (j < DIM - 1)
+          cout << ",";
+        else
+          cout << ")" << endl;
+      }
+      cout << "ind = (";
+      for (int j = 0; j < DIM; j++)
+      {
+        cout << indi[j];
+        if (j < DIM - 1)
+          cout << ",";
+        else
+          cout << ")" << endl;
+      }
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+    DD[no] = datain[ni];
+
+    // Advance the source index like an odometer: bump the fastest digit and
+    // carry into the next one whenever a digit runs past its upper bound.
+    indi[0]++;
+    for (int j = 1; j < DIM; j++)
+    {
+      if (indi[j - 1] == iuui[j - 1] + 1)
+      {
+        indi[j - 1] = illi[j - 1];
+        indi[j]++;
+      } // carry 1 to next digital
+      else
+        break;
+    }
+    // Same odometer step for the destination index.
+    indo[0]++;
+    for (int j = 1; j < DIM; j++)
+    {
+      if (indo[j - 1] == iuuo[j - 1] + 1)
+      {
+        indo[j - 1] = illo[j - 1];
+        indo[j]++;
+      }
+      else
+        break;
+    }
+  }
+
+  delete[] illi;
+  delete[] iuui;
+  delete[] illo;
+  delete[] iuuo;
+  delete[] indi;
+  delete[] indo;
+}
+// Write one 3D array to `filename` as raw binary, overwriting any existing file.
+// Record layout, in native byte order and without padding:
+//   time (double), nx, ny, nz (int), xmin, xmax, ymin, ymax, zmin, zmax (double),
+//   followed by nx * ny * nz doubles taken from data_out.
+// Calls MPI_Abort when the file cannot be opened.
+void Parallel::writefile(double time, int nx, int ny, int nz, double xmin, double xmax, double ymin, double ymax,
+                         double zmin, double zmax, char *filename, double *data_out)
+{
+  ofstream outfile;
+  // The payload is raw bytes, so open in binary mode to rule out any text-mode
+  // translation on platforms that distinguish the two.
+  outfile.open(filename, ios::out | ios::trunc | ios::binary);
+  if (!outfile)
+  {
+    cout << "Can't open " << filename << " for output." << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+  outfile.write((char *)&time, sizeof(double));
+  outfile.write((char *)&nx, sizeof(int));
+  outfile.write((char *)&ny, sizeof(int));
+  outfile.write((char *)&nz, sizeof(int));
+  outfile.write((char *)&xmin, sizeof(double));
+  outfile.write((char *)&xmax, sizeof(double));
+  outfile.write((char *)&ymin, sizeof(double));
+  outfile.write((char *)&ymax, sizeof(double));
+  outfile.write((char *)&zmin, sizeof(double));
+  outfile.write((char *)&zmax, sizeof(double));
+  // Widen before multiplying: nx * ny * nz evaluated in int can overflow for
+  // large grids before the sizeof factor is applied.
+  const std::streamsize nbytes = static_cast<std::streamsize>(nx) * ny * nz * sizeof(double);
+  outfile.write((char *)data_out, nbytes);
+  outfile.close();
+}
+// Write one 2D array to `filename` as text, overwriting any existing file.
+// Output: a "# t = <time>" header, then one "x y value" line per grid point with
+// x varying fastest, and an empty line after every row of constant y.
+// datain is read as datain[i + j * nx]. Coordinates are the grid vertices
+// (Vertex build) or the cell centres (Cell build) of [xmin,xmax] x [ymin,ymax].
+// Calls MPI_Abort when the file cannot be opened.
+void Parallel::writefile(double time, int nx, int ny, double xmin, double xmax, double ymin, double ymax,
+                         char *filename, double *datain)
+{
+  int i, j;
+  double *X, *Y;
+  X = new double[nx];
+  Y = new double[ny];
+  double dd;
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+  dd = (xmax - xmin) / (nx - 1);
+  for (i = 0; i < nx; i++)
+    X[i] = xmin + i * dd;
+  dd = (ymax - ymin) / (ny - 1);
+  for (j = 0; j < ny; j++)
+    Y[j] = ymin + j * dd;
+#else
+#ifdef Cell
+  dd = (xmax - xmin) / nx;
+  for (i = 0; i < nx; i++)
+    X[i] = xmin + (i + 0.5) * dd;
+  dd = (ymax - ymin) / ny;
+  for (j = 0; j < ny; j++)
+    Y[j] = ymin + (j + 0.5) * dd;
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+  ofstream outfile;
+  outfile.open(filename, ios::out | ios::trunc);
+  if (!outfile)
+  {
+    cout << "Can't open " << filename << " for output." << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+  // '\n' instead of endl throughout: the bytes written are identical, but the
+  // stream is no longer flushed after every single data point; close() below
+  // flushes once at the end.
+  outfile << "# t = " << time << '\n';
+  for (j = 0; j < ny; j++)
+  {
+    for (i = 0; i < nx; i++)
+    {
+      int ind1 = i + j * nx;
+      outfile << setw(10) << setprecision(10) << X[i] << " "
+              << setw(10) << setprecision(10) << Y[j] << " "
+              << setw(16) << setprecision(15) << datain[ind1]
+              << '\n';
+    }
+    outfile << "\n"; /* blank line for gnuplot */
+  }
+  outfile.close();
+
+  delete[] X;
+  delete[] Y;
+}
+// Dump, for every variable in DumpList, each block of BlL whose `rank` member equals
+// the calling MPI rank into its own binary file (see the 3D writefile).
+//   BlL      : block list; the file name carries the block's position in this list
+//   DumpList : variables to dump
+//   tag      : optional file-name prefix, may be null
+//   time, dT : the output counter in the file name is time / dT rounded to nearest
+// File name: <output dir>/[<tag>_]Lev<lev>-<block#>_<rank>_<var name>_<count>.bin
+// The output directory comes from parameters::str_par["output dir"]; if absent it is
+// read from group "ABE" of the parameter file named by str_par["inputpar"] and cached.
+void Parallel::Dump_CPU_Data(MyList<Block> *BlL, MyList<var> *DumpList, char *tag, double time, double dT)
+{
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  // round at 4 and 5
+  int ncount = int(time / dT + 0.5);
+
+  MyList<Block> *Bp;
+  while (DumpList)
+  {
+    Bp = BlL;
+    int Bi = 0; // position of the block in BlL, counted over all ranks' blocks
+    while (Bp)
+    {
+      Block *BP = Bp->data;
+      var *VP = DumpList->data;
+      if (BP->rank == myrank)
+      {
+
+        string out_dir;
+        map<string, string>::iterator iter;
+        iter = parameters::str_par.find("output dir");
+        if (iter != parameters::str_par.end())
+        {
+          out_dir = iter->second;
+        }
+        else
+        {
+          // read parameter from file
+          const int LEN = 256;
+          char pline[LEN];
+          string str, sgrp, skey, sval;
+          int sind;
+          // Kept as a std::string: a fixed char buffer would overflow on long paths.
+          string pname;
+          {
+            map<string, string>::iterator iter = parameters::str_par.find("inputpar");
+            if (iter != parameters::str_par.end())
+            {
+              pname = iter->second;
+            }
+            else
+            {
+              cout << "Error inputpar" << endl;
+              // Fatal: abort the whole MPI job instead of exiting one rank with status 0.
+              MPI_Abort(MPI_COMM_WORLD, 1);
+            }
+          }
+          ifstream inf(pname.c_str(), ifstream::in);
+          if (!inf.good())
+          {
+            cout << "Can not open parameter file " << pname << endl;
+            MPI_Abort(MPI_COMM_WORLD, 1);
+          }
+
+          for (int i = 1; inf.good(); i++)
+          {
+            inf.getline(pline, LEN);
+            str = pline;
+
+            int status = misc::parse_parts(str, sgrp, skey, sval, sind);
+            if (status == -1)
+            {
+              cout << "error reading parameter file " << pname << " in line " << i << endl;
+              MPI_Abort(MPI_COMM_WORLD, 1);
+            }
+            else if (status == 0)
+              continue;
+
+            if (sgrp == "ABE")
+            {
+              if (skey == "output dir")
+                out_dir = sval;
+            }
+          }
+          inf.close();
+
+          parameters::str_par.insert(map<string, string>::value_type("output dir", out_dir));
+        }
+
+        // Assemble the file name in a std::string so that long directory, tag or
+        // variable names cannot overflow a fixed buffer. Only the purely numeric
+        // pieces go through bounded snprintf calls.
+        char levpart[64], cntpart[32];
+        snprintf(levpart, sizeof(levpart), "Lev%02d-%02d_%02d_", BP->lev, Bi, myrank);
+        snprintf(cntpart, sizeof(cntpart), "_%05d.bin", ncount);
+        string filename = out_dir;
+        filename += "/";
+        if (tag)
+        {
+          filename += tag;
+          filename += "_";
+        }
+        filename += levpart;
+        filename += VP->name;
+        filename += cntpart;
+
+        writefile(time, BP->shape[0], BP->shape[1], BP->shape[2], BP->bbox[0], BP->bbox[3], BP->bbox[1], BP->bbox[4],
+                  BP->bbox[2], BP->bbox[5], &filename[0], BP->fgfs[VP->sgfn]);
+        cout << "end of dump " << VP->name << " at time " << time << ", on node " << myrank << endl;
+      }
+      Bp = Bp->next;
+      Bi++;
+    }
+    DumpList = DumpList->next;
+  }
+}
+// Now we dump the data including buffer points
+// Gather every variable of DumpList for patch PP onto rank 0 and write one binary
+// file per variable (see the 3D writefile).
+//   PP       : patch; its blocks PP->blb .. PP->ble are visited in list order
+//   DumpList : variables to dump
+//   tag      : optional file-name prefix, may be null
+//   time, dT : the output counter in the file name is time / dT rounded to nearest
+//   grd      : patch number used in the file name
+// File name: <output dir>/[<tag>_]Lev<lev>-<grd>_<var name>_<count>.bin
+// Rank 0 copies its own blocks directly and receives every other block with
+// MPI_Recv; the rank holding a block sends it with MPI_Send. From each block only
+// the region inside its ghost zone is copied, except on sides that coincide with
+// the patch boundary, where the block is copied up to its edge.
+void Parallel::Dump_Data(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT, int grd)
+{
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  //   round at 4 and 5
+  int ncount = int(time / dT + 0.5);
+
+  MPI_Status sta;
+  int DIM = 3;
+  double llb[3], uub[3];
+
+  // Patch-sized assembly buffer, allocated on rank 0 only.
+  double *databuffer = 0;
+  if (myrank == 0)
+  {
+    databuffer = (double *)malloc(sizeof(double) * PP->shape[0] * PP->shape[1] * PP->shape[2]);
+    if (!databuffer)
+    {
+      cout << "Parallel::Dump_Data: out of memory when dumping data." << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+  }
+
+  while (DumpList)
+  {
+    var *VP = DumpList->data;
+
+    for (MyList<Block> *Bp = PP->blb; Bp; Bp = Bp->next)
+    {
+      Block *BP = Bp->data;
+      int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]);
+      if (myrank == 0)
+      {
+        // Source of the block data: local storage, or a temporary receive buffer.
+        double *bufferhere = 0;
+        double *blockdata = 0;
+        if (BP->rank == 0)
+        {
+          blockdata = BP->fgfs[VP->sgfn];
+        }
+        else
+        {
+          bufferhere = (double *)malloc(sizeof(double) * nnn);
+          if (!bufferhere)
+          {
+            cout << "on node#" << myrank << ", out of memory when dumping data." << endl;
+            MPI_Abort(MPI_COMM_WORLD, 1);
+          }
+          MPI_Recv(bufferhere, nnn, MPI_DOUBLE, BP->rank, 0, MPI_COMM_WORLD, &sta);
+          blockdata = bufferhere;
+        }
+
+        // Region to copy: shrink the block by ghost_width cells on every side
+        // that does not lie on the patch boundary.
+        for (int d = 0; d < DIM; d++)
+        {
+          double h = BP->getdX(d);
+          llb[d] = (feq(BP->bbox[d], PP->bbox[d], h / 2)) ? BP->bbox[d] : BP->bbox[d] + ghost_width * h;
+          uub[d] = (feq(BP->bbox[d + DIM], PP->bbox[d + DIM], h / 2)) ? BP->bbox[d + DIM] : BP->bbox[d + DIM] - ghost_width * h;
+        }
+        f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, blockdata, llb, uub);
+        free(bufferhere); // no-op for locally held blocks (null pointer)
+      }
+      else if (myrank == BP->rank)
+      {
+        MPI_Send(BP->fgfs[VP->sgfn], nnn, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
+      }
+      if (Bp == PP->ble)
+        break;
+    }
+    if (myrank == 0)
+    {
+
+      string out_dir;
+      map<string, string>::iterator iter;
+      iter = parameters::str_par.find("output dir");
+      if (iter != parameters::str_par.end())
+      {
+        out_dir = iter->second;
+      }
+      else
+      {
+        // read parameter from file
+        const int LEN = 256;
+        char pline[LEN];
+        string str, sgrp, skey, sval;
+        int sind;
+        // Kept as a std::string: a fixed char buffer would overflow on long paths.
+        string pname;
+        {
+          map<string, string>::iterator iter = parameters::str_par.find("inputpar");
+          if (iter != parameters::str_par.end())
+          {
+            pname = iter->second;
+          }
+          else
+          {
+            cout << "Error inputpar" << endl;
+            // Fatal: abort the whole MPI job instead of exiting one rank with status 0.
+            MPI_Abort(MPI_COMM_WORLD, 1);
+          }
+        }
+        ifstream inf(pname.c_str(), ifstream::in);
+        if (!inf.good())
+        {
+          cout << "Can not open parameter file " << pname << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+
+        for (int i = 1; inf.good(); i++)
+        {
+          inf.getline(pline, LEN);
+          str = pline;
+
+          int status = misc::parse_parts(str, sgrp, skey, sval, sind);
+          if (status == -1)
+          {
+            cout << "error reading parameter file " << pname << " in line " << i << endl;
+            MPI_Abort(MPI_COMM_WORLD, 1);
+          }
+          else if (status == 0)
+            continue;
+
+          if (sgrp == "ABE")
+          {
+            if (skey == "output dir")
+              out_dir = sval;
+          }
+        }
+        inf.close();
+
+        parameters::str_par.insert(map<string, string>::value_type("output dir", out_dir));
+      }
+
+      // Assemble the file name in a std::string so that long directory, tag or
+      // variable names cannot overflow a fixed buffer. Only the purely numeric
+      // pieces go through bounded snprintf calls.
+      char levpart[64], cntpart[32];
+      snprintf(levpart, sizeof(levpart), "Lev%02d-%02d_", PP->lev, grd);
+      snprintf(cntpart, sizeof(cntpart), "_%05d.bin", ncount);
+      string filename = out_dir;
+      filename += "/";
+      if (tag)
+      {
+        filename += tag;
+        filename += "_";
+      }
+      filename += levpart;
+      filename += VP->name;
+      filename += cntpart;
+
+      writefile(time, PP->shape[0], PP->shape[1], PP->shape[2], PP->bbox[0], PP->bbox[3], PP->bbox[1], PP->bbox[4],
+                PP->bbox[2], PP->bbox[5], &filename[0], databuffer);
+    }
+    DumpList = DumpList->next;
+  }
+
+  if (myrank == 0)
+    free(databuffer);
+}
+// Dump every patch of PL in list order by calling the single-patch Dump_Data;
+// the patch's position in the list (starting at 0) is passed as its grid number.
+void Parallel::Dump_Data(MyList<Patch> *PL, MyList<var> *DumpList, char *tag, double time, double dT)
+{
+  int patch_no = 0;
+  for (MyList<Patch> *node = PL; node; node = node->next, patch_no++)
+    Dump_Data(node->data, DumpList, tag, time, dT, patch_no);
+}
+// Gather variable VP of patch PP, buffer points included, onto rank 0.
+// All ranks take part: rank 0 receives the blocks owned by other ranks and the
+// owning ranks send them (message tag 0).
+// Returns, on rank 0, a malloc'ed array of
+// PP->shape[0] * PP->shape[1] * PP->shape[2] doubles that this routine does not free;
+// on every other rank the return value is a null pointer.
+double *Parallel::Collect_Data(Patch *PP, var *VP)
+{
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+  MPI_Status sta;
+  int DIM = 3;
+  double llb[3], uub[3];
+
+  double *databuffer = 0;
+  if (myrank == 0)
+  {
+    databuffer = (double *)malloc(sizeof(double) * PP->shape[0] * PP->shape[1] * PP->shape[2]);
+    if (!databuffer)
+    {
+      cout << "Parallel::Collect_Data: out of memory when dumping data." << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+  }
+
+  for (MyList<Block> *Bp = PP->blb; Bp; Bp = Bp->next)
+  {
+    Block *BP = Bp->data;
+    if (myrank == 0)
+    {
+      // pick the source of the block data: local storage or a freshly received copy
+      double *received = 0;
+      double *source = 0;
+      if (BP->rank == 0)
+      {
+        source = BP->fgfs[VP->sgfn];
+      }
+      else
+      {
+        int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]);
+        received = (double *)malloc(sizeof(double) * nnn);
+        if (!received)
+        {
+          cout << "on node#" << myrank << ", out of memory when dumping data." << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+        MPI_Recv(received, nnn, MPI_DOUBLE, BP->rank, 0, MPI_COMM_WORLD, &sta);
+        source = received;
+      }
+
+      // a block face lying on the patch face is kept as is;
+      // any other face is moved inwards by ghost_width grid spacings
+      for (int d = 0; d < 3; d++)
+      {
+        double DH = BP->getdX(d);
+        if (feq(BP->bbox[d], PP->bbox[d], DH / 2))
+          llb[d] = BP->bbox[d];
+        else
+          llb[d] = BP->bbox[d] + ghost_width * DH;
+        if (feq(BP->bbox[d + 3], PP->bbox[d + 3], DH / 2))
+          uub[d] = BP->bbox[d + 3];
+        else
+          uub[d] = BP->bbox[d + 3] - ghost_width * DH;
+      }
+      f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, source, llb, uub);
+
+      if (received)
+        free(received);
+    }
+    else if (myrank == BP->rank)
+    {
+      int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]);
+      MPI_Send(BP->fgfs[VP->sgfn], nnn, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
+    }
+
+    // PP->ble marks the last block that belongs to this patch
+    if (Bp == PP->ble)
+      break;
+  }
+
+  return databuffer;
+}
+// Now we dump the data including buffer points
+// dump z = 0 plane
+//
+// For every variable in DumpList the data of patch PP are gathered on rank 0
+// (blocks owned by other ranks are sent with message tag 0), reduced to a 2D
+// array of PP->shape[0] * PP->shape[1] values by f_d2dump and written by rank 0 to
+//   <output dir>/[<tag>_]2d_Lev<lev>-<grd>_<variable name>_<ncount>.dat
+// with ncount = time / dT rounded to the nearest integer.
+// The output directory comes from parameters::str_par["output dir"]; when it is
+// not cached there yet it is read from the "ABE" group of the parameter file named
+// by parameters::str_par["inputpar"] and then cached.
+// All ranks must call this routine.  tag may be null.
+void Parallel::d2Dump_Data(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT, int grd)
+{
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  //   round at 4 and 5
+  int ncount = int(time / dT + 0.5);
+
+  MPI_Status sta;
+  int DIM = 3;
+  double llb[3], uub[3];
+
+  double *databuffer = 0, *databuffer2 = 0;
+  if (myrank == 0)
+  {
+    databuffer = (double *)malloc(sizeof(double) * PP->shape[0] * PP->shape[1] * PP->shape[2]);
+    databuffer2 = (double *)malloc(sizeof(double) * PP->shape[0] * PP->shape[1]);
+    if (!databuffer || !databuffer2)
+    {
+      cout << "Parallel::d2Dump_Data: out of memory when dumping data." << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+  }
+
+  while (DumpList)
+  {
+    var *VP = DumpList->data;
+
+    // gather the full 3D data of this variable on rank 0
+    MyList<Block> *Bp = PP->blb;
+    while (Bp)
+    {
+      Block *BP = Bp->data;
+      if (myrank == 0)
+      {
+        double *bufferhere = 0;
+        double *source = 0;
+        if (BP->rank == 0)
+        {
+          source = BP->fgfs[VP->sgfn];
+        }
+        else
+        {
+          int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]);
+          bufferhere = (double *)malloc(sizeof(double) * nnn);
+          if (!bufferhere)
+          {
+            cout << "on node#" << myrank << ", out of memory when dumping data." << endl;
+            MPI_Abort(MPI_COMM_WORLD, 1);
+          }
+          MPI_Recv(bufferhere, nnn, MPI_DOUBLE, BP->rank, 0, MPI_COMM_WORLD, &sta);
+          source = bufferhere;
+        }
+
+        // a block face lying on the patch face is kept as is;
+        // any other face is moved inwards by ghost_width grid spacings
+        for (int d = 0; d < 3; d++)
+        {
+          double DH = BP->getdX(d);
+          llb[d] = (feq(BP->bbox[d], PP->bbox[d], DH / 2)) ? BP->bbox[d] : BP->bbox[d] + ghost_width * DH;
+          uub[d] = (feq(BP->bbox[d + 3], PP->bbox[d + 3], DH / 2)) ? BP->bbox[d + 3] : BP->bbox[d + 3] - ghost_width * DH;
+        }
+        f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, source, llb, uub);
+        if (bufferhere)
+          free(bufferhere);
+      }
+      else if (myrank == BP->rank)
+      {
+        int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]);
+        MPI_Send(BP->fgfs[VP->sgfn], nnn, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
+      }
+      // PP->ble marks the last block that belongs to this patch
+      if (Bp == PP->ble)
+        break;
+      Bp = Bp->next;
+    }
+    if (myrank == 0)
+    {
+
+      string out_dir;
+      map<string, string>::iterator iter;
+      iter = parameters::str_par.find("output dir");
+      if (iter != parameters::str_par.end())
+      {
+        out_dir = iter->second;
+      }
+      else
+      {
+        // read parameter from file
+        const int LEN = 256;
+        char pline[LEN];
+        string str, sgrp, skey, sval;
+        int sind;
+        // a std::string holds the file name, so its length is not limited by a fixed buffer
+        string pname;
+        {
+          map<string, string>::iterator par_iter = parameters::str_par.find("inputpar");
+          if (par_iter != parameters::str_par.end())
+          {
+            pname = par_iter->second;
+          }
+          else
+          {
+            cout << "Error inputpar" << endl;
+            // abort all ranks with a failure status instead of exiting rank 0 with status 0
+            MPI_Abort(MPI_COMM_WORLD, 1);
+          }
+        }
+        ifstream inf(pname.c_str(), ifstream::in);
+        if (!inf.good())
+        {
+          cout << "Can not open parameter file " << pname << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+
+        for (int i = 1; inf.good(); i++)
+        {
+          inf.getline(pline, LEN);
+          str = pline;
+
+          int status = misc::parse_parts(str, sgrp, skey, sval, sind);
+          if (status == -1)
+          {
+            cout << "error reading parameter file " << pname << " in line " << i << endl;
+            MPI_Abort(MPI_COMM_WORLD, 1);
+          }
+          else if (status == 0)
+            continue;
+
+          if (sgrp == "ABE")
+          {
+            if (skey == "output dir")
+              out_dir = sval;
+          }
+        }
+        inf.close();
+
+        parameters::str_par.insert(map<string, string>::value_type("output dir", out_dir));
+      }
+
+      // bounded formatting: a too long name is reported instead of overrunning the buffer
+      char filename[100];
+      int written;
+      if (tag)
+        written = snprintf(filename, sizeof(filename), "%s/%s_2d_Lev%02d-%02d_%s_%05d.dat", out_dir.c_str(), tag, PP->lev, grd, VP->name, ncount);
+      else
+        written = snprintf(filename, sizeof(filename), "%s/2d_Lev%02d-%02d_%s_%05d.dat", out_dir.c_str(), PP->lev, grd, VP->name, ncount);
+      if (written < 0 || written >= int(sizeof(filename)))
+      {
+        cout << "Parallel::d2Dump_Data: output file name is too long." << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+
+      int gord = ghost_width;
+      f_d2dump(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, databuffer2, gord, VP->SoA);
+      writefile(time, PP->shape[0], PP->shape[1], PP->bbox[0], PP->bbox[3], PP->bbox[1], PP->bbox[4],
+                filename, databuffer2);
+    }
+    DumpList = DumpList->next;
+  }
+
+  if (myrank == 0)
+  {
+    free(databuffer);
+    free(databuffer2);
+  }
+}
+// Patch-list overload: call the single-patch d2Dump_Data for each patch in PL,
+// forwarding DumpList, tag, time and dT unchanged.  The position of a patch
+// within PL (starting at 0) is passed on as its grid index.
+void Parallel::d2Dump_Data(MyList<Patch> *PL, MyList<var> *DumpList, char *tag, double time, double dT)
+{
+  int patch_index = 0;
+  for (MyList<Patch> *node = PL; node; node = node->next)
+  {
+    d2Dump_Data(node->data, DumpList, tag, time, dT, patch_index);
+    ++patch_index;
+  }
+}
+// Now we dump the data including buffer points and ghost points of the given patch
+//
+// The target box is the patch bounding box enlarged by ghost_width grid spacings
+// on every side (shape PP->shape[i] + 2 * ghost_width).  For every variable in
+// DumpList the block data are gathered on rank 0 (blocks owned by other ranks are
+// sent with message tag 0) and written by rank 0 to
+//   <output dir>/[<tag>_]Lev<lev>_<variable name>_<ncount>.bin
+// with ncount = time / dT rounded to the nearest integer.
+// The output directory comes from parameters::str_par["output dir"]; when it is
+// not cached there yet it is read from the "ABE" group of the parameter file named
+// by parameters::str_par["inputpar"] and then cached.
+// All ranks must call this routine.  tag may be null.
+void Parallel::Dump_Data0(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT)
+{
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  //   round at 4 and 5
+  int ncount = int(time / dT + 0.5);
+
+  MPI_Status sta;
+  int DIM = 3;
+  double llb[3], uub[3], tllb[3], tuub[3];
+  int tshape[3];
+
+  // enlarged target box; the grid spacing is taken from the first block of the patch
+  for (int i = 0; i < 3; i++)
+  {
+    double DH = PP->blb->data->getdX(i);
+    tshape[i] = PP->shape[i] + 2 * ghost_width;
+    tllb[i] = PP->bbox[i] - ghost_width * DH;
+    tuub[i] = PP->bbox[i + dim] + ghost_width * DH;
+  }
+
+  int NN = tshape[0] * tshape[1] * tshape[2];
+  double *databuffer = 0;
+  if (myrank == 0)
+  {
+    databuffer = (double *)malloc(sizeof(double) * NN);
+    if (!databuffer)
+    {
+      cout << "on node# " << myrank << ", out of memory when dumping data." << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+  }
+
+  while (DumpList)
+  {
+    var *VP = DumpList->data;
+    MyList<Block> *Bp = PP->blb;
+    while (Bp)
+    {
+      Block *BP = Bp->data;
+      if (myrank == 0)
+      {
+        double *bufferhere = 0;
+        double *source = 0;
+        if (BP->rank == 0)
+        {
+          source = BP->fgfs[VP->sgfn];
+        }
+        else
+        {
+          int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]);
+          bufferhere = (double *)malloc(sizeof(double) * nnn);
+          if (!bufferhere)
+          {
+            cout << "on node#" << myrank << ", out of memory when dumping data." << endl;
+            MPI_Abort(MPI_COMM_WORLD, 1);
+          }
+          MPI_Recv(bufferhere, nnn, MPI_DOUBLE, BP->rank, 0, MPI_COMM_WORLD, &sta);
+          source = bufferhere;
+        }
+
+        // a block face lying on the enlarged box face is kept as is;
+        // any other face is moved inwards by ghost_width grid spacings
+        for (int d = 0; d < 3; d++)
+        {
+          double DH = BP->getdX(d);
+          llb[d] = (feq(BP->bbox[d], tllb[d], DH / 2)) ? BP->bbox[d] : BP->bbox[d] + ghost_width * DH;
+          uub[d] = (feq(BP->bbox[d + 3], tuub[d], DH / 2)) ? BP->bbox[d + 3] : BP->bbox[d + 3] - ghost_width * DH;
+        }
+        f_copy(DIM, tllb, tuub, tshape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, source, llb, uub);
+        if (bufferhere)
+          free(bufferhere);
+      }
+      else if (myrank == BP->rank)
+      {
+        int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]);
+        MPI_Send(BP->fgfs[VP->sgfn], nnn, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
+      }
+      // PP->ble marks the last block that belongs to this patch
+      if (Bp == PP->ble)
+        break;
+      Bp = Bp->next;
+    }
+    if (myrank == 0)
+    {
+
+      string out_dir;
+      map<string, string>::iterator iter;
+      iter = parameters::str_par.find("output dir");
+      if (iter != parameters::str_par.end())
+      {
+        out_dir = iter->second;
+      }
+      else
+      {
+        // read parameter from file
+        const int LEN = 256;
+        char pline[LEN];
+        string str, sgrp, skey, sval;
+        int sind;
+        // a std::string holds the file name, so its length is not limited by a fixed buffer
+        string pname;
+        {
+          map<string, string>::iterator par_iter = parameters::str_par.find("inputpar");
+          if (par_iter != parameters::str_par.end())
+          {
+            pname = par_iter->second;
+          }
+          else
+          {
+            cout << "Error inputpar" << endl;
+            // abort all ranks with a failure status instead of exiting rank 0 with status 0
+            MPI_Abort(MPI_COMM_WORLD, 1);
+          }
+        }
+        ifstream inf(pname.c_str(), ifstream::in);
+        if (!inf.good())
+        {
+          cout << "Can not open parameter file " << pname << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+
+        for (int i = 1; inf.good(); i++)
+        {
+          inf.getline(pline, LEN);
+          str = pline;
+
+          int status = misc::parse_parts(str, sgrp, skey, sval, sind);
+          if (status == -1)
+          {
+            cout << "error reading parameter file " << pname << " in line " << i << endl;
+            MPI_Abort(MPI_COMM_WORLD, 1);
+          }
+          else if (status == 0)
+            continue;
+
+          if (sgrp == "ABE")
+          {
+            if (skey == "output dir")
+              out_dir = sval;
+          }
+        }
+        inf.close();
+
+        parameters::str_par.insert(map<string, string>::value_type("output dir", out_dir));
+      }
+
+      // bounded formatting: a too long name is reported instead of overrunning the buffer
+      char filename[100];
+      int written;
+      if (tag)
+        written = snprintf(filename, sizeof(filename), "%s/%s_Lev%02d_%s_%05d.bin", out_dir.c_str(), tag, PP->lev, VP->name, ncount);
+      else
+        written = snprintf(filename, sizeof(filename), "%s/Lev%02d_%s_%05d.bin", out_dir.c_str(), PP->lev, VP->name, ncount);
+      if (written < 0 || written >= int(sizeof(filename)))
+      {
+        cout << "Parallel::Dump_Data0: output file name is too long." << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+
+      // bounds are passed as (x-low, x-up, y-low, y-up, z-low, z-up);
+      // the upper y bound is tuub[1] (it used to be passed as tuub[2])
+      writefile(time, tshape[0], tshape[1], tshape[2], tllb[0], tuub[0], tllb[1], tuub[1],
+                tllb[2], tuub[2], filename, databuffer);
+    }
+    DumpList = DumpList->next;
+  }
+
+  if (myrank == 0)
+    free(databuffer);
+}
+// Mapping the point is much easier than mapping the data itself
+// But the main problem is about the points near the boundary
+// worst case is -ghost -ghost+1 .... 0 * ......
+//
+// Symmetry-aware wrapper around the six-argument global_interp.
+//   DIM      : number of dimensions, only 3 is accepted (anything else aborts)
+//   ext      : number of grid points of datain in each direction
+//   CoX      : CoX[i] holds the ext[i] coordinates of direction i
+//   datain   : grid data, indexed through getarraylocation(3, ext, ...)
+//   poXb     : the point to interpolate to (not modified)
+//   ordn     : number of stencil points per direction, forwarded unchanged
+//   SoA      : per-direction factor applied whenever a reflection in that direction is used
+//   Symmetry : 2 reflects negative coordinates in all three directions,
+//              1 reflects only the third one, any other value reflects nothing
+// Returns the interpolated value multiplied by the accumulated reflection factor.
+double Parallel::global_interp(int DIM, int *ext, double **CoX, double *datain,
+                               double *poXb, int ordn, double *SoA, int Symmetry)
+{
+  if (DIM != 3)
+  {
+    cout << "Parallel::global_interp does not suport DIM = " << DIM << " for Symmetry." << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+  double resu;
+  double poX[3];
+  double asgn = 1;
+
+  // work on a copy so that the caller's point is left untouched
+  for (int i = 0; i < 3; i++)
+    poX[i] = poXb[i];
+
+  // reflect the point into the non-negative region and remember the factor
+  switch (Symmetry)
+  {
+  case 2:
+    for (int i = 0; i < 3; i++)
+      if (poX[i] < 0)
+      {
+        poX[i] = -poX[i];
+        asgn = asgn * SoA[i];
+      }
+    break;
+  case 1:
+    if (poX[2] < 0)
+    {
+      poX[2] = -poX[2];
+      asgn = asgn * SoA[2];
+    }
+  }
+
+  int extb[3];
+
+  for (int i = 0; i < 3; i++)
+    extb[i] = ext[i];
+
+  // When the reflected point is close to zero in a reflected direction, enlarge
+  // the extent in that direction so that mirrored points can be prepended below.
+  // case 2 has no break: it falls through to case 1 and therefore also treats
+  // the third direction.
+  switch (Symmetry)
+  {
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+  case 2:
+    if (poX[0] < (ghost_width - 1) * (CoX[0][1] - CoX[0][0]))
+      extb[0] = extb[0] + ghost_width - 1;
+    if (poX[1] < (ghost_width - 1) * (CoX[1][1] - CoX[1][0]))
+      extb[1] = extb[1] + ghost_width - 1;
+  case 1:
+    if (poX[2] < (ghost_width - 1) * (CoX[2][1] - CoX[2][0]))
+      extb[2] = extb[2] + ghost_width - 1;
+#else
+#ifdef Cell
+  case 2:
+    if (poX[0] < (ghost_width - 0.5) * (CoX[0][1] - CoX[0][0]))
+      extb[0] = extb[0] + ghost_width;
+    if (poX[1] < (ghost_width - 0.5) * (CoX[1][1] - CoX[1][0]))
+      extb[1] = extb[1] + ghost_width;
+  case 1:
+    if (poX[2] < (ghost_width - 0.5) * (CoX[2][1] - CoX[2][0]))
+      extb[2] = extb[2] + ghost_width;
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+  }
+
+  if (extb[0] > ext[0] || extb[1] > ext[1] || extb[2] > ext[2])
+  {
+    // at least one direction was enlarged: build temporary coordinates and data
+    double *CoXb[3];
+    int Nb = extb[0] * extb[1] * extb[2];
+    double *datab;
+    datab = new double[Nb];
+    for (int i = 0; i < 3; i++)
+    {
+      CoXb[i] = new double[extb[i]];
+      double DH = CoX[i][1] - CoX[i][0];
+      if (extb[i] > ext[i])
+      {
+        // mirroring needs the first coordinate to lie within one spacing above zero
+        if (CoX[i][0] > DH)
+        {
+          cout << "lower boundary[" << i << "] = " << CoX[i][0] << ", but SYmmetry = " << Symmetry << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        // ghost_width - 1 mirrored coordinates, then the original ones
+        for (int j = 0; j < ghost_width - 1; j++)
+          CoXb[i][j] = -CoX[i][ghost_width - 1 - j];
+        for (int j = ghost_width - 1; j < extb[i]; j++)
+          CoXb[i][j] = CoX[i][j - ghost_width + 1];
+#else
+#ifdef Cell
+        // ghost_width mirrored coordinates, then the original ones
+        for (int j = 0; j < ghost_width; j++)
+          CoXb[i][j] = -CoX[i][ghost_width - 1 - j];
+        for (int j = ghost_width; j < extb[i]; j++)
+          CoXb[i][j] = CoX[i][j - ghost_width];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      }
+      else
+      {
+        for (int j = 0; j < extb[i]; j++)
+          CoXb[i][j] = CoX[i][j];
+      }
+    }
+
+    // fill the enlarged data array: every enlarged index is mapped back to an
+    // index of datain, with a factor SoA[j] for each direction taken from the mirror side
+    for (int i = 0; i < Nb; i++)
+    {
+      int ind[3], indb[3];
+      getarrayindex(3, extb, indb, i);
+      double sgn = 1;
+      for (int j = 0; j < 3; j++)
+      {
+        if (extb[j] > ext[j])
+        {
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+          if (indb[j] < ghost_width - 1)
+          {
+            ind[j] = ghost_width - 1 - indb[j];
+            sgn = sgn * SoA[j];
+          }
+          else
+          {
+            ind[j] = 1 + indb[j] - ghost_width;
+          }
+#else
+#ifdef Cell
+          if (indb[j] < ghost_width)
+          {
+            ind[j] = ghost_width - 1 - indb[j];
+            sgn = sgn * SoA[j];
+          }
+          else
+          {
+            ind[j] = indb[j] - ghost_width;
+          }
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+        }
+        else
+          ind[j] = indb[j];
+      }
+      int lon = getarraylocation(3, ext, ind);
+      datab[i] = datain[lon] * sgn;
+    }
+
+    resu = global_interp(DIM, extb, CoXb, datab, poX, ordn);
+
+    for (int i = 0; i < 3; i++)
+      delete[] CoXb[i];
+    delete[] datab;
+  }
+  else
+  {
+    // nothing was enlarged: interpolate directly on the input data
+    resu = global_interp(DIM, ext, CoX, datain, poX, ordn);
+  }
+
+  return resu * asgn;
+}
+// Interpolate grid data to the point poX with ordn-point Lagrange interpolation,
+// applied one direction after the other.
+//   DIM    : number of dimensions
+//   ext    : number of grid points of datain in each direction
+//   CoX    : CoX[i] holds the ext[i] coordinates of direction i; the spacing is
+//            taken as CoX[i][1] - CoX[i][0]
+//   datain : the grid data
+//   poX    : the point to interpolate to
+//   ordn   : number of stencil points per direction; aborts when ordn > 2 * ghost_width
+// Returns the interpolated value.
+double Parallel::global_interp(int DIM, int *ext, double **CoX, double *datain,
+                               double *poX, int ordn)
+{
+  if (ordn > 2 * ghost_width)
+  {
+    cout << "Parallel::global_interp can not handle ordn = " << ordn << " > 2*ghost_width = " << 2 * ghost_width << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+
+  double *bbox, *datainbbox;
+  bbox = new double[2 * DIM];
+  datainbbox = new double[2 * DIM];
+
+  int *NN, *ind, *shape;
+  NN = new int[DIM];
+  ind = new int[DIM];
+  shape = new int[DIM];
+
+  for (int i = 0; i < DIM; i++)
+  {
+    // first index of the stencil in direction i
+    ind[i] = int((poX[i] - CoX[i][0]) / (CoX[i][1] - CoX[i][0])) - ordn / 2 + 1;
+    // poX may exactly locate on the boundary (exclude ghost)
+    if (ind[i] == -1 && feq(poX[i], CoX[i][0], (CoX[i][1] - CoX[i][0]) / 2))
+      ind[i] = 0;
+    /*
+         if(ind[i] < 0)
+         {
+           cout<<"Parallel::global_interp error ind["<<i<<"] = "<<ind[i]<<endl;
+           cout<<"pox = "<<poX[i]<<", CoX[0] = "<<CoX[i][0]<<endl;
+           MPI_Abort(MPI_COMM_WORLD,1);
+         }
+    */
+    // NOTE(review): this upper-boundary adjustment moves the stencil to start at
+    // ext - ordn - 1, so it ends at ext - 2 rather than ext - 1 -- confirm this is intended
+    if (ind[i] == ext[i] - ordn + 1 && feq(poX[i], CoX[i][ext[i] - ordn / 2], (CoX[i][1] - CoX[i][0]) / 2))
+      ind[i] = ext[i] - ordn - 1;
+    /*
+         if(ind[i]+ordn-1 > ext[i]-1)
+         {
+           cout<<"Parallel::global_interp error ind["<<i<<"] = "<<ind[i]<<" + ordn ("<<ordn<<") > ext = "<<ext[i]<<endl;
+           cout<<"pox = "<<poX[i]<<", CoX[ind] = "<<CoX[i][ind[i]]<<", CoX = ("<<CoX[i][0]<<","<<CoX[i][ext[i]-1]<<")"<<endl;
+           MPI_Abort(MPI_COMM_WORLD,1);
+         }
+    */
+    // bounding box of the stencil and of the whole input data
+    bbox[i] = CoX[i][ind[i]];
+    bbox[DIM + i] = CoX[i][ind[i] + ordn - 1];
+    datainbbox[i] = CoX[i][0];
+    datainbbox[DIM + i] = CoX[i][ext[i] - 1];
+    shape[i] = ordn;
+  }
+
+  // NN[i] = ordn^(DIM - i): number of values left before direction i is reduced
+  NN[DIM - 1] = ordn;
+  for (int i = DIM - 2; i >= 0; i--)
+    NN[i] = NN[i + 1] * ordn;
+
+  double *xpts, *funcvals;
+  xpts = new double[ordn];
+  funcvals = new double[ordn];
+  double *DDd, *DDd1 = 0, rr = 0;
+
+  DDd = new double[NN[0]];
+
+  // presumably extracts the stencil sub-block of datain into DDd -- the helper is defined elsewhere
+  copy(DIM, bbox, bbox + DIM, shape, DDd, datainbbox, datainbbox + DIM, ext, datain, bbox, bbox + DIM);
+
+  for (int i = 0; i < DIM; i++)
+  {
+    for (int j = ind[i]; j < ind[i] + ordn; j++)
+    {
+      xpts[j - ind[i]] = CoX[i][j];
+    }
+
+    if (i < DIM - 1)
+    {
+      // reduce direction i: every run of ordn consecutive values becomes one value
+      DDd1 = new double[NN[i + 1]];
+      for (int j = 0; j < NN[i + 1]; j++)
+      {
+        for (int k = 0; k < ordn; k++)
+          funcvals[k] = DDd[k + j * ordn];
+        DDd1[j] = Lagrangian_Int(poX[i], ordn, xpts, funcvals);
+      }
+      delete[] DDd;
+      DDd = DDd1;
+    }
+    else
+    {
+      for (int j = 0; j < ordn; j++)
+        funcvals[j] = DDd[j];
+      rr = Lagrangian_Int(poX[i], ordn, xpts, funcvals);
+      // DDd is the live buffer here.  For DIM >= 2 it is the same pointer as DDd1;
+      // for DIM == 1 DDd1 was never allocated, so DDd is the one that must be released.
+      delete[] DDd;
+    }
+  }
+
+  delete[] NN;
+  delete[] ind;
+  delete[] xpts;
+  delete[] funcvals;
+  delete[] bbox;
+  delete[] datainbbox;
+  delete[] shape;
+
+  return rr;
+}
+// Evaluate at x the Lagrange interpolant through the npts points
+// (xpts[k], funcvals[k]): the sum of funcvals[k] times the k-th basis polynomial.
+double Parallel::Lagrangian_Int(double x, int npts, double *xpts, double *funcvals)
+{
+  double total = 0;
+  int k = 0;
+  while (k < npts)
+  {
+    total = total + funcvals[k] * LagrangePoly(x, k, npts, xpts);
+    ++k;
+  }
+  return total;
+}
+// Value at x of the Lagrange basis polynomial attached to node pt:
+// the product over all nodes k != pt of (x - xpts[k]) / (xpts[pt] - xpts[k]).
+double Parallel::LagrangePoly(double x, int pt, int npts, double *xpts)
+{
+  double basis = 1;
+
+  // nodes below pt
+  int k = 0;
+  while (k < pt)
+  {
+    basis = basis * (x - xpts[k]) / (xpts[pt] - xpts[k]);
+    ++k;
+  }
+
+  // nodes above pt
+  k = pt + 1;
+  while (k < npts)
+  {
+    basis = basis * (x - xpts[k]) / (xpts[pt] - xpts[k]);
+    ++k;
+  }
+
+  return basis;
+}
+// Build one grid segment per block of the patch Pat (from Pat->blb up to and
+// including Pat->ble).  Each segment copies the block's full bounding box and
+// shape and points back to the block through Bg.
+// Returns the head of a newly allocated list (null when Pat->blb is null);
+// delete through destroyList().
+MyList<Parallel::gridseg> *Parallel::build_complete_gsl(Patch *Pat)
+{
+  MyList<Parallel::gridseg> *head = 0, *tail = 0;
+
+  for (MyList<Block> *node = Pat->blb; node; node = node->next)
+  {
+    Block *blk = node->data;
+
+    MyList<Parallel::gridseg> *fresh = new MyList<Parallel::gridseg>;
+    fresh->data = new Parallel::gridseg;
+
+    // append at the tail so that the block order is kept
+    if (tail)
+      tail->next = fresh;
+    else
+      head = fresh;
+    tail = fresh;
+
+    for (int d = 0; d < dim; d++)
+    {
+      tail->data->llb[d] = blk->bbox[d];
+      tail->data->uub[d] = blk->bbox[dim + d];
+      tail->data->shape[d] = blk->shape[d];
+    }
+    tail->data->Bg = blk;
+    tail->next = 0;
+
+    if (node == Pat->ble)
+      break;
+  }
+
+  return head;
+}
+// collect all grid segments or blocks including ghost and buffer for given patch list
+//
+// Concatenates, in list order, the segment lists built by the single-patch
+// build_complete_gsl for every patch in PatL.
+// Returns the head of the combined list, or null when no patch yields a segment.
+MyList<Parallel::gridseg> *Parallel::build_complete_gsl(MyList<Patch> *PatL)
+{
+  MyList<Parallel::gridseg> *cgsl = 0, *tail = 0;
+  for (; PatL; PatL = PatL->next)
+  {
+    MyList<Parallel::gridseg> *part = build_complete_gsl(PatL->data);
+    // a patch without blocks yields an empty list: skip it instead of
+    // dereferencing a null pointer while looking for the tail
+    if (!part)
+      continue;
+
+    if (tail)
+      tail->next = part;
+    else
+      cgsl = part;
+
+    // move to the last node of the part just appended
+    tail = part;
+    while (tail->next)
+      tail = tail->next;
+  }
+
+  return cgsl;
+}
+// Collect the information of a patch list: one grid segment per patch, carrying
+// the patch's bounding box and shape.  The segments refer to no block (Bg = 0).
+// Returns the head of a newly allocated list, or null when PatL is null.
+MyList<Parallel::gridseg> *Parallel::build_complete_gsl_virtual(MyList<Patch> *PatL)
+{
+  MyList<Parallel::gridseg> *head = 0, *tail = 0;
+
+  for (; PatL; PatL = PatL->next)
+  {
+    MyList<Parallel::gridseg> *fresh = new MyList<Parallel::gridseg>;
+    fresh->data = new Parallel::gridseg;
+
+    // append at the tail so that the patch order is kept
+    if (!head)
+      head = fresh;
+    else
+      tail->next = fresh;
+    tail = fresh;
+
+    Patch *pat = PatL->data;
+    for (int d = 0; d < dim; d++)
+    {
+      tail->data->llb[d] = pat->bbox[d];
+      tail->data->uub[d] = pat->bbox[dim + d];
+      tail->data->shape[d] = pat->shape[d];
+    }
+    tail->data->Bg = 0;
+    tail->next = 0;
+  }
+
+  return head;
+}
+// Collect the information of a patch list without buffer points: one grid
+// segment per patch, with the patch box reduced by lli[i] grid spacings at the
+// lower side and uui[i] at the upper side, and the shape reduced by lli[i] + uui[i].
+// The segments refer to no block (Bg = 0).
+// Returns the head of a newly allocated list, or null when PatL is null.
+MyList<Parallel::gridseg> *Parallel::build_complete_gsl_virtual2(MyList<Patch> *PatL) // - buffer
+{
+  MyList<Parallel::gridseg> *head = 0, *tail = 0;
+
+  for (; PatL; PatL = PatL->next)
+  {
+    MyList<Parallel::gridseg> *fresh = new MyList<Parallel::gridseg>;
+    fresh->data = new Parallel::gridseg;
+
+    // append at the tail so that the patch order is kept
+    if (!head)
+      head = fresh;
+    else
+      tail->next = fresh;
+    tail = fresh;
+
+    Patch *pat = PatL->data;
+    for (int d = 0; d < dim; d++)
+    {
+      double DH = pat->getdX(d);
+      tail->data->llb[d] = pat->bbox[d] + pat->lli[d] * DH;
+      tail->data->uub[d] = pat->bbox[dim + d] - pat->uui[d] * DH;
+      tail->data->shape[d] = pat->shape[d] - pat->lli[d] - pat->uui[d];
+    }
+    tail->data->Bg = 0;
+    tail->next = 0;
+  }
+
+  return head;
+}
+// Build one grid segment per block of the patch Pat (from Pat->blb up to and
+// including Pat->ble), without ghost points and without extension: a block face
+// that coincides with the patch face is kept, every other face is moved inwards
+// by ghost_width grid spacings.  Each segment points back to its block through Bg.
+// Returns the head of a newly allocated list (null when Pat->blb is null).
+MyList<Parallel::gridseg> *Parallel::build_bulk_gsl(Patch *Pat)
+{
+  MyList<Parallel::gridseg> *head = 0, *tail = 0;
+
+  for (MyList<Block> *node = Pat->blb; node; node = node->next)
+  {
+    Block *blk = node->data;
+
+    MyList<Parallel::gridseg> *fresh = new MyList<Parallel::gridseg>;
+    fresh->data = new Parallel::gridseg;
+
+    // append at the tail so that the block order is kept
+    if (tail)
+      tail->next = fresh;
+    else
+      head = fresh;
+    tail = fresh;
+
+    for (int d = 0; d < dim; d++)
+    {
+      double DH = blk->getdX(d);
+
+      if (feq(blk->bbox[dim + d], Pat->bbox[dim + d], DH / 2))
+        tail->data->uub[d] = blk->bbox[dim + d];
+      else
+        tail->data->uub[d] = blk->bbox[dim + d] - ghost_width * DH;
+
+      if (feq(blk->bbox[d], Pat->bbox[d], DH / 2))
+        tail->data->llb[d] = blk->bbox[d];
+      else
+        tail->data->llb[d] = blk->bbox[d] + ghost_width * DH;
+
+      // number of grid points spanned by [llb, uub]
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+      tail->data->shape[d] = int((tail->data->uub[d] - tail->data->llb[d]) / DH + 0.4) + 1;
+#else
+#ifdef Cell
+      tail->data->shape[d] = int((tail->data->uub[d] - tail->data->llb[d]) / DH + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+    }
+    tail->data->Bg = blk;
+    tail->next = 0;
+
+    if (node == Pat->ble)
+      break;
+  }
+
+  return head;
+}
+// Bulk part of block bp inside patch Pat, without extension: a block face that
+// coincides with the patch face is kept, every other face is moved inwards by
+// ghost_width grid spacings.
+// Returns a newly allocated single-node list whose segment points back to bp.
+MyList<Parallel::gridseg> *Parallel::build_bulk_gsl(Block *bp, Patch *Pat)
+{
+  MyList<Parallel::gridseg> *node = new MyList<Parallel::gridseg>;
+  node->data = new Parallel::gridseg;
+  Parallel::gridseg *seg = node->data;
+
+  for (int d = 0; d < dim; d++)
+  {
+    double DH = bp->getdX(d);
+
+    if (feq(bp->bbox[dim + d], Pat->bbox[dim + d], DH / 2))
+      seg->uub[d] = bp->bbox[dim + d];
+    else
+      seg->uub[d] = bp->bbox[dim + d] - ghost_width * DH;
+
+    if (feq(bp->bbox[d], Pat->bbox[d], DH / 2))
+      seg->llb[d] = bp->bbox[d];
+    else
+      seg->llb[d] = bp->bbox[d] + ghost_width * DH;
+
+    // number of grid points spanned by [llb, uub]
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+    seg->shape[d] = int((seg->uub[d] - seg->llb[d]) / DH + 0.4) + 1;
+#else
+#ifdef Cell
+    seg->shape[d] = int((seg->uub[d] - seg->llb[d]) / DH + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+  }
+  seg->Bg = bp;
+  node->next = 0;
+
+  return node;
+}
+// Copy a grid segment list.  Every node and every gridseg is newly allocated;
+// the Bg pointer is copied as is, so the clone refers to the same blocks.
+// With first_only set, only the head segment of p is copied and the returned
+// single node has next = 0.  Returns null when p is null.
+MyList<Parallel::gridseg> *Parallel::clone_gsl(MyList<Parallel::gridseg> *p, bool first_only)
+{
+  MyList<Parallel::gridseg> *head = 0, *tail = 0;
+
+  for (; p; p = p->next)
+  {
+    MyList<Parallel::gridseg> *dup = new MyList<Parallel::gridseg>;
+    dup->data = new Parallel::gridseg;
+
+    Parallel::gridseg *from = p->data;
+    Parallel::gridseg *to = dup->data;
+    to->Bg = from->Bg;
+    for (int d = 0; d < dim; d++)
+    {
+      to->llb[d] = from->llb[d];
+      to->uub[d] = from->uub[d];
+      to->shape[d] = from->shape[d];
+    }
+
+    if (tail)
+      tail->next = dup;
+    else
+      head = dup;
+
+    if (first_only)
+    {
+      head->next = 0;
+      return head;
+    }
+    tail = dup;
+  }
+  return head;
+}
+// Subtract the head segment of B from the head segment of A (only A->data and
+// B->data are looked at, the rest of both lists is ignored).
+// Returns a newly allocated list of segments cloned from A (so carrying A's Bg):
+//   - null when A is null;
+//   - a copy of A's head segment when B is null or the two boxes do not overlap;
+//   - otherwise up to two slabs per direction, one below and one above B;
+//     the result is null when no slab is thicker than half a grid spacing.
+// The grid spacing is taken from A's block, so A->data->Bg is dereferenced.
+MyList<Parallel::gridseg> *Parallel::gs_subtract(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B)
+{
+  if (!A)
+    return 0;
+  if (!B)
+    return clone_gsl(A, true);
+
+  double cut_plane[2 * dim], DH[dim];
+
+  // both segments must live on grids of the same spacing (checked only when B has a block)
+  for (int i = 0; i < dim; i++)
+  {
+    DH[i] = A->data->Bg->getdX(i);
+    if (B->data->Bg && !feq(DH[i], B->data->Bg->getdX(i), DH[i] / 2))
+    {
+      cout << "Parallel::gs_subtract meets different grid segment " << DH[i] << " vs " << B->data->Bg->getdX(i) << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+  }
+
+  MyList<Parallel::gridseg> *C = 0, *q;
+  // no overlap in some direction: nothing is removed from A;
+  // otherwise start with cut_plane equal to A's own box
+  for (int i = 0; i < dim; i++)
+  {
+    if (B->data->llb[i] > A->data->uub[i] || B->data->uub[i] < A->data->llb[i])
+      return clone_gsl(A, true);
+    cut_plane[i] = A->data->llb[i];
+    cut_plane[i + dim] = A->data->uub[i];
+  }
+
+  // Direction by direction, cut_plane[i] / cut_plane[i + dim] are replaced by the
+  // bounds of the overlap of A and B.  A slab created in direction i therefore
+  // spans the overlap in the directions handled before and A's full extent in
+  // the directions still to come.
+  for (int i = 0; i < dim; i++)
+  {
+    cut_plane[i] = Mymax(A->data->llb[i], B->data->llb[i]);
+    // part of A below B in direction i
+    if (cut_plane[i] - A->data->llb[i] > DH[i] / 2)
+    {
+      q = clone_gsl(A, true);
+      // prolong the list from head
+      if (C)
+        q->next = C;
+      C = q;
+      for (int j = 0; j < dim; j++)
+      {
+        if (i == j)
+        {
+          C->data->llb[i] = A->data->llb[i];
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+          // Vertex: the slab ends one grid spacing before the cut plane
+          C->data->uub[i] = Mymax(C->data->llb[i], cut_plane[i] - DH[i]);
+#else
+#ifdef Cell
+          // Cell: the slab ends at the cut plane itself
+          C->data->uub[i] = Mymax(C->data->llb[i], cut_plane[i]);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+        }
+        else
+        {
+          C->data->llb[j] = cut_plane[j];
+          C->data->uub[j] = cut_plane[j + dim];
+        }
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4) + 1;
+#else
+#ifdef Cell
+        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      }
+    }
+
+    cut_plane[i + dim] = Mymin(A->data->uub[i], B->data->uub[i]);
+    // part of A above B in direction i
+    if (A->data->uub[i] - cut_plane[i + dim] > DH[i] / 2)
+    {
+      q = clone_gsl(A, true);
+      if (C)
+        q->next = C;
+      C = q;
+      for (int j = 0; j < dim; j++)
+      {
+        if (i == j)
+        {
+          C->data->uub[i] = A->data->uub[i];
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+          // Vertex: the slab starts one grid spacing after the cut plane
+          C->data->llb[i] = Mymin(C->data->uub[i], cut_plane[i + dim] + DH[i]);
+#else
+#ifdef Cell
+          // Cell: the slab starts at the cut plane itself
+          C->data->llb[i] = Mymin(C->data->uub[i], cut_plane[i + dim]);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+        }
+        else
+        {
+          C->data->llb[j] = cut_plane[j];
+          C->data->uub[j] = cut_plane[j + dim];
+        }
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4) + 1;
+#else
+#ifdef Cell
+        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      }
+    }
+  }
+  return C;
+}
+// stupid method
+/*
+MyList<Parallel::gridseg> *Parallel::gsl_subtract(MyList<Parallel::gridseg> *A,MyList<Parallel::gridseg> *B) //A subtract B but with A's information
+{
+// always make return and A, B distinct
+  if(!A) return 0;
+
+  if(!B) return clone_gsl(A,0);
+
+  MyList<Parallel::gridseg> *C=0,*C0,*C1,*Cc,*CC0,*gs;
+
+  while(A)
+  {
+     C0=gs_subtract(A,B);  // note C0 becomes a list after subtraction
+     C1=B->next;
+     while(C1)
+     {
+  CC0=C0;
+  Cc=0;
+  while(CC0)
+  {
+    gs=gs_subtract(CC0,C1);
+    if(Cc) Cc->catList(gs);
+    else   Cc=gs;
+    CC0=CC0->next;
+  }
+  if(C0) C0->destroyList();
+  C0=Cc;
+  C1=C1->next;
+     }
+     if(C) C->catList(C0);
+     else  C=C0;
+     A=A->next;
+  }
+
+  return C;
+}
+*/
+// more clever method
+// Peels the segments of B off a running remainder of A, one B segment at a time.
+MyList<Parallel::gridseg> *Parallel::gsl_subtract(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B) // A subtract B but with A's information
+{
+  // always make return and A, B distinct: the result starts as a clone of A
+  // and is rebuilt from gs_subtract output on every pass.
+  // Returns 0 when A is empty or when nothing of A remains.
+  if (!A)
+    return 0;
+
+  // working copy of A; owned here until it is returned
+  MyList<Parallel::gridseg> *remain = clone_gsl(A, 0);
+
+  for (MyList<Parallel::gridseg> *cut = B; cut; cut = cut->next)
+  {
+    // nothing is left to subtract from, so the final answer is empty
+    if (!remain)
+      return 0;
+
+    // subtract the current B segment from every piece of the remainder
+    MyList<Parallel::gridseg> *pieces = 0;
+    for (MyList<Parallel::gridseg> *seg = remain; seg; seg = seg->next)
+    {
+      MyList<Parallel::gridseg> *diff = gs_subtract(seg, cut);
+      if (pieces)
+        pieces->catList(diff);
+      else
+        pieces = diff;
+    }
+
+    // the old remainder has been consumed; continue with the new pieces
+    remain->destroyList();
+    remain = pieces;
+  }
+
+  // may be 0 if the last B segment removed everything
+  return remain;
+}
+MyList<Parallel::gridseg> *Parallel::gs_and(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B)
+{
+  // Intersection of the two head segments of A and B, built from a clone of A.
+  // Returns 0 when either input is missing or the boxes are disjoint in some direction.
+  if (!A || !B)
+    return 0;
+
+  double lo[dim], hi[dim];
+  for (int d = 0; d < dim; d++)
+  {
+    lo[d] = Mymax(A->data->llb[d], B->data->llb[d]);
+    hi[d] = Mymin(A->data->uub[d], B->data->uub[d]);
+    // an inverted interval means there is no overlap at all
+    if (lo[d] > hi[d])
+      return 0;
+  }
+
+  // start from clone_gsl's copy of A and shrink it to the overlap box
+  MyList<Parallel::gridseg> *C = clone_gsl(A, true);
+  Parallel::gridseg *seg = C->data;
+  for (int d = 0; d < dim; d++)
+  {
+    const double dx = seg->Bg->getdX(d);
+    seg->llb[d] = lo[d];
+    seg->uub[d] = hi[d];
+    // recount the extent of the shrunken box in direction d
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+    seg->shape[d] = int((seg->uub[d] - seg->llb[d]) / dx + 0.4) + 1;
+#else
+#ifdef Cell
+    seg->shape[d] = int((seg->uub[d] - seg->llb[d]) / dx + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+  }
+
+  return C;
+}
+// overlap of A_i and (union of all j of B_j): every segment of A is intersected with every segment of B
+MyList<Parallel::gridseg> *Parallel::gsl_and(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B) // A and B but with A's information
+{
+  MyList<Parallel::gridseg> *result = 0;
+
+  for (MyList<Parallel::gridseg> *a = A; a; a = a->next)
+  {
+    for (MyList<Parallel::gridseg> *b = B; b; b = b->next)
+    {
+      // gs_and yields 0 for disjoint pairs; such results are passed on as-is
+      MyList<Parallel::gridseg> *piece = gs_and(a, b);
+      if (result)
+        result->catList(piece);
+      else
+        result = piece;
+    }
+  }
+
+  return result;
+}
+// collect all ghost grid segments of a patch: each block's full box minus its build_bulk_gsl region
+MyList<Parallel::gridseg> *Parallel::build_ghost_gsl(Patch *Pat)
+{
+  MyList<Parallel::gridseg> *ghosts = 0;
+  // walk the patch's blocks from blb up to and including ble
+  for (MyList<Block> *node = Pat->blb; node; node = node->next)
+  {
+    Block *blk = node->data;
+    // temporary one-element list describing the whole block
+    MyList<Parallel::gridseg> *whole = new MyList<Parallel::gridseg>;
+    whole->data = new Parallel::gridseg;
+    for (int d = 0; d < dim; d++)
+    {
+      whole->data->llb[d] = blk->bbox[d];
+      whole->data->uub[d] = blk->bbox[dim + d];
+      whole->data->shape[d] = blk->shape[d];
+    }
+    whole->data->Bg = blk;
+    whole->next = 0;
+
+    MyList<Parallel::gridseg> *bulk = build_bulk_gsl(blk, Pat);
+    MyList<Parallel::gridseg> *diff = gs_subtract(whole, bulk);
+    if (ghosts)
+      ghosts->catList(diff);
+    else
+      ghosts = diff;
+
+    // both helper lists were only needed to form the difference
+    bulk->destroyList();
+    whole->destroyList();
+    if (node == Pat->ble)
+      break;
+  }
+
+  return ghosts;
+}
+// collect all ghost grid segments or blocks for given patch list
+// (patches that yield no ghost segments are skipped instead of being dereferenced)
+MyList<Parallel::gridseg> *Parallel::build_ghost_gsl(MyList<Patch> *PatL)
+{
+  MyList<Parallel::gridseg> *cgsl = 0; // head of the combined list
+  MyList<Parallel::gridseg> *gs = 0;   // last element of the combined list
+  while (PatL)
+  {
+    MyList<Parallel::gridseg> *part = build_ghost_gsl(PatL->data);
+    if (part)
+    {
+      // link the new sub-list behind the current tail (or make it the head)
+      if (gs)
+        gs->next = part;
+      else
+        cgsl = part;
+      gs = part;
+      while (gs->next)
+        gs = gs->next;
+    }
+    PatL = PatL->next;
+  }
+
+  return cgsl;
+}
+// Collect, for every block of Pat whose rank equals rank_in, the block's region with the inter-block ghost zones removed.
+// Special for Sync usage, so we do not need to consider missing points (llb uses the same rule for Vertex and Cell here).
+MyList<Parallel::gridseg> *Parallel::build_owned_gsl0(Patch *Pat, int rank_in)
+{
+  MyList<Parallel::gridseg> *cgsl = 0, *gs; // cgsl: head of the result list, gs: its current tail
+  MyList<Block> *BP = Pat->blb;
+  while (BP)
+  {
+    Block *bp = BP->data;
+    if (bp->rank == rank_in)
+    {
+      if (!cgsl)
+      {
+        cgsl = gs = new MyList<Parallel::gridseg>; // first matching block starts the list
+        gs->data = new Parallel::gridseg;
+      }
+      else
+      {
+        gs->next = new MyList<Parallel::gridseg>; // later matching blocks are appended at the tail
+        gs = gs->next;
+        gs->data = new Parallel::gridseg;
+      }
+
+      for (int i = 0; i < dim; i++)
+      {
+        double DH = bp->getdX(i);
+        gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] : bp->bbox[dim + i] - ghost_width * DH; // keep a side lying on the patch boundary, otherwise pull it in by ghost_width * DH
+        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] : bp->bbox[i] + ghost_width * DH; // same rule for the lower side
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1; // Vertex build: one more than the Cell count
+#else
+#ifdef Cell
+        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4); // + 0.4 guards the truncation against round-off
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      }
+      gs->data->Bg = BP->data;
+      gs->next = 0; // terminate the list after the segment just filled in
+    }
+
+    if (BP == Pat->ble) // ble is the last block of this patch: stop after processing it
+      break;
+    BP = BP->next;
+  }
+
+  return cgsl; // stays 0 when no block of the patch has rank rank_in
+}
+// Collect, for every block of Pat whose rank equals rank_in, the block's region without ghost zones; unlike build_owned_gsl0, the Vertex build keeps one extra point on interior lower sides.
+MyList<Parallel::gridseg> *Parallel::build_owned_gsl1(Patch *Pat, int rank_in)
+{
+  MyList<Parallel::gridseg> *cgsl = 0, *gs; // cgsl: head of the result list, gs: its current tail
+  MyList<Block> *BP = Pat->blb;
+  while (BP)
+  {
+    Block *bp = BP->data;
+    if (bp->rank == rank_in)
+    {
+      if (!cgsl)
+      {
+        cgsl = gs = new MyList<Parallel::gridseg>; // first matching block starts the list
+        gs->data = new Parallel::gridseg;
+      }
+      else
+      {
+        gs->next = new MyList<Parallel::gridseg>; // later matching blocks are appended at the tail
+        gs = gs->next;
+        gs->data = new Parallel::gridseg;
+      }
+
+      for (int i = 0; i < dim; i++)
+      {
+        double DH = bp->getdX(i);
+        gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] : bp->bbox[dim + i] - ghost_width * DH; // keep a side lying on the patch boundary, otherwise pull it in by ghost_width * DH
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        // NOTE: our dividing structure is (excluding ghost)
+        // -1 0
+        //       1  2
+        // so (0,1) does not belong to any part for the vertex structure; we always put it in the right part, which is consistent with
+        // the fortran routine where we always take floor to get the index
+        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] : bp->bbox[i] + (ghost_width - 1) * DH; // interior lower side: only ghost_width - 1 spacings are removed
+        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
+#else
+#ifdef Cell
+        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] : bp->bbox[i] + ghost_width * DH; // interior lower side: the full ghost_width is removed
+        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      }
+      gs->data->Bg = BP->data;
+      gs->next = 0; // terminate the list after the segment just filled in
+    }
+
+    if (BP == Pat->ble) // ble is the last block of this patch: stop after processing it
+      break;
+    BP = BP->next;
+  }
+
+  return cgsl; // stays 0 when no block of the patch has rank rank_in
+}
+// Collect all grid segments without ghost nor buffer for the given patch: sides on the patch boundary are moved inward by Pat->uui / Pat->lli spacings.
+MyList<Parallel::gridseg> *Parallel::build_owned_gsl2(Patch *Pat, int rank_in)
+{
+  MyList<Parallel::gridseg> *cgsl = 0, *gs; // cgsl: head of the result list, gs: its current tail
+  MyList<Block> *BP = Pat->blb;
+  while (BP)
+  {
+    Block *bp = BP->data;
+    if (bp->rank == rank_in)
+    {
+      if (!cgsl)
+      {
+        cgsl = gs = new MyList<Parallel::gridseg>; // first matching block starts the list
+        gs->data = new Parallel::gridseg;
+      }
+      else
+      {
+        gs->next = new MyList<Parallel::gridseg>; // later matching blocks are appended at the tail
+        gs = gs->next;
+        gs->data = new Parallel::gridseg;
+      }
+
+      for (int i = 0; i < dim; i++)
+      {
+        double DH = bp->getdX(i);
+        gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] - Pat->uui[i] * DH : bp->bbox[dim + i] - ghost_width * DH; // patch boundary: drop uui[i] spacings; interior: drop ghost_width spacings
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        // NOTE: our dividing structure is (excluding ghost)
+        // -1 0
+        //       1  2
+        // so (0,1) does not belong to any part for the vertex structure; we always put it in the right part, which is consistent with
+        // the fortran routine where we always take floor to get the index
+        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] + Pat->lli[i] * DH : bp->bbox[i] + (ghost_width - 1) * DH; // patch boundary: drop lli[i] spacings; interior: drop ghost_width - 1
+        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
+#else
+#ifdef Cell
+        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] + Pat->lli[i] * DH : bp->bbox[i] + ghost_width * DH; // patch boundary: drop lli[i] spacings; interior: drop ghost_width
+        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      }
+      gs->data->Bg = BP->data;
+      gs->next = 0; // terminate the list after the segment just filled in
+    }
+
+    if (BP == Pat->ble) // ble is the last block of this patch: stop after processing it
+      break;
+    BP = BP->next;
+  }
+
+  return cgsl; // stays 0 when no block of the patch has rank rank_in
+}
+// Collect all grid segments without ghost for the given patch, also removing ghost_width on the patch boundary (for interpolation); lower sides lying on a symmetry plane (coordinate 0) are restored.
+MyList<Parallel::gridseg> *Parallel::build_owned_gsl3(Patch *Pat, int rank_in, int Symmetry)
+{
+  MyList<Parallel::gridseg> *cgsl = 0, *gs; // cgsl: head of the result list, gs: its current tail
+  MyList<Block> *BP = Pat->blb;
+  while (BP)
+  {
+    Block *bp = BP->data;
+    if (bp->rank == rank_in)
+    {
+      if (!cgsl)
+      {
+        cgsl = gs = new MyList<Parallel::gridseg>; // first matching block starts the list
+        gs->data = new Parallel::gridseg;
+      }
+      else
+      {
+        gs->next = new MyList<Parallel::gridseg>; // later matching blocks are appended at the tail
+        gs = gs->next;
+        gs->data = new Parallel::gridseg;
+      }
+
+      for (int i = 0; i < dim; i++)
+      {
+        double DH = bp->getdX(i);
+        gs->data->uub[i] = bp->bbox[dim + i] - ghost_width * DH; // unconditional: no special case for the patch boundary
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        // NOTE: our dividing structure is (excluding ghost)
+        // -1 0
+        //       1  2
+        // so (0,1) does not belong to any part for the vertex structure; we always put it in the right part, which is consistent with
+        // the fortran routine where we always take floor to get the index
+        gs->data->llb[i] = bp->bbox[i] + (ghost_width - 1) * DH;
+        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
+#else
+#ifdef Cell
+        gs->data->llb[i] = bp->bbox[i] + ghost_width * DH;
+        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      }
+      // Symmetry consideration: Symmetry > 0 treats direction 2, Symmetry > 1 additionally treats directions 0 and 1
+      if (Symmetry > 0)
+      {
+        double DH = bp->getdX(2);
+        if (feq(bp->bbox[2], 0, DH / 2)) // block starts at coordinate 0 in direction 2
+        {
+          gs->data->llb[2] = bp->bbox[2]; // undo the trimming of the lower side and recount
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+          gs->data->shape[2] = int((gs->data->uub[2] - gs->data->llb[2]) / DH + 0.4) + 1;
+#else
+#ifdef Cell
+          gs->data->shape[2] = int((gs->data->uub[2] - gs->data->llb[2]) / DH + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+        }
+        if (Symmetry > 1)
+        {
+          for (int i = 0; i < 2; i++)
+          {
+            DH = bp->getdX(i);
+            if (feq(bp->bbox[i], 0, DH / 2)) // block starts at coordinate 0 in direction i
+            {
+              gs->data->llb[i] = bp->bbox[i]; // undo the trimming of the lower side and recount
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+              gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
+#else
+#ifdef Cell
+              gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+            }
+          }
+        }
+      }
+
+      gs->data->Bg = BP->data;
+      gs->next = 0; // terminate the list after the segment just filled in
+    }
+
+    if (BP == Pat->ble) // ble is the last block of this patch: stop after processing it
+      break;
+    BP = BP->next;
+  }
+
+  return cgsl; // stays 0 when no block of the patch has rank rank_in
+}
+// Collect all grid segments without ghost nor buffer for the given patch (patch-boundary sides lose Pat->uui / Pat->lli spacings),
+// and additionally remove ghost_width on every side for interpolation; lower sides on a symmetry plane (coordinate 0) are restored.
+MyList<Parallel::gridseg> *Parallel::build_owned_gsl4(Patch *Pat, int rank_in, int Symmetry)
+{
+  MyList<Parallel::gridseg> *cgsl = 0, *gs; // cgsl: head of the result list, gs: its current tail
+  MyList<Block> *BP = Pat->blb;
+  while (BP)
+  {
+    Block *bp = BP->data;
+    if (bp->rank == rank_in)
+    {
+      if (!cgsl)
+      {
+        cgsl = gs = new MyList<Parallel::gridseg>; // first matching block starts the list
+        gs->data = new Parallel::gridseg;
+      }
+      else
+      {
+        gs->next = new MyList<Parallel::gridseg>; // later matching blocks are appended at the tail
+        gs = gs->next;
+        gs->data = new Parallel::gridseg;
+      }
+
+      for (int i = 0; i < dim; i++)
+      {
+        double DH = bp->getdX(i);
+        gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] - Pat->uui[i] * DH : bp->bbox[dim + i]; // step 1: drop uui[i] spacings on the patch boundary only
+        gs->data->uub[i] -= ghost_width * DH; // step 2: drop ghost_width spacings in every case
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        // NOTE: our dividing structure is (excluding ghost)
+        // -1 0
+        //       1  2
+        // so (0,1) does not belong to any part for the vertex structure; we always put it in the right part, which is consistent with
+        // the fortran routine where we always take floor to get the index
+        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] + Pat->lli[i] * DH : bp->bbox[i]; // step 1: drop lli[i] spacings on the patch boundary only
+        gs->data->llb[i] += (ghost_width - 1) * DH; // step 2: drop ghost_width - 1 spacings in every case
+        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
+#else
+#ifdef Cell
+        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] + Pat->lli[i] * DH : bp->bbox[i]; // step 1: drop lli[i] spacings on the patch boundary only
+        gs->data->llb[i] += ghost_width * DH; // step 2: drop ghost_width spacings in every case
+        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      }
+      // Symmetry consideration: Symmetry > 0 treats direction 2, Symmetry > 1 additionally treats directions 0 and 1
+      if (Symmetry > 0)
+      {
+        double DH = bp->getdX(2);
+        if (feq(bp->bbox[2], 0, DH / 2)) // block starts at coordinate 0 in direction 2
+        {
+          gs->data->llb[2] = bp->bbox[2]; // undo the trimming of the lower side and recount
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+          gs->data->shape[2] = int((gs->data->uub[2] - gs->data->llb[2]) / DH + 0.4) + 1;
+#else
+#ifdef Cell
+          gs->data->shape[2] = int((gs->data->uub[2] - gs->data->llb[2]) / DH + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+        }
+        if (Symmetry > 1)
+        {
+          for (int i = 0; i < 2; i++)
+          {
+            DH = bp->getdX(i);
+            if (feq(bp->bbox[i], 0, DH / 2)) // block starts at coordinate 0 in direction i
+            {
+              gs->data->llb[i] = bp->bbox[i]; // undo the trimming of the lower side and recount
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+              gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
+#else
+#ifdef Cell
+              gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+            }
+          }
+        }
+      }
+
+      gs->data->Bg = BP->data;
+      gs->next = 0; // terminate the list after the segment just filled in
+    }
+
+    if (BP == Pat->ble) // ble is the last block of this patch: stop after processing it
+      break;
+    BP = BP->next;
+  }
+
+  return cgsl; // stays 0 when no block of the patch has rank rank_in
+}
+// Collect all grid segments without ghost nor buffer for the given patch, no extension: unlike build_owned_gsl2, llb uses the same rule for Vertex and Cell.
+MyList<Parallel::gridseg> *Parallel::build_owned_gsl5(Patch *Pat, int rank_in)
+{
+  MyList<Parallel::gridseg> *cgsl = 0, *gs; // cgsl: head of the result list, gs: its current tail
+  MyList<Block> *BP = Pat->blb;
+  while (BP)
+  {
+    Block *bp = BP->data;
+    if (bp->rank == rank_in)
+    {
+      if (!cgsl)
+      {
+        cgsl = gs = new MyList<Parallel::gridseg>; // first matching block starts the list
+        gs->data = new Parallel::gridseg;
+      }
+      else
+      {
+        gs->next = new MyList<Parallel::gridseg>; // later matching blocks are appended at the tail
+        gs = gs->next;
+        gs->data = new Parallel::gridseg;
+      }
+
+      for (int i = 0; i < dim; i++)
+      {
+        double DH = bp->getdX(i);
+        gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] - Pat->uui[i] * DH : bp->bbox[dim + i] - ghost_width * DH; // patch boundary: drop uui[i] spacings; interior: drop ghost_width spacings
+        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] + Pat->lli[i] * DH : bp->bbox[i] + ghost_width * DH; // patch boundary: drop lli[i] spacings; interior: drop ghost_width spacings
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1; // Vertex build: one more than the Cell count
+#else
+#ifdef Cell
+        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4); // + 0.4 guards the truncation against round-off
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      }
+      gs->data->Bg = BP->data;
+      gs->next = 0; // terminate the list after the segment just filled in
+    }
+
+    if (BP == Pat->ble) // ble is the last block of this patch: stop after processing it
+      break;
+    BP = BP->next;
+  }
+
+  return cgsl; // stays 0 when no block of the patch has rank rank_in
+}
+// collect all grid segments or blocks without ghost for a given patch list
+// naive method (disabled, kept for reference): tracks the list tail by hand instead of using catList
+/*
+MyList<Parallel::gridseg> *Parallel::build_owned_gsl(MyList<Patch> *PatL,int rank_in,int type,int Symmetry)
+{
+       MyList<Parallel::gridseg> *cgsl=0,*gs;
+       while(PatL)
+       {
+    if(!cgsl)
+    {
+            switch(type)
+      {
+         case 0:
+                  cgsl = build_owned_gsl0(PatL->data,rank_in);
+      break;
+         case 1:
+                  cgsl = build_owned_gsl1(PatL->data,rank_in);
+      break;
+         case 2:
+                  cgsl = build_owned_gsl2(PatL->data,rank_in);
+      break;
+         case 3:
+                  cgsl = build_owned_gsl3(PatL->data,rank_in,Symmetry);
+      break;
+         case 4:
+                  cgsl = build_owned_gsl4(PatL->data,rank_in,Symmetry);
+      break;
+         case 5:
+                  cgsl = build_owned_gsl5(PatL->data,rank_in);
+      break;
+               default:
+      cout<<"Parallel::build_owned_gsl : unknown type = "<<type<<endl;
+                  MPI_Abort(MPI_COMM_WORLD,1);
+      }
+       gs = cgsl;
+       while(gs && gs->next) gs = gs->next;
+    }
+    else
+    {
+       switch(type)
+      {
+         case 0:
+                  gs->next = build_owned_gsl0(PatL->data,rank_in);
+      break;
+         case 1:
+                  gs->next = build_owned_gsl1(PatL->data,rank_in);
+      break;
+         case 2:
+                  gs->next = build_owned_gsl2(PatL->data,rank_in);
+      break;
+         case 3:
+                  gs->next = build_owned_gsl3(PatL->data,rank_in,Symmetry);
+      break;
+         case 4:
+                  gs->next = build_owned_gsl4(PatL->data,rank_in,Symmetry);
+      break;
+         case 5:
+                  gs->next = build_owned_gsl5(PatL->data,rank_in);
+      break;
+               default:
+      cout<<"Parallel::build_owned_gsl : unknown type = "<<type<<endl;
+                  MPI_Abort(MPI_COMM_WORLD,1);
+      }
+       while(gs && gs->next) gs = gs->next;
+    }
+    PatL = PatL->next;
+       }
+
+       return cgsl;
+}
+*/
+// more clever method: build the owned segments patch by patch (builder chosen by 'type', 0..5) and chain them with catList
+MyList<Parallel::gridseg> *Parallel::build_owned_gsl(MyList<Patch> *PatL, int rank_in, int type, int Symmetry)
+{
+  MyList<Parallel::gridseg> *cgsl = 0; // combined result over all patches
+  while (PatL)
+  {
+    MyList<Parallel::gridseg> *gs = 0; // stays 0 for an unknown type, so nothing indeterminate is chained if MPI_Abort returns
+    switch (type)
+    {
+    case 0:
+      gs = build_owned_gsl0(PatL->data, rank_in);
+      break;
+    case 1:
+      gs = build_owned_gsl1(PatL->data, rank_in);
+      break;
+    case 2:
+      gs = build_owned_gsl2(PatL->data, rank_in);
+      break;
+    case 3:
+      gs = build_owned_gsl3(PatL->data, rank_in, Symmetry);
+      break;
+    case 4:
+      gs = build_owned_gsl4(PatL->data, rank_in, Symmetry);
+      break;
+    case 5:
+      gs = build_owned_gsl5(PatL->data, rank_in);
+      break;
+    default:
+      cout << "Parallel::build_owned_gsl : unknown type = " << type << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+    if (cgsl)
+      cgsl->catList(gs);
+    else
+      cgsl = gs;
+    PatL = PatL->next;
+  }
+  return cgsl; // 0 when no patch contributed a segment
+}
+// Determine the real grid segments from overlaps: for every (source, destination) pair whose boxes overlap, append one entry to *out_src and one to *out_dst (entry k of both lists describes the same pair).
+void Parallel::build_gstl(MyList<Parallel::gridseg> *srci, MyList<Parallel::gridseg> *dsti,
+                          MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst)
+{
+  *out_src = *out_dst = 0; // both outputs stay 0 when there is no overlapping pair
+
+  if (!srci || !dsti)
+    return;
+
+  MyList<Parallel::gridseg> *s, *d;   // cursors over the input lists
+  MyList<Parallel::gridseg> *s2, *d2; // tails of the two output lists
+
+  double llb[dim], uub[dim]; // overlap box of the pair currently examined
+
+  s = srci;
+  while (s)
+  {
+    Parallel::gridseg *sd = s->data;
+    d = dsti;
+    while (d)
+    {
+      Parallel::gridseg *dd = d->data;
+      bool flag = true; // cleared as soon as one direction shows no overlap
+      for (int i = 0; i < dim; i++)
+      {
+        double SH = sd->Bg->getdX(i), DH = dd->Bg->getdX(i); // getdX of the source / destination block in direction i
+        llb[i] = Mymax(sd->llb[i], dd->llb[i]);
+        uub[i] = Mymin(sd->uub[i], dd->uub[i]);
+        // make sure the region boundary is consistent with the grids
+        // here we only judge whether the domain is empty, so there is no need to adjust the alignment
+        double lb = llb[i], ub = uub[i];
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        // ---*---
+        // x-------x
+        //		if     (int(2*(sd->uub[i]-uub[i])/SH+0.4)%2 == 1) ub = uub[i]-SH/2;
+        //		else if(int(2*(dd->uub[i]-uub[i])/DH+0.4)%2 == 1) ub = uub[i]-DH/2;
+        //		if     (int(2*(llb[i]-sd->llb[i])/SH+0.4)%2 == 1) lb = llb[i]+SH/2;
+        //		else if(int(2*(llb[i]-dd->llb[i])/DH+0.4)%2 == 1) lb = llb[i]+DH/2;
+        if (lb > ub + Mymin(SH, DH) / 2) // empty only when the bounds are inverted by more than half the finer spacing
+        {
+          flag = false;
+          break;
+        } // special for isolated point
+#else
+#ifdef Cell
+        // |------|
+        // |-------------|
+        //		if     (int(2*(sd->uub[i]-uub[i])/SH+0.4)%2 == 1) ub = uub[i]+SH/2;
+        //		else if(int(2*(dd->uub[i]-uub[i])/DH+0.4)%2 == 1) ub = uub[i]+DH/2;
+        //        |------|
+        // |-------------|
+        //		if     (int(2*(llb[i]-sd->llb[i])/SH+0.4)%2 == 1) lb = llb[i]-SH/2;
+        //		else if(int(2*(llb[i]-dd->llb[i])/DH+0.4)%2 == 1) lb = llb[i]-DH/2;
+        if (ub - lb < Mymin(SH, DH) / 2) // empty when the overlap is thinner than half the finer spacing
+        {
+          flag = false;
+          break;
+        } // even for an isolated point, it has a cell belonging to it
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      }
+
+      if (flag)
+      {
+        if (!(*out_src))
+        {
+          *out_src = s2 = new MyList<Parallel::gridseg>; // first overlapping pair starts both output lists
+          *out_dst = d2 = new MyList<Parallel::gridseg>;
+          s2->data = new Parallel::gridseg;
+          d2->data = new Parallel::gridseg;
+        }
+        else
+        {
+          s2->next = new MyList<Parallel::gridseg>; // later pairs are appended to both lists in lock-step
+          s2 = s2->next;
+          d2->next = new MyList<Parallel::gridseg>;
+          d2 = d2->next;
+          s2->data = new Parallel::gridseg;
+          d2->data = new Parallel::gridseg;
+        }
+
+        for (int i = 0; i < dim; i++)
+        {
+          double SH = sd->Bg->getdX(i), DH = dd->Bg->getdX(i);
+          s2->data->llb[i] = d2->data->llb[i] = llb[i]; // both entries start from the common overlap box
+          s2->data->uub[i] = d2->data->uub[i] = uub[i];
+// using the float method to count points, we do not need the following consideration (2012 nov 17)
+#if 1
+
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+          // old code distinguishes vertex and cell
+          //		   if     (int(2*(sd->uub[i]-uub[i])/SH+0.4)%2 == 1) s2->data->uub[i] = uub[i]-SH/2;
+          //		   else if(int(2*(dd->uub[i]-uub[i])/DH+0.4)%2 == 1) d2->data->uub[i] = uub[i]-DH/2;
+          //	           if     (int(2*(llb[i]-sd->llb[i])/SH+0.4)%2 == 1) s2->data->llb[i] = llb[i]+SH/2;
+          //		   else if(int(2*(llb[i]-dd->llb[i])/DH+0.4)%2 == 1) d2->data->llb[i] = llb[i]+DH/2;
+          // new code: here we are much more concerned about missing points, because the overlapping domain has been guaranteed above
+          if (int(2 * (sd->uub[i] - uub[i]) / SH + 0.4) % 2 == 1) // overlap edge is an odd number of half source spacings from the source edge
+            s2->data->uub[i] = uub[i] + SH / 2; // widen the source entry by half a source spacing
+          else if (int(2 * (dd->uub[i] - uub[i]) / DH + 0.4) % 2 == 1)
+            d2->data->uub[i] = uub[i] + DH / 2; // otherwise widen the destination entry if it is the misaligned one
+          if (int(2 * (llb[i] - sd->llb[i]) / SH + 0.4) % 2 == 1)
+            s2->data->llb[i] = llb[i] - SH / 2;
+          else if (int(2 * (llb[i] - dd->llb[i]) / DH + 0.4) % 2 == 1)
+            d2->data->llb[i] = llb[i] - DH / 2;
+          s2->data->shape[i] = int((s2->data->uub[i] - s2->data->llb[i]) / SH + 0.4) + 1; // each entry is counted with its own block's spacing
+          d2->data->shape[i] = int((d2->data->uub[i] - d2->data->llb[i]) / DH + 0.4) + 1;
+#else
+#ifdef Cell
+          if (int(2 * (sd->uub[i] - uub[i]) / SH + 0.4) % 2 == 1) // overlap edge is an odd number of half source spacings from the source edge
+            s2->data->uub[i] = uub[i] + SH / 2; // widen the source entry by half a source spacing
+          else if (int(2 * (dd->uub[i] - uub[i]) / DH + 0.4) % 2 == 1)
+            d2->data->uub[i] = uub[i] + DH / 2; // otherwise widen the destination entry if it is the misaligned one
+          if (int(2 * (llb[i] - sd->llb[i]) / SH + 0.4) % 2 == 1)
+            s2->data->llb[i] = llb[i] - SH / 2;
+          else if (int(2 * (llb[i] - dd->llb[i]) / DH + 0.4) % 2 == 1)
+            d2->data->llb[i] = llb[i] - DH / 2;
+          s2->data->shape[i] = int((s2->data->uub[i] - s2->data->llb[i]) / SH + 0.4); // each entry is counted with its own block's spacing
+          d2->data->shape[i] = int((d2->data->uub[i] - d2->data->llb[i]) / DH + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+
+#endif
+          s2->data->illb[i] = sd->illb[i]; // illb / iuub are carried over unchanged from the input segments
+          d2->data->illb[i] = dd->illb[i];
+          s2->data->iuub[i] = sd->iuub[i];
+          d2->data->iuub[i] = dd->iuub[i];
+        }
+        s2->data->Bg = sd->Bg;
+        s2->next = 0; // keep both output lists terminated after every append
+        d2->data->Bg = dd->Bg;
+        d2->next = 0;
+      }
+      d = d->next;
+    }
+    s = s->next;
+  }
+}
+//   PACK: prepare target data in 'data'
+// UNPACK: copy target data from 'data' to corresponding numerical grids
+int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<Parallel::gridseg> *dst, int rank_in, int dir,
+                          MyList<var> *VarLists /* source */, MyList<var> *VarListd /* target */, int Symmetry)
+{
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+  int DIM = dim;
+
+  if (dir != PACK && dir != UNPACK)
+  {
+    cout << "error dir " << dir << " for data_packer " << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+
+  int size_out = 0;
+
+  if (!src || !dst)
+    return size_out;
+
   MyList<var> *varls, *varld;
 
   const int state_count = cuda_state_var_count(VarLists, VarListd);
@@ -4005,15 +4207,15 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
     cout << "error in short data packer, var lists does not match." << endl;
     MPI_Abort(MPI_COMM_WORLD, 1);
   }
-
-  int type; /* 1 copy, 2 restrict, 3 prolong */
-  if (src->data->Bg->lev == dst->data->Bg->lev)
-    type = 1;
-  else if (src->data->Bg->lev > dst->data->Bg->lev)
-    type = 2;
-  else
-    type = 3;
-
+
+  int type; /* 1 copy, 2 restrict, 3 prolong */
+  if (src->data->Bg->lev == dst->data->Bg->lev)
+    type = 1;
+  else if (src->data->Bg->lev > dst->data->Bg->lev)
+    type = 2;
+  else
+    type = 3;
+
   while (src && dst)
   {
     if ((dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) ||
@@ -4031,7 +4233,11 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
           if (dir == PACK && cuda_state_count_direct_supported(state_count) &&
               cuda_can_direct_pack(src->data, dst->data, type))
           {
-            handled_by_cuda = cuda_direct_pack_segment(data + size_out, src->data, dst->data, state_count);
+            if (s_cuda_aware_pack_active) {
+              handled_by_cuda = cuda_direct_pack_segment_to_device(data + size_out, src->data, dst->data, state_count);
+            } else {
+              handled_by_cuda = cuda_direct_pack_segment(data + size_out, src->data, dst->data, state_count);
+            }
             if (!handled_by_cuda)
             {
               cout << "Parallel::data_packer: CUDA direct pack failed." << endl;
@@ -4041,7 +4247,11 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
           else if (dir == UNPACK && cuda_state_count_direct_supported(state_count) &&
                    cuda_can_direct_unpack(dst->data, type))
           {
-            handled_by_cuda = cuda_direct_unpack_segment(data + size_out, dst->data, state_count);
+            if (s_cuda_aware_pack_active) {
+              handled_by_cuda = cuda_direct_unpack_segment_from_device(data + size_out, dst->data, state_count);
+            } else {
+              handled_by_cuda = cuda_direct_unpack_segment(data + size_out, dst->data, state_count);
+            }
             if (!handled_by_cuda)
             {
               cout << "Parallel::data_packer: CUDA direct unpack failed." << endl;
@@ -4050,26 +4260,34 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
           }
           if (!handled_by_cuda)
           {
+#if USE_CUDA_BSSN || USE_CUDA_Z4C
+            if (s_cuda_aware_pack_active)
+            {
+              fprintf(stderr, "Parallel::data_packer: type %d not supported in CUDA-aware MPI mode. "
+                              "This path requires host buffers but a device buffer was provided.\n", type);
+              MPI_Abort(MPI_COMM_WORLD, 1);
+            }
+#endif
 #endif
           if (dir == PACK)
             switch (type)
             {
               // attention must be paied to the difference between src's llb,uub and dst's llb,uub
             case 1:
-              f_copy(DIM, dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
-                     src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
-                     dst->data->llb, dst->data->uub);
-              break;
-            case 2:
-              f_restrict3(DIM, dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
-                          src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
-                          dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
-              break;
-            case 3:
-              f_prolong3(DIM, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
-                         dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
-                         dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
-            }
+              f_copy(DIM, dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
+                     src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
+                     dst->data->llb, dst->data->uub);
+              break;
+            case 2:
+              f_restrict3(DIM, dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
+                          src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
+                          dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
+              break;
+            case 3:
+              f_prolong3(DIM, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
+                         dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
+                         dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
+            }
           if (dir == UNPACK) // from target data to corresponding grid
             f_copy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn],
                    dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
@@ -4096,563 +4314,583 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
     }
     dst = dst->next;
     src = src->next;
-  }
-
-  return size_out;
-}
-int Parallel::data_packermix(double *data, MyList<Parallel::gridseg> *src, MyList<Parallel::gridseg> *dst, int rank_in, int dir,
-                             MyList<var> *VarLists /* source */, MyList<var> *VarListd /* target */, int Symmetry)
-{
-  int myrank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
-  int DIM = dim;
-
-  if (dir != PACK && dir != UNPACK)
-  {
-    cout << "Parallel::data_packermix: error dir " << dir << " for data_packermix." << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-
-  int size_out = 0;
-
-  if (!src || !dst)
-    return size_out;
-
-  MyList<var> *varls, *varld;
-
-  varls = VarLists;
-  varld = VarListd;
-  while (varls && varld)
-  {
-    varls = varls->next;
-    varld = varld->next;
-  }
-
-  if (varls || varld)
-  {
-    cout << "error in short data packer, var lists does not match." << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-
-  int type; /* 1 copy, 2 restrict, 3 prolong */
-  if (src->data->Bg->lev == dst->data->Bg->lev)
-    type = 1;
-  else if (src->data->Bg->lev > dst->data->Bg->lev)
-    type = 2;
-  else
-    type = 3;
-
-  if (type != 3)
-  {
-    cout << "Parallel::data_packermix: error type " << type << " for data_packermix." << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-
-  while (src && dst)
-  {
-    if ((dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) ||
-        (dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank))
-    {
-      varls = VarLists;
-      varld = VarListd;
-      while (varls && varld)
-      {
-        if (data)
-        {
-          if (dir == PACK)
-            f_prolongcopy3(DIM, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
-                           dst->data->llb, dst->data->uub, src->data->shape, data + size_out,
-                           src->data->llb, src->data->uub, varls->data->SoA, Symmetry);
-          if (dir == UNPACK) // from target data to corresponding grid
-            f_prolongmix3(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn],
-                          src->data->llb, src->data->uub, src->data->shape, data + size_out,
-                          dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry, dst->data->illb, dst->data->iuub);
-        }
-        // the symmetry problem should be dealt in prolongcopy3,
-        // so we always have ghost_width for both sides
-        size_out += (src->data->shape[0] + 2 * ghost_width) * (src->data->shape[1] + 2 * ghost_width) * (src->data->shape[2] + 2 * ghost_width);
-        varls = varls->next;
-        varld = varld->next;
-      }
-    }
-    dst = dst->next;
-    src = src->next;
-  }
-
-  return size_out;
-}
-//
-void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
-                        MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
-                        int Symmetry)
-{
-  int myrank, cpusize;
-  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
-  int node;
-
-  MPI_Request *reqs = new MPI_Request[2 * cpusize];
-  MPI_Status *stats = new MPI_Status[2 * cpusize];
-  int *req_node = new int[2 * cpusize];
-  int *req_is_recv = new int[2 * cpusize];
-  int *completed = new int[2 * cpusize];
-  int req_no = 0;
-  int pending_recv = 0;
-
-  double **send_data = new double *[cpusize];
-  double **rec_data = new double *[cpusize];
-  int *send_lengths = new int[cpusize];
-  int *recv_lengths = new int[cpusize];
-
-  for (node = 0; node < cpusize; node++)
-  {
-    send_data[node] = rec_data[node] = 0;
-    send_lengths[node] = recv_lengths[node] = 0;
-  }
-
-  // Post receives first so peers can progress rendezvous early.
-  for (node = 0; node < cpusize; node++)
-  {
-    if (node == myrank) continue;
-
-    recv_lengths[node] = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
-    if (recv_lengths[node] > 0)
-    {
-      rec_data[node] = new double[recv_lengths[node]];
-      if (!rec_data[node])
-      {
-        cout << "out of memory when new in short transfer, place 1" << endl;
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-      MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
-      req_node[req_no] = node;
-      req_is_recv[req_no] = 1;
-      req_no++;
-      pending_recv++;
-    }
-  }
-
-  // Local transfer on this rank.
-  recv_lengths[myrank] = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
-  if (recv_lengths[myrank] > 0)
-  {
-    rec_data[myrank] = new double[recv_lengths[myrank]];
-    if (!rec_data[myrank])
-    {
-      cout << "out of memory when new in short transfer, place 2" << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-    data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
-  }
-
-  // Pack and post sends.
-  for (node = 0; node < cpusize; node++)
-  {
-    if (node == myrank) continue;
-
-    send_lengths[node] = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
-    if (send_lengths[node] > 0)
-    {
-      send_data[node] = new double[send_lengths[node]];
-      if (!send_data[node])
-      {
-        cout << "out of memory when new in short transfer, place 3" << endl;
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-      data_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
-      MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
-      req_node[req_no] = node;
-      req_is_recv[req_no] = 0;
-      req_no++;
-    }
-  }
-
-  // Unpack as soon as receive completes to reduce pure wait time.
-  while (pending_recv > 0)
-  {
-    int outcount = 0;
-    MPI_Waitsome(req_no, reqs, &outcount, completed, stats);
-    if (outcount == MPI_UNDEFINED) break;
-
-    for (int i = 0; i < outcount; i++)
-    {
-      int idx = completed[i];
-      if (idx >= 0 && req_is_recv[idx])
-      {
-        int recv_node = req_node[idx];
-        data_packer(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry);
-        pending_recv--;
-      }
-    }
-  }
-
-  if (req_no > 0) MPI_Waitall(req_no, reqs, stats);
-
-  if (rec_data[myrank])
-    data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
-
-  for (node = 0; node < cpusize; node++)
-  {
-    if (send_data[node])
-      delete[] send_data[node];
-    if (rec_data[node])
-      delete[] rec_data[node];
-  }
-
-  delete[] reqs;
-  delete[] stats;
-  delete[] req_node;
-  delete[] req_is_recv;
-  delete[] completed;
-  delete[] send_data;
-  delete[] rec_data;
-  delete[] send_lengths;
-  delete[] recv_lengths;
-}
-//
-void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
-                           MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
-                           int Symmetry)
-{
-  int myrank, cpusize;
-  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
-  int node;
-
-  MPI_Request *reqs = new MPI_Request[2 * cpusize];
-  MPI_Status *stats = new MPI_Status[2 * cpusize];
-  int *req_node = new int[2 * cpusize];
-  int *req_is_recv = new int[2 * cpusize];
-  int *completed = new int[2 * cpusize];
-  int req_no = 0;
-  int pending_recv = 0;
-
-  double **send_data = new double *[cpusize];
-  double **rec_data = new double *[cpusize];
-  int *send_lengths = new int[cpusize];
-  int *recv_lengths = new int[cpusize];
-
-  for (node = 0; node < cpusize; node++)
-  {
-    send_data[node] = rec_data[node] = 0;
-    send_lengths[node] = recv_lengths[node] = 0;
-  }
-
-  // Post receives first so peers can progress rendezvous early.
-  for (node = 0; node < cpusize; node++)
-  {
-    if (node == myrank) continue;
-
-    recv_lengths[node] = data_packermix(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
-    if (recv_lengths[node] > 0)
-    {
-      rec_data[node] = new double[recv_lengths[node]];
-      if (!rec_data[node])
-      {
-        cout << "out of memory when new in short transfer, place 1" << endl;
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-      MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
-      req_node[req_no] = node;
-      req_is_recv[req_no] = 1;
-      req_no++;
-      pending_recv++;
-    }
-  }
-
-  // Local transfer on this rank.
-  recv_lengths[myrank] = data_packermix(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
-  if (recv_lengths[myrank] > 0)
-  {
-    rec_data[myrank] = new double[recv_lengths[myrank]];
-    if (!rec_data[myrank])
-    {
-      cout << "out of memory when new in short transfer, place 2" << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-    data_packermix(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
-  }
-
-  // Pack and post sends.
-  for (node = 0; node < cpusize; node++)
-  {
-    if (node == myrank) continue;
-
-    send_lengths[node] = data_packermix(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
-    if (send_lengths[node] > 0)
-    {
-      send_data[node] = new double[send_lengths[node]];
-      if (!send_data[node])
-      {
-        cout << "out of memory when new in short transfer, place 3" << endl;
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-      data_packermix(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
-      MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
-      req_node[req_no] = node;
-      req_is_recv[req_no] = 0;
-      req_no++;
-    }
-  }
-
-  // Unpack as soon as receive completes to reduce pure wait time.
-  while (pending_recv > 0)
-  {
-    int outcount = 0;
-    MPI_Waitsome(req_no, reqs, &outcount, completed, stats);
-    if (outcount == MPI_UNDEFINED) break;
-
-    for (int i = 0; i < outcount; i++)
-    {
-      int idx = completed[i];
-      if (idx >= 0 && req_is_recv[idx])
-      {
-        int recv_node = req_node[idx];
-        data_packermix(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry);
-        pending_recv--;
-      }
-    }
-  }
-
-  if (req_no > 0) MPI_Waitall(req_no, reqs, stats);
-
-  if (rec_data[myrank])
-    data_packermix(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
-
-  for (node = 0; node < cpusize; node++)
-  {
-    if (send_data[node])
-      delete[] send_data[node];
-    if (rec_data[node])
-      delete[] rec_data[node];
-  }
-
-  delete[] reqs;
-  delete[] stats;
-  delete[] req_node;
-  delete[] req_is_recv;
-  delete[] completed;
-  delete[] send_data;
-  delete[] rec_data;
-  delete[] send_lengths;
-  delete[] recv_lengths;
-}
-void Parallel::Sync(Patch *Pat, MyList<var> *VarList, int Symmetry)
-{
-  int cpusize;
-  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-
-  MyList<Parallel::gridseg> *dst;
-  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
-  src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
-
-  dst = build_ghost_gsl(Pat); // ghost region only
-  for (int node = 0; node < cpusize; node++)
-  {
-    src[node] = build_owned_gsl0(Pat, node);                              // for the part without ghost points and do not extend
-    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer_src[node], data locate on cpu#node;
-                                                                          // but for transfer_dst[node] the data may locate on any node
-  }
-
-  transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
-
-  if (dst)
-    dst->destroyList();
-  for (int node = 0; node < cpusize; node++)
-  {
-    if (src[node])
-      src[node]->destroyList();
-    if (transfer_src[node])
-      transfer_src[node]->destroyList();
-    if (transfer_dst[node])
-      transfer_dst[node]->destroyList();
-  }
-
-  delete[] src;
-  delete[] transfer_src;
-  delete[] transfer_dst;
-}
-void Parallel::Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
-{
-  // Patch inner Synch
-  MyList<Patch> *Pp = PatL;
-  while (Pp)
-  {
-    Sync(Pp->data, VarList, Symmetry);
-    Pp = Pp->next;
-  }
-
-  // Patch inter Synch
-  int cpusize;
-  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-
-  MyList<Parallel::gridseg> *dst;
-  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
-  src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
-
-  dst = build_buffer_gsl(PatL); // buffer region only
-  for (int node = 0; node < cpusize; node++)
-  {
-    src[node] = build_owned_gsl(PatL, node, 5, Symmetry);                 // for the part without ghost nor buffer points and do not extend
-    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
-  }
-
-  transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
-
-  if (dst)
-    dst->destroyList();
-  for (int node = 0; node < cpusize; node++)
-  {
-    if (src[node])
-      src[node]->destroyList();
-    if (transfer_src[node])
-      transfer_src[node]->destroyList();
-    if (transfer_dst[node])
-      transfer_dst[node]->destroyList();
-  }
-
-  delete[] src;
-  delete[] transfer_src;
-  delete[] transfer_dst;
-}
-// Merged Sync: collect all intra-patch and inter-patch grid segment lists,
-// then issue a single transfer() call instead of N+1 separate ones.
-void Parallel::Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
-{
-  int cpusize;
-  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-
-  MyList<Parallel::gridseg> **combined_src = new MyList<Parallel::gridseg> *[cpusize];
-  MyList<Parallel::gridseg> **combined_dst = new MyList<Parallel::gridseg> *[cpusize];
-  for (int node = 0; node < cpusize; node++)
-    combined_src[node] = combined_dst[node] = 0;
-
-  // Phase A: Intra-patch ghost exchange segments
-  MyList<Patch> *Pp = PatL;
-  while (Pp)
-  {
-    Patch *Pat = Pp->data;
-    MyList<Parallel::gridseg> *dst_ghost = build_ghost_gsl(Pat);
-
-    for (int node = 0; node < cpusize; node++)
-    {
-      MyList<Parallel::gridseg> *src_owned = build_owned_gsl0(Pat, node);
-      MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
-      build_gstl(src_owned, dst_ghost, &tsrc, &tdst);
-
-      if (tsrc)
-      {
-        if (combined_src[node])
-          combined_src[node]->catList(tsrc);
-        else
-          combined_src[node] = tsrc;
-      }
-      if (tdst)
-      {
-        if (combined_dst[node])
-          combined_dst[node]->catList(tdst);
-        else
-          combined_dst[node] = tdst;
-      }
-
-      if (src_owned)
-        src_owned->destroyList();
-    }
-
-    if (dst_ghost)
-      dst_ghost->destroyList();
-
-    Pp = Pp->next;
-  }
-
-  // Phase B: Inter-patch buffer exchange segments
-  MyList<Parallel::gridseg> *dst_buffer = build_buffer_gsl(PatL);
-  for (int node = 0; node < cpusize; node++)
-  {
-    MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatL, node, 5, Symmetry);
-    MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
-    build_gstl(src_owned, dst_buffer, &tsrc, &tdst);
-
-    if (tsrc)
-    {
-      if (combined_src[node])
-        combined_src[node]->catList(tsrc);
-      else
-        combined_src[node] = tsrc;
-    }
-    if (tdst)
-    {
-      if (combined_dst[node])
-        combined_dst[node]->catList(tdst);
-      else
-        combined_dst[node] = tdst;
-    }
-
-    if (src_owned)
-      src_owned->destroyList();
-  }
-  if (dst_buffer)
-    dst_buffer->destroyList();
-
-  // Phase C: Single transfer
-  transfer(combined_src, combined_dst, VarList, VarList, Symmetry);
-
-  // Phase D: Cleanup
-  for (int node = 0; node < cpusize; node++)
-  {
-    if (combined_src[node])
-      combined_src[node]->destroyList();
-    if (combined_dst[node])
-      combined_dst[node]->destroyList();
-  }
-  delete[] combined_src;
-  delete[] combined_dst;
-}
-// SyncCache constructor
+  }
+
+  return size_out;
+}
+int Parallel::data_packermix(double *data, MyList<Parallel::gridseg> *src, MyList<Parallel::gridseg> *dst, int rank_in, int dir,
+                             MyList<var> *VarLists /* source */, MyList<var> *VarListd /* target */, int Symmetry)
+{
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+  int DIM = dim;
+
+  if (dir != PACK && dir != UNPACK)
+  {
+    cout << "Parallel::data_packermix: error dir " << dir << " for data_packermix." << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+
+  int size_out = 0;
+
+  if (!src || !dst)
+    return size_out;
+
+  MyList<var> *varls, *varld;
+
+  varls = VarLists;
+  varld = VarListd;
+  while (varls && varld)
+  {
+    varls = varls->next;
+    varld = varld->next;
+  }
+
+  if (varls || varld)
+  {
+    cout << "error in short data packer, var lists does not match." << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+
+  int type; /* 1 copy, 2 restrict, 3 prolong */
+  if (src->data->Bg->lev == dst->data->Bg->lev)
+    type = 1;
+  else if (src->data->Bg->lev > dst->data->Bg->lev)
+    type = 2;
+  else
+    type = 3;
+
+  if (type != 3)
+  {
+    cout << "Parallel::data_packermix: error type " << type << " for data_packermix." << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+
+  while (src && dst)
+  {
+    if ((dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) ||
+        (dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank))
+    {
+      varls = VarLists;
+      varld = VarListd;
+      while (varls && varld)
+      {
+        if (data)
+        {
+          if (dir == PACK)
+            f_prolongcopy3(DIM, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
+                           dst->data->llb, dst->data->uub, src->data->shape, data + size_out,
+                           src->data->llb, src->data->uub, varls->data->SoA, Symmetry);
+          if (dir == UNPACK) // from target data to corresponding grid
+            f_prolongmix3(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn],
+                          src->data->llb, src->data->uub, src->data->shape, data + size_out,
+                          dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry, dst->data->illb, dst->data->iuub);
+        }
+        // the symmetry problem should be dealt in prolongcopy3,
+        // so we always have ghost_width for both sides
+        size_out += (src->data->shape[0] + 2 * ghost_width) * (src->data->shape[1] + 2 * ghost_width) * (src->data->shape[2] + 2 * ghost_width);
+        varls = varls->next;
+        varld = varld->next;
+      }
+    }
+    dst = dst->next;
+    src = src->next;
+  }
+
+  return size_out;
+}
+//
+void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
+                        MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
+                        int Symmetry)
+{
+  int myrank, cpusize;
+  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+  int node;
+
+  MPI_Request *reqs = new MPI_Request[2 * cpusize];
+  MPI_Status *stats = new MPI_Status[2 * cpusize];
+  int *req_node = new int[2 * cpusize];
+  int *req_is_recv = new int[2 * cpusize];
+  int *completed = new int[2 * cpusize];
+  int req_no = 0;
+  int pending_recv = 0;
+
+  double **send_data = new double *[cpusize];
+  double **rec_data = new double *[cpusize];
+  int *send_lengths = new int[cpusize];
+  int *recv_lengths = new int[cpusize];
+
+  for (node = 0; node < cpusize; node++)
+  {
+    send_data[node] = rec_data[node] = 0;
+    send_lengths[node] = recv_lengths[node] = 0;
+  }
+
+  // Post receives first so peers can progress rendezvous early.
+  for (node = 0; node < cpusize; node++)
+  {
+    if (node == myrank) continue;
+
+    recv_lengths[node] = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
+    if (recv_lengths[node] > 0)
+    {
+      rec_data[node] = new double[recv_lengths[node]];
+      if (!rec_data[node])
+      {
+        cout << "out of memory when new in short transfer, place 1" << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+      MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
+      req_node[req_no] = node;
+      req_is_recv[req_no] = 1;
+      req_no++;
+      pending_recv++;
+    }
+  }
+
+  // Local transfer on this rank.
+  recv_lengths[myrank] = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
+  if (recv_lengths[myrank] > 0)
+  {
+    rec_data[myrank] = new double[recv_lengths[myrank]];
+    if (!rec_data[myrank])
+    {
+      cout << "out of memory when new in short transfer, place 2" << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+    data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
+  }
+
+  // Pack and post sends.
+  for (node = 0; node < cpusize; node++)
+  {
+    if (node == myrank) continue;
+
+    send_lengths[node] = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
+    if (send_lengths[node] > 0)
+    {
+      send_data[node] = new double[send_lengths[node]];
+      if (!send_data[node])
+      {
+        cout << "out of memory when new in short transfer, place 3" << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+      data_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
+      MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
+      req_node[req_no] = node;
+      req_is_recv[req_no] = 0;
+      req_no++;
+    }
+  }
+
+  // Unpack as soon as receive completes to reduce pure wait time.
+  while (pending_recv > 0)
+  {
+    int outcount = 0;
+    MPI_Waitsome(req_no, reqs, &outcount, completed, stats);
+    if (outcount == MPI_UNDEFINED) break;
+
+    for (int i = 0; i < outcount; i++)
+    {
+      int idx = completed[i];
+      if (idx >= 0 && req_is_recv[idx])
+      {
+        int recv_node = req_node[idx];
+        data_packer(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry);
+        pending_recv--;
+      }
+    }
+  }
+
+  if (req_no > 0) MPI_Waitall(req_no, reqs, stats);
+
+  if (rec_data[myrank])
+    data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
+
+  for (node = 0; node < cpusize; node++)
+  {
+    if (send_data[node])
+      delete[] send_data[node];
+    if (rec_data[node])
+      delete[] rec_data[node];
+  }
+
+  delete[] reqs;
+  delete[] stats;
+  delete[] req_node;
+  delete[] req_is_recv;
+  delete[] completed;
+  delete[] send_data;
+  delete[] rec_data;
+  delete[] send_lengths;
+  delete[] recv_lengths;
+}
+//
+void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
+                           MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
+                           int Symmetry)
+{
+  int myrank, cpusize;
+  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+  int node;
+
+  MPI_Request *reqs = new MPI_Request[2 * cpusize];
+  MPI_Status *stats = new MPI_Status[2 * cpusize];
+  int *req_node = new int[2 * cpusize];
+  int *req_is_recv = new int[2 * cpusize];
+  int *completed = new int[2 * cpusize];
+  int req_no = 0;
+  int pending_recv = 0;
+
+  double **send_data = new double *[cpusize];
+  double **rec_data = new double *[cpusize];
+  int *send_lengths = new int[cpusize];
+  int *recv_lengths = new int[cpusize];
+
+  for (node = 0; node < cpusize; node++)
+  {
+    send_data[node] = rec_data[node] = 0;
+    send_lengths[node] = recv_lengths[node] = 0;
+  }
+
+  // Post receives first so peers can progress rendezvous early.
+  for (node = 0; node < cpusize; node++)
+  {
+    if (node == myrank) continue;
+
+    recv_lengths[node] = data_packermix(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
+    if (recv_lengths[node] > 0)
+    {
+      rec_data[node] = new double[recv_lengths[node]];
+      if (!rec_data[node])
+      {
+        cout << "out of memory when new in short transfer, place 1" << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+      MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
+      req_node[req_no] = node;
+      req_is_recv[req_no] = 1;
+      req_no++;
+      pending_recv++;
+    }
+  }
+
+  // Local transfer on this rank.
+  recv_lengths[myrank] = data_packermix(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
+  if (recv_lengths[myrank] > 0)
+  {
+    rec_data[myrank] = new double[recv_lengths[myrank]];
+    if (!rec_data[myrank])
+    {
+      cout << "out of memory when new in short transfer, place 2" << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+    data_packermix(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
+  }
+
+  // Pack and post sends.
+  for (node = 0; node < cpusize; node++)
+  {
+    if (node == myrank) continue;
+
+    send_lengths[node] = data_packermix(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
+    if (send_lengths[node] > 0)
+    {
+      send_data[node] = new double[send_lengths[node]];
+      if (!send_data[node])
+      {
+        cout << "out of memory when new in short transfer, place 3" << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+      data_packermix(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
+      MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
+      req_node[req_no] = node;
+      req_is_recv[req_no] = 0;
+      req_no++;
+    }
+  }
+
+  // Unpack as soon as receive completes to reduce pure wait time.
+  while (pending_recv > 0)
+  {
+    int outcount = 0;
+    MPI_Waitsome(req_no, reqs, &outcount, completed, stats);
+    if (outcount == MPI_UNDEFINED) break;
+
+    for (int i = 0; i < outcount; i++)
+    {
+      int idx = completed[i];
+      if (idx >= 0 && req_is_recv[idx])
+      {
+        int recv_node = req_node[idx];
+        data_packermix(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry);
+        pending_recv--;
+      }
+    }
+  }
+
+  if (req_no > 0) MPI_Waitall(req_no, reqs, stats);
+
+  if (rec_data[myrank])
+    data_packermix(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
+
+  for (node = 0; node < cpusize; node++)
+  {
+    if (send_data[node])
+      delete[] send_data[node];
+    if (rec_data[node])
+      delete[] rec_data[node];
+  }
+
+  delete[] reqs;
+  delete[] stats;
+  delete[] req_node;
+  delete[] req_is_recv;
+  delete[] completed;
+  delete[] send_data;
+  delete[] rec_data;
+  delete[] send_lengths;
+  delete[] recv_lengths;
+}
+void Parallel::Sync(Patch *Pat, MyList<var> *VarList, int Symmetry)
+{
+  int cpusize;
+  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
+
+  MyList<Parallel::gridseg> *dst;
+  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
+  src = new MyList<Parallel::gridseg> *[cpusize];
+  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
+  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
+
+  dst = build_ghost_gsl(Pat); // ghost region only
+  for (int node = 0; node < cpusize; node++)
+  {
+    src[node] = build_owned_gsl0(Pat, node);                              // for the part without ghost points and do not extend
+    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer_src[node], data locate on cpu#node;
+                                                                          // but for transfer_dst[node] the data may locate on any node
+  }
+
+  transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
+
+  if (dst)
+    dst->destroyList();
+  for (int node = 0; node < cpusize; node++)
+  {
+    if (src[node])
+      src[node]->destroyList();
+    if (transfer_src[node])
+      transfer_src[node]->destroyList();
+    if (transfer_dst[node])
+      transfer_dst[node]->destroyList();
+  }
+
+  delete[] src;
+  delete[] transfer_src;
+  delete[] transfer_dst;
+}
+void Parallel::Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
+{
+  // Patch inner Synch
+  MyList<Patch> *Pp = PatL;
+  while (Pp)
+  {
+    Sync(Pp->data, VarList, Symmetry);
+    Pp = Pp->next;
+  }
+
+  // Patch inter Synch
+  int cpusize;
+  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
+
+  MyList<Parallel::gridseg> *dst;
+  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
+  src = new MyList<Parallel::gridseg> *[cpusize];
+  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
+  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
+
+  dst = build_buffer_gsl(PatL); // buffer region only
+  for (int node = 0; node < cpusize; node++)
+  {
+    src[node] = build_owned_gsl(PatL, node, 5, Symmetry);                 // for the part without ghost nor buffer points and do not extend
+    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
+  }
+
+  transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
+
+  if (dst)
+    dst->destroyList();
+  for (int node = 0; node < cpusize; node++)
+  {
+    if (src[node])
+      src[node]->destroyList();
+    if (transfer_src[node])
+      transfer_src[node]->destroyList();
+    if (transfer_dst[node])
+      transfer_dst[node]->destroyList();
+  }
+
+  delete[] src;
+  delete[] transfer_src;
+  delete[] transfer_dst;
+}
+// Sync_merged: single-transfer variant of Sync. Intra-patch and inter-patch
+// segment lists are gathered per rank, then sent with one transfer() call.
+void Parallel::Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
+{
+  int nprocs;
+  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+  MyList<Parallel::gridseg> **all_src = new MyList<Parallel::gridseg> *[nprocs];
+  MyList<Parallel::gridseg> **all_dst = new MyList<Parallel::gridseg> *[nprocs];
+  for (int peer = 0; peer < nprocs; peer++)
+  {
+    all_src[peer] = 0;
+    all_dst[peer] = 0;
+  }
+
+  // Step 1: intra-patch ghost exchange segments, one patch at a time.
+  for (MyList<Patch> *cur = PatL; cur; cur = cur->next)
+  {
+    MyList<Parallel::gridseg> *ghost = build_ghost_gsl(cur->data);
+
+    for (int peer = 0; peer < nprocs; peer++)
+    {
+      MyList<Parallel::gridseg> *owned = build_owned_gsl0(cur->data, peer);
+      MyList<Parallel::gridseg> *piece_src = 0, *piece_dst = 0;
+      build_gstl(owned, ghost, &piece_src, &piece_dst);
+
+      // Append the new pieces to this rank's lists, or start the lists.
+      if (piece_src)
+      {
+        if (!all_src[peer])
+          all_src[peer] = piece_src;
+        else
+          all_src[peer]->catList(piece_src);
+      }
+      if (piece_dst)
+      {
+        if (!all_dst[peer])
+          all_dst[peer] = piece_dst;
+        else
+          all_dst[peer]->catList(piece_dst);
+      }
+
+      if (owned)
+        owned->destroyList();
+    }
+
+    if (ghost)
+      ghost->destroyList();
+  }
+
+  // Step 2: inter-patch buffer exchange segments over the whole patch list.
+  MyList<Parallel::gridseg> *buffer = build_buffer_gsl(PatL);
+  for (int peer = 0; peer < nprocs; peer++)
+  {
+    MyList<Parallel::gridseg> *owned = build_owned_gsl(PatL, peer, 5, Symmetry);
+    MyList<Parallel::gridseg> *piece_src = 0, *piece_dst = 0;
+    build_gstl(owned, buffer, &piece_src, &piece_dst);
+
+    if (piece_src)
+    {
+      if (!all_src[peer])
+        all_src[peer] = piece_src;
+      else
+        all_src[peer]->catList(piece_src);
+    }
+    if (piece_dst)
+    {
+      if (!all_dst[peer])
+        all_dst[peer] = piece_dst;
+      else
+        all_dst[peer]->catList(piece_dst);
+    }
+
+    if (owned)
+      owned->destroyList();
+  }
+  if (buffer)
+    buffer->destroyList();
+
+  // Step 3: one transfer for everything collected above.
+  transfer(all_src, all_dst, VarList, VarList, Symmetry);
+
+  // Step 4: release the accumulated lists and the per-rank arrays.
+  for (int peer = 0; peer < nprocs; peer++)
+  {
+    if (all_src[peer])
+      all_src[peer]->destroyList();
+    if (all_dst[peer])
+      all_dst[peer]->destroyList();
+  }
+  delete[] all_src;
+  delete[] all_dst;
+}
+// SyncCache constructor: start as an invalid, empty cache (all pointers null, counters zero)
 Parallel::SyncCache::SyncCache()
     : valid(false), cpusize(0), combined_src(0), combined_dst(0),
       send_lengths(0), recv_lengths(0), send_bufs(0), recv_bufs(0),
       send_buf_caps(0), recv_buf_caps(0), send_buf_pinned(0), recv_buf_pinned(0),
+      send_buf_is_dev(0), recv_buf_is_dev(0),     // per-rank flags: use the device buffer for this peer
+      send_buf_caps_dev(0), recv_buf_caps_dev(0), // per-rank capacities of the device buffers
+      send_bufs_dev(0), recv_bufs_dev(0),         // per-rank device communication buffers
       reqs(0), stats(0), max_reqs(0),
-      lengths_valid(false), tc_req_node(0), tc_req_is_recv(0), tc_completed(0)
+      lengths_valid(false), tc_req_node(0), tc_req_is_recv(0), tc_completed(0),
+      cuda_aware_mode(false) // set by transfer_cached / Sync_start when any device buffer is selected
 {
 }
-// SyncCache invalidate: free grid segment lists but keep buffers
-void Parallel::SyncCache::invalidate()
-{
-  if (!valid)
-    return;
-  for (int i = 0; i < cpusize; i++)
-  {
-    if (combined_src[i])
-      combined_src[i]->destroyList();
-    if (combined_dst[i])
-      combined_dst[i]->destroyList();
-    combined_src[i] = combined_dst[i] = 0;
-    send_lengths[i] = recv_lengths[i] = 0;
-  }
-  valid = false;
-  lengths_valid = false;
-}
-// SyncCache destroy: free everything
-void Parallel::SyncCache::destroy()
-{
-  invalidate();
-  if (combined_src) delete[] combined_src;
-  if (combined_dst) delete[] combined_dst;
+// SyncCache::invalidate: drop the cached grid segment lists and lengths.
+// Communication buffers are not touched here, so they stay allocated.
+void Parallel::SyncCache::invalidate()
+{
+  if (valid)
+  {
+    for (int node = 0; node < cpusize; node++)
+    {
+      if (combined_src[node]) combined_src[node]->destroyList();
+      if (combined_dst[node]) combined_dst[node]->destroyList();
+      combined_src[node] = combined_dst[node] = 0;
+      send_lengths[node] = recv_lengths[node] = 0;
+    }
+    valid = false;
+    lengths_valid = false;
+  }
+}
+// SyncCache destroy: free everything
+void Parallel::SyncCache::destroy()
+{
+  invalidate();
+  if (combined_src) delete[] combined_src;
+  if (combined_dst) delete[] combined_dst;
   if (send_lengths) delete[] send_lengths;
   if (recv_lengths) delete[] recv_lengths;
   if (send_buf_caps) delete[] send_buf_caps;
   if (recv_buf_caps) delete[] recv_buf_caps;
   for (int i = 0; i < cpusize; i++)
   {
+    if (send_bufs_dev && send_bufs_dev[i])
+    {
+#if USE_CUDA_BSSN || USE_CUDA_Z4C
+      free_device_comm_buffer(send_bufs_dev[i]);
+#else
+      delete[] send_bufs_dev[i];
+#endif
+    }
+    if (recv_bufs_dev && recv_bufs_dev[i])
+    {
+#if USE_CUDA_BSSN || USE_CUDA_Z4C
+      free_device_comm_buffer(recv_bufs_dev[i]);
+#else
+      delete[] recv_bufs_dev[i];
+#endif
+    }
     if (send_bufs && send_bufs[i])
     {
 #if USE_CUDA_BSSN || USE_CUDA_Z4C
@@ -4674,6 +4912,12 @@ void Parallel::SyncCache::destroy()
   if (recv_bufs) delete[] recv_bufs;
   if (send_buf_pinned) delete[] send_buf_pinned;
   if (recv_buf_pinned) delete[] recv_buf_pinned;
+  if (send_buf_is_dev) delete[] send_buf_is_dev;
+  if (recv_buf_is_dev) delete[] recv_buf_is_dev;
+  if (send_buf_caps_dev) delete[] send_buf_caps_dev;
+  if (recv_buf_caps_dev) delete[] recv_buf_caps_dev;
+  if (send_bufs_dev) delete[] send_bufs_dev;
+  if (recv_bufs_dev) delete[] recv_bufs_dev;
   if (reqs) delete[] reqs;
   if (stats) delete[] stats;
   if (tc_req_node) delete[] tc_req_node;
@@ -4682,97 +4926,195 @@ void Parallel::SyncCache::destroy()
   combined_src = combined_dst = 0;
   send_lengths = recv_lengths = 0;
   send_buf_caps = recv_buf_caps = 0;
+  send_buf_caps_dev = recv_buf_caps_dev = 0;
   send_bufs = recv_bufs = 0;
+  send_bufs_dev = recv_bufs_dev = 0;
   send_buf_pinned = recv_buf_pinned = 0;
+  send_buf_is_dev = recv_buf_is_dev = 0;
   reqs = 0; stats = 0;
-  tc_req_node = 0; tc_req_is_recv = 0; tc_completed = 0;
-  cpusize = 0; max_reqs = 0;
-}
-// transfer_cached: reuse pre-allocated buffers from SyncCache
+  tc_req_node = 0; tc_req_is_recv = 0; tc_completed = 0;
+  cpusize = 0; max_reqs = 0;
+  cuda_aware_mode = false;
+}
+// transfer_cached: pack, exchange and unpack using the buffers and request arrays kept in SyncCache
 void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
                                MyList<var> *VarList1, MyList<var> *VarList2,
                                int Symmetry, SyncCache &cache)
-{
-  int myrank;
-  MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize);
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-  int cpusize = cache.cpusize;
-
-  int req_no = 0;
-  int pending_recv = 0;
-  int node;
-  int *req_node = cache.tc_req_node;
-  int *req_is_recv = cache.tc_req_is_recv;
-  int *completed = cache.tc_completed;
-
-  // Post receives first so peers can progress rendezvous early.
-  for (node = 0; node < cpusize; node++)
-  {
-    if (node == myrank) continue;
-
-    int rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
+{
+  int myrank;
+  MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  int cpusize = cache.cpusize;
+
+  int cuda_device_sends = 0;
+  int cuda_device_recvs = 0;
+  for (int n = 0; n < cpusize; n++)
+    cache.send_buf_is_dev[n] = cache.recv_buf_is_dev[n] = 0; // default: host buffers for every peer
+#if USE_CUDA_BSSN || USE_CUDA_Z4C
+  const int state_count = cuda_state_var_count(VarList1, VarList2);
+  if (state_count < 0)
+  {
+    cout << "Parallel::transfer_cached: variable lists do not match." << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+  if (cuda_aware_mpi_enabled())
+  {
+    for (int n = 0; n < cpusize; n++)
+    {
+      cache.send_buf_is_dev[n] = cuda_pack_to_device_eligible(src[myrank], dst[myrank], n, state_count, myrank) ? 1 : 0;
+      cache.recv_buf_is_dev[n] = cuda_recv_to_device_eligible(src[n], dst[n], n, state_count, myrank) ? 1 : 0;
+    }
+    cache.recv_buf_is_dev[myrank] = (cache.send_buf_is_dev[myrank] && cache.recv_buf_is_dev[myrank]) ? 1 : 0; // local copy uses a device buffer only if both sides are eligible
+    for (int n = 0; n < cpusize; n++)
+    {
+      cuda_device_sends += cache.send_buf_is_dev[n] ? 1 : 0;
+      cuda_device_recvs += cache.recv_buf_is_dev[n] ? 1 : 0;
+    }
+    if (cuda_mpi_diag_enabled())
+    {
+      static int diag_reported = 0; // counts diagnostic lines already printed; capped at 10 below
+      int rep = diag_reported;
+      if (myrank == 0 && rep < 10)
+      {
+        if (__sync_bool_compare_and_swap(&diag_reported, rep, rep + 1))
+          fprintf(stderr, "[AMSS-CUDA-MPI][rank %d] transfer_cached: device_sends=%d "
+                          "device_recvs=%d cuda_aware_mpi=%d\n",
+                  myrank, cuda_device_sends, cuda_device_recvs,
+                  cuda_aware_mpi_enabled() ? 1 : 0);
+      }
+    }
+  }
+  else
+  {
+    for (int n = 0; n < cpusize; n++)
+      cache.send_buf_is_dev[n] = cache.recv_buf_is_dev[n] = 0;
+  }
+  cache.cuda_aware_mode = (cuda_device_sends + cuda_device_recvs) > 0; // true when any peer uses a device buffer
+#endif
+
+  int req_no = 0;
+  int pending_recv = 0;
+  int node;
+  int *req_node = cache.tc_req_node;
+  int *req_is_recv = cache.tc_req_is_recv;
+  int *completed = cache.tc_completed;
+
+  // Post receives first so peers can progress rendezvous early.
+  for (node = 0; node < cpusize; node++)
+  {
+    if (node == myrank) continue;
+
+    int rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); // buffer 0: presumably a size-only query; the result is used as the receive length
     cache.recv_lengths[node] = rlength;
     if (rlength > 0)
     {
-      ensure_comm_buffer(cache.recv_bufs, cache.recv_buf_pinned, cache.recv_buf_caps, node, rlength);
-      MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
-      req_node[req_no] = node;
-      req_is_recv[req_no] = 1;
-      req_no++;
-      pending_recv++;
-    }
-  }
-
-  // Local transfer on this rank.
-  int self_len = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
+      if (cache.recv_buf_is_dev[node])
+      {
+        ensure_device_comm_buffer(cache.recv_bufs_dev, cache.recv_buf_caps_dev, node, rlength);
+        MPI_Irecv((void *)cache.recv_bufs_dev[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
+      }
+      else
+      {
+        ensure_comm_buffer(cache.recv_bufs, cache.recv_buf_pinned, cache.recv_buf_caps, node, rlength);
+        MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
+      }
+      req_node[req_no] = node; // remember which peer and direction this request slot belongs to
+      req_is_recv[req_no] = 1;
+      req_no++;
+      pending_recv++;
+    }
+  }
+
+  // Local transfer on this rank: packed into recv buffer [myrank] here, unpacked at the end.
+  int self_len = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
   cache.recv_lengths[myrank] = self_len;
   if (self_len > 0)
   {
-    ensure_comm_buffer(cache.recv_bufs, cache.recv_buf_pinned, cache.recv_buf_caps, myrank, self_len);
-    data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
+    if (cache.recv_buf_is_dev[myrank])
+    {
+      ensure_device_comm_buffer(cache.recv_bufs_dev, cache.recv_buf_caps_dev, myrank, self_len);
+      data_packer_with_device_buffer(cache.recv_bufs_dev[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
+    }
+    else
+    {
+      ensure_comm_buffer(cache.recv_bufs, cache.recv_buf_pinned, cache.recv_buf_caps, myrank, self_len);
+      data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
+    }
   }
-
-  // Pack and post sends.
-  for (node = 0; node < cpusize; node++)
-  {
-    if (node == myrank) continue;
-
-    int slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
+
+  // Pack sends first. Device sends are posted after a single CUDA sync.
+  for (node = 0; node < cpusize; node++)
+  {
+    if (node == myrank) continue;
+
+    int slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
     cache.send_lengths[node] = slength;
     if (slength > 0)
     {
-      ensure_comm_buffer(cache.send_bufs, cache.send_buf_pinned, cache.send_buf_caps, node, slength);
-      data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
-      MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
-      req_node[req_no] = node;
-      req_is_recv[req_no] = 0;
-      req_no++;
-    }
-  }
-
-  // Unpack as soon as receive completes to reduce pure wait time.
-  while (pending_recv > 0)
-  {
-    int outcount = 0;
-    MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats);
-    if (outcount == MPI_UNDEFINED) break;
-
-    for (int i = 0; i < outcount; i++)
-    {
-      int idx = completed[i];
-      if (idx >= 0 && req_is_recv[idx])
-      {
-        int recv_node_i = req_node[idx];
-        data_packer(cache.recv_bufs[recv_node_i], src[recv_node_i], dst[recv_node_i], recv_node_i, UNPACK, VarList1, VarList2, Symmetry);
-        pending_recv--;
-      }
-    }
-  }
-
-  if (req_no > 0) MPI_Waitall(req_no, cache.reqs, cache.stats);
+      if (cache.send_buf_is_dev[node])
+      {
+        ensure_device_comm_buffer(cache.send_bufs_dev, cache.send_buf_caps_dev, node, slength);
+        data_packer_with_device_buffer(cache.send_bufs_dev[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
+      }
+      else
+      {
+        ensure_comm_buffer(cache.send_bufs, cache.send_buf_pinned, cache.send_buf_caps, node, slength);
+        data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
+      }
+    }
+  }
+#if USE_CUDA_BSSN || USE_CUDA_Z4C
+  if (cuda_device_sends > 0)
+    cudaDeviceSynchronize(); // single sync before the sends below are posted from device buffers
+#endif
+
+  for (node = 0; node < cpusize; node++)
+  {
+    if (node == myrank) continue;
+    const int slength = cache.send_lengths[node]; // length recorded by the packing loop above
+    if (slength > 0)
+    {
+      if (cache.send_buf_is_dev[node])
+        MPI_Isend((void *)cache.send_bufs_dev[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
+      else
+        MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
+      req_node[req_no] = node;
+      req_is_recv[req_no] = 0;
+      req_no++;
+    }
+  }
+
+  // Unpack as soon as receive completes to reduce pure wait time.
+  while (pending_recv > 0)
+  {
+    int outcount = 0;
+    MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats);
+    if (outcount == MPI_UNDEFINED) break;
+
+    for (int i = 0; i < outcount; i++)
+    {
+      int idx = completed[i];
+      if (idx >= 0 && req_is_recv[idx]) // completed sends need no unpacking
+      {
+        int recv_node_i = req_node[idx];
+        if (cache.recv_buf_is_dev[recv_node_i])
+          data_packer_with_device_buffer(cache.recv_bufs_dev[recv_node_i], src[recv_node_i], dst[recv_node_i], recv_node_i, UNPACK, VarList1, VarList2, Symmetry);
+        else
+          data_packer(cache.recv_bufs[recv_node_i], src[recv_node_i], dst[recv_node_i], recv_node_i, UNPACK, VarList1, VarList2, Symmetry);
+        pending_recv--;
+      }
+    }
+  }
+
+  if (req_no > 0) MPI_Waitall(req_no, cache.reqs, cache.stats); // completes whatever is still outstanding, sends included
 
   if (self_len > 0)
-    data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
+  { // local copy: unpack what was packed into recv buffer [myrank] earlier in this call
+    if (cache.recv_buf_is_dev[myrank])
+      data_packer_with_device_buffer(cache.recv_bufs_dev[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
+    else
+      data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
+  }
 }
 void Parallel::Sync_ensure_cache(MyList<Patch> *PatL, int Symmetry, SyncCache &cache)
 {
@@ -4795,11 +5137,20 @@ void Parallel::Sync_ensure_cache(MyList<Patch> *PatL, int Symmetry, SyncCache &c
     cache.recv_buf_caps = new int[cpusize];
     cache.send_buf_pinned = new unsigned char[cpusize];
     cache.recv_buf_pinned = new unsigned char[cpusize];
+    cache.send_buf_is_dev = new unsigned char[cpusize];
+    cache.recv_buf_is_dev = new unsigned char[cpusize];
+    cache.send_buf_caps_dev = new int[cpusize];
+    cache.recv_buf_caps_dev = new int[cpusize];
+    cache.send_bufs_dev = new double *[cpusize];
+    cache.recv_bufs_dev = new double *[cpusize];
     for (int i = 0; i < cpusize; i++)
     {
       cache.send_bufs[i] = cache.recv_bufs[i] = 0;
       cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
       cache.send_buf_pinned[i] = cache.recv_buf_pinned[i] = 0;
+      cache.send_buf_is_dev[i] = cache.recv_buf_is_dev[i] = 0;
+      cache.send_buf_caps_dev[i] = cache.recv_buf_caps_dev[i] = 0;
+      cache.send_bufs_dev[i] = cache.recv_bufs_dev[i] = 0;
     }
     cache.max_reqs = 2 * cpusize;
     cache.reqs = new MPI_Request[cache.max_reqs];
@@ -4878,7 +5229,7 @@ void Parallel::Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmet
   // Use cached lists with buffer-reusing transfer
   transfer_cached(cache.combined_src, cache.combined_dst, VarList, VarList, Symmetry, cache);
 }
-// Sync_start: pack and post MPI_Isend/Irecv, return immediately
+// Sync_start: pack and post MPI_Isend/Irecv, return immediately
 void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
                           SyncCache &cache, AsyncSyncState &state)
 {
@@ -4887,19 +5238,60 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
 
   // Now pack and post async MPI operations
   int myrank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-  int cpusize = cache.cpusize;
-  state.req_no = 0;
-  state.active = true;
-  state.pending_recv = 0;
-  // Allocate tracking arrays
-  delete[] state.req_node; delete[] state.req_is_recv;
-  state.req_node = new int[cache.max_reqs];
-  state.req_is_recv = new int[cache.max_reqs];
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  int cpusize = cache.cpusize;
+  state.req_no = 0;
+  state.active = true;
+  state.pending_recv = 0;
+  // Allocate tracking arrays
+  delete[] state.req_node; delete[] state.req_is_recv;
+  state.req_node = new int[cache.max_reqs];
+  state.req_is_recv = new int[cache.max_reqs];
 
   MyList<Parallel::gridseg> **src = cache.combined_src;
   MyList<Parallel::gridseg> **dst = cache.combined_dst;
 
+  int cuda_device_sends = 0;
+  int cuda_device_recvs = 0;
+  for (int n = 0; n < cpusize; n++)
+    cache.send_buf_is_dev[n] = cache.recv_buf_is_dev[n] = 0;
+#if USE_CUDA_BSSN || USE_CUDA_Z4C
+  const int state_count = cuda_state_var_count(VarList, VarList);
+  if (state_count < 0)
+  {
+    cout << "Parallel::Sync_start: variable lists do not match." << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+  if (cuda_aware_mpi_enabled())
+  {
+    for (int n = 0; n < cpusize; n++)
+    {
+      cache.send_buf_is_dev[n] = cuda_pack_to_device_eligible(src[myrank], dst[myrank], n, state_count, myrank) ? 1 : 0;
+      cache.recv_buf_is_dev[n] = cuda_recv_to_device_eligible(src[n], dst[n], n, state_count, myrank) ? 1 : 0;
+    }
+    cache.recv_buf_is_dev[myrank] = (cache.send_buf_is_dev[myrank] && cache.recv_buf_is_dev[myrank]) ? 1 : 0;
+    for (int n = 0; n < cpusize; n++)
+    {
+      cuda_device_sends += cache.send_buf_is_dev[n] ? 1 : 0;
+      cuda_device_recvs += cache.recv_buf_is_dev[n] ? 1 : 0;
+    }
+    if (cuda_mpi_diag_enabled())
+    {
+      static int diag_reported = 0;
+      int rep = diag_reported;
+      if (myrank == 0 && rep < 20)
+      {
+        if (__sync_bool_compare_and_swap(&diag_reported, rep, rep + 1))
+          fprintf(stderr, "[AMSS-CUDA-MPI][rank %d] Sync_start: device_sends=%d "
+                          "device_recvs=%d cuda_aware_mpi=%d\n",
+                  myrank, cuda_device_sends, cuda_device_recvs,
+                  cuda_aware_mpi_enabled() ? 1 : 0);
+      }
+    }
+  }
+  cache.cuda_aware_mode = (cuda_device_sends + cuda_device_recvs) > 0;
+#endif
+
   for (int node = 0; node < cpusize; node++)
   {
     if (node == myrank)
@@ -4913,11 +5305,22 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
     }
     if (rlength > 0)
     {
-      ensure_comm_buffer(cache.recv_bufs, cache.recv_buf_pinned, cache.recv_buf_caps, node, rlength);
-      state.req_node[state.req_no] = node;
-      state.req_is_recv[state.req_no] = 1;
-      state.pending_recv++;
-      MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
+      if (cache.recv_buf_is_dev[node])
+      {
+        ensure_device_comm_buffer(cache.recv_bufs_dev, cache.recv_buf_caps_dev, node, rlength);
+        state.req_node[state.req_no] = node;
+        state.req_is_recv[state.req_no] = 1;
+        state.pending_recv++;
+        MPI_Irecv((void *)cache.recv_bufs_dev[node], rlength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
+      }
+      else
+      {
+        ensure_comm_buffer(cache.recv_bufs, cache.recv_buf_pinned, cache.recv_buf_caps, node, rlength);
+        state.req_node[state.req_no] = node;
+        state.req_is_recv[state.req_no] = 1;
+        state.pending_recv++;
+        MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
+      }
     }
   }
 
@@ -4926,37 +5329,70 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
     if (node == myrank)
     {
       int length;
-      if (!cache.lengths_valid) {
-        length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
-        cache.recv_lengths[node] = length;
-      } else {
-        length = cache.recv_lengths[node];
+      if (!cache.lengths_valid) {
+        length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
+        cache.recv_lengths[node] = length;
+      } else {
+        length = cache.recv_lengths[node];
       }
       if (length > 0)
       {
-        ensure_comm_buffer(cache.recv_bufs, cache.recv_buf_pinned, cache.recv_buf_caps, node, length);
-        data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
+        if (cache.recv_buf_is_dev[node])
+        {
+          ensure_device_comm_buffer(cache.recv_bufs_dev, cache.recv_buf_caps_dev, node, length);
+          data_packer_with_device_buffer(cache.recv_bufs_dev[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
+        }
+        else
+        {
+          ensure_comm_buffer(cache.recv_bufs, cache.recv_buf_pinned, cache.recv_buf_caps, node, length);
+          data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
+        }
       }
     }
     else
     {
-      int slength;
-      if (!cache.lengths_valid) {
-        slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
-        cache.send_lengths[node] = slength;
-      } else {
-        slength = cache.send_lengths[node];
+      int slength;
+      if (!cache.lengths_valid) {
+        slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
+        cache.send_lengths[node] = slength;
+      } else {
+        slength = cache.send_lengths[node];
       }
       if (slength > 0)
       {
-        ensure_comm_buffer(cache.send_bufs, cache.send_buf_pinned, cache.send_buf_caps, node, slength);
-        data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
-        state.req_node[state.req_no] = node;
-        state.req_is_recv[state.req_no] = 0;
-        MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
+        if (cache.send_buf_is_dev[node])
+        {
+          ensure_device_comm_buffer(cache.send_bufs_dev, cache.send_buf_caps_dev, node, slength);
+          data_packer_with_device_buffer(cache.send_bufs_dev[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
+        }
+        else
+        {
+          ensure_comm_buffer(cache.send_bufs, cache.send_buf_pinned, cache.send_buf_caps, node, slength);
+          data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
+        }
       }
     }
   }
+#if USE_CUDA_BSSN || USE_CUDA_Z4C
+  if (cuda_device_sends > 0)
+    cudaDeviceSynchronize();
+#endif
+
+  for (int node = 0; node < cpusize; node++)
+  {
+    if (node == myrank)
+      continue;
+    const int slength = cache.send_lengths[node];
+    if (slength > 0)
+    {
+      state.req_node[state.req_no] = node;
+      state.req_is_recv[state.req_no] = 0;
+      if (cache.send_buf_is_dev[node])
+        MPI_Isend((void *)cache.send_bufs_dev[node], slength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
+      else
+        MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
+    }
+  }
   cache.lengths_valid = true;
   if (sync_profile_enabled())
   {
@@ -4969,9 +5405,9 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
 void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state,
                            MyList<var> *VarList, int Symmetry)
 {
-  if (!state.active)
-    return;
-
+  if (!state.active)
+    return;
+
   int myrank;
   MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
   MyList<Parallel::gridseg> **src = cache.combined_src;
@@ -4980,10 +5416,18 @@ void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state,
   double wait_sec = 0.0;
 
   // Unpack local data first (no MPI needed)
-  if (cache.recv_bufs[myrank] && cache.recv_lengths[myrank] > 0)
-    data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList, VarList, Symmetry);
-
-  // Progressive unpack of remote receives
+  if (cache.recv_buf_is_dev[myrank])
+  {
+    if (cache.recv_bufs_dev[myrank] && cache.recv_lengths[myrank] > 0)
+      data_packer_with_device_buffer(cache.recv_bufs_dev[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList, VarList, Symmetry);
+  }
+  else
+  {
+    if (cache.recv_bufs[myrank] && cache.recv_lengths[myrank] > 0)
+      data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList, VarList, Symmetry);
+  }
+
+  // Progressive unpack of remote receives
   if (state.pending_recv > 0 && state.req_no > 0)
   {
     int pending = state.pending_recv;
@@ -5001,7 +5445,10 @@ void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state,
         if (idx >= 0 && state.req_is_recv[idx])
         {
           int recv_node = state.req_node[idx];
-          data_packer(cache.recv_bufs[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList, VarList, Symmetry);
+          if (cache.recv_buf_is_dev[recv_node])
+            data_packer_with_device_buffer(cache.recv_bufs_dev[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList, VarList, Symmetry);
+          else
+            data_packer(cache.recv_bufs[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList, VarList, Symmetry);
           pending--;
         }
       }
@@ -5029,489 +5476,489 @@ void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state,
     sync_profile_maybe_log();
   }
 }
-// collect buffer grid segments or blocks for the periodic boundary condition of given patch
-// ---------------------------------------------------
-// |con |                                       |con |
-// |ner |                PhysBD                 |ner |
-// |-------------------------------------------------|
-// |    |                                       |    |
-// |Phy |                                       |Phy |
-// |sBD |                                       |BD  |
-// |    |                                       |    |
-// |    |                                       |    |
-// |    |                                       |    |
-// |-------------------------------------------------|
-// |con |               PhysBD                  |con |
-// |ner |                                       |ner |
-// ---------------------------------------------------
-// first order derivetive does not need conner information,
-// but second order derivative needs!
-/* the following code does not include conner part
-MyList<Parallel::gridseg> *Parallel::build_PhysBD_gsl(Patch *Pat)
-{
-       MyList<Parallel::gridseg> *cgsl,*gsc,*gsb=0,*p;
-       gsc = build_ghost_gsl(Pat);
-       for(int i=0;i<dim;i++)
-       {
-         double DH = gsc->data->Bg->getdX(i);
-// lower boundary
-         if(gsb)
-   {
-          p = new MyList<Parallel::gridseg>;
-          p->data = new Parallel::gridseg;
-          p->next=gsb;
-    gsb=p;
-   }
-   else
-   {
-          gsb = new MyList<Parallel::gridseg>;
-          gsb->data = new Parallel::gridseg;
-          gsb->next=0;
-   }
-         for(int j=0;j<dim;j++)
-   {
-           if(i == j)
-     {
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-             gsb->data->llb[i] = Pat->bbox[i]-ghost_width*DH;
-             gsb->data->uub[i] = Pat->bbox[i]-DH;
-#else
-#ifdef Cell
-             gsb->data->llb[i] = Pat->bbox[i]-ghost_width*DH;
-             gsb->data->uub[i] = Pat->bbox[i];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-             gsb->data->shape[i] = ghost_width;
-     }
-     else
-     {
-             gsb->data->llb[j] = Pat->bbox[j];
-             gsb->data->uub[j] = Pat->bbox[j+dim];
-             gsb->data->shape[j] = Pat->shape[j];
-     }
-   }
-   gsb->data->Bg = 0;  //vertual grid segment
-// upper boundary
-         p = new MyList<Parallel::gridseg>;
-         p->data = new Parallel::gridseg;
-         p->next=gsb;
-   gsb=p;
-         for(int j=0;j<dim;j++)
-   {
-           if(i == j)
-     {
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-             gsb->data->llb[i] = Pat->bbox[i+dim]+DH;
-             gsb->data->uub[i] = Pat->bbox[i+dim]+ghost_width*DH;
-#else
-#ifdef Cell
-             gsb->data->llb[i] = Pat->bbox[i+dim];
-             gsb->data->uub[i] = Pat->bbox[i+dim]+ghost_width*DH;
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-             gsb->data->shape[i] = ghost_width;
-     }
-     else
-     {
-             gsb->data->llb[j] = Pat->bbox[j];
-             gsb->data->uub[j] = Pat->bbox[j+dim];
-             gsb->data->shape[j] = Pat->shape[j];
-     }
-   }
-   gsb->data->Bg = 0;  //vertual grid segment
-       }
-
-       cgsl = gsl_and(gsc,gsb);
-
-       gsc->destroyList();
-       gsb->destroyList();
-
-       return cgsl;
-}
-*/
-// the following code includes conner part
-MyList<Parallel::gridseg> *Parallel::build_PhysBD_gsl(Patch *Pat)
-{
-  MyList<Parallel::gridseg> *cgsl, *gsc, *gsb = 0, *p;
-
-  gsc = build_complete_gsl(Pat);
-
-  gsb = new MyList<Parallel::gridseg>;
-  gsb->data = new Parallel::gridseg;
-  gsb->next = 0;
-  gsb->data->Bg = 0;
-
-  for (int j = 0; j < dim; j++)
-  {
-    gsb->data->llb[j] = Pat->bbox[j];
-    gsb->data->uub[j] = Pat->bbox[j + dim];
-    gsb->data->shape[j] = Pat->shape[j];
-  }
-
-  p = gsl_subtract(gsc, gsb);
-
-  gsc->destroyList();
-  gsb->destroyList();
-
-  cgsl = divide_gsl(p, Pat);
-
-  p->destroyList();
-
-  return cgsl;
-}
-MyList<Parallel::gridseg> *Parallel::divide_gsl(MyList<Parallel::gridseg> *p, Patch *Pat)
-{
-  MyList<Parallel::gridseg> *cgsl = 0;
-  while (p)
-  {
-    if (cgsl)
-      cgsl->catList(divide_gs(p, Pat));
-    else
-      cgsl = divide_gs(p, Pat);
-    p = p->next;
-  }
-
-  return cgsl;
-}
-// divide the gs into pices which locate either totally outside of the given Patch coordinate range
-// or totally inside it. It's usefull for periodic boundary condition
-MyList<Parallel::gridseg> *Parallel::divide_gs(MyList<Parallel::gridseg> *p, Patch *Pat)
-{
-  double DH[dim];
-  for (int i = 0; i < dim; i++)
-  {
-    DH[i] = p->data->Bg->getdX(i);
-  }
-
-  int num[dim];
-  double llb[3][dim], uub[3][dim];
-  for (int i = 0; i < dim; i++)
-  {
-    if (p->data->llb[i] < Pat->bbox[i] - DH[i] / 2)
-    {
-      if (p->data->uub[i] > Pat->bbox[i + dim] + DH[i] / 2)
-      {
-        num[i] = 3;
-        llb[0][i] = p->data->llb[i];
-        llb[1][i] = Pat->bbox[i];
-        uub[1][i] = Pat->bbox[i + dim];
-        uub[2][i] = p->data->uub[i];
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        uub[0][i] = Pat->bbox[i] - DH[i];
-        llb[2][i] = Pat->bbox[i + dim] + DH[i];
-#else
-#ifdef Cell
-        uub[0][i] = Pat->bbox[i];
-        llb[2][i] = Pat->bbox[i + dim];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      }
-      else if (p->data->uub[i] > Pat->bbox[i] + DH[i] / 2)
-      {
-        num[i] = 2;
-        llb[0][i] = p->data->llb[i];
-        llb[1][i] = Pat->bbox[i];
-        uub[1][i] = p->data->uub[i];
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        uub[0][i] = Pat->bbox[i] - DH[i];
-#else
-#ifdef Cell
-        uub[0][i] = Pat->bbox[i];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      }
-      else
-      {
-        num[i] = 1;
-        llb[0][i] = p->data->llb[i];
-        uub[0][i] = p->data->uub[i];
-      }
-    }
-    else if (p->data->llb[i] < Pat->bbox[i + dim] - DH[i] / 2)
-    {
-      if (p->data->uub[i] > Pat->bbox[i + dim] + DH[i] / 2)
-      {
-        num[i] = 2;
-        llb[0][i] = p->data->llb[i];
-        uub[0][i] = Pat->bbox[i + dim];
-        uub[1][i] = p->data->uub[i];
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        llb[1][i] = Pat->bbox[i + dim] + DH[i];
-#else
-#ifdef Cell
-        llb[1][i] = Pat->bbox[i + dim];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      }
-      else
-      {
-        num[i] = 1;
-        llb[0][i] = p->data->llb[i];
-        uub[0][i] = p->data->uub[i];
-      }
-    }
-    else
-    {
-      num[i] = 1;
-      llb[0][i] = p->data->llb[i];
-      uub[0][i] = p->data->uub[i];
-    }
-  }
-  MyList<Parallel::gridseg> *cgsl = 0, *gg;
-  int NN = 1;
-  for (int i = 0; i < dim; i++)
-    NN = NN * num[i];
-
-  for (int i = 0; i < NN; i++)
-  {
-    int ind[dim];
-    getarrayindex(dim, num, ind, i);
-    gg = clone_gsl(p, true);
-    for (int k = 0; k < dim; k++)
-    {
-      gg->data->llb[k] = llb[ind[k]][k];
-      gg->data->uub[k] = uub[ind[k]][k];
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-      gg->data->shape[k] = int((uub[ind[k]][k] - llb[ind[k]][k]) / DH[k] + 0.4) + 1;
-#else
-#ifdef Cell
-      gg->data->shape[k] = int((uub[ind[k]][k] - llb[ind[k]][k]) / DH[k] + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-    }
-
-    if (cgsl)
-      cgsl->catList(gg);
-    else
-      cgsl = gg;
-  }
-
-  return cgsl;
-}
-// after mod operation, according to overlape to determine real grid segments
-void Parallel::build_PhysBD_gstl(Patch *Pat, MyList<Parallel::gridseg> *srci, MyList<Parallel::gridseg> *dsti,
-                                 MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst)
-{
-  *out_src = *out_dst = 0;
-
-  if (!srci || !dsti)
-    return;
-
-  MyList<Parallel::gridseg> *s, *d;
-  MyList<Parallel::gridseg> *s2, *d2;
-
-  double llb[dim], uub[dim];
-
-  s = srci;
-  while (s)
-  {
-    Parallel::gridseg *sd = s->data;
-    d = dsti;
-    while (d)
-    {
-      Parallel::gridseg *dd = d->data;
-      bool flag = true;
-      for (int i = 0; i < dim; i++)
-      {
-        double SH = sd->Bg->getdX(i), DH = dd->Bg->getdX(i);
-        if (!feq(SH, DH, SH / 2))
-        {
-          cout << "Parallel::build_PhysBD_gstl meets different grid space SH = " << SH << ", DH = " << DH << endl;
-          MPI_Abort(MPI_COMM_WORLD, 1);
-        }
-        // we assume dst and src locate on the same Patch
-        if (dd->llb[i] < Pat->bbox[i])
-          llb[i] = Mymax(sd->llb[i], dd->llb[i] + Pat->bbox[dim + i] - Pat->bbox[i]);
-        else if (dd->llb[i] > Pat->bbox[i + dim])
-          llb[i] = Mymax(sd->llb[i], dd->llb[i] - Pat->bbox[dim + i] + Pat->bbox[i]);
-        else
-          llb[i] = Mymax(sd->llb[i], dd->llb[i]);
-
-        if (dd->uub[i] < Pat->bbox[i])
-          uub[i] = Mymin(sd->uub[i], dd->uub[i] + Pat->bbox[dim + i] - Pat->bbox[i]);
-        else if (dd->uub[i] > Pat->bbox[dim + i])
-          uub[i] = Mymin(sd->uub[i], dd->uub[i] - Pat->bbox[dim + i] + Pat->bbox[i]);
-        else
-          uub[i] = Mymin(sd->uub[i], dd->uub[i]);
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        if (llb[i] > uub[i] + SH / 2)
-        {
-          flag = false;
-          break;
-        } // special for isolated point
-#else
-#ifdef Cell
-        if (llb[i] > uub[i])
-        {
-          flag = false;
-          break;
-        }
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      }
-
-      if (flag)
-      {
-        if (!(*out_src))
-        {
-          *out_src = s2 = new MyList<Parallel::gridseg>;
-          *out_dst = d2 = new MyList<Parallel::gridseg>;
-          s2->data = new Parallel::gridseg;
-          d2->data = new Parallel::gridseg;
-        }
-        else
-        {
-          s2->next = new MyList<Parallel::gridseg>;
-          s2 = s2->next;
-          d2->next = new MyList<Parallel::gridseg>;
-          d2 = d2->next;
-          s2->data = new Parallel::gridseg;
-          d2->data = new Parallel::gridseg;
-        }
-
-        for (int i = 0; i < dim; i++)
-        {
-          double SH = sd->Bg->getdX(i), DH = dd->Bg->getdX(i);
-          s2->data->llb[i] = llb[i];
-          s2->data->uub[i] = uub[i];
-
-          if (dd->llb[i] < Pat->bbox[i])
-            d2->data->llb[i] = llb[i] - Pat->bbox[dim + i] + Pat->bbox[i];
-          else if (dd->llb[i] > Pat->bbox[i + dim])
-            d2->data->llb[i] = llb[i] + Pat->bbox[dim + i] - Pat->bbox[i];
-          else
-            d2->data->llb[i] = llb[i];
-
-          if (dd->uub[i] < Pat->bbox[i])
-            d2->data->uub[i] = uub[i] - Pat->bbox[dim + i] + Pat->bbox[i];
-          else if (dd->uub[i] > Pat->bbox[dim + i])
-            d2->data->uub[i] = uub[i] + Pat->bbox[dim + i] - Pat->bbox[i];
-          else
-            d2->data->uub[i] = uub[i];
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-          s2->data->shape[i] = int((s2->data->uub[i] - s2->data->llb[i]) / SH + 0.4) + 1;
-          d2->data->shape[i] = int((d2->data->uub[i] - d2->data->llb[i]) / DH + 0.4) + 1;
-#else
-#ifdef Cell
-          s2->data->shape[i] = int((s2->data->uub[i] - s2->data->llb[i]) / SH + 0.4);
-          d2->data->shape[i] = int((d2->data->uub[i] - d2->data->llb[i]) / DH + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-        }
-        s2->data->Bg = sd->Bg;
-        s2->next = 0;
-        d2->data->Bg = dd->Bg;
-        d2->next = 0;
-      }
-      d = d->next;
-    }
-    s = s->next;
-  }
-}
-void Parallel::PeriodicBD(Patch *Pat, MyList<var> *VarList, int Symmetry)
-{
-  int cpusize;
-  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-
-  MyList<Parallel::gridseg> *dst;
-  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
-  src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
-
-  dst = build_PhysBD_gsl(Pat);
-  for (int node = 0; node < cpusize; node++)
-  {
-    src[node] = build_owned_gsl0(Pat, node);                                          // for the part without ghost points and do not extend
-    build_PhysBD_gstl(Pat, src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
-  }
-
-  transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
-
-  if (dst)
-    dst->destroyList();
-  for (int node = 0; node < cpusize; node++)
-  {
-    if (src[node])
-      src[node]->destroyList();
-    if (transfer_src[node])
-      transfer_src[node]->destroyList();
-    if (transfer_dst[node])
-      transfer_dst[node]->destroyList();
-  }
-
-  delete[] src;
-  delete[] transfer_src;
-  delete[] transfer_dst;
-}
+// collect buffer grid segments or blocks for the periodic boundary condition of given patch
+// ---------------------------------------------------
+// |cor |                                       |cor |
+// |ner |                PhysBD                 |ner |
+// |-------------------------------------------------|
+// |    |                                       |    |
+// |Phy |                                       |Phy |
+// |sBD |                                       |sBD |
+// |    |                                       |    |
+// |    |                                       |    |
+// |    |                                       |    |
+// |-------------------------------------------------|
+// |cor |               PhysBD                  |cor |
+// |ner |                                       |ner |
+// ---------------------------------------------------
+// first-order derivatives do not need corner information,
+// but second-order derivatives do!
+/* the following code does not include the corner part
+MyList<Parallel::gridseg> *Parallel::build_PhysBD_gsl(Patch *Pat)
+{
+       MyList<Parallel::gridseg> *cgsl,*gsc,*gsb=0,*p;
+       gsc = build_ghost_gsl(Pat);
+       for(int i=0;i<dim;i++)
+       {
+         double DH = gsc->data->Bg->getdX(i);
+// lower boundary
+         if(gsb)
+   {
+          p = new MyList<Parallel::gridseg>;
+          p->data = new Parallel::gridseg;
+          p->next=gsb;
+    gsb=p;
+   }
+   else
+   {
+          gsb = new MyList<Parallel::gridseg>;
+          gsb->data = new Parallel::gridseg;
+          gsb->next=0;
+   }
+         for(int j=0;j<dim;j++)
+   {
+           if(i == j)
+     {
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+             gsb->data->llb[i] = Pat->bbox[i]-ghost_width*DH;
+             gsb->data->uub[i] = Pat->bbox[i]-DH;
+#else
+#ifdef Cell
+             gsb->data->llb[i] = Pat->bbox[i]-ghost_width*DH;
+             gsb->data->uub[i] = Pat->bbox[i];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+             gsb->data->shape[i] = ghost_width;
+     }
+     else
+     {
+             gsb->data->llb[j] = Pat->bbox[j];
+             gsb->data->uub[j] = Pat->bbox[j+dim];
+             gsb->data->shape[j] = Pat->shape[j];
+     }
+   }
+   gsb->data->Bg = 0;  //vertual grid segment
+// upper boundary
+         p = new MyList<Parallel::gridseg>;
+         p->data = new Parallel::gridseg;
+         p->next=gsb;
+   gsb=p;
+         for(int j=0;j<dim;j++)
+   {
+           if(i == j)
+     {
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+             gsb->data->llb[i] = Pat->bbox[i+dim]+DH;
+             gsb->data->uub[i] = Pat->bbox[i+dim]+ghost_width*DH;
+#else
+#ifdef Cell
+             gsb->data->llb[i] = Pat->bbox[i+dim];
+             gsb->data->uub[i] = Pat->bbox[i+dim]+ghost_width*DH;
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+             gsb->data->shape[i] = ghost_width;
+     }
+     else
+     {
+             gsb->data->llb[j] = Pat->bbox[j];
+             gsb->data->uub[j] = Pat->bbox[j+dim];
+             gsb->data->shape[j] = Pat->shape[j];
+     }
+   }
+   gsb->data->Bg = 0;  //vertual grid segment
+       }
+
+       cgsl = gsl_and(gsc,gsb);
+
+       gsc->destroyList();
+       gsb->destroyList();
+
+       return cgsl;
+}
+*/
+// the following code includes the corner part
+MyList<Parallel::gridseg> *Parallel::build_PhysBD_gsl(Patch *Pat)
+{
+  MyList<Parallel::gridseg> *cgsl, *gsc, *gsb = 0, *p;
+
+  gsc = build_complete_gsl(Pat);
+
+  gsb = new MyList<Parallel::gridseg>;
+  gsb->data = new Parallel::gridseg;
+  gsb->next = 0;
+  gsb->data->Bg = 0;
+
+  for (int j = 0; j < dim; j++)
+  {
+    gsb->data->llb[j] = Pat->bbox[j];
+    gsb->data->uub[j] = Pat->bbox[j + dim];
+    gsb->data->shape[j] = Pat->shape[j];
+  }
+
+  p = gsl_subtract(gsc, gsb);
+
+  gsc->destroyList();
+  gsb->destroyList();
+
+  cgsl = divide_gsl(p, Pat);
+
+  p->destroyList();
+
+  return cgsl;
+}
+MyList<Parallel::gridseg> *Parallel::divide_gsl(MyList<Parallel::gridseg> *p, Patch *Pat)
+{
+  MyList<Parallel::gridseg> *cgsl = 0;
+  while (p)
+  {
+    if (cgsl)
+      cgsl->catList(divide_gs(p, Pat));
+    else
+      cgsl = divide_gs(p, Pat);
+    p = p->next;
+  }
+
+  return cgsl;
+}
+// divide the gs into pieces, each located either totally outside of the given Patch coordinate range
+// or totally inside it. This is useful for the periodic boundary condition
+MyList<Parallel::gridseg> *Parallel::divide_gs(MyList<Parallel::gridseg> *p, Patch *Pat)
+{
+  double DH[dim];
+  for (int i = 0; i < dim; i++)
+  {
+    DH[i] = p->data->Bg->getdX(i);
+  }
+
+  int num[dim];
+  double llb[3][dim], uub[3][dim];
+  for (int i = 0; i < dim; i++)
+  {
+    if (p->data->llb[i] < Pat->bbox[i] - DH[i] / 2)
+    {
+      if (p->data->uub[i] > Pat->bbox[i + dim] + DH[i] / 2)
+      {
+        num[i] = 3;
+        llb[0][i] = p->data->llb[i];
+        llb[1][i] = Pat->bbox[i];
+        uub[1][i] = Pat->bbox[i + dim];
+        uub[2][i] = p->data->uub[i];
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        uub[0][i] = Pat->bbox[i] - DH[i];
+        llb[2][i] = Pat->bbox[i + dim] + DH[i];
+#else
+#ifdef Cell
+        uub[0][i] = Pat->bbox[i];
+        llb[2][i] = Pat->bbox[i + dim];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      }
+      else if (p->data->uub[i] > Pat->bbox[i] + DH[i] / 2)
+      {
+        num[i] = 2;
+        llb[0][i] = p->data->llb[i];
+        llb[1][i] = Pat->bbox[i];
+        uub[1][i] = p->data->uub[i];
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        uub[0][i] = Pat->bbox[i] - DH[i];
+#else
+#ifdef Cell
+        uub[0][i] = Pat->bbox[i];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      }
+      else
+      {
+        num[i] = 1;
+        llb[0][i] = p->data->llb[i];
+        uub[0][i] = p->data->uub[i];
+      }
+    }
+    else if (p->data->llb[i] < Pat->bbox[i + dim] - DH[i] / 2)
+    {
+      if (p->data->uub[i] > Pat->bbox[i + dim] + DH[i] / 2)
+      {
+        num[i] = 2;
+        llb[0][i] = p->data->llb[i];
+        uub[0][i] = Pat->bbox[i + dim];
+        uub[1][i] = p->data->uub[i];
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        llb[1][i] = Pat->bbox[i + dim] + DH[i];
+#else
+#ifdef Cell
+        llb[1][i] = Pat->bbox[i + dim];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      }
+      else
+      {
+        num[i] = 1;
+        llb[0][i] = p->data->llb[i];
+        uub[0][i] = p->data->uub[i];
+      }
+    }
+    else
+    {
+      num[i] = 1;
+      llb[0][i] = p->data->llb[i];
+      uub[0][i] = p->data->uub[i];
+    }
+  }
+  MyList<Parallel::gridseg> *cgsl = 0, *gg;
+  int NN = 1;
+  for (int i = 0; i < dim; i++)
+    NN = NN * num[i];
+
+  for (int i = 0; i < NN; i++)
+  {
+    int ind[dim];
+    getarrayindex(dim, num, ind, i);
+    gg = clone_gsl(p, true);
+    for (int k = 0; k < dim; k++)
+    {
+      gg->data->llb[k] = llb[ind[k]][k];
+      gg->data->uub[k] = uub[ind[k]][k];
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+      gg->data->shape[k] = int((uub[ind[k]][k] - llb[ind[k]][k]) / DH[k] + 0.4) + 1;
+#else
+#ifdef Cell
+      gg->data->shape[k] = int((uub[ind[k]][k] - llb[ind[k]][k]) / DH[k] + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+    }
+
+    if (cgsl)
+      cgsl->catList(gg);
+    else
+      cgsl = gg;
+  }
+
+  return cgsl;
+}
+// after the mod operation, determine the real grid segments according to their overlap
+void Parallel::build_PhysBD_gstl(Patch *Pat, MyList<Parallel::gridseg> *srci, MyList<Parallel::gridseg> *dsti,
+                                 MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst)
+{
+  *out_src = *out_dst = 0;
+
+  if (!srci || !dsti)
+    return;
+
+  MyList<Parallel::gridseg> *s, *d;
+  MyList<Parallel::gridseg> *s2, *d2;
+
+  double llb[dim], uub[dim];
+
+  s = srci;
+  while (s)
+  {
+    Parallel::gridseg *sd = s->data;
+    d = dsti;
+    while (d)
+    {
+      Parallel::gridseg *dd = d->data;
+      bool flag = true;
+      for (int i = 0; i < dim; i++)
+      {
+        double SH = sd->Bg->getdX(i), DH = dd->Bg->getdX(i);
+        if (!feq(SH, DH, SH / 2))
+        {
+          cout << "Parallel::build_PhysBD_gstl meets different grid space SH = " << SH << ", DH = " << DH << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+        // we assume dst and src are located on the same Patch
+        if (dd->llb[i] < Pat->bbox[i])
+          llb[i] = Mymax(sd->llb[i], dd->llb[i] + Pat->bbox[dim + i] - Pat->bbox[i]);
+        else if (dd->llb[i] > Pat->bbox[i + dim])
+          llb[i] = Mymax(sd->llb[i], dd->llb[i] - Pat->bbox[dim + i] + Pat->bbox[i]);
+        else
+          llb[i] = Mymax(sd->llb[i], dd->llb[i]);
+
+        if (dd->uub[i] < Pat->bbox[i])
+          uub[i] = Mymin(sd->uub[i], dd->uub[i] + Pat->bbox[dim + i] - Pat->bbox[i]);
+        else if (dd->uub[i] > Pat->bbox[dim + i])
+          uub[i] = Mymin(sd->uub[i], dd->uub[i] - Pat->bbox[dim + i] + Pat->bbox[i]);
+        else
+          uub[i] = Mymin(sd->uub[i], dd->uub[i]);
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        if (llb[i] > uub[i] + SH / 2)
+        {
+          flag = false;
+          break;
+        } // special for isolated point
+#else
+#ifdef Cell
+        if (llb[i] > uub[i])
+        {
+          flag = false;
+          break;
+        }
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      }
+
+      if (flag)
+      {
+        if (!(*out_src))
+        {
+          *out_src = s2 = new MyList<Parallel::gridseg>;
+          *out_dst = d2 = new MyList<Parallel::gridseg>;
+          s2->data = new Parallel::gridseg;
+          d2->data = new Parallel::gridseg;
+        }
+        else
+        {
+          s2->next = new MyList<Parallel::gridseg>;
+          s2 = s2->next;
+          d2->next = new MyList<Parallel::gridseg>;
+          d2 = d2->next;
+          s2->data = new Parallel::gridseg;
+          d2->data = new Parallel::gridseg;
+        }
+
+        for (int i = 0; i < dim; i++)
+        {
+          double SH = sd->Bg->getdX(i), DH = dd->Bg->getdX(i);
+          s2->data->llb[i] = llb[i];
+          s2->data->uub[i] = uub[i];
+
+          if (dd->llb[i] < Pat->bbox[i])
+            d2->data->llb[i] = llb[i] - Pat->bbox[dim + i] + Pat->bbox[i];
+          else if (dd->llb[i] > Pat->bbox[i + dim])
+            d2->data->llb[i] = llb[i] + Pat->bbox[dim + i] - Pat->bbox[i];
+          else
+            d2->data->llb[i] = llb[i];
+
+          if (dd->uub[i] < Pat->bbox[i])
+            d2->data->uub[i] = uub[i] - Pat->bbox[dim + i] + Pat->bbox[i];
+          else if (dd->uub[i] > Pat->bbox[dim + i])
+            d2->data->uub[i] = uub[i] + Pat->bbox[dim + i] - Pat->bbox[i];
+          else
+            d2->data->uub[i] = uub[i];
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+          s2->data->shape[i] = int((s2->data->uub[i] - s2->data->llb[i]) / SH + 0.4) + 1;
+          d2->data->shape[i] = int((d2->data->uub[i] - d2->data->llb[i]) / DH + 0.4) + 1;
+#else
+#ifdef Cell
+          s2->data->shape[i] = int((s2->data->uub[i] - s2->data->llb[i]) / SH + 0.4);
+          d2->data->shape[i] = int((d2->data->uub[i] - d2->data->llb[i]) / DH + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+        }
+        s2->data->Bg = sd->Bg;
+        s2->next = 0;
+        d2->data->Bg = dd->Bg;
+        d2->next = 0;
+      }
+      d = d->next;
+    }
+    s = s->next;
+  }
+}
+void Parallel::PeriodicBD(Patch *Pat, MyList<var> *VarList, int Symmetry)
+{
+  int cpusize;
+  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
+
+  MyList<Parallel::gridseg> *dst;
+  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
+  src = new MyList<Parallel::gridseg> *[cpusize];
+  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
+  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
+
+  dst = build_PhysBD_gsl(Pat);
+  for (int node = 0; node < cpusize; node++)
+  {
+    src[node] = build_owned_gsl0(Pat, node);                                          // for the part without ghost points and do not extend
+    build_PhysBD_gstl(Pat, src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
+  }
+
+  transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
+
+  if (dst)
+    dst->destroyList();
+  for (int node = 0; node < cpusize; node++)
+  {
+    if (src[node])
+      src[node]->destroyList();
+    if (transfer_src[node])
+      transfer_src[node]->destroyList();
+    if (transfer_dst[node])
+      transfer_dst[node]->destroyList();
+  }
+
+  delete[] src;
+  delete[] transfer_src;
+  delete[] transfer_dst;
+}
 double Parallel::L2Norm(Patch *Pat, var *vf)
 {
   int myrank;
   MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
-  double tvf, dtvf = 0;
-  int BDW = ghost_width;
-
-  MyList<Block> *BP = Pat->blb;
-  while (BP)
-  {
-    Block *cg = BP->data;
-    if (myrank == cg->rank)
-    {
-      f_l2normhelper(cg->shape, cg->X[0], cg->X[1], cg->X[2],
-                     Pat->bbox[0], Pat->bbox[1], Pat->bbox[2],
-                     Pat->bbox[3], Pat->bbox[4], Pat->bbox[5],
-                     cg->fgfs[vf->sgfn], tvf, BDW);
-      dtvf += tvf;
-    }
-    if (BP == Pat->ble)
-      break;
-    BP = BP->next;
-  }
-
-  MPI_Allreduce(&dtvf, &tvf, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-
-  tvf = sqrt(tvf);
+
+  double tvf, dtvf = 0;
+  int BDW = ghost_width;
+
+  MyList<Block> *BP = Pat->blb;
+  while (BP)
+  {
+    Block *cg = BP->data;
+    if (myrank == cg->rank)
+    {
+      f_l2normhelper(cg->shape, cg->X[0], cg->X[1], cg->X[2],
+                     Pat->bbox[0], Pat->bbox[1], Pat->bbox[2],
+                     Pat->bbox[3], Pat->bbox[4], Pat->bbox[5],
+                     cg->fgfs[vf->sgfn], tvf, BDW);
+      dtvf += tvf;
+    }
+    if (BP == Pat->ble)
+      break;
+    BP = BP->next;
+  }
+
+  MPI_Allreduce(&dtvf, &tvf, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+
+  tvf = sqrt(tvf);
 
   return tvf;
 }
@@ -5554,30 +6001,30 @@ double Parallel::L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here)
 {
   int myrank;
   MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
-  double tvf, dtvf = 0;
-  int BDW = ghost_width;
-
-  MyList<Block> *BP = Pat->blb;
-  while (BP)
-  {
-    Block *cg = BP->data;
-    if (myrank == cg->rank)
-    {
-      f_l2normhelper(cg->shape, cg->X[0], cg->X[1], cg->X[2],
-                     Pat->bbox[0], Pat->bbox[1], Pat->bbox[2],
-                     Pat->bbox[3], Pat->bbox[4], Pat->bbox[5],
-                     cg->fgfs[vf->sgfn], tvf, BDW);
-      dtvf += tvf;
-    }
-    if (BP == Pat->ble)
-      break;
-    BP = BP->next;
-  }
-
-  MPI_Allreduce(&dtvf, &tvf, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
-
-  tvf = sqrt(tvf);
+
+  double tvf, dtvf = 0;
+  int BDW = ghost_width;
+
+  MyList<Block> *BP = Pat->blb;
+  while (BP)
+  {
+    Block *cg = BP->data;
+    if (myrank == cg->rank)
+    {
+      f_l2normhelper(cg->shape, cg->X[0], cg->X[1], cg->X[2],
+                     Pat->bbox[0], Pat->bbox[1], Pat->bbox[2],
+                     Pat->bbox[3], Pat->bbox[4], Pat->bbox[5],
+                     cg->fgfs[vf->sgfn], tvf, BDW);
+      dtvf += tvf;
+    }
+    if (BP == Pat->ble)
+      break;
+    BP = BP->next;
+  }
+
+  MPI_Allreduce(&dtvf, &tvf, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
+
+  tvf = sqrt(tvf);
 
   return tvf;
 }
@@ -5619,538 +6066,660 @@ void Parallel::L2Norm7(Patch *Pat, var **vf, double *norms, MPI_Comm Comm_here)
 void Parallel::checkgsl(MyList<Parallel::gridseg> *pp, bool first_only)
 {
   int myrank = 0;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-  if (myrank == 0)
-  {
-    if (!pp)
-      cout << " Parallel::checkgsl meets empty gsl" << endl;
-    while (pp)
-    {
-      if (pp->data->Bg)
-        cout << " on node#" << pp->data->Bg->rank << endl;
-      else
-        cout << " virtual grid segment" << endl;
-      cout << " shape: (";
-      for (int i = 0; i < dim; i++)
-      {
-        if (i < dim - 1)
-          cout << pp->data->shape[i] << ",";
-        else
-          cout << pp->data->shape[i] << ")" << endl;
-      }
-      cout << " range: (";
-      for (int i = 0; i < dim; i++)
-      {
-        if (i < dim - 1)
-          cout << pp->data->llb[i] << ":" << pp->data->uub[i] << ",";
-        else
-          cout << pp->data->llb[i] << ":" << pp->data->uub[i] << ")" << endl;
-      }
-      if (first_only)
-        return;
-      pp = pp->next;
-    }
-  }
-}
-void Parallel::checkvarl(MyList<var> *pp, bool first_only)
-{
-  int myrank = 0;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-  if (myrank == 0)
-  {
-    while (pp)
-    {
-      cout << "name: " << pp->data->name << endl;
-      cout << "SoA = (" << pp->data->SoA[0] << "," << pp->data->SoA[1] << "," << pp->data->SoA[2] << ")" << endl;
-      cout << "sgfn = " << pp->data->sgfn << endl;
-      if (first_only)
-        return;
-      pp = pp->next;
-    }
-  }
-}
-void Parallel::prepare_inter_time_level(MyList<Patch> *PatL,
-                                        MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
-                                        MyList<var> *VarList3 /* target (t+a*dt) */, int tindex)
-{
-  while (PatL)
-  {
-    prepare_inter_time_level(PatL->data, VarList1, VarList2, VarList3, tindex);
-    PatL = PatL->next;
-  }
-}
-void Parallel::prepare_inter_time_level(Patch *Pat,
-                                        MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
-                                        MyList<var> *VarList3 /* target (t+a*dt) */, int tindex)
-{
-  int myrank = 0;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
-  MyList<var> *varl1;
-  MyList<var> *varl2;
-  MyList<var> *varl3;
-
-  MyList<Block> *BP = Pat->blb;
-  while (BP)
-  {
-    Block *cg = BP->data;
-    if (myrank == cg->rank)
-    {
-      varl1 = VarList1;
-      varl2 = VarList2;
-      varl3 = VarList3;
-      while (varl1)
-      {
-        if (tindex == 0)
-          f_average(cg->shape, cg->fgfs[varl1->data->sgfn], cg->fgfs[varl2->data->sgfn], cg->fgfs[varl3->data->sgfn]);
-        else if (tindex == 1)
-          f_average3(cg->shape, cg->fgfs[varl1->data->sgfn], cg->fgfs[varl2->data->sgfn], cg->fgfs[varl3->data->sgfn]);
-        else if (tindex == -1)
-          // just change data order to use average3
-          f_average3(cg->shape, cg->fgfs[varl2->data->sgfn], cg->fgfs[varl1->data->sgfn], cg->fgfs[varl3->data->sgfn]);
-        else
-        {
-          cout << "error tindex in Parallel::prepare_inter_time_level" << endl;
-          MPI_Abort(MPI_COMM_WORLD, 1);
-        }
-        varl1 = varl1->next;
-        varl2 = varl2->next;
-        varl3 = varl3->next;
-      }
-    }
-    if (BP == Pat->ble)
-      break;
-    BP = BP->next;
-  }
-}
-void Parallel::prepare_inter_time_level(MyList<Patch> *PatL,
-                                        MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
-                                        MyList<var> *VarList3 /* source (t-dt) */, MyList<var> *VarList4 /* target (t+a*dt) */, int tindex)
-{
-  while (PatL)
-  {
-    prepare_inter_time_level(PatL->data, VarList1, VarList2, VarList3, VarList4, tindex);
-    PatL = PatL->next;
-  }
-}
-void Parallel::prepare_inter_time_level(Patch *Pat,
-                                        MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
-                                        MyList<var> *VarList3 /* source (t-dt) */, MyList<var> *VarList4 /* target (t+a*dt) */, int tindex)
-{
-  int myrank = 0;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
-  MyList<var> *varl1;
-  MyList<var> *varl2;
-  MyList<var> *varl3;
-  MyList<var> *varl4;
-
-  MyList<Block> *BP = Pat->blb;
-  while (BP)
-  {
-    Block *cg = BP->data;
-    if (myrank == cg->rank)
-    {
-      varl1 = VarList1;
-      varl2 = VarList2;
-      varl3 = VarList3;
-      varl4 = VarList4;
-      while (varl1)
-      {
-        if (tindex == 0)
-          f_average2(cg->shape, cg->fgfs[varl1->data->sgfn], cg->fgfs[varl2->data->sgfn],
-                     cg->fgfs[varl3->data->sgfn], cg->fgfs[varl4->data->sgfn]);
-        else if (tindex == 1)
-          f_average2p(cg->shape, cg->fgfs[varl1->data->sgfn], cg->fgfs[varl2->data->sgfn],
-                      cg->fgfs[varl3->data->sgfn], cg->fgfs[varl4->data->sgfn]);
-        else if (tindex == -1)
-          f_average2m(cg->shape, cg->fgfs[varl1->data->sgfn], cg->fgfs[varl2->data->sgfn],
-                      cg->fgfs[varl3->data->sgfn], cg->fgfs[varl4->data->sgfn]);
-        else
-        {
-          cout << "error tindex in long cgh::prepare_inter_time_level" << endl;
-          MPI_Abort(MPI_COMM_WORLD, 1);
-        }
-        varl1 = varl1->next;
-        varl2 = varl2->next;
-        varl3 = varl3->next;
-        varl4 = varl4->next;
-      }
-    }
-    if (BP == Pat->ble)
-      break;
-    BP = BP->next;
-  }
-}
-void Parallel::Prolong(Patch *Patc, Patch *Patf,
-                       MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
-                       int Symmetry)
-{
-  if (Patc->lev >= Patf->lev)
-  {
-    cout << "Parallel::Prolong: meet requst of Prolong from lev#" << Patc->lev << " to lev#" << Patf->lev << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-  int cpusize;
-  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-
-  MyList<Parallel::gridseg> *dst;
-  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
-  src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
-
-  dst = build_complete_gsl(Patf); // including ghost
-  for (int node = 0; node < cpusize; node++)
-  {
-    src[node] = build_owned_gsl4(Patc, node, Symmetry);                   // - buffer - ghost - BD ghost
-    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
-  }
-
-  transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
-
-  if (dst)
-    dst->destroyList();
-  for (int node = 0; node < cpusize; node++)
-  {
-    if (src[node])
-      src[node]->destroyList();
-    if (transfer_src[node])
-      transfer_src[node]->destroyList();
-    if (transfer_dst[node])
-      transfer_dst[node]->destroyList();
-  }
-
-  delete[] src;
-  delete[] transfer_src;
-  delete[] transfer_dst;
-}
-void Parallel::Restrict(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
-                        MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
-                        int Symmetry)
-{
-  if (PatcL->data->lev >= PatfL->data->lev)
-  {
-    cout << "Parallel::Restrict: meet requst of Restrict from lev#" << PatfL->data->lev << " to lev#" << PatcL->data->lev << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-  int cpusize;
-  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-
-  MyList<Parallel::gridseg> *dst;
-  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
-  src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
-
-  dst = build_complete_gsl(PatcL); // including ghost
-  for (int node = 0; node < cpusize; node++)
-  {
-#if 0
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif    
-      src[node]=build_owned_gsl(PatfL,node,2,Symmetry);   // - buffer - ghost
-#else
-#ifdef Cell
-      src[node]=build_owned_gsl(PatfL,node,4,Symmetry); // - buffer - ghost - BD ghost
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-#else
-    // it seems bam always use this
-    src[node] = build_owned_gsl(PatfL, node, 2, Symmetry); // - buffer - ghost
-#endif
-    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
-  }
-
-  transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
-
-  if (dst)
-    dst->destroyList();
-  for (int node = 0; node < cpusize; node++)
-  {
-    if (src[node])
-      src[node]->destroyList();
-    if (transfer_src[node])
-      transfer_src[node]->destroyList();
-    if (transfer_dst[node])
-      transfer_dst[node]->destroyList();
-  }
-
-  delete[] src;
-  delete[] transfer_src;
-  delete[] transfer_dst;
-}
-void Parallel::Restrict_after(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
-                              MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
-                              int Symmetry)
-{
-  if (PatcL->data->lev >= PatfL->data->lev)
-  {
-    cout << "Parallel::Restrict: meet requst of Restrict from lev#" << PatfL->data->lev << " to lev#" << PatcL->data->lev << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-  int cpusize;
-  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-
-  MyList<Parallel::gridseg> *dst;
-  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
-  src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
-
-  dst = build_complete_gsl(PatcL); // including ghost
-  for (int node = 0; node < cpusize; node++)
-  {
-    src[node] = build_owned_gsl(PatfL, node, 3, Symmetry); // - ghost - BD ghost
-
-    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
-  }
-
-  transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
-
-  if (dst)
-    dst->destroyList();
-  for (int node = 0; node < cpusize; node++)
-  {
-    if (src[node])
-      src[node]->destroyList();
-    if (transfer_src[node])
-      transfer_src[node]->destroyList();
-    if (transfer_dst[node])
-      transfer_dst[node]->destroyList();
-  }
-
-  delete[] src;
-  delete[] transfer_src;
-  delete[] transfer_dst;
-}
-// for the same time level
-void Parallel::OutBdLow2Hi(Patch *Patc, Patch *Patf,
-                           MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
-                           int Symmetry)
-{
-  if (Patc->lev >= Patf->lev)
-  {
-    cout << "Parallel::OutBdLow2Hi: meet requst of Prolong from lev#" << Patc->lev << " to lev#" << Patf->lev << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-  int cpusize;
-  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-
-  MyList<Parallel::gridseg> *dst;
-  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
-  src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
-
-  dst = build_buffer_gsl(Patf); // buffer region only
-
-  for (int node = 0; node < cpusize; node++)
-  {
-    src[node] = build_owned_gsl4(Patc, node, Symmetry);                   // - buffer - ghost - BD ghost
-    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
-  }
-
-  transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
-
-  if (dst)
-    dst->destroyList();
-  for (int node = 0; node < cpusize; node++)
-  {
-    if (src[node])
-      src[node]->destroyList();
-    if (transfer_src[node])
-      transfer_src[node]->destroyList();
-    if (transfer_dst[node])
-      transfer_dst[node]->destroyList();
-  }
-
-  delete[] src;
-  delete[] transfer_src;
-  delete[] transfer_dst;
-}
-void Parallel::OutBdLow2Hi(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
-                           MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
-                           int Symmetry)
-{
-  MyList<Patch> *Pp, *Ppc;
-  Ppc = PatcL;
-  while (Ppc)
-  {
-    Pp = PatfL;
-    while (Pp)
-    {
-      if (Ppc->data->lev >= Pp->data->lev)
-      {
-        cout << "Parallel::OutBdLow2Hi(list): meet requst of Prolong from lev#" << Ppc->data->lev << " to lev#" << Pp->data->lev << endl;
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-      Pp = Pp->next;
-    }
-    Ppc = Ppc->next;
-  }
-
-  int cpusize;
-  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-
-  MyList<Parallel::gridseg> *dst;
-  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
-  src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
-
-  dst = build_buffer_gsl(PatfL); // buffer region only
-
-  for (int node = 0; node < cpusize; node++)
-  {
-    src[node] = build_owned_gsl(PatcL, node, 4, Symmetry);                // - buffer - ghost - BD ghost
-    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
-  }
-
-  transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
-
-  if (dst)
-    dst->destroyList();
-  for (int node = 0; node < cpusize; node++)
-  {
-    if (src[node])
-      src[node]->destroyList();
-    if (transfer_src[node])
-      transfer_src[node]->destroyList();
-    if (transfer_dst[node])
-      transfer_dst[node]->destroyList();
-  }
-
-  delete[] src;
-  delete[] transfer_src;
-  delete[] transfer_dst;
-}
-// for the same time level
-void Parallel::OutBdLow2Himix(Patch *Patc, Patch *Patf,
-                              MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
-                              int Symmetry)
-{
-  if (Patc->lev >= Patf->lev)
-  {
-    cout << "Parallel::OutBdLow2Himix: meet requst of Prolong from lev#" << Patc->lev << " to lev#" << Patf->lev << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-  int cpusize;
-  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-
-  MyList<Parallel::gridseg> *dst;
-  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
-  src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
-
-  dst = build_buffer_gsl(Patf); // buffer region only
-
-  for (int node = 0; node < cpusize; node++)
-  {
-    src[node] = build_owned_gsl4(Patc, node, Symmetry);                   // - buffer - ghost - BD ghost
-    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
-  }
-
-  transfermix(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
-
-  if (dst)
-    dst->destroyList();
-  for (int node = 0; node < cpusize; node++)
-  {
-    if (src[node])
-      src[node]->destroyList();
-    if (transfer_src[node])
-      transfer_src[node]->destroyList();
-    if (transfer_dst[node])
-      transfer_dst[node]->destroyList();
-  }
-
-  delete[] src;
-  delete[] transfer_src;
-  delete[] transfer_dst;
-
-  // do not need this, we have done after calling of this routine in ProlongRestrict or RestrictProlong
-  //    Sync(Patf,VarList2,Symmetry);  // fine level points may be not enough for interpolation
-}
-void Parallel::OutBdLow2Himix(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
-                              MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
-                              int Symmetry)
-{
-  MyList<Patch> *Pp, *Ppc;
-  Ppc = PatcL;
-  while (Ppc)
-  {
-    Pp = PatfL;
-    while (Pp)
-    {
-      if (Ppc->data->lev >= Pp->data->lev)
-      {
-        cout << "Parallel::OutBdLow2Himix(list): meet requst of Prolong from lev#" << Ppc->data->lev << " to lev#" << Pp->data->lev << endl;
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-      Pp = Pp->next;
-    }
-    Ppc = Ppc->next;
-  }
-
-  int cpusize;
-  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-
-  MyList<Parallel::gridseg> *dst;
-  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
-  src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
-
-  dst = build_buffer_gsl(PatfL); // buffer region only
-
-  for (int node = 0; node < cpusize; node++)
-  {
-    src[node] = build_owned_gsl(PatcL, node, 4, Symmetry);                // - buffer - ghost - BD ghost
-    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
-  }
-
-  transfermix(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
-
-  if (dst)
-    dst->destroyList();
-  for (int node = 0; node < cpusize; node++)
-  {
-    if (src[node])
-      src[node]->destroyList();
-    if (transfer_src[node])
-      transfer_src[node]->destroyList();
-    if (transfer_dst[node])
-      transfer_dst[node]->destroyList();
-  }
-
-  delete[] src;
-  delete[] transfer_src;
-  delete[] transfer_dst;
-}
-
-// Restrict_cached: cache grid segment lists, reuse buffers via transfer_cached
-void Parallel::Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
-                               MyList<var> *VarList1, MyList<var> *VarList2,
-                               int Symmetry, SyncCache &cache)
-{
-  if (!cache.valid)
-  {
-    int cpusize;
-    MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-    cache.cpusize = cpusize;
-
-    if (!cache.combined_src)
-    {
-      cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
-      cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
-      cache.send_lengths = new int[cpusize];
-      cache.recv_lengths = new int[cpusize];
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  if (myrank == 0)
+  {
+    if (!pp)
+      cout << " Parallel::checkgsl meets empty gsl" << endl;
+    while (pp)
+    {
+      if (pp->data->Bg)
+        cout << " on node#" << pp->data->Bg->rank << endl;
+      else
+        cout << " virtual grid segment" << endl;
+      cout << " shape: (";
+      for (int i = 0; i < dim; i++)
+      {
+        if (i < dim - 1)
+          cout << pp->data->shape[i] << ",";
+        else
+          cout << pp->data->shape[i] << ")" << endl;
+      }
+      cout << " range: (";
+      for (int i = 0; i < dim; i++)
+      {
+        if (i < dim - 1)
+          cout << pp->data->llb[i] << ":" << pp->data->uub[i] << ",";
+        else
+          cout << pp->data->llb[i] << ":" << pp->data->uub[i] << ")" << endl;
+      }
+      if (first_only)
+        return;
+      pp = pp->next;
+    }
+  }
+}
+void Parallel::checkvarl(MyList<var> *pp, bool first_only) // debug dump of a var list: prints name, SoA and sgfn of each entry
+{
+  int myrank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  if (myrank == 0) // only world rank 0 prints; all other ranks fall through without output
+  {
+    while (pp)
+    {
+      cout << "name: " << pp->data->name << endl;
+      cout << "SoA = (" << pp->data->SoA[0] << "," << pp->data->SoA[1] << "," << pp->data->SoA[2] << ")" << endl;
+      cout << "sgfn = " << pp->data->sgfn << endl;
+      if (first_only) // caller only wants the head entry
+        return;
+      pp = pp->next;
+    }
+  }
+}
+void Parallel::prepare_inter_time_level(MyList<Patch> *PatL, // list overload: runs the single-Patch version on every patch in PatL
+                                        MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
+                                        MyList<var> *VarList3 /* target (t+a*dt) */, int tindex)
+{
+  while (PatL)
+  {
+    prepare_inter_time_level(PatL->data, VarList1, VarList2, VarList3, tindex); // same var lists and tindex for each patch
+    PatL = PatL->next;
+  }
+}
+void Parallel::prepare_inter_time_level(Patch *Pat, // two-source version: fills VarList3 from VarList1/VarList2 on each block of Pat owned by this rank
+                                        MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
+                                        MyList<var> *VarList3 /* target (t+a*dt) */, int tindex)
+{
+  int myrank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+  MyList<var> *varl1;
+  MyList<var> *varl2;
+  MyList<var> *varl3;
+
+  MyList<Block> *BP = Pat->blb;
+  while (BP)
+  {
+    Block *cg = BP->data;
+    if (myrank == cg->rank) // blocks assigned to other ranks are skipped
+    {
+      varl1 = VarList1; // restart all three lists for every block
+      varl2 = VarList2;
+      varl3 = VarList3;
+      while (varl1) // lists advance in lockstep; only varl1 is null-checked, so VarList2/VarList3 must be at least as long
+      {
+        if (tindex == 0) // tindex picks the averaging routine; accepted values are 0, 1 and -1
+          f_average(cg->shape, cg->fgfs[varl1->data->sgfn], cg->fgfs[varl2->data->sgfn], cg->fgfs[varl3->data->sgfn]);
+        else if (tindex == 1)
+          f_average3(cg->shape, cg->fgfs[varl1->data->sgfn], cg->fgfs[varl2->data->sgfn], cg->fgfs[varl3->data->sgfn]);
+        else if (tindex == -1)
+          // same routine as tindex == 1, with the two source arguments swapped
+          f_average3(cg->shape, cg->fgfs[varl2->data->sgfn], cg->fgfs[varl1->data->sgfn], cg->fgfs[varl3->data->sgfn]);
+        else
+        {
+          cout << "error tindex in Parallel::prepare_inter_time_level" << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1); // unsupported tindex: abort the whole MPI job
+        }
+        varl1 = varl1->next;
+        varl2 = varl2->next;
+        varl3 = varl3->next;
+      }
+    }
+    if (BP == Pat->ble) // stop at the patch's last block (ble) instead of following the list further
+      break;
+    BP = BP->next;
+  }
+}
+void Parallel::prepare_inter_time_level(MyList<Patch> *PatL, // list overload of the three-source version: runs it on every patch in PatL
+                                        MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
+                                        MyList<var> *VarList3 /* source (t-dt) */, MyList<var> *VarList4 /* target (t+a*dt) */, int tindex)
+{
+  while (PatL)
+  {
+    prepare_inter_time_level(PatL->data, VarList1, VarList2, VarList3, VarList4, tindex); // same var lists and tindex for each patch
+    PatL = PatL->next;
+  }
+}
+void Parallel::prepare_inter_time_level(Patch *Pat, // three-source version: fills VarList4 from VarList1..3 on each block of Pat owned by this rank
+                                        MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
+                                        MyList<var> *VarList3 /* source (t-dt) */, MyList<var> *VarList4 /* target (t+a*dt) */, int tindex)
+{
+  int myrank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+  MyList<var> *varl1;
+  MyList<var> *varl2;
+  MyList<var> *varl3;
+  MyList<var> *varl4;
+
+  MyList<Block> *BP = Pat->blb;
+  while (BP)
+  {
+    Block *cg = BP->data;
+    if (myrank == cg->rank) // blocks assigned to other ranks are skipped
+    {
+      varl1 = VarList1; // restart all four lists for every block
+      varl2 = VarList2;
+      varl3 = VarList3;
+      varl4 = VarList4;
+      while (varl1) // lists advance in lockstep; only varl1 is null-checked, so the other lists must be at least as long
+      {
+        if (tindex == 0) // tindex picks the averaging routine; accepted values are 0, 1 and -1
+          f_average2(cg->shape, cg->fgfs[varl1->data->sgfn], cg->fgfs[varl2->data->sgfn],
+                     cg->fgfs[varl3->data->sgfn], cg->fgfs[varl4->data->sgfn]);
+        else if (tindex == 1)
+          f_average2p(cg->shape, cg->fgfs[varl1->data->sgfn], cg->fgfs[varl2->data->sgfn],
+                      cg->fgfs[varl3->data->sgfn], cg->fgfs[varl4->data->sgfn]);
+        else if (tindex == -1)
+          f_average2m(cg->shape, cg->fgfs[varl1->data->sgfn], cg->fgfs[varl2->data->sgfn],
+                      cg->fgfs[varl3->data->sgfn], cg->fgfs[varl4->data->sgfn]);
+        else
+        {
+          cout << "error tindex in long Parallel::prepare_inter_time_level" << endl; // "long" distinguishes this overload from the two-source one
+          MPI_Abort(MPI_COMM_WORLD, 1); // unsupported tindex: abort the whole MPI job
+        }
+        varl1 = varl1->next;
+        varl2 = varl2->next;
+        varl3 = varl3->next;
+        varl4 = varl4->next;
+      }
+    }
+    if (BP == Pat->ble) // stop at the patch's last block (ble) instead of following the list further
+      break;
+    BP = BP->next;
+  }
+}
+void Parallel::Prolong(Patch *Patc, Patch *Patf, // moves VarList1 data of Patc into VarList2 on Patf through transfer()
+                       MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
+                       int Symmetry)
+{
+  if (Patc->lev >= Patf->lev) // Patc must sit on a strictly lower level than Patf
+  {
+    cout << "Parallel::Prolong: meet requst of Prolong from lev#" << Patc->lev << " to lev#" << Patf->lev << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+  int cpusize;
+  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
+
+  MyList<Parallel::gridseg> *dst;
+  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
+  src = new MyList<Parallel::gridseg> *[cpusize]; // one list head per MPI rank in each of the three arrays
+  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
+  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
+
+  dst = build_complete_gsl(Patf); // including ghost
+  for (int node = 0; node < cpusize; node++)
+  {
+    src[node] = build_owned_gsl4(Patc, node, Symmetry);                   // - buffer - ghost - BD ghost
+    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], the data is located on cpu#node
+  }
+
+  transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
+
+  if (dst) // release every segment list built above; entries may be null, hence the guards
+    dst->destroyList();
+  for (int node = 0; node < cpusize; node++)
+  {
+    if (src[node])
+      src[node]->destroyList();
+    if (transfer_src[node])
+      transfer_src[node]->destroyList();
+    if (transfer_dst[node])
+      transfer_dst[node]->destroyList();
+  }
+
+  delete[] src;
+  delete[] transfer_src;
+  delete[] transfer_dst;
+}
+void Parallel::Restrict(MyList<Patch> *PatcL, MyList<Patch> *PatfL, // moves VarList1 data of the PatfL patches into VarList2 on the PatcL patches through transfer()
+                        MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
+                        int Symmetry)
+{
+  if (PatcL->data->lev >= PatfL->data->lev) // only the head patch of each list is level-checked; both lists must be non-empty
+  {
+    cout << "Parallel::Restrict: meet requst of Restrict from lev#" << PatfL->data->lev << " to lev#" << PatcL->data->lev << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+  int cpusize;
+  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
+
+  MyList<Parallel::gridseg> *dst;
+  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
+  src = new MyList<Parallel::gridseg> *[cpusize]; // one list head per MPI rank in each of the three arrays
+  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
+  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
+
+  dst = build_complete_gsl(PatcL); // including ghost
+  for (int node = 0; node < cpusize; node++)
+  {
+#if 0
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif    
+      src[node]=build_owned_gsl(PatfL,node,2,Symmetry);   // - buffer - ghost
+#else
+#ifdef Cell
+      src[node]=build_owned_gsl(PatfL,node,4,Symmetry); // - buffer - ghost - BD ghost
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+#else
+    // active branch (the Vertex/Cell-dependent variant above is compiled out); it seems bam always uses this
+    src[node] = build_owned_gsl(PatfL, node, 2, Symmetry); // - buffer - ghost
+#endif
+    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], the data is located on cpu#node
+  }
+
+  transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
+
+  if (dst) // release every segment list built above; entries may be null, hence the guards
+    dst->destroyList();
+  for (int node = 0; node < cpusize; node++)
+  {
+    if (src[node])
+      src[node]->destroyList();
+    if (transfer_src[node])
+      transfer_src[node]->destroyList();
+    if (transfer_dst[node])
+      transfer_dst[node]->destroyList();
+  }
+
+  delete[] src;
+  delete[] transfer_src;
+  delete[] transfer_dst;
+}
+void Parallel::Restrict_after(MyList<Patch> *PatcL, MyList<Patch> *PatfL, // like Restrict, but builds the source segments with build_owned_gsl type 3 instead of 2
+                              MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
+                              int Symmetry)
+{
+  if (PatcL->data->lev >= PatfL->data->lev) // only the head patch of each list is level-checked; both lists must be non-empty
+  {
+    cout << "Parallel::Restrict_after: meet requst of Restrict from lev#" << PatfL->data->lev << " to lev#" << PatcL->data->lev << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+  int cpusize;
+  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
+
+  MyList<Parallel::gridseg> *dst;
+  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
+  src = new MyList<Parallel::gridseg> *[cpusize]; // one list head per MPI rank in each of the three arrays
+  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
+  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
+
+  dst = build_complete_gsl(PatcL); // including ghost
+  for (int node = 0; node < cpusize; node++)
+  {
+    src[node] = build_owned_gsl(PatfL, node, 3, Symmetry); // - ghost - BD ghost
+
+    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], the data is located on cpu#node
+  }
+
+  transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
+
+  if (dst) // release every segment list built above; entries may be null, hence the guards
+    dst->destroyList();
+  for (int node = 0; node < cpusize; node++)
+  {
+    if (src[node])
+      src[node]->destroyList();
+    if (transfer_src[node])
+      transfer_src[node]->destroyList();
+    if (transfer_dst[node])
+      transfer_dst[node]->destroyList();
+  }
+
+  delete[] src;
+  delete[] transfer_src;
+  delete[] transfer_dst;
+}
+// for the same time level
+void Parallel::OutBdLow2Hi(Patch *Patc, Patch *Patf, // moves VarList1 data of Patc into the buffer segments of Patf (VarList2) through transfer()
+                           MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
+                           int Symmetry)
+{
+  if (Patc->lev >= Patf->lev) // Patc must sit on a strictly lower level than Patf
+  {
+    cout << "Parallel::OutBdLow2Hi: meet requst of Prolong from lev#" << Patc->lev << " to lev#" << Patf->lev << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+  int cpusize;
+  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
+
+  MyList<Parallel::gridseg> *dst;
+  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
+  src = new MyList<Parallel::gridseg> *[cpusize]; // one list head per MPI rank in each of the three arrays
+  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
+  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
+
+  dst = build_buffer_gsl(Patf); // buffer region only
+
+  for (int node = 0; node < cpusize; node++)
+  {
+    src[node] = build_owned_gsl4(Patc, node, Symmetry);                   // - buffer - ghost - BD ghost
+    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], the data is located on cpu#node
+  }
+
+  transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
+
+  if (dst) // release every segment list built above; entries may be null, hence the guards
+    dst->destroyList();
+  for (int node = 0; node < cpusize; node++)
+  {
+    if (src[node])
+      src[node]->destroyList();
+    if (transfer_src[node])
+      transfer_src[node]->destroyList();
+    if (transfer_dst[node])
+      transfer_dst[node]->destroyList();
+  }
+
+  delete[] src;
+  delete[] transfer_src;
+  delete[] transfer_dst;
+}
+void Parallel::OutBdLow2Hi(MyList<Patch> *PatcL, MyList<Patch> *PatfL, // patch-list overload: one transfer() covering all patches of both lists
+                           MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
+                           int Symmetry)
+{
+  MyList<Patch> *Pp, *Ppc;
+  Ppc = PatcL;
+  while (Ppc) // check every (source, target) patch pair before doing any work
+  {
+    Pp = PatfL;
+    while (Pp)
+    {
+      if (Ppc->data->lev >= Pp->data->lev) // each source patch must sit on a strictly lower level than each target patch
+      {
+        cout << "Parallel::OutBdLow2Hi(list): meet requst of Prolong from lev#" << Ppc->data->lev << " to lev#" << Pp->data->lev << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+      Pp = Pp->next;
+    }
+    Ppc = Ppc->next;
+  }
+
+  int cpusize;
+  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
+
+  MyList<Parallel::gridseg> *dst;
+  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
+  src = new MyList<Parallel::gridseg> *[cpusize]; // one list head per MPI rank in each of the three arrays
+  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
+  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
+
+  dst = build_buffer_gsl(PatfL); // buffer region only
+
+  for (int node = 0; node < cpusize; node++)
+  {
+    src[node] = build_owned_gsl(PatcL, node, 4, Symmetry);                // - buffer - ghost - BD ghost
+    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], the data is located on cpu#node
+  }
+
+  transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
+
+  if (dst) // release every segment list built above; entries may be null, hence the guards
+    dst->destroyList();
+  for (int node = 0; node < cpusize; node++)
+  {
+    if (src[node])
+      src[node]->destroyList();
+    if (transfer_src[node])
+      transfer_src[node]->destroyList();
+    if (transfer_dst[node])
+      transfer_dst[node]->destroyList();
+  }
+
+  delete[] src;
+  delete[] transfer_src;
+  delete[] transfer_dst;
+}
+// OutBdLow2Himix, single-patch overload, for the same time level: fills the buffer segments of Patf from Patc via transfermix
+void Parallel::OutBdLow2Himix(Patch *Patc, Patch *Patf,
+                              MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
+                              int Symmetry)
+{
+  if (Patc->lev >= Patf->lev) // Patc must be on a strictly lower level than Patf
+  {
+    cout << "Parallel::OutBdLow2Himix: meet requst of Prolong from lev#" << Patc->lev << " to lev#" << Patf->lev << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1); // wrong level ordering: abort the whole MPI job
+  }
+  int cpusize;
+  MPI_Comm_size(MPI_COMM_WORLD, &cpusize); // number of ranks; one entry per rank is built below
+
+  MyList<Parallel::gridseg> *dst;
+  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst; // arrays indexed by rank (node)
+  src = new MyList<Parallel::gridseg> *[cpusize];
+  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
+  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
+
+  dst = build_buffer_gsl(Patf); // buffer region only
+
+  for (int node = 0; node < cpusize; node++)
+  {
+    src[node] = build_owned_gsl4(Patc, node, Symmetry);                   // - buffer - ghost - BD ghost
+    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], the data are located on cpu#node
+  }
+
+  transfermix(transfer_src, transfer_dst, VarList1, VarList2, Symmetry); // the "mix" variant of transfer: VarList1 (source) -> VarList2 (target)
+
+  if (dst) // release every temporary segment list; entries may be null, hence the guards
+    dst->destroyList();
+  for (int node = 0; node < cpusize; node++)
+  {
+    if (src[node])
+      src[node]->destroyList();
+    if (transfer_src[node])
+      transfer_src[node]->destroyList();
+    if (transfer_dst[node])
+      transfer_dst[node]->destroyList();
+  }
+
+  delete[] src; // the per-rank pointer arrays themselves
+  delete[] transfer_src;
+  delete[] transfer_dst;
+
+  // A Sync is not needed here: it is done after this routine is called in ProlongRestrict or RestrictProlong
+  //    Sync(Patf,VarList2,Symmetry);  // fine level points may be not enough for interpolation
+}
+void Parallel::OutBdLow2Himix(MyList<Patch> *PatcL, MyList<Patch> *PatfL, // list overload: fills the buffer segments of the PatfL patches from the PatcL patches via transfermix
+                              MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
+                              int Symmetry)
+{
+  MyList<Patch> *Pp, *Ppc;
+  Ppc = PatcL; // sanity check below: every (PatcL, PatfL) pair must satisfy lev(PatcL entry) < lev(PatfL entry)
+  while (Ppc)
+  {
+    Pp = PatfL; // restart the inner scan for each PatcL entry
+    while (Pp)
+    {
+      if (Ppc->data->lev >= Pp->data->lev)
+      {
+        cout << "Parallel::OutBdLow2Himix(list): meet requst of Prolong from lev#" << Ppc->data->lev << " to lev#" << Pp->data->lev << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1); // wrong level ordering: abort the whole MPI job
+      }
+      Pp = Pp->next;
+    }
+    Ppc = Ppc->next;
+  }
+
+  int cpusize;
+  MPI_Comm_size(MPI_COMM_WORLD, &cpusize); // number of ranks; one entry per rank is built below
+
+  MyList<Parallel::gridseg> *dst;
+  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst; // arrays indexed by rank (node)
+  src = new MyList<Parallel::gridseg> *[cpusize];
+  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
+  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
+
+  dst = build_buffer_gsl(PatfL); // buffer region only
+
+  for (int node = 0; node < cpusize; node++)
+  {
+    src[node] = build_owned_gsl(PatcL, node, 4, Symmetry);                // - buffer - ghost - BD ghost
+    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], the data are located on cpu#node
+  }
+
+  transfermix(transfer_src, transfer_dst, VarList1, VarList2, Symmetry); // the "mix" variant of transfer: VarList1 (source) -> VarList2 (target)
+
+  if (dst) // release every temporary segment list; entries may be null, hence the guards
+    dst->destroyList();
+  for (int node = 0; node < cpusize; node++)
+  {
+    if (src[node])
+      src[node]->destroyList();
+    if (transfer_src[node])
+      transfer_src[node]->destroyList();
+    if (transfer_dst[node])
+      transfer_dst[node]->destroyList();
+  }
+
+  delete[] src; // the per-rank pointer arrays themselves
+  delete[] transfer_src;
+  delete[] transfer_dst;
+}
+
+// Restrict_cached: builds the per-rank source/destination grid segment lists once (while !cache.valid), keeps them in cache, then moves VarList1 -> VarList2 via transfer_cached, which reuses the cached buffers
+void Parallel::Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
+                               MyList<var> *VarList1, MyList<var> *VarList2,
+                               int Symmetry, SyncCache &cache)
+{
+  if (!cache.valid) // segment lists not built yet; valid is set to true at the end of this branch
+  {
+    int cpusize;
+    MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
+    cache.cpusize = cpusize;
+
+    if (!cache.combined_src) // bookkeeping arrays are allocated only when missing; a later rebuild reuses them
+    {
+      cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
+      cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
+      cache.send_lengths = new int[cpusize];
+      cache.recv_lengths = new int[cpusize];
+      cache.send_bufs = new double *[cpusize];
+      cache.recv_bufs = new double *[cpusize];
+      cache.send_buf_caps = new int[cpusize];
+      cache.recv_buf_caps = new int[cpusize];
+      cache.send_buf_pinned = new unsigned char[cpusize];
+      cache.recv_buf_pinned = new unsigned char[cpusize];
+      cache.send_buf_is_dev = new unsigned char[cpusize];
+      cache.recv_buf_is_dev = new unsigned char[cpusize];
+      cache.send_buf_caps_dev = new int[cpusize];
+      cache.recv_buf_caps_dev = new int[cpusize];
+      cache.send_bufs_dev = new double *[cpusize];
+      cache.recv_bufs_dev = new double *[cpusize];
+      for (int i = 0; i < cpusize; i++) // every rank starts with null buffers, zero capacities and cleared flags
+      {
+        cache.send_bufs[i] = cache.recv_bufs[i] = 0;
+        cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
+        cache.send_buf_pinned[i] = cache.recv_buf_pinned[i] = 0;
+        cache.send_buf_is_dev[i] = cache.recv_buf_is_dev[i] = 0;
+        cache.send_buf_caps_dev[i] = cache.recv_buf_caps_dev[i] = 0;
+        cache.send_bufs_dev[i] = cache.recv_bufs_dev[i] = 0;
+      }
+      cache.max_reqs = 2 * cpusize; // NOTE(review): presumably one send plus one receive request per rank - confirm in transfer_cached
+      cache.reqs = new MPI_Request[cache.max_reqs];
+      cache.stats = new MPI_Status[cache.max_reqs];
+      cache.tc_req_node = new int[cache.max_reqs];
+      cache.tc_req_is_recv = new int[cache.max_reqs];
+      cache.tc_completed = new int[cache.max_reqs];
+    }
+
+    MyList<Parallel::gridseg> *dst = build_complete_gsl(PatcL); // destination side: segment list built from the PatcL patches
+    for (int node = 0; node < cpusize; node++)
+    {
+      MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatfL, node, 2, Symmetry); // source side: PatfL segments for rank 'node' (the meaning of argument 2 is defined by build_owned_gsl)
+      build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]); // NOTE(review): overwrites combined_src/dst[node]; on a rebuild, confirm the invalidation path has already destroyed the previous lists
+      if (src_owned) src_owned->destroyList(); // temporary list, released once build_gstl has run
+    }
+    if (dst) dst->destroyList();
+
+    cache.valid = true;
+  }
+
+  transfer_cached(cache.combined_src, cache.combined_dst, VarList1, VarList2, Symmetry, cache); // runs on every call, using the cached lists
+}
+
+// OutBdLow2Hi_cached: builds the per-rank source/destination grid segment lists once (while !cache.valid), keeps them in cache, then moves VarList1 -> VarList2 via transfer_cached, which reuses the cached buffers
+void Parallel::OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
+                                  MyList<var> *VarList1, MyList<var> *VarList2,
+                                  int Symmetry, SyncCache &cache)
+{
+  if (!cache.valid) // segment lists not built yet; valid is set to true at the end of this branch
+  {
+    int cpusize;
+    MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
+    cache.cpusize = cpusize;
+
+    if (!cache.combined_src) // bookkeeping arrays are allocated only when missing; a later rebuild reuses them
+    {
+      cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
+      cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
+      cache.send_lengths = new int[cpusize];
+      cache.recv_lengths = new int[cpusize];
+      cache.send_bufs = new double *[cpusize];
+      cache.recv_bufs = new double *[cpusize];
+      cache.send_buf_caps = new int[cpusize];
+      cache.recv_buf_caps = new int[cpusize];
+      cache.send_buf_pinned = new unsigned char[cpusize];
+      cache.recv_buf_pinned = new unsigned char[cpusize];
+      cache.send_buf_is_dev = new unsigned char[cpusize];
+      cache.recv_buf_is_dev = new unsigned char[cpusize];
+      cache.send_buf_caps_dev = new int[cpusize];
+      cache.recv_buf_caps_dev = new int[cpusize];
+      cache.send_bufs_dev = new double *[cpusize];
+      cache.recv_bufs_dev = new double *[cpusize];
+      for (int i = 0; i < cpusize; i++) // every rank starts with null buffers, zero capacities and cleared flags
+      {
+        cache.send_bufs[i] = cache.recv_bufs[i] = 0;
+        cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
+        cache.send_buf_pinned[i] = cache.recv_buf_pinned[i] = 0;
+        cache.send_buf_is_dev[i] = cache.recv_buf_is_dev[i] = 0;
+        cache.send_buf_caps_dev[i] = cache.recv_buf_caps_dev[i] = 0;
+        cache.send_bufs_dev[i] = cache.recv_bufs_dev[i] = 0;
+      }
+      cache.max_reqs = 2 * cpusize; // NOTE(review): presumably one send plus one receive request per rank - confirm in transfer_cached
+      cache.reqs = new MPI_Request[cache.max_reqs];
+      cache.stats = new MPI_Status[cache.max_reqs];
+      cache.tc_req_node = new int[cache.max_reqs];
+      cache.tc_req_is_recv = new int[cache.max_reqs];
+      cache.tc_completed = new int[cache.max_reqs];
+    }
+
+    MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL); // destination side: buffer segments of the PatfL patches
+    for (int node = 0; node < cpusize; node++)
+    {
+      MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatcL, node, 4, Symmetry); // source side: PatcL segments for rank 'node' (the meaning of argument 4 is defined by build_owned_gsl)
+      build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]); // NOTE(review): overwrites combined_src/dst[node]; on a rebuild, confirm the invalidation path has already destroyed the previous lists
+      if (src_owned) src_owned->destroyList(); // temporary list, released once build_gstl has run
+    }
+    if (dst) dst->destroyList();
+
+    cache.valid = true;
+  }
+
+  transfer_cached(cache.combined_src, cache.combined_dst, VarList1, VarList2, Symmetry, cache); // runs on every call, using the cached lists
+}
+
+// OutBdLow2Himix_cached: same as OutBdLow2Hi_cached but uses transfermix for unpacking
+void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
+                                     MyList<var> *VarList1, MyList<var> *VarList2,
+                                     int Symmetry, SyncCache &cache)
+{
+  if (!cache.valid)
+  {
+    int cpusize;
+    MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
+    cache.cpusize = cpusize;
+
+    if (!cache.combined_src)
+    {
+      cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
+      cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
+      cache.send_lengths = new int[cpusize];
+      cache.recv_lengths = new int[cpusize];
       cache.send_bufs = new double *[cpusize];
       cache.recv_bufs = new double *[cpusize];
       cache.send_buf_caps = new int[cpusize];
@@ -6163,1212 +6732,1108 @@ void Parallel::Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
         cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
         cache.send_buf_pinned[i] = cache.recv_buf_pinned[i] = 0;
       }
-      cache.max_reqs = 2 * cpusize;
-      cache.reqs = new MPI_Request[cache.max_reqs];
-      cache.stats = new MPI_Status[cache.max_reqs];
-      cache.tc_req_node = new int[cache.max_reqs];
-      cache.tc_req_is_recv = new int[cache.max_reqs];
-      cache.tc_completed = new int[cache.max_reqs];
-    }
-
-    MyList<Parallel::gridseg> *dst = build_complete_gsl(PatcL);
-    for (int node = 0; node < cpusize; node++)
-    {
-      MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatfL, node, 2, Symmetry);
-      build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]);
-      if (src_owned) src_owned->destroyList();
-    }
-    if (dst) dst->destroyList();
-
-    cache.valid = true;
-  }
-
-  transfer_cached(cache.combined_src, cache.combined_dst, VarList1, VarList2, Symmetry, cache);
-}
-
-// OutBdLow2Hi_cached: cache grid segment lists, reuse buffers via transfer_cached
-void Parallel::OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
-                                  MyList<var> *VarList1, MyList<var> *VarList2,
-                                  int Symmetry, SyncCache &cache)
-{
-  if (!cache.valid)
-  {
-    int cpusize;
-    MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-    cache.cpusize = cpusize;
-
-    if (!cache.combined_src)
-    {
-      cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
-      cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
-      cache.send_lengths = new int[cpusize];
-      cache.recv_lengths = new int[cpusize];
-      cache.send_bufs = new double *[cpusize];
-      cache.recv_bufs = new double *[cpusize];
-      cache.send_buf_caps = new int[cpusize];
-      cache.recv_buf_caps = new int[cpusize];
-      cache.send_buf_pinned = new unsigned char[cpusize];
-      cache.recv_buf_pinned = new unsigned char[cpusize];
-      for (int i = 0; i < cpusize; i++)
-      {
-        cache.send_bufs[i] = cache.recv_bufs[i] = 0;
-        cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
-        cache.send_buf_pinned[i] = cache.recv_buf_pinned[i] = 0;
-      }
-      cache.max_reqs = 2 * cpusize;
-      cache.reqs = new MPI_Request[cache.max_reqs];
-      cache.stats = new MPI_Status[cache.max_reqs];
-      cache.tc_req_node = new int[cache.max_reqs];
-      cache.tc_req_is_recv = new int[cache.max_reqs];
-      cache.tc_completed = new int[cache.max_reqs];
-    }
-
-    MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
-    for (int node = 0; node < cpusize; node++)
-    {
-      MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatcL, node, 4, Symmetry);
-      build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]);
-      if (src_owned) src_owned->destroyList();
-    }
-    if (dst) dst->destroyList();
-
-    cache.valid = true;
-  }
-
-  transfer_cached(cache.combined_src, cache.combined_dst, VarList1, VarList2, Symmetry, cache);
-}
-
-// OutBdLow2Himix_cached: same as OutBdLow2Hi_cached but uses transfermix for unpacking
-void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
-                                     MyList<var> *VarList1, MyList<var> *VarList2,
-                                     int Symmetry, SyncCache &cache)
-{
-  if (!cache.valid)
-  {
-    int cpusize;
-    MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-    cache.cpusize = cpusize;
-
-    if (!cache.combined_src)
-    {
-      cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
-      cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
-      cache.send_lengths = new int[cpusize];
-      cache.recv_lengths = new int[cpusize];
-      cache.send_bufs = new double *[cpusize];
-      cache.recv_bufs = new double *[cpusize];
-      cache.send_buf_caps = new int[cpusize];
-      cache.recv_buf_caps = new int[cpusize];
-      cache.send_buf_pinned = new unsigned char[cpusize];
-      cache.recv_buf_pinned = new unsigned char[cpusize];
-      for (int i = 0; i < cpusize; i++)
-      {
-        cache.send_bufs[i] = cache.recv_bufs[i] = 0;
-        cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
-        cache.send_buf_pinned[i] = cache.recv_buf_pinned[i] = 0;
-      }
-      cache.max_reqs = 2 * cpusize;
-      cache.reqs = new MPI_Request[cache.max_reqs];
-      cache.stats = new MPI_Status[cache.max_reqs];
-      cache.tc_req_node = new int[cache.max_reqs];
-      cache.tc_req_is_recv = new int[cache.max_reqs];
-      cache.tc_completed = new int[cache.max_reqs];
-    }
-
-    MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
-    for (int node = 0; node < cpusize; node++)
-    {
-      MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatcL, node, 4, Symmetry);
-      build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]);
-      if (src_owned) src_owned->destroyList();
-    }
-    if (dst) dst->destroyList();
-
-    cache.valid = true;
-  }
-
-  // Use transfermix instead of transfer for mix-mode interpolation
-  int myrank;
-  MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize);
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-  int cpusize = cache.cpusize;
-
-  int req_no = 0;
-  int pending_recv = 0;
-  int *req_node = new int[cache.max_reqs];
-  int *req_is_recv = new int[cache.max_reqs];
-  int *completed = new int[cache.max_reqs];
-
-  // Post receives first so peers can progress rendezvous early.
-  for (int node = 0; node < cpusize; node++)
-  {
-    if (node == myrank) continue;
-
-    int rlength = data_packermix(0, cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
+      cache.max_reqs = 2 * cpusize;
+      cache.reqs = new MPI_Request[cache.max_reqs];
+      cache.stats = new MPI_Status[cache.max_reqs];
+      cache.tc_req_node = new int[cache.max_reqs];
+      cache.tc_req_is_recv = new int[cache.max_reqs];
+      cache.tc_completed = new int[cache.max_reqs];
+    }
+
+    MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
+    for (int node = 0; node < cpusize; node++)
+    {
+      MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatcL, node, 4, Symmetry);
+      build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]);
+      if (src_owned) src_owned->destroyList();
+    }
+    if (dst) dst->destroyList();
+
+    cache.valid = true;
+  }
+
+  // Use transfermix instead of transfer for mix-mode interpolation
+  int myrank;
+  MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  int cpusize = cache.cpusize;
+
+  int req_no = 0;
+  int pending_recv = 0;
+  int *req_node = new int[cache.max_reqs];
+  int *req_is_recv = new int[cache.max_reqs];
+  int *completed = new int[cache.max_reqs];
+
+  // Post receives first so peers can progress rendezvous early.
+  for (int node = 0; node < cpusize; node++)
+  {
+    if (node == myrank) continue;
+
+    int rlength = data_packermix(0, cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
     cache.recv_lengths[node] = rlength;
     if (rlength > 0)
     {
       ensure_comm_buffer(cache.recv_bufs, cache.recv_buf_pinned, cache.recv_buf_caps, node, rlength);
       MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
-      req_node[req_no] = node;
-      req_is_recv[req_no] = 1;
-      req_no++;
-      pending_recv++;
-    }
-  }
-
-  // Local transfer on this rank.
-  int self_len = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
+      req_node[req_no] = node;
+      req_is_recv[req_no] = 1;
+      req_no++;
+      pending_recv++;
+    }
+  }
+
+  // Local transfer on this rank.
+  int self_len = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
   cache.recv_lengths[myrank] = self_len;
   if (self_len > 0)
   {
     ensure_comm_buffer(cache.recv_bufs, cache.recv_buf_pinned, cache.recv_buf_caps, myrank, self_len);
     data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
   }
-
-  // Pack and post sends.
-  for (int node = 0; node < cpusize; node++)
-  {
-    if (node == myrank) continue;
-
-    int slength = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
+
+  // Pack and post sends.
+  for (int node = 0; node < cpusize; node++)
+  {
+    if (node == myrank) continue;
+
+    int slength = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
     cache.send_lengths[node] = slength;
     if (slength > 0)
     {
       ensure_comm_buffer(cache.send_bufs, cache.send_buf_pinned, cache.send_buf_caps, node, slength);
       data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
-      MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
-      req_node[req_no] = node;
-      req_is_recv[req_no] = 0;
-      req_no++;
-    }
-  }
-
-  // Unpack as soon as receive completes to reduce pure wait time.
-  while (pending_recv > 0)
-  {
-    int outcount = 0;
-    MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats);
-    if (outcount == MPI_UNDEFINED) break;
-
-    for (int i = 0; i < outcount; i++)
-    {
-      int idx = completed[i];
-      if (idx >= 0 && req_is_recv[idx])
-      {
-        int recv_node_i = req_node[idx];
-        data_packermix(cache.recv_bufs[recv_node_i], cache.combined_src[recv_node_i], cache.combined_dst[recv_node_i], recv_node_i, UNPACK, VarList1, VarList2, Symmetry);
-        pending_recv--;
-      }
-    }
-  }
-
-  if (req_no > 0) MPI_Waitall(req_no, cache.reqs, cache.stats);
-
-  if (self_len > 0)
-    data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
-
-  delete[] req_node;
-  delete[] req_is_recv;
-  delete[] completed;
-}
-
-// collect all buffer grid segments or blocks for given patch
-MyList<Parallel::gridseg> *Parallel::build_buffer_gsl(Patch *Pat)
-{
-  MyList<Parallel::gridseg> *cgsl, *gsc, *gsb;
-
-  gsc = build_complete_gsl(Pat); // including ghost
-
-  gsb = new MyList<Parallel::gridseg>;
-  gsb->data = new Parallel::gridseg;
-
-  for (int i = 0; i < dim; i++)
-  {
-    double DH = Pat->blb->data->getdX(i);
-    gsb->data->uub[i] = Pat->bbox[dim + i] - Pat->uui[i] * DH;
-    gsb->data->llb[i] = Pat->bbox[i] + Pat->lli[i] * DH;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-    gsb->data->shape[i] = int((gsb->data->uub[i] - gsb->data->llb[i]) / DH + 0.4) + 1;
-#else
-#ifdef Cell
-    gsb->data->shape[i] = int((gsb->data->uub[i] - gsb->data->llb[i]) / DH + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-  }
-  gsb->data->Bg = 0;
-  gsb->next = 0;
-
-  cgsl = gsl_subtract(gsc, gsb);
-
-  gsc->destroyList();
-  gsb->destroyList();
-
-  //  set illb and iuub
-  gsb = cgsl;
-  while (gsb)
-  {
-    for (int i = 0; i < dim; i++)
-    {
-      double DH = Pat->blb->data->getdX(i);
-      gsb->data->iuub[i] = Pat->bbox[dim + i] - Pat->uui[i] * DH;
-      gsb->data->illb[i] = Pat->bbox[i] + Pat->lli[i] * DH;
-    }
-    gsb = gsb->next;
-  }
-
-  return cgsl;
-}
-MyList<Parallel::gridseg> *Parallel::build_buffer_gsl(MyList<Patch> *PatL)
-{
-  MyList<Parallel::gridseg> *cgsl = 0, *gs;
-  while (PatL)
-  {
-    if (cgsl)
-    {
-      gs->next = build_buffer_gsl(PatL->data);
-      gs = gs->next;
-      if (gs)
-        while (gs->next)
-          gs = gs->next;
-    }
-    else
-    {
-      cgsl = build_buffer_gsl(PatL->data);
-      gs = cgsl;
-      if (gs)
-        while (gs->next)
-          gs = gs->next;
-    }
-    PatL = PatL->next;
-  }
-
-  return cgsl;
-}
-void Parallel::Prolongint(Patch *Patc, Patch *Patf,
-                          MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
-                          int Symmetry)
-{
-  if (Patc->lev >= Patf->lev)
-  {
-    cout << "Parallel::Prolong: meet requst of Prolong from lev#" << Patc->lev << " to lev#" << Patf->lev << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-
-  int myrank = 0;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
-  int num_var = 0;
-  MyList<var> *varl;
-  varl = VarList1;
-  while (varl)
-  {
-    num_var++;
-    varl = varl->next;
-  }
-
-  MyList<Block> *BP = Patf->blb;
-  while (BP)
-  {
-    int Npts;
-    if (myrank == BP->data->rank)
-      Npts = BP->data->shape[0] * BP->data->shape[1] * BP->data->shape[2];
-    MPI_Bcast(&Npts, 1, MPI_INT, BP->data->rank, MPI_COMM_WORLD);
-    double *pox[3];
-    for (int i = 0; i < 3; i++)
-      pox[i] = new double[Npts];
-    if (myrank == BP->data->rank)
-    {
-      for (int i = 0; i < Npts; i++)
-      {
-        int ind[3];
-        Parallel::getarrayindex(3, BP->data->shape, ind, i);
-        pox[0][i] = BP->data->X[0][ind[0]];
-        pox[1][i] = BP->data->X[1][ind[1]];
-        pox[2][i] = BP->data->X[2][ind[2]];
-      }
-    }
-    for (int i = 0; i < 3; i++)
-      MPI_Bcast(pox[i], Npts, MPI_DOUBLE, BP->data->rank, MPI_COMM_WORLD);
-    double *res;
-    res = new double[num_var * Npts];
-    Patc->Interp_Points(VarList1, Npts, pox, res, Symmetry); // because this operation is a global operation (for all processors)
-                                                             // we have to isolate it out of myrank==BP->data->rank
-    if (myrank == BP->data->rank)
-    {
-      for (int i = 0; i < Npts; i++)
-      {
-        varl = VarList2;
-        int j = 0;
-        while (varl)
-        {
-          (BP->data->fgfs[varl->data->sgfn])[i] = res[j + i * num_var];
-          j++;
-          varl = varl->next;
-        }
-      }
-    }
-    delete[] pox[0];
-    delete[] pox[1];
-    delete[] pox[2];
-    delete[] res;
-    BP = BP->next;
-  }
-}
-//
-void Parallel::merge_gsl(MyList<gridseg> *&A, const double ratio)
-{
-  if (!A)
-    return;
-
-  MyList<gridseg> *B, *C, *D = A;
-  bool flag = false;
-  while (D->next)
-  {
-    B = D->next;
-    while (B)
-    {
-      flag = merge_gs(D, B, C, ratio);
-      if (flag)
-        break;
-      B = B->next;
-    }
-    if (flag)
-      break;
-    D = D->next;
-  }
-
-  if (flag)
-  {
-    // delete D and B from A
-    MyList<gridseg> *E = A;
-    while (E->next)
-    {
-      MyList<gridseg> *tp = E->next;
-      if (D == tp || B == tp)
-      {
-        E->next = (tp->next) ? tp->next : 0;
-        delete tp->data;
-        delete tp;
-      }
-      if (E->next)
-        E = E->next;
-    }
-
-    if (D == A)
-    {
-      MyList<gridseg> *tp = A;
-      A = (A->next) ? A->next : 0;
-      delete tp->data;
-      delete tp;
-    }
-    // cat C to A
-    if (A)
-      A->catList(C);
-    else
-      A = C;
-
-    merge_gsl(A, ratio);
-  }
-}
-//
-bool Parallel::merge_gs(MyList<gridseg> *D, MyList<gridseg> *B, MyList<gridseg> *&C, const double ratio)
-{
-  if (!B || !D)
-    return false;
-
-  C = 0;
-  double llb[dim], uub[dim], DH[dim];
-  for (int i = 0; i < dim; i++)
-  {
-    double tdh;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-    DH[i] = (D->data->uub[i] - D->data->llb[i]) / (D->data->shape[i] - 1);
-    tdh = (B->data->uub[i] - B->data->llb[i]) / (B->data->shape[i] - 1);
-#else
-#ifdef Cell
-    DH[i] = (D->data->uub[i] - D->data->llb[i]) / D->data->shape[i];
-    tdh = (B->data->uub[i] - B->data->llb[i]) / B->data->shape[i];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-    if (!feq(DH[i], tdh, DH[i] / 2))
-    {
-      cout << "Parallel::merge_gs meets different grid segment " << DH[i] << " vs " << tdh << endl;
-      checkgsl(B, true);
-      checkgsl(D, true);
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-    llb[i] = Mymax(D->data->llb[i], B->data->llb[i]);
-    uub[i] = Mymin(D->data->uub[i], B->data->uub[i]);
-    //    if(uub[i]-llb[i] < DH[i]/2) return false;  //here this is valid for both vertex and cell
-
-    // use 0 instead of DH[i]/2, we consider contact case, 2012 Aug 8
-    if (uub[i] - llb[i] < 0)
-      return false; // here this is valid for both vertex and cell
-  }
-
-  // vb: volume of B
-  // vd: volume of D
-  // vo: volume of overlap
-  // vt: volume of smallest common box (virtual merged box)
-  double vd = 1, vb = 1, vt = 1, vo = 1;
-  for (int i = 0; i < dim; i++)
-  {
-    vt = vt * (Mymax(D->data->uub[i], B->data->uub[i]) - Mymin(D->data->llb[i], B->data->llb[i]));
-    vo = vo * (uub[i] - llb[i]);
-    vd = vd * (D->data->uub[i] - D->data->llb[i]);
-    vb = vb * (B->data->uub[i] - B->data->llb[i]);
-  }
-
-  // smller ratio, more possible to merge
-  if ((vd + vb - vo) / vt > ratio)
-  {
-    C = new MyList<gridseg>;
-    C->data = new gridseg;
-    for (int i = 0; i < dim; i++)
-    {
-      C->data->uub[i] = Mymax(D->data->uub[i], B->data->uub[i]);
-      C->data->llb[i] = Mymin(D->data->llb[i], B->data->llb[i]);
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-      C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / DH[i] + 0.4) + 1;
-#else
-#ifdef Cell
-      C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / DH[i] + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-    }
-    if (D->data->Bg == B->data->Bg)
-      C->data->Bg = D->data->Bg;
-    else
-      C->data->Bg = 0;
-
-    C->next = 0;
-
-    return true;
-  }
-  else
-  {
-    return false;
-  }
-}
-// Add ghost region to tangent plane
-// we assume the grids have the same resolution
-void Parallel::add_ghost_touch(MyList<gridseg> *&A)
-{
-  if (!A || !(A->next))
-    return;
-
-  double DH[dim];
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-  for (int i = 0; i < dim; i++)
-    DH[i] = (A->data->uub[i] - A->data->llb[i]) / (A->data->shape[i] - 1) / 2;
-#else
-#ifdef Cell
-  for (int i = 0; i < dim; i++)
-    DH[i] = (A->data->uub[i] - A->data->llb[i]) / A->data->shape[i] / 2;
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-
-  MyList<gridseg> *C1, *C2, *A1 = A, *A2, *dc;
-  dc = C1 = clone_gsl(A, false);
-  while (C1)
-  {
-    C2 = C1->next;
-    A2 = A1->next;
-    while (C2)
-    {
-      for (int i = 0; i < dim; i++)
-      {
-        if (feq(C1->data->llb[i], C2->data->uub[i], DH[i]))
-        {
-          // direction i touch, other directions overlap
-          bool flag = true;
-          for (int j = 0; j < i; j++)
-            if ((C1->data->llb[j] - C2->data->llb[j]) * (C1->data->uub[j] - C2->data->llb[j]) > 0 &&
-                (C2->data->llb[j] - C1->data->llb[j]) * (C2->data->uub[j] - C1->data->llb[j]) > 0)
-              flag = false;
-          for (int j = i + 1; j < dim; j++)
-            if ((C1->data->llb[j] - C2->data->llb[j]) * (C1->data->uub[j] - C2->data->llb[j]) > 0 &&
-                (C2->data->llb[j] - C1->data->llb[j]) * (C2->data->uub[j] - C1->data->llb[j]) > 0)
-              flag = false;
-
-          if (flag)
-          {
-            // only add one ghost region
-            if (feq(A1->data->llb[i], C1->data->llb[i], DH[i]))
-            {
-              A1->data->llb[i] -= ghost_width * 2 * DH[i];
-              A1->data->shape[i] += ghost_width;
-            }
-            if (feq(A2->data->uub[i], C2->data->uub[i], DH[i]))
-            {
-              A2->data->uub[i] += ghost_width * 2 * DH[i];
-              A2->data->shape[i] += ghost_width;
-            }
-          }
-        }
-        if (feq(C1->data->uub[i], C2->data->llb[i], DH[i]))
-        {
-          // direction i touch, other directions overlap
-          bool flag = true;
-          for (int j = 0; j < i; j++)
-            if ((C1->data->llb[j] - C2->data->llb[j]) * (C1->data->uub[j] - C2->data->llb[j]) > 0 &&
-                (C2->data->llb[j] - C1->data->llb[j]) * (C2->data->uub[j] - C1->data->llb[j]) > 0)
-              flag = false;
-          for (int j = i + 1; j < dim; j++)
-            if ((C1->data->llb[j] - C2->data->llb[j]) * (C1->data->uub[j] - C2->data->llb[j]) > 0 &&
-                (C2->data->llb[j] - C1->data->llb[j]) * (C2->data->uub[j] - C1->data->llb[j]) > 0)
-              flag = false;
-
-          if (flag)
-          {
-            // only add one ghost region
-            if (feq(A1->data->uub[i], C1->data->uub[i], DH[i]))
-            {
-              A1->data->uub[i] += ghost_width * 2 * DH[i];
-              A1->data->shape[i] += ghost_width;
-            }
-            if (feq(A2->data->llb[i], C2->data->llb[i], DH[i]))
-            {
-              A2->data->llb[i] -= ghost_width * 2 * DH[i];
-              A2->data->shape[i] += ghost_width;
-            }
-          }
-        }
-      }
-      C2 = C2->next;
-      A2 = A2->next;
-    }
-    C1 = C1->next;
-    A1 = A1->next;
-  }
-
-  if (dc)
-    dc->destroyList();
-}
-// According to overlap to cut the gsl into recular pices
-void Parallel::cut_gsl(MyList<gridseg> *&A)
-{
-  if (!A)
-    return;
-
-  MyList<gridseg> *B, *C, *D = A;
-  bool flag = false;
-  while (D->next)
-  {
-    B = D->next;
-    while (B)
-    {
-      flag = cut_gs(D, B, C);
-      if (flag)
-        break;
-      B = B->next;
-    }
-    if (flag)
-      break;
-    D = D->next;
-  }
-
-  if (flag)
-  {
-    // delete D and B from A
-    MyList<gridseg> *E = A;
-    while (E->next)
-    {
-      MyList<gridseg> *tp = E->next;
-      if (D == tp || B == tp)
-      {
-        E->next = (tp->next) ? tp->next : 0;
-        delete tp->data;
-        delete tp;
-      }
-      if (E->next)
-        E = E->next;
-    }
-
-    if (D == A)
-    {
-      MyList<gridseg> *tp = A;
-      A = (A->next) ? A->next : 0;
-      delete tp->data;
-      delete tp;
-    }
-    // cat C to A
-    if (A)
-      A->catList(C);
-    else
-      A = C;
-
-    cut_gsl(A);
-  }
-}
-// when D and B have overlap, cut them into C and return true
-// otherwise return false and C=0
-bool Parallel::cut_gs(MyList<gridseg> *D, MyList<gridseg> *B, MyList<gridseg> *&C)
-{
-  C = 0;
-  double llb[dim], uub[dim], DH[dim];
-  for (int i = 0; i < dim; i++)
-  {
-    double tdh;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-    DH[i] = (D->data->uub[i] - D->data->llb[i]) / (D->data->shape[i] - 1);
-    tdh = (B->data->uub[i] - B->data->llb[i]) / (B->data->shape[i] - 1);
-#else
-#ifdef Cell
-    DH[i] = (D->data->uub[i] - D->data->llb[i]) / D->data->shape[i];
-    tdh = (B->data->uub[i] - B->data->llb[i]) / B->data->shape[i];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-    if (!feq(DH[i], tdh, DH[i] / 2))
-    {
-      cout << "Parallel::cut_gs meets different grid segment " << DH[i] << " vs " << tdh << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-    llb[i] = Mymax(D->data->llb[i], B->data->llb[i]);
-    uub[i] = Mymin(D->data->uub[i], B->data->uub[i]);
-    // for efficiency we ask the width of the patch at least 2(buffer+ghost+BD ghost)
-    if (uub[i] - llb[i] < DH[i] * 2 * (buffer_width + 2 * ghost_width))
-      return false; // here this is valid for both vertex and cell
-  }
-
-  // this part code results in 5 patches generally
-
-  C = new MyList<gridseg>;
-  C->data = new gridseg;
-  for (int i = 0; i < dim; i++)
-  {
-    C->data->llb[i] = llb[i];
-    C->data->uub[i] = uub[i];
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-    C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / DH[i] + 0.4) + 1;
-#else
-#ifdef Cell
-    C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / DH[i] + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-  }
-  if (D->data->Bg == B->data->Bg)
-    C->data->Bg = D->data->Bg;
-  else
-    C->data->Bg = 0;
-
-  C->next = gs_subtract_virtual(D, C);
-
-  MyList<gridseg> *E = C;
-
-  while (E->next)
-    E = E->next;
-
-  E->next = gs_subtract_virtual(B, C);
-
-  // this part code results in 3 patches generally
-  /*
-       C = clone_gsl(D,true);
-       C->next = gs_subtract_virtual(B,C);
-  */
-
-  return true;
-}
-// note here it is different to real cut, we need leave the cutting edge for both vertex center and cell center
-MyList<Parallel::gridseg> *Parallel::gs_subtract_virtual(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B)
-{
-  if (!A)
-    return 0;
-  if (!B)
-    return clone_gsl(A, true);
-
-  double cut_plane[2 * dim], DH[dim];
-
-  for (int i = 0; i < dim; i++)
-  {
-    double tdh;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-    DH[i] = (A->data->uub[i] - A->data->llb[i]) / (A->data->shape[i] - 1);
-    tdh = (B->data->uub[i] - B->data->llb[i]) / (B->data->shape[i] - 1);
-#else
-#ifdef Cell
-    DH[i] = (A->data->uub[i] - A->data->llb[i]) / A->data->shape[i];
-    tdh = (B->data->uub[i] - B->data->llb[i]) / B->data->shape[i];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-    if (!feq(DH[i], tdh, DH[i] / 2))
-    {
-      cout << "Parallel::gs_subtract_virtual meets different grid segment " << DH[i] << " vs " << tdh << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-  }
-
-  MyList<Parallel::gridseg> *C = 0, *q;
-  for (int i = 0; i < dim; i++)
-  {
-    if (B->data->llb[i] > A->data->uub[i] || B->data->uub[i] < A->data->llb[i])
-      return clone_gsl(A, true);
-    cut_plane[i] = A->data->llb[i];
-    cut_plane[i + dim] = A->data->uub[i];
-  }
-
-  for (int i = 0; i < dim; i++)
-  {
-    cut_plane[i] = Mymax(A->data->llb[i], B->data->llb[i]);
-    if (cut_plane[i] > A->data->llb[i])
-    {
-      q = clone_gsl(A, true);
-      // prolong the list from head
-      if (C)
-        q->next = C;
-      C = q;
-      for (int j = 0; j < dim; j++)
-      {
-        if (i == j)
-        {
-          C->data->llb[i] = A->data->llb[i];
-          // **note here it is different to real cut, we need leave the cutting edge for both vertex center and cell center**
-          C->data->uub[i] = Mymax(C->data->llb[i], cut_plane[i]);
-        }
-        else
-        {
-          C->data->llb[j] = cut_plane[j];
-          C->data->uub[j] = cut_plane[j + dim];
-        }
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4) + 1;
-#else
-#ifdef Cell
-        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      }
-    }
-
-    cut_plane[i + dim] = Mymin(A->data->uub[i], B->data->uub[i]);
-    if (cut_plane[i + dim] < A->data->uub[i])
-    {
-      q = clone_gsl(A, true);
-      if (C)
-        q->next = C;
-      C = q;
-      for (int j = 0; j < dim; j++)
-      {
-        if (i == j)
-        {
-          C->data->uub[i] = A->data->uub[i];
-          // note here it is different to real cut, we need leave the cutting edge for both vertex center and cell center
-          C->data->llb[i] = Mymin(C->data->uub[i], cut_plane[i + dim]);
-        }
-        else
-        {
-          C->data->llb[j] = cut_plane[j];
-          C->data->uub[j] = cut_plane[j + dim];
-        }
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4) + 1;
-#else
-#ifdef Cell
-        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-      }
-    }
-  }
-  return C;
-}
-// note the data structure
-// if CC is true
-// 1   -----------  1   ------  ^
-//                  0   ------  |  t
-// 0   -----------  old ------  |
-//
-// old -----------
-// if CC is false
-// 1   -----------  1   ------  ^
-// 0   -----------  0   ------  |  t
-// old -----------  old ------  |
-void Parallel::fill_level_data(MyList<Patch> *PatLd, MyList<Patch> *PatLs, MyList<Patch> *PatcL,
-                               MyList<var> *OldList, MyList<var> *StateList, MyList<var> *FutureList,
-                               MyList<var> *tmList, int Symmetry, bool BB, bool CC)
-{
-  if (PatLd->data->lev != PatLs->data->lev)
-  {
-    cout << "Parallel::fill_level_data: meet requst from lev#" << PatLs->data->lev << " to lev#" << PatLd->data->lev << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-  if (PatLd->data->lev <= PatcL->data->lev)
-  {
-    cout << "Parallel::fill_level_data: meet prolong requst from lev#" << PatcL->data->lev << " to lev#" << PatLd->data->lev << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-
-  int cpusize;
-  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-
-  MyList<var> *VarList = 0;
-  MyList<var> *p;
-  p = StateList;
-  while (p)
-  {
-    if (VarList)
-      VarList->insert(p->data);
-    else
-      VarList = new MyList<var>(p->data);
-    p = p->next;
-  }
-  p = FutureList;
-  while (p)
-  {
-    if (VarList)
-      VarList->insert(p->data);
-    else
-      VarList = new MyList<var>(p->data);
-    p = p->next;
-  }
-
-  MyList<Parallel::gridseg> *dst;
-  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
-  src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
-  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
-
-  dst = build_complete_gsl(PatLd); // including ghost
-  // copy part
-  for (int node = 0; node < cpusize; node++)
-  {
-    src[node] = build_owned_gsl(PatLs, node, 0, Symmetry);                // similar to Sync
-    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
-  }
-
-  transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
-
-  for (int node = 0; node < cpusize; node++)
-  {
-    if (src[node])
-      src[node]->destroyList();
-    if (transfer_src[node])
-      transfer_src[node]->destroyList();
-    if (transfer_dst[node])
-      transfer_dst[node]->destroyList();
-  }
-
-  MyList<Parallel::gridseg> *dsts, *dstd;
-  dsts = build_complete_gsl_virtual(PatLs);
-  dstd = dst;
-  dst = gsl_subtract(dstd, dsts);
-  if (dstd)
-    dstd->destroyList();
-  if (dsts)
-    dsts->destroyList();
-
-  if (dst)
-  {
-    // prolongation part
-    for (int node = 0; node < cpusize; node++)
-    {
-      src[node] = build_owned_gsl(PatcL, node, 4, Symmetry);                // - buffer - ghost - BD ghost
-      build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
-    }
-
-    if (CC)
-    {
-      // for FutureList
-      // restrict first~~~>
-      {
-        Restrict(PatcL, PatLs, FutureList, FutureList, Symmetry);
-        Sync(PatcL, FutureList, Symmetry);
-      }
-      //<~~~prolong then
-      transfer(transfer_src, transfer_dst, FutureList, FutureList, Symmetry);
-
-      // for StateList
-      // time interpolation part
-      if (BB)
-        prepare_inter_time_level(PatcL, FutureList, StateList, OldList,
-                                 tmList, 0); // use SynchList_pre as temporal storage space
-      else
-        prepare_inter_time_level(PatcL, FutureList, StateList,
-                                 tmList, 0); // use SynchList_pre as temporal storage space
-                                             // restrict first~~~>
-      {
-        Restrict(PatcL, PatLs, StateList, tmList, Symmetry);
-        Sync(PatcL, tmList, Symmetry);
-      }
-      //<~~~prolong then
-      transfer(transfer_src, transfer_dst, tmList, StateList, Symmetry);
-    }
-    else
-    {
-      // for both FutureList and StateList
-      // restrict first~~~>
-      {
-        Restrict(PatcL, PatLs, VarList, VarList, Symmetry);
-        Sync(PatcL, VarList, Symmetry);
-      }
-      //<~~~prolong then
-      transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
-    }
-
-    for (int node = 0; node < cpusize; node++)
-    {
-      if (src[node])
-        src[node]->destroyList();
-      if (transfer_src[node])
-        transfer_src[node]->destroyList();
-      if (transfer_dst[node])
-        transfer_dst[node]->destroyList();
-    }
-
-    dst->destroyList();
-  }
-
-  delete[] src;
-  delete[] transfer_src;
-  delete[] transfer_dst;
-
-  VarList->clearList();
-}
-void Parallel::KillBlocks(MyList<Patch> *PatchLIST)
-{
-  while (PatchLIST)
-  {
-    Patch *Pp = PatchLIST->data;
-    MyList<Block> *bg;
-    while (Pp->blb)
-    {
-      if (Pp->blb == Pp->ble)
-        break;
-      bg = (Pp->blb->next) ? Pp->blb->next : 0;
-      delete Pp->blb->data;
-      delete Pp->blb;
-      Pp->blb = bg;
-    }
-    if (Pp->ble)
-    {
-      delete Pp->ble->data;
-      delete Pp->ble;
-    }
-    Pp->blb = Pp->ble = 0;
-    PatchLIST = PatchLIST->next;
-  }
-}
-bool Parallel::PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
-                                     int NN, double **XX,
-                                     double *Shellf, int Symmetry)
-{
-  MyList<var> *varl;
-  int num_var = 0;
-  varl = VarList;
-  while (varl)
-  {
-    num_var++;
-    varl = varl->next;
-  }
-
-  double lld[dim], uud[dim];
-  double **pox;
-  pox = new double *[dim];
-  for (int j = 0; j < dim; j++)
-    pox[j] = new double[1];
-  for (int i = 0; i < NN; i++)
-  {
-    MyList<Patch> *PL = PatL;
-    while (PL)
-    {
-      bool flag = true;
-      for (int j = 0; j < dim; j++)
-      {
-        double h = PL->data->getdX(j);
-        lld[j] = PL->data->lli[j] * h;
-        uud[j] = PL->data->uui[j] * h;
-        if (XX[j][i] < PL->data->bbox[j] + lld[j] || XX[j][i] > PL->data->bbox[j + dim] - uud[j])
-        {
-          flag = false;
-          break;
-        }
-        pox[j][0] = XX[j][i];
-      }
-      if (flag)
-      {
-        PL->data->Interp_Points(VarList, 1, pox, Shellf + i * num_var, Symmetry);
-        break;
-      }
-      PL = PL->next;
-    }
-    if (!PL)
-    {
-      checkpatchlist(PatL, false);
-      return false;
-    }
-  }
-  for (int j = 0; j < dim; j++)
-    delete[] pox[j];
-  delete[] pox;
-
-  return true;
-}
-bool Parallel::PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
-                                     int NN, double **XX,
-                                     double *Shellf, int Symmetry, MPI_Comm Comm_here)
-{
-  MyList<var> *varl;
-  int num_var = 0;
-  varl = VarList;
-  while (varl)
-  {
-    num_var++;
-    varl = varl->next;
-  }
-
-  double lld[dim], uud[dim];
-  double **pox;
-  pox = new double *[dim];
-  for (int j = 0; j < dim; j++)
-    pox[j] = new double[1];
-  for (int i = 0; i < NN; i++)
-  {
-    MyList<Patch> *PL = PatL;
-    while (PL)
-    {
-      bool flag = true;
-      for (int j = 0; j < dim; j++)
-      {
-        double h = PL->data->getdX(j);
-        lld[j] = PL->data->lli[j] * h;
-        uud[j] = PL->data->uui[j] * h;
-        if (XX[j][i] < PL->data->bbox[j] + lld[j] || XX[j][i] > PL->data->bbox[j + dim] - uud[j])
-        {
-          flag = false;
-          break;
-        }
-        pox[j][0] = XX[j][i];
-      }
-      if (flag)
-      {
-        PL->data->Interp_Points(VarList, 1, pox, Shellf + i * num_var, Symmetry, Comm_here);
-        break;
-      }
-      PL = PL->next;
-    }
-    if (!PL)
-    {
-      checkpatchlist(PatL, false);
-      return false;
-    }
-  }
-  for (int j = 0; j < dim; j++)
-    delete[] pox[j];
-  delete[] pox;
-
-  return true;
-}
-void Parallel::aligncheck(double *bbox0, double *bboxl, int lev, double *DH0, int *shape)
-{
-  const double aligntiny = 0.1;
-  double DHl, rr;
-  int NN;
-  for (int i = 0; i < dim; i++)
-  {
-    DHl = DH0[i] * pow(0.5, lev);
-    rr = bboxl[i] - bbox0[i];
-    bboxl[i] = bbox0[i] + int(rr / DHl + 0.4) * DHl;
-    rr = bbox0[i + dim] - bboxl[i + dim];
-    bboxl[i + dim] = bbox0[i + dim] - int(rr / DHl + 0.4) * DHl;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-    NN = int((bboxl[i + dim] - bboxl[i]) / DHl + 0.4) + 1;
-#else
-#ifdef Cell
-    NN = int((bboxl[i + dim] - bboxl[i]) / DHl + 0.4);
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-    if (NN != shape[i])
-    {
-      int myrank;
-      MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-      if (myrank == 0)
-      {
-        cout << "Parallel::aligncheck want shape " << NN << " for lev#" << lev << ", but " << shape[i] << endl;
-        cout << "i = " << i << ", low = " << bboxl[i] << ", up = " << bboxl[i + dim] << endl;
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-    }
-  }
-}
-bool Parallel::point_locat_gsl(double *pox, MyList<Parallel::gridseg> *gsl)
-{
-  bool flag = false;
-  while (gsl)
-  {
-    for (int i = 0; i < dim; i++)
-    {
-      if (pox[i] > gsl->data->llb[i] && pox[i] < gsl->data->uub[i])
-        flag = true;
-      else
-      {
-        flag = false;
-        break;
-      }
-    }
-    if (flag)
-      break;
-    gsl = gsl->next;
-  }
-
-  return flag;
-}
-void Parallel::checkpatchlist(MyList<Patch> *PatL, bool buflog)
-{
-  MyList<Patch> *PL = PatL;
-  while (PL)
-  {
-    PL->data->checkPatch(buflog);
-    PL = PL->next;
-  }
-}
+      MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
+      req_node[req_no] = node;
+      req_is_recv[req_no] = 0;
+      req_no++;
+    }
+  }
+
+  // Unpack as soon as receive completes to reduce pure wait time.
+  while (pending_recv > 0)
+  {
+    int outcount = 0;
+    MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats);
+    if (outcount == MPI_UNDEFINED) break;
+
+    for (int i = 0; i < outcount; i++)
+    {
+      int idx = completed[i];
+      if (idx >= 0 && req_is_recv[idx])
+      {
+        int recv_node_i = req_node[idx];
+        data_packermix(cache.recv_bufs[recv_node_i], cache.combined_src[recv_node_i], cache.combined_dst[recv_node_i], recv_node_i, UNPACK, VarList1, VarList2, Symmetry);
+        pending_recv--;
+      }
+    }
+  }
+
+  if (req_no > 0) MPI_Waitall(req_no, cache.reqs, cache.stats);
+
+  if (self_len > 0)
+    data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
+
+  delete[] req_node;
+  delete[] req_is_recv;
+  delete[] completed;
+}
+
+// Build the buffer grid segments of one patch: build_complete_gsl(Pat) (including ghost) minus the patch box pulled inward by lli/uui grid spacings; returns the list produced by gsl_subtract
+MyList<Parallel::gridseg> *Parallel::build_buffer_gsl(Patch *Pat)
+{
+  MyList<Parallel::gridseg> *cgsl, *gsc, *gsb;
+
+  gsc = build_complete_gsl(Pat); // including ghost
+  // gsb: one-segment list holding Pat->bbox pulled inward by lli[i]*DH on the low side and uui[i]*DH on the high side
+  gsb = new MyList<Parallel::gridseg>;
+  gsb->data = new Parallel::gridseg;
+  // shape below: extent/DH truncated after adding 0.4 (presumably to absorb round-off), plus 1 in the Vertex build
+  for (int i = 0; i < dim; i++)
+  {
+    double DH = Pat->blb->data->getdX(i);
+    gsb->data->uub[i] = Pat->bbox[dim + i] - Pat->uui[i] * DH;
+    gsb->data->llb[i] = Pat->bbox[i] + Pat->lli[i] * DH;
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+    gsb->data->shape[i] = int((gsb->data->uub[i] - gsb->data->llb[i]) / DH + 0.4) + 1;
+#else
+#ifdef Cell
+    gsb->data->shape[i] = int((gsb->data->uub[i] - gsb->data->llb[i]) / DH + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+  }
+  gsb->data->Bg = 0;
+  gsb->next = 0;
+
+  cgsl = gsl_subtract(gsc, gsb);
+  // NOTE(review): gsc is destroyed without a null check; this assumes build_complete_gsl(Pat) returned a list - confirm
+  gsc->destroyList();
+  gsb->destroyList();
+
+  //  set illb and iuub: every returned segment stores the bounds of that same pulled-in box
+  gsb = cgsl;
+  while (gsb)
+  {
+    for (int i = 0; i < dim; i++)
+    {
+      double DH = Pat->blb->data->getdX(i);
+      gsb->data->iuub[i] = Pat->bbox[dim + i] - Pat->uui[i] * DH;
+      gsb->data->illb[i] = Pat->bbox[i] + Pat->lli[i] * DH;
+    }
+    gsb = gsb->next;
+  }
+
+  return cgsl;
+}
+MyList<Parallel::gridseg> *Parallel::build_buffer_gsl(MyList<Patch> *PatL)
+{ // Concatenate the per-patch buffer segment lists of every patch in PatL; returns the head (0 if nothing was produced)
+  MyList<Parallel::gridseg> *cgsl = 0, *gs; // cgsl: head of the result; gs: its current tail, valid once cgsl is set
+  while (PatL)
+  {
+    if (cgsl)
+    {
+      // gs is the non-null tail of cgsl: append, then walk to the new tail. If this patch
+      // contributed nothing, gs->next is null and gs stays put instead of becoming null.
+      gs->next = build_buffer_gsl(PatL->data);
+      while (gs->next)
+        gs = gs->next;
+    }
+    else
+    {
+      cgsl = build_buffer_gsl(PatL->data);
+      gs = cgsl;
+      if (gs)
+        while (gs->next)
+          gs = gs->next;
+    }
+    PatL = PatL->next;
+  }
+
+  return cgsl;
+}
+void Parallel::Prolongint(Patch *Patc, Patch *Patf,
+                          MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
+                          int Symmetry)
+{ // For every block of Patf: the owning rank broadcasts the block's grid coordinates, all ranks interpolate VarList1 on Patc there, and the owner stores the values into VarList2
+  if (Patc->lev >= Patf->lev)
+  {
+    cout << "Parallel::Prolong: meet requst of Prolong from lev#" << Patc->lev << " to lev#" << Patf->lev << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+
+  int myrank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  // num_var = number of entries in VarList1; it is the per-point stride of res below
+  int num_var = 0;
+  MyList<var> *varl;
+  varl = VarList1;
+  while (varl)
+  {
+    num_var++;
+    varl = varl->next;
+  }
+
+  MyList<Block> *BP = Patf->blb;
+  while (BP)
+  {
+    int Npts; // set by the owning rank just below; the other ranks receive it through the broadcast
+    if (myrank == BP->data->rank)
+      Npts = BP->data->shape[0] * BP->data->shape[1] * BP->data->shape[2];
+    MPI_Bcast(&Npts, 1, MPI_INT, BP->data->rank, MPI_COMM_WORLD);
+    double *pox[3];
+    for (int i = 0; i < 3; i++)
+      pox[i] = new double[Npts];
+    if (myrank == BP->data->rank)
+    {
+      for (int i = 0; i < Npts; i++)
+      {
+        int ind[3];
+        Parallel::getarrayindex(3, BP->data->shape, ind, i); // presumably splits linear index i into per-direction indices - confirm
+        pox[0][i] = BP->data->X[0][ind[0]];
+        pox[1][i] = BP->data->X[1][ind[1]];
+        pox[2][i] = BP->data->X[2][ind[2]];
+      }
+    }
+    for (int i = 0; i < 3; i++)
+      MPI_Bcast(pox[i], Npts, MPI_DOUBLE, BP->data->rank, MPI_COMM_WORLD);
+    double *res;
+    res = new double[num_var * Npts];
+    Patc->Interp_Points(VarList1, Npts, pox, res, Symmetry); // this is a global operation (all processors take part),
+                                                             // so it has to stay outside the myrank==BP->data->rank branches
+    if (myrank == BP->data->rank)
+    {
+      for (int i = 0; i < Npts; i++)
+      {
+        varl = VarList2; // NOTE(review): res holds num_var (length of VarList1) values per point; assumes VarList2 is not longer - confirm
+        int j = 0;
+        while (varl)
+        {
+          (BP->data->fgfs[varl->data->sgfn])[i] = res[j + i * num_var];
+          j++;
+          varl = varl->next;
+        }
+      }
+    }
+    delete[] pox[0];
+    delete[] pox[1];
+    delete[] pox[2];
+    delete[] res;
+    BP = BP->next;
+  }
+}
+// Merge segments of list A pairwise, as decided by merge_gs with the given ratio, until no pair qualifies; A may end up with a new head
+void Parallel::merge_gsl(MyList<gridseg> *&A, const double ratio)
+{
+  if (!A)
+    return;
+  // find the first pair D (earlier) / B (later) that merge_gs accepts; C then holds the merged segment
+  MyList<gridseg> *B, *C, *D = A;
+  bool flag = false;
+  while (D->next)
+  {
+    B = D->next;
+    while (B)
+    {
+      flag = merge_gs(D, B, C, ratio);
+      if (flag)
+        break;
+      B = B->next;
+    }
+    if (flag)
+      break;
+    D = D->next;
+  }
+
+  if (flag)
+  {
+    // delete D and B from A (this loop never removes the head; a head D is handled right after it)
+    MyList<gridseg> *E = A;
+    while (E->next)
+    {
+      MyList<gridseg> *tp = E->next;
+      if (D == tp || B == tp)
+      {
+        E->next = tp->next; // unlink, but keep E: the new successor may be the other node to delete (B right behind D)
+        delete tp->data;
+        delete tp;
+      }
+      else
+        E = E->next;
+    }
+
+    if (D == A)
+    {
+      MyList<gridseg> *tp = A;
+      A = A->next;
+      delete tp->data;
+      delete tp;
+    }
+    // cat C to A
+    if (A)
+      A->catList(C);
+    else
+      A = C;
+    // two segments were replaced by one; look for further mergeable pairs
+    merge_gsl(A, ratio);
+  }
+}
+// Decide whether segments D and B should be merged; if so, C receives a new single segment covering their common bounding box and true is returned
+bool Parallel::merge_gs(MyList<gridseg> *D, MyList<gridseg> *B, MyList<gridseg> *&C, const double ratio)
+{
+  if (!B || !D)
+    return false; // C is not modified on this path
+
+  C = 0;
+  double llb[dim], uub[dim], DH[dim];
+  for (int i = 0; i < dim; i++)
+  {
+    double tdh;
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+    DH[i] = (D->data->uub[i] - D->data->llb[i]) / (D->data->shape[i] - 1);
+    tdh = (B->data->uub[i] - B->data->llb[i]) / (B->data->shape[i] - 1);
+#else
+#ifdef Cell
+    DH[i] = (D->data->uub[i] - D->data->llb[i]) / D->data->shape[i];
+    tdh = (B->data->uub[i] - B->data->llb[i]) / B->data->shape[i];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+    if (!feq(DH[i], tdh, DH[i] / 2))
+    {
+      cout << "Parallel::merge_gs meets different grid segment " << DH[i] << " vs " << tdh << endl;
+      checkgsl(B, true);
+      checkgsl(D, true);
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+    llb[i] = Mymax(D->data->llb[i], B->data->llb[i]);
+    uub[i] = Mymin(D->data->uub[i], B->data->uub[i]);
+    //    if(uub[i]-llb[i] < DH[i]/2) return false;  //here this is valid for both vertex and cell
+
+    // use 0 instead of DH[i]/2 so that segments which merely touch still count, 2012 Aug 8
+    if (uub[i] - llb[i] < 0)
+      return false; // here this is valid for both vertex and cell
+  }
+
+  // vb: volume of B
+  // vd: volume of D
+  // vo: volume of overlap
+  // vt: volume of smallest common box (virtual merged box)
+  double vd = 1, vb = 1, vt = 1, vo = 1;
+  for (int i = 0; i < dim; i++)
+  {
+    vt = vt * (Mymax(D->data->uub[i], B->data->uub[i]) - Mymin(D->data->llb[i], B->data->llb[i]));
+    vo = vo * (uub[i] - llb[i]);
+    vd = vd * (D->data->uub[i] - D->data->llb[i]);
+    vb = vb * (B->data->uub[i] - B->data->llb[i]);
+  }
+
+  // (vd + vb - vo) / vt is the fraction of the common box covered by D and B; merge only when it exceeds ratio, so a smaller ratio makes merging more likely
+  if ((vd + vb - vo) / vt > ratio)
+  {
+    C = new MyList<gridseg>;
+    C->data = new gridseg;
+    for (int i = 0; i < dim; i++)
+    {
+      C->data->uub[i] = Mymax(D->data->uub[i], B->data->uub[i]);
+      C->data->llb[i] = Mymin(D->data->llb[i], B->data->llb[i]);
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+      C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / DH[i] + 0.4) + 1;
+#else
+#ifdef Cell
+      C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / DH[i] + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+    }
+    if (D->data->Bg == B->data->Bg) // Bg is kept only when both inputs share it
+      C->data->Bg = D->data->Bg;
+    else
+      C->data->Bg = 0;
+
+    C->next = 0;
+
+    return true;
+  }
+  else
+  {
+    return false;
+  }
+}
+// Add a ghost region across faces where two segments of A touch: the touching bound is moved outward by ghost_width grid spacings
+// we assume the grids have the same resolution (DH is taken from the first segment only)
+void Parallel::add_ghost_touch(MyList<gridseg> *&A)
+{
+  if (!A || !(A->next))
+    return;
+  // DH[i] is HALF the grid spacing of the first segment (note the trailing / 2); it is also the tolerance passed to feq below
+  double DH[dim];
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+  for (int i = 0; i < dim; i++)
+    DH[i] = (A->data->uub[i] - A->data->llb[i]) / (A->data->shape[i] - 1) / 2;
+#else
+#ifdef Cell
+  for (int i = 0; i < dim; i++)
+    DH[i] = (A->data->uub[i] - A->data->llb[i]) / A->data->shape[i] / 2;
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+  // C1/C2 walk a clone of A that is never written, while A1/A2 walk A itself in lockstep; touch tests therefore use the original bounds
+  MyList<gridseg> *C1, *C2, *A1 = A, *A2, *dc;
+  dc = C1 = clone_gsl(A, false);
+  while (C1)
+  {
+    C2 = C1->next;
+    A2 = A1->next;
+    while (C2)
+    {
+      for (int i = 0; i < dim; i++)
+      {
+        if (feq(C1->data->llb[i], C2->data->uub[i], DH[i]))
+        {
+          // lower face of C1 touches upper face of C2 in direction i; flag survives only if, in every other direction, one segment's llb lies within the other's [llb, uub]
+          bool flag = true;
+          for (int j = 0; j < i; j++)
+            if ((C1->data->llb[j] - C2->data->llb[j]) * (C1->data->uub[j] - C2->data->llb[j]) > 0 &&
+                (C2->data->llb[j] - C1->data->llb[j]) * (C2->data->uub[j] - C1->data->llb[j]) > 0)
+              flag = false;
+          for (int j = i + 1; j < dim; j++)
+            if ((C1->data->llb[j] - C2->data->llb[j]) * (C1->data->uub[j] - C2->data->llb[j]) > 0 &&
+                (C2->data->llb[j] - C1->data->llb[j]) * (C2->data->uub[j] - C1->data->llb[j]) > 0)
+              flag = false;
+
+          if (flag)
+          {
+            // only add one ghost region: a bound is moved only while feq still matches it to the clone's; 2 * DH[i] is one full grid spacing
+            if (feq(A1->data->llb[i], C1->data->llb[i], DH[i]))
+            {
+              A1->data->llb[i] -= ghost_width * 2 * DH[i];
+              A1->data->shape[i] += ghost_width;
+            }
+            if (feq(A2->data->uub[i], C2->data->uub[i], DH[i]))
+            {
+              A2->data->uub[i] += ghost_width * 2 * DH[i];
+              A2->data->shape[i] += ghost_width;
+            }
+          }
+        }
+        if (feq(C1->data->uub[i], C2->data->llb[i], DH[i]))
+        {
+          // mirror case: upper face of C1 touches lower face of C2 in direction i; same overlap test in the other directions
+          bool flag = true;
+          for (int j = 0; j < i; j++)
+            if ((C1->data->llb[j] - C2->data->llb[j]) * (C1->data->uub[j] - C2->data->llb[j]) > 0 &&
+                (C2->data->llb[j] - C1->data->llb[j]) * (C2->data->uub[j] - C1->data->llb[j]) > 0)
+              flag = false;
+          for (int j = i + 1; j < dim; j++)
+            if ((C1->data->llb[j] - C2->data->llb[j]) * (C1->data->uub[j] - C2->data->llb[j]) > 0 &&
+                (C2->data->llb[j] - C1->data->llb[j]) * (C2->data->uub[j] - C1->data->llb[j]) > 0)
+              flag = false;
+
+          if (flag)
+          {
+            // only add one ghost region: same guard as above, applied to the upper bound of A1 and the lower bound of A2
+            if (feq(A1->data->uub[i], C1->data->uub[i], DH[i]))
+            {
+              A1->data->uub[i] += ghost_width * 2 * DH[i];
+              A1->data->shape[i] += ghost_width;
+            }
+            if (feq(A2->data->llb[i], C2->data->llb[i], DH[i]))
+            {
+              A2->data->llb[i] -= ghost_width * 2 * DH[i];
+              A2->data->shape[i] += ghost_width;
+            }
+          }
+        }
+      }
+      C2 = C2->next;
+      A2 = A2->next;
+    }
+    C1 = C1->next;
+    A1 = A1->next;
+  }
+  // release the clone (dc still points at its head)
+  if (dc)
+    dc->destroyList();
+}
+// Cut overlapping segments of list A into regular pieces: while cut_gs accepts some pair, that pair is replaced by the list cut_gs builds; A may end up with a new head
+void Parallel::cut_gsl(MyList<gridseg> *&A)
+{
+  if (!A)
+    return;
+  // find the first pair D (earlier) / B (later) that cut_gs accepts; C then holds the replacement list
+  MyList<gridseg> *B, *C, *D = A;
+  bool flag = false;
+  while (D->next)
+  {
+    B = D->next;
+    while (B)
+    {
+      flag = cut_gs(D, B, C);
+      if (flag)
+        break;
+      B = B->next;
+    }
+    if (flag)
+      break;
+    D = D->next;
+  }
+
+  if (flag)
+  {
+    // delete D and B from A (this loop never removes the head; a head D is handled right after it)
+    MyList<gridseg> *E = A;
+    while (E->next)
+    {
+      MyList<gridseg> *tp = E->next;
+      if (D == tp || B == tp)
+      {
+        E->next = tp->next; // unlink, but keep E: the new successor may be the other node to delete (B right behind D)
+        delete tp->data;
+        delete tp;
+      }
+      else
+        E = E->next;
+    }
+
+    if (D == A)
+    {
+      MyList<gridseg> *tp = A;
+      A = A->next;
+      delete tp->data;
+      delete tp;
+    }
+    // cat C to A
+    if (A)
+      A->catList(C);
+    else
+      A = C;
+    // the list changed; scan again for further pairs to cut
+    cut_gsl(A);
+  }
+}
+// when D and B overlap widely enough, build list C = overlap segment, then gs_subtract_virtual(D, C), then gs_subtract_virtual(B, C), and return true
+// otherwise return false and C=0
+bool Parallel::cut_gs(MyList<gridseg> *D, MyList<gridseg> *B, MyList<gridseg> *&C)
+{
+  C = 0; // NOTE(review): D and B are dereferenced below without a null check (merge_gs has one); callers must pass non-null
+  double llb[dim], uub[dim], DH[dim];
+  for (int i = 0; i < dim; i++)
+  {
+    double tdh;
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+    DH[i] = (D->data->uub[i] - D->data->llb[i]) / (D->data->shape[i] - 1);
+    tdh = (B->data->uub[i] - B->data->llb[i]) / (B->data->shape[i] - 1);
+#else
+#ifdef Cell
+    DH[i] = (D->data->uub[i] - D->data->llb[i]) / D->data->shape[i];
+    tdh = (B->data->uub[i] - B->data->llb[i]) / B->data->shape[i];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+    if (!feq(DH[i], tdh, DH[i] / 2))
+    {
+      cout << "Parallel::cut_gs meets different grid segment " << DH[i] << " vs " << tdh << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+    llb[i] = Mymax(D->data->llb[i], B->data->llb[i]);
+    uub[i] = Mymin(D->data->uub[i], B->data->uub[i]);
+    // for efficiency we require the overlap to be at least 2(buffer+ghost+BD ghost) grid spacings wide in every direction
+    if (uub[i] - llb[i] < DH[i] * 2 * (buffer_width + 2 * ghost_width))
+      return false; // here this is valid for both vertex and cell
+  }
+
+  // this part code results in 5 patches generally
+  // head of C: the overlap box [llb, uub] computed above
+  C = new MyList<gridseg>;
+  C->data = new gridseg;
+  for (int i = 0; i < dim; i++)
+  {
+    C->data->llb[i] = llb[i];
+    C->data->uub[i] = uub[i];
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+    C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / DH[i] + 0.4) + 1;
+#else
+#ifdef Cell
+    C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / DH[i] + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+  }
+  if (D->data->Bg == B->data->Bg) // Bg is kept only when both inputs share it
+    C->data->Bg = D->data->Bg;
+  else
+    C->data->Bg = 0;
+
+  C->next = gs_subtract_virtual(D, C);
+
+  MyList<gridseg> *E = C;
+  // walk to the current tail of C before appending the result for B
+  while (E->next)
+    E = E->next;
+
+  E->next = gs_subtract_virtual(B, C);
+
+  // this part code results in 3 patches generally
+  /*
+       C = clone_gsl(D,true);
+       C->next = gs_subtract_virtual(B,C);
+  */
+
+  return true;
+}
+// Pieces of the first segment of A lying outside the first segment of B; unlike the real cut, the cutting edge is kept in the pieces for both vertex- and cell-centered grids
+MyList<Parallel::gridseg> *Parallel::gs_subtract_virtual(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B)
+{
+  if (!A)
+    return 0; // nothing to subtract from
+  if (!B)
+    return clone_gsl(A, true); // nothing to remove: hand back a copy (first_only = true)
+
+  double cut_plane[2 * dim], DH[dim]; // cut_plane[i] / cut_plane[i + dim]: lower / upper plane along axis i; DH[i]: grid spacing of A
+
+  for (int i = 0; i < dim; i++)
+  {
+    double tdh; // grid spacing of B along axis i
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+    DH[i] = (A->data->uub[i] - A->data->llb[i]) / (A->data->shape[i] - 1);
+    tdh = (B->data->uub[i] - B->data->llb[i]) / (B->data->shape[i] - 1);
+#else
+#ifdef Cell
+    DH[i] = (A->data->uub[i] - A->data->llb[i]) / A->data->shape[i];
+    tdh = (B->data->uub[i] - B->data->llb[i]) / B->data->shape[i];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+    if (!feq(DH[i], tdh, DH[i] / 2)) // A and B must share the grid spacing (feq presumably compares within the given tolerance - TODO confirm)
+    {
+      cout << "Parallel::gs_subtract_virtual meets different grid segment " << DH[i] << " vs " << tdh << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+  }
+
+  MyList<Parallel::gridseg> *C = 0, *q; // C: head of the result list; q: freshly cloned piece
+  for (int i = 0; i < dim; i++)
+  {
+    if (B->data->llb[i] > A->data->uub[i] || B->data->uub[i] < A->data->llb[i]) // no overlap along axis i
+      return clone_gsl(A, true); // B removes nothing, so the result is a copy of A
+    cut_plane[i] = A->data->llb[i]; // start from the full extent of A; narrowed axis by axis below
+    cut_plane[i + dim] = A->data->uub[i];
+  }
+
+  for (int i = 0; i < dim; i++)
+  {
+    cut_plane[i] = Mymax(A->data->llb[i], B->data->llb[i]); // lower edge of the overlap along axis i
+    if (cut_plane[i] > A->data->llb[i]) // part of A lies below B along axis i
+    {
+      q = clone_gsl(A, true);
+      // prepend the new piece to the result list
+      if (C)
+        q->next = C;
+      C = q;
+      for (int j = 0; j < dim; j++)
+      {
+        if (i == j)
+        {
+          C->data->llb[i] = A->data->llb[i];
+          // unlike the real cut, the piece keeps the cutting edge itself (vertex- and cell-centered alike)
+          C->data->uub[i] = Mymax(C->data->llb[i], cut_plane[i]);
+        }
+        else
+        {
+          C->data->llb[j] = cut_plane[j]; // axes j < i are already clipped to the overlap, axes j > i still span A
+          C->data->uub[j] = cut_plane[j + dim];
+        }
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4) + 1;
+#else
+#ifdef Cell
+        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      }
+    }
+
+    cut_plane[i + dim] = Mymin(A->data->uub[i], B->data->uub[i]); // upper edge of the overlap along axis i
+    if (cut_plane[i + dim] < A->data->uub[i]) // part of A lies above B along axis i
+    {
+      q = clone_gsl(A, true);
+      if (C)
+        q->next = C;
+      C = q;
+      for (int j = 0; j < dim; j++)
+      {
+        if (i == j)
+        {
+          C->data->uub[i] = A->data->uub[i];
+          // unlike the real cut, the piece keeps the cutting edge itself (vertex- and cell-centered alike)
+          C->data->llb[i] = Mymin(C->data->uub[i], cut_plane[i + dim]);
+        }
+        else
+        {
+          C->data->llb[j] = cut_plane[j]; // axes j <= i are already clipped to the overlap, axes j > i still span A
+          C->data->uub[j] = cut_plane[j + dim];
+        }
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4) + 1;
+#else
+#ifdef Cell
+        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+      }
+    }
+  }
+  return C; // stays 0 when no cut was made, i.e. A does not extend beyond B along any axis
+}
+// note the data structure of the time levels (the arrow on the right marks increasing t)
+// if CC is true
+// 1   -----------  1   ------  ^
+//                  0   ------  |  t
+// 0   -----------  old ------  |
+//
+// old -----------
+// if CC is false
+// 1   -----------  1   ------  ^
+// 0   -----------  0   ------  |  t
+// old -----------  old ------  |
+void Parallel::fill_level_data(MyList<Patch> *PatLd, MyList<Patch> *PatLs, MyList<Patch> *PatcL,
+                               MyList<var> *OldList, MyList<var> *StateList, MyList<var> *FutureList,
+                               MyList<var> *tmList, int Symmetry, bool BB, bool CC)
+{
+  if (PatLd->data->lev != PatLs->data->lev) // destination (PatLd) and source (PatLs) must be on the same level
+  {
+    cout << "Parallel::fill_level_data: meet requst from lev#" << PatLs->data->lev << " to lev#" << PatLd->data->lev << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+  if (PatLd->data->lev <= PatcL->data->lev) // PatcL, the prolongation source, must have a smaller level number than PatLd
+  {
+    cout << "Parallel::fill_level_data: meet prolong requst from lev#" << PatcL->data->lev << " to lev#" << PatLd->data->lev << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+
+  int cpusize;
+  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
+
+  MyList<var> *VarList = 0; // combined list holding the variables of StateList and of FutureList
+  MyList<var> *p;
+  p = StateList;
+  while (p)
+  {
+    if (VarList)
+      VarList->insert(p->data);
+    else
+      VarList = new MyList<var>(p->data);
+    p = p->next;
+  }
+  p = FutureList;
+  while (p)
+  {
+    if (VarList)
+      VarList->insert(p->data);
+    else
+      VarList = new MyList<var>(p->data);
+    p = p->next;
+  }
+
+  MyList<Parallel::gridseg> *dst;
+  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst; // one list head per MPI rank
+  src = new MyList<Parallel::gridseg> *[cpusize];
+  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
+  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
+
+  dst = build_complete_gsl(PatLd); // including ghost
+  // copy part
+  for (int node = 0; node < cpusize; node++)
+  {
+    src[node] = build_owned_gsl(PatLs, node, 0, Symmetry);                // similar to Sync
+    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
+  }
+
+  transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry); // same-level copy PatLs -> PatLd for all variables
+
+  for (int node = 0; node < cpusize; node++)
+  {
+    if (src[node])
+      src[node]->destroyList();
+    if (transfer_src[node])
+      transfer_src[node]->destroyList();
+    if (transfer_dst[node])
+      transfer_dst[node]->destroyList();
+  }
+
+  MyList<Parallel::gridseg> *dsts, *dstd;
+  dsts = build_complete_gsl_virtual(PatLs); // presumably the region covered by the source patches - TODO confirm
+  dstd = dst;
+  dst = gsl_subtract(dstd, dsts); // destination segments left after subtracting the source coverage
+  if (dstd)
+    dstd->destroyList();
+  if (dsts)
+    dsts->destroyList();
+
+  if (dst) // only when the subtraction left some destination region
+  {
+    // prolongation part
+    for (int node = 0; node < cpusize; node++)
+    {
+      src[node] = build_owned_gsl(PatcL, node, 4, Symmetry);                // - buffer - ghost - BD ghost
+      build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
+    }
+
+    if (CC) // time-level layout: see the "CC is true" diagram above this function
+    {
+      // for FutureList
+      // restrict first~~~>
+      {
+        Restrict(PatcL, PatLs, FutureList, FutureList, Symmetry);
+        Sync(PatcL, FutureList, Symmetry);
+      }
+      //<~~~prolong then
+      transfer(transfer_src, transfer_dst, FutureList, FutureList, Symmetry);
+
+      // for StateList
+      // time interpolation part
+      if (BB) // BB selects the overload that additionally takes OldList
+        prepare_inter_time_level(PatcL, FutureList, StateList, OldList,
+                                 tmList, 0); // use SynchList_pre as temporal storage space
+      else
+        prepare_inter_time_level(PatcL, FutureList, StateList,
+                                 tmList, 0); // use SynchList_pre as temporal storage space
+                                             // restrict first~~~>
+      {
+        Restrict(PatcL, PatLs, StateList, tmList, Symmetry);
+        Sync(PatcL, tmList, Symmetry);
+      }
+      //<~~~prolong then
+      transfer(transfer_src, transfer_dst, tmList, StateList, Symmetry); // source variables tmList, destination variables StateList
+    }
+    else
+    {
+      // for both FutureList and StateList
+      // restrict first~~~>
+      {
+        Restrict(PatcL, PatLs, VarList, VarList, Symmetry);
+        Sync(PatcL, VarList, Symmetry);
+      }
+      //<~~~prolong then
+      transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
+    }
+
+    for (int node = 0; node < cpusize; node++)
+    {
+      if (src[node])
+        src[node]->destroyList();
+      if (transfer_src[node])
+        transfer_src[node]->destroyList();
+      if (transfer_dst[node])
+        transfer_dst[node]->destroyList();
+    }
+
+    dst->destroyList();
+  }
+
+  delete[] src;
+  delete[] transfer_src;
+  delete[] transfer_dst;
+
+  VarList->clearList(); // NOTE(review): VarList is still null here if both StateList and FutureList are empty
+}
+void Parallel::KillBlocks(MyList<Patch> *PatchLIST) // for every patch: delete the Block nodes and their data from blb through ble
+{
+  while (PatchLIST)
+  {
+    Patch *Pp = PatchLIST->data;
+    MyList<Block> *bg; // successor of the node about to be deleted
+    while (Pp->blb)
+    {
+      if (Pp->blb == Pp->ble)
+        break; // the last node (ble) is deleted after this loop
+      bg = (Pp->blb->next) ? Pp->blb->next : 0;
+      delete Pp->blb->data;
+      delete Pp->blb;
+      Pp->blb = bg; // becomes 0 if the list ends before ble is reached, which also ends the loop
+    }
+    if (Pp->ble)
+    {
+      delete Pp->ble->data;
+      delete Pp->ble;
+    }
+    Pp->blb = Pp->ble = 0; // the patch no longer refers to any block
+    PatchLIST = PatchLIST->next;
+  }
+}
+// Interpolate the variables of VarList at NN points, one point at a time. XX[j][i] is coordinate j of point i.
+// Point i is handed to Patch::Interp_Points together with the output slot Shellf + i * num_var.
+// A point goes to the first patch of PatL whose bbox, shrunk by lli * h (lower) and uui * h (upper), contains it.
+// Returns false, after checkpatchlist(PatL, false), at the first point found in no patch; later points are skipped.
+// The scratch buffer pox is released on every path (the former early return leaked it).
+bool Parallel::PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
+                                     int NN, double **XX,
+                                     double *Shellf, int Symmetry)
+{
+  int num_var = 0; // number of variables in VarList
+  for (MyList<var> *varl = VarList; varl; varl = varl->next)
+    num_var++;
+
+  double lld[dim], uud[dim];
+  double **pox = new double *[dim]; // coordinates of the single point currently being interpolated
+  for (int j = 0; j < dim; j++)
+    pox[j] = new double[1];
+  bool all_found = true; // cleared when a point lies in no patch; also terminates the loop
+  for (int i = 0; all_found && i < NN; i++)
+  {
+    MyList<Patch> *PL = PatL;
+    while (PL)
+    {
+      bool flag = true;
+      for (int j = 0; j < dim; j++)
+      {
+        double h = PL->data->getdX(j);
+        lld[j] = PL->data->lli[j] * h;
+        uud[j] = PL->data->uui[j] * h;
+        if (XX[j][i] < PL->data->bbox[j] + lld[j] || XX[j][i] > PL->data->bbox[j + dim] - uud[j])
+        {
+          flag = false;
+          break;
+        }
+        pox[j][0] = XX[j][i];
+      }
+      if (flag)
+      {
+        PL->data->Interp_Points(VarList, 1, pox, Shellf + i * num_var, Symmetry);
+        break;
+      }
+      PL = PL->next;
+    }
+    if (!PL)
+    {
+      checkpatchlist(PatL, false);
+      all_found = false; // fall through to the common cleanup instead of returning with pox still allocated
+    }
+  }
+  for (int j = 0; j < dim; j++)
+    delete[] pox[j];
+  delete[] pox;
+
+  return all_found;
+}
+// Interpolate the variables of VarList at NN points, one point at a time. XX[j][i] is coordinate j of point i.
+// Point i is handed to Patch::Interp_Points (with Comm_here) together with the output slot Shellf + i * num_var.
+// A point goes to the first patch of PatL whose bbox, shrunk by lli * h (lower) and uui * h (upper), contains it.
+// Returns false, after checkpatchlist(PatL, false), at the first point found in no patch; later points are skipped.
+// The scratch buffer pox is released on every path (the former early return leaked it).
+bool Parallel::PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
+                                     int NN, double **XX,
+                                     double *Shellf, int Symmetry, MPI_Comm Comm_here)
+{
+  int num_var = 0; // number of variables in VarList
+  for (MyList<var> *varl = VarList; varl; varl = varl->next)
+    num_var++;
+
+  double lld[dim], uud[dim];
+  double **pox = new double *[dim]; // coordinates of the single point currently being interpolated
+  for (int j = 0; j < dim; j++)
+    pox[j] = new double[1];
+  bool all_found = true; // cleared when a point lies in no patch; also terminates the loop
+  for (int i = 0; all_found && i < NN; i++)
+  {
+    MyList<Patch> *PL = PatL;
+    while (PL)
+    {
+      bool flag = true;
+      for (int j = 0; j < dim; j++)
+      {
+        double h = PL->data->getdX(j);
+        lld[j] = PL->data->lli[j] * h;
+        uud[j] = PL->data->uui[j] * h;
+        if (XX[j][i] < PL->data->bbox[j] + lld[j] || XX[j][i] > PL->data->bbox[j + dim] - uud[j])
+        {
+          flag = false;
+          break;
+        }
+        pox[j][0] = XX[j][i];
+      }
+      if (flag)
+      {
+        PL->data->Interp_Points(VarList, 1, pox, Shellf + i * num_var, Symmetry, Comm_here);
+        break;
+      }
+      PL = PL->next;
+    }
+    if (!PL)
+    {
+      checkpatchlist(PatL, false);
+      all_found = false; // fall through to the common cleanup instead of returning with pox still allocated
+    }
+  }
+  for (int j = 0; j < dim; j++)
+    delete[] pox[j];
+  delete[] pox;
+
+  return all_found;
+}
+void Parallel::aligncheck(double *bbox0, double *bboxl, int lev, double *DH0, int *shape) // snap bboxl (modified in place) onto the level-lev grid anchored at bbox0 and check it against shape
+{
+  const double aligntiny = 0.1; // NOTE(review): not used anywhere in this function
+  double DHl, rr;
+  int NN;
+  for (int i = 0; i < dim; i++)
+  {
+    DHl = DH0[i] * pow(0.5, lev); // spacing on level lev: DH0 halved lev times
+    rr = bboxl[i] - bbox0[i];
+    bboxl[i] = bbox0[i] + int(rr / DHl + 0.4) * DHl; // lower bound moved to a whole number of DHl above bbox0's lower bound
+    rr = bbox0[i + dim] - bboxl[i + dim];
+    bboxl[i + dim] = bbox0[i + dim] - int(rr / DHl + 0.4) * DHl; // upper bound moved to a whole number of DHl below bbox0's upper bound
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+    NN = int((bboxl[i + dim] - bboxl[i]) / DHl + 0.4) + 1;
+#else
+#ifdef Cell
+    NN = int((bboxl[i + dim] - bboxl[i]) / DHl + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+    if (NN != shape[i]) // point count implied by the snapped box differs from the caller's shape
+    {
+      int myrank;
+      MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+      if (myrank == 0) // only rank 0 reports and calls MPI_Abort
+      {
+        cout << "Parallel::aligncheck want shape " << NN << " for lev#" << lev << ", but " << shape[i] << endl;
+        cout << "i = " << i << ", low = " << bboxl[i] << ", up = " << bboxl[i + dim] << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+    }
+  }
+}
+bool Parallel::point_locat_gsl(double *pox, MyList<Parallel::gridseg> *gsl) // true if pox lies strictly inside at least one segment of gsl
+{
+  bool flag = false; // result; stays false for an empty list
+  while (gsl)
+  {
+    for (int i = 0; i < dim; i++)
+    {
+      if (pox[i] > gsl->data->llb[i] && pox[i] < gsl->data->uub[i]) // strict comparison: a point on the boundary does not count
+        flag = true;
+      else
+      {
+        flag = false;
+        break; // outside along axis i: this segment is rejected
+      }
+    }
+    if (flag)
+      break; // found a containing segment
+    gsl = gsl->next;
+  }
+
+  return flag;
+}
+void Parallel::checkpatchlist(MyList<Patch> *PatL, bool buflog) // call checkPatch(buflog) on every patch of the list
+{
+  MyList<Patch> *PL = PatL;
+  while (PL)
+  {
+    PL->data->checkPatch(buflog); // buflog is passed through unchanged
+    PL = PL->next;
+  }
+}
diff --git a/AMSS_NCKU_source/Parallel.h b/AMSS_NCKU_source/Parallel.h
index 5179786..5712a15 100644
--- a/AMSS_NCKU_source/Parallel.h
+++ b/AMSS_NCKU_source/Parallel.h
@@ -100,29 +100,36 @@ namespace Parallel
     MyList<gridseg> **combined_dst;
     int *send_lengths;
     int *recv_lengths;
-    double **send_bufs;
-    double **recv_bufs;
-    int *send_buf_caps;
-    int *recv_buf_caps;
-    unsigned char *send_buf_pinned;
-    unsigned char *recv_buf_pinned;
-    MPI_Request *reqs;
-    MPI_Status *stats;
+    double **send_bufs;
+    double **recv_bufs;
+    int *send_buf_caps;
+    int *recv_buf_caps;
+    unsigned char *send_buf_pinned;
+    unsigned char *recv_buf_pinned;
+    unsigned char *send_buf_is_dev;
+    unsigned char *recv_buf_is_dev;
+    int *send_buf_caps_dev;
+    int *recv_buf_caps_dev;
+    double **send_bufs_dev;
+    double **recv_bufs_dev;
+    MPI_Request *reqs;
+    MPI_Status *stats;
     int max_reqs;
     bool lengths_valid;
     int *tc_req_node;
     int *tc_req_is_recv;
     int *tc_completed;
+    bool cuda_aware_mode;
     SyncCache();
     void invalidate();
     void destroy();
   };
 
-  void Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache);
-  void Sync_ensure_cache(MyList<Patch> *PatL, int Symmetry, SyncCache &cache);
-  void transfer_cached(MyList<gridseg> **src, MyList<gridseg> **dst,
-                       MyList<var> *VarList1, MyList<var> *VarList2,
-                       int Symmetry, SyncCache &cache);
+  void Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache);
+  void Sync_ensure_cache(MyList<Patch> *PatL, int Symmetry, SyncCache &cache);
+  void transfer_cached(MyList<gridseg> **src, MyList<gridseg> **dst,
+                       MyList<var> *VarList1, MyList<var> *VarList2,
+                       int Symmetry, SyncCache &cache);
 
   struct AsyncSyncState {
     int req_no;
@@ -182,13 +189,13 @@ namespace Parallel
   MyList<Parallel::gridseg> *clone_gsl(MyList<Parallel::gridseg> *p, bool first_only);
   MyList<Parallel::gridseg> *build_bulk_gsl(Patch *Pat); // similar to build_owned_gsl0 but does not care rank issue
   MyList<Parallel::gridseg> *build_bulk_gsl(Block *bp, Patch *Pat);
-  void build_PhysBD_gstl(Patch *Pat, MyList<Parallel::gridseg> *srci, MyList<Parallel::gridseg> *dsti,
-                         MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst);
-  void PeriodicBD(Patch *Pat, MyList<var> *VarList, int Symmetry);
-  double L2Norm(Patch *Pat, var *vf);
-  void L2Norm7(Patch *Pat, var **vf, double *norms);
-  void checkgsl(MyList<Parallel::gridseg> *pp, bool first_only);
-  void checkvarl(MyList<var> *pp, bool first_only);
+  void build_PhysBD_gstl(Patch *Pat, MyList<Parallel::gridseg> *srci, MyList<Parallel::gridseg> *dsti,
+                         MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst);
+  void PeriodicBD(Patch *Pat, MyList<var> *VarList, int Symmetry);
+  double L2Norm(Patch *Pat, var *vf);
+  void L2Norm7(Patch *Pat, var **vf, double *norms);
+  void checkgsl(MyList<Parallel::gridseg> *pp, bool first_only);
+  void checkvarl(MyList<var> *pp, bool first_only);
   MyList<Parallel::gridseg> *divide_gsl(MyList<Parallel::gridseg> *p, Patch *Pat);
   MyList<Parallel::gridseg> *divide_gs(MyList<Parallel::gridseg> *p, Patch *Pat);
   void prepare_inter_time_level(Patch *Pat,
@@ -220,12 +227,12 @@ namespace Parallel
   void aligncheck(double *bbox0, double *bboxl, int lev, double *DH0, int *shape);
   bool point_locat_gsl(double *pox, MyList<Parallel::gridseg> *gsl);
   void checkpatchlist(MyList<Patch> *PatL, bool buflog);
-
-  double L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here);
-  void L2Norm7(Patch *Pat, var **vf, double *norms, MPI_Comm Comm_here);
-  bool PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
-                             int NN, double **XX,
-                             double *Shellf, int Symmetry, MPI_Comm Comm_here);
+
+  double L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here);
+  void L2Norm7(Patch *Pat, var **vf, double *norms, MPI_Comm Comm_here);
+  bool PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
+                             int NN, double **XX,
+                             double *Shellf, int Symmetry, MPI_Comm Comm_here);
 #if (PSTR == 1 || PSTR == 2 || PSTR == 3)
   MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
                             bool periodic, int start_rank, int end_rank, int nodes = 0);
diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu
index ad31c7f..c818792 100644
--- a/AMSS_NCKU_source/bssn_rhs_cuda.cu
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu
@@ -76,11 +76,36 @@ struct CudaProfileStats {
     double output_ms;
 };
 
+enum RhsStageId { // timing buckets of launch_rhs_pipeline, listed in the order the stages are marked there
+    RHS_STAGE_PREP = 0, // marked after kern_phase1_prep
+    RHS_STAGE_DERIV1,
+    RHS_STAGE_METRIC, // marked after kern_phase2_metric_rhs
+    RHS_STAGE_GAUGE_DERIV,
+    RHS_STAGE_GAMMA_CONTRACT, // marked after kern_phase8_9_gamma_rhs_contract_fused
+    RHS_STAGE_RICCI_DIFF,
+    RHS_STAGE_RICCI_FUSED, // marked after kern_phase11_ricci_fused
+    RHS_STAGE_CHI, // marked after kern_phase12_13_chi_correction_fused
+    RHS_STAGE_GAUGE_RHS, // marked after kern_phase15_trK_Aij_gauge
+    RHS_STAGE_KODIS, // marked after gpu_lopsided_kodis_state_batch
+    RHS_STAGE_CONSTRAINTS, // marked after the co == 0 block
+    RHS_STAGE_COUNT // number of stages; used as the array size of the per-stage timers
+};
+
+struct RhsStageProfileStats { // running totals of the per-stage RHS timings
+    long long calls; // number of accumulated pipeline runs
+    double ms[RHS_STAGE_COUNT]; // summed milliseconds per stage, indexed by RhsStageId
+};
+
 static CudaProfileStats &cuda_profile_stats() {
     static CudaProfileStats stats = {0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
     return stats;
 }
 
+static RhsStageProfileStats &rhs_stage_profile_stats() { // accessor for the single accumulator instance
+    static RhsStageProfileStats stats = {}; // function-local static, zero-initialized
+    return stats;
+}
+
 static bool cuda_profile_enabled() {
     static int enabled = -1;
     if (enabled < 0) {
@@ -99,6 +124,24 @@ static int cuda_profile_every() {
     return every;
 }
 
+static bool rhs_stage_timing_enabled() { // true when AMSS_GPU_STAGE_TIMING is set to a non-zero integer
+    static int enabled = -1; // -1: environment not read yet; afterwards 0 or 1
+    if (enabled < 0) {
+        const char *env = getenv("AMSS_GPU_STAGE_TIMING");
+        enabled = (env && atoi(env) != 0) ? 1 : 0; // unset or 0 (or non-numeric, which atoi maps to 0) disables timing
+    }
+    return enabled != 0;
+}
+
+static int rhs_stage_timing_every() { // reporting interval, in pipeline runs, for the stage-timing log
+    static int every = -1; // negative: not resolved yet
+    if (every < 0) {
+        const char *env = getenv("AMSS_GPU_STAGE_TIMING_EVERY");
+        every = (env && atoi(env) > 0) ? atoi(env) : cuda_profile_every(); // positive env value wins, otherwise reuse the general profile interval
+    }
+    return every; // NOTE(review): used as a modulus by the caller; assumes cuda_profile_every() never returns 0 - confirm
+}
+
 static double cuda_profile_now_ms() {
     using clock = std::chrono::steady_clock;
     return std::chrono::duration<double, std::milli>(
@@ -131,6 +174,36 @@ static void cuda_profile_maybe_log() {
     fflush(stderr);
 }
 
+static void rhs_stage_profile_accumulate(const double *stage_ms) { // add one run's per-stage times (RHS_STAGE_COUNT entries) and periodically log the averages
+    if (!rhs_stage_timing_enabled()) return;
+
+    RhsStageProfileStats &stats = rhs_stage_profile_stats();
+    stats.calls++;
+    for (int i = 0; i < RHS_STAGE_COUNT; ++i) {
+        stats.ms[i] += stage_ms[i];
+    }
+    if (stats.calls <= 0 || stats.calls % rhs_stage_timing_every() != 0) return; // log only every rhs_stage_timing_every() calls
+
+    fprintf(stderr, // each value printed is total ms / total calls since start; the totals are never reset here
+            "[AMSS-CUDA-STAGE][rank %d][dev %d] calls=%lld"
+            " prep=%.3f deriv1=%.3f metric=%.3f gauge_deriv=%.3f"
+            " gamma_contract=%.3f ricci_diff=%.3f ricci_fused=%.3f"
+            " chi=%.3f gauge_rhs=%.3f kodis=%.3f constraints=%.3f ms\n",
+            g_dispatch.my_rank, g_dispatch.my_device, stats.calls,
+            stats.ms[RHS_STAGE_PREP] / (double)stats.calls,
+            stats.ms[RHS_STAGE_DERIV1] / (double)stats.calls,
+            stats.ms[RHS_STAGE_METRIC] / (double)stats.calls,
+            stats.ms[RHS_STAGE_GAUGE_DERIV] / (double)stats.calls,
+            stats.ms[RHS_STAGE_GAMMA_CONTRACT] / (double)stats.calls,
+            stats.ms[RHS_STAGE_RICCI_DIFF] / (double)stats.calls,
+            stats.ms[RHS_STAGE_RICCI_FUSED] / (double)stats.calls,
+            stats.ms[RHS_STAGE_CHI] / (double)stats.calls,
+            stats.ms[RHS_STAGE_GAUGE_RHS] / (double)stats.calls,
+            stats.ms[RHS_STAGE_KODIS] / (double)stats.calls,
+            stats.ms[RHS_STAGE_CONSTRAINTS] / (double)stats.calls);
+    fflush(stderr);
+}
+
 /* ------------------------------------------------------------------ */
 /*  Error checking                                                     */
 /* ------------------------------------------------------------------ */
@@ -4643,6 +4716,20 @@ static void compute_patch_boundary_flags(int *ex,
 static void upload_state_inputs(double **state_host, size_t all)
 {
     const size_t bytes = all * sizeof(double);
+    static int direct_upload = -1;
+    if (direct_upload < 0) {
+        const char *env = getenv("AMSS_CUDA_DIRECT_STATE_UPLOAD");
+        const char *pin_env = getenv("AMSS_CUDA_PIN_GRIDFUNCS");
+        direct_upload = env ? ((atoi(env) != 0) ? 1 : 0)
+                            : ((pin_env && atoi(pin_env) != 0) ? 1 : 0);
+    }
+    if (direct_upload) {
+        for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
+            CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[k_state_input_slots[i]], state_host[i],
+                                       bytes, cudaMemcpyHostToDevice));
+        }
+        return;
+    }
     for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
         std::memcpy(g_buf.h_stage + (size_t)i * all, state_host[i], bytes);
     }
@@ -4697,11 +4784,24 @@ static void launch_rhs_pipeline(int all, double eps, int co)
 {
     const double SYM = 1.0;
     const double ANTI = -1.0;
+    const bool stage_timing = rhs_stage_timing_enabled();
+    double stage_ms[RHS_STAGE_COUNT] = {};
+    double stage_t0 = stage_timing ? cuda_profile_now_ms() : 0.0;
 
     #define D(s) g_buf.slot[s]
+    #define MARK_RHS_STAGE(stage_id) do {                             \
+        if (stage_timing) {                                            \
+            cuda_profile_sync();                                       \
+            const double stage_t1 = cuda_profile_now_ms();             \
+            stage_ms[(stage_id)] += stage_t1 - stage_t0;               \
+            stage_t0 = stage_t1;                                       \
+        }                                                             \
+    } while (0)
+
     kern_phase1_prep<<<grid(all),BLK>>>(
         D(S_Lap), D(S_chi), D(S_dxx), D(S_dyy), D(S_dzz),
         D(S_alpn1), D(S_chin1), D(S_gxx), D(S_gyy), D(S_gzz));
+    MARK_RHS_STAGE(RHS_STAGE_PREP);
 
     {
         double *src_fields[] = {
@@ -4742,6 +4842,7 @@ static void launch_rhs_pipeline(int all, double eps, int co)
                           src_fields, fx_fields, fy_fields, fz_fields,
                           soa_signs, all);
     }
+    MARK_RHS_STAGE(RHS_STAGE_DERIV1);
 
     kern_phase2_metric_rhs<<<grid(all),BLK>>>(
         D(S_alpn1), D(S_chin1),
@@ -4799,6 +4900,7 @@ static void launch_rhs_pipeline(int all, double eps, int co)
         D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
         D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
         D(S_Gamx_rhs), D(S_Gamy_rhs), D(S_Gamz_rhs));
+    MARK_RHS_STAGE(RHS_STAGE_METRIC);
 
     {
         double *src_fields[] = {D(S_betax), D(S_betay), D(S_betaz)};
@@ -4832,6 +4934,7 @@ static void launch_rhs_pipeline(int all, double eps, int co)
                           src_fields, fx_fields, fy_fields, fz_fields,
                           soa_signs, all);
     }
+    MARK_RHS_STAGE(RHS_STAGE_GAUGE_DERIV);
 
     kern_phase8_9_gamma_rhs_contract_fused<<<grid(all),BLK>>>(
         D(S_gupxx), D(S_gupxy), D(S_gupxz),
@@ -4854,6 +4957,7 @@ static void launch_rhs_pipeline(int all, double eps, int co)
         D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx),
         D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy),
         D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz));
+    MARK_RHS_STAGE(RHS_STAGE_GAMMA_CONTRACT);
 
     {
         double *src_fields[] = {D(S_dxx), D(S_dyy), D(S_dzz), D(S_gxy), D(S_gxz), D(S_gyz)};
@@ -4870,6 +4974,7 @@ static void launch_rhs_pipeline(int all, double eps, int co)
                                 D(S_gupyy), D(S_gupyz), D(S_gupzz),
                                 src_fields, dst_fields, soa_signs, all);
     }
+    MARK_RHS_STAGE(RHS_STAGE_RICCI_DIFF);
 
     kern_phase11_ricci_fused<<<grid(all),BLK>>>(
         D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz),
@@ -4889,6 +4994,7 @@ static void launch_rhs_pipeline(int all, double eps, int co)
         D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz),
         D(S_Rxx),D(S_Rxy),D(S_Rxz),
         D(S_Ryy),D(S_Ryz),D(S_Rzz));
+    MARK_RHS_STAGE(RHS_STAGE_RICCI_FUSED);
 
     kern_phase12_13_chi_correction_fused<<<grid((size_t)all),BLK>>>(
         D(S_chi), D(S_chin1),
@@ -4904,6 +5010,7 @@ static void launch_rhs_pipeline(int all, double eps, int co)
         D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
         D(S_Rxx), D(S_Rxy), D(S_Rxz),
         D(S_Ryy), D(S_Ryz), D(S_Rzz));
+    MARK_RHS_STAGE(RHS_STAGE_CHI);
 
     kern_phase15_trK_Aij_gauge<<<grid(all),BLK>>>(
         D(S_alpn1), D(S_chin1),
@@ -4936,8 +5043,10 @@ static void launch_rhs_pipeline(int all, double eps, int co)
         D(S_betax_rhs), D(S_betay_rhs), D(S_betaz_rhs),
         D(S_Gamx_rhs), D(S_Gamy_rhs), D(S_Gamz_rhs),
         D(S_f_arr), D(S_S_arr));
+    MARK_RHS_STAGE(RHS_STAGE_GAUGE_RHS);
 
     gpu_lopsided_kodis_state_batch(eps, all);
+    MARK_RHS_STAGE(RHS_STAGE_KODIS);
 
     if (co == 0) {
         {
@@ -4982,7 +5091,10 @@ static void launch_rhs_pipeline(int all, double eps, int co)
             D(S_gzzx), D(S_gzzy), D(S_gzzz),
             D(S_ham_Res), D(S_movx_Res), D(S_movy_Res), D(S_movz_Res));
     }
+    MARK_RHS_STAGE(RHS_STAGE_CONSTRAINTS);
 
+    rhs_stage_profile_accumulate(stage_ms);
+    #undef MARK_RHS_STAGE
     #undef D
 }
 
@@ -5196,6 +5308,21 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos
     const size_t all = (size_t)ex[0] * ex[1] * ex[2];
     const size_t bytes = all * sizeof(double);
     StepContext &ctx = ensure_step_ctx(block_tag, all);
+    static int direct_download = -1;
+    if (direct_download < 0) {
+        const char *env = getenv("AMSS_CUDA_DIRECT_STATE_DOWNLOAD");
+        const char *pin_env = getenv("AMSS_CUDA_PIN_GRIDFUNCS");
+        direct_download = env ? ((atoi(env) != 0) ? 1 : 0)
+                              : ((pin_env && atoi(pin_env) != 0) ? 1 : 0);
+    }
+    if (direct_download) {
+        for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
+            CUDA_CHECK(cudaMemcpyAsync(state_host_out[i], ctx.d_state_curr[i],
+                                       bytes, cudaMemcpyDeviceToHost));
+        }
+        CUDA_CHECK(cudaDeviceSynchronize());
+        return;
+    }
     CUDA_CHECK(cudaMemcpy(g_buf.h_stage, ctx.d_state_curr_mem,
                           (size_t)BSSN_STATE_COUNT * bytes,
                           cudaMemcpyDeviceToHost));
@@ -5902,6 +6029,67 @@ int bssn_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
     return 0;
 }
 
+static void copy_state_device_batch(void *block_tag, // launch the pack or unpack kernel between a block's state storage and device_buffer
+                                    int state_count,
+                                    double *device_buffer,
+                                    const int *ex,
+                                    int i0, int j0, int k0,
+                                    int sx, int sy, int sz,
+                                    int pack_not_unpack)
+{
+    if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return; // NOTE(review): invalid counts are silently ignored
+    if (sx <= 0 || sy <= 0 || sz <= 0) return; // empty region: nothing to launch
+
+    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    const int region_all = sx * sy * sz; // number of points in the sub-region (int arithmetic)
+    dim3 launch_grid((unsigned int)grid((size_t)region_all), // x: blocks over the region points (grid() presumably rounds up by BLK - confirm)
+                     (unsigned int)state_count); // y: one row of blocks per state variable
+
+    if (pack_not_unpack) { // non-zero selects the pack kernel, zero the unpack kernel
+        kern_pack_state_region_batch<<<launch_grid, BLK>>>(
+            ctx.d_state_curr_mem, device_buffer,
+            ex[0], ex[1], i0, j0, k0, sx, sy, sz,
+            region_all, state_count,
+            ex[0] * ex[1] * ex[2]);
+    } else {
+        kern_unpack_state_region_batch<<<launch_grid, BLK>>>(
+            ctx.d_state_curr_mem, device_buffer,
+            ex[0], ex[1], i0, j0, k0, sx, sy, sz,
+            region_all, state_count,
+            ex[0] * ex[1] * ex[2]);
+    }
+} // NOTE(review): no launch-error check and no synchronization here; the caller must order later use of device_buffer
+
+extern "C"
+int bssn_cuda_pack_state_batch_to_device_buffer(void *block_tag, // C entry point: run the pack kernel for the sub-region (i0, j0, k0) + (sx, sy, sz)
+                                                 int state_count,
+                                                 double *device_buffer,
+                                                 int *ex,
+                                                 int i0, int j0, int k0,
+                                                 int sx, int sy, int sz)
+{
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); // make this rank's device current before launching
+    copy_state_device_batch(block_tag, state_count, device_buffer, ex,
+                            i0, j0, k0, sx, sy, sz, 1); // 1 = pack
+    return 0; // always 0, also when copy_state_device_batch returned early without launching
+}
+
+extern "C"
+int bssn_cuda_unpack_state_batch_from_device_buffer(void *block_tag, // C entry point: run the unpack kernel for the sub-region (i0, j0, k0) + (sx, sy, sz)
+                                                     int state_count,
+                                                     double *device_buffer,
+                                                     int *ex,
+                                                     int i0, int j0, int k0,
+                                                     int sx, int sy, int sz)
+{
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); // make this rank's device current before launching
+    copy_state_device_batch(block_tag, state_count, device_buffer, ex,
+                            i0, j0, k0, sx, sy, sz, 0); // 0 = unpack
+    return 0; // always 0, also when copy_state_device_batch returned early without launching
+}
+
 extern "C"
 int bssn_cuda_download_state_subset(void *block_tag,
                                     int *ex,
diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.h b/AMSS_NCKU_source/bssn_rhs_cuda.h
index 55b6380..12e190a 100644
--- a/AMSS_NCKU_source/bssn_rhs_cuda.h
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.h
@@ -1,6 +1,6 @@
-#ifndef BSSN_RHS_CUDA_H
-#define BSSN_RHS_CUDA_H
-
+#ifndef BSSN_RHS_CUDA_H
+#define BSSN_RHS_CUDA_H
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -9,28 +9,28 @@ enum {
     BSSN_CUDA_STATE_COUNT = 24,
     BSSN_CUDA_MATTER_COUNT = 10
 };
-
+
 int f_compute_rhs_bssn(int *ex, double &T,
                        double *X, double *Y, double *Z,
                        double *chi, double *trK,
-                       double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
-                       double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
-                       double *Gamx, double *Gamy, double *Gamz,
-                       double *Lap, double *betax, double *betay, double *betaz,
-                       double *dtSfx, double *dtSfy, double *dtSfz,
-                       double *chi_rhs, double *trK_rhs,
-                       double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
-                       double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
-                       double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
-                       double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
-                       double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
-                       double *rho, double *Sx, double *Sy, double *Sz,
-                       double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
-                       double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
-                       double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
-                       double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
-                       double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
-                       double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
+                       double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
+                       double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
+                       double *Gamx, double *Gamy, double *Gamz,
+                       double *Lap, double *betax, double *betay, double *betaz,
+                       double *dtSfx, double *dtSfy, double *dtSfz,
+                       double *chi_rhs, double *trK_rhs,
+                       double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
+                       double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
+                       double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
+                       double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
+                       double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
+                       double *rho, double *Sx, double *Sy, double *Sz,
+                       double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
+                       double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
+                       double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
+                       double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
+                       double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
+                       double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
                        double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
                        int &Symmetry, int &Lev, double &eps, int &co);
 
@@ -104,6 +104,20 @@ int bssn_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
                                                   int i0, int j0, int k0,
                                                   int sx, int sy, int sz);
 
+/* Device-buffer pack entry point. The definition added by this patch selects the GPU,
+ * delegates to a shared batch-copy helper and returns 0. NOTE(review): device_buffer is
+ * presumably GPU memory and ex a 3-entry extent array -- confirm against the callers. */
+int bssn_cuda_pack_state_batch_to_device_buffer(
+    void *block_tag, int state_count, double *device_buffer, int *ex,
+    int i0, int j0, int k0, int sx, int sy, int sz);
+
+/* Device-buffer unpack entry point. The definition added by this patch selects the GPU,
+ * delegates to a shared batch-copy helper and returns 0. NOTE(review): device_buffer is
+ * presumably GPU memory and ex a 3-entry extent array -- confirm against the callers. */
+int bssn_cuda_unpack_state_batch_from_device_buffer(
+    void *block_tag, int state_count, double *device_buffer, int *ex,
+    int i0, int j0, int k0, int sx, int sy, int sz);
+
 int bssn_cuda_download_state_subset(void *block_tag,
                                     int *ex,
                                     int subset_count,
@@ -122,6 +136,6 @@ void bssn_cuda_release_step_ctx(void *block_tag);
 
 #ifdef __cplusplus
 }
-#endif
-
-#endif
+#endif
+
+#endif