AMSS-NCKU/AMSS_NCKU_source/Parallel.C


#include "Parallel.h"
#include "fmisc.h"
#include "prolongrestrict.h"
#include "misc.h"
#include "parameters.h"
#include <set>

int Parallel::partition1(int &nx, int split_size, int min_width, int cpusize, int shape) // special for 1 diemnsion
{
  nx = Mymax(1, shape / min_width);
  nx = Mymin(cpusize, nx);

  return nx;
}
int Parallel::partition2(int *nxy, int split_size, int *min_width, int cpusize, int *shape) // special for 2 diemnsions
{
#define SEARCH_SIZE 5
  int i, j, nx, ny;
  int maxnx, maxny;
  int mnx, mny;
  int dn, hmin_width, cmin_width;
  int cnx, cny;
  double fx, fy;
  int block_size;
  int n;

  block_size = shape[0] * shape[1];
  n = Mymax(1, (block_size + split_size / 2) / split_size);

  maxnx = Mymax(1, shape[0] / min_width[0]);
  maxnx = Mymin(cpusize, maxnx);
  maxny = Mymax(1, shape[1] / min_width[1]);
  maxny = Mymin(cpusize, maxny);
  fx = (double)shape[0] / (shape[0] + shape[1]);
  fy = (double)shape[1] / (shape[0] + shape[1]);
  nx = mnx = Mymax(1, Mymin(maxnx, (int)(sqrt(double(n)) * fx / fy)));
  ny = mny = Mymax(1, Mymin(maxny, (int)(sqrt(double(n)) * fy / fx)));
  dn = abs(n - nx * ny);
  hmin_width = Mymin(shape[0] / nx, shape[1] / ny);
  for (cny = Mymax(1, mny - SEARCH_SIZE); cny <= (Mymin(mny + SEARCH_SIZE, maxny)); cny++)
    for (cnx = Mymax(1, mnx - SEARCH_SIZE); cnx <= (Mymin(mnx + SEARCH_SIZE, maxnx)); cnx++)
    {
      cmin_width = Mymin(shape[0] / cnx, shape[1] / cny);
      if (dn > abs(n - cnx * cny) || (dn == abs(n - cnx * cny) && cmin_width > hmin_width))
      {
        dn = abs(n - cnx * cny);
        nx = cnx;
        ny = cny;
        hmin_width = cmin_width;
      }
    }

  nxy[0] = nx;
  nxy[1] = ny;

  return nx * ny;
#undef SEARCH_SIZE
}
int Parallel::partition3(int *nxyz, int split_size, int *min_width, int cpusize, int *shape) // special for 3 diemnsions
#if 1                                                                                        // algrithsm from Pretorius
{
//	cout<<split_size<<endl<<min_width[0]<<endl<<min_width[1]<<endl<<min_width[2]<<endl
//            <<shape[0]<<endl<<shape[1]<<endl<<shape[2]<<endl<<cpusize<<endl;
#define SEARCH_SIZE 5
  int i, j, k, nx, ny, nz;
  int maxnx, maxny, maxnz;
  int mnx, mny, mnz;
  int dn, hmin_width, cmin_width;
  int cnx, cny, cnz;
  double fx, fy, fz, max_fxfy, max_fxfz, max_fyfz;
  int block_size;
  int n;

  block_size = shape[0] * shape[1] * shape[2];
  n = Mymax(1, (block_size + split_size / 2) / split_size);

  maxnx = Mymax(1, shape[0] / min_width[0]);
  maxnx = Mymin(cpusize, maxnx);
  maxny = Mymax(1, shape[1] / min_width[1]);
  maxny = Mymin(cpusize, maxny);
  maxnz = Mymax(1, shape[2] / min_width[2]);
  maxnz = Mymin(cpusize, maxnz);
  fx = (double)shape[0] / (shape[0] + shape[1] + shape[2]);
  fy = (double)shape[1] / (shape[0] + shape[1] + shape[2]);
  fz = (double)shape[2] / (shape[0] + shape[1] + shape[2]);
  max_fxfy = Mymax(fx, fy);
  max_fxfz = Mymax(fx, fz);
  max_fyfz = Mymax(fy, fz);
  nx = mnx = Mymax(1, Mymin(maxnx, (int)(pow(n, 1.0 / 3.0) * fx / max_fyfz)));
  ny = mny = Mymax(1, Mymin(maxny, (int)(pow(n, 1.0 / 3.0) * fy / max_fxfz)));
  nz = mnz = Mymax(1, Mymin(maxnz, (int)(pow(n, 1.0 / 3.0) * fz / max_fxfy)));
  dn = abs(n - nx * ny * nz);
  hmin_width = Mymin(shape[2] / nz, shape[1] / ny);
  hmin_width = Mymin(hmin_width, shape[0] / nx);
  for (cnz = Mymax(1, mnz - SEARCH_SIZE); cnz <= (Mymin(mnz + SEARCH_SIZE, maxnz)); cnz++)
    for (cny = Mymax(1, mny - SEARCH_SIZE); cny <= (Mymin(mny + SEARCH_SIZE, maxny)); cny++)
      for (cnx = Mymax(1, mnx - SEARCH_SIZE); cnx <= (Mymin(mnx + SEARCH_SIZE, maxnx)); cnx++)
      {
        cmin_width = Mymin(shape[2] / cnz, shape[1] / cny);
        cmin_width = Mymin(cmin_width, shape[0] / cnx);
        if (dn > abs(n - cnx * cny * cnz) || (dn == abs(n - cnx * cny * cnz) && cmin_width > hmin_width))
        {
          dn = abs(n - cnx * cny * cnz);
          nx = cnx;
          ny = cny;
          nz = cnz;
          hmin_width = cmin_width;
        }
      }

  nxyz[0] = nx;
  nxyz[1] = ny;
  nxyz[2] = nz;

  return nx * ny * nz;
#undef SEARCH_SIZE
}
#elif 0 // Zhihui's idea one on 2013-09-25
{
  int nx, ny, nz;
  int hmin_width;
  hmin_width = Mymin(min_width[0], min_width[1]);
  hmin_width = Mymin(hmin_width, min_width[2]);
  nx = shape[0] / hmin_width;
  if (nx * hmin_width < shape[0])
    nx++;
  ny = shape[1] / hmin_width;
  if (ny * hmin_width < shape[1])
    ny++;
  nz = shape[2] / hmin_width;
  if (nz * hmin_width < shape[2])
    nz++;
  while (nx * ny * nz > cpusize)
  {
    hmin_width++;
    nx = shape[0] / hmin_width;
    if (nx * hmin_width < shape[0])
      nx++;
    ny = shape[1] / hmin_width;
    if (ny * hmin_width < shape[1])
      ny++;
    nz = shape[2] / hmin_width;
    if (nz * hmin_width < shape[2])
      nz++;
  }

  nxyz[0] = nx;
  nxyz[1] = ny;
  nxyz[2] = nz;

  return nx * ny * nz;
}
#elif 0 // Zhihui's idea two on 2013-09-25
{
  int nx, ny, nz;
  const int hmin_width = 8; // for example we use 8
  nx = shape[0] / hmin_width;
  if (nx * hmin_width < shape[0])
    nx++;
  ny = shape[1] / hmin_width;
  if (ny * hmin_width < shape[1])
    ny++;
  nz = shape[2] / hmin_width;
  if (nz * hmin_width < shape[2])
    nz++;

  nxyz[0] = nx;
  nxyz[1] = ny;
  nxyz[2] = nz;

  return nx * ny * nz;
}
#endif
// distribute the data to cprocessors
#if (PSTR == 0)
MyList<Block> *Parallel::distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
                                    bool periodic, int nodes)
{
#ifdef USE_GPU_DIVIDE
  double cpu_part, gpu_part;
  map<string, double>::iterator iter;
  iter = parameters::dou_par.find("cpu part");
  if (iter != parameters::dou_par.end())
  {
    cpu_part = iter->second;
  }
  else
  {
    int myrank;
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    // read parameter from file
    const int LEN = 256;
    char pline[LEN];
    string str, sgrp, skey, sval;
    int sind;
    char pname[50];
    {
      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
      if (iter != parameters::str_par.end())
      {
        strcpy(pname, (iter->second).c_str());
      }
      else
      {
        cout << "Error inputpar" << endl;
        exit(0);
      }
    }
    ifstream inf(pname, ifstream::in);
    if (!inf.good() && myrank == 0)
    {
      cout << "Can not open parameter file " << pname << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }

    for (int i = 1; inf.good(); i++)
    {
      inf.getline(pline, LEN);
      str = pline;

      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
      if (status == -1)
      {
        cout << "error reading parameter file " << pname << " in line " << i << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
      else if (status == 0)
        continue;

      if (sgrp == "ABE")
      {
        if (skey == "cpu part")
          cpu_part = atof(sval.c_str());
      }
    }
    inf.close();

    parameters::dou_par.insert(map<string, double>::value_type("cpu part", cpu_part));
  }
  iter = parameters::dou_par.find("gpu part");
  if (iter != parameters::dou_par.end())
  {
    gpu_part = iter->second;
  }
  else
  {
    int myrank;
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    // read parameter from file
    const int LEN = 256;
    char pline[LEN];
    string str, sgrp, skey, sval;
    int sind;
    char pname[50];
    {
      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
      if (iter != parameters::str_par.end())
      {
        strcpy(pname, (iter->second).c_str());
      }
      else
      {
        cout << "Error inputpar" << endl;
        exit(0);
      }
    }
    ifstream inf(pname, ifstream::in);
    if (!inf.good() && myrank == 0)
    {
      cout << "Can not open parameter file " << pname << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }

    for (int i = 1; inf.good(); i++)
    {
      inf.getline(pline, LEN);
      str = pline;

      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
      if (status == -1)
      {
        cout << "error reading parameter file " << pname << " in line " << i << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
      else if (status == 0)
        continue;

      if (sgrp == "ABE")
      {
        if (skey == "gpu part")
          gpu_part = atof(sval.c_str());
      }
    }
    inf.close();

    parameters::dou_par.insert(map<string, double>::value_type("gpu part", gpu_part));
  }

  if (nodes == 0)
    nodes = cpusize / 2;
#else
  if (nodes == 0)
    nodes = cpusize;
#endif

  if (dim != 3)
  {
    cout << "distrivute: now we only support 3-dimension" << endl;
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  MyList<Block> *BlL = 0;

  int split_size, min_size, block_size = 0;

  int min_width = 2 * Mymax(ghost_width, buffer_width);
  int nxyz[dim], mmin_width[dim], min_shape[dim];

  MyList<Patch> *PLi = PatchLIST;
  for (int i = 0; i < dim; i++)
    min_shape[i] = PLi->data->shape[i];
  int lev = PLi->data->lev;
  PLi = PLi->next;
  while (PLi)
  {
    Patch *PP = PLi->data;
    for (int i = 0; i < dim; i++)
      min_shape[i] = Mymin(min_shape[i], PP->shape[i]);
    if (lev != PLi->data->lev)
      cout << "Parallel::distribute CAUSTION: meet Patches for different level: " << lev << " and " << PLi->data->lev << endl;
    PLi = PLi->next;
  }

  for (int i = 0; i < dim; i++)
    mmin_width[i] = Mymin(min_width, min_shape[i]);

  min_size = mmin_width[0];
  for (int i = 1; i < dim; i++)
    min_size = min_size * mmin_width[i];

  PLi = PatchLIST;
  while (PLi)
  {
    Patch *PP = PLi->data;
    //    PP->checkPatch(true);
    int bs = PP->shape[0];
    for (int i = 1; i < dim; i++)
      bs = bs * PP->shape[i];
    block_size = block_size + bs;
    PLi = PLi->next;
  }
  split_size = Mymax(min_size, block_size / nodes);
  split_size = Mymax(1, split_size);

  int n_rank = 0;
  PLi = PatchLIST;
  int reacpu = 0;
  while (PLi)
  {
    Patch *PP = PLi->data;

    reacpu += partition3(nxyz, split_size, mmin_width, nodes, PP->shape);

    Block *ng0, *ng;
    int shape_here[dim], ibbox_here[2 * dim];
    double bbox_here[2 * dim], dd;

    // ibbox : 0,...N-1
    for (int i = 0; i < nxyz[0]; i++)
      for (int j = 0; j < nxyz[1]; j++)
        for (int k = 0; k < nxyz[2]; k++)
        {
          ibbox_here[0] = (PP->shape[0] * i) / nxyz[0];
          ibbox_here[3] = (PP->shape[0] * (i + 1)) / nxyz[0] - 1;
          ibbox_here[1] = (PP->shape[1] * j) / nxyz[1];
          ibbox_here[4] = (PP->shape[1] * (j + 1)) / nxyz[1] - 1;
          ibbox_here[2] = (PP->shape[2] * k) / nxyz[2];
          ibbox_here[5] = (PP->shape[2] * (k + 1)) / nxyz[2] - 1;

          if (periodic)
          {
            ibbox_here[0] = ibbox_here[0] - ghost_width;
            ibbox_here[3] = ibbox_here[3] + ghost_width;
            ibbox_here[1] = ibbox_here[1] - ghost_width;
            ibbox_here[4] = ibbox_here[4] + ghost_width;
            ibbox_here[2] = ibbox_here[2] - ghost_width;
            ibbox_here[5] = ibbox_here[5] + ghost_width;
          }
          else
          {
            ibbox_here[0] = Mymax(0, ibbox_here[0] - ghost_width);
            ibbox_here[3] = Mymin(PP->shape[0] - 1, ibbox_here[3] + ghost_width);
            ibbox_here[1] = Mymax(0, ibbox_here[1] - ghost_width);
            ibbox_here[4] = Mymin(PP->shape[1] - 1, ibbox_here[4] + ghost_width);
            ibbox_here[2] = Mymax(0, ibbox_here[2] - ghost_width);
            ibbox_here[5] = Mymin(PP->shape[2] - 1, ibbox_here[5] + ghost_width);
          }

          shape_here[0] = ibbox_here[3] - ibbox_here[0] + 1;
          shape_here[1] = ibbox_here[4] - ibbox_here[1] + 1;
          shape_here[2] = ibbox_here[5] - ibbox_here[2] + 1;
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
          // 0--4, 5--10
          dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1);
          bbox_here[0] = PP->bbox[0] + ibbox_here[0] * dd;
          bbox_here[3] = PP->bbox[0] + ibbox_here[3] * dd;

          dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1);
          bbox_here[1] = PP->bbox[1] + ibbox_here[1] * dd;
          bbox_here[4] = PP->bbox[1] + ibbox_here[4] * dd;

          dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1);
          bbox_here[2] = PP->bbox[2] + ibbox_here[2] * dd;
          bbox_here[5] = PP->bbox[2] + ibbox_here[5] * dd;
#else
#ifdef Cell
          // 0--5, 5--10
          dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0];
          bbox_here[0] = PP->bbox[0] + (ibbox_here[0]) * dd;
          bbox_here[3] = PP->bbox[0] + (ibbox_here[3] + 1) * dd;

          dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1];
          bbox_here[1] = PP->bbox[1] + (ibbox_here[1]) * dd;
          bbox_here[4] = PP->bbox[1] + (ibbox_here[4] + 1) * dd;

          dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2];
          bbox_here[2] = PP->bbox[2] + (ibbox_here[2]) * dd;
          bbox_here[5] = PP->bbox[2] + (ibbox_here[5] + 1) * dd;
#else
#error Not define Vertex nor Cell
#endif
#endif

#ifdef USE_GPU_DIVIDE
          {
            const int pices = 2;
            double picef[pices];
            picef[0] = cpu_part;
            picef[1] = gpu_part;
            int shape_res[dim * pices];
            double bbox_res[2 * dim * pices];
            misc::dividBlock(dim, shape_here, bbox_here, pices, picef, shape_res, bbox_res, min_width);
            ng = ng0 = new Block(dim, shape_res, bbox_res, n_rank++, ingfsi, fngfsi, PP->lev, 0); // delete through KillBlocks

            //	       if(n_rank==cpusize) {n_rank=0; cerr<<"place one!!"<<endl;}

            //	       ng->checkBlock();
            if (BlL)
              BlL->insert(ng);
            else
              BlL = new MyList<Block>(ng); // delete through KillBlocks

            for (int i = 1; i < pices; i++)
            {
              ng = new Block(dim, shape_res + i * dim, bbox_res + i * 2 * dim, n_rank++, ingfsi, fngfsi, PP->lev, i); // delete through KillBlocks
              //	        if(n_rank==cpusize) {n_rank=0; cerr<<"place two!! "<<i<<endl;}
              //	        ng->checkBlock();
              BlL->insert(ng);
            }
          }
#else
          ng = ng0 = new Block(dim, shape_here, bbox_here, n_rank++, ingfsi, fngfsi, PP->lev); // delete through KillBlocks
          //	    ng->checkBlock();
          if (BlL)
            BlL->insert(ng);
          else
            BlL = new MyList<Block>(ng); // delete through KillBlocks
#endif
          if (n_rank == cpusize)
            n_rank = 0;

          // set PP->blb
          if (i == 0 && j == 0 && k == 0)
          {
            MyList<Block> *Bp = BlL;
            while (Bp->data != ng0)
              Bp = Bp->next; // ng0 is the first of the pices list
            PP->blb = Bp;
          }
        }
    // set PP->ble
    {
      MyList<Block> *Bp = BlL;
      while (Bp->data != ng)
        Bp = Bp->next; // ng is the last of the pices list
      PP->ble = Bp;
    }
    PLi = PLi->next;
  }
  if (reacpu < nodes * 2 / 3)
  {
    int myrank;
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    if (myrank == 0)
      cout << "Parallel::distribute CAUSTION: level#" << lev << " uses essencially " << reacpu << " processors vs " << nodes << " nodes run, your scientific computation scale is not as large as you estimate." << endl;
  }

  return BlL;
}
MyList<Block> *Parallel::distribute_hard(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
                                    bool periodic, int nodes)
{
#ifdef USE_GPU_DIVIDE
  double cpu_part, gpu_part;
  map<string, double>::iterator iter;
  iter = parameters::dou_par.find("cpu part");
  if (iter != parameters::dou_par.end())
  {
    cpu_part = iter->second;
  }
  else
  {
    int myrank;
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    // read parameter from file
    const int LEN = 256;
    char pline[LEN];
    string str, sgrp, skey, sval;
    int sind;
    char pname[50];
    {
      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
      if (iter != parameters::str_par.end())
      {
        strcpy(pname, (iter->second).c_str());
      }
      else
      {
        cout << "Error inputpar" << endl;
        exit(0);
      }
    }
    ifstream inf(pname, ifstream::in);
    if (!inf.good() && myrank == 0)
    {
      cout << "Can not open parameter file " << pname << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }

    for (int i = 1; inf.good(); i++)
    {
      inf.getline(pline, LEN);
      str = pline;

      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
      if (status == -1)
      {
        cout << "error reading parameter file " << pname << " in line " << i << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
      else if (status == 0)
        continue;

      if (sgrp == "ABE")
      {
        if (skey == "cpu part")
          cpu_part = atof(sval.c_str());
      }
    }
    inf.close();

    parameters::dou_par.insert(map<string, double>::value_type("cpu part", cpu_part));
  }
  iter = parameters::dou_par.find("gpu part");
  if (iter != parameters::dou_par.end())
  {
    gpu_part = iter->second;
  }
  else
  {
    int myrank;
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    // read parameter from file
    const int LEN = 256;
    char pline[LEN];
    string str, sgrp, skey, sval;
    int sind;
    char pname[50];
    {
      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
      if (iter != parameters::str_par.end())
      {
        strcpy(pname, (iter->second).c_str());
      }
      else
      {
        cout << "Error inputpar" << endl;
        exit(0);
      }
    }
    ifstream inf(pname, ifstream::in);
    if (!inf.good() && myrank == 0)
    {
      cout << "Can not open parameter file " << pname << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }

    for (int i = 1; inf.good(); i++)
    {
      inf.getline(pline, LEN);
      str = pline;

      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
      if (status == -1)
      {
        cout << "error reading parameter file " << pname << " in line " << i << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
      else if (status == 0)
        continue;

      if (sgrp == "ABE")
      {
        if (skey == "gpu part")
          gpu_part = atof(sval.c_str());
      }
    }
    inf.close();

    parameters::dou_par.insert(map<string, double>::value_type("gpu part", gpu_part));
  }

  if (nodes == 0)
    nodes = cpusize / 2;
#else
  if (nodes == 0)
    nodes = cpusize;
#endif

  if (dim != 3)
  {
    cout << "distrivute: now we only support 3-dimension" << endl;
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  MyList<Block> *BlL = 0;
  int split_size, min_size, block_size = 0;

  int min_width = 2 * Mymax(ghost_width, buffer_width);
  int nxyz[dim], mmin_width[dim], min_shape[dim];

  MyList<Patch> *PLi = PatchLIST;
  for (int i = 0; i < dim; i++)
    min_shape[i] = PLi->data->shape[i];
  int lev = PLi->data->lev;
  PLi = PLi->next;
  while (PLi)
  {
    Patch *PP = PLi->data;
    for (int i = 0; i < dim; i++)
      min_shape[i] = Mymin(min_shape[i], PP->shape[i]);
    if (lev != PLi->data->lev)
      cout << "Parallel::distribute CAUSTION: meet Patches for different level: " << lev << " and " << PLi->data->lev << endl;
    PLi = PLi->next;
  }

  for (int i = 0; i < dim; i++)
    mmin_width[i] = Mymin(min_width, min_shape[i]);

  min_size = mmin_width[0];
  for (int i = 1; i < dim; i++)
    min_size = min_size * mmin_width[i];

  PLi = PatchLIST;
  while (PLi)
  {
    Patch *PP = PLi->data;
    //    PP->checkPatch(true);
    int bs = PP->shape[0];
    for (int i = 1; i < dim; i++)
      bs = bs * PP->shape[i];
    block_size = block_size + bs;
    PLi = PLi->next;
  }
  split_size = Mymax(min_size, block_size / nodes);
  split_size = Mymax(1, split_size);

  int n_rank = 0;
  PLi = PatchLIST;
  int reacpu = 0;
  int current_block_id = 0;
  while (PLi) {
    Block *ng0, *ng;
    bool first_block_in_patch = true;
    Patch *PP = PLi->data;
    reacpu += partition3(nxyz, split_size, mmin_width, nodes, PP->shape);

    for (int i = 0; i < nxyz[0]; i++)
    for (int j = 0; j < nxyz[1]; j++)
    for (int k = 0; k < nxyz[2]; k++)
    {
        // --- 1. 定义局部变量 ---
        int ibbox_here[6], shape_here[3];
        double bbox_here[6], dd;
        Block *current_ng_start = nullptr; // 本次循环产生的第一个(或唯一一个)块

        // --- 2. 核心逻辑分支 ---
        if (current_block_id == 27 || current_block_id == 28 ||
            current_block_id == 35 || current_block_id == 36)
        {
            // A. 计算原始索引 (不带 Ghost)
            int ib0 = (PP->shape[0] * i) / nxyz[0];
            int ib3 = (PP->shape[0] * (i + 1)) / nxyz[0] - 1;
            int jb1 = (PP->shape[1] * j) / nxyz[1];
            int jb4 = (PP->shape[1] * (j + 1)) / nxyz[1] - 1;
            int kb2 = (PP->shape[2] * k) / nxyz[2];
            int kb5 = (PP->shape[2] * (k + 1)) / nxyz[2] - 1;

            int r_l, r_r;
            if(current_block_id == 27)      { r_l = 26; r_r = 27; }
            else if(current_block_id == 28) { r_l = 28; r_r = 29; }
            else if(current_block_id == 35) { r_l = 34; r_r = 35; }
            else                            { r_l = 36; r_r = 37; }
            Block * split_first_block = nullptr;
            Block * split_last_block = nullptr;
            // 拆分逻辑：该函数应更新类成员变量 split_first_block 和 split_last_block
            splitHotspotBlock(BlL, dim, ib0, ib3, jb1, jb4, kb2, kb5,
                              PP, r_l, r_r, ingfsi, fngfsi, periodic,split_first_block,split_last_block);

            current_ng_start = split_first_block;
            ng = split_last_block;
        }
        else
        {
            // B. 普通块逻辑 (含 Ghost 扩张)
            ibbox_here[0] = (PP->shape[0] * i) / nxyz[0];
            ibbox_here[3] = (PP->shape[0] * (i + 1)) / nxyz[0] - 1;
            ibbox_here[1] = (PP->shape[1] * j) / nxyz[1];
            ibbox_here[4] = (PP->shape[1] * (j + 1)) / nxyz[1] - 1;
            ibbox_here[2] = (PP->shape[2] * k) / nxyz[2];
            ibbox_here[5] = (PP->shape[2] * (k + 1)) / nxyz[2] - 1;

            if (periodic) {
                for(int d=0; d<3; d++) {
                    ibbox_here[d] -= ghost_width;
                    ibbox_here[d+3] += ghost_width;
                }
            } else {
                ibbox_here[0] = Mymax(0, ibbox_here[0] - ghost_width);
                ibbox_here[3] = Mymin(PP->shape[0] - 1, ibbox_here[3] + ghost_width);
                ibbox_here[1] = Mymax(0, ibbox_here[1] - ghost_width);
                ibbox_here[4] = Mymin(PP->shape[1] - 1, ibbox_here[4] + ghost_width);
                ibbox_here[2] = Mymax(0, ibbox_here[2] - ghost_width);
                ibbox_here[5] = Mymin(PP->shape[2] - 1, ibbox_here[5] + ghost_width);
            }

            for(int d=0; d<3; d++) shape_here[d] = ibbox_here[d+3] - ibbox_here[d] + 1;

            // 物理坐标计算 (根据你的宏定义 Cell/Vertex)
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
          // 0--4, 5--10
          dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1);
          bbox_here[0] = PP->bbox[0] + ibbox_here[0] * dd;
          bbox_here[3] = PP->bbox[0] + ibbox_here[3] * dd;

          dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1);
          bbox_here[1] = PP->bbox[1] + ibbox_here[1] * dd;
          bbox_here[4] = PP->bbox[1] + ibbox_here[4] * dd;

          dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1);
          bbox_here[2] = PP->bbox[2] + ibbox_here[2] * dd;
          bbox_here[5] = PP->bbox[2] + ibbox_here[5] * dd;
#else
#ifdef Cell
          // 0--5, 5--10
          dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0];
          bbox_here[0] = PP->bbox[0] + (ibbox_here[0]) * dd;
          bbox_here[3] = PP->bbox[0] + (ibbox_here[3] + 1) * dd;

          dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1];
          bbox_here[1] = PP->bbox[1] + (ibbox_here[1]) * dd;
          bbox_here[4] = PP->bbox[1] + (ibbox_here[4] + 1) * dd;

          dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2];
          bbox_here[2] = PP->bbox[2] + (ibbox_here[2]) * dd;
          bbox_here[5] = PP->bbox[2] + (ibbox_here[5] + 1) * dd;
#else
#error Not define Vertex nor Cell
#endif
#endif
            ng = createMappedBlock(BlL, dim, shape_here, bbox_here, current_block_id, ingfsi, fngfsi, PP->lev);
            current_ng_start = ng;
        }

        // --- 3. 统一处理 Patch 起始 Block 指针 ---
        if (first_block_in_patch) {
            ng0 = current_ng_start;

            // 立即设置 PP->blb，避免后续循环覆盖 ng0
            MyList<Block> *Bp_start = BlL;
            while (Bp_start && Bp_start->data != ng0) Bp_start = Bp_start->next;
            PP->blb = Bp_start;

            first_block_in_patch = false;
        }

        current_block_id++;
    }

    // --- 4. 设置 Patch 结束 Block 指针 ---
    MyList<Block> *Bp_end = BlL;
    while (Bp_end && Bp_end->data != ng) Bp_end = Bp_end->next;
    PP->ble = Bp_end;

    PLi = PLi->next;
    first_block_in_patch = true;
  }
  if (reacpu < nodes * 2 / 3)
  {
    int myrank;
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    if (myrank == 0)
      cout << "Parallel::distribute CAUSTION: level#" << lev << " uses essencially " << reacpu << " processors vs " << nodes << " nodes run, your scientific computation scale is not as large as you estimate." << endl;
  }

  return BlL;
}

 /**
 * @brief 将当前 Block 几何二等分并存入列表
 * @param axis 拆分轴：0-x, 1-y, 2-z (建议选最长轴)
 */
Block* Parallel::splitHotspotBlock(MyList<Block>* &BlL, int _dim,
                                 int ib0_orig, int ib3_orig,
                                 int jb1_orig, int jb4_orig,
                                 int kb2_orig, int kb5_orig,
                                 Patch* PP, int r_left, int r_right,
                                 int ingfsi, int fngfsi, bool periodic,
                                 Block* &split_first_block, Block* &split_last_block)
{
    // 1. 索引二分 (基于无 ghost 的原始索引)
    int mid = (ib0_orig + ib3_orig) / 2;

    // 左块原始索引: [ib0, mid], 右块原始索引: [mid+1, ib3]
    int indices_L[6] = {ib0_orig, jb1_orig, kb2_orig, mid, jb4_orig, kb5_orig};
    int indices_R[6] = {mid + 1, jb1_orig, kb2_orig, ib3_orig, jb4_orig, kb5_orig};

    // 2. 内部处理逻辑 (复刻原 distribute 逻辑)
    auto createSubBlock = [&](int* ib_raw, int target_rank) {
        int ib_final[6];
        int sh_here[3];
        double bb_here[6], dd;

        // --- 逻辑 A: Ghost 扩张 ---
        if (periodic) {
            ib_final[0] = ib_raw[0] - ghost_width;
            ib_final[3] = ib_raw[3] + ghost_width;
            ib_final[1] = ib_raw[1] - ghost_width;
            ib_final[4] = ib_raw[4] + ghost_width;
            ib_final[2] = ib_raw[2] - ghost_width;
            ib_final[5] = ib_raw[5] + ghost_width;
        } else {
            ib_final[0] = Mymax(0, ib_raw[0] - ghost_width);
            ib_final[3] = Mymin(PP->shape[0] - 1, ib_raw[3] + ghost_width);
            ib_final[1] = Mymax(0, ib_raw[1] - ghost_width);
            ib_final[4] = Mymin(PP->shape[1] - 1, ib_raw[4] + ghost_width);
            ib_final[2] = Mymax(0, ib_raw[2] - ghost_width);
            ib_final[5] = Mymin(PP->shape[2] - 1, ib_raw[5] + ghost_width);
        }

        sh_here[0] = ib_final[3] - ib_final[0] + 1;
        sh_here[1] = ib_final[4] - ib_final[1] + 1;
        sh_here[2] = ib_final[5] - ib_final[2] + 1;

        // --- 逻辑 B: 物理坐标计算 (严格匹配 Cell 模式) ---
        // X 方向
        dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0];
        bb_here[0] = PP->bbox[0] + ib_final[0] * dd;
        bb_here[3] = PP->bbox[0] + (ib_final[3] + 1) * dd;

        // Y 方向
        dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1];
        bb_here[1] = PP->bbox[1] + ib_final[1] * dd;
        bb_here[4] = PP->bbox[1] + (ib_final[4] + 1) * dd;

        // Z 方向
        dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2];
        bb_here[2] = PP->bbox[2] + ib_final[2] * dd;
        bb_here[5] = PP->bbox[2] + (ib_final[5] + 1) * dd;

        Block* Bg = new Block(dim, sh_here, bb_here, target_rank, ingfsi, fngfsi, PP->lev);
        if (BlL) BlL->insert(Bg);
        else     BlL = new MyList<Block>(Bg);


        return Bg;
    };

    // 执行创建
    split_first_block = createSubBlock(indices_L, r_left);
    split_last_block  = createSubBlock(indices_R, r_right);
}


/**
 * @brief 创建映射后的 Block
 */
  Block* Parallel::createMappedBlock(MyList<Block>* &BlL, int _dim, int* shape, double* bbox,
                        int block_id, int ingfsi, int fngfsi, int lev)
  {
      // 映射表逻辑
      int target_rank = block_id;
      if (block_id == 26)      target_rank = 25;
      else if (block_id == 29) target_rank = 30;
      else if (block_id == 34) target_rank = 33;
      else if (block_id == 37) target_rank = 38;

      Block* ng = new Block(dim, shape, bbox, target_rank, ingfsi, fngfsi, lev);

      if (BlL) BlL->insert(ng);
      else     BlL = new MyList<Block>(ng);

      return ng;
  }


#elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
MyList<Block> *Parallel::distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
                                    bool periodic, int start_rank, int end_rank, int nodes)
{
#ifdef USE_GPU_DIVIDE
  double cpu_part, gpu_part;
  map<string, double>::iterator iter;
  iter = parameters::dou_par.find("cpu part");
  if (iter != parameters::dou_par.end())
  {
    cpu_part = iter->second;
  }
  else
  {
    int myrank;
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    // read parameter from file
    const int LEN = 256;
    char pline[LEN];
    string str, sgrp, skey, sval;
    int sind;
    char pname[50];
    {
      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
      if (iter != parameters::str_par.end())
      {
        strcpy(pname, (iter->second).c_str());
      }
      else
      {
        cout << "Error inputpar" << endl;
        exit(0);
      }
    }
    ifstream inf(pname, ifstream::in);
    if (!inf.good() && myrank == 0)
    {
      cout << "Can not open parameter file " << pname << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }

    for (int i = 1; inf.good(); i++)
    {
      inf.getline(pline, LEN);
      str = pline;

      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
      if (status == -1)
      {
        cout << "error reading parameter file " << pname << " in line " << i << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
      else if (status == 0)
        continue;

      if (sgrp == "ABE")
      {
        if (skey == "cpu part")
          cpu_part = atof(sval.c_str());
      }
    }
    inf.close();

    parameters::dou_par.insert(map<string, double>::value_type("cpu part", cpu_part));
  }
  iter = parameters::dou_par.find("gpu part");
  if (iter != parameters::dou_par.end())
  {
    gpu_part = iter->second;
  }
  else
  {
    int myrank;
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    // read parameter from file
    const int LEN = 256;
    char pline[LEN];
    string str, sgrp, skey, sval;
    int sind;
    char pname[50];
    {
      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
      if (iter != parameters::str_par.end())
      {
        strcpy(pname, (iter->second).c_str());
      }
      else
      {
        cout << "Error inputpar" << endl;
        exit(0);
      }
    }
    ifstream inf(pname, ifstream::in);
    if (!inf.good() && myrank == 0)
    {
      cout << "Can not open parameter file " << pname << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }

    for (int i = 1; inf.good(); i++)
    {
      inf.getline(pline, LEN);
      str = pline;

      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
      if (status == -1)
      {
        cout << "error reading parameter file " << pname << " in line " << i << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
      else if (status == 0)
        continue;

      if (sgrp == "ABE")
      {
        if (skey == "gpu part")
          gpu_part = atof(sval.c_str());
      }
    }
    inf.close();

    parameters::dou_par.insert(map<string, double>::value_type("gpu part", gpu_part));
  }

  if (nodes == 0)
    nodes = cpusize / 2;
#else
  if (nodes == 0)
    nodes = cpusize;
#endif

  if (dim != 3)
  {
    cout << "distrivute: now we only support 3-dimension" << endl;
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  MyList<Block> *BlL = 0;

  int split_size, min_size, block_size = 0;

  int min_width = 2 * Mymax(ghost_width, buffer_width);
  int nxyz[dim], mmin_width[dim], min_shape[dim];

  MyList<Patch> *PLi = PatchLIST;
  for (int i = 0; i < dim; i++)
    min_shape[i] = PLi->data->shape[i];
  int lev = PLi->data->lev;
  PLi = PLi->next;
  while (PLi)
  {
    Patch *PP = PLi->data;
    for (int i = 0; i < dim; i++)
      min_shape[i] = Mymin(min_shape[i], PP->shape[i]);
    if (lev != PLi->data->lev)
      cout << "Parallel::distribute CAUSTION: meet Patches for different level: " << lev << " and " << PLi->data->lev << endl;
    PLi = PLi->next;
  }

  for (int i = 0; i < dim; i++)
    mmin_width[i] = Mymin(min_width, min_shape[i]);

  min_size = mmin_width[0];
  for (int i = 1; i < dim; i++)
    min_size = min_size * mmin_width[i];

  PLi = PatchLIST;
  while (PLi)
  {
    Patch *PP = PLi->data;
    //    PP->checkPatch(true);
    int bs = PP->shape[0];
    for (int i = 1; i < dim; i++)
      bs = bs * PP->shape[i];
    block_size = block_size + bs;
    PLi = PLi->next;
  }
  split_size = Mymax(min_size, block_size / cpusize);
  split_size = Mymax(1, split_size);

  int n_rank = start_rank;
  PLi = PatchLIST;
  int reacpu = 0;
  while (PLi)
  {
    Patch *PP = PLi->data;

    reacpu += partition3(nxyz, split_size, mmin_width, cpusize, PP->shape);

    Block *ng, *ng0;
    int shape_here[dim], ibbox_here[2 * dim];
    double bbox_here[2 * dim], dd;

    // ibbox : 0,...N-1
    for (int i = 0; i < nxyz[0]; i++)
      for (int j = 0; j < nxyz[1]; j++)
        for (int k = 0; k < nxyz[2]; k++)
        {
          ibbox_here[0] = (PP->shape[0] * i) / nxyz[0];
          ibbox_here[3] = (PP->shape[0] * (i + 1)) / nxyz[0] - 1;
          ibbox_here[1] = (PP->shape[1] * j) / nxyz[1];
          ibbox_here[4] = (PP->shape[1] * (j + 1)) / nxyz[1] - 1;
          ibbox_here[2] = (PP->shape[2] * k) / nxyz[2];
          ibbox_here[5] = (PP->shape[2] * (k + 1)) / nxyz[2] - 1;

          if (periodic)
          {
            ibbox_here[0] = ibbox_here[0] - ghost_width;
            ibbox_here[3] = ibbox_here[3] + ghost_width;
            ibbox_here[1] = ibbox_here[1] - ghost_width;
            ibbox_here[4] = ibbox_here[4] + ghost_width;
            ibbox_here[2] = ibbox_here[2] - ghost_width;
            ibbox_here[5] = ibbox_here[5] + ghost_width;
          }
          else
          {
            ibbox_here[0] = Mymax(0, ibbox_here[0] - ghost_width);
            ibbox_here[3] = Mymin(PP->shape[0] - 1, ibbox_here[3] + ghost_width);
            ibbox_here[1] = Mymax(0, ibbox_here[1] - ghost_width);
            ibbox_here[4] = Mymin(PP->shape[1] - 1, ibbox_here[4] + ghost_width);
            ibbox_here[2] = Mymax(0, ibbox_here[2] - ghost_width);
            ibbox_here[5] = Mymin(PP->shape[2] - 1, ibbox_here[5] + ghost_width);
          }

          shape_here[0] = ibbox_here[3] - ibbox_here[0] + 1;
          shape_here[1] = ibbox_here[4] - ibbox_here[1] + 1;
          shape_here[2] = ibbox_here[5] - ibbox_here[2] + 1;
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
          // 0--4, 5--10
          dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1);
          bbox_here[0] = PP->bbox[0] + ibbox_here[0] * dd;
          bbox_here[3] = PP->bbox[0] + ibbox_here[3] * dd;

          dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1);
          bbox_here[1] = PP->bbox[1] + ibbox_here[1] * dd;
          bbox_here[4] = PP->bbox[1] + ibbox_here[4] * dd;

          dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1);
          bbox_here[2] = PP->bbox[2] + ibbox_here[2] * dd;
          bbox_here[5] = PP->bbox[2] + ibbox_here[5] * dd;
#else
#ifdef Cell
          // 0--5, 5--10
          dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0];
          bbox_here[0] = PP->bbox[0] + (ibbox_here[0]) * dd;
          bbox_here[3] = PP->bbox[0] + (ibbox_here[3] + 1) * dd;

          dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1];
          bbox_here[1] = PP->bbox[1] + (ibbox_here[1]) * dd;
          bbox_here[4] = PP->bbox[1] + (ibbox_here[4] + 1) * dd;

          dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2];
          bbox_here[2] = PP->bbox[2] + (ibbox_here[2]) * dd;
          bbox_here[5] = PP->bbox[2] + (ibbox_here[5] + 1) * dd;
#else
#error Not define Vertex nor Cell
#endif
#endif

#ifdef USE_GPU_DIVIDE
          {
            const int pices = 2;
            double picef[pices];
            picef[0] = cpu_part;
            picef[1] = gpu_part;
            int shape_res[dim * pices];
            double bbox_res[2 * dim * pices];
            misc::dividBlock(dim, shape_here, bbox_here, pices, picef, shape_res, bbox_res, min_width);
            ng = ng0 = new Block(dim, shape_res, bbox_res, n_rank++, ingfsi, fngfsi, PP->lev, 0); // delete through KillBlocks
            //	       ng->checkBlock();
            if (BlL)
              BlL->insert(ng);
            else
              BlL = new MyList<Block>(ng); // delete through KillBlocks

            for (int i = 1; i < pices; i++)
            {
              ng = new Block(dim, shape_res + i * dim, bbox_res + i * 2 * dim, n_rank++, ingfsi, fngfsi, PP->lev, i); // delete through KillBlocks
              //	        ng->checkBlock();
              BlL->insert(ng);
            }
          }
#else
          ng = ng0 = new Block(dim, shape_here, bbox_here, n_rank++, ingfsi, fngfsi, PP->lev); // delete through KillBlocks
          //	    ng->checkBlock();
          if (BlL)
            BlL->insert(ng);
          else
            BlL = new MyList<Block>(ng); // delete through KillBlocks
#endif

          if (n_rank == end_rank + 1)
            n_rank = start_rank;

          // set PP->blb
          if (i == 0 && j == 0 && k == 0)
          {
            MyList<Block> *Bp = BlL;
            while (Bp->data != ng0)
              Bp = Bp->next; // ng0 is the first of the pices list
            PP->blb = Bp;
          }
        }
    // set PP->ble
    {
      MyList<Block> *Bp = BlL;
      while (Bp->data != ng)
        Bp = Bp->next; // ng is the last of the pices list
      PP->ble = Bp;
    }
    PLi = PLi->next;
  }
  if (reacpu < nodes * 2 / 3)
  {
    int myrank;
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    if (myrank == start_rank)
      cout << "Parallel::distribute CAUSTION: level#" << lev << " uses essencially " << reacpu << " processors vs " << nodes << " nodes run, your scientific computation scale is not as large as you estimate." << endl;
  }

  return BlL;
}
#endif
void Parallel::setfunction(MyList<Block> *BlL, var *vn, double func(double x, double y, double z))
{
  while (BlL)
  {
    if (BlL->data->X[0])
    {
      int nn = BlL->data->shape[0] * BlL->data->shape[1] * BlL->data->shape[2];
      double *p = BlL->data->fgfs[vn->sgfn];
      for (int i = 0; i < nn; i++)
      {
        int ind[3];
        getarrayindex(3, BlL->data->shape, ind, i);
        p[i] = func(BlL->data->X[0][ind[0]], BlL->data->X[1][ind[1]], BlL->data->X[2][ind[2]]);
      }
    }
    BlL = BlL->next;
  }
}
// set function only for cpu rank
void Parallel::setfunction(int rank, MyList<Block> *BlL, var *vn, double func(double x, double y, double z))
{
  while (BlL)
  {
    if (BlL->data->X[0] && BlL->data->rank == rank)
    {
      int nn = BlL->data->shape[0] * BlL->data->shape[1] * BlL->data->shape[2];
      double *p = BlL->data->fgfs[vn->sgfn];
      for (int i = 0; i < nn; i++)
      {
        int ind[3];
        getarrayindex(3, BlL->data->shape, ind, i);
        p[i] = func(BlL->data->X[0][ind[0]], BlL->data->X[1][ind[1]], BlL->data->X[2][ind[2]]);
      }
    }
    BlL = BlL->next;
  }
}
void Parallel::getarrayindex(int DIM, int *shape, int *index, int n)
{
  // we assume index has already memory space
  int *mu;
  mu = new int[DIM];
  mu[0] = 1;
  for (int i = 1; i < DIM; i++)
    mu[i] = mu[i - 1] * shape[i - 1];
  for (int i = DIM - 1; i >= 0; i--)
  {
    index[i] = n / mu[i];
    n = n - index[i] * mu[i];
  }

  delete[] mu;
}
int Parallel::getarraylocation(int DIM, int *shape, int *index)
{
  int n, mu;
  mu = shape[0];
  n = index[0];
  for (int i = 1; i < DIM; i++)
  {
    n = n + index[i] * mu;
    mu = mu * shape[i];
  }

  return n;
}
void Parallel::copy(int DIM, double *llbout, double *uubout, int *Dshape, double *DD, double *llbin, double *uubin,
                    int *shape, double *datain, double *llb, double *uub)
{
  // for 3 dimensional case, based on simple test, I found this is half slower than f90 code
  int *illi, *iuui;
  int *illo, *iuuo;
  int *indi, *indo;
  illi = new int[DIM];
  iuui = new int[DIM];
  illo = new int[DIM];
  iuuo = new int[DIM];
  indi = new int[DIM];
  indo = new int[DIM];

  int ial = 1;
  for (int i = 0; i < DIM; i++)
  {
    double ho, hi;
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
    ho = (uubout[i] - llbout[i]) / (Dshape[i] - 1);
    hi = (uubin[i] - llbin[i]) / (shape[i] - 1);
#else
#ifdef Cell
    ho = (uubout[i] - llbout[i]) / Dshape[i];
    hi = (uubin[i] - llbin[i]) / shape[i];
#else
#error Not define Vertex nor Cell
#endif
#endif
    illo[i] = int((llb[i] - llbout[i]) / ho);
    iuuo[i] = Dshape[i] - 1 - int((uubout[i] - uub[i]) / ho);
    illi[i] = int((llb[i] - llbin[i]) / hi);
    iuui[i] = shape[i] - 1 - int((uubin[i] - uub[i]) / hi);

    if (illo[i] > iuuo[i] || illi[i] > iuui[i] || illo[i] < 0 || illi[i] < 0 ||
        iuui[i] >= shape[i] || iuuo[i] >= Dshape[i])
    {
      cout << "Parallel copy: in direction " << i << ":" << endl;
      cout << "llb = " << llb[i] << ", uub = " << uub[i] << endl;
      cout << " in data : il = " << illi[i] << ", iu = " << iuui[i] << endl;
      cout << "bbox = (" << llbin[i] << "," << uubin[i] << ")" << endl;
      cout << "shape = " << shape[i] << endl;
      cout << "out data : il = " << illo[i] << ", iu = " << iuuo[i] << endl;
      cout << "bbox = (" << llbout[i] << "," << uubout[i] << ")" << endl;
      cout << "shape = " << Dshape[i] << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
    int ihi = iuui[i] - illi[i] + 1, iho = iuuo[i] - illo[i] + 1;
    if (!(feq(ho, hi, ho / 2)) || ihi != iho)
    {
      cout << "Parallel copy: in direction " << i << ":" << endl;
      cout << "Parallel copy: not the same grid structure." << endl;
      cout << "hi = " << hi << ", bbox = (" << llbin[i] << "," << uubin[i] << "), shape = " << shape[i] << endl;
      cout << "ho = " << ho << ", bbox = (" << llbout[i] << "," << uubout[i] << "), shape = " << Dshape[i] << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
    ial = ial * ihi;
  }

  for (int i = 0; i < DIM; i++)
  {
    indi[i] = illi[i];
    indo[i] = illo[i];
  }
  /*
  //check start index
     for(int i=0;i<DIM;i++)
     {
       cout << "Parallel copy: in direction " <<i<<":"<< endl;
       cout<<"start : indi = "<<indi[i]<<", indo = "<<indo[i]<<endl;
     }
  */
  int NNi = 1, NNo = 1;
  for (int i = 0; i < DIM; i++)
  {
    NNi = NNi * shape[i];
    NNo = NNo * Dshape[i];
  }
  for (int i = 0; i < ial; i++)
  {
    int ni, no;
    ni = getarraylocation(DIM, shape, indi);
    no = getarraylocation(DIM, Dshape, indo);
    if (no < 0 || no > NNo)
    {
      cout << "Parallel copy: no = " << no << " is out of array range (0," << NNo << ")." << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
    if (ni < 0 || ni > NNi)
    {
      cout << "Parallel copy: ni = " << ni << " is out of array range (0," << NNi << ")." << endl;
      cout << "shape = (";
      for (int j = 0; j < DIM; j++)
      {
        cout << shape[j];
        if (j < DIM - 1)
          cout << ",";
        else
          cout << ")" << endl;
      }
      cout << "ind = (";
      for (int j = 0; j < DIM; j++)
      {
        cout << indi[j];
        if (j < DIM - 1)
          cout << ",";
        else
          cout << ")" << endl;
      }
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
    DD[no] = datain[ni];

    indi[0]++;
    for (int j = 1; j < DIM; j++)
    {
      if (indi[j - 1] == iuui[j - 1] + 1)
      {
        indi[j - 1] = illi[j - 1];
        indi[j]++;
      } // carry 1 to next digital
      else
        break;
    }
    indo[0]++;
    for (int j = 1; j < DIM; j++)
    {
      if (indo[j - 1] == iuuo[j - 1] + 1)
      {
        indo[j - 1] = illo[j - 1];
        indo[j]++;
      }
      else
        break;
    }
  }
  /*
  //check final index
     for(int i=0;i<DIM;i++)
     {
       cout << "Parallel copy: in direction " <<i<<":"<< endl;
       cout<<"final : indi = "<<indi[i]<<", indo = "<<indo[i]<<endl;
     }
  */
  delete[] illi;
  delete[] iuui;
  delete[] illo;
  delete[] iuuo;
  delete[] indi;
  delete[] indo;
}
void Parallel::writefile(double time, int nx, int ny, int nz, double xmin, double xmax, double ymin, double ymax,
                         double zmin, double zmax, char *filename, double *data_out)
{
  ofstream outfile;
  outfile.open(filename, ios::out | ios::trunc);
  if (!outfile)
  {
    cout << "Can't open " << filename << " for output." << endl;
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
  outfile.write((char *)&time, sizeof(double));
  outfile.write((char *)&nx, sizeof(int));
  outfile.write((char *)&ny, sizeof(int));
  outfile.write((char *)&nz, sizeof(int));
  outfile.write((char *)&xmin, sizeof(double));
  outfile.write((char *)&xmax, sizeof(double));
  outfile.write((char *)&ymin, sizeof(double));
  outfile.write((char *)&ymax, sizeof(double));
  outfile.write((char *)&zmin, sizeof(double));
  outfile.write((char *)&zmax, sizeof(double));
  outfile.write((char *)data_out, nx * ny * nz * sizeof(double));
  outfile.close();
}
void Parallel::writefile(double time, int nx, int ny, double xmin, double xmax, double ymin, double ymax,
                         char *filename, double *datain)
{
  int i, j;
  double *X, *Y;
  X = new double[nx];
  Y = new double[ny];
  double dd;
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
  dd = (xmax - xmin) / (nx - 1);
  for (i = 0; i < nx; i++)
    X[i] = xmin + i * dd;
  dd = (ymax - ymin) / (ny - 1);
  for (j = 0; j < ny; j++)
    Y[j] = ymin + j * dd;
#else
#ifdef Cell
  dd = (xmax - xmin) / nx;
  for (i = 0; i < nx; i++)
    X[i] = xmin + (i + 0.5) * dd;
  dd = (ymax - ymin) / ny;
  for (j = 0; j < ny; j++)
    Y[j] = ymin + (j + 0.5) * dd;
#else
#error Not define Vertex nor Cell
#endif
#endif
  ofstream outfile;
  outfile.open(filename, ios::out | ios::trunc);
  if (!outfile)
  {
    cout << "Can't open " << filename << " for output." << endl;
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
  outfile << "# t = " << time << endl;
  for (j = 0; j < ny; j++)
  {
    for (i = 0; i < nx; i++)
    {
      int ind1 = i + j * nx;
      outfile << setw(10) << setprecision(10) << X[i] << " "
              << setw(10) << setprecision(10) << Y[j] << " "
              << setw(16) << setprecision(15) << datain[ind1]
              << endl;
    }
    outfile << "\n"; /* blanck line for gnuplot */
  }
  outfile.close();

  delete[] X;
  delete[] Y;
}
void Parallel::Dump_CPU_Data(MyList<Block> *BlL, MyList<var> *DumpList, char *tag, double time, double dT)
{
  int myrank;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  // round at 4 and 5
  int ncount = int(time / dT + 0.5);

  MyList<Block> *Bp;
  while (DumpList)
  {
    Bp = BlL;
    int Bi = 0;
    while (Bp)
    {
      Block *BP = Bp->data;
      var *VP = DumpList->data;
      if (BP->rank == myrank)
      {

        string out_dir;
        map<string, string>::iterator iter;
        iter = parameters::str_par.find("output dir");
        if (iter != parameters::str_par.end())
        {
          out_dir = iter->second;
        }
        else
        {
          // read parameter from file
          const int LEN = 256;
          char pline[LEN];
          string str, sgrp, skey, sval;
          int sind;
          char pname[50];
          {
            map<string, string>::iterator iter = parameters::str_par.find("inputpar");
            if (iter != parameters::str_par.end())
            {
              strcpy(pname, (iter->second).c_str());
            }
            else
            {
              cout << "Error inputpar" << endl;
              exit(0);
            }
          }
          ifstream inf(pname, ifstream::in);
          if (!inf.good())
          {
            cout << "Can not open parameter file " << pname << endl;
            MPI_Abort(MPI_COMM_WORLD, 1);
          }

          for (int i = 1; inf.good(); i++)
          {
            inf.getline(pline, LEN);
            str = pline;

            int status = misc::parse_parts(str, sgrp, skey, sval, sind);
            if (status == -1)
            {
              cout << "error reading parameter file " << pname << " in line " << i << endl;
              MPI_Abort(MPI_COMM_WORLD, 1);
            }
            else if (status == 0)
              continue;

            if (sgrp == "ABE")
            {
              if (skey == "output dir")
                out_dir = sval;
            }
          }
          inf.close();

          parameters::str_par.insert(map<string, string>::value_type("output dir", out_dir));
        }

        char filename[100];
        if (tag)
          sprintf(filename, "%s/%s_Lev%02d-%02d_%02d_%s_%05d.bin", out_dir.c_str(), tag, BP->lev, Bi, myrank, VP->name, ncount);
        else
          sprintf(filename, "%s/Lev%02d-%02d_%02d_%s_%05d.bin", out_dir.c_str(), BP->lev, Bi, myrank, VP->name, ncount);
        writefile(time, BP->shape[0], BP->shape[1], BP->shape[2], BP->bbox[0], BP->bbox[3], BP->bbox[1], BP->bbox[4],
                  BP->bbox[2], BP->bbox[5], filename, BP->fgfs[VP->sgfn]);
        cout << "end of dump " << VP->name << " at time " << time << ", on node " << myrank << endl;
      }
      Bp = Bp->next;
      Bi++;
    }
    DumpList = DumpList->next;
  }
}
// Now we dump the data including buffer points
void Parallel::Dump_Data(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT, int grd)
{
  int myrank;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  //   round at 4 and 5
  int ncount = int(time / dT + 0.5);

  MPI_Status sta;
  int DIM = 3;
  double llb[3], uub[3];
  double DX, DY, DZ;

  double *databuffer = 0;
  if (myrank == 0)
  {
    databuffer = (double *)malloc(sizeof(double) * PP->shape[0] * PP->shape[1] * PP->shape[2]);
    if (!databuffer)
    {
      cout << "Parallel::Dump_Data: out of memory when dumping data." << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }

  while (DumpList)
  {
    var *VP = DumpList->data;

    MyList<Block> *Bp = PP->blb;
    while (Bp)
    {
      Block *BP = Bp->data;
      if (BP->rank == 0 && myrank == 0)
      {
        DX = BP->getdX(0);
        DY = BP->getdX(1);
        DZ = BP->getdX(2);
        llb[0] = (feq(BP->bbox[0], PP->bbox[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX;
        llb[1] = (feq(BP->bbox[1], PP->bbox[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY;
        llb[2] = (feq(BP->bbox[2], PP->bbox[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ;
        uub[0] = (feq(BP->bbox[3], PP->bbox[3], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX;
        uub[1] = (feq(BP->bbox[4], PP->bbox[4], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY;
        uub[2] = (feq(BP->bbox[5], PP->bbox[5], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ;
        f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, BP->fgfs[VP->sgfn], llb, uub);
      }
      else
      {
        int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]);
        if (myrank == 0)
        {
          double *bufferhere = (double *)malloc(sizeof(double) * nnn);
          if (!bufferhere)
          {
            cout << "on node#" << myrank << ", out of memory when dumping data." << endl;
            MPI_Abort(MPI_COMM_WORLD, 1);
          }
          MPI_Recv(bufferhere, nnn, MPI_DOUBLE, BP->rank, 0, MPI_COMM_WORLD, &sta);
          DX = BP->getdX(0);
          DY = BP->getdX(1);
          DZ = BP->getdX(2);
          llb[0] = (feq(BP->bbox[0], PP->bbox[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX;
          llb[1] = (feq(BP->bbox[1], PP->bbox[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY;
          llb[2] = (feq(BP->bbox[2], PP->bbox[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ;
          uub[0] = (feq(BP->bbox[3], PP->bbox[3], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX;
          uub[1] = (feq(BP->bbox[4], PP->bbox[4], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY;
          uub[2] = (feq(BP->bbox[5], PP->bbox[5], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ;
          f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, bufferhere, llb, uub);
          free(bufferhere);
        }
        else if (myrank == BP->rank)
        {
          MPI_Send(BP->fgfs[VP->sgfn], nnn, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
        }
      }
      if (Bp == PP->ble)
        break;
      Bp = Bp->next;
    }
    if (myrank == 0)
    {

      string out_dir;
      map<string, string>::iterator iter;
      iter = parameters::str_par.find("output dir");
      if (iter != parameters::str_par.end())
      {
        out_dir = iter->second;
      }
      else
      {
        // read parameter from file
        const int LEN = 256;
        char pline[LEN];
        string str, sgrp, skey, sval;
        int sind;
        char pname[50];
        {
          map<string, string>::iterator iter = parameters::str_par.find("inputpar");
          if (iter != parameters::str_par.end())
          {
            strcpy(pname, (iter->second).c_str());
          }
          else
          {
            cout << "Error inputpar" << endl;
            exit(0);
          }
        }
        ifstream inf(pname, ifstream::in);
        if (!inf.good())
        {
          cout << "Can not open parameter file " << pname << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }

        for (int i = 1; inf.good(); i++)
        {
          inf.getline(pline, LEN);
          str = pline;

          int status = misc::parse_parts(str, sgrp, skey, sval, sind);
          if (status == -1)
          {
            cout << "error reading parameter file " << pname << " in line " << i << endl;
            MPI_Abort(MPI_COMM_WORLD, 1);
          }
          else if (status == 0)
            continue;

          if (sgrp == "ABE")
          {
            if (skey == "output dir")
              out_dir = sval;
          }
        }
        inf.close();

        parameters::str_par.insert(map<string, string>::value_type("output dir", out_dir));
      }

      char filename[100];
      if (tag)
        sprintf(filename, "%s/%s_Lev%02d-%02d_%s_%05d.bin", out_dir.c_str(), tag, PP->lev, grd, VP->name, ncount);
      else
        sprintf(filename, "%s/Lev%02d-%02d_%s_%05d.bin", out_dir.c_str(), PP->lev, grd, VP->name, ncount);

      writefile(time, PP->shape[0], PP->shape[1], PP->shape[2], PP->bbox[0], PP->bbox[3], PP->bbox[1], PP->bbox[4],
                PP->bbox[2], PP->bbox[5], filename, databuffer);
    }
    DumpList = DumpList->next;
  }

  if (myrank == 0)
    free(databuffer);
}
void Parallel::Dump_Data(MyList<Patch> *PL, MyList<var> *DumpList, char *tag, double time, double dT)
{
  MyList<Patch> *Pp;
  Pp = PL;
  int grd = 0;
  while (Pp)
  {
    Patch *PP = Pp->data;
    Dump_Data(PP, DumpList, tag, time, dT, grd);
    grd++;
    Pp = Pp->next;
  }
}
// collect the data including buffer points
double *Parallel::Collect_Data(Patch *PP, var *VP)
{
  int myrank;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

  MPI_Status sta;
  int DIM = 3;
  double llb[3], uub[3];
  double DX, DY, DZ;

  double *databuffer = 0;
  if (myrank == 0)
  {
    databuffer = (double *)malloc(sizeof(double) * PP->shape[0] * PP->shape[1] * PP->shape[2]);
    if (!databuffer)
    {
      cout << "Parallel::Collect_Data: out of memory when dumping data." << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }

  MyList<Block> *Bp = PP->blb;
  while (Bp)
  {
    Block *BP = Bp->data;
    if (BP->rank == 0 && myrank == 0)
    {
      DX = BP->getdX(0);
      DY = BP->getdX(1);
      DZ = BP->getdX(2);
      llb[0] = (feq(BP->bbox[0], PP->bbox[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX;
      llb[1] = (feq(BP->bbox[1], PP->bbox[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY;
      llb[2] = (feq(BP->bbox[2], PP->bbox[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ;
      uub[0] = (feq(BP->bbox[3], PP->bbox[3], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX;
      uub[1] = (feq(BP->bbox[4], PP->bbox[4], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY;
      uub[2] = (feq(BP->bbox[5], PP->bbox[5], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ;
      f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, BP->fgfs[VP->sgfn], llb, uub);
    }
    else
    {
      int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]);
      if (myrank == 0)
      {
        double *bufferhere = (double *)malloc(sizeof(double) * nnn);
        if (!bufferhere)
        {
          cout << "on node#" << myrank << ", out of memory when dumping data." << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        MPI_Recv(bufferhere, nnn, MPI_DOUBLE, BP->rank, 0, MPI_COMM_WORLD, &sta);
        DX = BP->getdX(0);
        DY = BP->getdX(1);
        DZ = BP->getdX(2);
        llb[0] = (feq(BP->bbox[0], PP->bbox[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX;
        llb[1] = (feq(BP->bbox[1], PP->bbox[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY;
        llb[2] = (feq(BP->bbox[2], PP->bbox[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ;
        uub[0] = (feq(BP->bbox[3], PP->bbox[3], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX;
        uub[1] = (feq(BP->bbox[4], PP->bbox[4], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY;
        uub[2] = (feq(BP->bbox[5], PP->bbox[5], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ;
        f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, bufferhere, llb, uub);
        free(bufferhere);
      }
      else if (myrank == BP->rank)
      {
        MPI_Send(BP->fgfs[VP->sgfn], nnn, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
      }
    }
    if (Bp == PP->ble)
      break;
    Bp = Bp->next;
  }

  return databuffer;
}
// Now we dump the data including buffer points
// dump z = 0 plane
void Parallel::d2Dump_Data(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT, int grd)
{
  int myrank;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  //   round at 4 and 5
  int ncount = int(time / dT + 0.5);

  MPI_Status sta;
  int DIM = 3;
  double llb[3], uub[3];
  double DX, DY, DZ;

  double *databuffer = 0, *databuffer2 = 0;
  if (myrank == 0)
  {
    databuffer = (double *)malloc(sizeof(double) * PP->shape[0] * PP->shape[1] * PP->shape[2]);
    databuffer2 = (double *)malloc(sizeof(double) * PP->shape[0] * PP->shape[1]);
    if (!databuffer || !databuffer2)
    {
      cout << "Parallel::d2Dump_Data: out of memory when dumping data." << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }

  while (DumpList)
  {
    var *VP = DumpList->data;

    MyList<Block> *Bp = PP->blb;
    while (Bp)
    {
      Block *BP = Bp->data;
      if (BP->rank == 0 && myrank == 0)
      {
        DX = BP->getdX(0);
        DY = BP->getdX(1);
        DZ = BP->getdX(2);
        llb[0] = (feq(BP->bbox[0], PP->bbox[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX;
        llb[1] = (feq(BP->bbox[1], PP->bbox[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY;
        llb[2] = (feq(BP->bbox[2], PP->bbox[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ;
        uub[0] = (feq(BP->bbox[3], PP->bbox[3], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX;
        uub[1] = (feq(BP->bbox[4], PP->bbox[4], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY;
        uub[2] = (feq(BP->bbox[5], PP->bbox[5], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ;
        f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, BP->fgfs[VP->sgfn], llb, uub);
      }
      else
      {
        int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]);
        if (myrank == 0)
        {
          double *bufferhere = (double *)malloc(sizeof(double) * nnn);
          if (!bufferhere)
          {
            cout << "on node#" << myrank << ", out of memory when dumping data." << endl;
            MPI_Abort(MPI_COMM_WORLD, 1);
          }
          MPI_Recv(bufferhere, nnn, MPI_DOUBLE, BP->rank, 0, MPI_COMM_WORLD, &sta);
          DX = BP->getdX(0);
          DY = BP->getdX(1);
          DZ = BP->getdX(2);
          llb[0] = (feq(BP->bbox[0], PP->bbox[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX;
          llb[1] = (feq(BP->bbox[1], PP->bbox[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY;
          llb[2] = (feq(BP->bbox[2], PP->bbox[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ;
          uub[0] = (feq(BP->bbox[3], PP->bbox[3], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX;
          uub[1] = (feq(BP->bbox[4], PP->bbox[4], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY;
          uub[2] = (feq(BP->bbox[5], PP->bbox[5], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ;
          f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, bufferhere, llb, uub);
          free(bufferhere);
        }
        else if (myrank == BP->rank)
        {
          MPI_Send(BP->fgfs[VP->sgfn], nnn, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
        }
      }
      if (Bp == PP->ble)
        break;
      Bp = Bp->next;
    }
    if (myrank == 0)
    {

      string out_dir;
      map<string, string>::iterator iter;
      iter = parameters::str_par.find("output dir");
      if (iter != parameters::str_par.end())
      {
        out_dir = iter->second;
      }
      else
      {
        // read parameter from file
        const int LEN = 256;
        char pline[LEN];
        string str, sgrp, skey, sval;
        int sind;
        char pname[50];
        {
          map<string, string>::iterator iter = parameters::str_par.find("inputpar");
          if (iter != parameters::str_par.end())
          {
            strcpy(pname, (iter->second).c_str());
          }
          else
          {
            cout << "Error inputpar" << endl;
            exit(0);
          }
        }
        ifstream inf(pname, ifstream::in);
        if (!inf.good())
        {
          cout << "Can not open parameter file " << pname << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }

        for (int i = 1; inf.good(); i++)
        {
          inf.getline(pline, LEN);
          str = pline;

          int status = misc::parse_parts(str, sgrp, skey, sval, sind);
          if (status == -1)
          {
            cout << "error reading parameter file " << pname << " in line " << i << endl;
            MPI_Abort(MPI_COMM_WORLD, 1);
          }
          else if (status == 0)
            continue;

          if (sgrp == "ABE")
          {
            if (skey == "output dir")
              out_dir = sval;
          }
        }
        inf.close();

        parameters::str_par.insert(map<string, string>::value_type("output dir", out_dir));
      }

      char filename[100];
      if (tag)
        sprintf(filename, "%s/%s_2d_Lev%02d-%02d_%s_%05d.dat", out_dir.c_str(), tag, PP->lev, grd, VP->name, ncount);
      else
        sprintf(filename, "%s/2d_Lev%02d-%02d_%s_%05d.dat", out_dir.c_str(), PP->lev, grd, VP->name, ncount);

      int gord = ghost_width;
      f_d2dump(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, databuffer2, gord, VP->SoA);
      writefile(time, PP->shape[0], PP->shape[1], PP->bbox[0], PP->bbox[3], PP->bbox[1], PP->bbox[4],
                filename, databuffer2);
    }
    DumpList = DumpList->next;
  }

  if (myrank == 0)
  {
    free(databuffer);
    free(databuffer2);
  }
}
void Parallel::d2Dump_Data(MyList<Patch> *PL, MyList<var> *DumpList, char *tag, double time, double dT)
{
  MyList<Patch> *Pp;
  Pp = PL;
  int grd = 0;
  while (Pp)
  {
    Patch *PP = Pp->data;
    d2Dump_Data(PP, DumpList, tag, time, dT, grd);
    grd++;
    Pp = Pp->next;
  }
}
// Now we dump the data including buffer points and ghost points of the given patch
void Parallel::Dump_Data0(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT)
{
  int myrank;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  //   round at 4 and 5
  int ncount = int(time / dT + 0.5);

  MPI_Status sta;
  int DIM = 3;
  double llb[3], uub[3], tllb[3], tuub[3];
  int tshape[3];
  double DX, DY, DZ;

  for (int i = 0; i < 3; i++)
  {
    double DX = PP->blb->data->getdX(i);
    tshape[i] = PP->shape[i] + 2 * ghost_width;
    tllb[i] = PP->bbox[i] - ghost_width * DX;
    tuub[i] = PP->bbox[i + dim] + ghost_width * DX;
  }

  int NN = tshape[0] * tshape[1] * tshape[2];
  double *databuffer = 0;
  if (myrank == 0)
  {
    databuffer = (double *)malloc(sizeof(double) * NN);
    if (!databuffer)
    {
      cout << "on node# " << myrank << ", out of memory when dumping data." << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }

  while (DumpList)
  {
    var *VP = DumpList->data;
    MyList<Block> *Bp = PP->blb;
    while (Bp)
    {
      Block *BP = Bp->data;
      if (BP->rank == 0 && myrank == 0)
      {
        DX = BP->getdX(0);
        DY = BP->getdX(1);
        DZ = BP->getdX(2);
        llb[0] = (feq(BP->bbox[0], tllb[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX;
        llb[1] = (feq(BP->bbox[1], tllb[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY;
        llb[2] = (feq(BP->bbox[2], tllb[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ;
        uub[0] = (feq(BP->bbox[3], tuub[0], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX;
        uub[1] = (feq(BP->bbox[4], tuub[1], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY;
        uub[2] = (feq(BP->bbox[5], tuub[2], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ;
        f_copy(DIM, tllb, tuub, tshape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, BP->fgfs[VP->sgfn], llb, uub);
      }
      else
      {
        if (myrank == 0)
        {
          int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]);
          double *bufferhere = (double *)malloc(sizeof(double) * nnn);
          if (!bufferhere)
          {
            cout << "on node#" << myrank << ", out of memory when dumping data." << endl;
            MPI_Abort(MPI_COMM_WORLD, 1);
          }
          MPI_Recv(bufferhere, nnn, MPI_DOUBLE, BP->rank, 0, MPI_COMM_WORLD, &sta);
          DX = BP->getdX(0);
          DY = BP->getdX(1);
          DZ = BP->getdX(2);
          llb[0] = (feq(BP->bbox[0], tllb[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX;
          llb[1] = (feq(BP->bbox[1], tllb[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY;
          llb[2] = (feq(BP->bbox[2], tllb[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ;
          uub[0] = (feq(BP->bbox[3], tuub[0], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX;
          uub[1] = (feq(BP->bbox[4], tuub[1], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY;
          uub[2] = (feq(BP->bbox[5], tuub[2], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ;
          f_copy(DIM, tllb, tuub, tshape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, bufferhere, llb, uub);
          free(bufferhere);
        }
        else if (myrank == BP->rank)
        {
          int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]);
          MPI_Send(BP->fgfs[VP->sgfn], nnn, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
        }
      }
      if (Bp == PP->ble)
        break;
      Bp = Bp->next;
    }
    if (myrank == 0)
    {

      string out_dir;
      map<string, string>::iterator iter;
      iter = parameters::str_par.find("output dir");
      if (iter != parameters::str_par.end())
      {
        out_dir = iter->second;
      }
      else
      {
        // read parameter from file
        const int LEN = 256;
        char pline[LEN];
        string str, sgrp, skey, sval;
        int sind;
        char pname[50];
        {
          map<string, string>::iterator iter = parameters::str_par.find("inputpar");
          if (iter != parameters::str_par.end())
          {
            strcpy(pname, (iter->second).c_str());
          }
          else
          {
            cout << "Error inputpar" << endl;
            exit(0);
          }
        }
        ifstream inf(pname, ifstream::in);
        if (!inf.good())
        {
          cout << "Can not open parameter file " << pname << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }

        for (int i = 1; inf.good(); i++)
        {
          inf.getline(pline, LEN);
          str = pline;

          int status = misc::parse_parts(str, sgrp, skey, sval, sind);
          if (status == -1)
          {
            cout << "error reading parameter file " << pname << " in line " << i << endl;
            MPI_Abort(MPI_COMM_WORLD, 1);
          }
          else if (status == 0)
            continue;

          if (sgrp == "ABE")
          {
            if (skey == "output dir")
              out_dir = sval;
          }
        }
        inf.close();

        parameters::str_par.insert(map<string, string>::value_type("output dir", out_dir));
      }

      char filename[100];
      if (tag)
        sprintf(filename, "%s/%s_Lev%02d_%s_%05d.bin", out_dir.c_str(), tag, PP->lev, VP->name, ncount);
      else
        sprintf(filename, "%s/Lev%02d_%s_%05d.bin", out_dir.c_str(), PP->lev, VP->name, ncount);

      writefile(time, tshape[0], tshape[1], tshape[2], tllb[0], tuub[0], tllb[1], tuub[2],
                tllb[2], tuub[2], filename, databuffer);
    }
    DumpList = DumpList->next;
  }

  if (myrank == 0)
    free(databuffer);
}
// Map point is much easier than maping data itself
// But the main problem is about the points near the boundary
// worst case is -ghost -ghost+1 .... 0 * ......
double Parallel::global_interp(int DIM, int *ext, double **CoX, double *datain,
                               double *poXb, int ordn, double *SoA, int Symmetry)
{
  if (DIM != 3)
  {
    cout << "Parallel::global_interp does not suport DIM = " << DIM << " for Symmetry." << endl;
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
  double resu;
  double poX[3];
  double asgn = 1;

  for (int i = 0; i < 3; i++)
    poX[i] = poXb[i];

  switch (Symmetry)
  {
  case 2:
    for (int i = 0; i < 3; i++)
      if (poX[i] < 0)
      {
        poX[i] = -poX[i];
        asgn = asgn * SoA[i];
      }
    break;
  case 1:
    if (poX[2] < 0)
    {
      poX[2] = -poX[2];
      asgn = asgn * SoA[2];
    }
  }

  int extb[3];

  for (int i = 0; i < 3; i++)
    extb[i] = ext[i];

  switch (Symmetry)
  {
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
  case 2:
    if (poX[0] < (ghost_width - 1) * (CoX[0][1] - CoX[0][0]))
      extb[0] = extb[0] + ghost_width - 1;
    if (poX[1] < (ghost_width - 1) * (CoX[1][1] - CoX[1][0]))
      extb[1] = extb[1] + ghost_width - 1;
  case 1:
    if (poX[2] < (ghost_width - 1) * (CoX[2][1] - CoX[2][0]))
      extb[2] = extb[2] + ghost_width - 1;
#else
#ifdef Cell
  case 2:
    if (poX[0] < (ghost_width - 0.5) * (CoX[0][1] - CoX[0][0]))
      extb[0] = extb[0] + ghost_width;
    if (poX[1] < (ghost_width - 0.5) * (CoX[1][1] - CoX[1][0]))
      extb[1] = extb[1] + ghost_width;
  case 1:
    if (poX[2] < (ghost_width - 0.5) * (CoX[2][1] - CoX[2][0]))
      extb[2] = extb[2] + ghost_width;
#else
#error Not define Vertex nor Cell
#endif
#endif
  }

  if (extb[0] > ext[0] || extb[1] > ext[1] || extb[2] > ext[2])
  {
    double *CoXb[3];
    int Nb = extb[0] * extb[1] * extb[2];
    double *datab;
    datab = new double[Nb];
    for (int i = 0; i < 3; i++)
    {
      CoXb[i] = new double[extb[i]];
      double DH = CoX[i][1] - CoX[i][0];
      if (extb[i] > ext[i])
      {
        if (CoX[i][0] > DH)
        {
          cout << "lower boundary[" << i << "] = " << CoX[i][0] << ", but SYmmetry = " << Symmetry << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
        for (int j = 0; j < ghost_width - 1; j++)
          CoXb[i][j] = -CoX[i][ghost_width - 1 - j];
        for (int j = ghost_width - 1; j < extb[i]; j++)
          CoXb[i][j] = CoX[i][j - ghost_width + 1];
#else
#ifdef Cell
        for (int j = 0; j < ghost_width; j++)
          CoXb[i][j] = -CoX[i][ghost_width - 1 - j];
        for (int j = ghost_width; j < extb[i]; j++)
          CoXb[i][j] = CoX[i][j - ghost_width];
#else
#error Not define Vertex nor Cell
#endif
#endif
      }
      else
      {
        for (int j = 0; j < extb[i]; j++)
          CoXb[i][j] = CoX[i][j];
      }
    }

    for (int i = 0; i < Nb; i++)
    {
      int ind[3], indb[3];
      getarrayindex(3, extb, indb, i);
      double sgn = 1;
      for (int j = 0; j < 3; j++)
      {
        if (extb[j] > ext[j])
        {
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
          if (indb[j] < ghost_width - 1)
          {
            ind[j] = ghost_width - 1 - indb[j];
            sgn = sgn * SoA[j];
          }
          else
          {
            ind[j] = 1 + indb[j] - ghost_width;
          }
#else
#ifdef Cell
          if (indb[j] < ghost_width)
          {
            ind[j] = ghost_width - 1 - indb[j];
            sgn = sgn * SoA[j];
          }
          else
          {
            ind[j] = indb[j] - ghost_width;
          }
#else
#error Not define Vertex nor Cell
#endif
#endif
        }
        else
          ind[j] = indb[j];
      }
      int lon = getarraylocation(3, ext, ind);
      datab[i] = datain[lon] * sgn;
    }

    resu = global_interp(DIM, extb, CoXb, datab, poX, ordn);

    for (int i = 0; i < 3; i++)
      delete[] CoXb[i];
    delete[] datab;
  }
  else
  {
    resu = global_interp(DIM, ext, CoX, datain, poX, ordn);
  }

  return resu * asgn;
}
double Parallel::global_interp(int DIM, int *ext, double **CoX, double *datain,
                               double *poX, int ordn)
{
  if (ordn > 2 * ghost_width)
  {
    cout << "Parallel::global_interp can not handle ordn = " << ordn << " > 2*ghost_width = " << 2 * ghost_width << endl;
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  double *bbox, *datainbbox;
  bbox = new double[2 * DIM];
  datainbbox = new double[2 * DIM];

  int *NN, *ind, *shape;
  NN = new int[DIM];
  ind = new int[DIM];
  shape = new int[DIM];

  for (int i = 0; i < DIM; i++)
  {
    ind[i] = int((poX[i] - CoX[i][0]) / (CoX[i][1] - CoX[i][0])) - ordn / 2 + 1;
    // poX may exactly locate on the boundary (exclude ghost)
    if (ind[i] == -1 && feq(poX[i], CoX[i][0], (CoX[i][1] - CoX[i][0]) / 2))
      ind[i] = 0;
    /*
         if(ind[i] < 0)
         {
           cout<<"Parallel::global_interp error ind["<<i<<"] = "<<ind[i]<<endl;
           cout<<"pox = "<<poX[i]<<", CoX[0] = "<<CoX[i][0]<<endl;
           MPI_Abort(MPI_COMM_WORLD,1);
         }
    */
    if (ind[i] == ext[i] - ordn + 1 && feq(poX[i], CoX[i][ext[i] - ordn / 2], (CoX[i][1] - CoX[i][0]) / 2))
      ind[i] = ext[i] - ordn - 1;
    /*
         if(ind[i]+ordn-1 > ext[i]-1)
         {
           cout<<"Parallel::global_interp error ind["<<i<<"] = "<<ind[i]<<" + ordn ("<<ordn<<") > ext = "<<ext[i]<<endl;
           cout<<"pox = "<<poX[i]<<", CoX[ind] = "<<CoX[i][ind[i]]<<", CoX = ("<<CoX[i][0]<<","<<CoX[i][ext[i]-1]<<")"<<endl;
           MPI_Abort(MPI_COMM_WORLD,1);
         }
    */
    bbox[i] = CoX[i][ind[i]];
    bbox[DIM + i] = CoX[i][ind[i] + ordn - 1];
    datainbbox[i] = CoX[i][0];
    datainbbox[DIM + i] = CoX[i][ext[i] - 1];
    shape[i] = ordn;
  }

  NN[DIM - 1] = ordn;
  for (int i = DIM - 2; i >= 0; i--)
    NN[i] = NN[i + 1] * ordn;

  double *xpts, *funcvals;
  xpts = new double[ordn];
  funcvals = new double[ordn];
  double *DDd, *DDd1, rr;

  DDd = new double[NN[0]];

  copy(DIM, bbox, bbox + DIM, shape, DDd, datainbbox, datainbbox + DIM, ext, datain, bbox, bbox + DIM);

  for (int i = 0; i < DIM; i++)
  {
    for (int j = ind[i]; j < ind[i] + ordn; j++)
    {
      xpts[j - ind[i]] = CoX[i][j];
    }

    if (i < DIM - 1)
    {
      DDd1 = new double[NN[i + 1]];
      for (int j = 0; j < NN[i + 1]; j++)
      {
        for (int k = 0; k < ordn; k++)
          funcvals[k] = DDd[k + j * ordn];
        DDd1[j] = Lagrangian_Int(poX[i], ordn, xpts, funcvals);
      }
      delete[] DDd;
      DDd = DDd1;
    }
    else
    {
      for (int j = 0; j < ordn; j++)
        funcvals[j] = DDd[j];
      rr = Lagrangian_Int(poX[i], ordn, xpts, funcvals);
      delete[] DDd1; // since DDd and DDd1 now point to the same stuff, we need delete after above int
    }
  }

  delete[] NN;
  delete[] ind;
  delete[] xpts;
  delete[] funcvals;
  delete[] bbox;
  delete[] datainbbox;
  delete[] shape;

  return rr;
}
double Parallel::Lagrangian_Int(double x, int npts, double *xpts, double *funcvals)
{
  double sum = 0;
  for (int i = 0; i < npts; i++)
  {
    sum = sum + funcvals[i] * LagrangePoly(x, i, npts, xpts);
  }
  return sum;
}
double Parallel::LagrangePoly(double x, int pt, int npts, double *xpts)
{
  double h = 1;
  int i;

  for (i = 0; i < pt; i++)
    h = h * (x - xpts[i]) / (xpts[pt] - xpts[i]);

  for (i = pt + 1; i < npts; i++)
    h = h * (x - xpts[i]) / (xpts[pt] - xpts[i]);

  return h;
}
// collect all grid segments or blocks including ghost and buffer for given patch
MyList<Parallel::gridseg> *Parallel::build_complete_gsl(Patch *Pat)
{
  MyList<Parallel::gridseg> *cgsl = 0, *gs;
  MyList<Block> *BP = Pat->blb;
  while (BP)
  {
    if (!cgsl)
    {
      cgsl = gs = new MyList<Parallel::gridseg>; // delete through destroyList();
      gs->data = new Parallel::gridseg;
    }
    else
    {
      gs->next = new MyList<Parallel::gridseg>;
      gs = gs->next;
      gs->data = new Parallel::gridseg;
    }

    for (int i = 0; i < dim; i++)
    {
      gs->data->llb[i] = BP->data->bbox[i];
      gs->data->uub[i] = BP->data->bbox[dim + i];
      gs->data->shape[i] = BP->data->shape[i];
    }
    gs->data->Bg = BP->data;
    gs->next = 0;

    if (BP == Pat->ble)
      break;
    BP = BP->next;
  }

  return cgsl;
}
// collect all grid segments or blocks including ghost and buffer for given patch list
MyList<Parallel::gridseg> *Parallel::build_complete_gsl(MyList<Patch> *PatL)
{
  MyList<Parallel::gridseg> *cgsl = 0, *gs;
  while (PatL)
  {
    if (!cgsl)
    {
      cgsl = build_complete_gsl(PatL->data);
      gs = cgsl;
      while (gs->next)
        gs = gs->next;
    }
    else
    {
      gs->next = build_complete_gsl(PatL->data);
      gs = gs->next;
      while (gs->next)
        gs = gs->next;
    }
    PatL = PatL->next;
  }

  return cgsl;
}
// cellect the information of Patch list
MyList<Parallel::gridseg> *Parallel::build_complete_gsl_virtual(MyList<Patch> *PatL)
{
  MyList<Parallel::gridseg> *cgsl = 0, *gs;
  while (PatL)
  {
    if (cgsl)
    {
      gs->next = new MyList<Parallel::gridseg>;
      gs = gs->next;
      gs->data = new Parallel::gridseg;
    }
    else
    {
      cgsl = gs = new MyList<Parallel::gridseg>;
      gs->data = new Parallel::gridseg;
    }

    for (int i = 0; i < dim; i++)
    {
      gs->data->llb[i] = PatL->data->bbox[i];
      gs->data->uub[i] = PatL->data->bbox[dim + i];
      gs->data->shape[i] = PatL->data->shape[i];
    }
    gs->data->Bg = 0;
    gs->next = 0;

    PatL = PatL->next;
  }

  return cgsl;
}
// cellect the information of Patch list without buffer points
MyList<Parallel::gridseg> *Parallel::build_complete_gsl_virtual2(MyList<Patch> *PatL) // - buffer
{
  MyList<Parallel::gridseg> *cgsl = 0, *gs;
  while (PatL)
  {
    if (cgsl)
    {
      gs->next = new MyList<Parallel::gridseg>;
      gs = gs->next;
      gs->data = new Parallel::gridseg;
    }
    else
    {
      cgsl = gs = new MyList<Parallel::gridseg>;
      gs->data = new Parallel::gridseg;
    }

    for (int i = 0; i < dim; i++)
    {
      double DH = PatL->data->getdX(i);
      gs->data->llb[i] = PatL->data->bbox[i] + PatL->data->lli[i] * DH;
      gs->data->uub[i] = PatL->data->bbox[dim + i] - PatL->data->uui[i] * DH;
      gs->data->shape[i] = PatL->data->shape[i] - PatL->data->lli[i] - PatL->data->uui[i];
    }
    gs->data->Bg = 0;
    gs->next = 0;

    PatL = PatL->next;
  }

  return cgsl;
}
// collect all grid segments or blocks without ghost for given patch, without extension
MyList<Parallel::gridseg> *Parallel::build_bulk_gsl(Patch *Pat)
{
  MyList<Parallel::gridseg> *cgsl = 0, *gs;
  MyList<Block> *BP = Pat->blb;
  while (BP)
  {
    Block *bp = BP->data;
    if (!cgsl)
    {
      cgsl = gs = new MyList<Parallel::gridseg>;
      gs->data = new Parallel::gridseg;
    }
    else
    {
      gs->next = new MyList<Parallel::gridseg>;
      gs = gs->next;
      gs->data = new Parallel::gridseg;
    }

    for (int i = 0; i < dim; i++)
    {
      double DH = bp->getdX(i);
      gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] : bp->bbox[dim + i] - ghost_width * DH;
      gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] : bp->bbox[i] + ghost_width * DH;
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
      gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
#else
#ifdef Cell
      gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
#else
#error Not define Vertex nor Cell
#endif
#endif
    }
    gs->data->Bg = BP->data;
    gs->next = 0;

    if (BP == Pat->ble)
      break;
    BP = BP->next;
  }

  return cgsl;
}
// bulk part for given Block within given patch, without extension
MyList<Parallel::gridseg> *Parallel::build_bulk_gsl(Block *bp, Patch *Pat)
{
  MyList<Parallel::gridseg> *gs = 0;

  gs = new MyList<Parallel::gridseg>;
  gs->data = new Parallel::gridseg;

  for (int i = 0; i < dim; i++)
  {
    double DH = bp->getdX(i);
    gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] : bp->bbox[dim + i] - ghost_width * DH;
    gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] : bp->bbox[i] + ghost_width * DH;
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
    gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
#else
#ifdef Cell
    gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
#else
#error Not define Vertex nor Cell
#endif
#endif
  }
  gs->data->Bg = bp;
  gs->next = 0;

  return gs;
}
MyList<Parallel::gridseg> *Parallel::clone_gsl(MyList<Parallel::gridseg> *p, bool first_only)
{
  MyList<Parallel::gridseg> *np = 0, *q = 0, *pq = 0;

  while (p)
  {
    q = new MyList<Parallel::gridseg>;
    q->data = new Parallel::gridseg;
    q->data->Bg = p->data->Bg;
    for (int i = 0; i < dim; i++)
    {
      q->data->llb[i] = p->data->llb[i];
      q->data->uub[i] = p->data->uub[i];
      q->data->shape[i] = p->data->shape[i];
    }
    if (pq)
      pq->next = q;
    else
      np = q;
    if (first_only)
    {
      np->next = 0;
      return np;
    }
    pq = q;
    p = p->next;
  }
  return np;
}
MyList<Parallel::gridseg> *Parallel::gs_subtract(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B)
{
  if (!A)
    return 0;
  if (!B)
    return clone_gsl(A, true);

  double cut_plane[2 * dim], DH[dim];

  for (int i = 0; i < dim; i++)
  {
    DH[i] = A->data->Bg->getdX(i);
    if (B->data->Bg && !feq(DH[i], B->data->Bg->getdX(i), DH[i] / 2))
    {
      cout << "Parallel::gs_subtract meets different grid segment " << DH[i] << " vs " << B->data->Bg->getdX(i) << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }

  MyList<Parallel::gridseg> *C = 0, *q;
  for (int i = 0; i < dim; i++)
  {
    if (B->data->llb[i] > A->data->uub[i] || B->data->uub[i] < A->data->llb[i])
      return clone_gsl(A, true);
    cut_plane[i] = A->data->llb[i];
    cut_plane[i + dim] = A->data->uub[i];
  }

  for (int i = 0; i < dim; i++)
  {
    cut_plane[i] = Mymax(A->data->llb[i], B->data->llb[i]);
    if (cut_plane[i] - A->data->llb[i] > DH[i] / 2)
    {
      q = clone_gsl(A, true);
      // prolong the list from head
      if (C)
        q->next = C;
      C = q;
      for (int j = 0; j < dim; j++)
      {
        if (i == j)
        {
          C->data->llb[i] = A->data->llb[i];
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
          C->data->uub[i] = Mymax(C->data->llb[i], cut_plane[i] - DH[i]);
#else
#ifdef Cell
          C->data->uub[i] = Mymax(C->data->llb[i], cut_plane[i]);
#else
#error Not define Vertex nor Cell
#endif
#endif
        }
        else
        {
          C->data->llb[j] = cut_plane[j];
          C->data->uub[j] = cut_plane[j + dim];
        }
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4) + 1;
#else
#ifdef Cell
        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4);
#else
#error Not define Vertex nor Cell
#endif
#endif
      }
    }

    cut_plane[i + dim] = Mymin(A->data->uub[i], B->data->uub[i]);
    if (A->data->uub[i] - cut_plane[i + dim] > DH[i] / 2)
    {
      q = clone_gsl(A, true);
      if (C)
        q->next = C;
      C = q;
      for (int j = 0; j < dim; j++)
      {
        if (i == j)
        {
          C->data->uub[i] = A->data->uub[i];
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
          C->data->llb[i] = Mymin(C->data->uub[i], cut_plane[i + dim] + DH[i]);
#else
#ifdef Cell
          C->data->llb[i] = Mymin(C->data->uub[i], cut_plane[i + dim]);
#else
#error Not define Vertex nor Cell
#endif
#endif
        }
        else
        {
          C->data->llb[j] = cut_plane[j];
          C->data->uub[j] = cut_plane[j + dim];
        }
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4) + 1;
#else
#ifdef Cell
        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4);
#else
#error Not define Vertex nor Cell
#endif
#endif
      }
    }
  }
  return C;
}
// stupid method
/*
MyList<Parallel::gridseg> *Parallel::gsl_subtract(MyList<Parallel::gridseg> *A,MyList<Parallel::gridseg> *B) //A subtract B but with A's information
{
// always make return and A, B distinct
  if(!A) return 0;

  if(!B) return clone_gsl(A,0);

  MyList<Parallel::gridseg> *C=0,*C0,*C1,*Cc,*CC0,*gs;

  while(A)
  {
     C0=gs_subtract(A,B);  // note C0 becomes a list after subtraction
     C1=B->next;
     while(C1)
     {
  CC0=C0;
  Cc=0;
  while(CC0)
  {
    gs=gs_subtract(CC0,C1);
    if(Cc) Cc->catList(gs);
    else   Cc=gs;
    CC0=CC0->next;
  }
  if(C0) C0->destroyList();
  C0=Cc;
  C1=C1->next;
     }
     if(C) C->catList(C0);
     else  C=C0;
     A=A->next;
  }

  return C;
}
*/
// more clever method
MyList<Parallel::gridseg> *Parallel::gsl_subtract(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B) // A subtract B but with A's information
{
  // always make return and A, B distinct
  if (!A)
    return 0;

  MyList<Parallel::gridseg> *C = 0, *C0, *C1;

  C = clone_gsl(A, 0);

  while (B)
  {
    C0 = 0;
    C1 = C;
    while (C1)
    {
      if (C0)
        C0->catList(gs_subtract(C1, B));
      else
        C0 = gs_subtract(C1, B);
      C1 = C1->next;
    }
    if (C)
      C->destroyList();
    else
    {
      if (C0)
        C0->destroyList();
      return 0;
    }

    C = C0;
    B = B->next;
  }

  return C;
}
MyList<Parallel::gridseg> *Parallel::gs_and(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B)
{
  if (!A || !B)
    return 0;

  double llb[dim], uub[dim];
  bool flag = false;
  for (int i = 0; i < dim; i++)
  {
    llb[i] = Mymax(A->data->llb[i], B->data->llb[i]);
    uub[i] = Mymin(A->data->uub[i], B->data->uub[i]);
    if (llb[i] > uub[i])
    {
      flag = true;
      break;
    }
  }
  if (flag)
    return 0;

  MyList<Parallel::gridseg> *C;
  C = clone_gsl(A, true);
  for (int i = 0; i < dim; i++)
  {
    C->data->llb[i] = llb[i];
    C->data->uub[i] = uub[i];
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
    C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / C->data->Bg->getdX(i) + 0.4) + 1;
#else
#ifdef Cell
    C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / C->data->Bg->getdX(i) + 0.4);
#else
#error Not define Vertex nor Cell
#endif
#endif
  }

  return C;
}
// overlap of A_i and (union of all j of B_j)
MyList<Parallel::gridseg> *Parallel::gsl_and(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B) // A and B but with A's information
{
  MyList<Parallel::gridseg> *C = 0, *C1;

  while (A)
  {
    C1 = B;
    while (C1)
    {
      if (C)
        C->catList(gs_and(A, C1));
      else
        C = gs_and(A, C1);
      C1 = C1->next;
    }
    A = A->next;
  }
  return C;
}
// collect all ghost grid segments or blocks for given patch
MyList<Parallel::gridseg> *Parallel::build_ghost_gsl(Patch *Pat)
{
  MyList<Parallel::gridseg> *cgsl = 0, *gs, *gsb;
  MyList<Block> *BP = Pat->blb;
  while (BP)
  {
    gs = new MyList<Parallel::gridseg>;
    gs->data = new Parallel::gridseg;

    for (int i = 0; i < dim; i++)
    {
      gs->data->llb[i] = BP->data->bbox[i];
      gs->data->uub[i] = BP->data->bbox[dim + i];
      gs->data->shape[i] = BP->data->shape[i];
    }
    gs->data->Bg = BP->data;
    gs->next = 0;

    gsb = build_bulk_gsl(BP->data, Pat);

    if (!cgsl)
      cgsl = gs_subtract(gs, gsb);
    else
      cgsl->catList(gs_subtract(gs, gsb));

    gsb->destroyList();
    gs->destroyList();

    if (BP == Pat->ble)
      break;
    BP = BP->next;
  }

  return cgsl;
}
// collect all ghost grid segments or blocks for given patch list
MyList<Parallel::gridseg> *Parallel::build_ghost_gsl(MyList<Patch> *PatL)
{
  MyList<Parallel::gridseg> *cgsl = 0, *gs;
  while (PatL)
  {
    if (!cgsl)
    {
      cgsl = build_ghost_gsl(PatL->data);
      gs = cgsl;
      while (gs->next)
        gs = gs->next;
    }
    else
    {
      gs->next = build_ghost_gsl(PatL->data);
      gs = gs->next;
      while (gs->next)
        gs = gs->next;
    }
    PatL = PatL->next;
  }

  return cgsl;
}
// collect all grid segments or blocks without ghost for given patch
// special for Sync usage, so we do not need consider missing points
MyList<Parallel::gridseg> *Parallel::build_owned_gsl0(Patch *Pat, int rank_in)
{
  MyList<Parallel::gridseg> *cgsl = 0, *gs;
  MyList<Block> *BP = Pat->blb;
  while (BP)
  {
    Block *bp = BP->data;
    if (bp->rank == rank_in)
    {
      if (!cgsl)
      {
        cgsl = gs = new MyList<Parallel::gridseg>;
        gs->data = new Parallel::gridseg;
      }
      else
      {
        gs->next = new MyList<Parallel::gridseg>;
        gs = gs->next;
        gs->data = new Parallel::gridseg;
      }

      for (int i = 0; i < dim; i++)
      {
        double DH = bp->getdX(i);
        gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] : bp->bbox[dim + i] - ghost_width * DH;
        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] : bp->bbox[i] + ghost_width * DH;
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
#else
#ifdef Cell
        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
#else
#error Not define Vertex nor Cell
#endif
#endif
      }
      gs->data->Bg = BP->data;
      gs->next = 0;
    }

    if (BP == Pat->ble)
      break;
    BP = BP->next;
  }

  return cgsl;
}
// collect all grid segments or blocks without ghost for given patch
MyList<Parallel::gridseg> *Parallel::build_owned_gsl1(Patch *Pat, int rank_in)
{
  MyList<Parallel::gridseg> *cgsl = 0, *gs;
  MyList<Block> *BP = Pat->blb;
  while (BP)
  {
    Block *bp = BP->data;
    if (bp->rank == rank_in)
    {
      if (!cgsl)
      {
        cgsl = gs = new MyList<Parallel::gridseg>;
        gs->data = new Parallel::gridseg;
      }
      else
      {
        gs->next = new MyList<Parallel::gridseg>;
        gs = gs->next;
        gs->data = new Parallel::gridseg;
      }

      for (int i = 0; i < dim; i++)
      {
        double DH = bp->getdX(i);
        gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] : bp->bbox[dim + i] - ghost_width * DH;
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
        // NOTE: our dividing structure is (exclude ghost)
        // -1 0
        //       1  2
        // so (0,1) does not belong to any part for vertex structure, we always put it to right part, this is consistent to
        // the fortran routine where we always take floor to get index
        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] : bp->bbox[i] + (ghost_width - 1) * DH;
        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
#else
#ifdef Cell
        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] : bp->bbox[i] + ghost_width * DH;
        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
#else
#error Not define Vertex nor Cell
#endif
#endif
      }
      gs->data->Bg = BP->data;
      gs->next = 0;
    }

    if (BP == Pat->ble)
      break;
    BP = BP->next;
  }

  return cgsl;
}
// collect all grid segments or blocks without ghost nor buffer for given patch
MyList<Parallel::gridseg> *Parallel::build_owned_gsl2(Patch *Pat, int rank_in)
{
  MyList<Parallel::gridseg> *cgsl = 0, *gs;
  MyList<Block> *BP = Pat->blb;
  while (BP)
  {
    Block *bp = BP->data;
    if (bp->rank == rank_in)
    {
      if (!cgsl)
      {
        cgsl = gs = new MyList<Parallel::gridseg>;
        gs->data = new Parallel::gridseg;
      }
      else
      {
        gs->next = new MyList<Parallel::gridseg>;
        gs = gs->next;
        gs->data = new Parallel::gridseg;
      }

      for (int i = 0; i < dim; i++)
      {
        double DH = bp->getdX(i);
        gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] - Pat->uui[i] * DH : bp->bbox[dim + i] - ghost_width * DH;
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
        // NOTE: our dividing structure is (exclude ghost)
        // -1 0
        //       1  2
        // so (0,1) does not belong to any part for vertex structure, we always put it to right part, this is consistent to
        // the fortran routine where we always take floor to get index
        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] + Pat->lli[i] * DH : bp->bbox[i] + (ghost_width - 1) * DH;
        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
#else
#ifdef Cell
        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] + Pat->lli[i] * DH : bp->bbox[i] + ghost_width * DH;
        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
#else
#error Not define Vertex nor Cell
#endif
#endif
      }
      gs->data->Bg = BP->data;
      gs->next = 0;
    }

    if (BP == Pat->ble)
      break;
    BP = BP->next;
  }

  return cgsl;
}
// collect all grid segments or blocks without ghost for given patch, and delete the ghost_width for interpolation consideration on the patch boundary
MyList<Parallel::gridseg> *Parallel::build_owned_gsl3(Patch *Pat, int rank_in, int Symmetry)
{
  MyList<Parallel::gridseg> *cgsl = 0, *gs;
  MyList<Block> *BP = Pat->blb;
  while (BP)
  {
    Block *bp = BP->data;
    if (bp->rank == rank_in)
    {
      if (!cgsl)
      {
        cgsl = gs = new MyList<Parallel::gridseg>;
        gs->data = new Parallel::gridseg;
      }
      else
      {
        gs->next = new MyList<Parallel::gridseg>;
        gs = gs->next;
        gs->data = new Parallel::gridseg;
      }

      for (int i = 0; i < dim; i++)
      {
        double DH = bp->getdX(i);
        gs->data->uub[i] = bp->bbox[dim + i] - ghost_width * DH;
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
        // NOTE: our dividing structure is (exclude ghost)
        // -1 0
        //       1  2
        // so (0,1) does not belong to any part for vertex structure, we always put it to right part, this is consistent to
        // the fortran routine where we always take floor to get index
        gs->data->llb[i] = bp->bbox[i] + (ghost_width - 1) * DH;
        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
#else
#ifdef Cell
        gs->data->llb[i] = bp->bbox[i] + ghost_width * DH;
        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
#else
#error Not define Vertex nor Cell
#endif
#endif
      }
      // Symmetry consideration
      if (Symmetry > 0)
      {
        double DH = bp->getdX(2);
        if (feq(bp->bbox[2], 0, DH / 2))
        {
          gs->data->llb[2] = bp->bbox[2];
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
          gs->data->shape[2] = int((gs->data->uub[2] - gs->data->llb[2]) / DH + 0.4) + 1;
#else
#ifdef Cell
          gs->data->shape[2] = int((gs->data->uub[2] - gs->data->llb[2]) / DH + 0.4);
#else
#error Not define Vertex nor Cell
#endif
#endif
        }
        if (Symmetry > 1)
        {
          for (int i = 0; i < 2; i++)
          {
            DH = bp->getdX(i);
            if (feq(bp->bbox[i], 0, DH / 2))
            {
              gs->data->llb[i] = bp->bbox[i];
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
              gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
#else
#ifdef Cell
              gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
#else
#error Not define Vertex nor Cell
#endif
#endif
            }
          }
        }
      }

      gs->data->Bg = BP->data;
      gs->next = 0;
    }

    if (BP == Pat->ble)
      break;
    BP = BP->next;
  }

  return cgsl;
}
// collect all grid segments or blocks without ghost nor buffer for given patch,
// and delete the ghost_width for interpolation consideration on the patch boundary
MyList<Parallel::gridseg> *Parallel::build_owned_gsl4(Patch *Pat, int rank_in, int Symmetry)
{
  MyList<Parallel::gridseg> *cgsl = 0, *gs;
  MyList<Block> *BP = Pat->blb;
  while (BP)
  {
    Block *bp = BP->data;
    if (bp->rank == rank_in)
    {
      if (!cgsl)
      {
        cgsl = gs = new MyList<Parallel::gridseg>;
        gs->data = new Parallel::gridseg;
      }
      else
      {
        gs->next = new MyList<Parallel::gridseg>;
        gs = gs->next;
        gs->data = new Parallel::gridseg;
      }

      for (int i = 0; i < dim; i++)
      {
        double DH = bp->getdX(i);
        gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] - Pat->uui[i] * DH : bp->bbox[dim + i];
        gs->data->uub[i] -= ghost_width * DH;
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
        // NOTE: our dividing structure is (exclude ghost)
        // -1 0
        //       1  2
        // so (0,1) does not belong to any part for vertex structure, we always put it to right part, this is consistent to
        // the fortran routine where we always take floor to get index
        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] + Pat->lli[i] * DH : bp->bbox[i];
        gs->data->llb[i] += (ghost_width - 1) * DH;
        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
#else
#ifdef Cell
        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] + Pat->lli[i] * DH : bp->bbox[i];
        gs->data->llb[i] += ghost_width * DH;
        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
#else
#error Not define Vertex nor Cell
#endif
#endif
      }
      // Symmetry consideration
      if (Symmetry > 0)
      {
        double DH = bp->getdX(2);
        if (feq(bp->bbox[2], 0, DH / 2))
        {
          gs->data->llb[2] = bp->bbox[2];
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
          gs->data->shape[2] = int((gs->data->uub[2] - gs->data->llb[2]) / DH + 0.4) + 1;
#else
#ifdef Cell
          gs->data->shape[2] = int((gs->data->uub[2] - gs->data->llb[2]) / DH + 0.4);
#else
#error Not define Vertex nor Cell
#endif
#endif
        }
        if (Symmetry > 1)
        {
          for (int i = 0; i < 2; i++)
          {
            DH = bp->getdX(i);
            if (feq(bp->bbox[i], 0, DH / 2))
            {
              gs->data->llb[i] = bp->bbox[i];
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
              gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
#else
#ifdef Cell
              gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
#else
#error Not define Vertex nor Cell
#endif
#endif
            }
          }
        }
      }

      gs->data->Bg = BP->data;
      gs->next = 0;
    }

    if (BP == Pat->ble)
      break;
    BP = BP->next;
  }

  return cgsl;
}
// collect all grid segments or blocks without ghost nor buffer for given patch, no extention
MyList<Parallel::gridseg> *Parallel::build_owned_gsl5(Patch *Pat, int rank_in)
{
  MyList<Parallel::gridseg> *cgsl = 0, *gs;
  MyList<Block> *BP = Pat->blb;
  while (BP)
  {
    Block *bp = BP->data;
    if (bp->rank == rank_in)
    {
      if (!cgsl)
      {
        cgsl = gs = new MyList<Parallel::gridseg>;
        gs->data = new Parallel::gridseg;
      }
      else
      {
        gs->next = new MyList<Parallel::gridseg>;
        gs = gs->next;
        gs->data = new Parallel::gridseg;
      }

      for (int i = 0; i < dim; i++)
      {
        double DH = bp->getdX(i);
        gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] - Pat->uui[i] * DH : bp->bbox[dim + i] - ghost_width * DH;
        gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] + Pat->lli[i] * DH : bp->bbox[i] + ghost_width * DH;
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1;
#else
#ifdef Cell
        gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4);
#else
#error Not define Vertex nor Cell
#endif
#endif
      }
      gs->data->Bg = BP->data;
      gs->next = 0;
    }

    if (BP == Pat->ble)
      break;
    BP = BP->next;
  }

  return cgsl;
}
// collect all grid segments or blocks without ghost for given patch list
// stupid method
/*
MyList<Parallel::gridseg> *Parallel::build_owned_gsl(MyList<Patch> *PatL,int rank_in,int type,int Symmetry)
{
       MyList<Parallel::gridseg> *cgsl=0,*gs;
       while(PatL)
       {
    if(!cgsl)
    {
            switch(type)
      {
         case 0:
                  cgsl = build_owned_gsl0(PatL->data,rank_in);
      break;
         case 1:
                  cgsl = build_owned_gsl1(PatL->data,rank_in);
      break;
         case 2:
                  cgsl = build_owned_gsl2(PatL->data,rank_in);
      break;
         case 3:
                  cgsl = build_owned_gsl3(PatL->data,rank_in,Symmetry);
      break;
         case 4:
                  cgsl = build_owned_gsl4(PatL->data,rank_in,Symmetry);
      break;
         case 5:
                  cgsl = build_owned_gsl5(PatL->data,rank_in);
      break;
               default:
      cout<<"Parallel::build_owned_gsl : unknown type = "<<type<<endl;
                  MPI_Abort(MPI_COMM_WORLD,1);
      }
       gs = cgsl;
       while(gs && gs->next) gs = gs->next;
    }
    else
    {
       switch(type)
      {
         case 0:
                  gs->next = build_owned_gsl0(PatL->data,rank_in);
      break;
         case 1:
                  gs->next = build_owned_gsl1(PatL->data,rank_in);
      break;
         case 2:
                  gs->next = build_owned_gsl2(PatL->data,rank_in);
      break;
         case 3:
                  gs->next = build_owned_gsl3(PatL->data,rank_in,Symmetry);
      break;
         case 4:
                  gs->next = build_owned_gsl4(PatL->data,rank_in,Symmetry);
      break;
         case 5:
                  gs->next = build_owned_gsl5(PatL->data,rank_in);
      break;
               default:
      cout<<"Parallel::build_owned_gsl : unknown type = "<<type<<endl;
                  MPI_Abort(MPI_COMM_WORLD,1);
      }
       while(gs && gs->next) gs = gs->next;
    }
    PatL = PatL->next;
       }

       return cgsl;
}
*/
// more clever method
MyList<Parallel::gridseg> *Parallel::build_owned_gsl(MyList<Patch> *PatL, int rank_in, int type, int Symmetry)
{
  MyList<Parallel::gridseg> *cgsl = 0, *gs;
  while (PatL)
  {
    switch (type)
    {
    case 0:
      gs = build_owned_gsl0(PatL->data, rank_in);
      break;
    case 1:
      gs = build_owned_gsl1(PatL->data, rank_in);
      break;
    case 2:
      gs = build_owned_gsl2(PatL->data, rank_in);
      break;
    case 3:
      gs = build_owned_gsl3(PatL->data, rank_in, Symmetry);
      break;
    case 4:
      gs = build_owned_gsl4(PatL->data, rank_in, Symmetry);
      break;
    case 5:
      gs = build_owned_gsl5(PatL->data, rank_in);
      break;
    default:
      cout << "Parallel::build_owned_gsl : unknown type = " << type << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
    if (cgsl)
      cgsl->catList(gs);
    else
      cgsl = gs;
    PatL = PatL->next;
  }

  return cgsl;
}
// according to overlape to determine real grid segments
void Parallel::build_gstl(MyList<Parallel::gridseg> *srci, MyList<Parallel::gridseg> *dsti,
                          MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst)
{
  *out_src = *out_dst = 0;

  if (!srci || !dsti)
    return;

  MyList<Parallel::gridseg> *s, *d;
  MyList<Parallel::gridseg> *s2, *d2;

  double llb[dim], uub[dim];

  s = srci;
  while (s)
  {
    Parallel::gridseg *sd = s->data;
    d = dsti;
    while (d)
    {
      Parallel::gridseg *dd = d->data;
      bool flag = true;
      for (int i = 0; i < dim; i++)
      {
        double SH = sd->Bg->getdX(i), DH = dd->Bg->getdX(i);
        llb[i] = Mymax(sd->llb[i], dd->llb[i]);
        uub[i] = Mymin(sd->uub[i], dd->uub[i]);
        // make sure the region boundary is consistent to the grids
        // here we only judge if the domain is empty, so do not need to adjust the align
        double lb = llb[i], ub = uub[i];
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
        // ---*---
        // x-------x
        //		if     (int(2*(sd->uub[i]-uub[i])/SH+0.4)%2 == 1) ub = uub[i]-SH/2;
        //		else if(int(2*(dd->uub[i]-uub[i])/DH+0.4)%2 == 1) ub = uub[i]-DH/2;
        //		if     (int(2*(llb[i]-sd->llb[i])/SH+0.4)%2 == 1) lb = llb[i]+SH/2;
        //		else if(int(2*(llb[i]-dd->llb[i])/DH+0.4)%2 == 1) lb = llb[i]+DH/2;
        if (lb > ub + Mymin(SH, DH) / 2)
        {
          flag = false;
          break;
        } // special for isolated point
#else
#ifdef Cell
        // |------|
        // |-------------|
        //		if     (int(2*(sd->uub[i]-uub[i])/SH+0.4)%2 == 1) ub = uub[i]+SH/2;
        //		else if(int(2*(dd->uub[i]-uub[i])/DH+0.4)%2 == 1) ub = uub[i]+DH/2;
        //        |------|
        // |-------------|
        //		if     (int(2*(llb[i]-sd->llb[i])/SH+0.4)%2 == 1) lb = llb[i]-SH/2;
        //		else if(int(2*(llb[i]-dd->llb[i])/DH+0.4)%2 == 1) lb = llb[i]-DH/2;
        if (ub - lb < Mymin(SH, DH) / 2)
        {
          flag = false;
          break;
        } // even for isolated point, it has a cell belong to it
#else
#error Not define Vertex nor Cell
#endif
#endif
      }

      if (flag)
      {
        if (!(*out_src))
        {
          *out_src = s2 = new MyList<Parallel::gridseg>;
          *out_dst = d2 = new MyList<Parallel::gridseg>;
          s2->data = new Parallel::gridseg;
          d2->data = new Parallel::gridseg;
        }
        else
        {
          s2->next = new MyList<Parallel::gridseg>;
          s2 = s2->next;
          d2->next = new MyList<Parallel::gridseg>;
          d2 = d2->next;
          s2->data = new Parallel::gridseg;
          d2->data = new Parallel::gridseg;
        }

        for (int i = 0; i < dim; i++)
        {
          double SH = sd->Bg->getdX(i), DH = dd->Bg->getdX(i);
          s2->data->llb[i] = d2->data->llb[i] = llb[i];
          s2->data->uub[i] = d2->data->uub[i] = uub[i];
// using float method to count point, we do not need following consideration (2012 nov 17)
#if 1

#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
          // old code distuinguish vertex and cell
          //		   if     (int(2*(sd->uub[i]-uub[i])/SH+0.4)%2 == 1) s2->data->uub[i] = uub[i]-SH/2;
          //		   else if(int(2*(dd->uub[i]-uub[i])/DH+0.4)%2 == 1) d2->data->uub[i] = uub[i]-DH/2;
          //	           if     (int(2*(llb[i]-sd->llb[i])/SH+0.4)%2 == 1) s2->data->llb[i] = llb[i]+SH/2;
          //		   else if(int(2*(llb[i]-dd->llb[i])/DH+0.4)%2 == 1) d2->data->llb[i] = llb[i]+DH/2;
          // new code: here we concern much more about missing point, because overlaping domain has been gaureented above
          if (int(2 * (sd->uub[i] - uub[i]) / SH + 0.4) % 2 == 1)
            s2->data->uub[i] = uub[i] + SH / 2;
          else if (int(2 * (dd->uub[i] - uub[i]) / DH + 0.4) % 2 == 1)
            d2->data->uub[i] = uub[i] + DH / 2;
          if (int(2 * (llb[i] - sd->llb[i]) / SH + 0.4) % 2 == 1)
            s2->data->llb[i] = llb[i] - SH / 2;
          else if (int(2 * (llb[i] - dd->llb[i]) / DH + 0.4) % 2 == 1)
            d2->data->llb[i] = llb[i] - DH / 2;
          s2->data->shape[i] = int((s2->data->uub[i] - s2->data->llb[i]) / SH + 0.4) + 1;
          d2->data->shape[i] = int((d2->data->uub[i] - d2->data->llb[i]) / DH + 0.4) + 1;
#else
#ifdef Cell
          if (int(2 * (sd->uub[i] - uub[i]) / SH + 0.4) % 2 == 1)
            s2->data->uub[i] = uub[i] + SH / 2;
          else if (int(2 * (dd->uub[i] - uub[i]) / DH + 0.4) % 2 == 1)
            d2->data->uub[i] = uub[i] + DH / 2;
          if (int(2 * (llb[i] - sd->llb[i]) / SH + 0.4) % 2 == 1)
            s2->data->llb[i] = llb[i] - SH / 2;
          else if (int(2 * (llb[i] - dd->llb[i]) / DH + 0.4) % 2 == 1)
            d2->data->llb[i] = llb[i] - DH / 2;
          s2->data->shape[i] = int((s2->data->uub[i] - s2->data->llb[i]) / SH + 0.4);
          d2->data->shape[i] = int((d2->data->uub[i] - d2->data->llb[i]) / DH + 0.4);
#else
#error Not define Vertex nor Cell
#endif
#endif

#endif
          s2->data->illb[i] = sd->illb[i];
          d2->data->illb[i] = dd->illb[i];
          s2->data->iuub[i] = sd->iuub[i];
          d2->data->iuub[i] = dd->iuub[i];
        }
        s2->data->Bg = sd->Bg;
        s2->next = 0;
        d2->data->Bg = dd->Bg;
        d2->next = 0;
      }
      d = d->next;
    }
    s = s->next;
  }
}
//   PACK: prepare target data in 'data'
// UNPACK: copy target data from 'data' to corresponding numerical grids
int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<Parallel::gridseg> *dst, int rank_in, int dir,
                          MyList<var> *VarLists /* source */, MyList<var> *VarListd /* target */, int Symmetry)
{
  int myrank;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

  int DIM = dim;

  if (dir != PACK && dir != UNPACK)
  {
    cout << "error dir " << dir << " for data_packer " << endl;
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  int size_out = 0;

  if (!src || !dst)
    return size_out;

  MyList<var> *varls, *varld;

  varls = VarLists;
  varld = VarListd;
  while (varls && varld)
  {
    varls = varls->next;
    varld = varld->next;
  }

  if (varls || varld)
  {
    cout << "error in short data packer, var lists does not match." << endl;
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  int type; /* 1 copy, 2 restrict, 3 prolong */
  if (src->data->Bg->lev == dst->data->Bg->lev)
    type = 1;
  else if (src->data->Bg->lev > dst->data->Bg->lev)
    type = 2;
  else
    type = 3;

  while (src && dst)
  {
    if ((dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) ||
        (dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank))
    {
      varls = VarLists;
      varld = VarListd;
      while (varls && varld)
      {
        if (data)
        {
          if (dir == PACK)
            switch (type)
            {
              // attention must be paied to the difference between src's llb,uub and dst's llb,uub
            case 1:
              f_copy(DIM, dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
                     src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
                     dst->data->llb, dst->data->uub);
              break;
            case 2:
              f_restrict3(DIM, dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
                          src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
                          dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
              break;
            case 3:
              f_prolong3(DIM, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
                         dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
                         dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
            }
          if (dir == UNPACK) // from target data to corresponding grid
            f_copy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn],
                   dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
                   dst->data->llb, dst->data->uub);
        }
        size_out += dst->data->shape[0] * dst->data->shape[1] * dst->data->shape[2];
        varls = varls->next;
        varld = varld->next;
      }
    }
    dst = dst->next;
    src = src->next;
  }

  return size_out;
}
int Parallel::data_packermix(double *data, MyList<Parallel::gridseg> *src, MyList<Parallel::gridseg> *dst, int rank_in, int dir,
                             MyList<var> *VarLists /* source */, MyList<var> *VarListd /* target */, int Symmetry)
{
  int myrank;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

  int DIM = dim;

  if (dir != PACK && dir != UNPACK)
  {
    cout << "Parallel::data_packermix: error dir " << dir << " for data_packermix." << endl;
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  int size_out = 0;

  if (!src || !dst)
    return size_out;

  MyList<var> *varls, *varld;

  varls = VarLists;
  varld = VarListd;
  while (varls && varld)
  {
    varls = varls->next;
    varld = varld->next;
  }

  if (varls || varld)
  {
    cout << "error in short data packer, var lists does not match." << endl;
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  int type; /* 1 copy, 2 restrict, 3 prolong */
  if (src->data->Bg->lev == dst->data->Bg->lev)
    type = 1;
  else if (src->data->Bg->lev > dst->data->Bg->lev)
    type = 2;
  else
    type = 3;

  if (type != 3)
  {
    cout << "Parallel::data_packermix: error type " << type << " for data_packermix." << endl;
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  while (src && dst)
  {
    if ((dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) ||
        (dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank))
    {
      varls = VarLists;
      varld = VarListd;
      while (varls && varld)
      {
        if (data)
        {
          if (dir == PACK)
            f_prolongcopy3(DIM, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
                           dst->data->llb, dst->data->uub, src->data->shape, data + size_out,
                           src->data->llb, src->data->uub, varls->data->SoA, Symmetry);
          if (dir == UNPACK) // from target data to corresponding grid
            f_prolongmix3(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn],
                          src->data->llb, src->data->uub, src->data->shape, data + size_out,
                          dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry, dst->data->illb, dst->data->iuub);
        }
        // the symmetry problem should be dealt in prolongcopy3,
        // so we always have ghost_width for both sides
        size_out += (src->data->shape[0] + 2 * ghost_width) * (src->data->shape[1] + 2 * ghost_width) * (src->data->shape[2] + 2 * ghost_width);
        varls = varls->next;
        varld = varld->next;
      }
    }
    dst = dst->next;
    src = src->next;
  }

  return size_out;
}
//
void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
                        MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
                        int Symmetry)
{
  int myrank, cpusize;
  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

  int node;

  MPI_Request *reqs;
  MPI_Status *stats;
  reqs = new MPI_Request[2 * cpusize];
  stats = new MPI_Status[2 * cpusize];
  int req_no = 0;

  double **send_data, **rec_data;
  send_data = new double *[cpusize];
  rec_data = new double *[cpusize];
  int length;

  for (node = 0; node < cpusize; node++)
  {
    send_data[node] = rec_data[node] = 0;
    if (node == myrank)
    {
      if (length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry))
      {
        rec_data[node] = new double[length];
        if (!rec_data[node])
        {
          cout << "out of memory when new in short transfer, place 1" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        data_packer(rec_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      }
    }
    else
    {
      // send from this cpu to cpu#node
      if (length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry))
      {
        send_data[node] = new double[length];
        if (!send_data[node])
        {
          cout << "out of memory when new in short transfer, place 2" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        data_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
        MPI_Isend((void *)send_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
      }
      // receive from cpu#node to this cpu
      if (length = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry))
      {
        rec_data[node] = new double[length];
        if (!rec_data[node])
        {
          cout << "out of memory when new in short transfer, place 3" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        MPI_Irecv((void *)rec_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
      }
    }
  }
  // wait for all requests to complete
  MPI_Waitall(req_no, reqs, stats);

  for (node = 0; node < cpusize; node++)
    if (rec_data[node])
      data_packer(rec_data[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);

  for (node = 0; node < cpusize; node++)
  {
    if (send_data[node])
      delete[] send_data[node];
    if (rec_data[node])
      delete[] rec_data[node];
  }

  delete[] reqs;
  delete[] stats;
  delete[] send_data;
  delete[] rec_data;
}
//
void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
                           MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
                           int Symmetry)
{
  int myrank, cpusize;
  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

  int node;

  MPI_Request *reqs;
  MPI_Status *stats;
  reqs = new MPI_Request[2 * cpusize];
  stats = new MPI_Status[2 * cpusize];
  int req_no = 0;

  double **send_data, **rec_data;
  send_data = new double *[cpusize];
  rec_data = new double *[cpusize];
  int length;

  for (node = 0; node < cpusize; node++)
  {
    send_data[node] = rec_data[node] = 0;
    if (node == myrank)
    {
      if (length = data_packermix(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry))
      {
        rec_data[node] = new double[length];
        if (!rec_data[node])
        {
          cout << "out of memory when new in short transfer, place 1" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        data_packermix(rec_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      }
    }
    else
    {
      // send from this cpu to cpu#node
      if (length = data_packermix(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry))
      {
        send_data[node] = new double[length];
        if (!send_data[node])
        {
          cout << "out of memory when new in short transfer, place 2" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        data_packermix(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
        MPI_Isend((void *)send_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
      }
      // receive from cpu#node to this cpu
      if (length = data_packermix(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry))
      {
        rec_data[node] = new double[length];
        if (!rec_data[node])
        {
          cout << "out of memory when new in short transfer, place 3" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        MPI_Irecv((void *)rec_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
      }
    }
  }
  // wait for all requests to complete
  MPI_Waitall(req_no, reqs, stats);

  for (node = 0; node < cpusize; node++)
    if (rec_data[node])
      data_packermix(rec_data[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);

  for (node = 0; node < cpusize; node++)
  {
    if (send_data[node])
      delete[] send_data[node];
    if (rec_data[node])
      delete[] rec_data[node];
  }

  delete[] reqs;
  delete[] stats;
  delete[] send_data;
  delete[] rec_data;
}
void Parallel::Sync(Patch *Pat, MyList<var> *VarList, int Symmetry)
{
  int cpusize;
  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);

  MyList<Parallel::gridseg> *dst;
  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
  src = new MyList<Parallel::gridseg> *[cpusize];
  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];

  dst = build_ghost_gsl(Pat); // ghost region only
  for (int node = 0; node < cpusize; node++)
  {
    src[node] = build_owned_gsl0(Pat, node);                              // for the part without ghost points and do not extend
    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer_src[node], data locate on cpu#node;
                                                                          // but for transfer_dst[node] the data may locate on any node
  }

  transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);

  if (dst)
    dst->destroyList();
  for (int node = 0; node < cpusize; node++)
  {
    if (src[node])
      src[node]->destroyList();
    if (transfer_src[node])
      transfer_src[node]->destroyList();
    if (transfer_dst[node])
      transfer_dst[node]->destroyList();
  }

  delete[] src;
  delete[] transfer_src;
  delete[] transfer_dst;
}
void Parallel::Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
{
  // Patch inner Synch
  MyList<Patch> *Pp = PatL;
  while (Pp)
  {
    Sync(Pp->data, VarList, Symmetry);
    Pp = Pp->next;
  }

  // Patch inter Synch
  int cpusize;
  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);

  MyList<Parallel::gridseg> *dst;
  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
  src = new MyList<Parallel::gridseg> *[cpusize];
  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];

  dst = build_buffer_gsl(PatL); // buffer region only
  for (int node = 0; node < cpusize; node++)
  {
    src[node] = build_owned_gsl(PatL, node, 5, Symmetry);                 // for the part without ghost nor buffer points and do not extend
    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
  }

  transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);

  if (dst)
    dst->destroyList();
  for (int node = 0; node < cpusize; node++)
  {
    if (src[node])
      src[node]->destroyList();
    if (transfer_src[node])
      transfer_src[node]->destroyList();
    if (transfer_dst[node])
      transfer_dst[node]->destroyList();
  }

  delete[] src;
  delete[] transfer_src;
  delete[] transfer_dst;
}
// Merged Sync: collect all intra-patch and inter-patch grid segment lists,
// then issue a single transfer() call instead of N+1 separate ones.
void Parallel::Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
{
  int cpusize;
  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);

  MyList<Parallel::gridseg> **combined_src = new MyList<Parallel::gridseg> *[cpusize];
  MyList<Parallel::gridseg> **combined_dst = new MyList<Parallel::gridseg> *[cpusize];
  for (int node = 0; node < cpusize; node++)
    combined_src[node] = combined_dst[node] = 0;

  // Phase A: Intra-patch ghost exchange segments
  MyList<Patch> *Pp = PatL;
  while (Pp)
  {
    Patch *Pat = Pp->data;
    MyList<Parallel::gridseg> *dst_ghost = build_ghost_gsl(Pat);

    for (int node = 0; node < cpusize; node++)
    {
      MyList<Parallel::gridseg> *src_owned = build_owned_gsl0(Pat, node);
      MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
      build_gstl(src_owned, dst_ghost, &tsrc, &tdst);

      if (tsrc)
      {
        if (combined_src[node])
          combined_src[node]->catList(tsrc);
        else
          combined_src[node] = tsrc;
      }
      if (tdst)
      {
        if (combined_dst[node])
          combined_dst[node]->catList(tdst);
        else
          combined_dst[node] = tdst;
      }

      if (src_owned)
        src_owned->destroyList();
    }

    if (dst_ghost)
      dst_ghost->destroyList();

    Pp = Pp->next;
  }

  // Phase B: Inter-patch buffer exchange segments
  MyList<Parallel::gridseg> *dst_buffer = build_buffer_gsl(PatL);
  for (int node = 0; node < cpusize; node++)
  {
    MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatL, node, 5, Symmetry);
    MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
    build_gstl(src_owned, dst_buffer, &tsrc, &tdst);

    if (tsrc)
    {
      if (combined_src[node])
        combined_src[node]->catList(tsrc);
      else
        combined_src[node] = tsrc;
    }
    if (tdst)
    {
      if (combined_dst[node])
        combined_dst[node]->catList(tdst);
      else
        combined_dst[node] = tdst;
    }

    if (src_owned)
      src_owned->destroyList();
  }
  if (dst_buffer)
    dst_buffer->destroyList();

  // Phase C: Single transfer
  transfer(combined_src, combined_dst, VarList, VarList, Symmetry);

  // Phase D: Cleanup
  for (int node = 0; node < cpusize; node++)
  {
    if (combined_src[node])
      combined_src[node]->destroyList();
    if (combined_dst[node])
      combined_dst[node]->destroyList();
  }
  delete[] combined_src;
  delete[] combined_dst;
}
// SyncCache constructor
Parallel::SyncCache::SyncCache()
    : valid(false), cpusize(0), combined_src(0), combined_dst(0),
      send_lengths(0), recv_lengths(0), send_bufs(0), recv_bufs(0),
      send_buf_caps(0), recv_buf_caps(0), reqs(0), stats(0), max_reqs(0),
      lengths_valid(false)
{
}
// SyncCache invalidate: free grid segment lists but keep buffers
void Parallel::SyncCache::invalidate()
{
  if (!valid)
    return;
  for (int i = 0; i < cpusize; i++)
  {
    if (combined_src[i])
      combined_src[i]->destroyList();
    if (combined_dst[i])
      combined_dst[i]->destroyList();
    combined_src[i] = combined_dst[i] = 0;
    send_lengths[i] = recv_lengths[i] = 0;
  }
  valid = false;
  lengths_valid = false;
}
// SyncCache destroy: free everything
void Parallel::SyncCache::destroy()
{
  invalidate();
  if (combined_src) delete[] combined_src;
  if (combined_dst) delete[] combined_dst;
  if (send_lengths) delete[] send_lengths;
  if (recv_lengths) delete[] recv_lengths;
  if (send_buf_caps) delete[] send_buf_caps;
  if (recv_buf_caps) delete[] recv_buf_caps;
  for (int i = 0; i < cpusize; i++)
  {
    if (send_bufs && send_bufs[i]) delete[] send_bufs[i];
    if (recv_bufs && recv_bufs[i]) delete[] recv_bufs[i];
  }
  if (send_bufs) delete[] send_bufs;
  if (recv_bufs) delete[] recv_bufs;
  if (reqs) delete[] reqs;
  if (stats) delete[] stats;
  combined_src = combined_dst = 0;
  send_lengths = recv_lengths = 0;
  send_buf_caps = recv_buf_caps = 0;
  send_bufs = recv_bufs = 0;
  reqs = 0; stats = 0;
  cpusize = 0; max_reqs = 0;
}
// transfer_cached: reuse pre-allocated buffers from SyncCache
void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
                               MyList<var> *VarList1, MyList<var> *VarList2,
                               int Symmetry, SyncCache &cache)
{
  int myrank;
  MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize);
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  int cpusize = cache.cpusize;

  int req_no = 0;
  int node;

  for (node = 0; node < cpusize; node++)
  {
    if (node == myrank)
    {
      int length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      cache.recv_lengths[node] = length;
      if (length > 0)
      {
        if (length > cache.recv_buf_caps[node])
        {
          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
          cache.recv_bufs[node] = new double[length];
          cache.recv_buf_caps[node] = length;
        }
        data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      }
    }
    else
    {
      // send
      int slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      cache.send_lengths[node] = slength;
      if (slength > 0)
      {
        if (slength > cache.send_buf_caps[node])
        {
          if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
          cache.send_bufs[node] = new double[slength];
          cache.send_buf_caps[node] = slength;
        }
        data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
        MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
      }
      // recv
      int rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
      cache.recv_lengths[node] = rlength;
      if (rlength > 0)
      {
        if (rlength > cache.recv_buf_caps[node])
        {
          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
          cache.recv_bufs[node] = new double[rlength];
          cache.recv_buf_caps[node] = rlength;
        }
        MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
      }
    }
  }

  MPI_Waitall(req_no, cache.reqs, cache.stats);

  for (node = 0; node < cpusize; node++)
    if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0)
      data_packer(cache.recv_bufs[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
}
// Sync_cached: build grid segment lists on first call, reuse on subsequent calls
void Parallel::Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache)
{
  if (!cache.valid)
  {
    int cpusize;
    MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
    cache.cpusize = cpusize;

    // Allocate cache arrays if needed
    if (!cache.combined_src)
    {
      cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
      cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
      cache.send_lengths = new int[cpusize];
      cache.recv_lengths = new int[cpusize];
      cache.send_bufs = new double *[cpusize];
      cache.recv_bufs = new double *[cpusize];
      cache.send_buf_caps = new int[cpusize];
      cache.recv_buf_caps = new int[cpusize];
      for (int i = 0; i < cpusize; i++)
      {
        cache.send_bufs[i] = cache.recv_bufs[i] = 0;
        cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
      }
      cache.max_reqs = 2 * cpusize;
      cache.reqs = new MPI_Request[cache.max_reqs];
      cache.stats = new MPI_Status[cache.max_reqs];
    }

    for (int node = 0; node < cpusize; node++)
    {
      cache.combined_src[node] = cache.combined_dst[node] = 0;
      cache.send_lengths[node] = cache.recv_lengths[node] = 0;
    }

    // Build intra-patch segments (same as Sync_merged Phase A)
    MyList<Patch> *Pp = PatL;
    while (Pp)
    {
      Patch *Pat = Pp->data;
      MyList<Parallel::gridseg> *dst_ghost = build_ghost_gsl(Pat);
      for (int node = 0; node < cpusize; node++)
      {
        MyList<Parallel::gridseg> *src_owned = build_owned_gsl0(Pat, node);
        MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
        build_gstl(src_owned, dst_ghost, &tsrc, &tdst);
        if (tsrc)
        {
          if (cache.combined_src[node])
            cache.combined_src[node]->catList(tsrc);
          else
            cache.combined_src[node] = tsrc;
        }
        if (tdst)
        {
          if (cache.combined_dst[node])
            cache.combined_dst[node]->catList(tdst);
          else
            cache.combined_dst[node] = tdst;
        }
        if (src_owned) src_owned->destroyList();
      }
      if (dst_ghost) dst_ghost->destroyList();
      Pp = Pp->next;
    }

    // Build inter-patch segments (same as Sync_merged Phase B)
    MyList<Parallel::gridseg> *dst_buffer = build_buffer_gsl(PatL);
    for (int node = 0; node < cpusize; node++)
    {
      MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatL, node, 5, Symmetry);
      MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
      build_gstl(src_owned, dst_buffer, &tsrc, &tdst);
      if (tsrc)
      {
        if (cache.combined_src[node])
          cache.combined_src[node]->catList(tsrc);
        else
          cache.combined_src[node] = tsrc;
      }
      if (tdst)
      {
        if (cache.combined_dst[node])
          cache.combined_dst[node]->catList(tdst);
        else
          cache.combined_dst[node] = tdst;
      }
      if (src_owned) src_owned->destroyList();
    }
    if (dst_buffer) dst_buffer->destroyList();

    cache.valid = true;
  }

  // Use cached lists with buffer-reusing transfer
  transfer_cached(cache.combined_src, cache.combined_dst, VarList, VarList, Symmetry, cache);
}
// Sync_start: pack and post MPI_Isend/Irecv, return immediately
void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
                          SyncCache &cache, AsyncSyncState &state)
{
  // Ensure cache is built
  if (!cache.valid)
  {
    // Build cache (same logic as Sync_cached)
    int cpusize;
    MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
    cache.cpusize = cpusize;

    if (!cache.combined_src)
    {
      cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
      cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
      cache.send_lengths = new int[cpusize];
      cache.recv_lengths = new int[cpusize];
      cache.send_bufs = new double *[cpusize];
      cache.recv_bufs = new double *[cpusize];
      cache.send_buf_caps = new int[cpusize];
      cache.recv_buf_caps = new int[cpusize];
      for (int i = 0; i < cpusize; i++)
      {
        cache.send_bufs[i] = cache.recv_bufs[i] = 0;
        cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
      }
      cache.max_reqs = 2 * cpusize;
      cache.reqs = new MPI_Request[cache.max_reqs];
      cache.stats = new MPI_Status[cache.max_reqs];
    }

    for (int node = 0; node < cpusize; node++)
    {
      cache.combined_src[node] = cache.combined_dst[node] = 0;
      cache.send_lengths[node] = cache.recv_lengths[node] = 0;
    }

    MyList<Patch> *Pp = PatL;
    while (Pp)
    {
      Patch *Pat = Pp->data;
      MyList<Parallel::gridseg> *dst_ghost = build_ghost_gsl(Pat);
      for (int node = 0; node < cpusize; node++)
      {
        MyList<Parallel::gridseg> *src_owned = build_owned_gsl0(Pat, node);
        MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
        build_gstl(src_owned, dst_ghost, &tsrc, &tdst);
        if (tsrc)
        {
          if (cache.combined_src[node])
            cache.combined_src[node]->catList(tsrc);
          else
            cache.combined_src[node] = tsrc;
        }
        if (tdst)
        {
          if (cache.combined_dst[node])
            cache.combined_dst[node]->catList(tdst);
          else
            cache.combined_dst[node] = tdst;
        }
        if (src_owned) src_owned->destroyList();
      }
      if (dst_ghost) dst_ghost->destroyList();
      Pp = Pp->next;
    }

    MyList<Parallel::gridseg> *dst_buffer = build_buffer_gsl(PatL);
    for (int node = 0; node < cpusize; node++)
    {
      MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatL, node, 5, Symmetry);
      MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
      build_gstl(src_owned, dst_buffer, &tsrc, &tdst);
      if (tsrc)
      {
        if (cache.combined_src[node])
          cache.combined_src[node]->catList(tsrc);
        else
          cache.combined_src[node] = tsrc;
      }
      if (tdst)
      {
        if (cache.combined_dst[node])
          cache.combined_dst[node]->catList(tdst);
        else
          cache.combined_dst[node] = tdst;
      }
      if (src_owned) src_owned->destroyList();
    }
    if (dst_buffer) dst_buffer->destroyList();
    cache.valid = true;
  }

  // Now pack and post async MPI operations
  int myrank;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  int cpusize = cache.cpusize;
  state.req_no = 0;
  state.active = true;

  MyList<Parallel::gridseg> **src = cache.combined_src;
  MyList<Parallel::gridseg> **dst = cache.combined_dst;

  for (int node = 0; node < cpusize; node++)
  {
    if (node == myrank)
    {
      int length;
      if (!cache.lengths_valid) {
        length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
        cache.recv_lengths[node] = length;
      } else {
        length = cache.recv_lengths[node];
      }
      if (length > 0)
      {
        if (length > cache.recv_buf_caps[node])
        {
          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
          cache.recv_bufs[node] = new double[length];
          cache.recv_buf_caps[node] = length;
        }
        data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
      }
    }
    else
    {
      int slength;
      if (!cache.lengths_valid) {
        slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
        cache.send_lengths[node] = slength;
      } else {
        slength = cache.send_lengths[node];
      }
      if (slength > 0)
      {
        if (slength > cache.send_buf_caps[node])
        {
          if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
          cache.send_bufs[node] = new double[slength];
          cache.send_buf_caps[node] = slength;
        }
        data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
        MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
      }
      int rlength;
      if (!cache.lengths_valid) {
        rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList, VarList, Symmetry);
        cache.recv_lengths[node] = rlength;
      } else {
        rlength = cache.recv_lengths[node];
      }
      if (rlength > 0)
      {
        if (rlength > cache.recv_buf_caps[node])
        {
          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
          cache.recv_bufs[node] = new double[rlength];
          cache.recv_buf_caps[node] = rlength;
        }
        MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
      }
    }
  }
  cache.lengths_valid = true;
}
// Sync_finish: wait for async MPI operations and unpack
void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state,
                           MyList<var> *VarList, int Symmetry)
{
  if (!state.active)
    return;

  MPI_Waitall(state.req_no, cache.reqs, cache.stats);

  int cpusize = cache.cpusize;
  MyList<Parallel::gridseg> **src = cache.combined_src;
  MyList<Parallel::gridseg> **dst = cache.combined_dst;

  for (int node = 0; node < cpusize; node++)
    if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0)
      data_packer(cache.recv_bufs[node], src[node], dst[node], node, UNPACK, VarList, VarList, Symmetry);

  state.active = false;
}
// collect buffer grid segments or blocks for the periodic boundary condition of given patch
// ---------------------------------------------------
// |con |                                       |con |
// |ner |                PhysBD                 |ner |
// |-------------------------------------------------|
// |    |                                       |    |
// |Phy |                                       |Phy |
// |sBD |                                       |BD  |
// |    |                                       |    |
// |    |                                       |    |
// |    |                                       |    |
// |-------------------------------------------------|
// |con |               PhysBD                  |con |
// |ner |                                       |ner |
// ---------------------------------------------------
// first order derivetive does not need conner information,
// but second order derivative needs!
/* the following code does not include conner part
MyList<Parallel::gridseg> *Parallel::build_PhysBD_gsl(Patch *Pat)
{
       MyList<Parallel::gridseg> *cgsl,*gsc,*gsb=0,*p;
       gsc = build_ghost_gsl(Pat);
       for(int i=0;i<dim;i++)
       {
         double DH = gsc->data->Bg->getdX(i);
// lower boundary
         if(gsb)
   {
          p = new MyList<Parallel::gridseg>;
          p->data = new Parallel::gridseg;
          p->next=gsb;
    gsb=p;
   }
   else
   {
          gsb = new MyList<Parallel::gridseg>;
          gsb->data = new Parallel::gridseg;
          gsb->next=0;
   }
         for(int j=0;j<dim;j++)
   {
           if(i == j)
     {
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
             gsb->data->llb[i] = Pat->bbox[i]-ghost_width*DH;
             gsb->data->uub[i] = Pat->bbox[i]-DH;
#else
#ifdef Cell
             gsb->data->llb[i] = Pat->bbox[i]-ghost_width*DH;
             gsb->data->uub[i] = Pat->bbox[i];
#else
#error Not define Vertex nor Cell
#endif
#endif
             gsb->data->shape[i] = ghost_width;
     }
     else
     {
             gsb->data->llb[j] = Pat->bbox[j];
             gsb->data->uub[j] = Pat->bbox[j+dim];
             gsb->data->shape[j] = Pat->shape[j];
     }
   }
   gsb->data->Bg = 0;  //vertual grid segment
// upper boundary
         p = new MyList<Parallel::gridseg>;
         p->data = new Parallel::gridseg;
         p->next=gsb;
   gsb=p;
         for(int j=0;j<dim;j++)
   {
           if(i == j)
     {
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
             gsb->data->llb[i] = Pat->bbox[i+dim]+DH;
             gsb->data->uub[i] = Pat->bbox[i+dim]+ghost_width*DH;
#else
#ifdef Cell
             gsb->data->llb[i] = Pat->bbox[i+dim];
             gsb->data->uub[i] = Pat->bbox[i+dim]+ghost_width*DH;
#else
#error Not define Vertex nor Cell
#endif
#endif
             gsb->data->shape[i] = ghost_width;
     }
     else
     {
             gsb->data->llb[j] = Pat->bbox[j];
             gsb->data->uub[j] = Pat->bbox[j+dim];
             gsb->data->shape[j] = Pat->shape[j];
     }
   }
   gsb->data->Bg = 0;  //vertual grid segment
       }

       cgsl = gsl_and(gsc,gsb);

       gsc->destroyList();
       gsb->destroyList();

       return cgsl;
}
*/
// the following code includes conner part
MyList<Parallel::gridseg> *Parallel::build_PhysBD_gsl(Patch *Pat)
{
  MyList<Parallel::gridseg> *cgsl, *gsc, *gsb = 0, *p;

  gsc = build_complete_gsl(Pat);

  gsb = new MyList<Parallel::gridseg>;
  gsb->data = new Parallel::gridseg;
  gsb->next = 0;
  gsb->data->Bg = 0;

  for (int j = 0; j < dim; j++)
  {
    gsb->data->llb[j] = Pat->bbox[j];
    gsb->data->uub[j] = Pat->bbox[j + dim];
    gsb->data->shape[j] = Pat->shape[j];
  }

  p = gsl_subtract(gsc, gsb);

  gsc->destroyList();
  gsb->destroyList();

  cgsl = divide_gsl(p, Pat);

  p->destroyList();

  return cgsl;
}
MyList<Parallel::gridseg> *Parallel::divide_gsl(MyList<Parallel::gridseg> *p, Patch *Pat)
{
  MyList<Parallel::gridseg> *cgsl = 0;
  while (p)
  {
    if (cgsl)
      cgsl->catList(divide_gs(p, Pat));
    else
      cgsl = divide_gs(p, Pat);
    p = p->next;
  }

  return cgsl;
}
// divide the gs into pices which locate either totally outside of the given Patch coordinate range
// or totally inside it. It's usefull for periodic boundary condition
MyList<Parallel::gridseg> *Parallel::divide_gs(MyList<Parallel::gridseg> *p, Patch *Pat)
{
  double DH[dim];
  for (int i = 0; i < dim; i++)
  {
    DH[i] = p->data->Bg->getdX(i);
  }

  int num[dim];
  double llb[3][dim], uub[3][dim];
  for (int i = 0; i < dim; i++)
  {
    if (p->data->llb[i] < Pat->bbox[i] - DH[i] / 2)
    {
      if (p->data->uub[i] > Pat->bbox[i + dim] + DH[i] / 2)
      {
        num[i] = 3;
        llb[0][i] = p->data->llb[i];
        llb[1][i] = Pat->bbox[i];
        uub[1][i] = Pat->bbox[i + dim];
        uub[2][i] = p->data->uub[i];
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
        uub[0][i] = Pat->bbox[i] - DH[i];
        llb[2][i] = Pat->bbox[i + dim] + DH[i];
#else
#ifdef Cell
        uub[0][i] = Pat->bbox[i];
        llb[2][i] = Pat->bbox[i + dim];
#else
#error Not define Vertex nor Cell
#endif
#endif
      }
      else if (p->data->uub[i] > Pat->bbox[i] + DH[i] / 2)
      {
        num[i] = 2;
        llb[0][i] = p->data->llb[i];
        llb[1][i] = Pat->bbox[i];
        uub[1][i] = p->data->uub[i];
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
        uub[0][i] = Pat->bbox[i] - DH[i];
#else
#ifdef Cell
        uub[0][i] = Pat->bbox[i];
#else
#error Not define Vertex nor Cell
#endif
#endif
      }
      else
      {
        num[i] = 1;
        llb[0][i] = p->data->llb[i];
        uub[0][i] = p->data->uub[i];
      }
    }
    else if (p->data->llb[i] < Pat->bbox[i + dim] - DH[i] / 2)
    {
      if (p->data->uub[i] > Pat->bbox[i + dim] + DH[i] / 2)
      {
        num[i] = 2;
        llb[0][i] = p->data->llb[i];
        uub[0][i] = Pat->bbox[i + dim];
        uub[1][i] = p->data->uub[i];
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
        llb[1][i] = Pat->bbox[i + dim] + DH[i];
#else
#ifdef Cell
        llb[1][i] = Pat->bbox[i + dim];
#else
#error Not define Vertex nor Cell
#endif
#endif
      }
      else
      {
        num[i] = 1;
        llb[0][i] = p->data->llb[i];
        uub[0][i] = p->data->uub[i];
      }
    }
    else
    {
      num[i] = 1;
      llb[0][i] = p->data->llb[i];
      uub[0][i] = p->data->uub[i];
    }
  }
  MyList<Parallel::gridseg> *cgsl = 0, *gg;
  int NN = 1;
  for (int i = 0; i < dim; i++)
    NN = NN * num[i];

  for (int i = 0; i < NN; i++)
  {
    int ind[dim];
    getarrayindex(dim, num, ind, i);
    gg = clone_gsl(p, true);
    for (int k = 0; k < dim; k++)
    {
      gg->data->llb[k] = llb[ind[k]][k];
      gg->data->uub[k] = uub[ind[k]][k];
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
      gg->data->shape[k] = int((uub[ind[k]][k] - llb[ind[k]][k]) / DH[k] + 0.4) + 1;
#else
#ifdef Cell
      gg->data->shape[k] = int((uub[ind[k]][k] - llb[ind[k]][k]) / DH[k] + 0.4);
#else
#error Not define Vertex nor Cell
#endif
#endif
    }

    if (cgsl)
      cgsl->catList(gg);
    else
      cgsl = gg;
  }

  return cgsl;
}
// after mod operation, according to overlape to determine real grid segments
void Parallel::build_PhysBD_gstl(Patch *Pat, MyList<Parallel::gridseg> *srci, MyList<Parallel::gridseg> *dsti,
                                 MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst)
{
  *out_src = *out_dst = 0;

  if (!srci || !dsti)
    return;

  MyList<Parallel::gridseg> *s, *d;
  MyList<Parallel::gridseg> *s2, *d2;

  double llb[dim], uub[dim];

  s = srci;
  while (s)
  {
    Parallel::gridseg *sd = s->data;
    d = dsti;
    while (d)
    {
      Parallel::gridseg *dd = d->data;
      bool flag = true;
      for (int i = 0; i < dim; i++)
      {
        double SH = sd->Bg->getdX(i), DH = dd->Bg->getdX(i);
        if (!feq(SH, DH, SH / 2))
        {
          cout << "Parallel::build_PhysBD_gstl meets different grid space SH = " << SH << ", DH = " << DH << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        // we assume dst and src locate on the same Patch
        if (dd->llb[i] < Pat->bbox[i])
          llb[i] = Mymax(sd->llb[i], dd->llb[i] + Pat->bbox[dim + i] - Pat->bbox[i]);
        else if (dd->llb[i] > Pat->bbox[i + dim])
          llb[i] = Mymax(sd->llb[i], dd->llb[i] - Pat->bbox[dim + i] + Pat->bbox[i]);
        else
          llb[i] = Mymax(sd->llb[i], dd->llb[i]);

        if (dd->uub[i] < Pat->bbox[i])
          uub[i] = Mymin(sd->uub[i], dd->uub[i] + Pat->bbox[dim + i] - Pat->bbox[i]);
        else if (dd->uub[i] > Pat->bbox[dim + i])
          uub[i] = Mymin(sd->uub[i], dd->uub[i] - Pat->bbox[dim + i] + Pat->bbox[i]);
        else
          uub[i] = Mymin(sd->uub[i], dd->uub[i]);
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
        if (llb[i] > uub[i] + SH / 2)
        {
          flag = false;
          break;
        } // special for isolated point
#else
#ifdef Cell
        if (llb[i] > uub[i])
        {
          flag = false;
          break;
        }
#else
#error Not define Vertex nor Cell
#endif
#endif
      }

      if (flag)
      {
        if (!(*out_src))
        {
          *out_src = s2 = new MyList<Parallel::gridseg>;
          *out_dst = d2 = new MyList<Parallel::gridseg>;
          s2->data = new Parallel::gridseg;
          d2->data = new Parallel::gridseg;
        }
        else
        {
          s2->next = new MyList<Parallel::gridseg>;
          s2 = s2->next;
          d2->next = new MyList<Parallel::gridseg>;
          d2 = d2->next;
          s2->data = new Parallel::gridseg;
          d2->data = new Parallel::gridseg;
        }

        for (int i = 0; i < dim; i++)
        {
          double SH = sd->Bg->getdX(i), DH = dd->Bg->getdX(i);
          s2->data->llb[i] = llb[i];
          s2->data->uub[i] = uub[i];

          if (dd->llb[i] < Pat->bbox[i])
            d2->data->llb[i] = llb[i] - Pat->bbox[dim + i] + Pat->bbox[i];
          else if (dd->llb[i] > Pat->bbox[i + dim])
            d2->data->llb[i] = llb[i] + Pat->bbox[dim + i] - Pat->bbox[i];
          else
            d2->data->llb[i] = llb[i];

          if (dd->uub[i] < Pat->bbox[i])
            d2->data->uub[i] = uub[i] - Pat->bbox[dim + i] + Pat->bbox[i];
          else if (dd->uub[i] > Pat->bbox[dim + i])
            d2->data->uub[i] = uub[i] + Pat->bbox[dim + i] - Pat->bbox[i];
          else
            d2->data->uub[i] = uub[i];
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
          s2->data->shape[i] = int((s2->data->uub[i] - s2->data->llb[i]) / SH + 0.4) + 1;
          d2->data->shape[i] = int((d2->data->uub[i] - d2->data->llb[i]) / DH + 0.4) + 1;
#else
#ifdef Cell
          s2->data->shape[i] = int((s2->data->uub[i] - s2->data->llb[i]) / SH + 0.4);
          d2->data->shape[i] = int((d2->data->uub[i] - d2->data->llb[i]) / DH + 0.4);
#else
#error Not define Vertex nor Cell
#endif
#endif
        }
        s2->data->Bg = sd->Bg;
        s2->next = 0;
        d2->data->Bg = dd->Bg;
        d2->next = 0;
      }
      d = d->next;
    }
    s = s->next;
  }
}
void Parallel::PeriodicBD(Patch *Pat, MyList<var> *VarList, int Symmetry)
{
  int cpusize;
  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);

  MyList<Parallel::gridseg> *dst;
  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
  src = new MyList<Parallel::gridseg> *[cpusize];
  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];

  dst = build_PhysBD_gsl(Pat);
  for (int node = 0; node < cpusize; node++)
  {
    src[node] = build_owned_gsl0(Pat, node);                                          // for the part without ghost points and do not extend
    build_PhysBD_gstl(Pat, src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
  }

  transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);

  if (dst)
    dst->destroyList();
  for (int node = 0; node < cpusize; node++)
  {
    if (src[node])
      src[node]->destroyList();
    if (transfer_src[node])
      transfer_src[node]->destroyList();
    if (transfer_dst[node])
      transfer_dst[node]->destroyList();
  }

  delete[] src;
  delete[] transfer_src;
  delete[] transfer_dst;
}
double Parallel::L2Norm(Patch *Pat, var *vf)
{
  int myrank;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

  double tvf, dtvf = 0;
  int BDW = ghost_width;

  MyList<Block> *BP = Pat->blb;
  while (BP)
  {
    Block *cg = BP->data;
    if (myrank == cg->rank)
    {
      f_l2normhelper(cg->shape, cg->X[0], cg->X[1], cg->X[2],
                     Pat->bbox[0], Pat->bbox[1], Pat->bbox[2],
                     Pat->bbox[3], Pat->bbox[4], Pat->bbox[5],
                     cg->fgfs[vf->sgfn], tvf, BDW);
      dtvf += tvf;
    }
    if (BP == Pat->ble)
      break;
    BP = BP->next;
  }

  MPI_Allreduce(&dtvf, &tvf, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

  tvf = sqrt(tvf);

  return tvf;
}
double Parallel::L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here)
{
  int myrank;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

  double tvf, dtvf = 0;
  int BDW = ghost_width;

  MyList<Block> *BP = Pat->blb;
  while (BP)
  {
    Block *cg = BP->data;
    if (myrank == cg->rank)
    {
      f_l2normhelper(cg->shape, cg->X[0], cg->X[1], cg->X[2],
                     Pat->bbox[0], Pat->bbox[1], Pat->bbox[2],
                     Pat->bbox[3], Pat->bbox[4], Pat->bbox[5],
                     cg->fgfs[vf->sgfn], tvf, BDW);
      dtvf += tvf;
    }
    if (BP == Pat->ble)
      break;
    BP = BP->next;
  }

  MPI_Allreduce(&dtvf, &tvf, 1, MPI_DOUBLE, MPI_SUM, Comm_here);

  tvf = sqrt(tvf);

  return tvf;
}
void Parallel::checkgsl(MyList<Parallel::gridseg> *pp, bool first_only)
{
  int myrank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  if (myrank == 0)
  {
    if (!pp)
      cout << " Parallel::checkgsl meets empty gsl" << endl;
    while (pp)
    {
      if (pp->data->Bg)
        cout << " on node#" << pp->data->Bg->rank << endl;
      else
        cout << " virtual grid segment" << endl;
      cout << " shape: (";
      for (int i = 0; i < dim; i++)
      {
        if (i < dim - 1)
          cout << pp->data->shape[i] << ",";
        else
          cout << pp->data->shape[i] << ")" << endl;
      }
      cout << " range: (";
      for (int i = 0; i < dim; i++)
      {
        if (i < dim - 1)
          cout << pp->data->llb[i] << ":" << pp->data->uub[i] << ",";
        else
          cout << pp->data->llb[i] << ":" << pp->data->uub[i] << ")" << endl;
      }
      if (first_only)
        return;
      pp = pp->next;
    }
  }
}
void Parallel::checkvarl(MyList<var> *pp, bool first_only)
{
  int myrank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  if (myrank == 0)
  {
    while (pp)
    {
      cout << "name: " << pp->data->name << endl;
      cout << "SoA = (" << pp->data->SoA[0] << "," << pp->data->SoA[1] << "," << pp->data->SoA[2] << ")" << endl;
      cout << "sgfn = " << pp->data->sgfn << endl;
      if (first_only)
        return;
      pp = pp->next;
    }
  }
}
void Parallel::prepare_inter_time_level(MyList<Patch> *PatL,
                                        MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
                                        MyList<var> *VarList3 /* target (t+a*dt) */, int tindex)
{
  while (PatL)
  {
    prepare_inter_time_level(PatL->data, VarList1, VarList2, VarList3, tindex);
    PatL = PatL->next;
  }
}
void Parallel::prepare_inter_time_level(Patch *Pat,
                                        MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
                                        MyList<var> *VarList3 /* target (t+a*dt) */, int tindex)
{
  int myrank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

  MyList<var> *varl1;
  MyList<var> *varl2;
  MyList<var> *varl3;

  MyList<Block> *BP = Pat->blb;
  while (BP)
  {
    Block *cg = BP->data;
    if (myrank == cg->rank)
    {
      varl1 = VarList1;
      varl2 = VarList2;
      varl3 = VarList3;
      while (varl1)
      {
        if (tindex == 0)
          f_average(cg->shape, cg->fgfs[varl1->data->sgfn], cg->fgfs[varl2->data->sgfn], cg->fgfs[varl3->data->sgfn]);
        else if (tindex == 1)
          f_average3(cg->shape, cg->fgfs[varl1->data->sgfn], cg->fgfs[varl2->data->sgfn], cg->fgfs[varl3->data->sgfn]);
        else if (tindex == -1)
          // just change data order to use average3
          f_average3(cg->shape, cg->fgfs[varl2->data->sgfn], cg->fgfs[varl1->data->sgfn], cg->fgfs[varl3->data->sgfn]);
        else
        {
          cout << "error tindex in Parallel::prepare_inter_time_level" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        varl1 = varl1->next;
        varl2 = varl2->next;
        varl3 = varl3->next;
      }
    }
    if (BP == Pat->ble)
      break;
    BP = BP->next;
  }
}
void Parallel::prepare_inter_time_level(MyList<Patch> *PatL,
                                        MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
                                        MyList<var> *VarList3 /* source (t-dt) */, MyList<var> *VarList4 /* target (t+a*dt) */, int tindex)
{
  while (PatL)
  {
    prepare_inter_time_level(PatL->data, VarList1, VarList2, VarList3, VarList4, tindex);
    PatL = PatL->next;
  }
}
void Parallel::prepare_inter_time_level(Patch *Pat,
                                        MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
                                        MyList<var> *VarList3 /* source (t-dt) */, MyList<var> *VarList4 /* target (t+a*dt) */, int tindex)
{
  int myrank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

  MyList<var> *varl1;
  MyList<var> *varl2;
  MyList<var> *varl3;
  MyList<var> *varl4;

  MyList<Block> *BP = Pat->blb;
  while (BP)
  {
    Block *cg = BP->data;
    if (myrank == cg->rank)
    {
      varl1 = VarList1;
      varl2 = VarList2;
      varl3 = VarList3;
      varl4 = VarList4;
      while (varl1)
      {
        if (tindex == 0)
          f_average2(cg->shape, cg->fgfs[varl1->data->sgfn], cg->fgfs[varl2->data->sgfn],
                     cg->fgfs[varl3->data->sgfn], cg->fgfs[varl4->data->sgfn]);
        else if (tindex == 1)
          f_average2p(cg->shape, cg->fgfs[varl1->data->sgfn], cg->fgfs[varl2->data->sgfn],
                      cg->fgfs[varl3->data->sgfn], cg->fgfs[varl4->data->sgfn]);
        else if (tindex == -1)
          f_average2m(cg->shape, cg->fgfs[varl1->data->sgfn], cg->fgfs[varl2->data->sgfn],
                      cg->fgfs[varl3->data->sgfn], cg->fgfs[varl4->data->sgfn]);
        else
        {
          cout << "error tindex in long cgh::prepare_inter_time_level" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        varl1 = varl1->next;
        varl2 = varl2->next;
        varl3 = varl3->next;
        varl4 = varl4->next;
      }
    }
    if (BP == Pat->ble)
      break;
    BP = BP->next;
  }
}
void Parallel::Prolong(Patch *Patc, Patch *Patf,
                       MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
                       int Symmetry)
{
  if (Patc->lev >= Patf->lev)
  {
    cout << "Parallel::Prolong: meet requst of Prolong from lev#" << Patc->lev << " to lev#" << Patf->lev << endl;
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
  int cpusize;
  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);

  MyList<Parallel::gridseg> *dst;
  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
  src = new MyList<Parallel::gridseg> *[cpusize];
  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];

  dst = build_complete_gsl(Patf); // including ghost
  for (int node = 0; node < cpusize; node++)
  {
    src[node] = build_owned_gsl4(Patc, node, Symmetry);                   // - buffer - ghost - BD ghost
    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
  }

  transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);

  if (dst)
    dst->destroyList();
  for (int node = 0; node < cpusize; node++)
  {
    if (src[node])
      src[node]->destroyList();
    if (transfer_src[node])
      transfer_src[node]->destroyList();
    if (transfer_dst[node])
      transfer_dst[node]->destroyList();
  }

  delete[] src;
  delete[] transfer_src;
  delete[] transfer_dst;
}
void Parallel::Restrict(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                        MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
                        int Symmetry)
{
  if (PatcL->data->lev >= PatfL->data->lev)
  {
    cout << "Parallel::Restrict: meet requst of Restrict from lev#" << PatfL->data->lev << " to lev#" << PatcL->data->lev << endl;
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
  int cpusize;
  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);

  MyList<Parallel::gridseg> *dst;
  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
  src = new MyList<Parallel::gridseg> *[cpusize];
  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];

  dst = build_complete_gsl(PatcL); // including ghost
  for (int node = 0; node < cpusize; node++)
  {
#if 0
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
      src[node]=build_owned_gsl(PatfL,node,2,Symmetry);   // - buffer - ghost
#else
#ifdef Cell
      src[node]=build_owned_gsl(PatfL,node,4,Symmetry); // - buffer - ghost - BD ghost
#else
#error Not define Vertex nor Cell
#endif
#endif
#else
    // it seems bam always use this
    src[node] = build_owned_gsl(PatfL, node, 2, Symmetry); // - buffer - ghost
#endif
    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
  }

  transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);

  if (dst)
    dst->destroyList();
  for (int node = 0; node < cpusize; node++)
  {
    if (src[node])
      src[node]->destroyList();
    if (transfer_src[node])
      transfer_src[node]->destroyList();
    if (transfer_dst[node])
      transfer_dst[node]->destroyList();
  }

  delete[] src;
  delete[] transfer_src;
  delete[] transfer_dst;
}
void Parallel::Restrict_after(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                              MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
                              int Symmetry)
{
  if (PatcL->data->lev >= PatfL->data->lev)
  {
    cout << "Parallel::Restrict: meet requst of Restrict from lev#" << PatfL->data->lev << " to lev#" << PatcL->data->lev << endl;
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
  int cpusize;
  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);

  MyList<Parallel::gridseg> *dst;
  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
  src = new MyList<Parallel::gridseg> *[cpusize];
  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];

  dst = build_complete_gsl(PatcL); // including ghost
  for (int node = 0; node < cpusize; node++)
  {
    src[node] = build_owned_gsl(PatfL, node, 3, Symmetry); // - ghost - BD ghost

    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
  }

  transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);

  if (dst)
    dst->destroyList();
  for (int node = 0; node < cpusize; node++)
  {
    if (src[node])
      src[node]->destroyList();
    if (transfer_src[node])
      transfer_src[node]->destroyList();
    if (transfer_dst[node])
      transfer_dst[node]->destroyList();
  }

  delete[] src;
  delete[] transfer_src;
  delete[] transfer_dst;
}
// for the same time level
void Parallel::OutBdLow2Hi(Patch *Patc, Patch *Patf,
                           MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
                           int Symmetry)
{
  if (Patc->lev >= Patf->lev)
  {
    cout << "Parallel::OutBdLow2Hi: meet requst of Prolong from lev#" << Patc->lev << " to lev#" << Patf->lev << endl;
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
  int cpusize;
  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);

  MyList<Parallel::gridseg> *dst;
  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
  src = new MyList<Parallel::gridseg> *[cpusize];
  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];

  dst = build_buffer_gsl(Patf); // buffer region only

  for (int node = 0; node < cpusize; node++)
  {
    src[node] = build_owned_gsl4(Patc, node, Symmetry);                   // - buffer - ghost - BD ghost
    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
  }

  transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);

  if (dst)
    dst->destroyList();
  for (int node = 0; node < cpusize; node++)
  {
    if (src[node])
      src[node]->destroyList();
    if (transfer_src[node])
      transfer_src[node]->destroyList();
    if (transfer_dst[node])
      transfer_dst[node]->destroyList();
  }

  delete[] src;
  delete[] transfer_src;
  delete[] transfer_dst;
}
void Parallel::OutBdLow2Hi(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                           MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
                           int Symmetry)
{
  MyList<Patch> *Pp, *Ppc;
  Ppc = PatcL;
  while (Ppc)
  {
    Pp = PatfL;
    while (Pp)
    {
      if (Ppc->data->lev >= Pp->data->lev)
      {
        cout << "Parallel::OutBdLow2Hi(list): meet requst of Prolong from lev#" << Ppc->data->lev << " to lev#" << Pp->data->lev << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
      Pp = Pp->next;
    }
    Ppc = Ppc->next;
  }

  int cpusize;
  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);

  MyList<Parallel::gridseg> *dst;
  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
  src = new MyList<Parallel::gridseg> *[cpusize];
  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];

  dst = build_buffer_gsl(PatfL); // buffer region only

  for (int node = 0; node < cpusize; node++)
  {
    src[node] = build_owned_gsl(PatcL, node, 4, Symmetry);                // - buffer - ghost - BD ghost
    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
  }

  transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);

  if (dst)
    dst->destroyList();
  for (int node = 0; node < cpusize; node++)
  {
    if (src[node])
      src[node]->destroyList();
    if (transfer_src[node])
      transfer_src[node]->destroyList();
    if (transfer_dst[node])
      transfer_dst[node]->destroyList();
  }

  delete[] src;
  delete[] transfer_src;
  delete[] transfer_dst;
}
// for the same time level
void Parallel::OutBdLow2Himix(Patch *Patc, Patch *Patf,
                              MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
                              int Symmetry)
{
  if (Patc->lev >= Patf->lev)
  {
    cout << "Parallel::OutBdLow2Himix: meet requst of Prolong from lev#" << Patc->lev << " to lev#" << Patf->lev << endl;
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
  int cpusize;
  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);

  MyList<Parallel::gridseg> *dst;
  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
  src = new MyList<Parallel::gridseg> *[cpusize];
  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];

  dst = build_buffer_gsl(Patf); // buffer region only

  for (int node = 0; node < cpusize; node++)
  {
    src[node] = build_owned_gsl4(Patc, node, Symmetry);                   // - buffer - ghost - BD ghost
    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
  }

  transfermix(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);

  if (dst)
    dst->destroyList();
  for (int node = 0; node < cpusize; node++)
  {
    if (src[node])
      src[node]->destroyList();
    if (transfer_src[node])
      transfer_src[node]->destroyList();
    if (transfer_dst[node])
      transfer_dst[node]->destroyList();
  }

  delete[] src;
  delete[] transfer_src;
  delete[] transfer_dst;

  // do not need this, we have done after calling of this routine in ProlongRestrict or RestrictProlong
  //    Sync(Patf,VarList2,Symmetry);  // fine level points may be not enough for interpolation
}
void Parallel::OutBdLow2Himix(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                              MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
                              int Symmetry)
{
  MyList<Patch> *Pp, *Ppc;
  Ppc = PatcL;
  while (Ppc)
  {
    Pp = PatfL;
    while (Pp)
    {
      if (Ppc->data->lev >= Pp->data->lev)
      {
        cout << "Parallel::OutBdLow2Himix(list): meet requst of Prolong from lev#" << Ppc->data->lev << " to lev#" << Pp->data->lev << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
      Pp = Pp->next;
    }
    Ppc = Ppc->next;
  }

  int cpusize;
  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);

  MyList<Parallel::gridseg> *dst;
  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
  src = new MyList<Parallel::gridseg> *[cpusize];
  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];

  dst = build_buffer_gsl(PatfL); // buffer region only

  for (int node = 0; node < cpusize; node++)
  {
    src[node] = build_owned_gsl(PatcL, node, 4, Symmetry);                // - buffer - ghost - BD ghost
    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
  }

  transfermix(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);

  if (dst)
    dst->destroyList();
  for (int node = 0; node < cpusize; node++)
  {
    if (src[node])
      src[node]->destroyList();
    if (transfer_src[node])
      transfer_src[node]->destroyList();
    if (transfer_dst[node])
      transfer_dst[node]->destroyList();
  }

  delete[] src;
  delete[] transfer_src;
  delete[] transfer_dst;
}

// Restrict_cached: cache grid segment lists, reuse buffers via transfer_cached
void Parallel::Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                               MyList<var> *VarList1, MyList<var> *VarList2,
                               int Symmetry, SyncCache &cache)
{
  if (!cache.valid)
  {
    int cpusize;
    MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
    cache.cpusize = cpusize;

    if (!cache.combined_src)
    {
      cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
      cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
      cache.send_lengths = new int[cpusize];
      cache.recv_lengths = new int[cpusize];
      cache.send_bufs = new double *[cpusize];
      cache.recv_bufs = new double *[cpusize];
      cache.send_buf_caps = new int[cpusize];
      cache.recv_buf_caps = new int[cpusize];
      for (int i = 0; i < cpusize; i++)
      {
        cache.send_bufs[i] = cache.recv_bufs[i] = 0;
        cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
      }
      cache.max_reqs = 2 * cpusize;
      cache.reqs = new MPI_Request[cache.max_reqs];
      cache.stats = new MPI_Status[cache.max_reqs];
    }

    MyList<Parallel::gridseg> *dst = build_complete_gsl(PatcL);
    for (int node = 0; node < cpusize; node++)
    {
      MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatfL, node, 2, Symmetry);
      build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]);
      if (src_owned) src_owned->destroyList();
    }
    if (dst) dst->destroyList();

    cache.valid = true;
  }

  transfer_cached(cache.combined_src, cache.combined_dst, VarList1, VarList2, Symmetry, cache);
}

// OutBdLow2Hi_cached: cache grid segment lists, reuse buffers via transfer_cached
void Parallel::OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                                  MyList<var> *VarList1, MyList<var> *VarList2,
                                  int Symmetry, SyncCache &cache)
{
  if (!cache.valid)
  {
    int cpusize;
    MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
    cache.cpusize = cpusize;

    if (!cache.combined_src)
    {
      cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
      cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
      cache.send_lengths = new int[cpusize];
      cache.recv_lengths = new int[cpusize];
      cache.send_bufs = new double *[cpusize];
      cache.recv_bufs = new double *[cpusize];
      cache.send_buf_caps = new int[cpusize];
      cache.recv_buf_caps = new int[cpusize];
      for (int i = 0; i < cpusize; i++)
      {
        cache.send_bufs[i] = cache.recv_bufs[i] = 0;
        cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
      }
      cache.max_reqs = 2 * cpusize;
      cache.reqs = new MPI_Request[cache.max_reqs];
      cache.stats = new MPI_Status[cache.max_reqs];
    }

    MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
    for (int node = 0; node < cpusize; node++)
    {
      MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatcL, node, 4, Symmetry);
      build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]);
      if (src_owned) src_owned->destroyList();
    }
    if (dst) dst->destroyList();

    cache.valid = true;
  }

  transfer_cached(cache.combined_src, cache.combined_dst, VarList1, VarList2, Symmetry, cache);
}

// OutBdLow2Himix_cached: same as OutBdLow2Hi_cached but uses transfermix for unpacking
void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                                     MyList<var> *VarList1, MyList<var> *VarList2,
                                     int Symmetry, SyncCache &cache)
{
  if (!cache.valid)
  {
    int cpusize;
    MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
    cache.cpusize = cpusize;

    if (!cache.combined_src)
    {
      cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
      cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
      cache.send_lengths = new int[cpusize];
      cache.recv_lengths = new int[cpusize];
      cache.send_bufs = new double *[cpusize];
      cache.recv_bufs = new double *[cpusize];
      cache.send_buf_caps = new int[cpusize];
      cache.recv_buf_caps = new int[cpusize];
      for (int i = 0; i < cpusize; i++)
      {
        cache.send_bufs[i] = cache.recv_bufs[i] = 0;
        cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
      }
      cache.max_reqs = 2 * cpusize;
      cache.reqs = new MPI_Request[cache.max_reqs];
      cache.stats = new MPI_Status[cache.max_reqs];
    }

    MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
    for (int node = 0; node < cpusize; node++)
    {
      MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatcL, node, 4, Symmetry);
      build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]);
      if (src_owned) src_owned->destroyList();
    }
    if (dst) dst->destroyList();

    cache.valid = true;
  }

  // Use transfermix instead of transfer for mix-mode interpolation
  int myrank;
  MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize);
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  int cpusize = cache.cpusize;

  int req_no = 0;
  for (int node = 0; node < cpusize; node++)
  {
    if (node == myrank)
    {
      int length = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      cache.recv_lengths[node] = length;
      if (length > 0)
      {
        if (length > cache.recv_buf_caps[node])
        {
          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
          cache.recv_bufs[node] = new double[length];
          cache.recv_buf_caps[node] = length;
        }
        data_packermix(cache.recv_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      }
    }
    else
    {
      int slength = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      cache.send_lengths[node] = slength;
      if (slength > 0)
      {
        if (slength > cache.send_buf_caps[node])
        {
          if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
          cache.send_bufs[node] = new double[slength];
          cache.send_buf_caps[node] = slength;
        }
        data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
        MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
      }
      int rlength = data_packermix(0, cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
      cache.recv_lengths[node] = rlength;
      if (rlength > 0)
      {
        if (rlength > cache.recv_buf_caps[node])
        {
          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
          cache.recv_bufs[node] = new double[rlength];
          cache.recv_buf_caps[node] = rlength;
        }
        MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
      }
    }
  }

  MPI_Waitall(req_no, cache.reqs, cache.stats);

  for (int node = 0; node < cpusize; node++)
    if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0)
      data_packermix(cache.recv_bufs[node], cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
}

// collect all buffer grid segments or blocks for given patch
MyList<Parallel::gridseg> *Parallel::build_buffer_gsl(Patch *Pat)
{
  MyList<Parallel::gridseg> *cgsl, *gsc, *gsb;

  gsc = build_complete_gsl(Pat); // including ghost

  gsb = new MyList<Parallel::gridseg>;
  gsb->data = new Parallel::gridseg;

  for (int i = 0; i < dim; i++)
  {
    double DH = Pat->blb->data->getdX(i);
    gsb->data->uub[i] = Pat->bbox[dim + i] - Pat->uui[i] * DH;
    gsb->data->llb[i] = Pat->bbox[i] + Pat->lli[i] * DH;
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
    gsb->data->shape[i] = int((gsb->data->uub[i] - gsb->data->llb[i]) / DH + 0.4) + 1;
#else
#ifdef Cell
    gsb->data->shape[i] = int((gsb->data->uub[i] - gsb->data->llb[i]) / DH + 0.4);
#else
#error Not define Vertex nor Cell
#endif
#endif
  }
  gsb->data->Bg = 0;
  gsb->next = 0;

  cgsl = gsl_subtract(gsc, gsb);

  gsc->destroyList();
  gsb->destroyList();

  //  set illb and iuub
  gsb = cgsl;
  while (gsb)
  {
    for (int i = 0; i < dim; i++)
    {
      double DH = Pat->blb->data->getdX(i);
      gsb->data->iuub[i] = Pat->bbox[dim + i] - Pat->uui[i] * DH;
      gsb->data->illb[i] = Pat->bbox[i] + Pat->lli[i] * DH;
    }
    gsb = gsb->next;
  }

  return cgsl;
}
MyList<Parallel::gridseg> *Parallel::build_buffer_gsl(MyList<Patch> *PatL)
{
  MyList<Parallel::gridseg> *cgsl = 0, *gs;
  while (PatL)
  {
    if (cgsl)
    {
      gs->next = build_buffer_gsl(PatL->data);
      gs = gs->next;
      if (gs)
        while (gs->next)
          gs = gs->next;
    }
    else
    {
      cgsl = build_buffer_gsl(PatL->data);
      gs = cgsl;
      if (gs)
        while (gs->next)
          gs = gs->next;
    }
    PatL = PatL->next;
  }

  return cgsl;
}
void Parallel::Prolongint(Patch *Patc, Patch *Patf,
                          MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
                          int Symmetry)
{
  if (Patc->lev >= Patf->lev)
  {
    cout << "Parallel::Prolong: meet requst of Prolong from lev#" << Patc->lev << " to lev#" << Patf->lev << endl;
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  int myrank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

  int num_var = 0;
  MyList<var> *varl;
  varl = VarList1;
  while (varl)
  {
    num_var++;
    varl = varl->next;
  }

  MyList<Block> *BP = Patf->blb;
  while (BP)
  {
    int Npts;
    if (myrank == BP->data->rank)
      Npts = BP->data->shape[0] * BP->data->shape[1] * BP->data->shape[2];
    MPI_Bcast(&Npts, 1, MPI_INT, BP->data->rank, MPI_COMM_WORLD);
    double *pox[3];
    for (int i = 0; i < 3; i++)
      pox[i] = new double[Npts];
    if (myrank == BP->data->rank)
    {
      for (int i = 0; i < Npts; i++)
      {
        int ind[3];
        Parallel::getarrayindex(3, BP->data->shape, ind, i);
        pox[0][i] = BP->data->X[0][ind[0]];
        pox[1][i] = BP->data->X[1][ind[1]];
        pox[2][i] = BP->data->X[2][ind[2]];
      }
    }
    for (int i = 0; i < 3; i++)
      MPI_Bcast(pox[i], Npts, MPI_DOUBLE, BP->data->rank, MPI_COMM_WORLD);
    double *res;
    res = new double[num_var * Npts];
    Patc->Interp_Points(VarList1, Npts, pox, res, Symmetry); // because this operation is a global operation (for all processors)
                                                             // we have to isolate it out of myrank==BP->data->rank
    if (myrank == BP->data->rank)
    {
      for (int i = 0; i < Npts; i++)
      {
        varl = VarList2;
        int j = 0;
        while (varl)
        {
          (BP->data->fgfs[varl->data->sgfn])[i] = res[j + i * num_var];
          j++;
          varl = varl->next;
        }
      }
    }
    delete[] pox[0];
    delete[] pox[1];
    delete[] pox[2];
    delete[] res;
    BP = BP->next;
  }
}
//
void Parallel::merge_gsl(MyList<gridseg> *&A, const double ratio)
{
  if (!A)
    return;

  MyList<gridseg> *B, *C, *D = A;
  bool flag = false;
  while (D->next)
  {
    B = D->next;
    while (B)
    {
      flag = merge_gs(D, B, C, ratio);
      if (flag)
        break;
      B = B->next;
    }
    if (flag)
      break;
    D = D->next;
  }

  if (flag)
  {
    // delete D and B from A
    MyList<gridseg> *E = A;
    while (E->next)
    {
      MyList<gridseg> *tp = E->next;
      if (D == tp || B == tp)
      {
        E->next = (tp->next) ? tp->next : 0;
        delete tp->data;
        delete tp;
      }
      if (E->next)
        E = E->next;
    }

    if (D == A)
    {
      MyList<gridseg> *tp = A;
      A = (A->next) ? A->next : 0;
      delete tp->data;
      delete tp;
    }
    // cat C to A
    if (A)
      A->catList(C);
    else
      A = C;

    merge_gsl(A, ratio);
  }
}
//
bool Parallel::merge_gs(MyList<gridseg> *D, MyList<gridseg> *B, MyList<gridseg> *&C, const double ratio)
{
  if (!B || !D)
    return false;

  C = 0;
  double llb[dim], uub[dim], DH[dim];
  for (int i = 0; i < dim; i++)
  {
    double tdh;
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
    DH[i] = (D->data->uub[i] - D->data->llb[i]) / (D->data->shape[i] - 1);
    tdh = (B->data->uub[i] - B->data->llb[i]) / (B->data->shape[i] - 1);
#else
#ifdef Cell
    DH[i] = (D->data->uub[i] - D->data->llb[i]) / D->data->shape[i];
    tdh = (B->data->uub[i] - B->data->llb[i]) / B->data->shape[i];
#else
#error Not define Vertex nor Cell
#endif
#endif
    if (!feq(DH[i], tdh, DH[i] / 2))
    {
      cout << "Parallel::merge_gs meets different grid segment " << DH[i] << " vs " << tdh << endl;
      checkgsl(B, true);
      checkgsl(D, true);
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
    llb[i] = Mymax(D->data->llb[i], B->data->llb[i]);
    uub[i] = Mymin(D->data->uub[i], B->data->uub[i]);
    //    if(uub[i]-llb[i] < DH[i]/2) return false;  //here this is valid for both vertex and cell

    // use 0 instead of DH[i]/2, we consider contact case, 2012 Aug 8
    if (uub[i] - llb[i] < 0)
      return false; // here this is valid for both vertex and cell
  }

  // vb: volume of B
  // vd: volume of D
  // vo: volume of overlap
  // vt: volume of smallest common box (virtual merged box)
  double vd = 1, vb = 1, vt = 1, vo = 1;
  for (int i = 0; i < dim; i++)
  {
    vt = vt * (Mymax(D->data->uub[i], B->data->uub[i]) - Mymin(D->data->llb[i], B->data->llb[i]));
    vo = vo * (uub[i] - llb[i]);
    vd = vd * (D->data->uub[i] - D->data->llb[i]);
    vb = vb * (B->data->uub[i] - B->data->llb[i]);
  }

  // smller ratio, more possible to merge
  if ((vd + vb - vo) / vt > ratio)
  {
    C = new MyList<gridseg>;
    C->data = new gridseg;
    for (int i = 0; i < dim; i++)
    {
      C->data->uub[i] = Mymax(D->data->uub[i], B->data->uub[i]);
      C->data->llb[i] = Mymin(D->data->llb[i], B->data->llb[i]);
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
      C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / DH[i] + 0.4) + 1;
#else
#ifdef Cell
      C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / DH[i] + 0.4);
#else
#error Not define Vertex nor Cell
#endif
#endif
    }
    if (D->data->Bg == B->data->Bg)
      C->data->Bg = D->data->Bg;
    else
      C->data->Bg = 0;

    C->next = 0;

    return true;
  }
  else
  {
    return false;
  }
}
// Add ghost region to tangent plane
// we assume the grids have the same resolution
void Parallel::add_ghost_touch(MyList<gridseg> *&A)
{
  if (!A || !(A->next))
    return;

  double DH[dim];
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
  for (int i = 0; i < dim; i++)
    DH[i] = (A->data->uub[i] - A->data->llb[i]) / (A->data->shape[i] - 1) / 2;
#else
#ifdef Cell
  for (int i = 0; i < dim; i++)
    DH[i] = (A->data->uub[i] - A->data->llb[i]) / A->data->shape[i] / 2;
#else
#error Not define Vertex nor Cell
#endif
#endif

  MyList<gridseg> *C1, *C2, *A1 = A, *A2, *dc;
  dc = C1 = clone_gsl(A, false);
  while (C1)
  {
    C2 = C1->next;
    A2 = A1->next;
    while (C2)
    {
      for (int i = 0; i < dim; i++)
      {
        if (feq(C1->data->llb[i], C2->data->uub[i], DH[i]))
        {
          // direction i touch, other directions overlap
          bool flag = true;
          for (int j = 0; j < i; j++)
            if ((C1->data->llb[j] - C2->data->llb[j]) * (C1->data->uub[j] - C2->data->llb[j]) > 0 &&
                (C2->data->llb[j] - C1->data->llb[j]) * (C2->data->uub[j] - C1->data->llb[j]) > 0)
              flag = false;
          for (int j = i + 1; j < dim; j++)
            if ((C1->data->llb[j] - C2->data->llb[j]) * (C1->data->uub[j] - C2->data->llb[j]) > 0 &&
                (C2->data->llb[j] - C1->data->llb[j]) * (C2->data->uub[j] - C1->data->llb[j]) > 0)
              flag = false;

          if (flag)
          {
            // only add one ghost region
            if (feq(A1->data->llb[i], C1->data->llb[i], DH[i]))
            {
              A1->data->llb[i] -= ghost_width * 2 * DH[i];
              A1->data->shape[i] += ghost_width;
            }
            if (feq(A2->data->uub[i], C2->data->uub[i], DH[i]))
            {
              A2->data->uub[i] += ghost_width * 2 * DH[i];
              A2->data->shape[i] += ghost_width;
            }
          }
        }
        if (feq(C1->data->uub[i], C2->data->llb[i], DH[i]))
        {
          // direction i touch, other directions overlap
          bool flag = true;
          for (int j = 0; j < i; j++)
            if ((C1->data->llb[j] - C2->data->llb[j]) * (C1->data->uub[j] - C2->data->llb[j]) > 0 &&
                (C2->data->llb[j] - C1->data->llb[j]) * (C2->data->uub[j] - C1->data->llb[j]) > 0)
              flag = false;
          for (int j = i + 1; j < dim; j++)
            if ((C1->data->llb[j] - C2->data->llb[j]) * (C1->data->uub[j] - C2->data->llb[j]) > 0 &&
                (C2->data->llb[j] - C1->data->llb[j]) * (C2->data->uub[j] - C1->data->llb[j]) > 0)
              flag = false;

          if (flag)
          {
            // only add one ghost region
            if (feq(A1->data->uub[i], C1->data->uub[i], DH[i]))
            {
              A1->data->uub[i] += ghost_width * 2 * DH[i];
              A1->data->shape[i] += ghost_width;
            }
            if (feq(A2->data->llb[i], C2->data->llb[i], DH[i]))
            {
              A2->data->llb[i] -= ghost_width * 2 * DH[i];
              A2->data->shape[i] += ghost_width;
            }
          }
        }
      }
      C2 = C2->next;
      A2 = A2->next;
    }
    C1 = C1->next;
    A1 = A1->next;
  }

  if (dc)
    dc->destroyList();
}
// According to overlap to cut the gsl into recular pices
void Parallel::cut_gsl(MyList<gridseg> *&A)
{
  if (!A)
    return;

  MyList<gridseg> *B, *C, *D = A;
  bool flag = false;
  while (D->next)
  {
    B = D->next;
    while (B)
    {
      flag = cut_gs(D, B, C);
      if (flag)
        break;
      B = B->next;
    }
    if (flag)
      break;
    D = D->next;
  }

  if (flag)
  {
    // delete D and B from A
    MyList<gridseg> *E = A;
    while (E->next)
    {
      MyList<gridseg> *tp = E->next;
      if (D == tp || B == tp)
      {
        E->next = (tp->next) ? tp->next : 0;
        delete tp->data;
        delete tp;
      }
      if (E->next)
        E = E->next;
    }

    if (D == A)
    {
      MyList<gridseg> *tp = A;
      A = (A->next) ? A->next : 0;
      delete tp->data;
      delete tp;
    }
    // cat C to A
    if (A)
      A->catList(C);
    else
      A = C;

    cut_gsl(A);
  }
}
// when D and B have overlap, cut them into C and return true
// otherwise return false and C=0
bool Parallel::cut_gs(MyList<gridseg> *D, MyList<gridseg> *B, MyList<gridseg> *&C)
{
  C = 0;
  double llb[dim], uub[dim], DH[dim];
  for (int i = 0; i < dim; i++)
  {
    double tdh;
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
    DH[i] = (D->data->uub[i] - D->data->llb[i]) / (D->data->shape[i] - 1);
    tdh = (B->data->uub[i] - B->data->llb[i]) / (B->data->shape[i] - 1);
#else
#ifdef Cell
    DH[i] = (D->data->uub[i] - D->data->llb[i]) / D->data->shape[i];
    tdh = (B->data->uub[i] - B->data->llb[i]) / B->data->shape[i];
#else
#error Not define Vertex nor Cell
#endif
#endif
    if (!feq(DH[i], tdh, DH[i] / 2))
    {
      cout << "Parallel::cut_gs meets different grid segment " << DH[i] << " vs " << tdh << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
    llb[i] = Mymax(D->data->llb[i], B->data->llb[i]);
    uub[i] = Mymin(D->data->uub[i], B->data->uub[i]);
    // for efficiency we ask the width of the patch at least 2(buffer+ghost+BD ghost)
    if (uub[i] - llb[i] < DH[i] * 2 * (buffer_width + 2 * ghost_width))
      return false; // here this is valid for both vertex and cell
  }

  // this part code results in 5 patches generally

  C = new MyList<gridseg>;
  C->data = new gridseg;
  for (int i = 0; i < dim; i++)
  {
    C->data->llb[i] = llb[i];
    C->data->uub[i] = uub[i];
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
    C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / DH[i] + 0.4) + 1;
#else
#ifdef Cell
    C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / DH[i] + 0.4);
#else
#error Not define Vertex nor Cell
#endif
#endif
  }
  if (D->data->Bg == B->data->Bg)
    C->data->Bg = D->data->Bg;
  else
    C->data->Bg = 0;

  C->next = gs_subtract_virtual(D, C);

  MyList<gridseg> *E = C;

  while (E->next)
    E = E->next;

  E->next = gs_subtract_virtual(B, C);

  // this part code results in 3 patches generally
  /*
       C = clone_gsl(D,true);
       C->next = gs_subtract_virtual(B,C);
  */

  return true;
}
// note here it is different to real cut, we need leave the cutting edge for both vertex center and cell center
MyList<Parallel::gridseg> *Parallel::gs_subtract_virtual(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B)
{
  if (!A)
    return 0;
  if (!B)
    return clone_gsl(A, true);

  double cut_plane[2 * dim], DH[dim];

  for (int i = 0; i < dim; i++)
  {
    double tdh;
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
    DH[i] = (A->data->uub[i] - A->data->llb[i]) / (A->data->shape[i] - 1);
    tdh = (B->data->uub[i] - B->data->llb[i]) / (B->data->shape[i] - 1);
#else
#ifdef Cell
    DH[i] = (A->data->uub[i] - A->data->llb[i]) / A->data->shape[i];
    tdh = (B->data->uub[i] - B->data->llb[i]) / B->data->shape[i];
#else
#error Not define Vertex nor Cell
#endif
#endif
    if (!feq(DH[i], tdh, DH[i] / 2))
    {
      cout << "Parallel::gs_subtract_virtual meets different grid segment " << DH[i] << " vs " << tdh << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }

  MyList<Parallel::gridseg> *C = 0, *q;
  for (int i = 0; i < dim; i++)
  {
    if (B->data->llb[i] > A->data->uub[i] || B->data->uub[i] < A->data->llb[i])
      return clone_gsl(A, true);
    cut_plane[i] = A->data->llb[i];
    cut_plane[i + dim] = A->data->uub[i];
  }

  for (int i = 0; i < dim; i++)
  {
    cut_plane[i] = Mymax(A->data->llb[i], B->data->llb[i]);
    if (cut_plane[i] > A->data->llb[i])
    {
      q = clone_gsl(A, true);
      // prolong the list from head
      if (C)
        q->next = C;
      C = q;
      for (int j = 0; j < dim; j++)
      {
        if (i == j)
        {
          C->data->llb[i] = A->data->llb[i];
          // **note here it is different to real cut, we need leave the cutting edge for both vertex center and cell center**
          C->data->uub[i] = Mymax(C->data->llb[i], cut_plane[i]);
        }
        else
        {
          C->data->llb[j] = cut_plane[j];
          C->data->uub[j] = cut_plane[j + dim];
        }
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4) + 1;
#else
#ifdef Cell
        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4);
#else
#error Not define Vertex nor Cell
#endif
#endif
      }
    }

    cut_plane[i + dim] = Mymin(A->data->uub[i], B->data->uub[i]);
    if (cut_plane[i + dim] < A->data->uub[i])
    {
      q = clone_gsl(A, true);
      if (C)
        q->next = C;
      C = q;
      for (int j = 0; j < dim; j++)
      {
        if (i == j)
        {
          C->data->uub[i] = A->data->uub[i];
          // note here it is different to real cut, we need leave the cutting edge for both vertex center and cell center
          C->data->llb[i] = Mymin(C->data->uub[i], cut_plane[i + dim]);
        }
        else
        {
          C->data->llb[j] = cut_plane[j];
          C->data->uub[j] = cut_plane[j + dim];
        }
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4) + 1;
#else
#ifdef Cell
        C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4);
#else
#error Not define Vertex nor Cell
#endif
#endif
      }
    }
  }
  return C;
}
// note the data structure
// if CC is true
// 1   -----------  1   ------  ^
//                  0   ------  |  t
// 0   -----------  old ------  |
//
// old -----------
// if CC is false
// 1   -----------  1   ------  ^
// 0   -----------  0   ------  |  t
// old -----------  old ------  |
void Parallel::fill_level_data(MyList<Patch> *PatLd, MyList<Patch> *PatLs, MyList<Patch> *PatcL,
                               MyList<var> *OldList, MyList<var> *StateList, MyList<var> *FutureList,
                               MyList<var> *tmList, int Symmetry, bool BB, bool CC)
{
  if (PatLd->data->lev != PatLs->data->lev)
  {
    cout << "Parallel::fill_level_data: meet requst from lev#" << PatLs->data->lev << " to lev#" << PatLd->data->lev << endl;
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
  if (PatLd->data->lev <= PatcL->data->lev)
  {
    cout << "Parallel::fill_level_data: meet prolong requst from lev#" << PatcL->data->lev << " to lev#" << PatLd->data->lev << endl;
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  int cpusize;
  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);

  MyList<var> *VarList = 0;
  MyList<var> *p;
  p = StateList;
  while (p)
  {
    if (VarList)
      VarList->insert(p->data);
    else
      VarList = new MyList<var>(p->data);
    p = p->next;
  }
  p = FutureList;
  while (p)
  {
    if (VarList)
      VarList->insert(p->data);
    else
      VarList = new MyList<var>(p->data);
    p = p->next;
  }

  MyList<Parallel::gridseg> *dst;
  MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
  src = new MyList<Parallel::gridseg> *[cpusize];
  transfer_src = new MyList<Parallel::gridseg> *[cpusize];
  transfer_dst = new MyList<Parallel::gridseg> *[cpusize];

  dst = build_complete_gsl(PatLd); // including ghost
  // copy part
  for (int node = 0; node < cpusize; node++)
  {
    src[node] = build_owned_gsl(PatLs, node, 0, Symmetry);                // similar to Sync
    build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
  }

  transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);

  for (int node = 0; node < cpusize; node++)
  {
    if (src[node])
      src[node]->destroyList();
    if (transfer_src[node])
      transfer_src[node]->destroyList();
    if (transfer_dst[node])
      transfer_dst[node]->destroyList();
  }

  MyList<Parallel::gridseg> *dsts, *dstd;
  dsts = build_complete_gsl_virtual(PatLs);
  dstd = dst;
  dst = gsl_subtract(dstd, dsts);
  if (dstd)
    dstd->destroyList();
  if (dsts)
    dsts->destroyList();

  if (dst)
  {
    // prolongation part
    for (int node = 0; node < cpusize; node++)
    {
      src[node] = build_owned_gsl(PatcL, node, 4, Symmetry);                // - buffer - ghost - BD ghost
      build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
    }

    if (CC)
    {
      // for FutureList
      // restrict first~~~>
      {
        Restrict(PatcL, PatLs, FutureList, FutureList, Symmetry);
        Sync(PatcL, FutureList, Symmetry);
      }
      //<~~~prolong then
      transfer(transfer_src, transfer_dst, FutureList, FutureList, Symmetry);

      // for StateList
      // time interpolation part
      if (BB)
        prepare_inter_time_level(PatcL, FutureList, StateList, OldList,
                                 tmList, 0); // use SynchList_pre as temporal storage space
      else
        prepare_inter_time_level(PatcL, FutureList, StateList,
                                 tmList, 0); // use SynchList_pre as temporal storage space
                                             // restrict first~~~>
      {
        Restrict(PatcL, PatLs, StateList, tmList, Symmetry);
        Sync(PatcL, tmList, Symmetry);
      }
      //<~~~prolong then
      transfer(transfer_src, transfer_dst, tmList, StateList, Symmetry);
    }
    else
    {
      // for both FutureList and StateList
      // restrict first~~~>
      {
        Restrict(PatcL, PatLs, VarList, VarList, Symmetry);
        Sync(PatcL, VarList, Symmetry);
      }
      //<~~~prolong then
      transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
    }

    for (int node = 0; node < cpusize; node++)
    {
      if (src[node])
        src[node]->destroyList();
      if (transfer_src[node])
        transfer_src[node]->destroyList();
      if (transfer_dst[node])
        transfer_dst[node]->destroyList();
    }

    dst->destroyList();
  }

  delete[] src;
  delete[] transfer_src;
  delete[] transfer_dst;

  VarList->clearList();
}
void Parallel::KillBlocks(MyList<Patch> *PatchLIST)
{
  while (PatchLIST)
  {
    Patch *Pp = PatchLIST->data;
    MyList<Block> *bg;
    while (Pp->blb)
    {
      if (Pp->blb == Pp->ble)
        break;
      bg = (Pp->blb->next) ? Pp->blb->next : 0;
      delete Pp->blb->data;
      delete Pp->blb;
      Pp->blb = bg;
    }
    if (Pp->ble)
    {
      delete Pp->ble->data;
      delete Pp->ble;
    }
    Pp->blb = Pp->ble = 0;
    PatchLIST = PatchLIST->next;
  }
}
bool Parallel::PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
                                     int NN, double **XX,
                                     double *Shellf, int Symmetry)
{
  MyList<var> *varl;
  int num_var = 0;
  varl = VarList;
  while (varl)
  {
    num_var++;
    varl = varl->next;
  }

  double lld[dim], uud[dim];
  double **pox;
  pox = new double *[dim];
  for (int j = 0; j < dim; j++)
    pox[j] = new double[1];
  for (int i = 0; i < NN; i++)
  {
    MyList<Patch> *PL = PatL;
    while (PL)
    {
      bool flag = true;
      for (int j = 0; j < dim; j++)
      {
        double h = PL->data->getdX(j);
        lld[j] = PL->data->lli[j] * h;
        uud[j] = PL->data->uui[j] * h;
        if (XX[j][i] < PL->data->bbox[j] + lld[j] || XX[j][i] > PL->data->bbox[j + dim] - uud[j])
        {
          flag = false;
          break;
        }
        pox[j][0] = XX[j][i];
      }
      if (flag)
      {
        PL->data->Interp_Points(VarList, 1, pox, Shellf + i * num_var, Symmetry);
        break;
      }
      PL = PL->next;
    }
    if (!PL)
    {
      checkpatchlist(PatL, false);
      return false;
    }
  }
  for (int j = 0; j < dim; j++)
    delete[] pox[j];
  delete[] pox;

  return true;
}
bool Parallel::PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
                                     int NN, double **XX,
                                     double *Shellf, int Symmetry, MPI_Comm Comm_here)
{
  MyList<var> *varl;
  int num_var = 0;
  varl = VarList;
  while (varl)
  {
    num_var++;
    varl = varl->next;
  }

  double lld[dim], uud[dim];
  double **pox;
  pox = new double *[dim];
  for (int j = 0; j < dim; j++)
    pox[j] = new double[1];
  for (int i = 0; i < NN; i++)
  {
    MyList<Patch> *PL = PatL;
    while (PL)
    {
      bool flag = true;
      for (int j = 0; j < dim; j++)
      {
        double h = PL->data->getdX(j);
        lld[j] = PL->data->lli[j] * h;
        uud[j] = PL->data->uui[j] * h;
        if (XX[j][i] < PL->data->bbox[j] + lld[j] || XX[j][i] > PL->data->bbox[j + dim] - uud[j])
        {
          flag = false;
          break;
        }
        pox[j][0] = XX[j][i];
      }
      if (flag)
      {
        PL->data->Interp_Points(VarList, 1, pox, Shellf + i * num_var, Symmetry, Comm_here);
        break;
      }
      PL = PL->next;
    }
    if (!PL)
    {
      checkpatchlist(PatL, false);
      return false;
    }
  }
  for (int j = 0; j < dim; j++)
    delete[] pox[j];
  delete[] pox;

  return true;
}
void Parallel::aligncheck(double *bbox0, double *bboxl, int lev, double *DH0, int *shape)
{
  const double aligntiny = 0.1;
  double DHl, rr;
  int NN;
  for (int i = 0; i < dim; i++)
  {
    DHl = DH0[i] * pow(0.5, lev);
    rr = bboxl[i] - bbox0[i];
    bboxl[i] = bbox0[i] + int(rr / DHl + 0.4) * DHl;
    rr = bbox0[i + dim] - bboxl[i + dim];
    bboxl[i + dim] = bbox0[i + dim] - int(rr / DHl + 0.4) * DHl;
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
    NN = int((bboxl[i + dim] - bboxl[i]) / DHl + 0.4) + 1;
#else
#ifdef Cell
    NN = int((bboxl[i + dim] - bboxl[i]) / DHl + 0.4);
#else
#error Not define Vertex nor Cell
#endif
#endif
    if (NN != shape[i])
    {
      int myrank;
      MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
      if (myrank == 0)
      {
        cout << "Parallel::aligncheck want shape " << NN << " for lev#" << lev << ", but " << shape[i] << endl;
        cout << "i = " << i << ", low = " << bboxl[i] << ", up = " << bboxl[i + dim] << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
  }
}
bool Parallel::point_locat_gsl(double *pox, MyList<Parallel::gridseg> *gsl)
{
  bool flag = false;
  while (gsl)
  {
    for (int i = 0; i < dim; i++)
    {
      if (pox[i] > gsl->data->llb[i] && pox[i] < gsl->data->uub[i])
        flag = true;
      else
      {
        flag = false;
        break;
      }
    }
    if (flag)
      break;
    gsl = gsl->next;
  }

  return flag;
}
void Parallel::checkpatchlist(MyList<Patch> *PatL, bool buflog)
{
  MyList<Patch> *PL = PatL;
  while (PL)
  {
    PL->data->checkPatch(buflog);
    PL = PL->next;
  }
}
// Check if load balancing is needed based on interpolation times
bool Parallel::check_load_balance_need(double *rank_times, int nprocs, int &num_heavy, int *heavy_ranks)
{
  // Calculate average time
  double avg_time = 0;
  for (int r = 0; r < nprocs; r++)
  {
    avg_time += rank_times[r];
  }
  avg_time /= nprocs;

  // Identify heavy ranks (time > 1.5x average)
  std::vector<std::pair<int, double>> rank_times_vec;
  for (int r = 0; r < nprocs; r++)
  {
    if (rank_times[r] > avg_time * 1.5)
    {
      rank_times_vec.push_back(std::make_pair(r, rank_times[r]));
    }
  }

  // Sort by time (descending)
  std::sort(rank_times_vec.begin(), rank_times_vec.end(),
            [](const std::pair<int, double>& a, const std::pair<int, double>& b) {
              return a.second > b.second;
            });

  // Take top 4 heavy ranks
  num_heavy = std::min(4, (int)rank_times_vec.size());
  if (num_heavy > 0)
  {
    for (int i = 0; i < num_heavy; i++)
    {
      heavy_ranks[i] = rank_times_vec[i].first;
    }
    return true;  // Load balancing is needed
  }

  return false;  // No load balancing needed
}

// Split blocks belonging to heavy ranks to improve load balancing
// Strategy: Split heavy rank blocks in half, merge 8 light ranks to free 4 ranks
void Parallel::split_heavy_blocks(MyList<Patch> *PatL, int *heavy_ranks, int num_heavy,
                                  int split_factor, int cpusize, int ingfsi, int fngfsi)
{
  int myrank, nprocs;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  if (myrank != 0) return; // Only rank 0 performs the analysis

  cout << "\n=== Load Balancing Strategy ===" << endl;
  cout << "Heavy ranks to split (in half): " << num_heavy << endl;
  for (int i = 0; i < num_heavy; i++)
    cout << "  Heavy rank " << heavy_ranks[i] << endl;

  // Step 1: Identify all blocks and their ranks
  std::vector<int> all_ranks;
  std::map<int, std::vector<Block*>> rank_to_blocks;

  MyList<Patch> *PL = PatL;
  while (PL)
  {
    Patch *PP = PL->data;
    MyList<Block> *BP = PP->blb;
    while (BP)
    {
      Block *block = BP->data;
      all_ranks.push_back(block->rank);
      rank_to_blocks[block->rank].push_back(block);
      BP = BP->next;
    }
    PL = PL->next;
  }

  // Step 2: Identify light ranks (not in heavy_ranks list)
  std::set<int> heavy_set(heavy_ranks, heavy_ranks + num_heavy);
  std::vector<int> light_ranks;
  for (int r : all_ranks)
  {
    if (heavy_set.find(r) == heavy_set.end())
    {
      light_ranks.push_back(r);
    }
  }

  // Remove duplicates from light_ranks
  std::sort(light_ranks.begin(), light_ranks.end());
  light_ranks.erase(std::unique(light_ranks.begin(), light_ranks.end()), light_ranks.end());

  cout << "Found " << light_ranks.size() << " light ranks (candidates for merging)" << endl;

  // Step 3: Select 8 light ranks to merge (those with smallest workload)
  // For now, we select the first 8 light ranks
  int num_to_merge = 8;
  if (light_ranks.size() < num_to_merge)
  {
    cout << "WARNING: Not enough light ranks to merge. Found " << light_ranks.size()
         << ", need " << num_to_merge << endl;
    num_to_merge = light_ranks.size();
  }

  std::vector<int> ranks_to_merge(light_ranks.begin(), light_ranks.begin() + num_to_merge);

  cout << "Light ranks to merge (8 -> 4 merged ranks):" << endl;
  for (int i = 0; i < num_to_merge; i++)
    cout << "  Rank " << ranks_to_merge[i] << endl;

  // Step 4: Analyze blocks that need to be split
  cout << "\n=== Analyzing blocks for splitting ===" << endl;

  struct BlockSplitInfo {
    Block *original_block;
    int split_dim;
    int split_point;
  };

  std::vector<BlockSplitInfo> blocks_to_split;

  PL = PatL;
  while (PL)
  {
    Patch *PP = PL->data;
    MyList<Block> *BP = PP->blb;
    while (BP)
    {
      Block *block = BP->data;

      // Check if this block belongs to a heavy rank
      for (int i = 0; i < num_heavy; i++)
      {
        if (block->rank == heavy_ranks[i])
        {
          // Find the largest dimension for splitting
          int max_dim = 0;
          int max_size = block->shape[0];
          for (int d = 1; d < dim; d++)
          {
            if (block->shape[d] > max_size)
            {
              max_size = block->shape[d];
              max_dim = d;
            }
          }

          int split_point = max_size / 2;

          BlockSplitInfo info;
          info.original_block = block;
          info.split_dim = max_dim;
          info.split_point = split_point;
          blocks_to_split.push_back(info);

          cout << "Block at rank " << block->rank << " will be split" << endl;
          cout << "  Shape: [" << block->shape[0] << ", " << block->shape[1] << ", " << block->shape[2] << "]" << endl;
          cout << "  Split along dimension " << max_dim << " at index " << split_point << endl;
          break;
        }
      }

      BP = BP->next;
    }
    PL = PL->next;
  }

  cout << "\nTotal blocks to split: " << blocks_to_split.size() << endl;

  // Step 5: Calculate new rank assignments
  // Strategy:
  // - For each heavy rank, its blocks are split in half
  // - First half keeps the original rank
  // - Second half gets a new rank (from the freed light ranks)
  // - 8 light ranks are merged into 4 ranks, freeing up 4 ranks

  std::vector<int> freed_ranks;
  for (size_t i = 0; i < ranks_to_merge.size(); i += 2)
  {
    // Merge pairs of light ranks: (ranks_to_merge[i], ranks_to_merge[i+1]) -> ranks_to_merge[i]
    // This frees up ranks_to_merge[i+1]
    if (i + 1 < ranks_to_merge.size())
    {
      freed_ranks.push_back(ranks_to_merge[i + 1]);
      cout << "Merging ranks " << ranks_to_merge[i] << " and " << ranks_to_merge[i + 1]
           << " -> keeping rank " << ranks_to_merge[i] << ", freeing rank " << ranks_to_merge[i + 1] << endl;
    }
  }

  cout << "\nFreed ranks available for split blocks: ";
  for (int r : freed_ranks)
    cout << r << " ";
  cout << endl;

  // Step 6: Assign new ranks to split blocks
  int freed_idx = 0;
  for (size_t i = 0; i < blocks_to_split.size(); i++)
  {
    BlockSplitInfo &info = blocks_to_split[i];
    Block *original = info.original_block;

    if (freed_idx < freed_ranks.size())
    {
      cout << "\nSplitting block at rank " << original->rank << endl;
      cout << "  First half: keeps rank " << original->rank << endl;
      cout << "  Second half: gets new rank " << freed_ranks[freed_idx] << endl;
      freed_idx++;
    }
    else
    {
      cout << "WARNING: Not enough freed ranks for all split blocks!" << endl;
      break;
    }
  }

  cout << "\n=== Load Balancing Analysis Complete ===" << endl;
  cout << "Next steps:" << endl;
  cout << "  1. Recompose the grid with new rank assignments" << endl;
  cout << "  2. Data migration will be handled by recompose_cgh" << endl;
  cout << "  3. Ghost zone communication will be updated automatically" << endl;
}