From 22c1e7168b21fdcca6bf105266acc508671ef92e Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Wed, 29 Apr 2026 17:05:10 +0800 Subject: [PATCH] Optimize BSSN CUDA resident state and CUDA-aware MPI --- AMSS_NCKU_Input.py | 2 +- AMSS_NCKU_source/Block.C | 125 +- AMSS_NCKU_source/Block.h | 13 +- AMSS_NCKU_source/Parallel.C | 13737 ++++++++++++++-------------- AMSS_NCKU_source/Parallel.h | 59 +- AMSS_NCKU_source/bssn_rhs_cuda.cu | 188 + AMSS_NCKU_source/bssn_rhs_cuda.h | 64 +- 7 files changed, 7461 insertions(+), 6727 deletions(-) diff --git a/AMSS_NCKU_Input.py b/AMSS_NCKU_Input.py index 67e7c1c..73af547 100755 --- a/AMSS_NCKU_Input.py +++ b/AMSS_NCKU_Input.py @@ -16,7 +16,7 @@ import numpy File_directory = "GW150914" ## output file directory Output_directory = "binary_output" ## binary data file directory ## The file directory name should not be too long -MPI_processes = 8 ## number of mpi processes used in the simulation +MPI_processes = 2 ## number of mpi processes used in the simulation GPU_Calculation = "yes" ## Use GPU or not ## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface) diff --git a/AMSS_NCKU_source/Block.C b/AMSS_NCKU_source/Block.C index fcae198..e0fa4ab 100644 --- a/AMSS_NCKU_source/Block.C +++ b/AMSS_NCKU_source/Block.C @@ -6,14 +6,68 @@ #include #include #include -#include -using namespace std; - -#include "Block.h" -#include "misc.h" - -Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fngfsi, int levi, const int cgpui) : rank(ranki), ingfs(ingfsi), fngfs(fngfsi), lev(levi), cgpu(cgpui) -{ +#include +using namespace std; + +#include "Block.h" +#include "misc.h" + +#if USE_CUDA_BSSN || USE_CUDA_Z4C +#include +#endif + +namespace { + +bool cuda_pin_gridfuncs_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_CUDA_PIN_GRIDFUNCS"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +double *alloc_gridfunc(size_t count, unsigned char &pinned) +{ + pinned = 0; +#if USE_CUDA_BSSN || USE_CUDA_Z4C + if (cuda_pin_gridfuncs_enabled()) + { + double *ptr = 0; + cudaError_t err = cudaMallocHost((void **)&ptr, count * sizeof(double)); + if (err == cudaSuccess) + { + pinned = 1; + return ptr; + } + cudaGetLastError(); + } +#endif + return (double *)malloc(sizeof(double) * count); +} + +void free_gridfunc(double *ptr, unsigned char pinned) +{ + if (!ptr) + return; +#if USE_CUDA_BSSN || USE_CUDA_Z4C + if (pinned) + { + cudaFreeHost(ptr); + return; + } +#else + (void)pinned; +#endif + free(ptr); +} + +} + +Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fngfsi, int levi, const int cgpui) : rank(ranki), lev(levi), cgpu(cgpui), ingfs(ingfsi), fngfs(fngfsi), igfs(0), fgfs(0), fgfs_pinned(0) +{ for (int i = 0; i < dim; i++) X[i] = 0; @@ -68,14 +122,15 @@ Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fng #endif } - int nn = shape[0] * shape[1] * shape[2]; - fgfs = new double *[fngfs]; - for (int i = 0; i < fngfs; i++) - { - fgfs[i] = (double *)malloc(sizeof(double) * nn); - if (!(fgfs[i])) - { - cout << "on node#" << rank << ", out of memory when constructing Block." << endl; + int nn = shape[0] * shape[1] * shape[2]; + fgfs = new double *[fngfs]; + fgfs_pinned = new unsigned char[fngfs]; + for (int i = 0; i < fngfs; i++) + { + fgfs[i] = alloc_gridfunc((size_t)nn, fgfs_pinned[i]); + if (!(fgfs[i])) + { + cout << "on node#" << rank << ", out of memory when constructing Block." << endl; MPI_Abort(MPI_COMM_WORLD, 1); } memset(fgfs[i], 0, sizeof(double) * nn); @@ -103,17 +158,19 @@ Block::~Block() { for (int i = 0; i < dim; i++) delete[] X[i]; - for (int i = 0; i < ingfs; i++) - free(igfs[i]); - delete[] igfs; - for (int i = 0; i < fngfs; i++) - free(fgfs[i]); - delete[] fgfs; - X[0] = X[1] = X[2] = 0; - igfs = 0; - fgfs = 0; - } -} + for (int i = 0; i < ingfs; i++) + free(igfs[i]); + delete[] igfs; + for (int i = 0; i < fngfs; i++) + free_gridfunc(fgfs[i], fgfs_pinned ? fgfs_pinned[i] : 0); + delete[] fgfs; + delete[] fgfs_pinned; + X[0] = X[1] = X[2] = 0; + igfs = 0; + fgfs = 0; + fgfs_pinned = 0; + } +} void Block::checkBlock() { int myrank; @@ -184,12 +241,14 @@ void Block::swapList(MyList *VarList1, MyList *VarList2, int myrank) if (rank == myrank) { MyList *varl1 = VarList1, *varl2 = VarList2; - while (varl1 && varl2) - { - misc::swap(fgfs[varl1->data->sgfn], fgfs[varl2->data->sgfn]); - varl1 = varl1->next; - varl2 = varl2->next; - } + while (varl1 && varl2) + { + misc::swap(fgfs[varl1->data->sgfn], fgfs[varl2->data->sgfn]); + if (fgfs_pinned) + misc::swap(fgfs_pinned[varl1->data->sgfn], fgfs_pinned[varl2->data->sgfn]); + varl1 = varl1->next; + varl2 = varl2->next; + } if (varl1 || varl2) { cout << "error in Block::swaplist, var lists does not match." << endl; diff --git a/AMSS_NCKU_source/Block.h b/AMSS_NCKU_source/Block.h index 28193fd..6c920ba 100644 --- a/AMSS_NCKU_source/Block.h +++ b/AMSS_NCKU_source/Block.h @@ -13,14 +13,15 @@ public: int shape[dim]; double bbox[2 * dim]; double *X[dim]; - int rank; // where the real data locate in - int lev, cgpu; - int ingfs, fngfs; - int *(*igfs); - double *(*fgfs); + int rank; // where the real data locate in + int lev, cgpu; + int ingfs, fngfs; + int *(*igfs); + double *(*fgfs); + unsigned char *fgfs_pinned; public: - Block() {}; + Block() : rank(0), lev(0), cgpu(0), ingfs(0), fngfs(0), igfs(0), fgfs(0), fgfs_pinned(0) {}; Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fngfs, int levi, const int cgpui = 0); ~Block(); diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index 4df55a8..6760047 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -1,5 +1,5 @@ - -#include "Parallel.h" + +#include "Parallel.h" #include "fmisc.h" #include "prolongrestrict.h" #include "misc.h" @@ -23,6 +23,10 @@ namespace { +#if USE_CUDA_BSSN || USE_CUDA_Z4C +static thread_local bool s_cuda_aware_pack_active = false; +#endif + struct SyncProfileStats { long long start_calls; @@ -268,3735 +272,3933 @@ bool cuda_direct_unpack_segment(double *buffer, sync_profile_stats().direct_unpack_sec += MPI_Wtime() - t0; return ok; } + +bool cuda_aware_mpi_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_CUDA_AWARE_MPI"); + enabled = (!env || atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +bool cuda_mpi_diag_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_CUDA_MPI_DIAG"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0 || sync_profile_enabled(); +} + +double *alloc_device_comm_buffer(int length) +{ + if (length <= 0) + return 0; + double *ptr = 0; + cudaError_t err = cudaMalloc((void **)&ptr, (size_t)length * sizeof(double)); + if (err != cudaSuccess) + { + fprintf(stderr, "Parallel: cudaMalloc failed for device comm buffer (%d doubles, err=%d)\n", + length, (int)err); + MPI_Abort(MPI_COMM_WORLD, 1); + } + return ptr; +} + +void free_device_comm_buffer(double *&ptr) +{ + if (!ptr) + return; + cudaFree(ptr); + ptr = 0; +} + +void ensure_device_comm_buffer(double **buffers, int *caps, int idx, int length) +{ + if (length <= caps[idx]) + return; + free_device_comm_buffer(buffers[idx]); + buffers[idx] = alloc_device_comm_buffer(length); + if (!buffers[idx]) + { + fprintf(stderr, "Parallel: failed to allocate device communication buffer (%d doubles)\n", length); + MPI_Abort(MPI_COMM_WORLD, 1); + } + caps[idx] = length; +} + +bool cuda_direct_pack_segment_to_device(double *buffer, + const Parallel::gridseg *src, + const Parallel::gridseg *dst, + int state_count) +{ +#if USE_CUDA_BSSN + if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT) + return false; + const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; + const int i0 = cuda_seg_begin(dst, src->Bg, 0); + const int j0 = cuda_seg_begin(dst, src->Bg, 1); + const int k0 = cuda_seg_begin(dst, src->Bg, 2); + const bool ok = bssn_cuda_pack_state_batch_to_device_buffer( + src->Bg, state_count, buffer, src->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0; + if (sync_profile_enabled()) + sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0; + return ok; +#else + (void)buffer; (void)src; (void)dst; (void)state_count; + return false; +#endif +} + +bool cuda_direct_unpack_segment_from_device(double *buffer, + const Parallel::gridseg *dst, + int state_count) +{ +#if USE_CUDA_BSSN + if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT) + return false; + const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; + const int i0 = cuda_seg_begin(dst, dst->Bg, 0); + const int j0 = cuda_seg_begin(dst, dst->Bg, 1); + const int k0 = cuda_seg_begin(dst, dst->Bg, 2); + const bool ok = bssn_cuda_unpack_state_batch_from_device_buffer( + dst->Bg, state_count, buffer, dst->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0; + if (sync_profile_enabled()) + sync_profile_stats().direct_unpack_sec += MPI_Wtime() - t0; + return ok; +#else + (void)buffer; (void)dst; (void)state_count; + return false; +#endif +} + +bool cuda_device_state_count_supported(int state_count) +{ +#if USE_CUDA_BSSN + return state_count > 0 && state_count <= BSSN_CUDA_STATE_COUNT; +#else + (void)state_count; + return false; +#endif +} + +bool cuda_segments_same_level(MyList *src, + MyList *dst, + int rank_in, + int dir, + int myrank) +{ + bool has_work = false; + while (src && dst) + { + if ((dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) || + (dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank)) + { + has_work = true; + if (!src->data || !dst->data || !src->data->Bg || !dst->data->Bg || + src->data->Bg->lev != dst->data->Bg->lev) + return false; + } + src = src->next; + dst = dst->next; + } + return has_work; +} + +bool cuda_pack_to_device_eligible(MyList *src, + MyList *dst, + int rank_in, + int state_count, + int myrank) +{ + if (!cuda_aware_mpi_enabled() || !cuda_device_state_count_supported(state_count)) + return false; + if (!cuda_segments_same_level(src, dst, rank_in, PACK, myrank)) + return false; + while (src && dst) + { + if (dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank && + !cuda_can_direct_pack(src->data, dst->data, 1)) + return false; + src = src->next; + dst = dst->next; + } + return true; +} + +bool cuda_recv_to_device_eligible(MyList *src, + MyList *dst, + int rank_in, + int state_count, + int myrank) +{ + if (!cuda_aware_mpi_enabled() || !cuda_device_state_count_supported(state_count)) + return false; + if (!cuda_segments_same_level(src, dst, rank_in, UNPACK, myrank)) + return false; + while (src && dst) + { + if (src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank && + !cuda_can_direct_unpack(dst->data, 1)) + return false; + src = src->next; + dst = dst->next; + } + return true; +} + +int data_packer_with_device_buffer(double *data, + MyList *src, + MyList *dst, + int rank_in, + int dir, + MyList *VarLists, + MyList *VarListd, + int Symmetry) +{ + s_cuda_aware_pack_active = true; + int n = Parallel::data_packer(data, src, dst, rank_in, dir, VarLists, VarListd, Symmetry); + s_cuda_aware_pack_active = false; + return n; +} #endif } // namespace int Parallel::partition1(int &nx, int split_size, int min_width, int cpusize, int shape) // special for 1 diemnsion { - nx = Mymax(1, shape / min_width); - nx = Mymin(cpusize, nx); - - return nx; -} -int Parallel::partition2(int *nxy, int split_size, int *min_width, int cpusize, int *shape) // special for 2 diemnsions -{ -#define SEARCH_SIZE 5 - int i, j, nx, ny; - int maxnx, maxny; - int mnx, mny; - int dn, hmin_width, cmin_width; - int cnx, cny; - double fx, fy; - int block_size; - int n; - - block_size = shape[0] * shape[1]; - n = Mymax(1, (block_size + split_size / 2) / split_size); - - maxnx = Mymax(1, shape[0] / min_width[0]); - maxnx = Mymin(cpusize, maxnx); - maxny = Mymax(1, shape[1] / min_width[1]); - maxny = Mymin(cpusize, maxny); - fx = (double)shape[0] / (shape[0] + shape[1]); - fy = (double)shape[1] / (shape[0] + shape[1]); - nx = mnx = Mymax(1, Mymin(maxnx, (int)(sqrt(double(n)) * fx / fy))); - ny = mny = Mymax(1, Mymin(maxny, (int)(sqrt(double(n)) * fy / fx))); - dn = abs(n - nx * ny); - hmin_width = Mymin(shape[0] / nx, shape[1] / ny); - for (cny = Mymax(1, mny - SEARCH_SIZE); cny <= (Mymin(mny + SEARCH_SIZE, maxny)); cny++) - for (cnx = Mymax(1, mnx - SEARCH_SIZE); cnx <= (Mymin(mnx + SEARCH_SIZE, maxnx)); cnx++) - { - cmin_width = Mymin(shape[0] / cnx, shape[1] / cny); - if (dn > abs(n - cnx * cny) || (dn == abs(n - cnx * cny) && cmin_width > hmin_width)) - { - dn = abs(n - cnx * cny); - nx = cnx; - ny = cny; - hmin_width = cmin_width; - } - } - - nxy[0] = nx; - nxy[1] = ny; - - return nx * ny; -#undef SEARCH_SIZE -} -int Parallel::partition3(int *nxyz, int split_size, int *min_width, int cpusize, int *shape) // special for 3 diemnsions -#if 1 // algrithsm from Pretorius -{ -// cout< abs(n - cnx * cny * cnz) || (dn == abs(n - cnx * cny * cnz) && cmin_width > hmin_width)) - { - dn = abs(n - cnx * cny * cnz); - nx = cnx; - ny = cny; - nz = cnz; - hmin_width = cmin_width; - } - } - - nxyz[0] = nx; - nxyz[1] = ny; - nxyz[2] = nz; - - return nx * ny * nz; -#undef SEARCH_SIZE -} -#elif 1 // Zhihui's idea one on 2013-09-25 -{ - int nx, ny, nz; - int hmin_width; - hmin_width = Mymin(min_width[0], min_width[1]); - hmin_width = Mymin(hmin_width, min_width[2]); - nx = shape[0] / hmin_width; - if (nx * hmin_width < shape[0]) - nx++; - ny = shape[1] / hmin_width; - if (ny * hmin_width < shape[1]) - ny++; - nz = shape[2] / hmin_width; - if (nz * hmin_width < shape[2]) - nz++; - while (nx * ny * nz > cpusize) - { - hmin_width++; - nx = shape[0] / hmin_width; - if (nx * hmin_width < shape[0]) - nx++; - ny = shape[1] / hmin_width; - if (ny * hmin_width < shape[1]) - ny++; - nz = shape[2] / hmin_width; - if (nz * hmin_width < shape[2]) - nz++; - } - - nxyz[0] = nx; - nxyz[1] = ny; - nxyz[2] = nz; - - return nx * ny * nz; -} -#elif 1 // Zhihui's idea two on 2013-09-25 -{ - int nx, ny, nz; - const int hmin_width = 8; // for example we use 8 - nx = shape[0] / hmin_width; - if (nx * hmin_width < shape[0]) - nx++; - ny = shape[1] / hmin_width; - if (ny * hmin_width < shape[1]) - ny++; - nz = shape[2] / hmin_width; - if (nz * hmin_width < shape[2]) - nz++; - - nxyz[0] = nx; - nxyz[1] = ny; - nxyz[2] = nz; - - return nx * ny * nz; -} -#endif -// distribute the data to cprocessors -#if (PSTR == 0) -MyList *Parallel::distribute(MyList *PatchLIST, int cpusize, int ingfsi, int fngfsi, - bool periodic, int nodes) -{ -#ifdef USE_GPU_DIVIDE - double cpu_part, gpu_part; - map::iterator iter; - iter = parameters::dou_par.find("cpu part"); - if (iter != parameters::dou_par.end()) - { - cpu_part = iter->second; - } - else - { - int myrank; - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - // read parameter from file - const int LEN = 256; - char pline[LEN]; - string str, sgrp, skey, sval; - int sind; - char pname[50]; - { - map::iterator iter = parameters::str_par.find("inputpar"); - if (iter != parameters::str_par.end()) - { - strcpy(pname, (iter->second).c_str()); - } - else - { - cout << "Error inputpar" << endl; - exit(0); - } - } - ifstream inf(pname, ifstream::in); - if (!inf.good() && myrank == 0) - { - cout << "Can not open parameter file " << pname << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - - for (int i = 1; inf.good(); i++) - { - inf.getline(pline, LEN); - str = pline; - - int status = misc::parse_parts(str, sgrp, skey, sval, sind); - if (status == -1) - { - cout << "error reading parameter file " << pname << " in line " << i << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - else if (status == 0) - continue; - - if (sgrp == "ABE") - { - if (skey == "cpu part") - cpu_part = atof(sval.c_str()); - } - } - inf.close(); - - parameters::dou_par.insert(map::value_type("cpu part", cpu_part)); - } - iter = parameters::dou_par.find("gpu part"); - if (iter != parameters::dou_par.end()) - { - gpu_part = iter->second; - } - else - { - int myrank; - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - // read parameter from file - const int LEN = 256; - char pline[LEN]; - string str, sgrp, skey, sval; - int sind; - char pname[50]; - { - map::iterator iter = parameters::str_par.find("inputpar"); - if (iter != parameters::str_par.end()) - { - strcpy(pname, (iter->second).c_str()); - } - else - { - cout << "Error inputpar" << endl; - exit(0); - } - } - ifstream inf(pname, ifstream::in); - if (!inf.good() && myrank == 0) - { - cout << "Can not open parameter file " << pname << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - - for (int i = 1; inf.good(); i++) - { - inf.getline(pline, LEN); - str = pline; - - int status = misc::parse_parts(str, sgrp, skey, sval, sind); - if (status == -1) - { - cout << "error reading parameter file " << pname << " in line " << i << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - else if (status == 0) - continue; - - if (sgrp == "ABE") - { - if (skey == "gpu part") - gpu_part = atof(sval.c_str()); - } - } - inf.close(); - - parameters::dou_par.insert(map::value_type("gpu part", gpu_part)); - } - - if (nodes == 0) - nodes = cpusize / 2; -#else - if (nodes == 0) - nodes = cpusize; -#endif - - if (dim != 3) - { - cout << "distrivute: now we only support 3-dimension" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - - MyList *BlL = 0; - - int split_size, min_size, block_size = 0; - - int min_width = 2 * Mymax(ghost_width, buffer_width); - int nxyz[dim], mmin_width[dim], min_shape[dim]; - - MyList *PLi = PatchLIST; - for (int i = 0; i < dim; i++) - min_shape[i] = PLi->data->shape[i]; - int lev = PLi->data->lev; - PLi = PLi->next; - while (PLi) - { - Patch *PP = PLi->data; - for (int i = 0; i < dim; i++) - min_shape[i] = Mymin(min_shape[i], PP->shape[i]); - if (lev != PLi->data->lev) - cout << "Parallel::distribute CAUSTION: meet Patches for different level: " << lev << " and " << PLi->data->lev << endl; - PLi = PLi->next; - } - - for (int i = 0; i < dim; i++) - mmin_width[i] = Mymin(min_width, min_shape[i]); - - min_size = mmin_width[0]; - for (int i = 1; i < dim; i++) - min_size = min_size * mmin_width[i]; - - PLi = PatchLIST; - while (PLi) - { - Patch *PP = PLi->data; - // PP->checkPatch(true); - int bs = PP->shape[0]; - for (int i = 1; i < dim; i++) - bs = bs * PP->shape[i]; - block_size = block_size + bs; - PLi = PLi->next; - } - split_size = Mymax(min_size, block_size / nodes); - split_size = Mymax(1, split_size); - - int n_rank = 0; - PLi = PatchLIST; - int reacpu = 0; - while (PLi) - { - Patch *PP = PLi->data; - - reacpu += partition3(nxyz, split_size, mmin_width, nodes, PP->shape); - - Block *ng0, *ng; - int shape_here[dim], ibbox_here[2 * dim]; - double bbox_here[2 * dim], dd; - - // ibbox : 0,...N-1 - for (int i = 0; i < nxyz[0]; i++) - for (int j = 0; j < nxyz[1]; j++) - for (int k = 0; k < nxyz[2]; k++) - { - ibbox_here[0] = (PP->shape[0] * i) / nxyz[0]; - ibbox_here[3] = (PP->shape[0] * (i + 1)) / nxyz[0] - 1; - ibbox_here[1] = (PP->shape[1] * j) / nxyz[1]; - ibbox_here[4] = (PP->shape[1] * (j + 1)) / nxyz[1] - 1; - ibbox_here[2] = (PP->shape[2] * k) / nxyz[2]; - ibbox_here[5] = (PP->shape[2] * (k + 1)) / nxyz[2] - 1; - - if (periodic) - { - ibbox_here[0] = ibbox_here[0] - ghost_width; - ibbox_here[3] = ibbox_here[3] + ghost_width; - ibbox_here[1] = ibbox_here[1] - ghost_width; - ibbox_here[4] = ibbox_here[4] + ghost_width; - ibbox_here[2] = ibbox_here[2] - ghost_width; - ibbox_here[5] = ibbox_here[5] + ghost_width; - } - else - { - ibbox_here[0] = Mymax(0, ibbox_here[0] - ghost_width); - ibbox_here[3] = Mymin(PP->shape[0] - 1, ibbox_here[3] + ghost_width); - ibbox_here[1] = Mymax(0, ibbox_here[1] - ghost_width); - ibbox_here[4] = Mymin(PP->shape[1] - 1, ibbox_here[4] + ghost_width); - ibbox_here[2] = Mymax(0, ibbox_here[2] - ghost_width); - ibbox_here[5] = Mymin(PP->shape[2] - 1, ibbox_here[5] + ghost_width); - } - - shape_here[0] = ibbox_here[3] - ibbox_here[0] + 1; - shape_here[1] = ibbox_here[4] - ibbox_here[1] + 1; - shape_here[2] = ibbox_here[5] - ibbox_here[2] + 1; -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - // 0--4, 5--10 - dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1); - bbox_here[0] = PP->bbox[0] + ibbox_here[0] * dd; - bbox_here[3] = PP->bbox[0] + ibbox_here[3] * dd; - - dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1); - bbox_here[1] = PP->bbox[1] + ibbox_here[1] * dd; - bbox_here[4] = PP->bbox[1] + ibbox_here[4] * dd; - - dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1); - bbox_here[2] = PP->bbox[2] + ibbox_here[2] * dd; - bbox_here[5] = PP->bbox[2] + ibbox_here[5] * dd; -#else -#ifdef Cell - // 0--5, 5--10 - dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0]; - bbox_here[0] = PP->bbox[0] + (ibbox_here[0]) * dd; - bbox_here[3] = PP->bbox[0] + (ibbox_here[3] + 1) * dd; - - dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1]; - bbox_here[1] = PP->bbox[1] + (ibbox_here[1]) * dd; - bbox_here[4] = PP->bbox[1] + (ibbox_here[4] + 1) * dd; - - dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2]; - bbox_here[2] = PP->bbox[2] + (ibbox_here[2]) * dd; - bbox_here[5] = PP->bbox[2] + (ibbox_here[5] + 1) * dd; -#else -#error Not define Vertex nor Cell -#endif -#endif - -#ifdef USE_GPU_DIVIDE - { - const int pices = 2; - double picef[pices]; - picef[0] = cpu_part; - picef[1] = gpu_part; - int shape_res[dim * pices]; - double bbox_res[2 * dim * pices]; - misc::dividBlock(dim, shape_here, bbox_here, pices, picef, shape_res, bbox_res, min_width); - ng = ng0 = new Block(dim, shape_res, bbox_res, n_rank++, ingfsi, fngfsi, PP->lev, 0); // delete through KillBlocks - - // if(n_rank==cpusize) {n_rank=0; cerr<<"place one!!"<checkBlock(); - if (BlL) - BlL->insert(ng); - else - BlL = new MyList(ng); // delete through KillBlocks - - for (int i = 1; i < pices; i++) - { - ng = new Block(dim, shape_res + i * dim, bbox_res + i * 2 * dim, n_rank++, ingfsi, fngfsi, PP->lev, i); // delete through KillBlocks - // if(n_rank==cpusize) {n_rank=0; cerr<<"place two!! "<checkBlock(); - BlL->insert(ng); - } - } -#else - ng = ng0 = new Block(dim, shape_here, bbox_here, n_rank++, ingfsi, fngfsi, PP->lev); - // ng->checkBlock(); - if (BlL) - BlL->insert(ng); - else - BlL = new MyList(ng); // delete through KillBlocks -#endif - if (n_rank == cpusize) - n_rank = 0; - - // set PP->blb - if (i == 0 && j == 0 && k == 0) - { - MyList *Bp = BlL; - while (Bp->data != ng0) - Bp = Bp->next; // ng0 is the first of the pices list - PP->blb = Bp; - } - } - // set PP->ble - { - MyList *Bp = BlL; - while (Bp->data != ng) - Bp = Bp->next; // ng is the last of the pices list - PP->ble = Bp; - } - PLi = PLi->next; - } - if (reacpu < nodes * 2 / 3) - { - int myrank; - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - if (myrank == 0) - cout << "Parallel::distribute CAUSTION: level#" << lev << " uses essencially " << reacpu << " processors vs " << nodes << " nodes run, your scientific computation scale is not as large as you estimate." << endl; - } - - return BlL; -} - -#ifdef INTERP_LB_OPTIMIZE -#include "interp_lb_profile_data.h" - -MyList *Parallel::distribute_optimize(MyList *PatchLIST, int cpusize, int ingfsi, int fngfsi, - bool periodic, int nodes) -{ -#ifdef USE_GPU_DIVIDE - double cpu_part, gpu_part; - map::iterator iter; - iter = parameters::dou_par.find("cpu part"); - if (iter != parameters::dou_par.end()) - { - cpu_part = iter->second; - } - else - { - int myrank; - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - const int LEN = 256; - char pline[LEN]; - string str, sgrp, skey, sval; - int sind; - char pname[50]; - { - map::iterator iter = parameters::str_par.find("inputpar"); - if (iter != parameters::str_par.end()) - strcpy(pname, (iter->second).c_str()); - else { cout << "Error inputpar" << endl; exit(0); } - } - ifstream inf(pname, ifstream::in); - if (!inf.good() && myrank == 0) - { cout << "Can not open parameter file " << pname << endl; MPI_Abort(MPI_COMM_WORLD, 1); } - for (int i = 1; inf.good(); i++) - { - inf.getline(pline, LEN); str = pline; - int status = misc::parse_parts(str, sgrp, skey, sval, sind); - if (status == -1) { cout << "error reading parameter file " << pname << " in line " << i << endl; MPI_Abort(MPI_COMM_WORLD, 1); } - else if (status == 0) continue; - if (sgrp == "ABE") { if (skey == "cpu part") cpu_part = atof(sval.c_str()); } - } - inf.close(); - parameters::dou_par.insert(map::value_type("cpu part", cpu_part)); - } - iter = parameters::dou_par.find("gpu part"); - if (iter != parameters::dou_par.end()) - { - gpu_part = iter->second; - } - else - { - int myrank; - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - const int LEN = 256; - char pline[LEN]; - string str, sgrp, skey, sval; - int sind; - char pname[50]; - { - map::iterator iter = parameters::str_par.find("inputpar"); - if (iter != parameters::str_par.end()) - strcpy(pname, (iter->second).c_str()); - else { cout << "Error inputpar" << endl; exit(0); } - } - ifstream inf(pname, ifstream::in); - if (!inf.good() && myrank == 0) - { cout << "Can not open parameter file " << pname << endl; MPI_Abort(MPI_COMM_WORLD, 1); } - for (int i = 1; inf.good(); i++) - { - inf.getline(pline, LEN); str = pline; - int status = misc::parse_parts(str, sgrp, skey, sval, sind); - if (status == -1) { cout << "error reading parameter file " << pname << " in line " << i << endl; MPI_Abort(MPI_COMM_WORLD, 1); } - else if (status == 0) continue; - if (sgrp == "ABE") { if (skey == "gpu part") gpu_part = atof(sval.c_str()); } - } - inf.close(); - parameters::dou_par.insert(map::value_type("gpu part", gpu_part)); - } - if (nodes == 0) nodes = cpusize / 2; -#else - if (nodes == 0) nodes = cpusize; -#endif - - if (dim != 3) - { - cout << "distrivute: now we only support 3-dimension" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - - MyList *BlL = 0; - int split_size, min_size, block_size = 0; - int min_width = 2 * Mymax(ghost_width, buffer_width); - int nxyz[dim], mmin_width[dim], min_shape[dim]; - - MyList *PLi = PatchLIST; - for (int i = 0; i < dim; i++) - min_shape[i] = PLi->data->shape[i]; - int lev = PLi->data->lev; - PLi = PLi->next; - while (PLi) - { - Patch *PP = PLi->data; - for (int i = 0; i < dim; i++) - min_shape[i] = Mymin(min_shape[i], PP->shape[i]); - if (lev != PLi->data->lev) - cout << "Parallel::distribute CAUSTION: meet Patches for different level: " << lev << " and " << PLi->data->lev << endl; - PLi = PLi->next; - } - - for (int i = 0; i < dim; i++) - mmin_width[i] = Mymin(min_width, min_shape[i]); - min_size = mmin_width[0]; - for (int i = 1; i < dim; i++) - min_size = min_size * mmin_width[i]; - - PLi = PatchLIST; - while (PLi) - { - Patch *PP = PLi->data; - int bs = PP->shape[0]; - for (int i = 1; i < dim; i++) - bs = bs * PP->shape[i]; - block_size = block_size + bs; - PLi = PLi->next; - } - split_size = Mymax(min_size, block_size / nodes); - split_size = Mymax(1, split_size); - - int n_rank = 0; - PLi = PatchLIST; - int reacpu = 0; - int current_block_id = 0; - while (PLi) { - Block *ng0, *ng; - bool first_block_in_patch = true; - Patch *PP = PLi->data; - reacpu += partition3(nxyz, split_size, mmin_width, nodes, PP->shape); - - for (int i = 0; i < nxyz[0]; i++) - for (int j = 0; j < nxyz[1]; j++) - for (int k = 0; k < nxyz[2]; k++) - { - int ibbox_here[6], shape_here[3]; - double bbox_here[6], dd; - Block *current_ng_start = nullptr; - - bool is_heavy = false; - int r_l = -1, r_r = -1; - if (cpusize == INTERP_LB_NPROCS) { - for (int si = 0; si < INTERP_LB_NUM_HEAVY; si++) { - if (current_block_id == interp_lb_splits[si][0]) { - is_heavy = true; - r_l = interp_lb_splits[si][1]; - r_r = interp_lb_splits[si][2]; - break; - } - } - } - - if (is_heavy) - { - int ib0 = (PP->shape[0] * i) / nxyz[0]; - int ib3 = (PP->shape[0] * (i + 1)) / nxyz[0] - 1; - int jb1 = (PP->shape[1] * j) / nxyz[1]; - int jb4 = (PP->shape[1] * (j + 1)) / nxyz[1] - 1; - int kb2 = (PP->shape[2] * k) / nxyz[2]; - int kb5 = (PP->shape[2] * (k + 1)) / nxyz[2] - 1; - - Block *split_first_block = nullptr; - Block *split_last_block = nullptr; - splitHotspotBlock(BlL, dim, ib0, ib3, jb1, jb4, kb2, kb5, - PP, r_l, r_r, ingfsi, fngfsi, periodic, - split_first_block, split_last_block); - - current_ng_start = split_first_block; - ng = split_last_block; - } - else - { - ibbox_here[0] = (PP->shape[0] * i) / nxyz[0]; - ibbox_here[3] = (PP->shape[0] * (i + 1)) / nxyz[0] - 1; - ibbox_here[1] = (PP->shape[1] * j) / nxyz[1]; - ibbox_here[4] = (PP->shape[1] * (j + 1)) / nxyz[1] - 1; - ibbox_here[2] = (PP->shape[2] * k) / nxyz[2]; - ibbox_here[5] = (PP->shape[2] * (k + 1)) / nxyz[2] - 1; - - if (periodic) { - for(int d=0; d<3; d++) { - ibbox_here[d] -= ghost_width; - ibbox_here[d+3] += ghost_width; - } - } else { - ibbox_here[0] = Mymax(0, ibbox_here[0] - ghost_width); - ibbox_here[3] = Mymin(PP->shape[0] - 1, ibbox_here[3] + ghost_width); - ibbox_here[1] = Mymax(0, ibbox_here[1] - ghost_width); - ibbox_here[4] = Mymin(PP->shape[1] - 1, ibbox_here[4] + ghost_width); - ibbox_here[2] = Mymax(0, ibbox_here[2] - ghost_width); - ibbox_here[5] = Mymin(PP->shape[2] - 1, ibbox_here[5] + ghost_width); - } - - for(int d=0; d<3; d++) shape_here[d] = ibbox_here[d+3] - ibbox_here[d] + 1; - -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1); - bbox_here[0] = PP->bbox[0] + ibbox_here[0] * dd; - bbox_here[3] = PP->bbox[0] + ibbox_here[3] * dd; - dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1); - bbox_here[1] = PP->bbox[1] + ibbox_here[1] * dd; - bbox_here[4] = PP->bbox[1] + ibbox_here[4] * dd; - dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1); - bbox_here[2] = PP->bbox[2] + ibbox_here[2] * dd; - bbox_here[5] = PP->bbox[2] + ibbox_here[5] * dd; -#else -#ifdef Cell - dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0]; - bbox_here[0] = PP->bbox[0] + (ibbox_here[0]) * dd; - bbox_here[3] = PP->bbox[0] + (ibbox_here[3] + 1) * dd; - dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1]; - bbox_here[1] = PP->bbox[1] + (ibbox_here[1]) * dd; - bbox_here[4] = PP->bbox[1] + (ibbox_here[4] + 1) * dd; - dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2]; - bbox_here[2] = PP->bbox[2] + (ibbox_here[2]) * dd; - bbox_here[5] = PP->bbox[2] + (ibbox_here[5] + 1) * dd; -#else -#error Not define Vertex nor Cell -#endif -#endif - ng = createMappedBlock(BlL, dim, shape_here, bbox_here, - current_block_id, ingfsi, fngfsi, PP->lev); - current_ng_start = ng; - } - - if (first_block_in_patch) { - ng0 = current_ng_start; - MyList *Bp_start = BlL; - while (Bp_start && Bp_start->data != ng0) Bp_start = Bp_start->next; - PP->blb = Bp_start; - first_block_in_patch = false; - } - - current_block_id++; - } - - { - MyList *Bp_end = BlL; - while (Bp_end && Bp_end->data != ng) Bp_end = Bp_end->next; - PP->ble = Bp_end; - } - - PLi = PLi->next; - } - if (reacpu < nodes * 2 / 3) - { - int myrank; - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - if (myrank == 0) - cout << "Parallel::distribute CAUSTION: level#" << lev << " uses essencially " << reacpu << " processors vs " << nodes << " nodes run, your scientific computation scale is not as large as you estimate." << endl; - } - - return BlL; -} - -Block* Parallel::splitHotspotBlock(MyList* &BlL, int _dim, - int ib0_orig, int ib3_orig, - int jb1_orig, int jb4_orig, - int kb2_orig, int kb5_orig, - Patch* PP, int r_left, int r_right, - int ingfsi, int fngfsi, bool periodic, - Block* &split_first_block, Block* &split_last_block) -{ - int mid = (ib0_orig + ib3_orig) / 2; - - int indices_L[6] = {ib0_orig, jb1_orig, kb2_orig, mid, jb4_orig, kb5_orig}; - int indices_R[6] = {mid + 1, jb1_orig, kb2_orig, ib3_orig, jb4_orig, kb5_orig}; - - auto createSubBlock = [&](int* ib_raw, int target_rank) { - int ib_final[6]; - int sh_here[3]; - double bb_here[6], dd; - - if (periodic) { - ib_final[0] = ib_raw[0] - ghost_width; - ib_final[3] = ib_raw[3] + ghost_width; - ib_final[1] = ib_raw[1] - ghost_width; - ib_final[4] = ib_raw[4] + ghost_width; - ib_final[2] = ib_raw[2] - ghost_width; - ib_final[5] = ib_raw[5] + ghost_width; - } else { - ib_final[0] = Mymax(0, ib_raw[0] - ghost_width); - ib_final[3] = Mymin(PP->shape[0] - 1, ib_raw[3] + ghost_width); - ib_final[1] = Mymax(0, ib_raw[1] - ghost_width); - ib_final[4] = Mymin(PP->shape[1] - 1, ib_raw[4] + ghost_width); - ib_final[2] = Mymax(0, ib_raw[2] - ghost_width); - ib_final[5] = Mymin(PP->shape[2] - 1, ib_raw[5] + ghost_width); - } - - sh_here[0] = ib_final[3] - ib_final[0] + 1; - sh_here[1] = ib_final[4] - ib_final[1] + 1; - sh_here[2] = ib_final[5] - ib_final[2] + 1; - -#ifdef Vertex - dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1); - bb_here[0] = PP->bbox[0] + ib_final[0] * dd; - bb_here[3] = PP->bbox[0] + ib_final[3] * dd; - dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1); - bb_here[1] = PP->bbox[1] + ib_final[1] * dd; - bb_here[4] = PP->bbox[1] + ib_final[4] * dd; - dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1); - bb_here[2] = PP->bbox[2] + ib_final[2] * dd; - bb_here[5] = PP->bbox[2] + ib_final[5] * dd; -#else -#ifdef Cell - dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0]; - bb_here[0] = PP->bbox[0] + ib_final[0] * dd; - bb_here[3] = PP->bbox[0] + (ib_final[3] + 1) * dd; - dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1]; - bb_here[1] = PP->bbox[1] + ib_final[1] * dd; - bb_here[4] = PP->bbox[1] + (ib_final[4] + 1) * dd; - dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2]; - bb_here[2] = PP->bbox[2] + ib_final[2] * dd; - bb_here[5] = PP->bbox[2] + (ib_final[5] + 1) * dd; -#endif -#endif - - Block* Bg = new Block(dim, sh_here, bb_here, target_rank, ingfsi, fngfsi, PP->lev); - if (BlL) BlL->insert(Bg); - else BlL = new MyList(Bg); - - return Bg; - }; - - split_first_block = createSubBlock(indices_L, r_left); - split_last_block = createSubBlock(indices_R, r_right); - return split_last_block; -} - -Block* Parallel::createMappedBlock(MyList* &BlL, int _dim, int* shape, double* bbox, - int block_id, int ingfsi, int fngfsi, int lev) -{ - int target_rank = block_id; - if (INTERP_LB_NPROCS > 0) { - for (int ri = 0; ri < interp_lb_num_remaps; ri++) { - if (block_id == interp_lb_remaps[ri][0]) { - target_rank = interp_lb_remaps[ri][1]; - break; - } - } - } - - Block* ng = new Block(dim, shape, bbox, target_rank, ingfsi, fngfsi, lev); - if (BlL) BlL->insert(ng); - else BlL = new MyList(ng); - - return ng; -} -#else -// When INTERP_LB_OPTIMIZE is not defined, distribute_optimize falls back to distribute -MyList *Parallel::distribute_optimize(MyList *PatchLIST, int cpusize, int ingfsi, int fngfsi, - bool periodic, int nodes) -{ - return distribute(PatchLIST, cpusize, ingfsi, fngfsi, periodic, nodes); -} -Block* Parallel::splitHotspotBlock(MyList* &BlL, int _dim, - int ib0_orig, int ib3_orig, - int jb1_orig, int jb4_orig, - int kb2_orig, int kb5_orig, - Patch* PP, int r_left, int r_right, - int ingfsi, int fngfsi, bool periodic, - Block* &split_first_block, Block* &split_last_block) -{ return nullptr; } -Block* Parallel::createMappedBlock(MyList* &BlL, int _dim, int* shape, double* bbox, - int block_id, int ingfsi, int fngfsi, int lev) -{ return nullptr; } -#endif - -#elif (PSTR == 1 || PSTR == 2 || PSTR == 3) -MyList *Parallel::distribute(MyList *PatchLIST, int cpusize, int ingfsi, int fngfsi, - bool periodic, int start_rank, int end_rank, int nodes) -{ -#ifdef USE_GPU_DIVIDE - double cpu_part, gpu_part; - map::iterator iter; - iter = parameters::dou_par.find("cpu part"); - if (iter != parameters::dou_par.end()) - { - cpu_part = iter->second; - } - else - { - int myrank; - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - // read parameter from file - const int LEN = 256; - char pline[LEN]; - string str, sgrp, skey, sval; - int sind; - char pname[50]; - { - map::iterator iter = parameters::str_par.find("inputpar"); - if (iter != parameters::str_par.end()) - { - strcpy(pname, (iter->second).c_str()); - } - else - { - cout << "Error inputpar" << endl; - exit(0); - } - } - ifstream inf(pname, ifstream::in); - if (!inf.good() && myrank == 0) - { - cout << "Can not open parameter file " << pname << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - - for (int i = 1; inf.good(); i++) - { - inf.getline(pline, LEN); - str = pline; - - int status = misc::parse_parts(str, sgrp, skey, sval, sind); - if (status == -1) - { - cout << "error reading parameter file " << pname << " in line " << i << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - else if (status == 0) - continue; - - if (sgrp == "ABE") - { - if (skey == "cpu part") - cpu_part = atof(sval.c_str()); - } - } - inf.close(); - - parameters::dou_par.insert(map::value_type("cpu part", cpu_part)); - } - iter = parameters::dou_par.find("gpu part"); - if (iter != parameters::dou_par.end()) - { - gpu_part = iter->second; - } - else - { - int myrank; - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - // read parameter from file - const int LEN = 256; - char pline[LEN]; - string str, sgrp, skey, sval; - int sind; - char pname[50]; - { - map::iterator iter = parameters::str_par.find("inputpar"); - if (iter != parameters::str_par.end()) - { - strcpy(pname, (iter->second).c_str()); - } - else - { - cout << "Error inputpar" << endl; - exit(0); - } - } - ifstream inf(pname, ifstream::in); - if (!inf.good() && myrank == 0) - { - cout << "Can not open parameter file " << pname << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - - for (int i = 1; inf.good(); i++) - { - inf.getline(pline, LEN); - str = pline; - - int status = misc::parse_parts(str, sgrp, skey, sval, sind); - if (status == -1) - { - cout << "error reading parameter file " << pname << " in line " << i << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - else if (status == 0) - continue; - - if (sgrp == "ABE") - { - if (skey == "gpu part") - gpu_part = atof(sval.c_str()); - } - } - inf.close(); - - parameters::dou_par.insert(map::value_type("gpu part", gpu_part)); - } - - if (nodes == 0) - nodes = cpusize / 2; -#else - if (nodes == 0) - nodes = cpusize; -#endif - - if (dim != 3) - { - cout << "distrivute: now we only support 3-dimension" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - - MyList *BlL = 0; - - int split_size, min_size, block_size = 0; - - int min_width = 2 * Mymax(ghost_width, buffer_width); - int nxyz[dim], mmin_width[dim], min_shape[dim]; - - MyList *PLi = PatchLIST; - for (int i = 0; i < dim; i++) - min_shape[i] = PLi->data->shape[i]; - int lev = PLi->data->lev; - PLi = PLi->next; - while (PLi) - { - Patch *PP = PLi->data; - for (int i = 0; i < dim; i++) - min_shape[i] = Mymin(min_shape[i], PP->shape[i]); - if (lev != PLi->data->lev) - cout << "Parallel::distribute CAUSTION: meet Patches for different level: " << lev << " and " << PLi->data->lev << endl; - PLi = PLi->next; - } - - for (int i = 0; i < dim; i++) - mmin_width[i] = Mymin(min_width, min_shape[i]); - - min_size = mmin_width[0]; - for (int i = 1; i < dim; i++) - min_size = min_size * mmin_width[i]; - - PLi = PatchLIST; - while (PLi) - { - Patch *PP = PLi->data; - // PP->checkPatch(true); - int bs = PP->shape[0]; - for (int i = 1; i < dim; i++) - bs = bs * PP->shape[i]; - block_size = block_size + bs; - PLi = PLi->next; - } - split_size = Mymax(min_size, block_size / cpusize); - split_size = Mymax(1, split_size); - - int n_rank = start_rank; - PLi = PatchLIST; - int reacpu = 0; - while (PLi) - { - Patch *PP = PLi->data; - - reacpu += partition3(nxyz, split_size, mmin_width, cpusize, PP->shape); - - Block *ng, *ng0; - int shape_here[dim], ibbox_here[2 * dim]; - double bbox_here[2 * dim], dd; - - // ibbox : 0,...N-1 - for (int i = 0; i < nxyz[0]; i++) - for (int j = 0; j < nxyz[1]; j++) - for (int k = 0; k < nxyz[2]; k++) - { - ibbox_here[0] = (PP->shape[0] * i) / nxyz[0]; - ibbox_here[3] = (PP->shape[0] * (i + 1)) / nxyz[0] - 1; - ibbox_here[1] = (PP->shape[1] * j) / nxyz[1]; - ibbox_here[4] = (PP->shape[1] * (j + 1)) / nxyz[1] - 1; - ibbox_here[2] = (PP->shape[2] * k) / nxyz[2]; - ibbox_here[5] = (PP->shape[2] * (k + 1)) / nxyz[2] - 1; - - if (periodic) - { - ibbox_here[0] = ibbox_here[0] - ghost_width; - ibbox_here[3] = ibbox_here[3] + ghost_width; - ibbox_here[1] = ibbox_here[1] - ghost_width; - ibbox_here[4] = ibbox_here[4] + ghost_width; - ibbox_here[2] = ibbox_here[2] - ghost_width; - ibbox_here[5] = ibbox_here[5] + ghost_width; - } - else - { - ibbox_here[0] = Mymax(0, ibbox_here[0] - ghost_width); - ibbox_here[3] = Mymin(PP->shape[0] - 1, ibbox_here[3] + ghost_width); - ibbox_here[1] = Mymax(0, ibbox_here[1] - ghost_width); - ibbox_here[4] = Mymin(PP->shape[1] - 1, ibbox_here[4] + ghost_width); - ibbox_here[2] = Mymax(0, ibbox_here[2] - ghost_width); - ibbox_here[5] = Mymin(PP->shape[2] - 1, ibbox_here[5] + ghost_width); - } - - shape_here[0] = ibbox_here[3] - ibbox_here[0] + 1; - shape_here[1] = ibbox_here[4] - ibbox_here[1] + 1; - shape_here[2] = ibbox_here[5] - ibbox_here[2] + 1; -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - // 0--4, 5--10 - dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1); - bbox_here[0] = PP->bbox[0] + ibbox_here[0] * dd; - bbox_here[3] = PP->bbox[0] + ibbox_here[3] * dd; - - dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1); - bbox_here[1] = PP->bbox[1] + ibbox_here[1] * dd; - bbox_here[4] = PP->bbox[1] + ibbox_here[4] * dd; - - dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1); - bbox_here[2] = PP->bbox[2] + ibbox_here[2] * dd; - bbox_here[5] = PP->bbox[2] + ibbox_here[5] * dd; -#else -#ifdef Cell - // 0--5, 5--10 - dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0]; - bbox_here[0] = PP->bbox[0] + (ibbox_here[0]) * dd; - bbox_here[3] = PP->bbox[0] + (ibbox_here[3] + 1) * dd; - - dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1]; - bbox_here[1] = PP->bbox[1] + (ibbox_here[1]) * dd; - bbox_here[4] = PP->bbox[1] + (ibbox_here[4] + 1) * dd; - - dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2]; - bbox_here[2] = PP->bbox[2] + (ibbox_here[2]) * dd; - bbox_here[5] = PP->bbox[2] + (ibbox_here[5] + 1) * dd; -#else -#error Not define Vertex nor Cell -#endif -#endif - -#ifdef USE_GPU_DIVIDE - { - const int pices = 2; - double picef[pices]; - picef[0] = cpu_part; - picef[1] = gpu_part; - int shape_res[dim * pices]; - double bbox_res[2 * dim * pices]; - misc::dividBlock(dim, shape_here, bbox_here, pices, picef, shape_res, bbox_res, min_width); - ng = ng0 = new Block(dim, shape_res, bbox_res, n_rank++, ingfsi, fngfsi, PP->lev, 0); // delete through KillBlocks - // ng->checkBlock(); - if (BlL) - BlL->insert(ng); - else - BlL = new MyList(ng); // delete through KillBlocks - - for (int i = 1; i < pices; i++) - { - ng = new Block(dim, shape_res + i * dim, bbox_res + i * 2 * dim, n_rank++, ingfsi, fngfsi, PP->lev, i); // delete through KillBlocks - // ng->checkBlock(); - BlL->insert(ng); - } - } -#else - ng = ng0 = new Block(dim, shape_here, bbox_here, n_rank++, ingfsi, fngfsi, PP->lev); // delete through KillBlocks - // ng->checkBlock(); - if (BlL) - BlL->insert(ng); - else - BlL = new MyList(ng); // delete through KillBlocks -#endif - - if (n_rank == end_rank + 1) - n_rank = start_rank; - - // set PP->blb - if (i == 0 && j == 0 && k == 0) - { - MyList *Bp = BlL; - while (Bp->data != ng0) - Bp = Bp->next; // ng0 is the first of the pices list - PP->blb = Bp; - } - } - // set PP->ble - { - MyList *Bp = BlL; - while (Bp->data != ng) - Bp = Bp->next; // ng is the last of the pices list - PP->ble = Bp; - } - PLi = PLi->next; - } - if (reacpu < nodes * 2 / 3) - { - int myrank; - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - if (myrank == start_rank) - cout << "Parallel::distribute CAUSTION: level#" << lev << " uses essencially " << reacpu << " processors vs " << nodes << " nodes run, your scientific computation scale is not as large as you estimate." << endl; - } - - return BlL; -} -#endif -void Parallel::setfunction(MyList *BlL, var *vn, double func(double x, double y, double z)) -{ - while (BlL) - { - if (BlL->data->X[0]) - { - int nn = BlL->data->shape[0] * BlL->data->shape[1] * BlL->data->shape[2]; - double *p = BlL->data->fgfs[vn->sgfn]; - for (int i = 0; i < nn; i++) - { - int ind[3]; - getarrayindex(3, BlL->data->shape, ind, i); - p[i] = func(BlL->data->X[0][ind[0]], BlL->data->X[1][ind[1]], BlL->data->X[2][ind[2]]); - } - } - BlL = BlL->next; - } -} -// set function only for cpu rank -void Parallel::setfunction(int rank, MyList *BlL, var *vn, double func(double x, double y, double z)) -{ - while (BlL) - { - if (BlL->data->X[0] && BlL->data->rank == rank) - { - int nn = BlL->data->shape[0] * BlL->data->shape[1] * BlL->data->shape[2]; - double *p = BlL->data->fgfs[vn->sgfn]; - for (int i = 0; i < nn; i++) - { - int ind[3]; - getarrayindex(3, BlL->data->shape, ind, i); - p[i] = func(BlL->data->X[0][ind[0]], BlL->data->X[1][ind[1]], BlL->data->X[2][ind[2]]); - } - } - BlL = BlL->next; - } -} -void Parallel::getarrayindex(int DIM, int *shape, int *index, int n) -{ - // we assume index has already memory space - int *mu; - mu = new int[DIM]; - mu[0] = 1; - for (int i = 1; i < DIM; i++) - mu[i] = mu[i - 1] * shape[i - 1]; - for (int i = DIM - 1; i >= 0; i--) - { - index[i] = n / mu[i]; - n = n - index[i] * mu[i]; - } - - delete[] mu; -} -int Parallel::getarraylocation(int DIM, int *shape, int *index) -{ - int n, mu; - mu = shape[0]; - n = index[0]; - for (int i = 1; i < DIM; i++) - { - n = n + index[i] * mu; - mu = mu * shape[i]; - } - - return n; -} -void Parallel::copy(int DIM, double *llbout, double *uubout, int *Dshape, double *DD, double *llbin, double *uubin, - int *shape, double *datain, double *llb, double *uub) -{ - // for 3 dimensional case, based on simple test, I found this is half slower than f90 code - int *illi, *iuui; - int *illo, *iuuo; - int *indi, *indo; - illi = new int[DIM]; - iuui = new int[DIM]; - illo = new int[DIM]; - iuuo = new int[DIM]; - indi = new int[DIM]; - indo = new int[DIM]; - - int ial = 1; - for (int i = 0; i < DIM; i++) - { - double ho, hi; -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - ho = (uubout[i] - llbout[i]) / (Dshape[i] - 1); - hi = (uubin[i] - llbin[i]) / (shape[i] - 1); -#else -#ifdef Cell - ho = (uubout[i] - llbout[i]) / Dshape[i]; - hi = (uubin[i] - llbin[i]) / shape[i]; -#else -#error Not define Vertex nor Cell -#endif -#endif - illo[i] = int((llb[i] - llbout[i]) / ho); - iuuo[i] = Dshape[i] - 1 - int((uubout[i] - uub[i]) / ho); - illi[i] = int((llb[i] - llbin[i]) / hi); - iuui[i] = shape[i] - 1 - int((uubin[i] - uub[i]) / hi); - - if (illo[i] > iuuo[i] || illi[i] > iuui[i] || illo[i] < 0 || illi[i] < 0 || - iuui[i] >= shape[i] || iuuo[i] >= Dshape[i]) - { - cout << "Parallel copy: in direction " << i << ":" << endl; - cout << "llb = " << llb[i] << ", uub = " << uub[i] << endl; - cout << " in data : il = " << illi[i] << ", iu = " << iuui[i] << endl; - cout << "bbox = (" << llbin[i] << "," << uubin[i] << ")" << endl; - cout << "shape = " << shape[i] << endl; - cout << "out data : il = " << illo[i] << ", iu = " << iuuo[i] << endl; - cout << "bbox = (" << llbout[i] << "," << uubout[i] << ")" << endl; - cout << "shape = " << Dshape[i] << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - int ihi = iuui[i] - illi[i] + 1, iho = iuuo[i] - illo[i] + 1; - if (!(feq(ho, hi, ho / 2)) || ihi != iho) - { - cout << "Parallel copy: in direction " << i << ":" << endl; - cout << "Parallel copy: not the same grid structure." << endl; - cout << "hi = " << hi << ", bbox = (" << llbin[i] << "," << uubin[i] << "), shape = " << shape[i] << endl; - cout << "ho = " << ho << ", bbox = (" << llbout[i] << "," << uubout[i] << "), shape = " << Dshape[i] << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - ial = ial * ihi; - } - - for (int i = 0; i < DIM; i++) - { - indi[i] = illi[i]; - indo[i] = illo[i]; - } - /* - //check start index - for(int i=0;i NNi) - { - cout << "Parallel copy: ni = " << ni << " is out of array range (0," << NNi << ")." << endl; - cout << "shape = ("; - for (int j = 0; j < DIM; j++) - { - cout << shape[j]; - if (j < DIM - 1) - cout << ","; - else - cout << ")" << endl; - } - cout << "ind = ("; - for (int j = 0; j < DIM; j++) - { - cout << indi[j]; - if (j < DIM - 1) - cout << ","; - else - cout << ")" << endl; - } - MPI_Abort(MPI_COMM_WORLD, 1); - } - DD[no] = datain[ni]; - - indi[0]++; - for (int j = 1; j < DIM; j++) - { - if (indi[j - 1] == iuui[j - 1] + 1) - { - indi[j - 1] = illi[j - 1]; - indi[j]++; - } // carry 1 to next digital - else - break; - } - indo[0]++; - for (int j = 1; j < DIM; j++) - { - if (indo[j - 1] == iuuo[j - 1] + 1) - { - indo[j - 1] = illo[j - 1]; - indo[j]++; - } - else - break; - } - } - /* - //check final index - for(int i=0;i *BlL, MyList *DumpList, char *tag, double time, double dT) -{ - int myrank; - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - // round at 4 and 5 - int ncount = int(time / dT + 0.5); - - MyList *Bp; - while (DumpList) - { - Bp = BlL; - int Bi = 0; - while (Bp) - { - Block *BP = Bp->data; - var *VP = DumpList->data; - if (BP->rank == myrank) - { - - string out_dir; - map::iterator iter; - iter = parameters::str_par.find("output dir"); - if (iter != parameters::str_par.end()) - { - out_dir = iter->second; - } - else - { - // read parameter from file - const int LEN = 256; - char pline[LEN]; - string str, sgrp, skey, sval; - int sind; - char pname[50]; - { - map::iterator iter = parameters::str_par.find("inputpar"); - if (iter != parameters::str_par.end()) - { - strcpy(pname, (iter->second).c_str()); - } - else - { - cout << "Error inputpar" << endl; - exit(0); - } - } - ifstream inf(pname, ifstream::in); - if (!inf.good()) - { - cout << "Can not open parameter file " << pname << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - - for (int i = 1; inf.good(); i++) - { - inf.getline(pline, LEN); - str = pline; - - int status = misc::parse_parts(str, sgrp, skey, sval, sind); - if (status == -1) - { - cout << "error reading parameter file " << pname << " in line " << i << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - else if (status == 0) - continue; - - if (sgrp == "ABE") - { - if (skey == "output dir") - out_dir = sval; - } - } - inf.close(); - - parameters::str_par.insert(map::value_type("output dir", out_dir)); - } - - char filename[100]; - if (tag) - sprintf(filename, "%s/%s_Lev%02d-%02d_%02d_%s_%05d.bin", out_dir.c_str(), tag, BP->lev, Bi, myrank, VP->name, ncount); - else - sprintf(filename, "%s/Lev%02d-%02d_%02d_%s_%05d.bin", out_dir.c_str(), BP->lev, Bi, myrank, VP->name, ncount); - writefile(time, BP->shape[0], BP->shape[1], BP->shape[2], BP->bbox[0], BP->bbox[3], BP->bbox[1], BP->bbox[4], - BP->bbox[2], BP->bbox[5], filename, BP->fgfs[VP->sgfn]); - cout << "end of dump " << VP->name << " at time " << time << ", on node " << myrank << endl; - } - Bp = Bp->next; - Bi++; - } - DumpList = DumpList->next; - } -} -// Now we dump the data including buffer points -void Parallel::Dump_Data(Patch *PP, MyList *DumpList, char *tag, double time, double dT, int grd) -{ - int myrank; - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - // round at 4 and 5 - int ncount = int(time / dT + 0.5); - - MPI_Status sta; - int DIM = 3; - double llb[3], uub[3]; - double DX, DY, DZ; - - double *databuffer = 0; - if (myrank == 0) - { - databuffer = (double *)malloc(sizeof(double) * PP->shape[0] * PP->shape[1] * PP->shape[2]); - if (!databuffer) - { - cout << "Parallel::Dump_Data: out of memory when dumping data." << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - } - - while (DumpList) - { - var *VP = DumpList->data; - - MyList *Bp = PP->blb; - while (Bp) - { - Block *BP = Bp->data; - if (BP->rank == 0 && myrank == 0) - { - DX = BP->getdX(0); - DY = BP->getdX(1); - DZ = BP->getdX(2); - llb[0] = (feq(BP->bbox[0], PP->bbox[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX; - llb[1] = (feq(BP->bbox[1], PP->bbox[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY; - llb[2] = (feq(BP->bbox[2], PP->bbox[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ; - uub[0] = (feq(BP->bbox[3], PP->bbox[3], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX; - uub[1] = (feq(BP->bbox[4], PP->bbox[4], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY; - uub[2] = (feq(BP->bbox[5], PP->bbox[5], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ; - f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, BP->fgfs[VP->sgfn], llb, uub); - } - else - { - int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]); - if (myrank == 0) - { - double *bufferhere = (double *)malloc(sizeof(double) * nnn); - if (!bufferhere) - { - cout << "on node#" << myrank << ", out of memory when dumping data." << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - MPI_Recv(bufferhere, nnn, MPI_DOUBLE, BP->rank, 0, MPI_COMM_WORLD, &sta); - DX = BP->getdX(0); - DY = BP->getdX(1); - DZ = BP->getdX(2); - llb[0] = (feq(BP->bbox[0], PP->bbox[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX; - llb[1] = (feq(BP->bbox[1], PP->bbox[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY; - llb[2] = (feq(BP->bbox[2], PP->bbox[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ; - uub[0] = (feq(BP->bbox[3], PP->bbox[3], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX; - uub[1] = (feq(BP->bbox[4], PP->bbox[4], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY; - uub[2] = (feq(BP->bbox[5], PP->bbox[5], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ; - f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, bufferhere, llb, uub); - free(bufferhere); - } - else if (myrank == BP->rank) - { - MPI_Send(BP->fgfs[VP->sgfn], nnn, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD); - } - } - if (Bp == PP->ble) - break; - Bp = Bp->next; - } - if (myrank == 0) - { - - string out_dir; - map::iterator iter; - iter = parameters::str_par.find("output dir"); - if (iter != parameters::str_par.end()) - { - out_dir = iter->second; - } - else - { - // read parameter from file - const int LEN = 256; - char pline[LEN]; - string str, sgrp, skey, sval; - int sind; - char pname[50]; - { - map::iterator iter = parameters::str_par.find("inputpar"); - if (iter != parameters::str_par.end()) - { - strcpy(pname, (iter->second).c_str()); - } - else - { - cout << "Error inputpar" << endl; - exit(0); - } - } - ifstream inf(pname, ifstream::in); - if (!inf.good()) - { - cout << "Can not open parameter file " << pname << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - - for (int i = 1; inf.good(); i++) - { - inf.getline(pline, LEN); - str = pline; - - int status = misc::parse_parts(str, sgrp, skey, sval, sind); - if (status == -1) - { - cout << "error reading parameter file " << pname << " in line " << i << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - else if (status == 0) - continue; - - if (sgrp == "ABE") - { - if (skey == "output dir") - out_dir = sval; - } - } - inf.close(); - - parameters::str_par.insert(map::value_type("output dir", out_dir)); - } - - char filename[100]; - if (tag) - sprintf(filename, "%s/%s_Lev%02d-%02d_%s_%05d.bin", out_dir.c_str(), tag, PP->lev, grd, VP->name, ncount); - else - sprintf(filename, "%s/Lev%02d-%02d_%s_%05d.bin", out_dir.c_str(), PP->lev, grd, VP->name, ncount); - - writefile(time, PP->shape[0], PP->shape[1], PP->shape[2], PP->bbox[0], PP->bbox[3], PP->bbox[1], PP->bbox[4], - PP->bbox[2], PP->bbox[5], filename, databuffer); - } - DumpList = DumpList->next; - } - - if (myrank == 0) - free(databuffer); -} -void Parallel::Dump_Data(MyList *PL, MyList *DumpList, char *tag, double time, double dT) -{ - MyList *Pp; - Pp = PL; - int grd = 0; - while (Pp) - { - Patch *PP = Pp->data; - Dump_Data(PP, DumpList, tag, time, dT, grd); - grd++; - Pp = Pp->next; - } -} -// collect the data including buffer points -double *Parallel::Collect_Data(Patch *PP, var *VP) -{ - int myrank; - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - - MPI_Status sta; - int DIM = 3; - double llb[3], uub[3]; - double DX, DY, DZ; - - double *databuffer = 0; - if (myrank == 0) - { - databuffer = (double *)malloc(sizeof(double) * PP->shape[0] * PP->shape[1] * PP->shape[2]); - if (!databuffer) - { - cout << "Parallel::Collect_Data: out of memory when dumping data." << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - } - - MyList *Bp = PP->blb; - while (Bp) - { - Block *BP = Bp->data; - if (BP->rank == 0 && myrank == 0) - { - DX = BP->getdX(0); - DY = BP->getdX(1); - DZ = BP->getdX(2); - llb[0] = (feq(BP->bbox[0], PP->bbox[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX; - llb[1] = (feq(BP->bbox[1], PP->bbox[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY; - llb[2] = (feq(BP->bbox[2], PP->bbox[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ; - uub[0] = (feq(BP->bbox[3], PP->bbox[3], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX; - uub[1] = (feq(BP->bbox[4], PP->bbox[4], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY; - uub[2] = (feq(BP->bbox[5], PP->bbox[5], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ; - f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, BP->fgfs[VP->sgfn], llb, uub); - } - else - { - int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]); - if (myrank == 0) - { - double *bufferhere = (double *)malloc(sizeof(double) * nnn); - if (!bufferhere) - { - cout << "on node#" << myrank << ", out of memory when dumping data." << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - MPI_Recv(bufferhere, nnn, MPI_DOUBLE, BP->rank, 0, MPI_COMM_WORLD, &sta); - DX = BP->getdX(0); - DY = BP->getdX(1); - DZ = BP->getdX(2); - llb[0] = (feq(BP->bbox[0], PP->bbox[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX; - llb[1] = (feq(BP->bbox[1], PP->bbox[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY; - llb[2] = (feq(BP->bbox[2], PP->bbox[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ; - uub[0] = (feq(BP->bbox[3], PP->bbox[3], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX; - uub[1] = (feq(BP->bbox[4], PP->bbox[4], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY; - uub[2] = (feq(BP->bbox[5], PP->bbox[5], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ; - f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, bufferhere, llb, uub); - free(bufferhere); - } - else if (myrank == BP->rank) - { - MPI_Send(BP->fgfs[VP->sgfn], nnn, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD); - } - } - if (Bp == PP->ble) - break; - Bp = Bp->next; - } - - return databuffer; -} -// Now we dump the data including buffer points -// dump z = 0 plane -void Parallel::d2Dump_Data(Patch *PP, MyList *DumpList, char *tag, double time, double dT, int grd) -{ - int myrank; - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - // round at 4 and 5 - int ncount = int(time / dT + 0.5); - - MPI_Status sta; - int DIM = 3; - double llb[3], uub[3]; - double DX, DY, DZ; - - double *databuffer = 0, *databuffer2 = 0; - if (myrank == 0) - { - databuffer = (double *)malloc(sizeof(double) * PP->shape[0] * PP->shape[1] * PP->shape[2]); - databuffer2 = (double *)malloc(sizeof(double) * PP->shape[0] * PP->shape[1]); - if (!databuffer || !databuffer2) - { - cout << "Parallel::d2Dump_Data: out of memory when dumping data." << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - } - - while (DumpList) - { - var *VP = DumpList->data; - - MyList *Bp = PP->blb; - while (Bp) - { - Block *BP = Bp->data; - if (BP->rank == 0 && myrank == 0) - { - DX = BP->getdX(0); - DY = BP->getdX(1); - DZ = BP->getdX(2); - llb[0] = (feq(BP->bbox[0], PP->bbox[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX; - llb[1] = (feq(BP->bbox[1], PP->bbox[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY; - llb[2] = (feq(BP->bbox[2], PP->bbox[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ; - uub[0] = (feq(BP->bbox[3], PP->bbox[3], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX; - uub[1] = (feq(BP->bbox[4], PP->bbox[4], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY; - uub[2] = (feq(BP->bbox[5], PP->bbox[5], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ; - f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, BP->fgfs[VP->sgfn], llb, uub); - } - else - { - int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]); - if (myrank == 0) - { - double *bufferhere = (double *)malloc(sizeof(double) * nnn); - if (!bufferhere) - { - cout << "on node#" << myrank << ", out of memory when dumping data." << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - MPI_Recv(bufferhere, nnn, MPI_DOUBLE, BP->rank, 0, MPI_COMM_WORLD, &sta); - DX = BP->getdX(0); - DY = BP->getdX(1); - DZ = BP->getdX(2); - llb[0] = (feq(BP->bbox[0], PP->bbox[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX; - llb[1] = (feq(BP->bbox[1], PP->bbox[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY; - llb[2] = (feq(BP->bbox[2], PP->bbox[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ; - uub[0] = (feq(BP->bbox[3], PP->bbox[3], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX; - uub[1] = (feq(BP->bbox[4], PP->bbox[4], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY; - uub[2] = (feq(BP->bbox[5], PP->bbox[5], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ; - f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, bufferhere, llb, uub); - free(bufferhere); - } - else if (myrank == BP->rank) - { - MPI_Send(BP->fgfs[VP->sgfn], nnn, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD); - } - } - if (Bp == PP->ble) - break; - Bp = Bp->next; - } - if (myrank == 0) - { - - string out_dir; - map::iterator iter; - iter = parameters::str_par.find("output dir"); - if (iter != parameters::str_par.end()) - { - out_dir = iter->second; - } - else - { - // read parameter from file - const int LEN = 256; - char pline[LEN]; - string str, sgrp, skey, sval; - int sind; - char pname[50]; - { - map::iterator iter = parameters::str_par.find("inputpar"); - if (iter != parameters::str_par.end()) - { - strcpy(pname, (iter->second).c_str()); - } - else - { - cout << "Error inputpar" << endl; - exit(0); - } - } - ifstream inf(pname, ifstream::in); - if (!inf.good()) - { - cout << "Can not open parameter file " << pname << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - - for (int i = 1; inf.good(); i++) - { - inf.getline(pline, LEN); - str = pline; - - int status = misc::parse_parts(str, sgrp, skey, sval, sind); - if (status == -1) - { - cout << "error reading parameter file " << pname << " in line " << i << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - else if (status == 0) - continue; - - if (sgrp == "ABE") - { - if (skey == "output dir") - out_dir = sval; - } - } - inf.close(); - - parameters::str_par.insert(map::value_type("output dir", out_dir)); - } - - char filename[100]; - if (tag) - sprintf(filename, "%s/%s_2d_Lev%02d-%02d_%s_%05d.dat", out_dir.c_str(), tag, PP->lev, grd, VP->name, ncount); - else - sprintf(filename, "%s/2d_Lev%02d-%02d_%s_%05d.dat", out_dir.c_str(), PP->lev, grd, VP->name, ncount); - - int gord = ghost_width; - f_d2dump(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, databuffer2, gord, VP->SoA); - writefile(time, PP->shape[0], PP->shape[1], PP->bbox[0], PP->bbox[3], PP->bbox[1], PP->bbox[4], - filename, databuffer2); - } - DumpList = DumpList->next; - } - - if (myrank == 0) - { - free(databuffer); - free(databuffer2); - } -} -void Parallel::d2Dump_Data(MyList *PL, MyList *DumpList, char *tag, double time, double dT) -{ - MyList *Pp; - Pp = PL; - int grd = 0; - while (Pp) - { - Patch *PP = Pp->data; - d2Dump_Data(PP, DumpList, tag, time, dT, grd); - grd++; - Pp = Pp->next; - } -} -// Now we dump the data including buffer points and ghost points of the given patch -void Parallel::Dump_Data0(Patch *PP, MyList *DumpList, char *tag, double time, double dT) -{ - int myrank; - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - // round at 4 and 5 - int ncount = int(time / dT + 0.5); - - MPI_Status sta; - int DIM = 3; - double llb[3], uub[3], tllb[3], tuub[3]; - int tshape[3]; - double DX, DY, DZ; - - for (int i = 0; i < 3; i++) - { - double DX = PP->blb->data->getdX(i); - tshape[i] = PP->shape[i] + 2 * ghost_width; - tllb[i] = PP->bbox[i] - ghost_width * DX; - tuub[i] = PP->bbox[i + dim] + ghost_width * DX; - } - - int NN = tshape[0] * tshape[1] * tshape[2]; - double *databuffer = 0; - if (myrank == 0) - { - databuffer = (double *)malloc(sizeof(double) * NN); - if (!databuffer) - { - cout << "on node# " << myrank << ", out of memory when dumping data." << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - } - - while (DumpList) - { - var *VP = DumpList->data; - MyList *Bp = PP->blb; - while (Bp) - { - Block *BP = Bp->data; - if (BP->rank == 0 && myrank == 0) - { - DX = BP->getdX(0); - DY = BP->getdX(1); - DZ = BP->getdX(2); - llb[0] = (feq(BP->bbox[0], tllb[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX; - llb[1] = (feq(BP->bbox[1], tllb[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY; - llb[2] = (feq(BP->bbox[2], tllb[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ; - uub[0] = (feq(BP->bbox[3], tuub[0], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX; - uub[1] = (feq(BP->bbox[4], tuub[1], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY; - uub[2] = (feq(BP->bbox[5], tuub[2], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ; - f_copy(DIM, tllb, tuub, tshape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, BP->fgfs[VP->sgfn], llb, uub); - } - else - { - if (myrank == 0) - { - int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]); - double *bufferhere = (double *)malloc(sizeof(double) * nnn); - if (!bufferhere) - { - cout << "on node#" << myrank << ", out of memory when dumping data." << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - MPI_Recv(bufferhere, nnn, MPI_DOUBLE, BP->rank, 0, MPI_COMM_WORLD, &sta); - DX = BP->getdX(0); - DY = BP->getdX(1); - DZ = BP->getdX(2); - llb[0] = (feq(BP->bbox[0], tllb[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX; - llb[1] = (feq(BP->bbox[1], tllb[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY; - llb[2] = (feq(BP->bbox[2], tllb[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ; - uub[0] = (feq(BP->bbox[3], tuub[0], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX; - uub[1] = (feq(BP->bbox[4], tuub[1], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY; - uub[2] = (feq(BP->bbox[5], tuub[2], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ; - f_copy(DIM, tllb, tuub, tshape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, bufferhere, llb, uub); - free(bufferhere); - } - else if (myrank == BP->rank) - { - int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]); - MPI_Send(BP->fgfs[VP->sgfn], nnn, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD); - } - } - if (Bp == PP->ble) - break; - Bp = Bp->next; - } - if (myrank == 0) - { - - string out_dir; - map::iterator iter; - iter = parameters::str_par.find("output dir"); - if (iter != parameters::str_par.end()) - { - out_dir = iter->second; - } - else - { - // read parameter from file - const int LEN = 256; - char pline[LEN]; - string str, sgrp, skey, sval; - int sind; - char pname[50]; - { - map::iterator iter = parameters::str_par.find("inputpar"); - if (iter != parameters::str_par.end()) - { - strcpy(pname, (iter->second).c_str()); - } - else - { - cout << "Error inputpar" << endl; - exit(0); - } - } - ifstream inf(pname, ifstream::in); - if (!inf.good()) - { - cout << "Can not open parameter file " << pname << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - - for (int i = 1; inf.good(); i++) - { - inf.getline(pline, LEN); - str = pline; - - int status = misc::parse_parts(str, sgrp, skey, sval, sind); - if (status == -1) - { - cout << "error reading parameter file " << pname << " in line " << i << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - else if (status == 0) - continue; - - if (sgrp == "ABE") - { - if (skey == "output dir") - out_dir = sval; - } - } - inf.close(); - - parameters::str_par.insert(map::value_type("output dir", out_dir)); - } - - char filename[100]; - if (tag) - sprintf(filename, "%s/%s_Lev%02d_%s_%05d.bin", out_dir.c_str(), tag, PP->lev, VP->name, ncount); - else - sprintf(filename, "%s/Lev%02d_%s_%05d.bin", out_dir.c_str(), PP->lev, VP->name, ncount); - - writefile(time, tshape[0], tshape[1], tshape[2], tllb[0], tuub[0], tllb[1], tuub[2], - tllb[2], tuub[2], filename, databuffer); - } - DumpList = DumpList->next; - } - - if (myrank == 0) - free(databuffer); -} -// Map point is much easier than maping data itself -// But the main problem is about the points near the boundary -// worst case is -ghost -ghost+1 .... 0 * ...... -double Parallel::global_interp(int DIM, int *ext, double **CoX, double *datain, - double *poXb, int ordn, double *SoA, int Symmetry) -{ - if (DIM != 3) - { - cout << "Parallel::global_interp does not suport DIM = " << DIM << " for Symmetry." << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - double resu; - double poX[3]; - double asgn = 1; - - for (int i = 0; i < 3; i++) - poX[i] = poXb[i]; - - switch (Symmetry) - { - case 2: - for (int i = 0; i < 3; i++) - if (poX[i] < 0) - { - poX[i] = -poX[i]; - asgn = asgn * SoA[i]; - } - break; - case 1: - if (poX[2] < 0) - { - poX[2] = -poX[2]; - asgn = asgn * SoA[2]; - } - } - - int extb[3]; - - for (int i = 0; i < 3; i++) - extb[i] = ext[i]; - - switch (Symmetry) - { -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - case 2: - if (poX[0] < (ghost_width - 1) * (CoX[0][1] - CoX[0][0])) - extb[0] = extb[0] + ghost_width - 1; - if (poX[1] < (ghost_width - 1) * (CoX[1][1] - CoX[1][0])) - extb[1] = extb[1] + ghost_width - 1; - case 1: - if (poX[2] < (ghost_width - 1) * (CoX[2][1] - CoX[2][0])) - extb[2] = extb[2] + ghost_width - 1; -#else -#ifdef Cell - case 2: - if (poX[0] < (ghost_width - 0.5) * (CoX[0][1] - CoX[0][0])) - extb[0] = extb[0] + ghost_width; - if (poX[1] < (ghost_width - 0.5) * (CoX[1][1] - CoX[1][0])) - extb[1] = extb[1] + ghost_width; - case 1: - if (poX[2] < (ghost_width - 0.5) * (CoX[2][1] - CoX[2][0])) - extb[2] = extb[2] + ghost_width; -#else -#error Not define Vertex nor Cell -#endif -#endif - } - - if (extb[0] > ext[0] || extb[1] > ext[1] || extb[2] > ext[2]) - { - double *CoXb[3]; - int Nb = extb[0] * extb[1] * extb[2]; - double *datab; - datab = new double[Nb]; - for (int i = 0; i < 3; i++) - { - CoXb[i] = new double[extb[i]]; - double DH = CoX[i][1] - CoX[i][0]; - if (extb[i] > ext[i]) - { - if (CoX[i][0] > DH) - { - cout << "lower boundary[" << i << "] = " << CoX[i][0] << ", but SYmmetry = " << Symmetry << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - for (int j = 0; j < ghost_width - 1; j++) - CoXb[i][j] = -CoX[i][ghost_width - 1 - j]; - for (int j = ghost_width - 1; j < extb[i]; j++) - CoXb[i][j] = CoX[i][j - ghost_width + 1]; -#else -#ifdef Cell - for (int j = 0; j < ghost_width; j++) - CoXb[i][j] = -CoX[i][ghost_width - 1 - j]; - for (int j = ghost_width; j < extb[i]; j++) - CoXb[i][j] = CoX[i][j - ghost_width]; -#else -#error Not define Vertex nor Cell -#endif -#endif - } - else - { - for (int j = 0; j < extb[i]; j++) - CoXb[i][j] = CoX[i][j]; - } - } - - for (int i = 0; i < Nb; i++) - { - int ind[3], indb[3]; - getarrayindex(3, extb, indb, i); - double sgn = 1; - for (int j = 0; j < 3; j++) - { - if (extb[j] > ext[j]) - { -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - if (indb[j] < ghost_width - 1) - { - ind[j] = ghost_width - 1 - indb[j]; - sgn = sgn * SoA[j]; - } - else - { - ind[j] = 1 + indb[j] - ghost_width; - } -#else -#ifdef Cell - if (indb[j] < ghost_width) - { - ind[j] = ghost_width - 1 - indb[j]; - sgn = sgn * SoA[j]; - } - else - { - ind[j] = indb[j] - ghost_width; - } -#else -#error Not define Vertex nor Cell -#endif -#endif - } - else - ind[j] = indb[j]; - } - int lon = getarraylocation(3, ext, ind); - datab[i] = datain[lon] * sgn; - } - - resu = global_interp(DIM, extb, CoXb, datab, poX, ordn); - - for (int i = 0; i < 3; i++) - delete[] CoXb[i]; - delete[] datab; - } - else - { - resu = global_interp(DIM, ext, CoX, datain, poX, ordn); - } - - return resu * asgn; -} -double Parallel::global_interp(int DIM, int *ext, double **CoX, double *datain, - double *poX, int ordn) -{ - if (ordn > 2 * ghost_width) - { - cout << "Parallel::global_interp can not handle ordn = " << ordn << " > 2*ghost_width = " << 2 * ghost_width << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - - double *bbox, *datainbbox; - bbox = new double[2 * DIM]; - datainbbox = new double[2 * DIM]; - - int *NN, *ind, *shape; - NN = new int[DIM]; - ind = new int[DIM]; - shape = new int[DIM]; - - for (int i = 0; i < DIM; i++) - { - ind[i] = int((poX[i] - CoX[i][0]) / (CoX[i][1] - CoX[i][0])) - ordn / 2 + 1; - // poX may exactly locate on the boundary (exclude ghost) - if (ind[i] == -1 && feq(poX[i], CoX[i][0], (CoX[i][1] - CoX[i][0]) / 2)) - ind[i] = 0; - /* - if(ind[i] < 0) - { - cout<<"Parallel::global_interp error ind["< ext = "<= 0; i--) - NN[i] = NN[i + 1] * ordn; - - double *xpts, *funcvals; - xpts = new double[ordn]; - funcvals = new double[ordn]; - double *DDd, *DDd1, rr; - - DDd = new double[NN[0]]; - - copy(DIM, bbox, bbox + DIM, shape, DDd, datainbbox, datainbbox + DIM, ext, datain, bbox, bbox + DIM); - - for (int i = 0; i < DIM; i++) - { - for (int j = ind[i]; j < ind[i] + ordn; j++) - { - xpts[j - ind[i]] = CoX[i][j]; - } - - if (i < DIM - 1) - { - DDd1 = new double[NN[i + 1]]; - for (int j = 0; j < NN[i + 1]; j++) - { - for (int k = 0; k < ordn; k++) - funcvals[k] = DDd[k + j * ordn]; - DDd1[j] = Lagrangian_Int(poX[i], ordn, xpts, funcvals); - } - delete[] DDd; - DDd = DDd1; - } - else - { - for (int j = 0; j < ordn; j++) - funcvals[j] = DDd[j]; - rr = Lagrangian_Int(poX[i], ordn, xpts, funcvals); - delete[] DDd1; // since DDd and DDd1 now point to the same stuff, we need delete after above int - } - } - - delete[] NN; - delete[] ind; - delete[] xpts; - delete[] funcvals; - delete[] bbox; - delete[] datainbbox; - delete[] shape; - - return rr; -} -double Parallel::Lagrangian_Int(double x, int npts, double *xpts, double *funcvals) -{ - double sum = 0; - for (int i = 0; i < npts; i++) - { - sum = sum + funcvals[i] * LagrangePoly(x, i, npts, xpts); - } - return sum; -} -double Parallel::LagrangePoly(double x, int pt, int npts, double *xpts) -{ - double h = 1; - int i; - - for (i = 0; i < pt; i++) - h = h * (x - xpts[i]) / (xpts[pt] - xpts[i]); - - for (i = pt + 1; i < npts; i++) - h = h * (x - xpts[i]) / (xpts[pt] - xpts[i]); - - return h; -} -// collect all grid segments or blocks including ghost and buffer for given patch -MyList *Parallel::build_complete_gsl(Patch *Pat) -{ - MyList *cgsl = 0, *gs; - MyList *BP = Pat->blb; - while (BP) - { - if (!cgsl) - { - cgsl = gs = new MyList; // delete through destroyList(); - gs->data = new Parallel::gridseg; - } - else - { - gs->next = new MyList; - gs = gs->next; - gs->data = new Parallel::gridseg; - } - - for (int i = 0; i < dim; i++) - { - gs->data->llb[i] = BP->data->bbox[i]; - gs->data->uub[i] = BP->data->bbox[dim + i]; - gs->data->shape[i] = BP->data->shape[i]; - } - gs->data->Bg = BP->data; - gs->next = 0; - - if (BP == Pat->ble) - break; - BP = BP->next; - } - - return cgsl; -} -// collect all grid segments or blocks including ghost and buffer for given patch list -MyList *Parallel::build_complete_gsl(MyList *PatL) -{ - MyList *cgsl = 0, *gs; - while (PatL) - { - if (!cgsl) - { - cgsl = build_complete_gsl(PatL->data); - gs = cgsl; - while (gs->next) - gs = gs->next; - } - else - { - gs->next = build_complete_gsl(PatL->data); - gs = gs->next; - while (gs->next) - gs = gs->next; - } - PatL = PatL->next; - } - - return cgsl; -} -// cellect the information of Patch list -MyList *Parallel::build_complete_gsl_virtual(MyList *PatL) -{ - MyList *cgsl = 0, *gs; - while (PatL) - { - if (cgsl) - { - gs->next = new MyList; - gs = gs->next; - gs->data = new Parallel::gridseg; - } - else - { - cgsl = gs = new MyList; - gs->data = new Parallel::gridseg; - } - - for (int i = 0; i < dim; i++) - { - gs->data->llb[i] = PatL->data->bbox[i]; - gs->data->uub[i] = PatL->data->bbox[dim + i]; - gs->data->shape[i] = PatL->data->shape[i]; - } - gs->data->Bg = 0; - gs->next = 0; - - PatL = PatL->next; - } - - return cgsl; -} -// cellect the information of Patch list without buffer points -MyList *Parallel::build_complete_gsl_virtual2(MyList *PatL) // - buffer -{ - MyList *cgsl = 0, *gs; - while (PatL) - { - if (cgsl) - { - gs->next = new MyList; - gs = gs->next; - gs->data = new Parallel::gridseg; - } - else - { - cgsl = gs = new MyList; - gs->data = new Parallel::gridseg; - } - - for (int i = 0; i < dim; i++) - { - double DH = PatL->data->getdX(i); - gs->data->llb[i] = PatL->data->bbox[i] + PatL->data->lli[i] * DH; - gs->data->uub[i] = PatL->data->bbox[dim + i] - PatL->data->uui[i] * DH; - gs->data->shape[i] = PatL->data->shape[i] - PatL->data->lli[i] - PatL->data->uui[i]; - } - gs->data->Bg = 0; - gs->next = 0; - - PatL = PatL->next; - } - - return cgsl; -} -// collect all grid segments or blocks without ghost for given patch, without extension -MyList *Parallel::build_bulk_gsl(Patch *Pat) -{ - MyList *cgsl = 0, *gs; - MyList *BP = Pat->blb; - while (BP) - { - Block *bp = BP->data; - if (!cgsl) - { - cgsl = gs = new MyList; - gs->data = new Parallel::gridseg; - } - else - { - gs->next = new MyList; - gs = gs->next; - gs->data = new Parallel::gridseg; - } - - for (int i = 0; i < dim; i++) - { - double DH = bp->getdX(i); - gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] : bp->bbox[dim + i] - ghost_width * DH; - gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] : bp->bbox[i] + ghost_width * DH; -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1; -#else -#ifdef Cell - gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4); -#else -#error Not define Vertex nor Cell -#endif -#endif - } - gs->data->Bg = BP->data; - gs->next = 0; - - if (BP == Pat->ble) - break; - BP = BP->next; - } - - return cgsl; -} -// bulk part for given Block within given patch, without extension -MyList *Parallel::build_bulk_gsl(Block *bp, Patch *Pat) -{ - MyList *gs = 0; - - gs = new MyList; - gs->data = new Parallel::gridseg; - - for (int i = 0; i < dim; i++) - { - double DH = bp->getdX(i); - gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] : bp->bbox[dim + i] - ghost_width * DH; - gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] : bp->bbox[i] + ghost_width * DH; -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1; -#else -#ifdef Cell - gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4); -#else -#error Not define Vertex nor Cell -#endif -#endif - } - gs->data->Bg = bp; - gs->next = 0; - - return gs; -} -MyList *Parallel::clone_gsl(MyList *p, bool first_only) -{ - MyList *np = 0, *q = 0, *pq = 0; - - while (p) - { - q = new MyList; - q->data = new Parallel::gridseg; - q->data->Bg = p->data->Bg; - for (int i = 0; i < dim; i++) - { - q->data->llb[i] = p->data->llb[i]; - q->data->uub[i] = p->data->uub[i]; - q->data->shape[i] = p->data->shape[i]; - } - if (pq) - pq->next = q; - else - np = q; - if (first_only) - { - np->next = 0; - return np; - } - pq = q; - p = p->next; - } - return np; -} -MyList *Parallel::gs_subtract(MyList *A, MyList *B) -{ - if (!A) - return 0; - if (!B) - return clone_gsl(A, true); - - double cut_plane[2 * dim], DH[dim]; - - for (int i = 0; i < dim; i++) - { - DH[i] = A->data->Bg->getdX(i); - if (B->data->Bg && !feq(DH[i], B->data->Bg->getdX(i), DH[i] / 2)) - { - cout << "Parallel::gs_subtract meets different grid segment " << DH[i] << " vs " << B->data->Bg->getdX(i) << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - } - - MyList *C = 0, *q; - for (int i = 0; i < dim; i++) - { - if (B->data->llb[i] > A->data->uub[i] || B->data->uub[i] < A->data->llb[i]) - return clone_gsl(A, true); - cut_plane[i] = A->data->llb[i]; - cut_plane[i + dim] = A->data->uub[i]; - } - - for (int i = 0; i < dim; i++) - { - cut_plane[i] = Mymax(A->data->llb[i], B->data->llb[i]); - if (cut_plane[i] - A->data->llb[i] > DH[i] / 2) - { - q = clone_gsl(A, true); - // prolong the list from head - if (C) - q->next = C; - C = q; - for (int j = 0; j < dim; j++) - { - if (i == j) - { - C->data->llb[i] = A->data->llb[i]; -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - C->data->uub[i] = Mymax(C->data->llb[i], cut_plane[i] - DH[i]); -#else -#ifdef Cell - C->data->uub[i] = Mymax(C->data->llb[i], cut_plane[i]); -#else -#error Not define Vertex nor Cell -#endif -#endif - } - else - { - C->data->llb[j] = cut_plane[j]; - C->data->uub[j] = cut_plane[j + dim]; - } -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4) + 1; -#else -#ifdef Cell - C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4); -#else -#error Not define Vertex nor Cell -#endif -#endif - } - } - - cut_plane[i + dim] = Mymin(A->data->uub[i], B->data->uub[i]); - if (A->data->uub[i] - cut_plane[i + dim] > DH[i] / 2) - { - q = clone_gsl(A, true); - if (C) - q->next = C; - C = q; - for (int j = 0; j < dim; j++) - { - if (i == j) - { - C->data->uub[i] = A->data->uub[i]; -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - C->data->llb[i] = Mymin(C->data->uub[i], cut_plane[i + dim] + DH[i]); -#else -#ifdef Cell - C->data->llb[i] = Mymin(C->data->uub[i], cut_plane[i + dim]); -#else -#error Not define Vertex nor Cell -#endif -#endif - } - else - { - C->data->llb[j] = cut_plane[j]; - C->data->uub[j] = cut_plane[j + dim]; - } -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4) + 1; -#else -#ifdef Cell - C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4); -#else -#error Not define Vertex nor Cell -#endif -#endif - } - } - } - return C; -} -// stupid method -/* -MyList *Parallel::gsl_subtract(MyList *A,MyList *B) //A subtract B but with A's information -{ -// always make return and A, B distinct - if(!A) return 0; - - if(!B) return clone_gsl(A,0); - - MyList *C=0,*C0,*C1,*Cc,*CC0,*gs; - - while(A) - { - C0=gs_subtract(A,B); // note C0 becomes a list after subtraction - C1=B->next; - while(C1) - { - CC0=C0; - Cc=0; - while(CC0) - { - gs=gs_subtract(CC0,C1); - if(Cc) Cc->catList(gs); - else Cc=gs; - CC0=CC0->next; - } - if(C0) C0->destroyList(); - C0=Cc; - C1=C1->next; - } - if(C) C->catList(C0); - else C=C0; - A=A->next; - } - - return C; -} -*/ -// more clever method -MyList *Parallel::gsl_subtract(MyList *A, MyList *B) // A subtract B but with A's information -{ - // always make return and A, B distinct - if (!A) - return 0; - - MyList *C = 0, *C0, *C1; - - C = clone_gsl(A, 0); - - while (B) - { - C0 = 0; - C1 = C; - while (C1) - { - if (C0) - C0->catList(gs_subtract(C1, B)); - else - C0 = gs_subtract(C1, B); - C1 = C1->next; - } - if (C) - C->destroyList(); - else - { - if (C0) - C0->destroyList(); - return 0; - } - - C = C0; - B = B->next; - } - - return C; -} -MyList *Parallel::gs_and(MyList *A, MyList *B) -{ - if (!A || !B) - return 0; - - double llb[dim], uub[dim]; - bool flag = false; - for (int i = 0; i < dim; i++) - { - llb[i] = Mymax(A->data->llb[i], B->data->llb[i]); - uub[i] = Mymin(A->data->uub[i], B->data->uub[i]); - if (llb[i] > uub[i]) - { - flag = true; - break; - } - } - if (flag) - return 0; - - MyList *C; - C = clone_gsl(A, true); - for (int i = 0; i < dim; i++) - { - C->data->llb[i] = llb[i]; - C->data->uub[i] = uub[i]; -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / C->data->Bg->getdX(i) + 0.4) + 1; -#else -#ifdef Cell - C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / C->data->Bg->getdX(i) + 0.4); -#else -#error Not define Vertex nor Cell -#endif -#endif - } - - return C; -} -// overlap of A_i and (union of all j of B_j) -MyList *Parallel::gsl_and(MyList *A, MyList *B) // A and B but with A's information -{ - MyList *C = 0, *C1; - - while (A) - { - C1 = B; - while (C1) - { - if (C) - C->catList(gs_and(A, C1)); - else - C = gs_and(A, C1); - C1 = C1->next; - } - A = A->next; - } - return C; -} -// collect all ghost grid segments or blocks for given patch -MyList *Parallel::build_ghost_gsl(Patch *Pat) -{ - MyList *cgsl = 0, *gs, *gsb; - MyList *BP = Pat->blb; - while (BP) - { - gs = new MyList; - gs->data = new Parallel::gridseg; - - for (int i = 0; i < dim; i++) - { - gs->data->llb[i] = BP->data->bbox[i]; - gs->data->uub[i] = BP->data->bbox[dim + i]; - gs->data->shape[i] = BP->data->shape[i]; - } - gs->data->Bg = BP->data; - gs->next = 0; - - gsb = build_bulk_gsl(BP->data, Pat); - - if (!cgsl) - cgsl = gs_subtract(gs, gsb); - else - cgsl->catList(gs_subtract(gs, gsb)); - - gsb->destroyList(); - gs->destroyList(); - - if (BP == Pat->ble) - break; - BP = BP->next; - } - - return cgsl; -} -// collect all ghost grid segments or blocks for given patch list -MyList *Parallel::build_ghost_gsl(MyList *PatL) -{ - MyList *cgsl = 0, *gs; - while (PatL) - { - if (!cgsl) - { - cgsl = build_ghost_gsl(PatL->data); - gs = cgsl; - while (gs->next) - gs = gs->next; - } - else - { - gs->next = build_ghost_gsl(PatL->data); - gs = gs->next; - while (gs->next) - gs = gs->next; - } - PatL = PatL->next; - } - - return cgsl; -} -// collect all grid segments or blocks without ghost for given patch -// special for Sync usage, so we do not need consider missing points -MyList *Parallel::build_owned_gsl0(Patch *Pat, int rank_in) -{ - MyList *cgsl = 0, *gs; - MyList *BP = Pat->blb; - while (BP) - { - Block *bp = BP->data; - if (bp->rank == rank_in) - { - if (!cgsl) - { - cgsl = gs = new MyList; - gs->data = new Parallel::gridseg; - } - else - { - gs->next = new MyList; - gs = gs->next; - gs->data = new Parallel::gridseg; - } - - for (int i = 0; i < dim; i++) - { - double DH = bp->getdX(i); - gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] : bp->bbox[dim + i] - ghost_width * DH; - gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] : bp->bbox[i] + ghost_width * DH; -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1; -#else -#ifdef Cell - gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4); -#else -#error Not define Vertex nor Cell -#endif -#endif - } - gs->data->Bg = BP->data; - gs->next = 0; - } - - if (BP == Pat->ble) - break; - BP = BP->next; - } - - return cgsl; -} -// collect all grid segments or blocks without ghost for given patch -MyList *Parallel::build_owned_gsl1(Patch *Pat, int rank_in) -{ - MyList *cgsl = 0, *gs; - MyList *BP = Pat->blb; - while (BP) - { - Block *bp = BP->data; - if (bp->rank == rank_in) - { - if (!cgsl) - { - cgsl = gs = new MyList; - gs->data = new Parallel::gridseg; - } - else - { - gs->next = new MyList; - gs = gs->next; - gs->data = new Parallel::gridseg; - } - - for (int i = 0; i < dim; i++) - { - double DH = bp->getdX(i); - gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] : bp->bbox[dim + i] - ghost_width * DH; -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - // NOTE: our dividing structure is (exclude ghost) - // -1 0 - // 1 2 - // so (0,1) does not belong to any part for vertex structure, we always put it to right part, this is consistent to - // the fortran routine where we always take floor to get index - gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] : bp->bbox[i] + (ghost_width - 1) * DH; - gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1; -#else -#ifdef Cell - gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] : bp->bbox[i] + ghost_width * DH; - gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4); -#else -#error Not define Vertex nor Cell -#endif -#endif - } - gs->data->Bg = BP->data; - gs->next = 0; - } - - if (BP == Pat->ble) - break; - BP = BP->next; - } - - return cgsl; -} -// collect all grid segments or blocks without ghost nor buffer for given patch -MyList *Parallel::build_owned_gsl2(Patch *Pat, int rank_in) -{ - MyList *cgsl = 0, *gs; - MyList *BP = Pat->blb; - while (BP) - { - Block *bp = BP->data; - if (bp->rank == rank_in) - { - if (!cgsl) - { - cgsl = gs = new MyList; - gs->data = new Parallel::gridseg; - } - else - { - gs->next = new MyList; - gs = gs->next; - gs->data = new Parallel::gridseg; - } - - for (int i = 0; i < dim; i++) - { - double DH = bp->getdX(i); - gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] - Pat->uui[i] * DH : bp->bbox[dim + i] - ghost_width * DH; -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - // NOTE: our dividing structure is (exclude ghost) - // -1 0 - // 1 2 - // so (0,1) does not belong to any part for vertex structure, we always put it to right part, this is consistent to - // the fortran routine where we always take floor to get index - gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] + Pat->lli[i] * DH : bp->bbox[i] + (ghost_width - 1) * DH; - gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1; -#else -#ifdef Cell - gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] + Pat->lli[i] * DH : bp->bbox[i] + ghost_width * DH; - gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4); -#else -#error Not define Vertex nor Cell -#endif -#endif - } - gs->data->Bg = BP->data; - gs->next = 0; - } - - if (BP == Pat->ble) - break; - BP = BP->next; - } - - return cgsl; -} -// collect all grid segments or blocks without ghost for given patch, and delete the ghost_width for interpolation consideration on the patch boundary -MyList *Parallel::build_owned_gsl3(Patch *Pat, int rank_in, int Symmetry) -{ - MyList *cgsl = 0, *gs; - MyList *BP = Pat->blb; - while (BP) - { - Block *bp = BP->data; - if (bp->rank == rank_in) - { - if (!cgsl) - { - cgsl = gs = new MyList; - gs->data = new Parallel::gridseg; - } - else - { - gs->next = new MyList; - gs = gs->next; - gs->data = new Parallel::gridseg; - } - - for (int i = 0; i < dim; i++) - { - double DH = bp->getdX(i); - gs->data->uub[i] = bp->bbox[dim + i] - ghost_width * DH; -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - // NOTE: our dividing structure is (exclude ghost) - // -1 0 - // 1 2 - // so (0,1) does not belong to any part for vertex structure, we always put it to right part, this is consistent to - // the fortran routine where we always take floor to get index - gs->data->llb[i] = bp->bbox[i] + (ghost_width - 1) * DH; - gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1; -#else -#ifdef Cell - gs->data->llb[i] = bp->bbox[i] + ghost_width * DH; - gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4); -#else -#error Not define Vertex nor Cell -#endif -#endif - } - // Symmetry consideration - if (Symmetry > 0) - { - double DH = bp->getdX(2); - if (feq(bp->bbox[2], 0, DH / 2)) - { - gs->data->llb[2] = bp->bbox[2]; -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - gs->data->shape[2] = int((gs->data->uub[2] - gs->data->llb[2]) / DH + 0.4) + 1; -#else -#ifdef Cell - gs->data->shape[2] = int((gs->data->uub[2] - gs->data->llb[2]) / DH + 0.4); -#else -#error Not define Vertex nor Cell -#endif -#endif - } - if (Symmetry > 1) - { - for (int i = 0; i < 2; i++) - { - DH = bp->getdX(i); - if (feq(bp->bbox[i], 0, DH / 2)) - { - gs->data->llb[i] = bp->bbox[i]; -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1; -#else -#ifdef Cell - gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4); -#else -#error Not define Vertex nor Cell -#endif -#endif - } - } - } - } - - gs->data->Bg = BP->data; - gs->next = 0; - } - - if (BP == Pat->ble) - break; - BP = BP->next; - } - - return cgsl; -} -// collect all grid segments or blocks without ghost nor buffer for given patch, -// and delete the ghost_width for interpolation consideration on the patch boundary -MyList *Parallel::build_owned_gsl4(Patch *Pat, int rank_in, int Symmetry) -{ - MyList *cgsl = 0, *gs; - MyList *BP = Pat->blb; - while (BP) - { - Block *bp = BP->data; - if (bp->rank == rank_in) - { - if (!cgsl) - { - cgsl = gs = new MyList; - gs->data = new Parallel::gridseg; - } - else - { - gs->next = new MyList; - gs = gs->next; - gs->data = new Parallel::gridseg; - } - - for (int i = 0; i < dim; i++) - { - double DH = bp->getdX(i); - gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] - Pat->uui[i] * DH : bp->bbox[dim + i]; - gs->data->uub[i] -= ghost_width * DH; -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - // NOTE: our dividing structure is (exclude ghost) - // -1 0 - // 1 2 - // so (0,1) does not belong to any part for vertex structure, we always put it to right part, this is consistent to - // the fortran routine where we always take floor to get index - gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] + Pat->lli[i] * DH : bp->bbox[i]; - gs->data->llb[i] += (ghost_width - 1) * DH; - gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1; -#else -#ifdef Cell - gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] + Pat->lli[i] * DH : bp->bbox[i]; - gs->data->llb[i] += ghost_width * DH; - gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4); -#else -#error Not define Vertex nor Cell -#endif -#endif - } - // Symmetry consideration - if (Symmetry > 0) - { - double DH = bp->getdX(2); - if (feq(bp->bbox[2], 0, DH / 2)) - { - gs->data->llb[2] = bp->bbox[2]; -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - gs->data->shape[2] = int((gs->data->uub[2] - gs->data->llb[2]) / DH + 0.4) + 1; -#else -#ifdef Cell - gs->data->shape[2] = int((gs->data->uub[2] - gs->data->llb[2]) / DH + 0.4); -#else -#error Not define Vertex nor Cell -#endif -#endif - } - if (Symmetry > 1) - { - for (int i = 0; i < 2; i++) - { - DH = bp->getdX(i); - if (feq(bp->bbox[i], 0, DH / 2)) - { - gs->data->llb[i] = bp->bbox[i]; -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1; -#else -#ifdef Cell - gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4); -#else -#error Not define Vertex nor Cell -#endif -#endif - } - } - } - } - - gs->data->Bg = BP->data; - gs->next = 0; - } - - if (BP == Pat->ble) - break; - BP = BP->next; - } - - return cgsl; -} -// collect all grid segments or blocks without ghost nor buffer for given patch, no extention -MyList *Parallel::build_owned_gsl5(Patch *Pat, int rank_in) -{ - MyList *cgsl = 0, *gs; - MyList *BP = Pat->blb; - while (BP) - { - Block *bp = BP->data; - if (bp->rank == rank_in) - { - if (!cgsl) - { - cgsl = gs = new MyList; - gs->data = new Parallel::gridseg; - } - else - { - gs->next = new MyList; - gs = gs->next; - gs->data = new Parallel::gridseg; - } - - for (int i = 0; i < dim; i++) - { - double DH = bp->getdX(i); - gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] - Pat->uui[i] * DH : bp->bbox[dim + i] - ghost_width * DH; - gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] + Pat->lli[i] * DH : bp->bbox[i] + ghost_width * DH; -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1; -#else -#ifdef Cell - gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4); -#else -#error Not define Vertex nor Cell -#endif -#endif - } - gs->data->Bg = BP->data; - gs->next = 0; - } - - if (BP == Pat->ble) - break; - BP = BP->next; - } - - return cgsl; -} -// collect all grid segments or blocks without ghost for given patch list -// stupid method -/* -MyList *Parallel::build_owned_gsl(MyList *PatL,int rank_in,int type,int Symmetry) -{ - MyList *cgsl=0,*gs; - while(PatL) - { - if(!cgsl) - { - switch(type) - { - case 0: - cgsl = build_owned_gsl0(PatL->data,rank_in); - break; - case 1: - cgsl = build_owned_gsl1(PatL->data,rank_in); - break; - case 2: - cgsl = build_owned_gsl2(PatL->data,rank_in); - break; - case 3: - cgsl = build_owned_gsl3(PatL->data,rank_in,Symmetry); - break; - case 4: - cgsl = build_owned_gsl4(PatL->data,rank_in,Symmetry); - break; - case 5: - cgsl = build_owned_gsl5(PatL->data,rank_in); - break; - default: - cout<<"Parallel::build_owned_gsl : unknown type = "<next) gs = gs->next; - } - else - { - switch(type) - { - case 0: - gs->next = build_owned_gsl0(PatL->data,rank_in); - break; - case 1: - gs->next = build_owned_gsl1(PatL->data,rank_in); - break; - case 2: - gs->next = build_owned_gsl2(PatL->data,rank_in); - break; - case 3: - gs->next = build_owned_gsl3(PatL->data,rank_in,Symmetry); - break; - case 4: - gs->next = build_owned_gsl4(PatL->data,rank_in,Symmetry); - break; - case 5: - gs->next = build_owned_gsl5(PatL->data,rank_in); - break; - default: - cout<<"Parallel::build_owned_gsl : unknown type = "<next) gs = gs->next; - } - PatL = PatL->next; - } - - return cgsl; -} -*/ -// more clever method -MyList *Parallel::build_owned_gsl(MyList *PatL, int rank_in, int type, int Symmetry) -{ - MyList *cgsl = 0, *gs; - while (PatL) - { - switch (type) - { - case 0: - gs = build_owned_gsl0(PatL->data, rank_in); - break; - case 1: - gs = build_owned_gsl1(PatL->data, rank_in); - break; - case 2: - gs = build_owned_gsl2(PatL->data, rank_in); - break; - case 3: - gs = build_owned_gsl3(PatL->data, rank_in, Symmetry); - break; - case 4: - gs = build_owned_gsl4(PatL->data, rank_in, Symmetry); - break; - case 5: - gs = build_owned_gsl5(PatL->data, rank_in); - break; - default: - cout << "Parallel::build_owned_gsl : unknown type = " << type << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - if (cgsl) - cgsl->catList(gs); - else - cgsl = gs; - PatL = PatL->next; - } - - return cgsl; -} -// according to overlape to determine real grid segments -void Parallel::build_gstl(MyList *srci, MyList *dsti, - MyList **out_src, MyList **out_dst) -{ - *out_src = *out_dst = 0; - - if (!srci || !dsti) - return; - - MyList *s, *d; - MyList *s2, *d2; - - double llb[dim], uub[dim]; - - s = srci; - while (s) - { - Parallel::gridseg *sd = s->data; - d = dsti; - while (d) - { - Parallel::gridseg *dd = d->data; - bool flag = true; - for (int i = 0; i < dim; i++) - { - double SH = sd->Bg->getdX(i), DH = dd->Bg->getdX(i); - llb[i] = Mymax(sd->llb[i], dd->llb[i]); - uub[i] = Mymin(sd->uub[i], dd->uub[i]); - // make sure the region boundary is consistent to the grids - // here we only judge if the domain is empty, so do not need to adjust the align - double lb = llb[i], ub = uub[i]; -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - // ---*--- - // x-------x - // if (int(2*(sd->uub[i]-uub[i])/SH+0.4)%2 == 1) ub = uub[i]-SH/2; - // else if(int(2*(dd->uub[i]-uub[i])/DH+0.4)%2 == 1) ub = uub[i]-DH/2; - // if (int(2*(llb[i]-sd->llb[i])/SH+0.4)%2 == 1) lb = llb[i]+SH/2; - // else if(int(2*(llb[i]-dd->llb[i])/DH+0.4)%2 == 1) lb = llb[i]+DH/2; - if (lb > ub + Mymin(SH, DH) / 2) - { - flag = false; - break; - } // special for isolated point -#else -#ifdef Cell - // |------| - // |-------------| - // if (int(2*(sd->uub[i]-uub[i])/SH+0.4)%2 == 1) ub = uub[i]+SH/2; - // else if(int(2*(dd->uub[i]-uub[i])/DH+0.4)%2 == 1) ub = uub[i]+DH/2; - // |------| - // |-------------| - // if (int(2*(llb[i]-sd->llb[i])/SH+0.4)%2 == 1) lb = llb[i]-SH/2; - // else if(int(2*(llb[i]-dd->llb[i])/DH+0.4)%2 == 1) lb = llb[i]-DH/2; - if (ub - lb < Mymin(SH, DH) / 2) - { - flag = false; - break; - } // even for isolated point, it has a cell belong to it -#else -#error Not define Vertex nor Cell -#endif -#endif - } - - if (flag) - { - if (!(*out_src)) - { - *out_src = s2 = new MyList; - *out_dst = d2 = new MyList; - s2->data = new Parallel::gridseg; - d2->data = new Parallel::gridseg; - } - else - { - s2->next = new MyList; - s2 = s2->next; - d2->next = new MyList; - d2 = d2->next; - s2->data = new Parallel::gridseg; - d2->data = new Parallel::gridseg; - } - - for (int i = 0; i < dim; i++) - { - double SH = sd->Bg->getdX(i), DH = dd->Bg->getdX(i); - s2->data->llb[i] = d2->data->llb[i] = llb[i]; - s2->data->uub[i] = d2->data->uub[i] = uub[i]; -// using float method to count point, we do not need following consideration (2012 nov 17) -#if 1 - -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - // old code distuinguish vertex and cell - // if (int(2*(sd->uub[i]-uub[i])/SH+0.4)%2 == 1) s2->data->uub[i] = uub[i]-SH/2; - // else if(int(2*(dd->uub[i]-uub[i])/DH+0.4)%2 == 1) d2->data->uub[i] = uub[i]-DH/2; - // if (int(2*(llb[i]-sd->llb[i])/SH+0.4)%2 == 1) s2->data->llb[i] = llb[i]+SH/2; - // else if(int(2*(llb[i]-dd->llb[i])/DH+0.4)%2 == 1) d2->data->llb[i] = llb[i]+DH/2; - // new code: here we concern much more about missing point, because overlaping domain has been gaureented above - if (int(2 * (sd->uub[i] - uub[i]) / SH + 0.4) % 2 == 1) - s2->data->uub[i] = uub[i] + SH / 2; - else if (int(2 * (dd->uub[i] - uub[i]) / DH + 0.4) % 2 == 1) - d2->data->uub[i] = uub[i] + DH / 2; - if (int(2 * (llb[i] - sd->llb[i]) / SH + 0.4) % 2 == 1) - s2->data->llb[i] = llb[i] - SH / 2; - else if (int(2 * (llb[i] - dd->llb[i]) / DH + 0.4) % 2 == 1) - d2->data->llb[i] = llb[i] - DH / 2; - s2->data->shape[i] = int((s2->data->uub[i] - s2->data->llb[i]) / SH + 0.4) + 1; - d2->data->shape[i] = int((d2->data->uub[i] - d2->data->llb[i]) / DH + 0.4) + 1; -#else -#ifdef Cell - if (int(2 * (sd->uub[i] - uub[i]) / SH + 0.4) % 2 == 1) - s2->data->uub[i] = uub[i] + SH / 2; - else if (int(2 * (dd->uub[i] - uub[i]) / DH + 0.4) % 2 == 1) - d2->data->uub[i] = uub[i] + DH / 2; - if (int(2 * (llb[i] - sd->llb[i]) / SH + 0.4) % 2 == 1) - s2->data->llb[i] = llb[i] - SH / 2; - else if (int(2 * (llb[i] - dd->llb[i]) / DH + 0.4) % 2 == 1) - d2->data->llb[i] = llb[i] - DH / 2; - s2->data->shape[i] = int((s2->data->uub[i] - s2->data->llb[i]) / SH + 0.4); - d2->data->shape[i] = int((d2->data->uub[i] - d2->data->llb[i]) / DH + 0.4); -#else -#error Not define Vertex nor Cell -#endif -#endif - -#endif - s2->data->illb[i] = sd->illb[i]; - d2->data->illb[i] = dd->illb[i]; - s2->data->iuub[i] = sd->iuub[i]; - d2->data->iuub[i] = dd->iuub[i]; - } - s2->data->Bg = sd->Bg; - s2->next = 0; - d2->data->Bg = dd->Bg; - d2->next = 0; - } - d = d->next; - } - s = s->next; - } -} -// PACK: prepare target data in 'data' -// UNPACK: copy target data from 'data' to corresponding numerical grids -int Parallel::data_packer(double *data, MyList *src, MyList *dst, int rank_in, int dir, - MyList *VarLists /* source */, MyList *VarListd /* target */, int Symmetry) -{ - int myrank; - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - - int DIM = dim; - - if (dir != PACK && dir != UNPACK) - { - cout << "error dir " << dir << " for data_packer " << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - - int size_out = 0; - - if (!src || !dst) - return size_out; - + nx = Mymax(1, shape / min_width); + nx = Mymin(cpusize, nx); + + return nx; +} +int Parallel::partition2(int *nxy, int split_size, int *min_width, int cpusize, int *shape) // special for 2 diemnsions +{ +#define SEARCH_SIZE 5 + int i, j, nx, ny; + int maxnx, maxny; + int mnx, mny; + int dn, hmin_width, cmin_width; + int cnx, cny; + double fx, fy; + int block_size; + int n; + + block_size = shape[0] * shape[1]; + n = Mymax(1, (block_size + split_size / 2) / split_size); + + maxnx = Mymax(1, shape[0] / min_width[0]); + maxnx = Mymin(cpusize, maxnx); + maxny = Mymax(1, shape[1] / min_width[1]); + maxny = Mymin(cpusize, maxny); + fx = (double)shape[0] / (shape[0] + shape[1]); + fy = (double)shape[1] / (shape[0] + shape[1]); + nx = mnx = Mymax(1, Mymin(maxnx, (int)(sqrt(double(n)) * fx / fy))); + ny = mny = Mymax(1, Mymin(maxny, (int)(sqrt(double(n)) * fy / fx))); + dn = abs(n - nx * ny); + hmin_width = Mymin(shape[0] / nx, shape[1] / ny); + for (cny = Mymax(1, mny - SEARCH_SIZE); cny <= (Mymin(mny + SEARCH_SIZE, maxny)); cny++) + for (cnx = Mymax(1, mnx - SEARCH_SIZE); cnx <= (Mymin(mnx + SEARCH_SIZE, maxnx)); cnx++) + { + cmin_width = Mymin(shape[0] / cnx, shape[1] / cny); + if (dn > abs(n - cnx * cny) || (dn == abs(n - cnx * cny) && cmin_width > hmin_width)) + { + dn = abs(n - cnx * cny); + nx = cnx; + ny = cny; + hmin_width = cmin_width; + } + } + + nxy[0] = nx; + nxy[1] = ny; + + return nx * ny; +#undef SEARCH_SIZE +} +int Parallel::partition3(int *nxyz, int split_size, int *min_width, int cpusize, int *shape) // special for 3 diemnsions +#if 1 // algrithsm from Pretorius +{ +// cout< abs(n - cnx * cny * cnz) || (dn == abs(n - cnx * cny * cnz) && cmin_width > hmin_width)) + { + dn = abs(n - cnx * cny * cnz); + nx = cnx; + ny = cny; + nz = cnz; + hmin_width = cmin_width; + } + } + + nxyz[0] = nx; + nxyz[1] = ny; + nxyz[2] = nz; + + return nx * ny * nz; +#undef SEARCH_SIZE +} +#elif 1 // Zhihui's idea one on 2013-09-25 +{ + int nx, ny, nz; + int hmin_width; + hmin_width = Mymin(min_width[0], min_width[1]); + hmin_width = Mymin(hmin_width, min_width[2]); + nx = shape[0] / hmin_width; + if (nx * hmin_width < shape[0]) + nx++; + ny = shape[1] / hmin_width; + if (ny * hmin_width < shape[1]) + ny++; + nz = shape[2] / hmin_width; + if (nz * hmin_width < shape[2]) + nz++; + while (nx * ny * nz > cpusize) + { + hmin_width++; + nx = shape[0] / hmin_width; + if (nx * hmin_width < shape[0]) + nx++; + ny = shape[1] / hmin_width; + if (ny * hmin_width < shape[1]) + ny++; + nz = shape[2] / hmin_width; + if (nz * hmin_width < shape[2]) + nz++; + } + + nxyz[0] = nx; + nxyz[1] = ny; + nxyz[2] = nz; + + return nx * ny * nz; +} +#elif 1 // Zhihui's idea two on 2013-09-25 +{ + int nx, ny, nz; + const int hmin_width = 8; // for example we use 8 + nx = shape[0] / hmin_width; + if (nx * hmin_width < shape[0]) + nx++; + ny = shape[1] / hmin_width; + if (ny * hmin_width < shape[1]) + ny++; + nz = shape[2] / hmin_width; + if (nz * hmin_width < shape[2]) + nz++; + + nxyz[0] = nx; + nxyz[1] = ny; + nxyz[2] = nz; + + return nx * ny * nz; +} +#endif +// distribute the data to cprocessors +#if (PSTR == 0) +MyList *Parallel::distribute(MyList *PatchLIST, int cpusize, int ingfsi, int fngfsi, + bool periodic, int nodes) +{ +#ifdef USE_GPU_DIVIDE + double cpu_part, gpu_part; + map::iterator iter; + iter = parameters::dou_par.find("cpu part"); + if (iter != parameters::dou_par.end()) + { + cpu_part = iter->second; + } + else + { + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + // read parameter from file + const int LEN = 256; + char pline[LEN]; + string str, sgrp, skey, sval; + int sind; + char pname[50]; + { + map::iterator iter = parameters::str_par.find("inputpar"); + if (iter != parameters::str_par.end()) + { + strcpy(pname, (iter->second).c_str()); + } + else + { + cout << "Error inputpar" << endl; + exit(0); + } + } + ifstream inf(pname, ifstream::in); + if (!inf.good() && myrank == 0) + { + cout << "Can not open parameter file " << pname << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + for (int i = 1; inf.good(); i++) + { + inf.getline(pline, LEN); + str = pline; + + int status = misc::parse_parts(str, sgrp, skey, sval, sind); + if (status == -1) + { + cout << "error reading parameter file " << pname << " in line " << i << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + else if (status == 0) + continue; + + if (sgrp == "ABE") + { + if (skey == "cpu part") + cpu_part = atof(sval.c_str()); + } + } + inf.close(); + + parameters::dou_par.insert(map::value_type("cpu part", cpu_part)); + } + iter = parameters::dou_par.find("gpu part"); + if (iter != parameters::dou_par.end()) + { + gpu_part = iter->second; + } + else + { + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + // read parameter from file + const int LEN = 256; + char pline[LEN]; + string str, sgrp, skey, sval; + int sind; + char pname[50]; + { + map::iterator iter = parameters::str_par.find("inputpar"); + if (iter != parameters::str_par.end()) + { + strcpy(pname, (iter->second).c_str()); + } + else + { + cout << "Error inputpar" << endl; + exit(0); + } + } + ifstream inf(pname, ifstream::in); + if (!inf.good() && myrank == 0) + { + cout << "Can not open parameter file " << pname << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + for (int i = 1; inf.good(); i++) + { + inf.getline(pline, LEN); + str = pline; + + int status = misc::parse_parts(str, sgrp, skey, sval, sind); + if (status == -1) + { + cout << "error reading parameter file " << pname << " in line " << i << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + else if (status == 0) + continue; + + if (sgrp == "ABE") + { + if (skey == "gpu part") + gpu_part = atof(sval.c_str()); + } + } + inf.close(); + + parameters::dou_par.insert(map::value_type("gpu part", gpu_part)); + } + + if (nodes == 0) + nodes = cpusize / 2; +#else + if (nodes == 0) + nodes = cpusize; +#endif + + if (dim != 3) + { + cout << "distrivute: now we only support 3-dimension" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + MyList *BlL = 0; + + int split_size, min_size, block_size = 0; + + int min_width = 2 * Mymax(ghost_width, buffer_width); + int nxyz[dim], mmin_width[dim], min_shape[dim]; + + MyList *PLi = PatchLIST; + for (int i = 0; i < dim; i++) + min_shape[i] = PLi->data->shape[i]; + int lev = PLi->data->lev; + PLi = PLi->next; + while (PLi) + { + Patch *PP = PLi->data; + for (int i = 0; i < dim; i++) + min_shape[i] = Mymin(min_shape[i], PP->shape[i]); + if (lev != PLi->data->lev) + cout << "Parallel::distribute CAUSTION: meet Patches for different level: " << lev << " and " << PLi->data->lev << endl; + PLi = PLi->next; + } + + for (int i = 0; i < dim; i++) + mmin_width[i] = Mymin(min_width, min_shape[i]); + + min_size = mmin_width[0]; + for (int i = 1; i < dim; i++) + min_size = min_size * mmin_width[i]; + + PLi = PatchLIST; + while (PLi) + { + Patch *PP = PLi->data; + // PP->checkPatch(true); + int bs = PP->shape[0]; + for (int i = 1; i < dim; i++) + bs = bs * PP->shape[i]; + block_size = block_size + bs; + PLi = PLi->next; + } + split_size = Mymax(min_size, block_size / nodes); + split_size = Mymax(1, split_size); + + int n_rank = 0; + PLi = PatchLIST; + int reacpu = 0; + while (PLi) + { + Patch *PP = PLi->data; + + reacpu += partition3(nxyz, split_size, mmin_width, nodes, PP->shape); + + Block *ng0, *ng; + int shape_here[dim], ibbox_here[2 * dim]; + double bbox_here[2 * dim], dd; + + // ibbox : 0,...N-1 + for (int i = 0; i < nxyz[0]; i++) + for (int j = 0; j < nxyz[1]; j++) + for (int k = 0; k < nxyz[2]; k++) + { + ibbox_here[0] = (PP->shape[0] * i) / nxyz[0]; + ibbox_here[3] = (PP->shape[0] * (i + 1)) / nxyz[0] - 1; + ibbox_here[1] = (PP->shape[1] * j) / nxyz[1]; + ibbox_here[4] = (PP->shape[1] * (j + 1)) / nxyz[1] - 1; + ibbox_here[2] = (PP->shape[2] * k) / nxyz[2]; + ibbox_here[5] = (PP->shape[2] * (k + 1)) / nxyz[2] - 1; + + if (periodic) + { + ibbox_here[0] = ibbox_here[0] - ghost_width; + ibbox_here[3] = ibbox_here[3] + ghost_width; + ibbox_here[1] = ibbox_here[1] - ghost_width; + ibbox_here[4] = ibbox_here[4] + ghost_width; + ibbox_here[2] = ibbox_here[2] - ghost_width; + ibbox_here[5] = ibbox_here[5] + ghost_width; + } + else + { + ibbox_here[0] = Mymax(0, ibbox_here[0] - ghost_width); + ibbox_here[3] = Mymin(PP->shape[0] - 1, ibbox_here[3] + ghost_width); + ibbox_here[1] = Mymax(0, ibbox_here[1] - ghost_width); + ibbox_here[4] = Mymin(PP->shape[1] - 1, ibbox_here[4] + ghost_width); + ibbox_here[2] = Mymax(0, ibbox_here[2] - ghost_width); + ibbox_here[5] = Mymin(PP->shape[2] - 1, ibbox_here[5] + ghost_width); + } + + shape_here[0] = ibbox_here[3] - ibbox_here[0] + 1; + shape_here[1] = ibbox_here[4] - ibbox_here[1] + 1; + shape_here[2] = ibbox_here[5] - ibbox_here[2] + 1; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + // 0--4, 5--10 + dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1); + bbox_here[0] = PP->bbox[0] + ibbox_here[0] * dd; + bbox_here[3] = PP->bbox[0] + ibbox_here[3] * dd; + + dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1); + bbox_here[1] = PP->bbox[1] + ibbox_here[1] * dd; + bbox_here[4] = PP->bbox[1] + ibbox_here[4] * dd; + + dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1); + bbox_here[2] = PP->bbox[2] + ibbox_here[2] * dd; + bbox_here[5] = PP->bbox[2] + ibbox_here[5] * dd; +#else +#ifdef Cell + // 0--5, 5--10 + dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0]; + bbox_here[0] = PP->bbox[0] + (ibbox_here[0]) * dd; + bbox_here[3] = PP->bbox[0] + (ibbox_here[3] + 1) * dd; + + dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1]; + bbox_here[1] = PP->bbox[1] + (ibbox_here[1]) * dd; + bbox_here[4] = PP->bbox[1] + (ibbox_here[4] + 1) * dd; + + dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2]; + bbox_here[2] = PP->bbox[2] + (ibbox_here[2]) * dd; + bbox_here[5] = PP->bbox[2] + (ibbox_here[5] + 1) * dd; +#else +#error Not define Vertex nor Cell +#endif +#endif + +#ifdef USE_GPU_DIVIDE + { + const int pices = 2; + double picef[pices]; + picef[0] = cpu_part; + picef[1] = gpu_part; + int shape_res[dim * pices]; + double bbox_res[2 * dim * pices]; + misc::dividBlock(dim, shape_here, bbox_here, pices, picef, shape_res, bbox_res, min_width); + ng = ng0 = new Block(dim, shape_res, bbox_res, n_rank++, ingfsi, fngfsi, PP->lev, 0); // delete through KillBlocks + + // if(n_rank==cpusize) {n_rank=0; cerr<<"place one!!"<checkBlock(); + if (BlL) + BlL->insert(ng); + else + BlL = new MyList(ng); // delete through KillBlocks + + for (int i = 1; i < pices; i++) + { + ng = new Block(dim, shape_res + i * dim, bbox_res + i * 2 * dim, n_rank++, ingfsi, fngfsi, PP->lev, i); // delete through KillBlocks + // if(n_rank==cpusize) {n_rank=0; cerr<<"place two!! "<checkBlock(); + BlL->insert(ng); + } + } +#else + ng = ng0 = new Block(dim, shape_here, bbox_here, n_rank++, ingfsi, fngfsi, PP->lev); + // ng->checkBlock(); + if (BlL) + BlL->insert(ng); + else + BlL = new MyList(ng); // delete through KillBlocks +#endif + if (n_rank == cpusize) + n_rank = 0; + + // set PP->blb + if (i == 0 && j == 0 && k == 0) + { + MyList *Bp = BlL; + while (Bp->data != ng0) + Bp = Bp->next; // ng0 is the first of the pices list + PP->blb = Bp; + } + } + // set PP->ble + { + MyList *Bp = BlL; + while (Bp->data != ng) + Bp = Bp->next; // ng is the last of the pices list + PP->ble = Bp; + } + PLi = PLi->next; + } + if (reacpu < nodes * 2 / 3) + { + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + if (myrank == 0) + cout << "Parallel::distribute CAUSTION: level#" << lev << " uses essencially " << reacpu << " processors vs " << nodes << " nodes run, your scientific computation scale is not as large as you estimate." << endl; + } + + return BlL; +} + +#ifdef INTERP_LB_OPTIMIZE +#include "interp_lb_profile_data.h" + +MyList *Parallel::distribute_optimize(MyList *PatchLIST, int cpusize, int ingfsi, int fngfsi, + bool periodic, int nodes) +{ +#ifdef USE_GPU_DIVIDE + double cpu_part, gpu_part; + map::iterator iter; + iter = parameters::dou_par.find("cpu part"); + if (iter != parameters::dou_par.end()) + { + cpu_part = iter->second; + } + else + { + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + const int LEN = 256; + char pline[LEN]; + string str, sgrp, skey, sval; + int sind; + char pname[50]; + { + map::iterator iter = parameters::str_par.find("inputpar"); + if (iter != parameters::str_par.end()) + strcpy(pname, (iter->second).c_str()); + else { cout << "Error inputpar" << endl; exit(0); } + } + ifstream inf(pname, ifstream::in); + if (!inf.good() && myrank == 0) + { cout << "Can not open parameter file " << pname << endl; MPI_Abort(MPI_COMM_WORLD, 1); } + for (int i = 1; inf.good(); i++) + { + inf.getline(pline, LEN); str = pline; + int status = misc::parse_parts(str, sgrp, skey, sval, sind); + if (status == -1) { cout << "error reading parameter file " << pname << " in line " << i << endl; MPI_Abort(MPI_COMM_WORLD, 1); } + else if (status == 0) continue; + if (sgrp == "ABE") { if (skey == "cpu part") cpu_part = atof(sval.c_str()); } + } + inf.close(); + parameters::dou_par.insert(map::value_type("cpu part", cpu_part)); + } + iter = parameters::dou_par.find("gpu part"); + if (iter != parameters::dou_par.end()) + { + gpu_part = iter->second; + } + else + { + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + const int LEN = 256; + char pline[LEN]; + string str, sgrp, skey, sval; + int sind; + char pname[50]; + { + map::iterator iter = parameters::str_par.find("inputpar"); + if (iter != parameters::str_par.end()) + strcpy(pname, (iter->second).c_str()); + else { cout << "Error inputpar" << endl; exit(0); } + } + ifstream inf(pname, ifstream::in); + if (!inf.good() && myrank == 0) + { cout << "Can not open parameter file " << pname << endl; MPI_Abort(MPI_COMM_WORLD, 1); } + for (int i = 1; inf.good(); i++) + { + inf.getline(pline, LEN); str = pline; + int status = misc::parse_parts(str, sgrp, skey, sval, sind); + if (status == -1) { cout << "error reading parameter file " << pname << " in line " << i << endl; MPI_Abort(MPI_COMM_WORLD, 1); } + else if (status == 0) continue; + if (sgrp == "ABE") { if (skey == "gpu part") gpu_part = atof(sval.c_str()); } + } + inf.close(); + parameters::dou_par.insert(map::value_type("gpu part", gpu_part)); + } + if (nodes == 0) nodes = cpusize / 2; +#else + if (nodes == 0) nodes = cpusize; +#endif + + if (dim != 3) + { + cout << "distrivute: now we only support 3-dimension" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + MyList *BlL = 0; + int split_size, min_size, block_size = 0; + int min_width = 2 * Mymax(ghost_width, buffer_width); + int nxyz[dim], mmin_width[dim], min_shape[dim]; + + MyList *PLi = PatchLIST; + for (int i = 0; i < dim; i++) + min_shape[i] = PLi->data->shape[i]; + int lev = PLi->data->lev; + PLi = PLi->next; + while (PLi) + { + Patch *PP = PLi->data; + for (int i = 0; i < dim; i++) + min_shape[i] = Mymin(min_shape[i], PP->shape[i]); + if (lev != PLi->data->lev) + cout << "Parallel::distribute CAUSTION: meet Patches for different level: " << lev << " and " << PLi->data->lev << endl; + PLi = PLi->next; + } + + for (int i = 0; i < dim; i++) + mmin_width[i] = Mymin(min_width, min_shape[i]); + min_size = mmin_width[0]; + for (int i = 1; i < dim; i++) + min_size = min_size * mmin_width[i]; + + PLi = PatchLIST; + while (PLi) + { + Patch *PP = PLi->data; + int bs = PP->shape[0]; + for (int i = 1; i < dim; i++) + bs = bs * PP->shape[i]; + block_size = block_size + bs; + PLi = PLi->next; + } + split_size = Mymax(min_size, block_size / nodes); + split_size = Mymax(1, split_size); + + int n_rank = 0; + PLi = PatchLIST; + int reacpu = 0; + int current_block_id = 0; + while (PLi) { + Block *ng0, *ng; + bool first_block_in_patch = true; + Patch *PP = PLi->data; + reacpu += partition3(nxyz, split_size, mmin_width, nodes, PP->shape); + + for (int i = 0; i < nxyz[0]; i++) + for (int j = 0; j < nxyz[1]; j++) + for (int k = 0; k < nxyz[2]; k++) + { + int ibbox_here[6], shape_here[3]; + double bbox_here[6], dd; + Block *current_ng_start = nullptr; + + bool is_heavy = false; + int r_l = -1, r_r = -1; + if (cpusize == INTERP_LB_NPROCS) { + for (int si = 0; si < INTERP_LB_NUM_HEAVY; si++) { + if (current_block_id == interp_lb_splits[si][0]) { + is_heavy = true; + r_l = interp_lb_splits[si][1]; + r_r = interp_lb_splits[si][2]; + break; + } + } + } + + if (is_heavy) + { + int ib0 = (PP->shape[0] * i) / nxyz[0]; + int ib3 = (PP->shape[0] * (i + 1)) / nxyz[0] - 1; + int jb1 = (PP->shape[1] * j) / nxyz[1]; + int jb4 = (PP->shape[1] * (j + 1)) / nxyz[1] - 1; + int kb2 = (PP->shape[2] * k) / nxyz[2]; + int kb5 = (PP->shape[2] * (k + 1)) / nxyz[2] - 1; + + Block *split_first_block = nullptr; + Block *split_last_block = nullptr; + splitHotspotBlock(BlL, dim, ib0, ib3, jb1, jb4, kb2, kb5, + PP, r_l, r_r, ingfsi, fngfsi, periodic, + split_first_block, split_last_block); + + current_ng_start = split_first_block; + ng = split_last_block; + } + else + { + ibbox_here[0] = (PP->shape[0] * i) / nxyz[0]; + ibbox_here[3] = (PP->shape[0] * (i + 1)) / nxyz[0] - 1; + ibbox_here[1] = (PP->shape[1] * j) / nxyz[1]; + ibbox_here[4] = (PP->shape[1] * (j + 1)) / nxyz[1] - 1; + ibbox_here[2] = (PP->shape[2] * k) / nxyz[2]; + ibbox_here[5] = (PP->shape[2] * (k + 1)) / nxyz[2] - 1; + + if (periodic) { + for(int d=0; d<3; d++) { + ibbox_here[d] -= ghost_width; + ibbox_here[d+3] += ghost_width; + } + } else { + ibbox_here[0] = Mymax(0, ibbox_here[0] - ghost_width); + ibbox_here[3] = Mymin(PP->shape[0] - 1, ibbox_here[3] + ghost_width); + ibbox_here[1] = Mymax(0, ibbox_here[1] - ghost_width); + ibbox_here[4] = Mymin(PP->shape[1] - 1, ibbox_here[4] + ghost_width); + ibbox_here[2] = Mymax(0, ibbox_here[2] - ghost_width); + ibbox_here[5] = Mymin(PP->shape[2] - 1, ibbox_here[5] + ghost_width); + } + + for(int d=0; d<3; d++) shape_here[d] = ibbox_here[d+3] - ibbox_here[d] + 1; + +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1); + bbox_here[0] = PP->bbox[0] + ibbox_here[0] * dd; + bbox_here[3] = PP->bbox[0] + ibbox_here[3] * dd; + dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1); + bbox_here[1] = PP->bbox[1] + ibbox_here[1] * dd; + bbox_here[4] = PP->bbox[1] + ibbox_here[4] * dd; + dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1); + bbox_here[2] = PP->bbox[2] + ibbox_here[2] * dd; + bbox_here[5] = PP->bbox[2] + ibbox_here[5] * dd; +#else +#ifdef Cell + dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0]; + bbox_here[0] = PP->bbox[0] + (ibbox_here[0]) * dd; + bbox_here[3] = PP->bbox[0] + (ibbox_here[3] + 1) * dd; + dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1]; + bbox_here[1] = PP->bbox[1] + (ibbox_here[1]) * dd; + bbox_here[4] = PP->bbox[1] + (ibbox_here[4] + 1) * dd; + dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2]; + bbox_here[2] = PP->bbox[2] + (ibbox_here[2]) * dd; + bbox_here[5] = PP->bbox[2] + (ibbox_here[5] + 1) * dd; +#else +#error Not define Vertex nor Cell +#endif +#endif + ng = createMappedBlock(BlL, dim, shape_here, bbox_here, + current_block_id, ingfsi, fngfsi, PP->lev); + current_ng_start = ng; + } + + if (first_block_in_patch) { + ng0 = current_ng_start; + MyList *Bp_start = BlL; + while (Bp_start && Bp_start->data != ng0) Bp_start = Bp_start->next; + PP->blb = Bp_start; + first_block_in_patch = false; + } + + current_block_id++; + } + + { + MyList *Bp_end = BlL; + while (Bp_end && Bp_end->data != ng) Bp_end = Bp_end->next; + PP->ble = Bp_end; + } + + PLi = PLi->next; + } + if (reacpu < nodes * 2 / 3) + { + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + if (myrank == 0) + cout << "Parallel::distribute CAUSTION: level#" << lev << " uses essencially " << reacpu << " processors vs " << nodes << " nodes run, your scientific computation scale is not as large as you estimate." << endl; + } + + return BlL; +} + +Block* Parallel::splitHotspotBlock(MyList* &BlL, int _dim, + int ib0_orig, int ib3_orig, + int jb1_orig, int jb4_orig, + int kb2_orig, int kb5_orig, + Patch* PP, int r_left, int r_right, + int ingfsi, int fngfsi, bool periodic, + Block* &split_first_block, Block* &split_last_block) +{ + int mid = (ib0_orig + ib3_orig) / 2; + + int indices_L[6] = {ib0_orig, jb1_orig, kb2_orig, mid, jb4_orig, kb5_orig}; + int indices_R[6] = {mid + 1, jb1_orig, kb2_orig, ib3_orig, jb4_orig, kb5_orig}; + + auto createSubBlock = [&](int* ib_raw, int target_rank) { + int ib_final[6]; + int sh_here[3]; + double bb_here[6], dd; + + if (periodic) { + ib_final[0] = ib_raw[0] - ghost_width; + ib_final[3] = ib_raw[3] + ghost_width; + ib_final[1] = ib_raw[1] - ghost_width; + ib_final[4] = ib_raw[4] + ghost_width; + ib_final[2] = ib_raw[2] - ghost_width; + ib_final[5] = ib_raw[5] + ghost_width; + } else { + ib_final[0] = Mymax(0, ib_raw[0] - ghost_width); + ib_final[3] = Mymin(PP->shape[0] - 1, ib_raw[3] + ghost_width); + ib_final[1] = Mymax(0, ib_raw[1] - ghost_width); + ib_final[4] = Mymin(PP->shape[1] - 1, ib_raw[4] + ghost_width); + ib_final[2] = Mymax(0, ib_raw[2] - ghost_width); + ib_final[5] = Mymin(PP->shape[2] - 1, ib_raw[5] + ghost_width); + } + + sh_here[0] = ib_final[3] - ib_final[0] + 1; + sh_here[1] = ib_final[4] - ib_final[1] + 1; + sh_here[2] = ib_final[5] - ib_final[2] + 1; + +#ifdef Vertex + dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1); + bb_here[0] = PP->bbox[0] + ib_final[0] * dd; + bb_here[3] = PP->bbox[0] + ib_final[3] * dd; + dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1); + bb_here[1] = PP->bbox[1] + ib_final[1] * dd; + bb_here[4] = PP->bbox[1] + ib_final[4] * dd; + dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1); + bb_here[2] = PP->bbox[2] + ib_final[2] * dd; + bb_here[5] = PP->bbox[2] + ib_final[5] * dd; +#else +#ifdef Cell + dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0]; + bb_here[0] = PP->bbox[0] + ib_final[0] * dd; + bb_here[3] = PP->bbox[0] + (ib_final[3] + 1) * dd; + dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1]; + bb_here[1] = PP->bbox[1] + ib_final[1] * dd; + bb_here[4] = PP->bbox[1] + (ib_final[4] + 1) * dd; + dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2]; + bb_here[2] = PP->bbox[2] + ib_final[2] * dd; + bb_here[5] = PP->bbox[2] + (ib_final[5] + 1) * dd; +#endif +#endif + + Block* Bg = new Block(dim, sh_here, bb_here, target_rank, ingfsi, fngfsi, PP->lev); + if (BlL) BlL->insert(Bg); + else BlL = new MyList(Bg); + + return Bg; + }; + + split_first_block = createSubBlock(indices_L, r_left); + split_last_block = createSubBlock(indices_R, r_right); + return split_last_block; +} + +Block* Parallel::createMappedBlock(MyList* &BlL, int _dim, int* shape, double* bbox, + int block_id, int ingfsi, int fngfsi, int lev) +{ + int target_rank = block_id; + if (INTERP_LB_NPROCS > 0) { + for (int ri = 0; ri < interp_lb_num_remaps; ri++) { + if (block_id == interp_lb_remaps[ri][0]) { + target_rank = interp_lb_remaps[ri][1]; + break; + } + } + } + + Block* ng = new Block(dim, shape, bbox, target_rank, ingfsi, fngfsi, lev); + if (BlL) BlL->insert(ng); + else BlL = new MyList(ng); + + return ng; +} +#else +// When INTERP_LB_OPTIMIZE is not defined, distribute_optimize falls back to distribute +MyList *Parallel::distribute_optimize(MyList *PatchLIST, int cpusize, int ingfsi, int fngfsi, + bool periodic, int nodes) +{ + return distribute(PatchLIST, cpusize, ingfsi, fngfsi, periodic, nodes); +} +Block* Parallel::splitHotspotBlock(MyList* &BlL, int _dim, + int ib0_orig, int ib3_orig, + int jb1_orig, int jb4_orig, + int kb2_orig, int kb5_orig, + Patch* PP, int r_left, int r_right, + int ingfsi, int fngfsi, bool periodic, + Block* &split_first_block, Block* &split_last_block) +{ return nullptr; } +Block* Parallel::createMappedBlock(MyList* &BlL, int _dim, int* shape, double* bbox, + int block_id, int ingfsi, int fngfsi, int lev) +{ return nullptr; } +#endif + +#elif (PSTR == 1 || PSTR == 2 || PSTR == 3) +MyList *Parallel::distribute(MyList *PatchLIST, int cpusize, int ingfsi, int fngfsi, + bool periodic, int start_rank, int end_rank, int nodes) +{ +#ifdef USE_GPU_DIVIDE + double cpu_part, gpu_part; + map::iterator iter; + iter = parameters::dou_par.find("cpu part"); + if (iter != parameters::dou_par.end()) + { + cpu_part = iter->second; + } + else + { + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + // read parameter from file + const int LEN = 256; + char pline[LEN]; + string str, sgrp, skey, sval; + int sind; + char pname[50]; + { + map::iterator iter = parameters::str_par.find("inputpar"); + if (iter != parameters::str_par.end()) + { + strcpy(pname, (iter->second).c_str()); + } + else + { + cout << "Error inputpar" << endl; + exit(0); + } + } + ifstream inf(pname, ifstream::in); + if (!inf.good() && myrank == 0) + { + cout << "Can not open parameter file " << pname << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + for (int i = 1; inf.good(); i++) + { + inf.getline(pline, LEN); + str = pline; + + int status = misc::parse_parts(str, sgrp, skey, sval, sind); + if (status == -1) + { + cout << "error reading parameter file " << pname << " in line " << i << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + else if (status == 0) + continue; + + if (sgrp == "ABE") + { + if (skey == "cpu part") + cpu_part = atof(sval.c_str()); + } + } + inf.close(); + + parameters::dou_par.insert(map::value_type("cpu part", cpu_part)); + } + iter = parameters::dou_par.find("gpu part"); + if (iter != parameters::dou_par.end()) + { + gpu_part = iter->second; + } + else + { + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + // read parameter from file + const int LEN = 256; + char pline[LEN]; + string str, sgrp, skey, sval; + int sind; + char pname[50]; + { + map::iterator iter = parameters::str_par.find("inputpar"); + if (iter != parameters::str_par.end()) + { + strcpy(pname, (iter->second).c_str()); + } + else + { + cout << "Error inputpar" << endl; + exit(0); + } + } + ifstream inf(pname, ifstream::in); + if (!inf.good() && myrank == 0) + { + cout << "Can not open parameter file " << pname << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + for (int i = 1; inf.good(); i++) + { + inf.getline(pline, LEN); + str = pline; + + int status = misc::parse_parts(str, sgrp, skey, sval, sind); + if (status == -1) + { + cout << "error reading parameter file " << pname << " in line " << i << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + else if (status == 0) + continue; + + if (sgrp == "ABE") + { + if (skey == "gpu part") + gpu_part = atof(sval.c_str()); + } + } + inf.close(); + + parameters::dou_par.insert(map::value_type("gpu part", gpu_part)); + } + + if (nodes == 0) + nodes = cpusize / 2; +#else + if (nodes == 0) + nodes = cpusize; +#endif + + if (dim != 3) + { + cout << "distrivute: now we only support 3-dimension" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + MyList *BlL = 0; + + int split_size, min_size, block_size = 0; + + int min_width = 2 * Mymax(ghost_width, buffer_width); + int nxyz[dim], mmin_width[dim], min_shape[dim]; + + MyList *PLi = PatchLIST; + for (int i = 0; i < dim; i++) + min_shape[i] = PLi->data->shape[i]; + int lev = PLi->data->lev; + PLi = PLi->next; + while (PLi) + { + Patch *PP = PLi->data; + for (int i = 0; i < dim; i++) + min_shape[i] = Mymin(min_shape[i], PP->shape[i]); + if (lev != PLi->data->lev) + cout << "Parallel::distribute CAUSTION: meet Patches for different level: " << lev << " and " << PLi->data->lev << endl; + PLi = PLi->next; + } + + for (int i = 0; i < dim; i++) + mmin_width[i] = Mymin(min_width, min_shape[i]); + + min_size = mmin_width[0]; + for (int i = 1; i < dim; i++) + min_size = min_size * mmin_width[i]; + + PLi = PatchLIST; + while (PLi) + { + Patch *PP = PLi->data; + // PP->checkPatch(true); + int bs = PP->shape[0]; + for (int i = 1; i < dim; i++) + bs = bs * PP->shape[i]; + block_size = block_size + bs; + PLi = PLi->next; + } + split_size = Mymax(min_size, block_size / cpusize); + split_size = Mymax(1, split_size); + + int n_rank = start_rank; + PLi = PatchLIST; + int reacpu = 0; + while (PLi) + { + Patch *PP = PLi->data; + + reacpu += partition3(nxyz, split_size, mmin_width, cpusize, PP->shape); + + Block *ng, *ng0; + int shape_here[dim], ibbox_here[2 * dim]; + double bbox_here[2 * dim], dd; + + // ibbox : 0,...N-1 + for (int i = 0; i < nxyz[0]; i++) + for (int j = 0; j < nxyz[1]; j++) + for (int k = 0; k < nxyz[2]; k++) + { + ibbox_here[0] = (PP->shape[0] * i) / nxyz[0]; + ibbox_here[3] = (PP->shape[0] * (i + 1)) / nxyz[0] - 1; + ibbox_here[1] = (PP->shape[1] * j) / nxyz[1]; + ibbox_here[4] = (PP->shape[1] * (j + 1)) / nxyz[1] - 1; + ibbox_here[2] = (PP->shape[2] * k) / nxyz[2]; + ibbox_here[5] = (PP->shape[2] * (k + 1)) / nxyz[2] - 1; + + if (periodic) + { + ibbox_here[0] = ibbox_here[0] - ghost_width; + ibbox_here[3] = ibbox_here[3] + ghost_width; + ibbox_here[1] = ibbox_here[1] - ghost_width; + ibbox_here[4] = ibbox_here[4] + ghost_width; + ibbox_here[2] = ibbox_here[2] - ghost_width; + ibbox_here[5] = ibbox_here[5] + ghost_width; + } + else + { + ibbox_here[0] = Mymax(0, ibbox_here[0] - ghost_width); + ibbox_here[3] = Mymin(PP->shape[0] - 1, ibbox_here[3] + ghost_width); + ibbox_here[1] = Mymax(0, ibbox_here[1] - ghost_width); + ibbox_here[4] = Mymin(PP->shape[1] - 1, ibbox_here[4] + ghost_width); + ibbox_here[2] = Mymax(0, ibbox_here[2] - ghost_width); + ibbox_here[5] = Mymin(PP->shape[2] - 1, ibbox_here[5] + ghost_width); + } + + shape_here[0] = ibbox_here[3] - ibbox_here[0] + 1; + shape_here[1] = ibbox_here[4] - ibbox_here[1] + 1; + shape_here[2] = ibbox_here[5] - ibbox_here[2] + 1; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + // 0--4, 5--10 + dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1); + bbox_here[0] = PP->bbox[0] + ibbox_here[0] * dd; + bbox_here[3] = PP->bbox[0] + ibbox_here[3] * dd; + + dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1); + bbox_here[1] = PP->bbox[1] + ibbox_here[1] * dd; + bbox_here[4] = PP->bbox[1] + ibbox_here[4] * dd; + + dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1); + bbox_here[2] = PP->bbox[2] + ibbox_here[2] * dd; + bbox_here[5] = PP->bbox[2] + ibbox_here[5] * dd; +#else +#ifdef Cell + // 0--5, 5--10 + dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0]; + bbox_here[0] = PP->bbox[0] + (ibbox_here[0]) * dd; + bbox_here[3] = PP->bbox[0] + (ibbox_here[3] + 1) * dd; + + dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1]; + bbox_here[1] = PP->bbox[1] + (ibbox_here[1]) * dd; + bbox_here[4] = PP->bbox[1] + (ibbox_here[4] + 1) * dd; + + dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2]; + bbox_here[2] = PP->bbox[2] + (ibbox_here[2]) * dd; + bbox_here[5] = PP->bbox[2] + (ibbox_here[5] + 1) * dd; +#else +#error Not define Vertex nor Cell +#endif +#endif + +#ifdef USE_GPU_DIVIDE + { + const int pices = 2; + double picef[pices]; + picef[0] = cpu_part; + picef[1] = gpu_part; + int shape_res[dim * pices]; + double bbox_res[2 * dim * pices]; + misc::dividBlock(dim, shape_here, bbox_here, pices, picef, shape_res, bbox_res, min_width); + ng = ng0 = new Block(dim, shape_res, bbox_res, n_rank++, ingfsi, fngfsi, PP->lev, 0); // delete through KillBlocks + // ng->checkBlock(); + if (BlL) + BlL->insert(ng); + else + BlL = new MyList(ng); // delete through KillBlocks + + for (int i = 1; i < pices; i++) + { + ng = new Block(dim, shape_res + i * dim, bbox_res + i * 2 * dim, n_rank++, ingfsi, fngfsi, PP->lev, i); // delete through KillBlocks + // ng->checkBlock(); + BlL->insert(ng); + } + } +#else + ng = ng0 = new Block(dim, shape_here, bbox_here, n_rank++, ingfsi, fngfsi, PP->lev); // delete through KillBlocks + // ng->checkBlock(); + if (BlL) + BlL->insert(ng); + else + BlL = new MyList(ng); // delete through KillBlocks +#endif + + if (n_rank == end_rank + 1) + n_rank = start_rank; + + // set PP->blb + if (i == 0 && j == 0 && k == 0) + { + MyList *Bp = BlL; + while (Bp->data != ng0) + Bp = Bp->next; // ng0 is the first of the pices list + PP->blb = Bp; + } + } + // set PP->ble + { + MyList *Bp = BlL; + while (Bp->data != ng) + Bp = Bp->next; // ng is the last of the pices list + PP->ble = Bp; + } + PLi = PLi->next; + } + if (reacpu < nodes * 2 / 3) + { + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + if (myrank == start_rank) + cout << "Parallel::distribute CAUSTION: level#" << lev << " uses essencially " << reacpu << " processors vs " << nodes << " nodes run, your scientific computation scale is not as large as you estimate." << endl; + } + + return BlL; +} +#endif +void Parallel::setfunction(MyList *BlL, var *vn, double func(double x, double y, double z)) +{ + while (BlL) + { + if (BlL->data->X[0]) + { + int nn = BlL->data->shape[0] * BlL->data->shape[1] * BlL->data->shape[2]; + double *p = BlL->data->fgfs[vn->sgfn]; + for (int i = 0; i < nn; i++) + { + int ind[3]; + getarrayindex(3, BlL->data->shape, ind, i); + p[i] = func(BlL->data->X[0][ind[0]], BlL->data->X[1][ind[1]], BlL->data->X[2][ind[2]]); + } + } + BlL = BlL->next; + } +} +// set function only for cpu rank +void Parallel::setfunction(int rank, MyList *BlL, var *vn, double func(double x, double y, double z)) +{ + while (BlL) + { + if (BlL->data->X[0] && BlL->data->rank == rank) + { + int nn = BlL->data->shape[0] * BlL->data->shape[1] * BlL->data->shape[2]; + double *p = BlL->data->fgfs[vn->sgfn]; + for (int i = 0; i < nn; i++) + { + int ind[3]; + getarrayindex(3, BlL->data->shape, ind, i); + p[i] = func(BlL->data->X[0][ind[0]], BlL->data->X[1][ind[1]], BlL->data->X[2][ind[2]]); + } + } + BlL = BlL->next; + } +} +void Parallel::getarrayindex(int DIM, int *shape, int *index, int n) +{ + // we assume index has already memory space + int *mu; + mu = new int[DIM]; + mu[0] = 1; + for (int i = 1; i < DIM; i++) + mu[i] = mu[i - 1] * shape[i - 1]; + for (int i = DIM - 1; i >= 0; i--) + { + index[i] = n / mu[i]; + n = n - index[i] * mu[i]; + } + + delete[] mu; +} +int Parallel::getarraylocation(int DIM, int *shape, int *index) +{ + int n, mu; + mu = shape[0]; + n = index[0]; + for (int i = 1; i < DIM; i++) + { + n = n + index[i] * mu; + mu = mu * shape[i]; + } + + return n; +} +void Parallel::copy(int DIM, double *llbout, double *uubout, int *Dshape, double *DD, double *llbin, double *uubin, + int *shape, double *datain, double *llb, double *uub) +{ + // for 3 dimensional case, based on simple test, I found this is half slower than f90 code + int *illi, *iuui; + int *illo, *iuuo; + int *indi, *indo; + illi = new int[DIM]; + iuui = new int[DIM]; + illo = new int[DIM]; + iuuo = new int[DIM]; + indi = new int[DIM]; + indo = new int[DIM]; + + int ial = 1; + for (int i = 0; i < DIM; i++) + { + double ho, hi; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + ho = (uubout[i] - llbout[i]) / (Dshape[i] - 1); + hi = (uubin[i] - llbin[i]) / (shape[i] - 1); +#else +#ifdef Cell + ho = (uubout[i] - llbout[i]) / Dshape[i]; + hi = (uubin[i] - llbin[i]) / shape[i]; +#else +#error Not define Vertex nor Cell +#endif +#endif + illo[i] = int((llb[i] - llbout[i]) / ho); + iuuo[i] = Dshape[i] - 1 - int((uubout[i] - uub[i]) / ho); + illi[i] = int((llb[i] - llbin[i]) / hi); + iuui[i] = shape[i] - 1 - int((uubin[i] - uub[i]) / hi); + + if (illo[i] > iuuo[i] || illi[i] > iuui[i] || illo[i] < 0 || illi[i] < 0 || + iuui[i] >= shape[i] || iuuo[i] >= Dshape[i]) + { + cout << "Parallel copy: in direction " << i << ":" << endl; + cout << "llb = " << llb[i] << ", uub = " << uub[i] << endl; + cout << " in data : il = " << illi[i] << ", iu = " << iuui[i] << endl; + cout << "bbox = (" << llbin[i] << "," << uubin[i] << ")" << endl; + cout << "shape = " << shape[i] << endl; + cout << "out data : il = " << illo[i] << ", iu = " << iuuo[i] << endl; + cout << "bbox = (" << llbout[i] << "," << uubout[i] << ")" << endl; + cout << "shape = " << Dshape[i] << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + int ihi = iuui[i] - illi[i] + 1, iho = iuuo[i] - illo[i] + 1; + if (!(feq(ho, hi, ho / 2)) || ihi != iho) + { + cout << "Parallel copy: in direction " << i << ":" << endl; + cout << "Parallel copy: not the same grid structure." << endl; + cout << "hi = " << hi << ", bbox = (" << llbin[i] << "," << uubin[i] << "), shape = " << shape[i] << endl; + cout << "ho = " << ho << ", bbox = (" << llbout[i] << "," << uubout[i] << "), shape = " << Dshape[i] << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + ial = ial * ihi; + } + + for (int i = 0; i < DIM; i++) + { + indi[i] = illi[i]; + indo[i] = illo[i]; + } + /* + //check start index + for(int i=0;i NNi) + { + cout << "Parallel copy: ni = " << ni << " is out of array range (0," << NNi << ")." << endl; + cout << "shape = ("; + for (int j = 0; j < DIM; j++) + { + cout << shape[j]; + if (j < DIM - 1) + cout << ","; + else + cout << ")" << endl; + } + cout << "ind = ("; + for (int j = 0; j < DIM; j++) + { + cout << indi[j]; + if (j < DIM - 1) + cout << ","; + else + cout << ")" << endl; + } + MPI_Abort(MPI_COMM_WORLD, 1); + } + DD[no] = datain[ni]; + + indi[0]++; + for (int j = 1; j < DIM; j++) + { + if (indi[j - 1] == iuui[j - 1] + 1) + { + indi[j - 1] = illi[j - 1]; + indi[j]++; + } // carry 1 to next digital + else + break; + } + indo[0]++; + for (int j = 1; j < DIM; j++) + { + if (indo[j - 1] == iuuo[j - 1] + 1) + { + indo[j - 1] = illo[j - 1]; + indo[j]++; + } + else + break; + } + } + /* + //check final index + for(int i=0;i *BlL, MyList *DumpList, char *tag, double time, double dT) +{ + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + // round at 4 and 5 + int ncount = int(time / dT + 0.5); + + MyList *Bp; + while (DumpList) + { + Bp = BlL; + int Bi = 0; + while (Bp) + { + Block *BP = Bp->data; + var *VP = DumpList->data; + if (BP->rank == myrank) + { + + string out_dir; + map::iterator iter; + iter = parameters::str_par.find("output dir"); + if (iter != parameters::str_par.end()) + { + out_dir = iter->second; + } + else + { + // read parameter from file + const int LEN = 256; + char pline[LEN]; + string str, sgrp, skey, sval; + int sind; + char pname[50]; + { + map::iterator iter = parameters::str_par.find("inputpar"); + if (iter != parameters::str_par.end()) + { + strcpy(pname, (iter->second).c_str()); + } + else + { + cout << "Error inputpar" << endl; + exit(0); + } + } + ifstream inf(pname, ifstream::in); + if (!inf.good()) + { + cout << "Can not open parameter file " << pname << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + for (int i = 1; inf.good(); i++) + { + inf.getline(pline, LEN); + str = pline; + + int status = misc::parse_parts(str, sgrp, skey, sval, sind); + if (status == -1) + { + cout << "error reading parameter file " << pname << " in line " << i << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + else if (status == 0) + continue; + + if (sgrp == "ABE") + { + if (skey == "output dir") + out_dir = sval; + } + } + inf.close(); + + parameters::str_par.insert(map::value_type("output dir", out_dir)); + } + + char filename[100]; + if (tag) + sprintf(filename, "%s/%s_Lev%02d-%02d_%02d_%s_%05d.bin", out_dir.c_str(), tag, BP->lev, Bi, myrank, VP->name, ncount); + else + sprintf(filename, "%s/Lev%02d-%02d_%02d_%s_%05d.bin", out_dir.c_str(), BP->lev, Bi, myrank, VP->name, ncount); + writefile(time, BP->shape[0], BP->shape[1], BP->shape[2], BP->bbox[0], BP->bbox[3], BP->bbox[1], BP->bbox[4], + BP->bbox[2], BP->bbox[5], filename, BP->fgfs[VP->sgfn]); + cout << "end of dump " << VP->name << " at time " << time << ", on node " << myrank << endl; + } + Bp = Bp->next; + Bi++; + } + DumpList = DumpList->next; + } +} +// Now we dump the data including buffer points +void Parallel::Dump_Data(Patch *PP, MyList *DumpList, char *tag, double time, double dT, int grd) +{ + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + // round at 4 and 5 + int ncount = int(time / dT + 0.5); + + MPI_Status sta; + int DIM = 3; + double llb[3], uub[3]; + double DX, DY, DZ; + + double *databuffer = 0; + if (myrank == 0) + { + databuffer = (double *)malloc(sizeof(double) * PP->shape[0] * PP->shape[1] * PP->shape[2]); + if (!databuffer) + { + cout << "Parallel::Dump_Data: out of memory when dumping data." << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + + while (DumpList) + { + var *VP = DumpList->data; + + MyList *Bp = PP->blb; + while (Bp) + { + Block *BP = Bp->data; + if (BP->rank == 0 && myrank == 0) + { + DX = BP->getdX(0); + DY = BP->getdX(1); + DZ = BP->getdX(2); + llb[0] = (feq(BP->bbox[0], PP->bbox[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX; + llb[1] = (feq(BP->bbox[1], PP->bbox[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY; + llb[2] = (feq(BP->bbox[2], PP->bbox[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ; + uub[0] = (feq(BP->bbox[3], PP->bbox[3], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX; + uub[1] = (feq(BP->bbox[4], PP->bbox[4], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY; + uub[2] = (feq(BP->bbox[5], PP->bbox[5], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ; + f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, BP->fgfs[VP->sgfn], llb, uub); + } + else + { + int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]); + if (myrank == 0) + { + double *bufferhere = (double *)malloc(sizeof(double) * nnn); + if (!bufferhere) + { + cout << "on node#" << myrank << ", out of memory when dumping data." << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + MPI_Recv(bufferhere, nnn, MPI_DOUBLE, BP->rank, 0, MPI_COMM_WORLD, &sta); + DX = BP->getdX(0); + DY = BP->getdX(1); + DZ = BP->getdX(2); + llb[0] = (feq(BP->bbox[0], PP->bbox[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX; + llb[1] = (feq(BP->bbox[1], PP->bbox[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY; + llb[2] = (feq(BP->bbox[2], PP->bbox[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ; + uub[0] = (feq(BP->bbox[3], PP->bbox[3], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX; + uub[1] = (feq(BP->bbox[4], PP->bbox[4], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY; + uub[2] = (feq(BP->bbox[5], PP->bbox[5], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ; + f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, bufferhere, llb, uub); + free(bufferhere); + } + else if (myrank == BP->rank) + { + MPI_Send(BP->fgfs[VP->sgfn], nnn, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD); + } + } + if (Bp == PP->ble) + break; + Bp = Bp->next; + } + if (myrank == 0) + { + + string out_dir; + map::iterator iter; + iter = parameters::str_par.find("output dir"); + if (iter != parameters::str_par.end()) + { + out_dir = iter->second; + } + else + { + // read parameter from file + const int LEN = 256; + char pline[LEN]; + string str, sgrp, skey, sval; + int sind; + char pname[50]; + { + map::iterator iter = parameters::str_par.find("inputpar"); + if (iter != parameters::str_par.end()) + { + strcpy(pname, (iter->second).c_str()); + } + else + { + cout << "Error inputpar" << endl; + exit(0); + } + } + ifstream inf(pname, ifstream::in); + if (!inf.good()) + { + cout << "Can not open parameter file " << pname << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + for (int i = 1; inf.good(); i++) + { + inf.getline(pline, LEN); + str = pline; + + int status = misc::parse_parts(str, sgrp, skey, sval, sind); + if (status == -1) + { + cout << "error reading parameter file " << pname << " in line " << i << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + else if (status == 0) + continue; + + if (sgrp == "ABE") + { + if (skey == "output dir") + out_dir = sval; + } + } + inf.close(); + + parameters::str_par.insert(map::value_type("output dir", out_dir)); + } + + char filename[100]; + if (tag) + sprintf(filename, "%s/%s_Lev%02d-%02d_%s_%05d.bin", out_dir.c_str(), tag, PP->lev, grd, VP->name, ncount); + else + sprintf(filename, "%s/Lev%02d-%02d_%s_%05d.bin", out_dir.c_str(), PP->lev, grd, VP->name, ncount); + + writefile(time, PP->shape[0], PP->shape[1], PP->shape[2], PP->bbox[0], PP->bbox[3], PP->bbox[1], PP->bbox[4], + PP->bbox[2], PP->bbox[5], filename, databuffer); + } + DumpList = DumpList->next; + } + + if (myrank == 0) + free(databuffer); +} +void Parallel::Dump_Data(MyList *PL, MyList *DumpList, char *tag, double time, double dT) +{ + MyList *Pp; + Pp = PL; + int grd = 0; + while (Pp) + { + Patch *PP = Pp->data; + Dump_Data(PP, DumpList, tag, time, dT, grd); + grd++; + Pp = Pp->next; + } +} +// collect the data including buffer points +double *Parallel::Collect_Data(Patch *PP, var *VP) +{ + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + + MPI_Status sta; + int DIM = 3; + double llb[3], uub[3]; + double DX, DY, DZ; + + double *databuffer = 0; + if (myrank == 0) + { + databuffer = (double *)malloc(sizeof(double) * PP->shape[0] * PP->shape[1] * PP->shape[2]); + if (!databuffer) + { + cout << "Parallel::Collect_Data: out of memory when dumping data." << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + + MyList *Bp = PP->blb; + while (Bp) + { + Block *BP = Bp->data; + if (BP->rank == 0 && myrank == 0) + { + DX = BP->getdX(0); + DY = BP->getdX(1); + DZ = BP->getdX(2); + llb[0] = (feq(BP->bbox[0], PP->bbox[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX; + llb[1] = (feq(BP->bbox[1], PP->bbox[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY; + llb[2] = (feq(BP->bbox[2], PP->bbox[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ; + uub[0] = (feq(BP->bbox[3], PP->bbox[3], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX; + uub[1] = (feq(BP->bbox[4], PP->bbox[4], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY; + uub[2] = (feq(BP->bbox[5], PP->bbox[5], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ; + f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, BP->fgfs[VP->sgfn], llb, uub); + } + else + { + int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]); + if (myrank == 0) + { + double *bufferhere = (double *)malloc(sizeof(double) * nnn); + if (!bufferhere) + { + cout << "on node#" << myrank << ", out of memory when dumping data." << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + MPI_Recv(bufferhere, nnn, MPI_DOUBLE, BP->rank, 0, MPI_COMM_WORLD, &sta); + DX = BP->getdX(0); + DY = BP->getdX(1); + DZ = BP->getdX(2); + llb[0] = (feq(BP->bbox[0], PP->bbox[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX; + llb[1] = (feq(BP->bbox[1], PP->bbox[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY; + llb[2] = (feq(BP->bbox[2], PP->bbox[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ; + uub[0] = (feq(BP->bbox[3], PP->bbox[3], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX; + uub[1] = (feq(BP->bbox[4], PP->bbox[4], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY; + uub[2] = (feq(BP->bbox[5], PP->bbox[5], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ; + f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, bufferhere, llb, uub); + free(bufferhere); + } + else if (myrank == BP->rank) + { + MPI_Send(BP->fgfs[VP->sgfn], nnn, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD); + } + } + if (Bp == PP->ble) + break; + Bp = Bp->next; + } + + return databuffer; +} +// Now we dump the data including buffer points +// dump z = 0 plane +void Parallel::d2Dump_Data(Patch *PP, MyList *DumpList, char *tag, double time, double dT, int grd) +{ + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + // round at 4 and 5 + int ncount = int(time / dT + 0.5); + + MPI_Status sta; + int DIM = 3; + double llb[3], uub[3]; + double DX, DY, DZ; + + double *databuffer = 0, *databuffer2 = 0; + if (myrank == 0) + { + databuffer = (double *)malloc(sizeof(double) * PP->shape[0] * PP->shape[1] * PP->shape[2]); + databuffer2 = (double *)malloc(sizeof(double) * PP->shape[0] * PP->shape[1]); + if (!databuffer || !databuffer2) + { + cout << "Parallel::d2Dump_Data: out of memory when dumping data." << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + + while (DumpList) + { + var *VP = DumpList->data; + + MyList *Bp = PP->blb; + while (Bp) + { + Block *BP = Bp->data; + if (BP->rank == 0 && myrank == 0) + { + DX = BP->getdX(0); + DY = BP->getdX(1); + DZ = BP->getdX(2); + llb[0] = (feq(BP->bbox[0], PP->bbox[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX; + llb[1] = (feq(BP->bbox[1], PP->bbox[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY; + llb[2] = (feq(BP->bbox[2], PP->bbox[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ; + uub[0] = (feq(BP->bbox[3], PP->bbox[3], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX; + uub[1] = (feq(BP->bbox[4], PP->bbox[4], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY; + uub[2] = (feq(BP->bbox[5], PP->bbox[5], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ; + f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, BP->fgfs[VP->sgfn], llb, uub); + } + else + { + int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]); + if (myrank == 0) + { + double *bufferhere = (double *)malloc(sizeof(double) * nnn); + if (!bufferhere) + { + cout << "on node#" << myrank << ", out of memory when dumping data." << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + MPI_Recv(bufferhere, nnn, MPI_DOUBLE, BP->rank, 0, MPI_COMM_WORLD, &sta); + DX = BP->getdX(0); + DY = BP->getdX(1); + DZ = BP->getdX(2); + llb[0] = (feq(BP->bbox[0], PP->bbox[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX; + llb[1] = (feq(BP->bbox[1], PP->bbox[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY; + llb[2] = (feq(BP->bbox[2], PP->bbox[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ; + uub[0] = (feq(BP->bbox[3], PP->bbox[3], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX; + uub[1] = (feq(BP->bbox[4], PP->bbox[4], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY; + uub[2] = (feq(BP->bbox[5], PP->bbox[5], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ; + f_copy(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, bufferhere, llb, uub); + free(bufferhere); + } + else if (myrank == BP->rank) + { + MPI_Send(BP->fgfs[VP->sgfn], nnn, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD); + } + } + if (Bp == PP->ble) + break; + Bp = Bp->next; + } + if (myrank == 0) + { + + string out_dir; + map::iterator iter; + iter = parameters::str_par.find("output dir"); + if (iter != parameters::str_par.end()) + { + out_dir = iter->second; + } + else + { + // read parameter from file + const int LEN = 256; + char pline[LEN]; + string str, sgrp, skey, sval; + int sind; + char pname[50]; + { + map::iterator iter = parameters::str_par.find("inputpar"); + if (iter != parameters::str_par.end()) + { + strcpy(pname, (iter->second).c_str()); + } + else + { + cout << "Error inputpar" << endl; + exit(0); + } + } + ifstream inf(pname, ifstream::in); + if (!inf.good()) + { + cout << "Can not open parameter file " << pname << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + for (int i = 1; inf.good(); i++) + { + inf.getline(pline, LEN); + str = pline; + + int status = misc::parse_parts(str, sgrp, skey, sval, sind); + if (status == -1) + { + cout << "error reading parameter file " << pname << " in line " << i << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + else if (status == 0) + continue; + + if (sgrp == "ABE") + { + if (skey == "output dir") + out_dir = sval; + } + } + inf.close(); + + parameters::str_par.insert(map::value_type("output dir", out_dir)); + } + + char filename[100]; + if (tag) + sprintf(filename, "%s/%s_2d_Lev%02d-%02d_%s_%05d.dat", out_dir.c_str(), tag, PP->lev, grd, VP->name, ncount); + else + sprintf(filename, "%s/2d_Lev%02d-%02d_%s_%05d.dat", out_dir.c_str(), PP->lev, grd, VP->name, ncount); + + int gord = ghost_width; + f_d2dump(DIM, PP->bbox, PP->bbox + DIM, PP->shape, databuffer, databuffer2, gord, VP->SoA); + writefile(time, PP->shape[0], PP->shape[1], PP->bbox[0], PP->bbox[3], PP->bbox[1], PP->bbox[4], + filename, databuffer2); + } + DumpList = DumpList->next; + } + + if (myrank == 0) + { + free(databuffer); + free(databuffer2); + } +} +void Parallel::d2Dump_Data(MyList *PL, MyList *DumpList, char *tag, double time, double dT) +{ + MyList *Pp; + Pp = PL; + int grd = 0; + while (Pp) + { + Patch *PP = Pp->data; + d2Dump_Data(PP, DumpList, tag, time, dT, grd); + grd++; + Pp = Pp->next; + } +} +// Now we dump the data including buffer points and ghost points of the given patch +void Parallel::Dump_Data0(Patch *PP, MyList *DumpList, char *tag, double time, double dT) +{ + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + // round at 4 and 5 + int ncount = int(time / dT + 0.5); + + MPI_Status sta; + int DIM = 3; + double llb[3], uub[3], tllb[3], tuub[3]; + int tshape[3]; + double DX, DY, DZ; + + for (int i = 0; i < 3; i++) + { + double DX = PP->blb->data->getdX(i); + tshape[i] = PP->shape[i] + 2 * ghost_width; + tllb[i] = PP->bbox[i] - ghost_width * DX; + tuub[i] = PP->bbox[i + dim] + ghost_width * DX; + } + + int NN = tshape[0] * tshape[1] * tshape[2]; + double *databuffer = 0; + if (myrank == 0) + { + databuffer = (double *)malloc(sizeof(double) * NN); + if (!databuffer) + { + cout << "on node# " << myrank << ", out of memory when dumping data." << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + + while (DumpList) + { + var *VP = DumpList->data; + MyList *Bp = PP->blb; + while (Bp) + { + Block *BP = Bp->data; + if (BP->rank == 0 && myrank == 0) + { + DX = BP->getdX(0); + DY = BP->getdX(1); + DZ = BP->getdX(2); + llb[0] = (feq(BP->bbox[0], tllb[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX; + llb[1] = (feq(BP->bbox[1], tllb[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY; + llb[2] = (feq(BP->bbox[2], tllb[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ; + uub[0] = (feq(BP->bbox[3], tuub[0], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX; + uub[1] = (feq(BP->bbox[4], tuub[1], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY; + uub[2] = (feq(BP->bbox[5], tuub[2], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ; + f_copy(DIM, tllb, tuub, tshape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, BP->fgfs[VP->sgfn], llb, uub); + } + else + { + if (myrank == 0) + { + int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]); + double *bufferhere = (double *)malloc(sizeof(double) * nnn); + if (!bufferhere) + { + cout << "on node#" << myrank << ", out of memory when dumping data." << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + MPI_Recv(bufferhere, nnn, MPI_DOUBLE, BP->rank, 0, MPI_COMM_WORLD, &sta); + DX = BP->getdX(0); + DY = BP->getdX(1); + DZ = BP->getdX(2); + llb[0] = (feq(BP->bbox[0], tllb[0], DX / 2)) ? BP->bbox[0] : BP->bbox[0] + ghost_width * DX; + llb[1] = (feq(BP->bbox[1], tllb[1], DY / 2)) ? BP->bbox[1] : BP->bbox[1] + ghost_width * DY; + llb[2] = (feq(BP->bbox[2], tllb[2], DZ / 2)) ? BP->bbox[2] : BP->bbox[2] + ghost_width * DZ; + uub[0] = (feq(BP->bbox[3], tuub[0], DX / 2)) ? BP->bbox[3] : BP->bbox[3] - ghost_width * DX; + uub[1] = (feq(BP->bbox[4], tuub[1], DY / 2)) ? BP->bbox[4] : BP->bbox[4] - ghost_width * DY; + uub[2] = (feq(BP->bbox[5], tuub[2], DZ / 2)) ? BP->bbox[5] : BP->bbox[5] - ghost_width * DZ; + f_copy(DIM, tllb, tuub, tshape, databuffer, BP->bbox, BP->bbox + DIM, BP->shape, bufferhere, llb, uub); + free(bufferhere); + } + else if (myrank == BP->rank) + { + int nnn = (BP->shape[0]) * (BP->shape[1]) * (BP->shape[2]); + MPI_Send(BP->fgfs[VP->sgfn], nnn, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD); + } + } + if (Bp == PP->ble) + break; + Bp = Bp->next; + } + if (myrank == 0) + { + + string out_dir; + map::iterator iter; + iter = parameters::str_par.find("output dir"); + if (iter != parameters::str_par.end()) + { + out_dir = iter->second; + } + else + { + // read parameter from file + const int LEN = 256; + char pline[LEN]; + string str, sgrp, skey, sval; + int sind; + char pname[50]; + { + map::iterator iter = parameters::str_par.find("inputpar"); + if (iter != parameters::str_par.end()) + { + strcpy(pname, (iter->second).c_str()); + } + else + { + cout << "Error inputpar" << endl; + exit(0); + } + } + ifstream inf(pname, ifstream::in); + if (!inf.good()) + { + cout << "Can not open parameter file " << pname << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + for (int i = 1; inf.good(); i++) + { + inf.getline(pline, LEN); + str = pline; + + int status = misc::parse_parts(str, sgrp, skey, sval, sind); + if (status == -1) + { + cout << "error reading parameter file " << pname << " in line " << i << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + else if (status == 0) + continue; + + if (sgrp == "ABE") + { + if (skey == "output dir") + out_dir = sval; + } + } + inf.close(); + + parameters::str_par.insert(map::value_type("output dir", out_dir)); + } + + char filename[100]; + if (tag) + sprintf(filename, "%s/%s_Lev%02d_%s_%05d.bin", out_dir.c_str(), tag, PP->lev, VP->name, ncount); + else + sprintf(filename, "%s/Lev%02d_%s_%05d.bin", out_dir.c_str(), PP->lev, VP->name, ncount); + + writefile(time, tshape[0], tshape[1], tshape[2], tllb[0], tuub[0], tllb[1], tuub[2], + tllb[2], tuub[2], filename, databuffer); + } + DumpList = DumpList->next; + } + + if (myrank == 0) + free(databuffer); +} +// Map point is much easier than maping data itself +// But the main problem is about the points near the boundary +// worst case is -ghost -ghost+1 .... 0 * ...... +double Parallel::global_interp(int DIM, int *ext, double **CoX, double *datain, + double *poXb, int ordn, double *SoA, int Symmetry) +{ + if (DIM != 3) + { + cout << "Parallel::global_interp does not suport DIM = " << DIM << " for Symmetry." << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + double resu; + double poX[3]; + double asgn = 1; + + for (int i = 0; i < 3; i++) + poX[i] = poXb[i]; + + switch (Symmetry) + { + case 2: + for (int i = 0; i < 3; i++) + if (poX[i] < 0) + { + poX[i] = -poX[i]; + asgn = asgn * SoA[i]; + } + break; + case 1: + if (poX[2] < 0) + { + poX[2] = -poX[2]; + asgn = asgn * SoA[2]; + } + } + + int extb[3]; + + for (int i = 0; i < 3; i++) + extb[i] = ext[i]; + + switch (Symmetry) + { +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + case 2: + if (poX[0] < (ghost_width - 1) * (CoX[0][1] - CoX[0][0])) + extb[0] = extb[0] + ghost_width - 1; + if (poX[1] < (ghost_width - 1) * (CoX[1][1] - CoX[1][0])) + extb[1] = extb[1] + ghost_width - 1; + case 1: + if (poX[2] < (ghost_width - 1) * (CoX[2][1] - CoX[2][0])) + extb[2] = extb[2] + ghost_width - 1; +#else +#ifdef Cell + case 2: + if (poX[0] < (ghost_width - 0.5) * (CoX[0][1] - CoX[0][0])) + extb[0] = extb[0] + ghost_width; + if (poX[1] < (ghost_width - 0.5) * (CoX[1][1] - CoX[1][0])) + extb[1] = extb[1] + ghost_width; + case 1: + if (poX[2] < (ghost_width - 0.5) * (CoX[2][1] - CoX[2][0])) + extb[2] = extb[2] + ghost_width; +#else +#error Not define Vertex nor Cell +#endif +#endif + } + + if (extb[0] > ext[0] || extb[1] > ext[1] || extb[2] > ext[2]) + { + double *CoXb[3]; + int Nb = extb[0] * extb[1] * extb[2]; + double *datab; + datab = new double[Nb]; + for (int i = 0; i < 3; i++) + { + CoXb[i] = new double[extb[i]]; + double DH = CoX[i][1] - CoX[i][0]; + if (extb[i] > ext[i]) + { + if (CoX[i][0] > DH) + { + cout << "lower boundary[" << i << "] = " << CoX[i][0] << ", but SYmmetry = " << Symmetry << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + for (int j = 0; j < ghost_width - 1; j++) + CoXb[i][j] = -CoX[i][ghost_width - 1 - j]; + for (int j = ghost_width - 1; j < extb[i]; j++) + CoXb[i][j] = CoX[i][j - ghost_width + 1]; +#else +#ifdef Cell + for (int j = 0; j < ghost_width; j++) + CoXb[i][j] = -CoX[i][ghost_width - 1 - j]; + for (int j = ghost_width; j < extb[i]; j++) + CoXb[i][j] = CoX[i][j - ghost_width]; +#else +#error Not define Vertex nor Cell +#endif +#endif + } + else + { + for (int j = 0; j < extb[i]; j++) + CoXb[i][j] = CoX[i][j]; + } + } + + for (int i = 0; i < Nb; i++) + { + int ind[3], indb[3]; + getarrayindex(3, extb, indb, i); + double sgn = 1; + for (int j = 0; j < 3; j++) + { + if (extb[j] > ext[j]) + { +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + if (indb[j] < ghost_width - 1) + { + ind[j] = ghost_width - 1 - indb[j]; + sgn = sgn * SoA[j]; + } + else + { + ind[j] = 1 + indb[j] - ghost_width; + } +#else +#ifdef Cell + if (indb[j] < ghost_width) + { + ind[j] = ghost_width - 1 - indb[j]; + sgn = sgn * SoA[j]; + } + else + { + ind[j] = indb[j] - ghost_width; + } +#else +#error Not define Vertex nor Cell +#endif +#endif + } + else + ind[j] = indb[j]; + } + int lon = getarraylocation(3, ext, ind); + datab[i] = datain[lon] * sgn; + } + + resu = global_interp(DIM, extb, CoXb, datab, poX, ordn); + + for (int i = 0; i < 3; i++) + delete[] CoXb[i]; + delete[] datab; + } + else + { + resu = global_interp(DIM, ext, CoX, datain, poX, ordn); + } + + return resu * asgn; +} +double Parallel::global_interp(int DIM, int *ext, double **CoX, double *datain, + double *poX, int ordn) +{ + if (ordn > 2 * ghost_width) + { + cout << "Parallel::global_interp can not handle ordn = " << ordn << " > 2*ghost_width = " << 2 * ghost_width << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + double *bbox, *datainbbox; + bbox = new double[2 * DIM]; + datainbbox = new double[2 * DIM]; + + int *NN, *ind, *shape; + NN = new int[DIM]; + ind = new int[DIM]; + shape = new int[DIM]; + + for (int i = 0; i < DIM; i++) + { + ind[i] = int((poX[i] - CoX[i][0]) / (CoX[i][1] - CoX[i][0])) - ordn / 2 + 1; + // poX may exactly locate on the boundary (exclude ghost) + if (ind[i] == -1 && feq(poX[i], CoX[i][0], (CoX[i][1] - CoX[i][0]) / 2)) + ind[i] = 0; + /* + if(ind[i] < 0) + { + cout<<"Parallel::global_interp error ind["< ext = "<= 0; i--) + NN[i] = NN[i + 1] * ordn; + + double *xpts, *funcvals; + xpts = new double[ordn]; + funcvals = new double[ordn]; + double *DDd, *DDd1, rr; + + DDd = new double[NN[0]]; + + copy(DIM, bbox, bbox + DIM, shape, DDd, datainbbox, datainbbox + DIM, ext, datain, bbox, bbox + DIM); + + for (int i = 0; i < DIM; i++) + { + for (int j = ind[i]; j < ind[i] + ordn; j++) + { + xpts[j - ind[i]] = CoX[i][j]; + } + + if (i < DIM - 1) + { + DDd1 = new double[NN[i + 1]]; + for (int j = 0; j < NN[i + 1]; j++) + { + for (int k = 0; k < ordn; k++) + funcvals[k] = DDd[k + j * ordn]; + DDd1[j] = Lagrangian_Int(poX[i], ordn, xpts, funcvals); + } + delete[] DDd; + DDd = DDd1; + } + else + { + for (int j = 0; j < ordn; j++) + funcvals[j] = DDd[j]; + rr = Lagrangian_Int(poX[i], ordn, xpts, funcvals); + delete[] DDd1; // since DDd and DDd1 now point to the same stuff, we need delete after above int + } + } + + delete[] NN; + delete[] ind; + delete[] xpts; + delete[] funcvals; + delete[] bbox; + delete[] datainbbox; + delete[] shape; + + return rr; +} +double Parallel::Lagrangian_Int(double x, int npts, double *xpts, double *funcvals) +{ + double sum = 0; + for (int i = 0; i < npts; i++) + { + sum = sum + funcvals[i] * LagrangePoly(x, i, npts, xpts); + } + return sum; +} +double Parallel::LagrangePoly(double x, int pt, int npts, double *xpts) +{ + double h = 1; + int i; + + for (i = 0; i < pt; i++) + h = h * (x - xpts[i]) / (xpts[pt] - xpts[i]); + + for (i = pt + 1; i < npts; i++) + h = h * (x - xpts[i]) / (xpts[pt] - xpts[i]); + + return h; +} +// collect all grid segments or blocks including ghost and buffer for given patch +MyList *Parallel::build_complete_gsl(Patch *Pat) +{ + MyList *cgsl = 0, *gs; + MyList *BP = Pat->blb; + while (BP) + { + if (!cgsl) + { + cgsl = gs = new MyList; // delete through destroyList(); + gs->data = new Parallel::gridseg; + } + else + { + gs->next = new MyList; + gs = gs->next; + gs->data = new Parallel::gridseg; + } + + for (int i = 0; i < dim; i++) + { + gs->data->llb[i] = BP->data->bbox[i]; + gs->data->uub[i] = BP->data->bbox[dim + i]; + gs->data->shape[i] = BP->data->shape[i]; + } + gs->data->Bg = BP->data; + gs->next = 0; + + if (BP == Pat->ble) + break; + BP = BP->next; + } + + return cgsl; +} +// collect all grid segments or blocks including ghost and buffer for given patch list +MyList *Parallel::build_complete_gsl(MyList *PatL) +{ + MyList *cgsl = 0, *gs; + while (PatL) + { + if (!cgsl) + { + cgsl = build_complete_gsl(PatL->data); + gs = cgsl; + while (gs->next) + gs = gs->next; + } + else + { + gs->next = build_complete_gsl(PatL->data); + gs = gs->next; + while (gs->next) + gs = gs->next; + } + PatL = PatL->next; + } + + return cgsl; +} +// cellect the information of Patch list +MyList *Parallel::build_complete_gsl_virtual(MyList *PatL) +{ + MyList *cgsl = 0, *gs; + while (PatL) + { + if (cgsl) + { + gs->next = new MyList; + gs = gs->next; + gs->data = new Parallel::gridseg; + } + else + { + cgsl = gs = new MyList; + gs->data = new Parallel::gridseg; + } + + for (int i = 0; i < dim; i++) + { + gs->data->llb[i] = PatL->data->bbox[i]; + gs->data->uub[i] = PatL->data->bbox[dim + i]; + gs->data->shape[i] = PatL->data->shape[i]; + } + gs->data->Bg = 0; + gs->next = 0; + + PatL = PatL->next; + } + + return cgsl; +} +// cellect the information of Patch list without buffer points +MyList *Parallel::build_complete_gsl_virtual2(MyList *PatL) // - buffer +{ + MyList *cgsl = 0, *gs; + while (PatL) + { + if (cgsl) + { + gs->next = new MyList; + gs = gs->next; + gs->data = new Parallel::gridseg; + } + else + { + cgsl = gs = new MyList; + gs->data = new Parallel::gridseg; + } + + for (int i = 0; i < dim; i++) + { + double DH = PatL->data->getdX(i); + gs->data->llb[i] = PatL->data->bbox[i] + PatL->data->lli[i] * DH; + gs->data->uub[i] = PatL->data->bbox[dim + i] - PatL->data->uui[i] * DH; + gs->data->shape[i] = PatL->data->shape[i] - PatL->data->lli[i] - PatL->data->uui[i]; + } + gs->data->Bg = 0; + gs->next = 0; + + PatL = PatL->next; + } + + return cgsl; +} +// collect all grid segments or blocks without ghost for given patch, without extension +MyList *Parallel::build_bulk_gsl(Patch *Pat) +{ + MyList *cgsl = 0, *gs; + MyList *BP = Pat->blb; + while (BP) + { + Block *bp = BP->data; + if (!cgsl) + { + cgsl = gs = new MyList; + gs->data = new Parallel::gridseg; + } + else + { + gs->next = new MyList; + gs = gs->next; + gs->data = new Parallel::gridseg; + } + + for (int i = 0; i < dim; i++) + { + double DH = bp->getdX(i); + gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] : bp->bbox[dim + i] - ghost_width * DH; + gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] : bp->bbox[i] + ghost_width * DH; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1; +#else +#ifdef Cell + gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4); +#else +#error Not define Vertex nor Cell +#endif +#endif + } + gs->data->Bg = BP->data; + gs->next = 0; + + if (BP == Pat->ble) + break; + BP = BP->next; + } + + return cgsl; +} +// bulk part for given Block within given patch, without extension +MyList *Parallel::build_bulk_gsl(Block *bp, Patch *Pat) +{ + MyList *gs = 0; + + gs = new MyList; + gs->data = new Parallel::gridseg; + + for (int i = 0; i < dim; i++) + { + double DH = bp->getdX(i); + gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] : bp->bbox[dim + i] - ghost_width * DH; + gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] : bp->bbox[i] + ghost_width * DH; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1; +#else +#ifdef Cell + gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4); +#else +#error Not define Vertex nor Cell +#endif +#endif + } + gs->data->Bg = bp; + gs->next = 0; + + return gs; +} +MyList *Parallel::clone_gsl(MyList *p, bool first_only) +{ + MyList *np = 0, *q = 0, *pq = 0; + + while (p) + { + q = new MyList; + q->data = new Parallel::gridseg; + q->data->Bg = p->data->Bg; + for (int i = 0; i < dim; i++) + { + q->data->llb[i] = p->data->llb[i]; + q->data->uub[i] = p->data->uub[i]; + q->data->shape[i] = p->data->shape[i]; + } + if (pq) + pq->next = q; + else + np = q; + if (first_only) + { + np->next = 0; + return np; + } + pq = q; + p = p->next; + } + return np; +} +MyList *Parallel::gs_subtract(MyList *A, MyList *B) +{ + if (!A) + return 0; + if (!B) + return clone_gsl(A, true); + + double cut_plane[2 * dim], DH[dim]; + + for (int i = 0; i < dim; i++) + { + DH[i] = A->data->Bg->getdX(i); + if (B->data->Bg && !feq(DH[i], B->data->Bg->getdX(i), DH[i] / 2)) + { + cout << "Parallel::gs_subtract meets different grid segment " << DH[i] << " vs " << B->data->Bg->getdX(i) << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + + MyList *C = 0, *q; + for (int i = 0; i < dim; i++) + { + if (B->data->llb[i] > A->data->uub[i] || B->data->uub[i] < A->data->llb[i]) + return clone_gsl(A, true); + cut_plane[i] = A->data->llb[i]; + cut_plane[i + dim] = A->data->uub[i]; + } + + for (int i = 0; i < dim; i++) + { + cut_plane[i] = Mymax(A->data->llb[i], B->data->llb[i]); + if (cut_plane[i] - A->data->llb[i] > DH[i] / 2) + { + q = clone_gsl(A, true); + // prolong the list from head + if (C) + q->next = C; + C = q; + for (int j = 0; j < dim; j++) + { + if (i == j) + { + C->data->llb[i] = A->data->llb[i]; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + C->data->uub[i] = Mymax(C->data->llb[i], cut_plane[i] - DH[i]); +#else +#ifdef Cell + C->data->uub[i] = Mymax(C->data->llb[i], cut_plane[i]); +#else +#error Not define Vertex nor Cell +#endif +#endif + } + else + { + C->data->llb[j] = cut_plane[j]; + C->data->uub[j] = cut_plane[j + dim]; + } +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4) + 1; +#else +#ifdef Cell + C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4); +#else +#error Not define Vertex nor Cell +#endif +#endif + } + } + + cut_plane[i + dim] = Mymin(A->data->uub[i], B->data->uub[i]); + if (A->data->uub[i] - cut_plane[i + dim] > DH[i] / 2) + { + q = clone_gsl(A, true); + if (C) + q->next = C; + C = q; + for (int j = 0; j < dim; j++) + { + if (i == j) + { + C->data->uub[i] = A->data->uub[i]; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + C->data->llb[i] = Mymin(C->data->uub[i], cut_plane[i + dim] + DH[i]); +#else +#ifdef Cell + C->data->llb[i] = Mymin(C->data->uub[i], cut_plane[i + dim]); +#else +#error Not define Vertex nor Cell +#endif +#endif + } + else + { + C->data->llb[j] = cut_plane[j]; + C->data->uub[j] = cut_plane[j + dim]; + } +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4) + 1; +#else +#ifdef Cell + C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4); +#else +#error Not define Vertex nor Cell +#endif +#endif + } + } + } + return C; +} +// stupid method +/* +MyList *Parallel::gsl_subtract(MyList *A,MyList *B) //A subtract B but with A's information +{ +// always make return and A, B distinct + if(!A) return 0; + + if(!B) return clone_gsl(A,0); + + MyList *C=0,*C0,*C1,*Cc,*CC0,*gs; + + while(A) + { + C0=gs_subtract(A,B); // note C0 becomes a list after subtraction + C1=B->next; + while(C1) + { + CC0=C0; + Cc=0; + while(CC0) + { + gs=gs_subtract(CC0,C1); + if(Cc) Cc->catList(gs); + else Cc=gs; + CC0=CC0->next; + } + if(C0) C0->destroyList(); + C0=Cc; + C1=C1->next; + } + if(C) C->catList(C0); + else C=C0; + A=A->next; + } + + return C; +} +*/ +// more clever method +MyList *Parallel::gsl_subtract(MyList *A, MyList *B) // A subtract B but with A's information +{ + // always make return and A, B distinct + if (!A) + return 0; + + MyList *C = 0, *C0, *C1; + + C = clone_gsl(A, 0); + + while (B) + { + C0 = 0; + C1 = C; + while (C1) + { + if (C0) + C0->catList(gs_subtract(C1, B)); + else + C0 = gs_subtract(C1, B); + C1 = C1->next; + } + if (C) + C->destroyList(); + else + { + if (C0) + C0->destroyList(); + return 0; + } + + C = C0; + B = B->next; + } + + return C; +} +MyList *Parallel::gs_and(MyList *A, MyList *B) +{ + if (!A || !B) + return 0; + + double llb[dim], uub[dim]; + bool flag = false; + for (int i = 0; i < dim; i++) + { + llb[i] = Mymax(A->data->llb[i], B->data->llb[i]); + uub[i] = Mymin(A->data->uub[i], B->data->uub[i]); + if (llb[i] > uub[i]) + { + flag = true; + break; + } + } + if (flag) + return 0; + + MyList *C; + C = clone_gsl(A, true); + for (int i = 0; i < dim; i++) + { + C->data->llb[i] = llb[i]; + C->data->uub[i] = uub[i]; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / C->data->Bg->getdX(i) + 0.4) + 1; +#else +#ifdef Cell + C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / C->data->Bg->getdX(i) + 0.4); +#else +#error Not define Vertex nor Cell +#endif +#endif + } + + return C; +} +// overlap of A_i and (union of all j of B_j) +MyList *Parallel::gsl_and(MyList *A, MyList *B) // A and B but with A's information +{ + MyList *C = 0, *C1; + + while (A) + { + C1 = B; + while (C1) + { + if (C) + C->catList(gs_and(A, C1)); + else + C = gs_and(A, C1); + C1 = C1->next; + } + A = A->next; + } + return C; +} +// collect all ghost grid segments or blocks for given patch +MyList *Parallel::build_ghost_gsl(Patch *Pat) +{ + MyList *cgsl = 0, *gs, *gsb; + MyList *BP = Pat->blb; + while (BP) + { + gs = new MyList; + gs->data = new Parallel::gridseg; + + for (int i = 0; i < dim; i++) + { + gs->data->llb[i] = BP->data->bbox[i]; + gs->data->uub[i] = BP->data->bbox[dim + i]; + gs->data->shape[i] = BP->data->shape[i]; + } + gs->data->Bg = BP->data; + gs->next = 0; + + gsb = build_bulk_gsl(BP->data, Pat); + + if (!cgsl) + cgsl = gs_subtract(gs, gsb); + else + cgsl->catList(gs_subtract(gs, gsb)); + + gsb->destroyList(); + gs->destroyList(); + + if (BP == Pat->ble) + break; + BP = BP->next; + } + + return cgsl; +} +// collect all ghost grid segments or blocks for given patch list +MyList *Parallel::build_ghost_gsl(MyList *PatL) +{ + MyList *cgsl = 0, *gs; + while (PatL) + { + if (!cgsl) + { + cgsl = build_ghost_gsl(PatL->data); + gs = cgsl; + while (gs->next) + gs = gs->next; + } + else + { + gs->next = build_ghost_gsl(PatL->data); + gs = gs->next; + while (gs->next) + gs = gs->next; + } + PatL = PatL->next; + } + + return cgsl; +} +// collect all grid segments or blocks without ghost for given patch +// special for Sync usage, so we do not need consider missing points +MyList *Parallel::build_owned_gsl0(Patch *Pat, int rank_in) +{ + MyList *cgsl = 0, *gs; + MyList *BP = Pat->blb; + while (BP) + { + Block *bp = BP->data; + if (bp->rank == rank_in) + { + if (!cgsl) + { + cgsl = gs = new MyList; + gs->data = new Parallel::gridseg; + } + else + { + gs->next = new MyList; + gs = gs->next; + gs->data = new Parallel::gridseg; + } + + for (int i = 0; i < dim; i++) + { + double DH = bp->getdX(i); + gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] : bp->bbox[dim + i] - ghost_width * DH; + gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] : bp->bbox[i] + ghost_width * DH; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1; +#else +#ifdef Cell + gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4); +#else +#error Not define Vertex nor Cell +#endif +#endif + } + gs->data->Bg = BP->data; + gs->next = 0; + } + + if (BP == Pat->ble) + break; + BP = BP->next; + } + + return cgsl; +} +// collect all grid segments or blocks without ghost for given patch +MyList *Parallel::build_owned_gsl1(Patch *Pat, int rank_in) +{ + MyList *cgsl = 0, *gs; + MyList *BP = Pat->blb; + while (BP) + { + Block *bp = BP->data; + if (bp->rank == rank_in) + { + if (!cgsl) + { + cgsl = gs = new MyList; + gs->data = new Parallel::gridseg; + } + else + { + gs->next = new MyList; + gs = gs->next; + gs->data = new Parallel::gridseg; + } + + for (int i = 0; i < dim; i++) + { + double DH = bp->getdX(i); + gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] : bp->bbox[dim + i] - ghost_width * DH; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + // NOTE: our dividing structure is (exclude ghost) + // -1 0 + // 1 2 + // so (0,1) does not belong to any part for vertex structure, we always put it to right part, this is consistent to + // the fortran routine where we always take floor to get index + gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] : bp->bbox[i] + (ghost_width - 1) * DH; + gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1; +#else +#ifdef Cell + gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] : bp->bbox[i] + ghost_width * DH; + gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4); +#else +#error Not define Vertex nor Cell +#endif +#endif + } + gs->data->Bg = BP->data; + gs->next = 0; + } + + if (BP == Pat->ble) + break; + BP = BP->next; + } + + return cgsl; +} +// collect all grid segments or blocks without ghost nor buffer for given patch +MyList *Parallel::build_owned_gsl2(Patch *Pat, int rank_in) +{ + MyList *cgsl = 0, *gs; + MyList *BP = Pat->blb; + while (BP) + { + Block *bp = BP->data; + if (bp->rank == rank_in) + { + if (!cgsl) + { + cgsl = gs = new MyList; + gs->data = new Parallel::gridseg; + } + else + { + gs->next = new MyList; + gs = gs->next; + gs->data = new Parallel::gridseg; + } + + for (int i = 0; i < dim; i++) + { + double DH = bp->getdX(i); + gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] - Pat->uui[i] * DH : bp->bbox[dim + i] - ghost_width * DH; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + // NOTE: our dividing structure is (exclude ghost) + // -1 0 + // 1 2 + // so (0,1) does not belong to any part for vertex structure, we always put it to right part, this is consistent to + // the fortran routine where we always take floor to get index + gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] + Pat->lli[i] * DH : bp->bbox[i] + (ghost_width - 1) * DH; + gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1; +#else +#ifdef Cell + gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] + Pat->lli[i] * DH : bp->bbox[i] + ghost_width * DH; + gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4); +#else +#error Not define Vertex nor Cell +#endif +#endif + } + gs->data->Bg = BP->data; + gs->next = 0; + } + + if (BP == Pat->ble) + break; + BP = BP->next; + } + + return cgsl; +} +// collect all grid segments or blocks without ghost for given patch, and delete the ghost_width for interpolation consideration on the patch boundary +MyList *Parallel::build_owned_gsl3(Patch *Pat, int rank_in, int Symmetry) +{ + MyList *cgsl = 0, *gs; + MyList *BP = Pat->blb; + while (BP) + { + Block *bp = BP->data; + if (bp->rank == rank_in) + { + if (!cgsl) + { + cgsl = gs = new MyList; + gs->data = new Parallel::gridseg; + } + else + { + gs->next = new MyList; + gs = gs->next; + gs->data = new Parallel::gridseg; + } + + for (int i = 0; i < dim; i++) + { + double DH = bp->getdX(i); + gs->data->uub[i] = bp->bbox[dim + i] - ghost_width * DH; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + // NOTE: our dividing structure is (exclude ghost) + // -1 0 + // 1 2 + // so (0,1) does not belong to any part for vertex structure, we always put it to right part, this is consistent to + // the fortran routine where we always take floor to get index + gs->data->llb[i] = bp->bbox[i] + (ghost_width - 1) * DH; + gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1; +#else +#ifdef Cell + gs->data->llb[i] = bp->bbox[i] + ghost_width * DH; + gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4); +#else +#error Not define Vertex nor Cell +#endif +#endif + } + // Symmetry consideration + if (Symmetry > 0) + { + double DH = bp->getdX(2); + if (feq(bp->bbox[2], 0, DH / 2)) + { + gs->data->llb[2] = bp->bbox[2]; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + gs->data->shape[2] = int((gs->data->uub[2] - gs->data->llb[2]) / DH + 0.4) + 1; +#else +#ifdef Cell + gs->data->shape[2] = int((gs->data->uub[2] - gs->data->llb[2]) / DH + 0.4); +#else +#error Not define Vertex nor Cell +#endif +#endif + } + if (Symmetry > 1) + { + for (int i = 0; i < 2; i++) + { + DH = bp->getdX(i); + if (feq(bp->bbox[i], 0, DH / 2)) + { + gs->data->llb[i] = bp->bbox[i]; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1; +#else +#ifdef Cell + gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4); +#else +#error Not define Vertex nor Cell +#endif +#endif + } + } + } + } + + gs->data->Bg = BP->data; + gs->next = 0; + } + + if (BP == Pat->ble) + break; + BP = BP->next; + } + + return cgsl; +} +// collect all grid segments or blocks without ghost nor buffer for given patch, +// and delete the ghost_width for interpolation consideration on the patch boundary +MyList *Parallel::build_owned_gsl4(Patch *Pat, int rank_in, int Symmetry) +{ + MyList *cgsl = 0, *gs; + MyList *BP = Pat->blb; + while (BP) + { + Block *bp = BP->data; + if (bp->rank == rank_in) + { + if (!cgsl) + { + cgsl = gs = new MyList; + gs->data = new Parallel::gridseg; + } + else + { + gs->next = new MyList; + gs = gs->next; + gs->data = new Parallel::gridseg; + } + + for (int i = 0; i < dim; i++) + { + double DH = bp->getdX(i); + gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] - Pat->uui[i] * DH : bp->bbox[dim + i]; + gs->data->uub[i] -= ghost_width * DH; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + // NOTE: our dividing structure is (exclude ghost) + // -1 0 + // 1 2 + // so (0,1) does not belong to any part for vertex structure, we always put it to right part, this is consistent to + // the fortran routine where we always take floor to get index + gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] + Pat->lli[i] * DH : bp->bbox[i]; + gs->data->llb[i] += (ghost_width - 1) * DH; + gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1; +#else +#ifdef Cell + gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] + Pat->lli[i] * DH : bp->bbox[i]; + gs->data->llb[i] += ghost_width * DH; + gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4); +#else +#error Not define Vertex nor Cell +#endif +#endif + } + // Symmetry consideration + if (Symmetry > 0) + { + double DH = bp->getdX(2); + if (feq(bp->bbox[2], 0, DH / 2)) + { + gs->data->llb[2] = bp->bbox[2]; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + gs->data->shape[2] = int((gs->data->uub[2] - gs->data->llb[2]) / DH + 0.4) + 1; +#else +#ifdef Cell + gs->data->shape[2] = int((gs->data->uub[2] - gs->data->llb[2]) / DH + 0.4); +#else +#error Not define Vertex nor Cell +#endif +#endif + } + if (Symmetry > 1) + { + for (int i = 0; i < 2; i++) + { + DH = bp->getdX(i); + if (feq(bp->bbox[i], 0, DH / 2)) + { + gs->data->llb[i] = bp->bbox[i]; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1; +#else +#ifdef Cell + gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4); +#else +#error Not define Vertex nor Cell +#endif +#endif + } + } + } + } + + gs->data->Bg = BP->data; + gs->next = 0; + } + + if (BP == Pat->ble) + break; + BP = BP->next; + } + + return cgsl; +} +// collect all grid segments or blocks without ghost nor buffer for given patch, no extention +MyList *Parallel::build_owned_gsl5(Patch *Pat, int rank_in) +{ + MyList *cgsl = 0, *gs; + MyList *BP = Pat->blb; + while (BP) + { + Block *bp = BP->data; + if (bp->rank == rank_in) + { + if (!cgsl) + { + cgsl = gs = new MyList; + gs->data = new Parallel::gridseg; + } + else + { + gs->next = new MyList; + gs = gs->next; + gs->data = new Parallel::gridseg; + } + + for (int i = 0; i < dim; i++) + { + double DH = bp->getdX(i); + gs->data->uub[i] = (feq(bp->bbox[dim + i], Pat->bbox[dim + i], DH / 2)) ? bp->bbox[dim + i] - Pat->uui[i] * DH : bp->bbox[dim + i] - ghost_width * DH; + gs->data->llb[i] = (feq(bp->bbox[i], Pat->bbox[i], DH / 2)) ? bp->bbox[i] + Pat->lli[i] * DH : bp->bbox[i] + ghost_width * DH; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4) + 1; +#else +#ifdef Cell + gs->data->shape[i] = int((gs->data->uub[i] - gs->data->llb[i]) / DH + 0.4); +#else +#error Not define Vertex nor Cell +#endif +#endif + } + gs->data->Bg = BP->data; + gs->next = 0; + } + + if (BP == Pat->ble) + break; + BP = BP->next; + } + + return cgsl; +} +// collect all grid segments or blocks without ghost for given patch list +// stupid method +/* +MyList *Parallel::build_owned_gsl(MyList *PatL,int rank_in,int type,int Symmetry) +{ + MyList *cgsl=0,*gs; + while(PatL) + { + if(!cgsl) + { + switch(type) + { + case 0: + cgsl = build_owned_gsl0(PatL->data,rank_in); + break; + case 1: + cgsl = build_owned_gsl1(PatL->data,rank_in); + break; + case 2: + cgsl = build_owned_gsl2(PatL->data,rank_in); + break; + case 3: + cgsl = build_owned_gsl3(PatL->data,rank_in,Symmetry); + break; + case 4: + cgsl = build_owned_gsl4(PatL->data,rank_in,Symmetry); + break; + case 5: + cgsl = build_owned_gsl5(PatL->data,rank_in); + break; + default: + cout<<"Parallel::build_owned_gsl : unknown type = "<next) gs = gs->next; + } + else + { + switch(type) + { + case 0: + gs->next = build_owned_gsl0(PatL->data,rank_in); + break; + case 1: + gs->next = build_owned_gsl1(PatL->data,rank_in); + break; + case 2: + gs->next = build_owned_gsl2(PatL->data,rank_in); + break; + case 3: + gs->next = build_owned_gsl3(PatL->data,rank_in,Symmetry); + break; + case 4: + gs->next = build_owned_gsl4(PatL->data,rank_in,Symmetry); + break; + case 5: + gs->next = build_owned_gsl5(PatL->data,rank_in); + break; + default: + cout<<"Parallel::build_owned_gsl : unknown type = "<next) gs = gs->next; + } + PatL = PatL->next; + } + + return cgsl; +} +*/ +// more clever method +MyList *Parallel::build_owned_gsl(MyList *PatL, int rank_in, int type, int Symmetry) +{ + MyList *cgsl = 0, *gs; + while (PatL) + { + switch (type) + { + case 0: + gs = build_owned_gsl0(PatL->data, rank_in); + break; + case 1: + gs = build_owned_gsl1(PatL->data, rank_in); + break; + case 2: + gs = build_owned_gsl2(PatL->data, rank_in); + break; + case 3: + gs = build_owned_gsl3(PatL->data, rank_in, Symmetry); + break; + case 4: + gs = build_owned_gsl4(PatL->data, rank_in, Symmetry); + break; + case 5: + gs = build_owned_gsl5(PatL->data, rank_in); + break; + default: + cout << "Parallel::build_owned_gsl : unknown type = " << type << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + if (cgsl) + cgsl->catList(gs); + else + cgsl = gs; + PatL = PatL->next; + } + + return cgsl; +} +// according to overlape to determine real grid segments +void Parallel::build_gstl(MyList *srci, MyList *dsti, + MyList **out_src, MyList **out_dst) +{ + *out_src = *out_dst = 0; + + if (!srci || !dsti) + return; + + MyList *s, *d; + MyList *s2, *d2; + + double llb[dim], uub[dim]; + + s = srci; + while (s) + { + Parallel::gridseg *sd = s->data; + d = dsti; + while (d) + { + Parallel::gridseg *dd = d->data; + bool flag = true; + for (int i = 0; i < dim; i++) + { + double SH = sd->Bg->getdX(i), DH = dd->Bg->getdX(i); + llb[i] = Mymax(sd->llb[i], dd->llb[i]); + uub[i] = Mymin(sd->uub[i], dd->uub[i]); + // make sure the region boundary is consistent to the grids + // here we only judge if the domain is empty, so do not need to adjust the align + double lb = llb[i], ub = uub[i]; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + // ---*--- + // x-------x + // if (int(2*(sd->uub[i]-uub[i])/SH+0.4)%2 == 1) ub = uub[i]-SH/2; + // else if(int(2*(dd->uub[i]-uub[i])/DH+0.4)%2 == 1) ub = uub[i]-DH/2; + // if (int(2*(llb[i]-sd->llb[i])/SH+0.4)%2 == 1) lb = llb[i]+SH/2; + // else if(int(2*(llb[i]-dd->llb[i])/DH+0.4)%2 == 1) lb = llb[i]+DH/2; + if (lb > ub + Mymin(SH, DH) / 2) + { + flag = false; + break; + } // special for isolated point +#else +#ifdef Cell + // |------| + // |-------------| + // if (int(2*(sd->uub[i]-uub[i])/SH+0.4)%2 == 1) ub = uub[i]+SH/2; + // else if(int(2*(dd->uub[i]-uub[i])/DH+0.4)%2 == 1) ub = uub[i]+DH/2; + // |------| + // |-------------| + // if (int(2*(llb[i]-sd->llb[i])/SH+0.4)%2 == 1) lb = llb[i]-SH/2; + // else if(int(2*(llb[i]-dd->llb[i])/DH+0.4)%2 == 1) lb = llb[i]-DH/2; + if (ub - lb < Mymin(SH, DH) / 2) + { + flag = false; + break; + } // even for isolated point, it has a cell belong to it +#else +#error Not define Vertex nor Cell +#endif +#endif + } + + if (flag) + { + if (!(*out_src)) + { + *out_src = s2 = new MyList; + *out_dst = d2 = new MyList; + s2->data = new Parallel::gridseg; + d2->data = new Parallel::gridseg; + } + else + { + s2->next = new MyList; + s2 = s2->next; + d2->next = new MyList; + d2 = d2->next; + s2->data = new Parallel::gridseg; + d2->data = new Parallel::gridseg; + } + + for (int i = 0; i < dim; i++) + { + double SH = sd->Bg->getdX(i), DH = dd->Bg->getdX(i); + s2->data->llb[i] = d2->data->llb[i] = llb[i]; + s2->data->uub[i] = d2->data->uub[i] = uub[i]; +// using float method to count point, we do not need following consideration (2012 nov 17) +#if 1 + +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + // old code distuinguish vertex and cell + // if (int(2*(sd->uub[i]-uub[i])/SH+0.4)%2 == 1) s2->data->uub[i] = uub[i]-SH/2; + // else if(int(2*(dd->uub[i]-uub[i])/DH+0.4)%2 == 1) d2->data->uub[i] = uub[i]-DH/2; + // if (int(2*(llb[i]-sd->llb[i])/SH+0.4)%2 == 1) s2->data->llb[i] = llb[i]+SH/2; + // else if(int(2*(llb[i]-dd->llb[i])/DH+0.4)%2 == 1) d2->data->llb[i] = llb[i]+DH/2; + // new code: here we concern much more about missing point, because overlaping domain has been gaureented above + if (int(2 * (sd->uub[i] - uub[i]) / SH + 0.4) % 2 == 1) + s2->data->uub[i] = uub[i] + SH / 2; + else if (int(2 * (dd->uub[i] - uub[i]) / DH + 0.4) % 2 == 1) + d2->data->uub[i] = uub[i] + DH / 2; + if (int(2 * (llb[i] - sd->llb[i]) / SH + 0.4) % 2 == 1) + s2->data->llb[i] = llb[i] - SH / 2; + else if (int(2 * (llb[i] - dd->llb[i]) / DH + 0.4) % 2 == 1) + d2->data->llb[i] = llb[i] - DH / 2; + s2->data->shape[i] = int((s2->data->uub[i] - s2->data->llb[i]) / SH + 0.4) + 1; + d2->data->shape[i] = int((d2->data->uub[i] - d2->data->llb[i]) / DH + 0.4) + 1; +#else +#ifdef Cell + if (int(2 * (sd->uub[i] - uub[i]) / SH + 0.4) % 2 == 1) + s2->data->uub[i] = uub[i] + SH / 2; + else if (int(2 * (dd->uub[i] - uub[i]) / DH + 0.4) % 2 == 1) + d2->data->uub[i] = uub[i] + DH / 2; + if (int(2 * (llb[i] - sd->llb[i]) / SH + 0.4) % 2 == 1) + s2->data->llb[i] = llb[i] - SH / 2; + else if (int(2 * (llb[i] - dd->llb[i]) / DH + 0.4) % 2 == 1) + d2->data->llb[i] = llb[i] - DH / 2; + s2->data->shape[i] = int((s2->data->uub[i] - s2->data->llb[i]) / SH + 0.4); + d2->data->shape[i] = int((d2->data->uub[i] - d2->data->llb[i]) / DH + 0.4); +#else +#error Not define Vertex nor Cell +#endif +#endif + +#endif + s2->data->illb[i] = sd->illb[i]; + d2->data->illb[i] = dd->illb[i]; + s2->data->iuub[i] = sd->iuub[i]; + d2->data->iuub[i] = dd->iuub[i]; + } + s2->data->Bg = sd->Bg; + s2->next = 0; + d2->data->Bg = dd->Bg; + d2->next = 0; + } + d = d->next; + } + s = s->next; + } +} +// PACK: prepare target data in 'data' +// UNPACK: copy target data from 'data' to corresponding numerical grids +int Parallel::data_packer(double *data, MyList *src, MyList *dst, int rank_in, int dir, + MyList *VarLists /* source */, MyList *VarListd /* target */, int Symmetry) +{ + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + + int DIM = dim; + + if (dir != PACK && dir != UNPACK) + { + cout << "error dir " << dir << " for data_packer " << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + int size_out = 0; + + if (!src || !dst) + return size_out; + MyList *varls, *varld; const int state_count = cuda_state_var_count(VarLists, VarListd); @@ -4005,15 +4207,15 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data->Bg->lev == dst->data->Bg->lev) - type = 1; - else if (src->data->Bg->lev > dst->data->Bg->lev) - type = 2; - else - type = 3; - + + int type; /* 1 copy, 2 restrict, 3 prolong */ + if (src->data->Bg->lev == dst->data->Bg->lev) + type = 1; + else if (src->data->Bg->lev > dst->data->Bg->lev) + type = 2; + else + type = 3; + while (src && dst) { if ((dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) || @@ -4031,7 +4233,11 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data, dst->data, type)) { - handled_by_cuda = cuda_direct_pack_segment(data + size_out, src->data, dst->data, state_count); + if (s_cuda_aware_pack_active) { + handled_by_cuda = cuda_direct_pack_segment_to_device(data + size_out, src->data, dst->data, state_count); + } else { + handled_by_cuda = cuda_direct_pack_segment(data + size_out, src->data, dst->data, state_count); + } if (!handled_by_cuda) { cout << "Parallel::data_packer: CUDA direct pack failed." << endl; @@ -4041,7 +4247,11 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data, type)) { - handled_by_cuda = cuda_direct_unpack_segment(data + size_out, dst->data, state_count); + if (s_cuda_aware_pack_active) { + handled_by_cuda = cuda_direct_unpack_segment_from_device(data + size_out, dst->data, state_count); + } else { + handled_by_cuda = cuda_direct_unpack_segment(data + size_out, dst->data, state_count); + } if (!handled_by_cuda) { cout << "Parallel::data_packer: CUDA direct unpack failed." << endl; @@ -4050,26 +4260,34 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data->llb, dst->data->uub, dst->data->shape, data + size_out, - src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn], - dst->data->llb, dst->data->uub); - break; - case 2: - f_restrict3(DIM, dst->data->llb, dst->data->uub, dst->data->shape, data + size_out, - src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn], - dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry); - break; - case 3: - f_prolong3(DIM, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn], - dst->data->llb, dst->data->uub, dst->data->shape, data + size_out, - dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry); - } + f_copy(DIM, dst->data->llb, dst->data->uub, dst->data->shape, data + size_out, + src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn], + dst->data->llb, dst->data->uub); + break; + case 2: + f_restrict3(DIM, dst->data->llb, dst->data->uub, dst->data->shape, data + size_out, + src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn], + dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry); + break; + case 3: + f_prolong3(DIM, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn], + dst->data->llb, dst->data->uub, dst->data->shape, data + size_out, + dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry); + } if (dir == UNPACK) // from target data to corresponding grid f_copy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn], dst->data->llb, dst->data->uub, dst->data->shape, data + size_out, @@ -4096,563 +4314,583 @@ int Parallel::data_packer(double *data, MyList *src, MyList

next; src = src->next; - } - - return size_out; -} -int Parallel::data_packermix(double *data, MyList *src, MyList *dst, int rank_in, int dir, - MyList *VarLists /* source */, MyList *VarListd /* target */, int Symmetry) -{ - int myrank; - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - - int DIM = dim; - - if (dir != PACK && dir != UNPACK) - { - cout << "Parallel::data_packermix: error dir " << dir << " for data_packermix." << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - - int size_out = 0; - - if (!src || !dst) - return size_out; - - MyList *varls, *varld; - - varls = VarLists; - varld = VarListd; - while (varls && varld) - { - varls = varls->next; - varld = varld->next; - } - - if (varls || varld) - { - cout << "error in short data packer, var lists does not match." << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - - int type; /* 1 copy, 2 restrict, 3 prolong */ - if (src->data->Bg->lev == dst->data->Bg->lev) - type = 1; - else if (src->data->Bg->lev > dst->data->Bg->lev) - type = 2; - else - type = 3; - - if (type != 3) - { - cout << "Parallel::data_packermix: error type " << type << " for data_packermix." << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - - while (src && dst) - { - if ((dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) || - (dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank)) - { - varls = VarLists; - varld = VarListd; - while (varls && varld) - { - if (data) - { - if (dir == PACK) - f_prolongcopy3(DIM, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn], - dst->data->llb, dst->data->uub, src->data->shape, data + size_out, - src->data->llb, src->data->uub, varls->data->SoA, Symmetry); - if (dir == UNPACK) // from target data to corresponding grid - f_prolongmix3(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn], - src->data->llb, src->data->uub, src->data->shape, data + size_out, - dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry, dst->data->illb, dst->data->iuub); - } - // the symmetry problem should be dealt in prolongcopy3, - // so we always have ghost_width for both sides - size_out += (src->data->shape[0] + 2 * ghost_width) * (src->data->shape[1] + 2 * ghost_width) * (src->data->shape[2] + 2 * ghost_width); - varls = varls->next; - varld = varld->next; - } - } - dst = dst->next; - src = src->next; - } - - return size_out; -} -// -void Parallel::transfer(MyList **src, MyList **dst, - MyList *VarList1 /* source */, MyList *VarList2 /*target */, - int Symmetry) -{ - int myrank, cpusize; - MPI_Comm_size(MPI_COMM_WORLD, &cpusize); - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - - int node; - - MPI_Request *reqs = new MPI_Request[2 * cpusize]; - MPI_Status *stats = new MPI_Status[2 * cpusize]; - int *req_node = new int[2 * cpusize]; - int *req_is_recv = new int[2 * cpusize]; - int *completed = new int[2 * cpusize]; - int req_no = 0; - int pending_recv = 0; - - double **send_data = new double *[cpusize]; - double **rec_data = new double *[cpusize]; - int *send_lengths = new int[cpusize]; - int *recv_lengths = new int[cpusize]; - - for (node = 0; node < cpusize; node++) - { - send_data[node] = rec_data[node] = 0; - send_lengths[node] = recv_lengths[node] = 0; - } - - // Post receives first so peers can progress rendezvous early. - for (node = 0; node < cpusize; node++) - { - if (node == myrank) continue; - - recv_lengths[node] = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); - if (recv_lengths[node] > 0) - { - rec_data[node] = new double[recv_lengths[node]]; - if (!rec_data[node]) - { - cout << "out of memory when new in short transfer, place 1" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); - req_node[req_no] = node; - req_is_recv[req_no] = 1; - req_no++; - pending_recv++; - } - } - - // Local transfer on this rank. - recv_lengths[myrank] = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); - if (recv_lengths[myrank] > 0) - { - rec_data[myrank] = new double[recv_lengths[myrank]]; - if (!rec_data[myrank]) - { - cout << "out of memory when new in short transfer, place 2" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); - } - - // Pack and post sends. - for (node = 0; node < cpusize; node++) - { - if (node == myrank) continue; - - send_lengths[node] = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - if (send_lengths[node] > 0) - { - send_data[node] = new double[send_lengths[node]]; - if (!send_data[node]) - { - cout << "out of memory when new in short transfer, place 3" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - data_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); - req_node[req_no] = node; - req_is_recv[req_no] = 0; - req_no++; - } - } - - // Unpack as soon as receive completes to reduce pure wait time. - while (pending_recv > 0) - { - int outcount = 0; - MPI_Waitsome(req_no, reqs, &outcount, completed, stats); - if (outcount == MPI_UNDEFINED) break; - - for (int i = 0; i < outcount; i++) - { - int idx = completed[i]; - if (idx >= 0 && req_is_recv[idx]) - { - int recv_node = req_node[idx]; - data_packer(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry); - pending_recv--; - } - } - } - - if (req_no > 0) MPI_Waitall(req_no, reqs, stats); - - if (rec_data[myrank]) - data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); - - for (node = 0; node < cpusize; node++) - { - if (send_data[node]) - delete[] send_data[node]; - if (rec_data[node]) - delete[] rec_data[node]; - } - - delete[] reqs; - delete[] stats; - delete[] req_node; - delete[] req_is_recv; - delete[] completed; - delete[] send_data; - delete[] rec_data; - delete[] send_lengths; - delete[] recv_lengths; -} -// -void Parallel::transfermix(MyList **src, MyList **dst, - MyList *VarList1 /* source */, MyList *VarList2 /*target */, - int Symmetry) -{ - int myrank, cpusize; - MPI_Comm_size(MPI_COMM_WORLD, &cpusize); - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - - int node; - - MPI_Request *reqs = new MPI_Request[2 * cpusize]; - MPI_Status *stats = new MPI_Status[2 * cpusize]; - int *req_node = new int[2 * cpusize]; - int *req_is_recv = new int[2 * cpusize]; - int *completed = new int[2 * cpusize]; - int req_no = 0; - int pending_recv = 0; - - double **send_data = new double *[cpusize]; - double **rec_data = new double *[cpusize]; - int *send_lengths = new int[cpusize]; - int *recv_lengths = new int[cpusize]; - - for (node = 0; node < cpusize; node++) - { - send_data[node] = rec_data[node] = 0; - send_lengths[node] = recv_lengths[node] = 0; - } - - // Post receives first so peers can progress rendezvous early. - for (node = 0; node < cpusize; node++) - { - if (node == myrank) continue; - - recv_lengths[node] = data_packermix(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); - if (recv_lengths[node] > 0) - { - rec_data[node] = new double[recv_lengths[node]]; - if (!rec_data[node]) - { - cout << "out of memory when new in short transfer, place 1" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); - req_node[req_no] = node; - req_is_recv[req_no] = 1; - req_no++; - pending_recv++; - } - } - - // Local transfer on this rank. - recv_lengths[myrank] = data_packermix(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); - if (recv_lengths[myrank] > 0) - { - rec_data[myrank] = new double[recv_lengths[myrank]]; - if (!rec_data[myrank]) - { - cout << "out of memory when new in short transfer, place 2" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - data_packermix(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); - } - - // Pack and post sends. - for (node = 0; node < cpusize; node++) - { - if (node == myrank) continue; - - send_lengths[node] = data_packermix(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - if (send_lengths[node] > 0) - { - send_data[node] = new double[send_lengths[node]]; - if (!send_data[node]) - { - cout << "out of memory when new in short transfer, place 3" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - data_packermix(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); - req_node[req_no] = node; - req_is_recv[req_no] = 0; - req_no++; - } - } - - // Unpack as soon as receive completes to reduce pure wait time. - while (pending_recv > 0) - { - int outcount = 0; - MPI_Waitsome(req_no, reqs, &outcount, completed, stats); - if (outcount == MPI_UNDEFINED) break; - - for (int i = 0; i < outcount; i++) - { - int idx = completed[i]; - if (idx >= 0 && req_is_recv[idx]) - { - int recv_node = req_node[idx]; - data_packermix(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry); - pending_recv--; - } - } - } - - if (req_no > 0) MPI_Waitall(req_no, reqs, stats); - - if (rec_data[myrank]) - data_packermix(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); - - for (node = 0; node < cpusize; node++) - { - if (send_data[node]) - delete[] send_data[node]; - if (rec_data[node]) - delete[] rec_data[node]; - } - - delete[] reqs; - delete[] stats; - delete[] req_node; - delete[] req_is_recv; - delete[] completed; - delete[] send_data; - delete[] rec_data; - delete[] send_lengths; - delete[] recv_lengths; -} -void Parallel::Sync(Patch *Pat, MyList *VarList, int Symmetry) -{ - int cpusize; - MPI_Comm_size(MPI_COMM_WORLD, &cpusize); - - MyList *dst; - MyList **src, **transfer_src, **transfer_dst; - src = new MyList *[cpusize]; - transfer_src = new MyList *[cpusize]; - transfer_dst = new MyList *[cpusize]; - - dst = build_ghost_gsl(Pat); // ghost region only - for (int node = 0; node < cpusize; node++) - { - src[node] = build_owned_gsl0(Pat, node); // for the part without ghost points and do not extend - build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer_src[node], data locate on cpu#node; - // but for transfer_dst[node] the data may locate on any node - } - - transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry); - - if (dst) - dst->destroyList(); - for (int node = 0; node < cpusize; node++) - { - if (src[node]) - src[node]->destroyList(); - if (transfer_src[node]) - transfer_src[node]->destroyList(); - if (transfer_dst[node]) - transfer_dst[node]->destroyList(); - } - - delete[] src; - delete[] transfer_src; - delete[] transfer_dst; -} -void Parallel::Sync(MyList *PatL, MyList *VarList, int Symmetry) -{ - // Patch inner Synch - MyList *Pp = PatL; - while (Pp) - { - Sync(Pp->data, VarList, Symmetry); - Pp = Pp->next; - } - - // Patch inter Synch - int cpusize; - MPI_Comm_size(MPI_COMM_WORLD, &cpusize); - - MyList *dst; - MyList **src, **transfer_src, **transfer_dst; - src = new MyList *[cpusize]; - transfer_src = new MyList *[cpusize]; - transfer_dst = new MyList *[cpusize]; - - dst = build_buffer_gsl(PatL); // buffer region only - for (int node = 0; node < cpusize; node++) - { - src[node] = build_owned_gsl(PatL, node, 5, Symmetry); // for the part without ghost nor buffer points and do not extend - build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node - } - - transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry); - - if (dst) - dst->destroyList(); - for (int node = 0; node < cpusize; node++) - { - if (src[node]) - src[node]->destroyList(); - if (transfer_src[node]) - transfer_src[node]->destroyList(); - if (transfer_dst[node]) - transfer_dst[node]->destroyList(); - } - - delete[] src; - delete[] transfer_src; - delete[] transfer_dst; -} -// Merged Sync: collect all intra-patch and inter-patch grid segment lists, -// then issue a single transfer() call instead of N+1 separate ones. -void Parallel::Sync_merged(MyList *PatL, MyList *VarList, int Symmetry) -{ - int cpusize; - MPI_Comm_size(MPI_COMM_WORLD, &cpusize); - - MyList **combined_src = new MyList *[cpusize]; - MyList **combined_dst = new MyList *[cpusize]; - for (int node = 0; node < cpusize; node++) - combined_src[node] = combined_dst[node] = 0; - - // Phase A: Intra-patch ghost exchange segments - MyList *Pp = PatL; - while (Pp) - { - Patch *Pat = Pp->data; - MyList *dst_ghost = build_ghost_gsl(Pat); - - for (int node = 0; node < cpusize; node++) - { - MyList *src_owned = build_owned_gsl0(Pat, node); - MyList *tsrc = 0, *tdst = 0; - build_gstl(src_owned, dst_ghost, &tsrc, &tdst); - - if (tsrc) - { - if (combined_src[node]) - combined_src[node]->catList(tsrc); - else - combined_src[node] = tsrc; - } - if (tdst) - { - if (combined_dst[node]) - combined_dst[node]->catList(tdst); - else - combined_dst[node] = tdst; - } - - if (src_owned) - src_owned->destroyList(); - } - - if (dst_ghost) - dst_ghost->destroyList(); - - Pp = Pp->next; - } - - // Phase B: Inter-patch buffer exchange segments - MyList *dst_buffer = build_buffer_gsl(PatL); - for (int node = 0; node < cpusize; node++) - { - MyList *src_owned = build_owned_gsl(PatL, node, 5, Symmetry); - MyList *tsrc = 0, *tdst = 0; - build_gstl(src_owned, dst_buffer, &tsrc, &tdst); - - if (tsrc) - { - if (combined_src[node]) - combined_src[node]->catList(tsrc); - else - combined_src[node] = tsrc; - } - if (tdst) - { - if (combined_dst[node]) - combined_dst[node]->catList(tdst); - else - combined_dst[node] = tdst; - } - - if (src_owned) - src_owned->destroyList(); - } - if (dst_buffer) - dst_buffer->destroyList(); - - // Phase C: Single transfer - transfer(combined_src, combined_dst, VarList, VarList, Symmetry); - - // Phase D: Cleanup - for (int node = 0; node < cpusize; node++) - { - if (combined_src[node]) - combined_src[node]->destroyList(); - if (combined_dst[node]) - combined_dst[node]->destroyList(); - } - delete[] combined_src; - delete[] combined_dst; -} -// SyncCache constructor + } + + return size_out; +} +int Parallel::data_packermix(double *data, MyList *src, MyList *dst, int rank_in, int dir, + MyList *VarLists /* source */, MyList *VarListd /* target */, int Symmetry) +{ + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + + int DIM = dim; + + if (dir != PACK && dir != UNPACK) + { + cout << "Parallel::data_packermix: error dir " << dir << " for data_packermix." << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + int size_out = 0; + + if (!src || !dst) + return size_out; + + MyList *varls, *varld; + + varls = VarLists; + varld = VarListd; + while (varls && varld) + { + varls = varls->next; + varld = varld->next; + } + + if (varls || varld) + { + cout << "error in short data packer, var lists does not match." << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + int type; /* 1 copy, 2 restrict, 3 prolong */ + if (src->data->Bg->lev == dst->data->Bg->lev) + type = 1; + else if (src->data->Bg->lev > dst->data->Bg->lev) + type = 2; + else + type = 3; + + if (type != 3) + { + cout << "Parallel::data_packermix: error type " << type << " for data_packermix." << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + while (src && dst) + { + if ((dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) || + (dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank)) + { + varls = VarLists; + varld = VarListd; + while (varls && varld) + { + if (data) + { + if (dir == PACK) + f_prolongcopy3(DIM, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn], + dst->data->llb, dst->data->uub, src->data->shape, data + size_out, + src->data->llb, src->data->uub, varls->data->SoA, Symmetry); + if (dir == UNPACK) // from target data to corresponding grid + f_prolongmix3(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn], + src->data->llb, src->data->uub, src->data->shape, data + size_out, + dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry, dst->data->illb, dst->data->iuub); + } + // the symmetry problem should be dealt in prolongcopy3, + // so we always have ghost_width for both sides + size_out += (src->data->shape[0] + 2 * ghost_width) * (src->data->shape[1] + 2 * ghost_width) * (src->data->shape[2] + 2 * ghost_width); + varls = varls->next; + varld = varld->next; + } + } + dst = dst->next; + src = src->next; + } + + return size_out; +} +// +void Parallel::transfer(MyList **src, MyList **dst, + MyList *VarList1 /* source */, MyList *VarList2 /*target */, + int Symmetry) +{ + int myrank, cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + + int node; + + MPI_Request *reqs = new MPI_Request[2 * cpusize]; + MPI_Status *stats = new MPI_Status[2 * cpusize]; + int *req_node = new int[2 * cpusize]; + int *req_is_recv = new int[2 * cpusize]; + int *completed = new int[2 * cpusize]; + int req_no = 0; + int pending_recv = 0; + + double **send_data = new double *[cpusize]; + double **rec_data = new double *[cpusize]; + int *send_lengths = new int[cpusize]; + int *recv_lengths = new int[cpusize]; + + for (node = 0; node < cpusize; node++) + { + send_data[node] = rec_data[node] = 0; + send_lengths[node] = recv_lengths[node] = 0; + } + + // Post receives first so peers can progress rendezvous early. + for (node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + recv_lengths[node] = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); + if (recv_lengths[node] > 0) + { + rec_data[node] = new double[recv_lengths[node]]; + if (!rec_data[node]) + { + cout << "out of memory when new in short transfer, place 1" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); + req_node[req_no] = node; + req_is_recv[req_no] = 1; + req_no++; + pending_recv++; + } + } + + // Local transfer on this rank. + recv_lengths[myrank] = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + if (recv_lengths[myrank] > 0) + { + rec_data[myrank] = new double[recv_lengths[myrank]]; + if (!rec_data[myrank]) + { + cout << "out of memory when new in short transfer, place 2" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + } + + // Pack and post sends. + for (node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + send_lengths[node] = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + if (send_lengths[node] > 0) + { + send_data[node] = new double[send_lengths[node]]; + if (!send_data[node]) + { + cout << "out of memory when new in short transfer, place 3" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + data_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); + req_node[req_no] = node; + req_is_recv[req_no] = 0; + req_no++; + } + } + + // Unpack as soon as receive completes to reduce pure wait time. + while (pending_recv > 0) + { + int outcount = 0; + MPI_Waitsome(req_no, reqs, &outcount, completed, stats); + if (outcount == MPI_UNDEFINED) break; + + for (int i = 0; i < outcount; i++) + { + int idx = completed[i]; + if (idx >= 0 && req_is_recv[idx]) + { + int recv_node = req_node[idx]; + data_packer(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry); + pending_recv--; + } + } + } + + if (req_no > 0) MPI_Waitall(req_no, reqs, stats); + + if (rec_data[myrank]) + data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); + + for (node = 0; node < cpusize; node++) + { + if (send_data[node]) + delete[] send_data[node]; + if (rec_data[node]) + delete[] rec_data[node]; + } + + delete[] reqs; + delete[] stats; + delete[] req_node; + delete[] req_is_recv; + delete[] completed; + delete[] send_data; + delete[] rec_data; + delete[] send_lengths; + delete[] recv_lengths; +} +// +void Parallel::transfermix(MyList **src, MyList **dst, + MyList *VarList1 /* source */, MyList *VarList2 /*target */, + int Symmetry) +{ + int myrank, cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + + int node; + + MPI_Request *reqs = new MPI_Request[2 * cpusize]; + MPI_Status *stats = new MPI_Status[2 * cpusize]; + int *req_node = new int[2 * cpusize]; + int *req_is_recv = new int[2 * cpusize]; + int *completed = new int[2 * cpusize]; + int req_no = 0; + int pending_recv = 0; + + double **send_data = new double *[cpusize]; + double **rec_data = new double *[cpusize]; + int *send_lengths = new int[cpusize]; + int *recv_lengths = new int[cpusize]; + + for (node = 0; node < cpusize; node++) + { + send_data[node] = rec_data[node] = 0; + send_lengths[node] = recv_lengths[node] = 0; + } + + // Post receives first so peers can progress rendezvous early. + for (node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + recv_lengths[node] = data_packermix(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); + if (recv_lengths[node] > 0) + { + rec_data[node] = new double[recv_lengths[node]]; + if (!rec_data[node]) + { + cout << "out of memory when new in short transfer, place 1" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); + req_node[req_no] = node; + req_is_recv[req_no] = 1; + req_no++; + pending_recv++; + } + } + + // Local transfer on this rank. + recv_lengths[myrank] = data_packermix(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + if (recv_lengths[myrank] > 0) + { + rec_data[myrank] = new double[recv_lengths[myrank]]; + if (!rec_data[myrank]) + { + cout << "out of memory when new in short transfer, place 2" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + data_packermix(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + } + + // Pack and post sends. + for (node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + send_lengths[node] = data_packermix(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + if (send_lengths[node] > 0) + { + send_data[node] = new double[send_lengths[node]]; + if (!send_data[node]) + { + cout << "out of memory when new in short transfer, place 3" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + data_packermix(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); + req_node[req_no] = node; + req_is_recv[req_no] = 0; + req_no++; + } + } + + // Unpack as soon as receive completes to reduce pure wait time. + while (pending_recv > 0) + { + int outcount = 0; + MPI_Waitsome(req_no, reqs, &outcount, completed, stats); + if (outcount == MPI_UNDEFINED) break; + + for (int i = 0; i < outcount; i++) + { + int idx = completed[i]; + if (idx >= 0 && req_is_recv[idx]) + { + int recv_node = req_node[idx]; + data_packermix(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry); + pending_recv--; + } + } + } + + if (req_no > 0) MPI_Waitall(req_no, reqs, stats); + + if (rec_data[myrank]) + data_packermix(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); + + for (node = 0; node < cpusize; node++) + { + if (send_data[node]) + delete[] send_data[node]; + if (rec_data[node]) + delete[] rec_data[node]; + } + + delete[] reqs; + delete[] stats; + delete[] req_node; + delete[] req_is_recv; + delete[] completed; + delete[] send_data; + delete[] rec_data; + delete[] send_lengths; + delete[] recv_lengths; +} +void Parallel::Sync(Patch *Pat, MyList *VarList, int Symmetry) +{ + int cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); + + MyList *dst; + MyList **src, **transfer_src, **transfer_dst; + src = new MyList *[cpusize]; + transfer_src = new MyList *[cpusize]; + transfer_dst = new MyList *[cpusize]; + + dst = build_ghost_gsl(Pat); // ghost region only + for (int node = 0; node < cpusize; node++) + { + src[node] = build_owned_gsl0(Pat, node); // for the part without ghost points and do not extend + build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer_src[node], data locate on cpu#node; + // but for transfer_dst[node] the data may locate on any node + } + + transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry); + + if (dst) + dst->destroyList(); + for (int node = 0; node < cpusize; node++) + { + if (src[node]) + src[node]->destroyList(); + if (transfer_src[node]) + transfer_src[node]->destroyList(); + if (transfer_dst[node]) + transfer_dst[node]->destroyList(); + } + + delete[] src; + delete[] transfer_src; + delete[] transfer_dst; +} +void Parallel::Sync(MyList *PatL, MyList *VarList, int Symmetry) +{ + // Patch inner Synch + MyList *Pp = PatL; + while (Pp) + { + Sync(Pp->data, VarList, Symmetry); + Pp = Pp->next; + } + + // Patch inter Synch + int cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); + + MyList *dst; + MyList **src, **transfer_src, **transfer_dst; + src = new MyList *[cpusize]; + transfer_src = new MyList *[cpusize]; + transfer_dst = new MyList *[cpusize]; + + dst = build_buffer_gsl(PatL); // buffer region only + for (int node = 0; node < cpusize; node++) + { + src[node] = build_owned_gsl(PatL, node, 5, Symmetry); // for the part without ghost nor buffer points and do not extend + build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node + } + + transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry); + + if (dst) + dst->destroyList(); + for (int node = 0; node < cpusize; node++) + { + if (src[node]) + src[node]->destroyList(); + if (transfer_src[node]) + transfer_src[node]->destroyList(); + if (transfer_dst[node]) + transfer_dst[node]->destroyList(); + } + + delete[] src; + delete[] transfer_src; + delete[] transfer_dst; +} +// Merged Sync: collect all intra-patch and inter-patch grid segment lists, +// then issue a single transfer() call instead of N+1 separate ones. +void Parallel::Sync_merged(MyList *PatL, MyList *VarList, int Symmetry) +{ + int cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); + + MyList **combined_src = new MyList *[cpusize]; + MyList **combined_dst = new MyList *[cpusize]; + for (int node = 0; node < cpusize; node++) + combined_src[node] = combined_dst[node] = 0; + + // Phase A: Intra-patch ghost exchange segments + MyList *Pp = PatL; + while (Pp) + { + Patch *Pat = Pp->data; + MyList *dst_ghost = build_ghost_gsl(Pat); + + for (int node = 0; node < cpusize; node++) + { + MyList *src_owned = build_owned_gsl0(Pat, node); + MyList *tsrc = 0, *tdst = 0; + build_gstl(src_owned, dst_ghost, &tsrc, &tdst); + + if (tsrc) + { + if (combined_src[node]) + combined_src[node]->catList(tsrc); + else + combined_src[node] = tsrc; + } + if (tdst) + { + if (combined_dst[node]) + combined_dst[node]->catList(tdst); + else + combined_dst[node] = tdst; + } + + if (src_owned) + src_owned->destroyList(); + } + + if (dst_ghost) + dst_ghost->destroyList(); + + Pp = Pp->next; + } + + // Phase B: Inter-patch buffer exchange segments + MyList *dst_buffer = build_buffer_gsl(PatL); + for (int node = 0; node < cpusize; node++) + { + MyList *src_owned = build_owned_gsl(PatL, node, 5, Symmetry); + MyList *tsrc = 0, *tdst = 0; + build_gstl(src_owned, dst_buffer, &tsrc, &tdst); + + if (tsrc) + { + if (combined_src[node]) + combined_src[node]->catList(tsrc); + else + combined_src[node] = tsrc; + } + if (tdst) + { + if (combined_dst[node]) + combined_dst[node]->catList(tdst); + else + combined_dst[node] = tdst; + } + + if (src_owned) + src_owned->destroyList(); + } + if (dst_buffer) + dst_buffer->destroyList(); + + // Phase C: Single transfer + transfer(combined_src, combined_dst, VarList, VarList, Symmetry); + + // Phase D: Cleanup + for (int node = 0; node < cpusize; node++) + { + if (combined_src[node]) + combined_src[node]->destroyList(); + if (combined_dst[node]) + combined_dst[node]->destroyList(); + } + delete[] combined_src; + delete[] combined_dst; +} +// SyncCache constructor Parallel::SyncCache::SyncCache() : valid(false), cpusize(0), combined_src(0), combined_dst(0), send_lengths(0), recv_lengths(0), send_bufs(0), recv_bufs(0), send_buf_caps(0), recv_buf_caps(0), send_buf_pinned(0), recv_buf_pinned(0), + send_buf_is_dev(0), recv_buf_is_dev(0), + send_buf_caps_dev(0), recv_buf_caps_dev(0), + send_bufs_dev(0), recv_bufs_dev(0), reqs(0), stats(0), max_reqs(0), - lengths_valid(false), tc_req_node(0), tc_req_is_recv(0), tc_completed(0) + lengths_valid(false), tc_req_node(0), tc_req_is_recv(0), tc_completed(0), + cuda_aware_mode(false) { } -// SyncCache invalidate: free grid segment lists but keep buffers -void Parallel::SyncCache::invalidate() -{ - if (!valid) - return; - for (int i = 0; i < cpusize; i++) - { - if (combined_src[i]) - combined_src[i]->destroyList(); - if (combined_dst[i]) - combined_dst[i]->destroyList(); - combined_src[i] = combined_dst[i] = 0; - send_lengths[i] = recv_lengths[i] = 0; - } - valid = false; - lengths_valid = false; -} -// SyncCache destroy: free everything -void Parallel::SyncCache::destroy() -{ - invalidate(); - if (combined_src) delete[] combined_src; - if (combined_dst) delete[] combined_dst; +// SyncCache invalidate: free grid segment lists but keep buffers +void Parallel::SyncCache::invalidate() +{ + if (!valid) + return; + for (int i = 0; i < cpusize; i++) + { + if (combined_src[i]) + combined_src[i]->destroyList(); + if (combined_dst[i]) + combined_dst[i]->destroyList(); + combined_src[i] = combined_dst[i] = 0; + send_lengths[i] = recv_lengths[i] = 0; + } + valid = false; + lengths_valid = false; +} +// SyncCache destroy: free everything +void Parallel::SyncCache::destroy() +{ + invalidate(); + if (combined_src) delete[] combined_src; + if (combined_dst) delete[] combined_dst; if (send_lengths) delete[] send_lengths; if (recv_lengths) delete[] recv_lengths; if (send_buf_caps) delete[] send_buf_caps; if (recv_buf_caps) delete[] recv_buf_caps; for (int i = 0; i < cpusize; i++) { + if (send_bufs_dev && send_bufs_dev[i]) + { +#if USE_CUDA_BSSN || USE_CUDA_Z4C + free_device_comm_buffer(send_bufs_dev[i]); +#else + delete[] send_bufs_dev[i]; +#endif + } + if (recv_bufs_dev && recv_bufs_dev[i]) + { +#if USE_CUDA_BSSN || USE_CUDA_Z4C + free_device_comm_buffer(recv_bufs_dev[i]); +#else + delete[] recv_bufs_dev[i]; +#endif + } if (send_bufs && send_bufs[i]) { #if USE_CUDA_BSSN || USE_CUDA_Z4C @@ -4674,6 +4912,12 @@ void Parallel::SyncCache::destroy() if (recv_bufs) delete[] recv_bufs; if (send_buf_pinned) delete[] send_buf_pinned; if (recv_buf_pinned) delete[] recv_buf_pinned; + if (send_buf_is_dev) delete[] send_buf_is_dev; + if (recv_buf_is_dev) delete[] recv_buf_is_dev; + if (send_buf_caps_dev) delete[] send_buf_caps_dev; + if (recv_buf_caps_dev) delete[] recv_buf_caps_dev; + if (send_bufs_dev) delete[] send_bufs_dev; + if (recv_bufs_dev) delete[] recv_bufs_dev; if (reqs) delete[] reqs; if (stats) delete[] stats; if (tc_req_node) delete[] tc_req_node; @@ -4682,97 +4926,195 @@ void Parallel::SyncCache::destroy() combined_src = combined_dst = 0; send_lengths = recv_lengths = 0; send_buf_caps = recv_buf_caps = 0; + send_buf_caps_dev = recv_buf_caps_dev = 0; send_bufs = recv_bufs = 0; + send_bufs_dev = recv_bufs_dev = 0; send_buf_pinned = recv_buf_pinned = 0; + send_buf_is_dev = recv_buf_is_dev = 0; reqs = 0; stats = 0; - tc_req_node = 0; tc_req_is_recv = 0; tc_completed = 0; - cpusize = 0; max_reqs = 0; -} -// transfer_cached: reuse pre-allocated buffers from SyncCache + tc_req_node = 0; tc_req_is_recv = 0; tc_completed = 0; + cpusize = 0; max_reqs = 0; + cuda_aware_mode = false; +} +// transfer_cached: reuse pre-allocated buffers from SyncCache void Parallel::transfer_cached(MyList **src, MyList **dst, MyList *VarList1, MyList *VarList2, int Symmetry, SyncCache &cache) -{ - int myrank; - MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize); - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - int cpusize = cache.cpusize; - - int req_no = 0; - int pending_recv = 0; - int node; - int *req_node = cache.tc_req_node; - int *req_is_recv = cache.tc_req_is_recv; - int *completed = cache.tc_completed; - - // Post receives first so peers can progress rendezvous early. - for (node = 0; node < cpusize; node++) - { - if (node == myrank) continue; - - int rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); +{ + int myrank; + MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize); + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + int cpusize = cache.cpusize; + + int cuda_device_sends = 0; + int cuda_device_recvs = 0; + for (int n = 0; n < cpusize; n++) + cache.send_buf_is_dev[n] = cache.recv_buf_is_dev[n] = 0; +#if USE_CUDA_BSSN || USE_CUDA_Z4C + const int state_count = cuda_state_var_count(VarList1, VarList2); + if (state_count < 0) + { + cout << "Parallel::transfer_cached: variable lists do not match." << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + if (cuda_aware_mpi_enabled()) + { + for (int n = 0; n < cpusize; n++) + { + cache.send_buf_is_dev[n] = cuda_pack_to_device_eligible(src[myrank], dst[myrank], n, state_count, myrank) ? 1 : 0; + cache.recv_buf_is_dev[n] = cuda_recv_to_device_eligible(src[n], dst[n], n, state_count, myrank) ? 1 : 0; + } + cache.recv_buf_is_dev[myrank] = (cache.send_buf_is_dev[myrank] && cache.recv_buf_is_dev[myrank]) ? 1 : 0; + for (int n = 0; n < cpusize; n++) + { + cuda_device_sends += cache.send_buf_is_dev[n] ? 1 : 0; + cuda_device_recvs += cache.recv_buf_is_dev[n] ? 1 : 0; + } + if (cuda_mpi_diag_enabled()) + { + static int diag_reported = 0; + int rep = diag_reported; + if (myrank == 0 && rep < 10) + { + if (__sync_bool_compare_and_swap(&diag_reported, rep, rep + 1)) + fprintf(stderr, "[AMSS-CUDA-MPI][rank %d] transfer_cached: device_sends=%d " + "device_recvs=%d cuda_aware_mpi=%d\n", + myrank, cuda_device_sends, cuda_device_recvs, + cuda_aware_mpi_enabled() ? 1 : 0); + } + } + } + else + { + for (int n = 0; n < cpusize; n++) + cache.send_buf_is_dev[n] = cache.recv_buf_is_dev[n] = 0; + } + cache.cuda_aware_mode = (cuda_device_sends + cuda_device_recvs) > 0; +#endif + + int req_no = 0; + int pending_recv = 0; + int node; + int *req_node = cache.tc_req_node; + int *req_is_recv = cache.tc_req_is_recv; + int *completed = cache.tc_completed; + + // Post receives first so peers can progress rendezvous early. + for (node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + int rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); cache.recv_lengths[node] = rlength; if (rlength > 0) { - ensure_comm_buffer(cache.recv_bufs, cache.recv_buf_pinned, cache.recv_buf_caps, node, rlength); - MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); - req_node[req_no] = node; - req_is_recv[req_no] = 1; - req_no++; - pending_recv++; - } - } - - // Local transfer on this rank. - int self_len = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + if (cache.recv_buf_is_dev[node]) + { + ensure_device_comm_buffer(cache.recv_bufs_dev, cache.recv_buf_caps_dev, node, rlength); + MPI_Irecv((void *)cache.recv_bufs_dev[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); + } + else + { + ensure_comm_buffer(cache.recv_bufs, cache.recv_buf_pinned, cache.recv_buf_caps, node, rlength); + MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); + } + req_node[req_no] = node; + req_is_recv[req_no] = 1; + req_no++; + pending_recv++; + } + } + + // Local transfer on this rank. + int self_len = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); cache.recv_lengths[myrank] = self_len; if (self_len > 0) { - ensure_comm_buffer(cache.recv_bufs, cache.recv_buf_pinned, cache.recv_buf_caps, myrank, self_len); - data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + if (cache.recv_buf_is_dev[myrank]) + { + ensure_device_comm_buffer(cache.recv_bufs_dev, cache.recv_buf_caps_dev, myrank, self_len); + data_packer_with_device_buffer(cache.recv_bufs_dev[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + } + else + { + ensure_comm_buffer(cache.recv_bufs, cache.recv_buf_pinned, cache.recv_buf_caps, myrank, self_len); + data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + } } - - // Pack and post sends. - for (node = 0; node < cpusize; node++) - { - if (node == myrank) continue; - - int slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + + // Pack sends first. Device sends are posted after a single CUDA sync. + for (node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + int slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); cache.send_lengths[node] = slength; if (slength > 0) { - ensure_comm_buffer(cache.send_bufs, cache.send_buf_pinned, cache.send_buf_caps, node, slength); - data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); - req_node[req_no] = node; - req_is_recv[req_no] = 0; - req_no++; - } - } - - // Unpack as soon as receive completes to reduce pure wait time. - while (pending_recv > 0) - { - int outcount = 0; - MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats); - if (outcount == MPI_UNDEFINED) break; - - for (int i = 0; i < outcount; i++) - { - int idx = completed[i]; - if (idx >= 0 && req_is_recv[idx]) - { - int recv_node_i = req_node[idx]; - data_packer(cache.recv_bufs[recv_node_i], src[recv_node_i], dst[recv_node_i], recv_node_i, UNPACK, VarList1, VarList2, Symmetry); - pending_recv--; - } - } - } - - if (req_no > 0) MPI_Waitall(req_no, cache.reqs, cache.stats); + if (cache.send_buf_is_dev[node]) + { + ensure_device_comm_buffer(cache.send_bufs_dev, cache.send_buf_caps_dev, node, slength); + data_packer_with_device_buffer(cache.send_bufs_dev[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + } + else + { + ensure_comm_buffer(cache.send_bufs, cache.send_buf_pinned, cache.send_buf_caps, node, slength); + data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + } + } + } +#if USE_CUDA_BSSN || USE_CUDA_Z4C + if (cuda_device_sends > 0) + cudaDeviceSynchronize(); +#endif + + for (node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + const int slength = cache.send_lengths[node]; + if (slength > 0) + { + if (cache.send_buf_is_dev[node]) + MPI_Isend((void *)cache.send_bufs_dev[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); + else + MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); + req_node[req_no] = node; + req_is_recv[req_no] = 0; + req_no++; + } + } + + // Unpack as soon as receive completes to reduce pure wait time. + while (pending_recv > 0) + { + int outcount = 0; + MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats); + if (outcount == MPI_UNDEFINED) break; + + for (int i = 0; i < outcount; i++) + { + int idx = completed[i]; + if (idx >= 0 && req_is_recv[idx]) + { + int recv_node_i = req_node[idx]; + if (cache.recv_buf_is_dev[recv_node_i]) + data_packer_with_device_buffer(cache.recv_bufs_dev[recv_node_i], src[recv_node_i], dst[recv_node_i], recv_node_i, UNPACK, VarList1, VarList2, Symmetry); + else + data_packer(cache.recv_bufs[recv_node_i], src[recv_node_i], dst[recv_node_i], recv_node_i, UNPACK, VarList1, VarList2, Symmetry); + pending_recv--; + } + } + } + + if (req_no > 0) MPI_Waitall(req_no, cache.reqs, cache.stats); if (self_len > 0) - data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); + { + if (cache.recv_buf_is_dev[myrank]) + data_packer_with_device_buffer(cache.recv_bufs_dev[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); + else + data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); + } } void Parallel::Sync_ensure_cache(MyList *PatL, int Symmetry, SyncCache &cache) { @@ -4795,11 +5137,20 @@ void Parallel::Sync_ensure_cache(MyList *PatL, int Symmetry, SyncCache &c cache.recv_buf_caps = new int[cpusize]; cache.send_buf_pinned = new unsigned char[cpusize]; cache.recv_buf_pinned = new unsigned char[cpusize]; + cache.send_buf_is_dev = new unsigned char[cpusize]; + cache.recv_buf_is_dev = new unsigned char[cpusize]; + cache.send_buf_caps_dev = new int[cpusize]; + cache.recv_buf_caps_dev = new int[cpusize]; + cache.send_bufs_dev = new double *[cpusize]; + cache.recv_bufs_dev = new double *[cpusize]; for (int i = 0; i < cpusize; i++) { cache.send_bufs[i] = cache.recv_bufs[i] = 0; cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0; cache.send_buf_pinned[i] = cache.recv_buf_pinned[i] = 0; + cache.send_buf_is_dev[i] = cache.recv_buf_is_dev[i] = 0; + cache.send_buf_caps_dev[i] = cache.recv_buf_caps_dev[i] = 0; + cache.send_bufs_dev[i] = cache.recv_bufs_dev[i] = 0; } cache.max_reqs = 2 * cpusize; cache.reqs = new MPI_Request[cache.max_reqs]; @@ -4878,7 +5229,7 @@ void Parallel::Sync_cached(MyList *PatL, MyList *VarList, int Symmet // Use cached lists with buffer-reusing transfer transfer_cached(cache.combined_src, cache.combined_dst, VarList, VarList, Symmetry, cache); } -// Sync_start: pack and post MPI_Isend/Irecv, return immediately +// Sync_start: pack and post MPI_Isend/Irecv, return immediately void Parallel::Sync_start(MyList *PatL, MyList *VarList, int Symmetry, SyncCache &cache, AsyncSyncState &state) { @@ -4887,19 +5238,60 @@ void Parallel::Sync_start(MyList *PatL, MyList *VarList, int Symmetr // Now pack and post async MPI operations int myrank; - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - int cpusize = cache.cpusize; - state.req_no = 0; - state.active = true; - state.pending_recv = 0; - // Allocate tracking arrays - delete[] state.req_node; delete[] state.req_is_recv; - state.req_node = new int[cache.max_reqs]; - state.req_is_recv = new int[cache.max_reqs]; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + int cpusize = cache.cpusize; + state.req_no = 0; + state.active = true; + state.pending_recv = 0; + // Allocate tracking arrays + delete[] state.req_node; delete[] state.req_is_recv; + state.req_node = new int[cache.max_reqs]; + state.req_is_recv = new int[cache.max_reqs]; MyList **src = cache.combined_src; MyList **dst = cache.combined_dst; + int cuda_device_sends = 0; + int cuda_device_recvs = 0; + for (int n = 0; n < cpusize; n++) + cache.send_buf_is_dev[n] = cache.recv_buf_is_dev[n] = 0; +#if USE_CUDA_BSSN || USE_CUDA_Z4C + const int state_count = cuda_state_var_count(VarList, VarList); + if (state_count < 0) + { + cout << "Parallel::Sync_start: variable lists do not match." << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + if (cuda_aware_mpi_enabled()) + { + for (int n = 0; n < cpusize; n++) + { + cache.send_buf_is_dev[n] = cuda_pack_to_device_eligible(src[myrank], dst[myrank], n, state_count, myrank) ? 1 : 0; + cache.recv_buf_is_dev[n] = cuda_recv_to_device_eligible(src[n], dst[n], n, state_count, myrank) ? 1 : 0; + } + cache.recv_buf_is_dev[myrank] = (cache.send_buf_is_dev[myrank] && cache.recv_buf_is_dev[myrank]) ? 1 : 0; + for (int n = 0; n < cpusize; n++) + { + cuda_device_sends += cache.send_buf_is_dev[n] ? 1 : 0; + cuda_device_recvs += cache.recv_buf_is_dev[n] ? 1 : 0; + } + if (cuda_mpi_diag_enabled()) + { + static int diag_reported = 0; + int rep = diag_reported; + if (myrank == 0 && rep < 20) + { + if (__sync_bool_compare_and_swap(&diag_reported, rep, rep + 1)) + fprintf(stderr, "[AMSS-CUDA-MPI][rank %d] Sync_start: device_sends=%d " + "device_recvs=%d cuda_aware_mpi=%d\n", + myrank, cuda_device_sends, cuda_device_recvs, + cuda_aware_mpi_enabled() ? 1 : 0); + } + } + } + cache.cuda_aware_mode = (cuda_device_sends + cuda_device_recvs) > 0; +#endif + for (int node = 0; node < cpusize; node++) { if (node == myrank) @@ -4913,11 +5305,22 @@ void Parallel::Sync_start(MyList *PatL, MyList *VarList, int Symmetr } if (rlength > 0) { - ensure_comm_buffer(cache.recv_bufs, cache.recv_buf_pinned, cache.recv_buf_caps, node, rlength); - state.req_node[state.req_no] = node; - state.req_is_recv[state.req_no] = 1; - state.pending_recv++; - MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++); + if (cache.recv_buf_is_dev[node]) + { + ensure_device_comm_buffer(cache.recv_bufs_dev, cache.recv_buf_caps_dev, node, rlength); + state.req_node[state.req_no] = node; + state.req_is_recv[state.req_no] = 1; + state.pending_recv++; + MPI_Irecv((void *)cache.recv_bufs_dev[node], rlength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++); + } + else + { + ensure_comm_buffer(cache.recv_bufs, cache.recv_buf_pinned, cache.recv_buf_caps, node, rlength); + state.req_node[state.req_no] = node; + state.req_is_recv[state.req_no] = 1; + state.pending_recv++; + MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++); + } } } @@ -4926,37 +5329,70 @@ void Parallel::Sync_start(MyList *PatL, MyList *VarList, int Symmetr if (node == myrank) { int length; - if (!cache.lengths_valid) { - length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry); - cache.recv_lengths[node] = length; - } else { - length = cache.recv_lengths[node]; + if (!cache.lengths_valid) { + length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry); + cache.recv_lengths[node] = length; + } else { + length = cache.recv_lengths[node]; } if (length > 0) { - ensure_comm_buffer(cache.recv_bufs, cache.recv_buf_pinned, cache.recv_buf_caps, node, length); - data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry); + if (cache.recv_buf_is_dev[node]) + { + ensure_device_comm_buffer(cache.recv_bufs_dev, cache.recv_buf_caps_dev, node, length); + data_packer_with_device_buffer(cache.recv_bufs_dev[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry); + } + else + { + ensure_comm_buffer(cache.recv_bufs, cache.recv_buf_pinned, cache.recv_buf_caps, node, length); + data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry); + } } } else { - int slength; - if (!cache.lengths_valid) { - slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry); - cache.send_lengths[node] = slength; - } else { - slength = cache.send_lengths[node]; + int slength; + if (!cache.lengths_valid) { + slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry); + cache.send_lengths[node] = slength; + } else { + slength = cache.send_lengths[node]; } if (slength > 0) { - ensure_comm_buffer(cache.send_bufs, cache.send_buf_pinned, cache.send_buf_caps, node, slength); - data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry); - state.req_node[state.req_no] = node; - state.req_is_recv[state.req_no] = 0; - MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++); + if (cache.send_buf_is_dev[node]) + { + ensure_device_comm_buffer(cache.send_bufs_dev, cache.send_buf_caps_dev, node, slength); + data_packer_with_device_buffer(cache.send_bufs_dev[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry); + } + else + { + ensure_comm_buffer(cache.send_bufs, cache.send_buf_pinned, cache.send_buf_caps, node, slength); + data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry); + } } } } +#if USE_CUDA_BSSN || USE_CUDA_Z4C + if (cuda_device_sends > 0) + cudaDeviceSynchronize(); +#endif + + for (int node = 0; node < cpusize; node++) + { + if (node == myrank) + continue; + const int slength = cache.send_lengths[node]; + if (slength > 0) + { + state.req_node[state.req_no] = node; + state.req_is_recv[state.req_no] = 0; + if (cache.send_buf_is_dev[node]) + MPI_Isend((void *)cache.send_bufs_dev[node], slength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++); + else + MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++); + } + } cache.lengths_valid = true; if (sync_profile_enabled()) { @@ -4969,9 +5405,9 @@ void Parallel::Sync_start(MyList *PatL, MyList *VarList, int Symmetr void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state, MyList *VarList, int Symmetry) { - if (!state.active) - return; - + if (!state.active) + return; + int myrank; MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MyList **src = cache.combined_src; @@ -4980,10 +5416,18 @@ void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state, double wait_sec = 0.0; // Unpack local data first (no MPI needed) - if (cache.recv_bufs[myrank] && cache.recv_lengths[myrank] > 0) - data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList, VarList, Symmetry); - - // Progressive unpack of remote receives + if (cache.recv_buf_is_dev[myrank]) + { + if (cache.recv_bufs_dev[myrank] && cache.recv_lengths[myrank] > 0) + data_packer_with_device_buffer(cache.recv_bufs_dev[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList, VarList, Symmetry); + } + else + { + if (cache.recv_bufs[myrank] && cache.recv_lengths[myrank] > 0) + data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList, VarList, Symmetry); + } + + // Progressive unpack of remote receives if (state.pending_recv > 0 && state.req_no > 0) { int pending = state.pending_recv; @@ -5001,7 +5445,10 @@ void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state, if (idx >= 0 && state.req_is_recv[idx]) { int recv_node = state.req_node[idx]; - data_packer(cache.recv_bufs[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList, VarList, Symmetry); + if (cache.recv_buf_is_dev[recv_node]) + data_packer_with_device_buffer(cache.recv_bufs_dev[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList, VarList, Symmetry); + else + data_packer(cache.recv_bufs[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList, VarList, Symmetry); pending--; } } @@ -5029,489 +5476,489 @@ void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state, sync_profile_maybe_log(); } } -// collect buffer grid segments or blocks for the periodic boundary condition of given patch -// --------------------------------------------------- -// |con | |con | -// |ner | PhysBD |ner | -// |-------------------------------------------------| -// | | | | -// |Phy | |Phy | -// |sBD | |BD | -// | | | | -// | | | | -// | | | | -// |-------------------------------------------------| -// |con | PhysBD |con | -// |ner | |ner | -// --------------------------------------------------- -// first order derivetive does not need conner information, -// but second order derivative needs! -/* the following code does not include conner part -MyList *Parallel::build_PhysBD_gsl(Patch *Pat) -{ - MyList *cgsl,*gsc,*gsb=0,*p; - gsc = build_ghost_gsl(Pat); - for(int i=0;idata->Bg->getdX(i); -// lower boundary - if(gsb) - { - p = new MyList; - p->data = new Parallel::gridseg; - p->next=gsb; - gsb=p; - } - else - { - gsb = new MyList; - gsb->data = new Parallel::gridseg; - gsb->next=0; - } - for(int j=0;jdata->llb[i] = Pat->bbox[i]-ghost_width*DH; - gsb->data->uub[i] = Pat->bbox[i]-DH; -#else -#ifdef Cell - gsb->data->llb[i] = Pat->bbox[i]-ghost_width*DH; - gsb->data->uub[i] = Pat->bbox[i]; -#else -#error Not define Vertex nor Cell -#endif -#endif - gsb->data->shape[i] = ghost_width; - } - else - { - gsb->data->llb[j] = Pat->bbox[j]; - gsb->data->uub[j] = Pat->bbox[j+dim]; - gsb->data->shape[j] = Pat->shape[j]; - } - } - gsb->data->Bg = 0; //vertual grid segment -// upper boundary - p = new MyList; - p->data = new Parallel::gridseg; - p->next=gsb; - gsb=p; - for(int j=0;jdata->llb[i] = Pat->bbox[i+dim]+DH; - gsb->data->uub[i] = Pat->bbox[i+dim]+ghost_width*DH; -#else -#ifdef Cell - gsb->data->llb[i] = Pat->bbox[i+dim]; - gsb->data->uub[i] = Pat->bbox[i+dim]+ghost_width*DH; -#else -#error Not define Vertex nor Cell -#endif -#endif - gsb->data->shape[i] = ghost_width; - } - else - { - gsb->data->llb[j] = Pat->bbox[j]; - gsb->data->uub[j] = Pat->bbox[j+dim]; - gsb->data->shape[j] = Pat->shape[j]; - } - } - gsb->data->Bg = 0; //vertual grid segment - } - - cgsl = gsl_and(gsc,gsb); - - gsc->destroyList(); - gsb->destroyList(); - - return cgsl; -} -*/ -// the following code includes conner part -MyList *Parallel::build_PhysBD_gsl(Patch *Pat) -{ - MyList *cgsl, *gsc, *gsb = 0, *p; - - gsc = build_complete_gsl(Pat); - - gsb = new MyList; - gsb->data = new Parallel::gridseg; - gsb->next = 0; - gsb->data->Bg = 0; - - for (int j = 0; j < dim; j++) - { - gsb->data->llb[j] = Pat->bbox[j]; - gsb->data->uub[j] = Pat->bbox[j + dim]; - gsb->data->shape[j] = Pat->shape[j]; - } - - p = gsl_subtract(gsc, gsb); - - gsc->destroyList(); - gsb->destroyList(); - - cgsl = divide_gsl(p, Pat); - - p->destroyList(); - - return cgsl; -} -MyList *Parallel::divide_gsl(MyList *p, Patch *Pat) -{ - MyList *cgsl = 0; - while (p) - { - if (cgsl) - cgsl->catList(divide_gs(p, Pat)); - else - cgsl = divide_gs(p, Pat); - p = p->next; - } - - return cgsl; -} -// divide the gs into pices which locate either totally outside of the given Patch coordinate range -// or totally inside it. It's usefull for periodic boundary condition -MyList *Parallel::divide_gs(MyList *p, Patch *Pat) -{ - double DH[dim]; - for (int i = 0; i < dim; i++) - { - DH[i] = p->data->Bg->getdX(i); - } - - int num[dim]; - double llb[3][dim], uub[3][dim]; - for (int i = 0; i < dim; i++) - { - if (p->data->llb[i] < Pat->bbox[i] - DH[i] / 2) - { - if (p->data->uub[i] > Pat->bbox[i + dim] + DH[i] / 2) - { - num[i] = 3; - llb[0][i] = p->data->llb[i]; - llb[1][i] = Pat->bbox[i]; - uub[1][i] = Pat->bbox[i + dim]; - uub[2][i] = p->data->uub[i]; -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - uub[0][i] = Pat->bbox[i] - DH[i]; - llb[2][i] = Pat->bbox[i + dim] + DH[i]; -#else -#ifdef Cell - uub[0][i] = Pat->bbox[i]; - llb[2][i] = Pat->bbox[i + dim]; -#else -#error Not define Vertex nor Cell -#endif -#endif - } - else if (p->data->uub[i] > Pat->bbox[i] + DH[i] / 2) - { - num[i] = 2; - llb[0][i] = p->data->llb[i]; - llb[1][i] = Pat->bbox[i]; - uub[1][i] = p->data->uub[i]; -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - uub[0][i] = Pat->bbox[i] - DH[i]; -#else -#ifdef Cell - uub[0][i] = Pat->bbox[i]; -#else -#error Not define Vertex nor Cell -#endif -#endif - } - else - { - num[i] = 1; - llb[0][i] = p->data->llb[i]; - uub[0][i] = p->data->uub[i]; - } - } - else if (p->data->llb[i] < Pat->bbox[i + dim] - DH[i] / 2) - { - if (p->data->uub[i] > Pat->bbox[i + dim] + DH[i] / 2) - { - num[i] = 2; - llb[0][i] = p->data->llb[i]; - uub[0][i] = Pat->bbox[i + dim]; - uub[1][i] = p->data->uub[i]; -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - llb[1][i] = Pat->bbox[i + dim] + DH[i]; -#else -#ifdef Cell - llb[1][i] = Pat->bbox[i + dim]; -#else -#error Not define Vertex nor Cell -#endif -#endif - } - else - { - num[i] = 1; - llb[0][i] = p->data->llb[i]; - uub[0][i] = p->data->uub[i]; - } - } - else - { - num[i] = 1; - llb[0][i] = p->data->llb[i]; - uub[0][i] = p->data->uub[i]; - } - } - MyList *cgsl = 0, *gg; - int NN = 1; - for (int i = 0; i < dim; i++) - NN = NN * num[i]; - - for (int i = 0; i < NN; i++) - { - int ind[dim]; - getarrayindex(dim, num, ind, i); - gg = clone_gsl(p, true); - for (int k = 0; k < dim; k++) - { - gg->data->llb[k] = llb[ind[k]][k]; - gg->data->uub[k] = uub[ind[k]][k]; -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - gg->data->shape[k] = int((uub[ind[k]][k] - llb[ind[k]][k]) / DH[k] + 0.4) + 1; -#else -#ifdef Cell - gg->data->shape[k] = int((uub[ind[k]][k] - llb[ind[k]][k]) / DH[k] + 0.4); -#else -#error Not define Vertex nor Cell -#endif -#endif - } - - if (cgsl) - cgsl->catList(gg); - else - cgsl = gg; - } - - return cgsl; -} -// after mod operation, according to overlape to determine real grid segments -void Parallel::build_PhysBD_gstl(Patch *Pat, MyList *srci, MyList *dsti, - MyList **out_src, MyList **out_dst) -{ - *out_src = *out_dst = 0; - - if (!srci || !dsti) - return; - - MyList *s, *d; - MyList *s2, *d2; - - double llb[dim], uub[dim]; - - s = srci; - while (s) - { - Parallel::gridseg *sd = s->data; - d = dsti; - while (d) - { - Parallel::gridseg *dd = d->data; - bool flag = true; - for (int i = 0; i < dim; i++) - { - double SH = sd->Bg->getdX(i), DH = dd->Bg->getdX(i); - if (!feq(SH, DH, SH / 2)) - { - cout << "Parallel::build_PhysBD_gstl meets different grid space SH = " << SH << ", DH = " << DH << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - // we assume dst and src locate on the same Patch - if (dd->llb[i] < Pat->bbox[i]) - llb[i] = Mymax(sd->llb[i], dd->llb[i] + Pat->bbox[dim + i] - Pat->bbox[i]); - else if (dd->llb[i] > Pat->bbox[i + dim]) - llb[i] = Mymax(sd->llb[i], dd->llb[i] - Pat->bbox[dim + i] + Pat->bbox[i]); - else - llb[i] = Mymax(sd->llb[i], dd->llb[i]); - - if (dd->uub[i] < Pat->bbox[i]) - uub[i] = Mymin(sd->uub[i], dd->uub[i] + Pat->bbox[dim + i] - Pat->bbox[i]); - else if (dd->uub[i] > Pat->bbox[dim + i]) - uub[i] = Mymin(sd->uub[i], dd->uub[i] - Pat->bbox[dim + i] + Pat->bbox[i]); - else - uub[i] = Mymin(sd->uub[i], dd->uub[i]); -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - if (llb[i] > uub[i] + SH / 2) - { - flag = false; - break; - } // special for isolated point -#else -#ifdef Cell - if (llb[i] > uub[i]) - { - flag = false; - break; - } -#else -#error Not define Vertex nor Cell -#endif -#endif - } - - if (flag) - { - if (!(*out_src)) - { - *out_src = s2 = new MyList; - *out_dst = d2 = new MyList; - s2->data = new Parallel::gridseg; - d2->data = new Parallel::gridseg; - } - else - { - s2->next = new MyList; - s2 = s2->next; - d2->next = new MyList; - d2 = d2->next; - s2->data = new Parallel::gridseg; - d2->data = new Parallel::gridseg; - } - - for (int i = 0; i < dim; i++) - { - double SH = sd->Bg->getdX(i), DH = dd->Bg->getdX(i); - s2->data->llb[i] = llb[i]; - s2->data->uub[i] = uub[i]; - - if (dd->llb[i] < Pat->bbox[i]) - d2->data->llb[i] = llb[i] - Pat->bbox[dim + i] + Pat->bbox[i]; - else if (dd->llb[i] > Pat->bbox[i + dim]) - d2->data->llb[i] = llb[i] + Pat->bbox[dim + i] - Pat->bbox[i]; - else - d2->data->llb[i] = llb[i]; - - if (dd->uub[i] < Pat->bbox[i]) - d2->data->uub[i] = uub[i] - Pat->bbox[dim + i] + Pat->bbox[i]; - else if (dd->uub[i] > Pat->bbox[dim + i]) - d2->data->uub[i] = uub[i] + Pat->bbox[dim + i] - Pat->bbox[i]; - else - d2->data->uub[i] = uub[i]; -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - s2->data->shape[i] = int((s2->data->uub[i] - s2->data->llb[i]) / SH + 0.4) + 1; - d2->data->shape[i] = int((d2->data->uub[i] - d2->data->llb[i]) / DH + 0.4) + 1; -#else -#ifdef Cell - s2->data->shape[i] = int((s2->data->uub[i] - s2->data->llb[i]) / SH + 0.4); - d2->data->shape[i] = int((d2->data->uub[i] - d2->data->llb[i]) / DH + 0.4); -#else -#error Not define Vertex nor Cell -#endif -#endif - } - s2->data->Bg = sd->Bg; - s2->next = 0; - d2->data->Bg = dd->Bg; - d2->next = 0; - } - d = d->next; - } - s = s->next; - } -} -void Parallel::PeriodicBD(Patch *Pat, MyList *VarList, int Symmetry) -{ - int cpusize; - MPI_Comm_size(MPI_COMM_WORLD, &cpusize); - - MyList *dst; - MyList **src, **transfer_src, **transfer_dst; - src = new MyList *[cpusize]; - transfer_src = new MyList *[cpusize]; - transfer_dst = new MyList *[cpusize]; - - dst = build_PhysBD_gsl(Pat); - for (int node = 0; node < cpusize; node++) - { - src[node] = build_owned_gsl0(Pat, node); // for the part without ghost points and do not extend - build_PhysBD_gstl(Pat, src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node - } - - transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry); - - if (dst) - dst->destroyList(); - for (int node = 0; node < cpusize; node++) - { - if (src[node]) - src[node]->destroyList(); - if (transfer_src[node]) - transfer_src[node]->destroyList(); - if (transfer_dst[node]) - transfer_dst[node]->destroyList(); - } - - delete[] src; - delete[] transfer_src; - delete[] transfer_dst; -} +// collect buffer grid segments or blocks for the periodic boundary condition of given patch +// --------------------------------------------------- +// |con | |con | +// |ner | PhysBD |ner | +// |-------------------------------------------------| +// | | | | +// |Phy | |Phy | +// |sBD | |BD | +// | | | | +// | | | | +// | | | | +// |-------------------------------------------------| +// |con | PhysBD |con | +// |ner | |ner | +// --------------------------------------------------- +// first order derivetive does not need conner information, +// but second order derivative needs! +/* the following code does not include conner part +MyList *Parallel::build_PhysBD_gsl(Patch *Pat) +{ + MyList *cgsl,*gsc,*gsb=0,*p; + gsc = build_ghost_gsl(Pat); + for(int i=0;idata->Bg->getdX(i); +// lower boundary + if(gsb) + { + p = new MyList; + p->data = new Parallel::gridseg; + p->next=gsb; + gsb=p; + } + else + { + gsb = new MyList; + gsb->data = new Parallel::gridseg; + gsb->next=0; + } + for(int j=0;jdata->llb[i] = Pat->bbox[i]-ghost_width*DH; + gsb->data->uub[i] = Pat->bbox[i]-DH; +#else +#ifdef Cell + gsb->data->llb[i] = Pat->bbox[i]-ghost_width*DH; + gsb->data->uub[i] = Pat->bbox[i]; +#else +#error Not define Vertex nor Cell +#endif +#endif + gsb->data->shape[i] = ghost_width; + } + else + { + gsb->data->llb[j] = Pat->bbox[j]; + gsb->data->uub[j] = Pat->bbox[j+dim]; + gsb->data->shape[j] = Pat->shape[j]; + } + } + gsb->data->Bg = 0; //vertual grid segment +// upper boundary + p = new MyList; + p->data = new Parallel::gridseg; + p->next=gsb; + gsb=p; + for(int j=0;jdata->llb[i] = Pat->bbox[i+dim]+DH; + gsb->data->uub[i] = Pat->bbox[i+dim]+ghost_width*DH; +#else +#ifdef Cell + gsb->data->llb[i] = Pat->bbox[i+dim]; + gsb->data->uub[i] = Pat->bbox[i+dim]+ghost_width*DH; +#else +#error Not define Vertex nor Cell +#endif +#endif + gsb->data->shape[i] = ghost_width; + } + else + { + gsb->data->llb[j] = Pat->bbox[j]; + gsb->data->uub[j] = Pat->bbox[j+dim]; + gsb->data->shape[j] = Pat->shape[j]; + } + } + gsb->data->Bg = 0; //vertual grid segment + } + + cgsl = gsl_and(gsc,gsb); + + gsc->destroyList(); + gsb->destroyList(); + + return cgsl; +} +*/ +// the following code includes conner part +MyList *Parallel::build_PhysBD_gsl(Patch *Pat) +{ + MyList *cgsl, *gsc, *gsb = 0, *p; + + gsc = build_complete_gsl(Pat); + + gsb = new MyList; + gsb->data = new Parallel::gridseg; + gsb->next = 0; + gsb->data->Bg = 0; + + for (int j = 0; j < dim; j++) + { + gsb->data->llb[j] = Pat->bbox[j]; + gsb->data->uub[j] = Pat->bbox[j + dim]; + gsb->data->shape[j] = Pat->shape[j]; + } + + p = gsl_subtract(gsc, gsb); + + gsc->destroyList(); + gsb->destroyList(); + + cgsl = divide_gsl(p, Pat); + + p->destroyList(); + + return cgsl; +} +MyList *Parallel::divide_gsl(MyList *p, Patch *Pat) +{ + MyList *cgsl = 0; + while (p) + { + if (cgsl) + cgsl->catList(divide_gs(p, Pat)); + else + cgsl = divide_gs(p, Pat); + p = p->next; + } + + return cgsl; +} +// divide the gs into pices which locate either totally outside of the given Patch coordinate range +// or totally inside it. It's usefull for periodic boundary condition +MyList *Parallel::divide_gs(MyList *p, Patch *Pat) +{ + double DH[dim]; + for (int i = 0; i < dim; i++) + { + DH[i] = p->data->Bg->getdX(i); + } + + int num[dim]; + double llb[3][dim], uub[3][dim]; + for (int i = 0; i < dim; i++) + { + if (p->data->llb[i] < Pat->bbox[i] - DH[i] / 2) + { + if (p->data->uub[i] > Pat->bbox[i + dim] + DH[i] / 2) + { + num[i] = 3; + llb[0][i] = p->data->llb[i]; + llb[1][i] = Pat->bbox[i]; + uub[1][i] = Pat->bbox[i + dim]; + uub[2][i] = p->data->uub[i]; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + uub[0][i] = Pat->bbox[i] - DH[i]; + llb[2][i] = Pat->bbox[i + dim] + DH[i]; +#else +#ifdef Cell + uub[0][i] = Pat->bbox[i]; + llb[2][i] = Pat->bbox[i + dim]; +#else +#error Not define Vertex nor Cell +#endif +#endif + } + else if (p->data->uub[i] > Pat->bbox[i] + DH[i] / 2) + { + num[i] = 2; + llb[0][i] = p->data->llb[i]; + llb[1][i] = Pat->bbox[i]; + uub[1][i] = p->data->uub[i]; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + uub[0][i] = Pat->bbox[i] - DH[i]; +#else +#ifdef Cell + uub[0][i] = Pat->bbox[i]; +#else +#error Not define Vertex nor Cell +#endif +#endif + } + else + { + num[i] = 1; + llb[0][i] = p->data->llb[i]; + uub[0][i] = p->data->uub[i]; + } + } + else if (p->data->llb[i] < Pat->bbox[i + dim] - DH[i] / 2) + { + if (p->data->uub[i] > Pat->bbox[i + dim] + DH[i] / 2) + { + num[i] = 2; + llb[0][i] = p->data->llb[i]; + uub[0][i] = Pat->bbox[i + dim]; + uub[1][i] = p->data->uub[i]; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + llb[1][i] = Pat->bbox[i + dim] + DH[i]; +#else +#ifdef Cell + llb[1][i] = Pat->bbox[i + dim]; +#else +#error Not define Vertex nor Cell +#endif +#endif + } + else + { + num[i] = 1; + llb[0][i] = p->data->llb[i]; + uub[0][i] = p->data->uub[i]; + } + } + else + { + num[i] = 1; + llb[0][i] = p->data->llb[i]; + uub[0][i] = p->data->uub[i]; + } + } + MyList *cgsl = 0, *gg; + int NN = 1; + for (int i = 0; i < dim; i++) + NN = NN * num[i]; + + for (int i = 0; i < NN; i++) + { + int ind[dim]; + getarrayindex(dim, num, ind, i); + gg = clone_gsl(p, true); + for (int k = 0; k < dim; k++) + { + gg->data->llb[k] = llb[ind[k]][k]; + gg->data->uub[k] = uub[ind[k]][k]; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + gg->data->shape[k] = int((uub[ind[k]][k] - llb[ind[k]][k]) / DH[k] + 0.4) + 1; +#else +#ifdef Cell + gg->data->shape[k] = int((uub[ind[k]][k] - llb[ind[k]][k]) / DH[k] + 0.4); +#else +#error Not define Vertex nor Cell +#endif +#endif + } + + if (cgsl) + cgsl->catList(gg); + else + cgsl = gg; + } + + return cgsl; +} +// after mod operation, according to overlape to determine real grid segments +void Parallel::build_PhysBD_gstl(Patch *Pat, MyList *srci, MyList *dsti, + MyList **out_src, MyList **out_dst) +{ + *out_src = *out_dst = 0; + + if (!srci || !dsti) + return; + + MyList *s, *d; + MyList *s2, *d2; + + double llb[dim], uub[dim]; + + s = srci; + while (s) + { + Parallel::gridseg *sd = s->data; + d = dsti; + while (d) + { + Parallel::gridseg *dd = d->data; + bool flag = true; + for (int i = 0; i < dim; i++) + { + double SH = sd->Bg->getdX(i), DH = dd->Bg->getdX(i); + if (!feq(SH, DH, SH / 2)) + { + cout << "Parallel::build_PhysBD_gstl meets different grid space SH = " << SH << ", DH = " << DH << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + // we assume dst and src locate on the same Patch + if (dd->llb[i] < Pat->bbox[i]) + llb[i] = Mymax(sd->llb[i], dd->llb[i] + Pat->bbox[dim + i] - Pat->bbox[i]); + else if (dd->llb[i] > Pat->bbox[i + dim]) + llb[i] = Mymax(sd->llb[i], dd->llb[i] - Pat->bbox[dim + i] + Pat->bbox[i]); + else + llb[i] = Mymax(sd->llb[i], dd->llb[i]); + + if (dd->uub[i] < Pat->bbox[i]) + uub[i] = Mymin(sd->uub[i], dd->uub[i] + Pat->bbox[dim + i] - Pat->bbox[i]); + else if (dd->uub[i] > Pat->bbox[dim + i]) + uub[i] = Mymin(sd->uub[i], dd->uub[i] - Pat->bbox[dim + i] + Pat->bbox[i]); + else + uub[i] = Mymin(sd->uub[i], dd->uub[i]); +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + if (llb[i] > uub[i] + SH / 2) + { + flag = false; + break; + } // special for isolated point +#else +#ifdef Cell + if (llb[i] > uub[i]) + { + flag = false; + break; + } +#else +#error Not define Vertex nor Cell +#endif +#endif + } + + if (flag) + { + if (!(*out_src)) + { + *out_src = s2 = new MyList; + *out_dst = d2 = new MyList; + s2->data = new Parallel::gridseg; + d2->data = new Parallel::gridseg; + } + else + { + s2->next = new MyList; + s2 = s2->next; + d2->next = new MyList; + d2 = d2->next; + s2->data = new Parallel::gridseg; + d2->data = new Parallel::gridseg; + } + + for (int i = 0; i < dim; i++) + { + double SH = sd->Bg->getdX(i), DH = dd->Bg->getdX(i); + s2->data->llb[i] = llb[i]; + s2->data->uub[i] = uub[i]; + + if (dd->llb[i] < Pat->bbox[i]) + d2->data->llb[i] = llb[i] - Pat->bbox[dim + i] + Pat->bbox[i]; + else if (dd->llb[i] > Pat->bbox[i + dim]) + d2->data->llb[i] = llb[i] + Pat->bbox[dim + i] - Pat->bbox[i]; + else + d2->data->llb[i] = llb[i]; + + if (dd->uub[i] < Pat->bbox[i]) + d2->data->uub[i] = uub[i] - Pat->bbox[dim + i] + Pat->bbox[i]; + else if (dd->uub[i] > Pat->bbox[dim + i]) + d2->data->uub[i] = uub[i] + Pat->bbox[dim + i] - Pat->bbox[i]; + else + d2->data->uub[i] = uub[i]; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + s2->data->shape[i] = int((s2->data->uub[i] - s2->data->llb[i]) / SH + 0.4) + 1; + d2->data->shape[i] = int((d2->data->uub[i] - d2->data->llb[i]) / DH + 0.4) + 1; +#else +#ifdef Cell + s2->data->shape[i] = int((s2->data->uub[i] - s2->data->llb[i]) / SH + 0.4); + d2->data->shape[i] = int((d2->data->uub[i] - d2->data->llb[i]) / DH + 0.4); +#else +#error Not define Vertex nor Cell +#endif +#endif + } + s2->data->Bg = sd->Bg; + s2->next = 0; + d2->data->Bg = dd->Bg; + d2->next = 0; + } + d = d->next; + } + s = s->next; + } +} +void Parallel::PeriodicBD(Patch *Pat, MyList *VarList, int Symmetry) +{ + int cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); + + MyList *dst; + MyList **src, **transfer_src, **transfer_dst; + src = new MyList *[cpusize]; + transfer_src = new MyList *[cpusize]; + transfer_dst = new MyList *[cpusize]; + + dst = build_PhysBD_gsl(Pat); + for (int node = 0; node < cpusize; node++) + { + src[node] = build_owned_gsl0(Pat, node); // for the part without ghost points and do not extend + build_PhysBD_gstl(Pat, src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node + } + + transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry); + + if (dst) + dst->destroyList(); + for (int node = 0; node < cpusize; node++) + { + if (src[node]) + src[node]->destroyList(); + if (transfer_src[node]) + transfer_src[node]->destroyList(); + if (transfer_dst[node]) + transfer_dst[node]->destroyList(); + } + + delete[] src; + delete[] transfer_src; + delete[] transfer_dst; +} double Parallel::L2Norm(Patch *Pat, var *vf) { int myrank; MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - - double tvf, dtvf = 0; - int BDW = ghost_width; - - MyList *BP = Pat->blb; - while (BP) - { - Block *cg = BP->data; - if (myrank == cg->rank) - { - f_l2normhelper(cg->shape, cg->X[0], cg->X[1], cg->X[2], - Pat->bbox[0], Pat->bbox[1], Pat->bbox[2], - Pat->bbox[3], Pat->bbox[4], Pat->bbox[5], - cg->fgfs[vf->sgfn], tvf, BDW); - dtvf += tvf; - } - if (BP == Pat->ble) - break; - BP = BP->next; - } - - MPI_Allreduce(&dtvf, &tvf, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - - tvf = sqrt(tvf); + + double tvf, dtvf = 0; + int BDW = ghost_width; + + MyList *BP = Pat->blb; + while (BP) + { + Block *cg = BP->data; + if (myrank == cg->rank) + { + f_l2normhelper(cg->shape, cg->X[0], cg->X[1], cg->X[2], + Pat->bbox[0], Pat->bbox[1], Pat->bbox[2], + Pat->bbox[3], Pat->bbox[4], Pat->bbox[5], + cg->fgfs[vf->sgfn], tvf, BDW); + dtvf += tvf; + } + if (BP == Pat->ble) + break; + BP = BP->next; + } + + MPI_Allreduce(&dtvf, &tvf, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + + tvf = sqrt(tvf); return tvf; } @@ -5554,30 +6001,30 @@ double Parallel::L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here) { int myrank; MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - - double tvf, dtvf = 0; - int BDW = ghost_width; - - MyList *BP = Pat->blb; - while (BP) - { - Block *cg = BP->data; - if (myrank == cg->rank) - { - f_l2normhelper(cg->shape, cg->X[0], cg->X[1], cg->X[2], - Pat->bbox[0], Pat->bbox[1], Pat->bbox[2], - Pat->bbox[3], Pat->bbox[4], Pat->bbox[5], - cg->fgfs[vf->sgfn], tvf, BDW); - dtvf += tvf; - } - if (BP == Pat->ble) - break; - BP = BP->next; - } - - MPI_Allreduce(&dtvf, &tvf, 1, MPI_DOUBLE, MPI_SUM, Comm_here); - - tvf = sqrt(tvf); + + double tvf, dtvf = 0; + int BDW = ghost_width; + + MyList *BP = Pat->blb; + while (BP) + { + Block *cg = BP->data; + if (myrank == cg->rank) + { + f_l2normhelper(cg->shape, cg->X[0], cg->X[1], cg->X[2], + Pat->bbox[0], Pat->bbox[1], Pat->bbox[2], + Pat->bbox[3], Pat->bbox[4], Pat->bbox[5], + cg->fgfs[vf->sgfn], tvf, BDW); + dtvf += tvf; + } + if (BP == Pat->ble) + break; + BP = BP->next; + } + + MPI_Allreduce(&dtvf, &tvf, 1, MPI_DOUBLE, MPI_SUM, Comm_here); + + tvf = sqrt(tvf); return tvf; } @@ -5619,538 +6066,660 @@ void Parallel::L2Norm7(Patch *Pat, var **vf, double *norms, MPI_Comm Comm_here) void Parallel::checkgsl(MyList *pp, bool first_only) { int myrank = 0; - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - if (myrank == 0) - { - if (!pp) - cout << " Parallel::checkgsl meets empty gsl" << endl; - while (pp) - { - if (pp->data->Bg) - cout << " on node#" << pp->data->Bg->rank << endl; - else - cout << " virtual grid segment" << endl; - cout << " shape: ("; - for (int i = 0; i < dim; i++) - { - if (i < dim - 1) - cout << pp->data->shape[i] << ","; - else - cout << pp->data->shape[i] << ")" << endl; - } - cout << " range: ("; - for (int i = 0; i < dim; i++) - { - if (i < dim - 1) - cout << pp->data->llb[i] << ":" << pp->data->uub[i] << ","; - else - cout << pp->data->llb[i] << ":" << pp->data->uub[i] << ")" << endl; - } - if (first_only) - return; - pp = pp->next; - } - } -} -void Parallel::checkvarl(MyList *pp, bool first_only) -{ - int myrank = 0; - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - if (myrank == 0) - { - while (pp) - { - cout << "name: " << pp->data->name << endl; - cout << "SoA = (" << pp->data->SoA[0] << "," << pp->data->SoA[1] << "," << pp->data->SoA[2] << ")" << endl; - cout << "sgfn = " << pp->data->sgfn << endl; - if (first_only) - return; - pp = pp->next; - } - } -} -void Parallel::prepare_inter_time_level(MyList *PatL, - MyList *VarList1 /* source (t+dt) */, MyList *VarList2 /* source (t) */, - MyList *VarList3 /* target (t+a*dt) */, int tindex) -{ - while (PatL) - { - prepare_inter_time_level(PatL->data, VarList1, VarList2, VarList3, tindex); - PatL = PatL->next; - } -} -void Parallel::prepare_inter_time_level(Patch *Pat, - MyList *VarList1 /* source (t+dt) */, MyList *VarList2 /* source (t) */, - MyList *VarList3 /* target (t+a*dt) */, int tindex) -{ - int myrank = 0; - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - - MyList *varl1; - MyList *varl2; - MyList *varl3; - - MyList *BP = Pat->blb; - while (BP) - { - Block *cg = BP->data; - if (myrank == cg->rank) - { - varl1 = VarList1; - varl2 = VarList2; - varl3 = VarList3; - while (varl1) - { - if (tindex == 0) - f_average(cg->shape, cg->fgfs[varl1->data->sgfn], cg->fgfs[varl2->data->sgfn], cg->fgfs[varl3->data->sgfn]); - else if (tindex == 1) - f_average3(cg->shape, cg->fgfs[varl1->data->sgfn], cg->fgfs[varl2->data->sgfn], cg->fgfs[varl3->data->sgfn]); - else if (tindex == -1) - // just change data order to use average3 - f_average3(cg->shape, cg->fgfs[varl2->data->sgfn], cg->fgfs[varl1->data->sgfn], cg->fgfs[varl3->data->sgfn]); - else - { - cout << "error tindex in Parallel::prepare_inter_time_level" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - varl1 = varl1->next; - varl2 = varl2->next; - varl3 = varl3->next; - } - } - if (BP == Pat->ble) - break; - BP = BP->next; - } -} -void Parallel::prepare_inter_time_level(MyList *PatL, - MyList *VarList1 /* source (t+dt) */, MyList *VarList2 /* source (t) */, - MyList *VarList3 /* source (t-dt) */, MyList *VarList4 /* target (t+a*dt) */, int tindex) -{ - while (PatL) - { - prepare_inter_time_level(PatL->data, VarList1, VarList2, VarList3, VarList4, tindex); - PatL = PatL->next; - } -} -void Parallel::prepare_inter_time_level(Patch *Pat, - MyList *VarList1 /* source (t+dt) */, MyList *VarList2 /* source (t) */, - MyList *VarList3 /* source (t-dt) */, MyList *VarList4 /* target (t+a*dt) */, int tindex) -{ - int myrank = 0; - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - - MyList *varl1; - MyList *varl2; - MyList *varl3; - MyList *varl4; - - MyList *BP = Pat->blb; - while (BP) - { - Block *cg = BP->data; - if (myrank == cg->rank) - { - varl1 = VarList1; - varl2 = VarList2; - varl3 = VarList3; - varl4 = VarList4; - while (varl1) - { - if (tindex == 0) - f_average2(cg->shape, cg->fgfs[varl1->data->sgfn], cg->fgfs[varl2->data->sgfn], - cg->fgfs[varl3->data->sgfn], cg->fgfs[varl4->data->sgfn]); - else if (tindex == 1) - f_average2p(cg->shape, cg->fgfs[varl1->data->sgfn], cg->fgfs[varl2->data->sgfn], - cg->fgfs[varl3->data->sgfn], cg->fgfs[varl4->data->sgfn]); - else if (tindex == -1) - f_average2m(cg->shape, cg->fgfs[varl1->data->sgfn], cg->fgfs[varl2->data->sgfn], - cg->fgfs[varl3->data->sgfn], cg->fgfs[varl4->data->sgfn]); - else - { - cout << "error tindex in long cgh::prepare_inter_time_level" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - varl1 = varl1->next; - varl2 = varl2->next; - varl3 = varl3->next; - varl4 = varl4->next; - } - } - if (BP == Pat->ble) - break; - BP = BP->next; - } -} -void Parallel::Prolong(Patch *Patc, Patch *Patf, - MyList *VarList1 /* source */, MyList *VarList2 /* target */, - int Symmetry) -{ - if (Patc->lev >= Patf->lev) - { - cout << "Parallel::Prolong: meet requst of Prolong from lev#" << Patc->lev << " to lev#" << Patf->lev << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - int cpusize; - MPI_Comm_size(MPI_COMM_WORLD, &cpusize); - - MyList *dst; - MyList **src, **transfer_src, **transfer_dst; - src = new MyList *[cpusize]; - transfer_src = new MyList *[cpusize]; - transfer_dst = new MyList *[cpusize]; - - dst = build_complete_gsl(Patf); // including ghost - for (int node = 0; node < cpusize; node++) - { - src[node] = build_owned_gsl4(Patc, node, Symmetry); // - buffer - ghost - BD ghost - build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node - } - - transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry); - - if (dst) - dst->destroyList(); - for (int node = 0; node < cpusize; node++) - { - if (src[node]) - src[node]->destroyList(); - if (transfer_src[node]) - transfer_src[node]->destroyList(); - if (transfer_dst[node]) - transfer_dst[node]->destroyList(); - } - - delete[] src; - delete[] transfer_src; - delete[] transfer_dst; -} -void Parallel::Restrict(MyList *PatcL, MyList *PatfL, - MyList *VarList1 /* source */, MyList *VarList2 /* target */, - int Symmetry) -{ - if (PatcL->data->lev >= PatfL->data->lev) - { - cout << "Parallel::Restrict: meet requst of Restrict from lev#" << PatfL->data->lev << " to lev#" << PatcL->data->lev << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - int cpusize; - MPI_Comm_size(MPI_COMM_WORLD, &cpusize); - - MyList *dst; - MyList **src, **transfer_src, **transfer_dst; - src = new MyList *[cpusize]; - transfer_src = new MyList *[cpusize]; - transfer_dst = new MyList *[cpusize]; - - dst = build_complete_gsl(PatcL); // including ghost - for (int node = 0; node < cpusize; node++) - { -#if 0 -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - src[node]=build_owned_gsl(PatfL,node,2,Symmetry); // - buffer - ghost -#else -#ifdef Cell - src[node]=build_owned_gsl(PatfL,node,4,Symmetry); // - buffer - ghost - BD ghost -#else -#error Not define Vertex nor Cell -#endif -#endif -#else - // it seems bam always use this - src[node] = build_owned_gsl(PatfL, node, 2, Symmetry); // - buffer - ghost -#endif - build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node - } - - transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry); - - if (dst) - dst->destroyList(); - for (int node = 0; node < cpusize; node++) - { - if (src[node]) - src[node]->destroyList(); - if (transfer_src[node]) - transfer_src[node]->destroyList(); - if (transfer_dst[node]) - transfer_dst[node]->destroyList(); - } - - delete[] src; - delete[] transfer_src; - delete[] transfer_dst; -} -void Parallel::Restrict_after(MyList *PatcL, MyList *PatfL, - MyList *VarList1 /* source */, MyList *VarList2 /* target */, - int Symmetry) -{ - if (PatcL->data->lev >= PatfL->data->lev) - { - cout << "Parallel::Restrict: meet requst of Restrict from lev#" << PatfL->data->lev << " to lev#" << PatcL->data->lev << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - int cpusize; - MPI_Comm_size(MPI_COMM_WORLD, &cpusize); - - MyList *dst; - MyList **src, **transfer_src, **transfer_dst; - src = new MyList *[cpusize]; - transfer_src = new MyList *[cpusize]; - transfer_dst = new MyList *[cpusize]; - - dst = build_complete_gsl(PatcL); // including ghost - for (int node = 0; node < cpusize; node++) - { - src[node] = build_owned_gsl(PatfL, node, 3, Symmetry); // - ghost - BD ghost - - build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node - } - - transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry); - - if (dst) - dst->destroyList(); - for (int node = 0; node < cpusize; node++) - { - if (src[node]) - src[node]->destroyList(); - if (transfer_src[node]) - transfer_src[node]->destroyList(); - if (transfer_dst[node]) - transfer_dst[node]->destroyList(); - } - - delete[] src; - delete[] transfer_src; - delete[] transfer_dst; -} -// for the same time level -void Parallel::OutBdLow2Hi(Patch *Patc, Patch *Patf, - MyList *VarList1 /* source */, MyList *VarList2 /* target */, - int Symmetry) -{ - if (Patc->lev >= Patf->lev) - { - cout << "Parallel::OutBdLow2Hi: meet requst of Prolong from lev#" << Patc->lev << " to lev#" << Patf->lev << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - int cpusize; - MPI_Comm_size(MPI_COMM_WORLD, &cpusize); - - MyList *dst; - MyList **src, **transfer_src, **transfer_dst; - src = new MyList *[cpusize]; - transfer_src = new MyList *[cpusize]; - transfer_dst = new MyList *[cpusize]; - - dst = build_buffer_gsl(Patf); // buffer region only - - for (int node = 0; node < cpusize; node++) - { - src[node] = build_owned_gsl4(Patc, node, Symmetry); // - buffer - ghost - BD ghost - build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node - } - - transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry); - - if (dst) - dst->destroyList(); - for (int node = 0; node < cpusize; node++) - { - if (src[node]) - src[node]->destroyList(); - if (transfer_src[node]) - transfer_src[node]->destroyList(); - if (transfer_dst[node]) - transfer_dst[node]->destroyList(); - } - - delete[] src; - delete[] transfer_src; - delete[] transfer_dst; -} -void Parallel::OutBdLow2Hi(MyList *PatcL, MyList *PatfL, - MyList *VarList1 /* source */, MyList *VarList2 /* target */, - int Symmetry) -{ - MyList *Pp, *Ppc; - Ppc = PatcL; - while (Ppc) - { - Pp = PatfL; - while (Pp) - { - if (Ppc->data->lev >= Pp->data->lev) - { - cout << "Parallel::OutBdLow2Hi(list): meet requst of Prolong from lev#" << Ppc->data->lev << " to lev#" << Pp->data->lev << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - Pp = Pp->next; - } - Ppc = Ppc->next; - } - - int cpusize; - MPI_Comm_size(MPI_COMM_WORLD, &cpusize); - - MyList *dst; - MyList **src, **transfer_src, **transfer_dst; - src = new MyList *[cpusize]; - transfer_src = new MyList *[cpusize]; - transfer_dst = new MyList *[cpusize]; - - dst = build_buffer_gsl(PatfL); // buffer region only - - for (int node = 0; node < cpusize; node++) - { - src[node] = build_owned_gsl(PatcL, node, 4, Symmetry); // - buffer - ghost - BD ghost - build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node - } - - transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry); - - if (dst) - dst->destroyList(); - for (int node = 0; node < cpusize; node++) - { - if (src[node]) - src[node]->destroyList(); - if (transfer_src[node]) - transfer_src[node]->destroyList(); - if (transfer_dst[node]) - transfer_dst[node]->destroyList(); - } - - delete[] src; - delete[] transfer_src; - delete[] transfer_dst; -} -// for the same time level -void Parallel::OutBdLow2Himix(Patch *Patc, Patch *Patf, - MyList *VarList1 /* source */, MyList *VarList2 /* target */, - int Symmetry) -{ - if (Patc->lev >= Patf->lev) - { - cout << "Parallel::OutBdLow2Himix: meet requst of Prolong from lev#" << Patc->lev << " to lev#" << Patf->lev << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - int cpusize; - MPI_Comm_size(MPI_COMM_WORLD, &cpusize); - - MyList *dst; - MyList **src, **transfer_src, **transfer_dst; - src = new MyList *[cpusize]; - transfer_src = new MyList *[cpusize]; - transfer_dst = new MyList *[cpusize]; - - dst = build_buffer_gsl(Patf); // buffer region only - - for (int node = 0; node < cpusize; node++) - { - src[node] = build_owned_gsl4(Patc, node, Symmetry); // - buffer - ghost - BD ghost - build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node - } - - transfermix(transfer_src, transfer_dst, VarList1, VarList2, Symmetry); - - if (dst) - dst->destroyList(); - for (int node = 0; node < cpusize; node++) - { - if (src[node]) - src[node]->destroyList(); - if (transfer_src[node]) - transfer_src[node]->destroyList(); - if (transfer_dst[node]) - transfer_dst[node]->destroyList(); - } - - delete[] src; - delete[] transfer_src; - delete[] transfer_dst; - - // do not need this, we have done after calling of this routine in ProlongRestrict or RestrictProlong - // Sync(Patf,VarList2,Symmetry); // fine level points may be not enough for interpolation -} -void Parallel::OutBdLow2Himix(MyList *PatcL, MyList *PatfL, - MyList *VarList1 /* source */, MyList *VarList2 /* target */, - int Symmetry) -{ - MyList *Pp, *Ppc; - Ppc = PatcL; - while (Ppc) - { - Pp = PatfL; - while (Pp) - { - if (Ppc->data->lev >= Pp->data->lev) - { - cout << "Parallel::OutBdLow2Himix(list): meet requst of Prolong from lev#" << Ppc->data->lev << " to lev#" << Pp->data->lev << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - Pp = Pp->next; - } - Ppc = Ppc->next; - } - - int cpusize; - MPI_Comm_size(MPI_COMM_WORLD, &cpusize); - - MyList *dst; - MyList **src, **transfer_src, **transfer_dst; - src = new MyList *[cpusize]; - transfer_src = new MyList *[cpusize]; - transfer_dst = new MyList *[cpusize]; - - dst = build_buffer_gsl(PatfL); // buffer region only - - for (int node = 0; node < cpusize; node++) - { - src[node] = build_owned_gsl(PatcL, node, 4, Symmetry); // - buffer - ghost - BD ghost - build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node - } - - transfermix(transfer_src, transfer_dst, VarList1, VarList2, Symmetry); - - if (dst) - dst->destroyList(); - for (int node = 0; node < cpusize; node++) - { - if (src[node]) - src[node]->destroyList(); - if (transfer_src[node]) - transfer_src[node]->destroyList(); - if (transfer_dst[node]) - transfer_dst[node]->destroyList(); - } - - delete[] src; - delete[] transfer_src; - delete[] transfer_dst; -} - -// Restrict_cached: cache grid segment lists, reuse buffers via transfer_cached -void Parallel::Restrict_cached(MyList *PatcL, MyList *PatfL, - MyList *VarList1, MyList *VarList2, - int Symmetry, SyncCache &cache) -{ - if (!cache.valid) - { - int cpusize; - MPI_Comm_size(MPI_COMM_WORLD, &cpusize); - cache.cpusize = cpusize; - - if (!cache.combined_src) - { - cache.combined_src = new MyList *[cpusize]; - cache.combined_dst = new MyList *[cpusize]; - cache.send_lengths = new int[cpusize]; - cache.recv_lengths = new int[cpusize]; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + if (myrank == 0) + { + if (!pp) + cout << " Parallel::checkgsl meets empty gsl" << endl; + while (pp) + { + if (pp->data->Bg) + cout << " on node#" << pp->data->Bg->rank << endl; + else + cout << " virtual grid segment" << endl; + cout << " shape: ("; + for (int i = 0; i < dim; i++) + { + if (i < dim - 1) + cout << pp->data->shape[i] << ","; + else + cout << pp->data->shape[i] << ")" << endl; + } + cout << " range: ("; + for (int i = 0; i < dim; i++) + { + if (i < dim - 1) + cout << pp->data->llb[i] << ":" << pp->data->uub[i] << ","; + else + cout << pp->data->llb[i] << ":" << pp->data->uub[i] << ")" << endl; + } + if (first_only) + return; + pp = pp->next; + } + } +} +void Parallel::checkvarl(MyList *pp, bool first_only) +{ + int myrank = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + if (myrank == 0) + { + while (pp) + { + cout << "name: " << pp->data->name << endl; + cout << "SoA = (" << pp->data->SoA[0] << "," << pp->data->SoA[1] << "," << pp->data->SoA[2] << ")" << endl; + cout << "sgfn = " << pp->data->sgfn << endl; + if (first_only) + return; + pp = pp->next; + } + } +} +void Parallel::prepare_inter_time_level(MyList *PatL, + MyList *VarList1 /* source (t+dt) */, MyList *VarList2 /* source (t) */, + MyList *VarList3 /* target (t+a*dt) */, int tindex) +{ + while (PatL) + { + prepare_inter_time_level(PatL->data, VarList1, VarList2, VarList3, tindex); + PatL = PatL->next; + } +} +void Parallel::prepare_inter_time_level(Patch *Pat, + MyList *VarList1 /* source (t+dt) */, MyList *VarList2 /* source (t) */, + MyList *VarList3 /* target (t+a*dt) */, int tindex) +{ + int myrank = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + + MyList *varl1; + MyList *varl2; + MyList *varl3; + + MyList *BP = Pat->blb; + while (BP) + { + Block *cg = BP->data; + if (myrank == cg->rank) + { + varl1 = VarList1; + varl2 = VarList2; + varl3 = VarList3; + while (varl1) + { + if (tindex == 0) + f_average(cg->shape, cg->fgfs[varl1->data->sgfn], cg->fgfs[varl2->data->sgfn], cg->fgfs[varl3->data->sgfn]); + else if (tindex == 1) + f_average3(cg->shape, cg->fgfs[varl1->data->sgfn], cg->fgfs[varl2->data->sgfn], cg->fgfs[varl3->data->sgfn]); + else if (tindex == -1) + // just change data order to use average3 + f_average3(cg->shape, cg->fgfs[varl2->data->sgfn], cg->fgfs[varl1->data->sgfn], cg->fgfs[varl3->data->sgfn]); + else + { + cout << "error tindex in Parallel::prepare_inter_time_level" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + varl1 = varl1->next; + varl2 = varl2->next; + varl3 = varl3->next; + } + } + if (BP == Pat->ble) + break; + BP = BP->next; + } +} +void Parallel::prepare_inter_time_level(MyList *PatL, + MyList *VarList1 /* source (t+dt) */, MyList *VarList2 /* source (t) */, + MyList *VarList3 /* source (t-dt) */, MyList *VarList4 /* target (t+a*dt) */, int tindex) +{ + while (PatL) + { + prepare_inter_time_level(PatL->data, VarList1, VarList2, VarList3, VarList4, tindex); + PatL = PatL->next; + } +} +void Parallel::prepare_inter_time_level(Patch *Pat, + MyList *VarList1 /* source (t+dt) */, MyList *VarList2 /* source (t) */, + MyList *VarList3 /* source (t-dt) */, MyList *VarList4 /* target (t+a*dt) */, int tindex) +{ + int myrank = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + + MyList *varl1; + MyList *varl2; + MyList *varl3; + MyList *varl4; + + MyList *BP = Pat->blb; + while (BP) + { + Block *cg = BP->data; + if (myrank == cg->rank) + { + varl1 = VarList1; + varl2 = VarList2; + varl3 = VarList3; + varl4 = VarList4; + while (varl1) + { + if (tindex == 0) + f_average2(cg->shape, cg->fgfs[varl1->data->sgfn], cg->fgfs[varl2->data->sgfn], + cg->fgfs[varl3->data->sgfn], cg->fgfs[varl4->data->sgfn]); + else if (tindex == 1) + f_average2p(cg->shape, cg->fgfs[varl1->data->sgfn], cg->fgfs[varl2->data->sgfn], + cg->fgfs[varl3->data->sgfn], cg->fgfs[varl4->data->sgfn]); + else if (tindex == -1) + f_average2m(cg->shape, cg->fgfs[varl1->data->sgfn], cg->fgfs[varl2->data->sgfn], + cg->fgfs[varl3->data->sgfn], cg->fgfs[varl4->data->sgfn]); + else + { + cout << "error tindex in long cgh::prepare_inter_time_level" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + varl1 = varl1->next; + varl2 = varl2->next; + varl3 = varl3->next; + varl4 = varl4->next; + } + } + if (BP == Pat->ble) + break; + BP = BP->next; + } +} +void Parallel::Prolong(Patch *Patc, Patch *Patf, + MyList *VarList1 /* source */, MyList *VarList2 /* target */, + int Symmetry) +{ + if (Patc->lev >= Patf->lev) + { + cout << "Parallel::Prolong: meet requst of Prolong from lev#" << Patc->lev << " to lev#" << Patf->lev << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + int cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); + + MyList *dst; + MyList **src, **transfer_src, **transfer_dst; + src = new MyList *[cpusize]; + transfer_src = new MyList *[cpusize]; + transfer_dst = new MyList *[cpusize]; + + dst = build_complete_gsl(Patf); // including ghost + for (int node = 0; node < cpusize; node++) + { + src[node] = build_owned_gsl4(Patc, node, Symmetry); // - buffer - ghost - BD ghost + build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node + } + + transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry); + + if (dst) + dst->destroyList(); + for (int node = 0; node < cpusize; node++) + { + if (src[node]) + src[node]->destroyList(); + if (transfer_src[node]) + transfer_src[node]->destroyList(); + if (transfer_dst[node]) + transfer_dst[node]->destroyList(); + } + + delete[] src; + delete[] transfer_src; + delete[] transfer_dst; +} +void Parallel::Restrict(MyList *PatcL, MyList *PatfL, + MyList *VarList1 /* source */, MyList *VarList2 /* target */, + int Symmetry) +{ + if (PatcL->data->lev >= PatfL->data->lev) + { + cout << "Parallel::Restrict: meet requst of Restrict from lev#" << PatfL->data->lev << " to lev#" << PatcL->data->lev << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + int cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); + + MyList *dst; + MyList **src, **transfer_src, **transfer_dst; + src = new MyList *[cpusize]; + transfer_src = new MyList *[cpusize]; + transfer_dst = new MyList *[cpusize]; + + dst = build_complete_gsl(PatcL); // including ghost + for (int node = 0; node < cpusize; node++) + { +#if 0 +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + src[node]=build_owned_gsl(PatfL,node,2,Symmetry); // - buffer - ghost +#else +#ifdef Cell + src[node]=build_owned_gsl(PatfL,node,4,Symmetry); // - buffer - ghost - BD ghost +#else +#error Not define Vertex nor Cell +#endif +#endif +#else + // it seems bam always use this + src[node] = build_owned_gsl(PatfL, node, 2, Symmetry); // - buffer - ghost +#endif + build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node + } + + transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry); + + if (dst) + dst->destroyList(); + for (int node = 0; node < cpusize; node++) + { + if (src[node]) + src[node]->destroyList(); + if (transfer_src[node]) + transfer_src[node]->destroyList(); + if (transfer_dst[node]) + transfer_dst[node]->destroyList(); + } + + delete[] src; + delete[] transfer_src; + delete[] transfer_dst; +} +void Parallel::Restrict_after(MyList *PatcL, MyList *PatfL, + MyList *VarList1 /* source */, MyList *VarList2 /* target */, + int Symmetry) +{ + if (PatcL->data->lev >= PatfL->data->lev) + { + cout << "Parallel::Restrict: meet requst of Restrict from lev#" << PatfL->data->lev << " to lev#" << PatcL->data->lev << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + int cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); + + MyList *dst; + MyList **src, **transfer_src, **transfer_dst; + src = new MyList *[cpusize]; + transfer_src = new MyList *[cpusize]; + transfer_dst = new MyList *[cpusize]; + + dst = build_complete_gsl(PatcL); // including ghost + for (int node = 0; node < cpusize; node++) + { + src[node] = build_owned_gsl(PatfL, node, 3, Symmetry); // - ghost - BD ghost + + build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node + } + + transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry); + + if (dst) + dst->destroyList(); + for (int node = 0; node < cpusize; node++) + { + if (src[node]) + src[node]->destroyList(); + if (transfer_src[node]) + transfer_src[node]->destroyList(); + if (transfer_dst[node]) + transfer_dst[node]->destroyList(); + } + + delete[] src; + delete[] transfer_src; + delete[] transfer_dst; +} +// for the same time level +void Parallel::OutBdLow2Hi(Patch *Patc, Patch *Patf, + MyList *VarList1 /* source */, MyList *VarList2 /* target */, + int Symmetry) +{ + if (Patc->lev >= Patf->lev) + { + cout << "Parallel::OutBdLow2Hi: meet requst of Prolong from lev#" << Patc->lev << " to lev#" << Patf->lev << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + int cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); + + MyList *dst; + MyList **src, **transfer_src, **transfer_dst; + src = new MyList *[cpusize]; + transfer_src = new MyList *[cpusize]; + transfer_dst = new MyList *[cpusize]; + + dst = build_buffer_gsl(Patf); // buffer region only + + for (int node = 0; node < cpusize; node++) + { + src[node] = build_owned_gsl4(Patc, node, Symmetry); // - buffer - ghost - BD ghost + build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node + } + + transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry); + + if (dst) + dst->destroyList(); + for (int node = 0; node < cpusize; node++) + { + if (src[node]) + src[node]->destroyList(); + if (transfer_src[node]) + transfer_src[node]->destroyList(); + if (transfer_dst[node]) + transfer_dst[node]->destroyList(); + } + + delete[] src; + delete[] transfer_src; + delete[] transfer_dst; +} +void Parallel::OutBdLow2Hi(MyList *PatcL, MyList *PatfL, + MyList *VarList1 /* source */, MyList *VarList2 /* target */, + int Symmetry) +{ + MyList *Pp, *Ppc; + Ppc = PatcL; + while (Ppc) + { + Pp = PatfL; + while (Pp) + { + if (Ppc->data->lev >= Pp->data->lev) + { + cout << "Parallel::OutBdLow2Hi(list): meet requst of Prolong from lev#" << Ppc->data->lev << " to lev#" << Pp->data->lev << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + Pp = Pp->next; + } + Ppc = Ppc->next; + } + + int cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); + + MyList *dst; + MyList **src, **transfer_src, **transfer_dst; + src = new MyList *[cpusize]; + transfer_src = new MyList *[cpusize]; + transfer_dst = new MyList *[cpusize]; + + dst = build_buffer_gsl(PatfL); // buffer region only + + for (int node = 0; node < cpusize; node++) + { + src[node] = build_owned_gsl(PatcL, node, 4, Symmetry); // - buffer - ghost - BD ghost + build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node + } + + transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry); + + if (dst) + dst->destroyList(); + for (int node = 0; node < cpusize; node++) + { + if (src[node]) + src[node]->destroyList(); + if (transfer_src[node]) + transfer_src[node]->destroyList(); + if (transfer_dst[node]) + transfer_dst[node]->destroyList(); + } + + delete[] src; + delete[] transfer_src; + delete[] transfer_dst; +} +// for the same time level +void Parallel::OutBdLow2Himix(Patch *Patc, Patch *Patf, + MyList *VarList1 /* source */, MyList *VarList2 /* target */, + int Symmetry) +{ + if (Patc->lev >= Patf->lev) + { + cout << "Parallel::OutBdLow2Himix: meet requst of Prolong from lev#" << Patc->lev << " to lev#" << Patf->lev << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + int cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); + + MyList *dst; + MyList **src, **transfer_src, **transfer_dst; + src = new MyList *[cpusize]; + transfer_src = new MyList *[cpusize]; + transfer_dst = new MyList *[cpusize]; + + dst = build_buffer_gsl(Patf); // buffer region only + + for (int node = 0; node < cpusize; node++) + { + src[node] = build_owned_gsl4(Patc, node, Symmetry); // - buffer - ghost - BD ghost + build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node + } + + transfermix(transfer_src, transfer_dst, VarList1, VarList2, Symmetry); + + if (dst) + dst->destroyList(); + for (int node = 0; node < cpusize; node++) + { + if (src[node]) + src[node]->destroyList(); + if (transfer_src[node]) + transfer_src[node]->destroyList(); + if (transfer_dst[node]) + transfer_dst[node]->destroyList(); + } + + delete[] src; + delete[] transfer_src; + delete[] transfer_dst; + + // do not need this, we have done after calling of this routine in ProlongRestrict or RestrictProlong + // Sync(Patf,VarList2,Symmetry); // fine level points may be not enough for interpolation +} +void Parallel::OutBdLow2Himix(MyList *PatcL, MyList *PatfL, + MyList *VarList1 /* source */, MyList *VarList2 /* target */, + int Symmetry) +{ + MyList *Pp, *Ppc; + Ppc = PatcL; + while (Ppc) + { + Pp = PatfL; + while (Pp) + { + if (Ppc->data->lev >= Pp->data->lev) + { + cout << "Parallel::OutBdLow2Himix(list): meet requst of Prolong from lev#" << Ppc->data->lev << " to lev#" << Pp->data->lev << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + Pp = Pp->next; + } + Ppc = Ppc->next; + } + + int cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); + + MyList *dst; + MyList **src, **transfer_src, **transfer_dst; + src = new MyList *[cpusize]; + transfer_src = new MyList *[cpusize]; + transfer_dst = new MyList *[cpusize]; + + dst = build_buffer_gsl(PatfL); // buffer region only + + for (int node = 0; node < cpusize; node++) + { + src[node] = build_owned_gsl(PatcL, node, 4, Symmetry); // - buffer - ghost - BD ghost + build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node + } + + transfermix(transfer_src, transfer_dst, VarList1, VarList2, Symmetry); + + if (dst) + dst->destroyList(); + for (int node = 0; node < cpusize; node++) + { + if (src[node]) + src[node]->destroyList(); + if (transfer_src[node]) + transfer_src[node]->destroyList(); + if (transfer_dst[node]) + transfer_dst[node]->destroyList(); + } + + delete[] src; + delete[] transfer_src; + delete[] transfer_dst; +} + +// Restrict_cached: cache grid segment lists, reuse buffers via transfer_cached +void Parallel::Restrict_cached(MyList *PatcL, MyList *PatfL, + MyList *VarList1, MyList *VarList2, + int Symmetry, SyncCache &cache) +{ + if (!cache.valid) + { + int cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); + cache.cpusize = cpusize; + + if (!cache.combined_src) + { + cache.combined_src = new MyList *[cpusize]; + cache.combined_dst = new MyList *[cpusize]; + cache.send_lengths = new int[cpusize]; + cache.recv_lengths = new int[cpusize]; + cache.send_bufs = new double *[cpusize]; + cache.recv_bufs = new double *[cpusize]; + cache.send_buf_caps = new int[cpusize]; + cache.recv_buf_caps = new int[cpusize]; + cache.send_buf_pinned = new unsigned char[cpusize]; + cache.recv_buf_pinned = new unsigned char[cpusize]; + cache.send_buf_is_dev = new unsigned char[cpusize]; + cache.recv_buf_is_dev = new unsigned char[cpusize]; + cache.send_buf_caps_dev = new int[cpusize]; + cache.recv_buf_caps_dev = new int[cpusize]; + cache.send_bufs_dev = new double *[cpusize]; + cache.recv_bufs_dev = new double *[cpusize]; + for (int i = 0; i < cpusize; i++) + { + cache.send_bufs[i] = cache.recv_bufs[i] = 0; + cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0; + cache.send_buf_pinned[i] = cache.recv_buf_pinned[i] = 0; + cache.send_buf_is_dev[i] = cache.recv_buf_is_dev[i] = 0; + cache.send_buf_caps_dev[i] = cache.recv_buf_caps_dev[i] = 0; + cache.send_bufs_dev[i] = cache.recv_bufs_dev[i] = 0; + } + cache.max_reqs = 2 * cpusize; + cache.reqs = new MPI_Request[cache.max_reqs]; + cache.stats = new MPI_Status[cache.max_reqs]; + cache.tc_req_node = new int[cache.max_reqs]; + cache.tc_req_is_recv = new int[cache.max_reqs]; + cache.tc_completed = new int[cache.max_reqs]; + } + + MyList *dst = build_complete_gsl(PatcL); + for (int node = 0; node < cpusize; node++) + { + MyList *src_owned = build_owned_gsl(PatfL, node, 2, Symmetry); + build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]); + if (src_owned) src_owned->destroyList(); + } + if (dst) dst->destroyList(); + + cache.valid = true; + } + + transfer_cached(cache.combined_src, cache.combined_dst, VarList1, VarList2, Symmetry, cache); +} + +// OutBdLow2Hi_cached: cache grid segment lists, reuse buffers via transfer_cached +void Parallel::OutBdLow2Hi_cached(MyList *PatcL, MyList *PatfL, + MyList *VarList1, MyList *VarList2, + int Symmetry, SyncCache &cache) +{ + if (!cache.valid) + { + int cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); + cache.cpusize = cpusize; + + if (!cache.combined_src) + { + cache.combined_src = new MyList *[cpusize]; + cache.combined_dst = new MyList *[cpusize]; + cache.send_lengths = new int[cpusize]; + cache.recv_lengths = new int[cpusize]; + cache.send_bufs = new double *[cpusize]; + cache.recv_bufs = new double *[cpusize]; + cache.send_buf_caps = new int[cpusize]; + cache.recv_buf_caps = new int[cpusize]; + cache.send_buf_pinned = new unsigned char[cpusize]; + cache.recv_buf_pinned = new unsigned char[cpusize]; + cache.send_buf_is_dev = new unsigned char[cpusize]; + cache.recv_buf_is_dev = new unsigned char[cpusize]; + cache.send_buf_caps_dev = new int[cpusize]; + cache.recv_buf_caps_dev = new int[cpusize]; + cache.send_bufs_dev = new double *[cpusize]; + cache.recv_bufs_dev = new double *[cpusize]; + for (int i = 0; i < cpusize; i++) + { + cache.send_bufs[i] = cache.recv_bufs[i] = 0; + cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0; + cache.send_buf_pinned[i] = cache.recv_buf_pinned[i] = 0; + cache.send_buf_is_dev[i] = cache.recv_buf_is_dev[i] = 0; + cache.send_buf_caps_dev[i] = cache.recv_buf_caps_dev[i] = 0; + cache.send_bufs_dev[i] = cache.recv_bufs_dev[i] = 0; + } + cache.max_reqs = 2 * cpusize; + cache.reqs = new MPI_Request[cache.max_reqs]; + cache.stats = new MPI_Status[cache.max_reqs]; + cache.tc_req_node = new int[cache.max_reqs]; + cache.tc_req_is_recv = new int[cache.max_reqs]; + cache.tc_completed = new int[cache.max_reqs]; + } + + MyList *dst = build_buffer_gsl(PatfL); + for (int node = 0; node < cpusize; node++) + { + MyList *src_owned = build_owned_gsl(PatcL, node, 4, Symmetry); + build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]); + if (src_owned) src_owned->destroyList(); + } + if (dst) dst->destroyList(); + + cache.valid = true; + } + + transfer_cached(cache.combined_src, cache.combined_dst, VarList1, VarList2, Symmetry, cache); +} + +// OutBdLow2Himix_cached: same as OutBdLow2Hi_cached but uses transfermix for unpacking +void Parallel::OutBdLow2Himix_cached(MyList *PatcL, MyList *PatfL, + MyList *VarList1, MyList *VarList2, + int Symmetry, SyncCache &cache) +{ + if (!cache.valid) + { + int cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); + cache.cpusize = cpusize; + + if (!cache.combined_src) + { + cache.combined_src = new MyList *[cpusize]; + cache.combined_dst = new MyList *[cpusize]; + cache.send_lengths = new int[cpusize]; + cache.recv_lengths = new int[cpusize]; cache.send_bufs = new double *[cpusize]; cache.recv_bufs = new double *[cpusize]; cache.send_buf_caps = new int[cpusize]; @@ -6163,1212 +6732,1108 @@ void Parallel::Restrict_cached(MyList *PatcL, MyList *PatfL, cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0; cache.send_buf_pinned[i] = cache.recv_buf_pinned[i] = 0; } - cache.max_reqs = 2 * cpusize; - cache.reqs = new MPI_Request[cache.max_reqs]; - cache.stats = new MPI_Status[cache.max_reqs]; - cache.tc_req_node = new int[cache.max_reqs]; - cache.tc_req_is_recv = new int[cache.max_reqs]; - cache.tc_completed = new int[cache.max_reqs]; - } - - MyList *dst = build_complete_gsl(PatcL); - for (int node = 0; node < cpusize; node++) - { - MyList *src_owned = build_owned_gsl(PatfL, node, 2, Symmetry); - build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]); - if (src_owned) src_owned->destroyList(); - } - if (dst) dst->destroyList(); - - cache.valid = true; - } - - transfer_cached(cache.combined_src, cache.combined_dst, VarList1, VarList2, Symmetry, cache); -} - -// OutBdLow2Hi_cached: cache grid segment lists, reuse buffers via transfer_cached -void Parallel::OutBdLow2Hi_cached(MyList *PatcL, MyList *PatfL, - MyList *VarList1, MyList *VarList2, - int Symmetry, SyncCache &cache) -{ - if (!cache.valid) - { - int cpusize; - MPI_Comm_size(MPI_COMM_WORLD, &cpusize); - cache.cpusize = cpusize; - - if (!cache.combined_src) - { - cache.combined_src = new MyList *[cpusize]; - cache.combined_dst = new MyList *[cpusize]; - cache.send_lengths = new int[cpusize]; - cache.recv_lengths = new int[cpusize]; - cache.send_bufs = new double *[cpusize]; - cache.recv_bufs = new double *[cpusize]; - cache.send_buf_caps = new int[cpusize]; - cache.recv_buf_caps = new int[cpusize]; - cache.send_buf_pinned = new unsigned char[cpusize]; - cache.recv_buf_pinned = new unsigned char[cpusize]; - for (int i = 0; i < cpusize; i++) - { - cache.send_bufs[i] = cache.recv_bufs[i] = 0; - cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0; - cache.send_buf_pinned[i] = cache.recv_buf_pinned[i] = 0; - } - cache.max_reqs = 2 * cpusize; - cache.reqs = new MPI_Request[cache.max_reqs]; - cache.stats = new MPI_Status[cache.max_reqs]; - cache.tc_req_node = new int[cache.max_reqs]; - cache.tc_req_is_recv = new int[cache.max_reqs]; - cache.tc_completed = new int[cache.max_reqs]; - } - - MyList *dst = build_buffer_gsl(PatfL); - for (int node = 0; node < cpusize; node++) - { - MyList *src_owned = build_owned_gsl(PatcL, node, 4, Symmetry); - build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]); - if (src_owned) src_owned->destroyList(); - } - if (dst) dst->destroyList(); - - cache.valid = true; - } - - transfer_cached(cache.combined_src, cache.combined_dst, VarList1, VarList2, Symmetry, cache); -} - -// OutBdLow2Himix_cached: same as OutBdLow2Hi_cached but uses transfermix for unpacking -void Parallel::OutBdLow2Himix_cached(MyList *PatcL, MyList *PatfL, - MyList *VarList1, MyList *VarList2, - int Symmetry, SyncCache &cache) -{ - if (!cache.valid) - { - int cpusize; - MPI_Comm_size(MPI_COMM_WORLD, &cpusize); - cache.cpusize = cpusize; - - if (!cache.combined_src) - { - cache.combined_src = new MyList *[cpusize]; - cache.combined_dst = new MyList *[cpusize]; - cache.send_lengths = new int[cpusize]; - cache.recv_lengths = new int[cpusize]; - cache.send_bufs = new double *[cpusize]; - cache.recv_bufs = new double *[cpusize]; - cache.send_buf_caps = new int[cpusize]; - cache.recv_buf_caps = new int[cpusize]; - cache.send_buf_pinned = new unsigned char[cpusize]; - cache.recv_buf_pinned = new unsigned char[cpusize]; - for (int i = 0; i < cpusize; i++) - { - cache.send_bufs[i] = cache.recv_bufs[i] = 0; - cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0; - cache.send_buf_pinned[i] = cache.recv_buf_pinned[i] = 0; - } - cache.max_reqs = 2 * cpusize; - cache.reqs = new MPI_Request[cache.max_reqs]; - cache.stats = new MPI_Status[cache.max_reqs]; - cache.tc_req_node = new int[cache.max_reqs]; - cache.tc_req_is_recv = new int[cache.max_reqs]; - cache.tc_completed = new int[cache.max_reqs]; - } - - MyList *dst = build_buffer_gsl(PatfL); - for (int node = 0; node < cpusize; node++) - { - MyList *src_owned = build_owned_gsl(PatcL, node, 4, Symmetry); - build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]); - if (src_owned) src_owned->destroyList(); - } - if (dst) dst->destroyList(); - - cache.valid = true; - } - - // Use transfermix instead of transfer for mix-mode interpolation - int myrank; - MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize); - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - int cpusize = cache.cpusize; - - int req_no = 0; - int pending_recv = 0; - int *req_node = new int[cache.max_reqs]; - int *req_is_recv = new int[cache.max_reqs]; - int *completed = new int[cache.max_reqs]; - - // Post receives first so peers can progress rendezvous early. - for (int node = 0; node < cpusize; node++) - { - if (node == myrank) continue; - - int rlength = data_packermix(0, cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry); + cache.max_reqs = 2 * cpusize; + cache.reqs = new MPI_Request[cache.max_reqs]; + cache.stats = new MPI_Status[cache.max_reqs]; + cache.tc_req_node = new int[cache.max_reqs]; + cache.tc_req_is_recv = new int[cache.max_reqs]; + cache.tc_completed = new int[cache.max_reqs]; + } + + MyList *dst = build_buffer_gsl(PatfL); + for (int node = 0; node < cpusize; node++) + { + MyList *src_owned = build_owned_gsl(PatcL, node, 4, Symmetry); + build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]); + if (src_owned) src_owned->destroyList(); + } + if (dst) dst->destroyList(); + + cache.valid = true; + } + + // Use transfermix instead of transfer for mix-mode interpolation + int myrank; + MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize); + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + int cpusize = cache.cpusize; + + int req_no = 0; + int pending_recv = 0; + int *req_node = new int[cache.max_reqs]; + int *req_is_recv = new int[cache.max_reqs]; + int *completed = new int[cache.max_reqs]; + + // Post receives first so peers can progress rendezvous early. + for (int node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + int rlength = data_packermix(0, cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry); cache.recv_lengths[node] = rlength; if (rlength > 0) { ensure_comm_buffer(cache.recv_bufs, cache.recv_buf_pinned, cache.recv_buf_caps, node, rlength); MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); - req_node[req_no] = node; - req_is_recv[req_no] = 1; - req_no++; - pending_recv++; - } - } - - // Local transfer on this rank. - int self_len = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + req_node[req_no] = node; + req_is_recv[req_no] = 1; + req_no++; + pending_recv++; + } + } + + // Local transfer on this rank. + int self_len = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); cache.recv_lengths[myrank] = self_len; if (self_len > 0) { ensure_comm_buffer(cache.recv_bufs, cache.recv_buf_pinned, cache.recv_buf_caps, myrank, self_len); data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); } - - // Pack and post sends. - for (int node = 0; node < cpusize; node++) - { - if (node == myrank) continue; - - int slength = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + + // Pack and post sends. + for (int node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + int slength = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry); cache.send_lengths[node] = slength; if (slength > 0) { ensure_comm_buffer(cache.send_bufs, cache.send_buf_pinned, cache.send_buf_caps, node, slength); data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); - req_node[req_no] = node; - req_is_recv[req_no] = 0; - req_no++; - } - } - - // Unpack as soon as receive completes to reduce pure wait time. - while (pending_recv > 0) - { - int outcount = 0; - MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats); - if (outcount == MPI_UNDEFINED) break; - - for (int i = 0; i < outcount; i++) - { - int idx = completed[i]; - if (idx >= 0 && req_is_recv[idx]) - { - int recv_node_i = req_node[idx]; - data_packermix(cache.recv_bufs[recv_node_i], cache.combined_src[recv_node_i], cache.combined_dst[recv_node_i], recv_node_i, UNPACK, VarList1, VarList2, Symmetry); - pending_recv--; - } - } - } - - if (req_no > 0) MPI_Waitall(req_no, cache.reqs, cache.stats); - - if (self_len > 0) - data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); - - delete[] req_node; - delete[] req_is_recv; - delete[] completed; -} - -// collect all buffer grid segments or blocks for given patch -MyList *Parallel::build_buffer_gsl(Patch *Pat) -{ - MyList *cgsl, *gsc, *gsb; - - gsc = build_complete_gsl(Pat); // including ghost - - gsb = new MyList; - gsb->data = new Parallel::gridseg; - - for (int i = 0; i < dim; i++) - { - double DH = Pat->blb->data->getdX(i); - gsb->data->uub[i] = Pat->bbox[dim + i] - Pat->uui[i] * DH; - gsb->data->llb[i] = Pat->bbox[i] + Pat->lli[i] * DH; -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - gsb->data->shape[i] = int((gsb->data->uub[i] - gsb->data->llb[i]) / DH + 0.4) + 1; -#else -#ifdef Cell - gsb->data->shape[i] = int((gsb->data->uub[i] - gsb->data->llb[i]) / DH + 0.4); -#else -#error Not define Vertex nor Cell -#endif -#endif - } - gsb->data->Bg = 0; - gsb->next = 0; - - cgsl = gsl_subtract(gsc, gsb); - - gsc->destroyList(); - gsb->destroyList(); - - // set illb and iuub - gsb = cgsl; - while (gsb) - { - for (int i = 0; i < dim; i++) - { - double DH = Pat->blb->data->getdX(i); - gsb->data->iuub[i] = Pat->bbox[dim + i] - Pat->uui[i] * DH; - gsb->data->illb[i] = Pat->bbox[i] + Pat->lli[i] * DH; - } - gsb = gsb->next; - } - - return cgsl; -} -MyList *Parallel::build_buffer_gsl(MyList *PatL) -{ - MyList *cgsl = 0, *gs; - while (PatL) - { - if (cgsl) - { - gs->next = build_buffer_gsl(PatL->data); - gs = gs->next; - if (gs) - while (gs->next) - gs = gs->next; - } - else - { - cgsl = build_buffer_gsl(PatL->data); - gs = cgsl; - if (gs) - while (gs->next) - gs = gs->next; - } - PatL = PatL->next; - } - - return cgsl; -} -void Parallel::Prolongint(Patch *Patc, Patch *Patf, - MyList *VarList1 /* source */, MyList *VarList2 /* target */, - int Symmetry) -{ - if (Patc->lev >= Patf->lev) - { - cout << "Parallel::Prolong: meet requst of Prolong from lev#" << Patc->lev << " to lev#" << Patf->lev << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - - int myrank = 0; - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - - int num_var = 0; - MyList *varl; - varl = VarList1; - while (varl) - { - num_var++; - varl = varl->next; - } - - MyList *BP = Patf->blb; - while (BP) - { - int Npts; - if (myrank == BP->data->rank) - Npts = BP->data->shape[0] * BP->data->shape[1] * BP->data->shape[2]; - MPI_Bcast(&Npts, 1, MPI_INT, BP->data->rank, MPI_COMM_WORLD); - double *pox[3]; - for (int i = 0; i < 3; i++) - pox[i] = new double[Npts]; - if (myrank == BP->data->rank) - { - for (int i = 0; i < Npts; i++) - { - int ind[3]; - Parallel::getarrayindex(3, BP->data->shape, ind, i); - pox[0][i] = BP->data->X[0][ind[0]]; - pox[1][i] = BP->data->X[1][ind[1]]; - pox[2][i] = BP->data->X[2][ind[2]]; - } - } - for (int i = 0; i < 3; i++) - MPI_Bcast(pox[i], Npts, MPI_DOUBLE, BP->data->rank, MPI_COMM_WORLD); - double *res; - res = new double[num_var * Npts]; - Patc->Interp_Points(VarList1, Npts, pox, res, Symmetry); // because this operation is a global operation (for all processors) - // we have to isolate it out of myrank==BP->data->rank - if (myrank == BP->data->rank) - { - for (int i = 0; i < Npts; i++) - { - varl = VarList2; - int j = 0; - while (varl) - { - (BP->data->fgfs[varl->data->sgfn])[i] = res[j + i * num_var]; - j++; - varl = varl->next; - } - } - } - delete[] pox[0]; - delete[] pox[1]; - delete[] pox[2]; - delete[] res; - BP = BP->next; - } -} -// -void Parallel::merge_gsl(MyList *&A, const double ratio) -{ - if (!A) - return; - - MyList *B, *C, *D = A; - bool flag = false; - while (D->next) - { - B = D->next; - while (B) - { - flag = merge_gs(D, B, C, ratio); - if (flag) - break; - B = B->next; - } - if (flag) - break; - D = D->next; - } - - if (flag) - { - // delete D and B from A - MyList *E = A; - while (E->next) - { - MyList *tp = E->next; - if (D == tp || B == tp) - { - E->next = (tp->next) ? tp->next : 0; - delete tp->data; - delete tp; - } - if (E->next) - E = E->next; - } - - if (D == A) - { - MyList *tp = A; - A = (A->next) ? A->next : 0; - delete tp->data; - delete tp; - } - // cat C to A - if (A) - A->catList(C); - else - A = C; - - merge_gsl(A, ratio); - } -} -// -bool Parallel::merge_gs(MyList *D, MyList *B, MyList *&C, const double ratio) -{ - if (!B || !D) - return false; - - C = 0; - double llb[dim], uub[dim], DH[dim]; - for (int i = 0; i < dim; i++) - { - double tdh; -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - DH[i] = (D->data->uub[i] - D->data->llb[i]) / (D->data->shape[i] - 1); - tdh = (B->data->uub[i] - B->data->llb[i]) / (B->data->shape[i] - 1); -#else -#ifdef Cell - DH[i] = (D->data->uub[i] - D->data->llb[i]) / D->data->shape[i]; - tdh = (B->data->uub[i] - B->data->llb[i]) / B->data->shape[i]; -#else -#error Not define Vertex nor Cell -#endif -#endif - if (!feq(DH[i], tdh, DH[i] / 2)) - { - cout << "Parallel::merge_gs meets different grid segment " << DH[i] << " vs " << tdh << endl; - checkgsl(B, true); - checkgsl(D, true); - MPI_Abort(MPI_COMM_WORLD, 1); - } - llb[i] = Mymax(D->data->llb[i], B->data->llb[i]); - uub[i] = Mymin(D->data->uub[i], B->data->uub[i]); - // if(uub[i]-llb[i] < DH[i]/2) return false; //here this is valid for both vertex and cell - - // use 0 instead of DH[i]/2, we consider contact case, 2012 Aug 8 - if (uub[i] - llb[i] < 0) - return false; // here this is valid for both vertex and cell - } - - // vb: volume of B - // vd: volume of D - // vo: volume of overlap - // vt: volume of smallest common box (virtual merged box) - double vd = 1, vb = 1, vt = 1, vo = 1; - for (int i = 0; i < dim; i++) - { - vt = vt * (Mymax(D->data->uub[i], B->data->uub[i]) - Mymin(D->data->llb[i], B->data->llb[i])); - vo = vo * (uub[i] - llb[i]); - vd = vd * (D->data->uub[i] - D->data->llb[i]); - vb = vb * (B->data->uub[i] - B->data->llb[i]); - } - - // smller ratio, more possible to merge - if ((vd + vb - vo) / vt > ratio) - { - C = new MyList; - C->data = new gridseg; - for (int i = 0; i < dim; i++) - { - C->data->uub[i] = Mymax(D->data->uub[i], B->data->uub[i]); - C->data->llb[i] = Mymin(D->data->llb[i], B->data->llb[i]); -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / DH[i] + 0.4) + 1; -#else -#ifdef Cell - C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / DH[i] + 0.4); -#else -#error Not define Vertex nor Cell -#endif -#endif - } - if (D->data->Bg == B->data->Bg) - C->data->Bg = D->data->Bg; - else - C->data->Bg = 0; - - C->next = 0; - - return true; - } - else - { - return false; - } -} -// Add ghost region to tangent plane -// we assume the grids have the same resolution -void Parallel::add_ghost_touch(MyList *&A) -{ - if (!A || !(A->next)) - return; - - double DH[dim]; -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - for (int i = 0; i < dim; i++) - DH[i] = (A->data->uub[i] - A->data->llb[i]) / (A->data->shape[i] - 1) / 2; -#else -#ifdef Cell - for (int i = 0; i < dim; i++) - DH[i] = (A->data->uub[i] - A->data->llb[i]) / A->data->shape[i] / 2; -#else -#error Not define Vertex nor Cell -#endif -#endif - - MyList *C1, *C2, *A1 = A, *A2, *dc; - dc = C1 = clone_gsl(A, false); - while (C1) - { - C2 = C1->next; - A2 = A1->next; - while (C2) - { - for (int i = 0; i < dim; i++) - { - if (feq(C1->data->llb[i], C2->data->uub[i], DH[i])) - { - // direction i touch, other directions overlap - bool flag = true; - for (int j = 0; j < i; j++) - if ((C1->data->llb[j] - C2->data->llb[j]) * (C1->data->uub[j] - C2->data->llb[j]) > 0 && - (C2->data->llb[j] - C1->data->llb[j]) * (C2->data->uub[j] - C1->data->llb[j]) > 0) - flag = false; - for (int j = i + 1; j < dim; j++) - if ((C1->data->llb[j] - C2->data->llb[j]) * (C1->data->uub[j] - C2->data->llb[j]) > 0 && - (C2->data->llb[j] - C1->data->llb[j]) * (C2->data->uub[j] - C1->data->llb[j]) > 0) - flag = false; - - if (flag) - { - // only add one ghost region - if (feq(A1->data->llb[i], C1->data->llb[i], DH[i])) - { - A1->data->llb[i] -= ghost_width * 2 * DH[i]; - A1->data->shape[i] += ghost_width; - } - if (feq(A2->data->uub[i], C2->data->uub[i], DH[i])) - { - A2->data->uub[i] += ghost_width * 2 * DH[i]; - A2->data->shape[i] += ghost_width; - } - } - } - if (feq(C1->data->uub[i], C2->data->llb[i], DH[i])) - { - // direction i touch, other directions overlap - bool flag = true; - for (int j = 0; j < i; j++) - if ((C1->data->llb[j] - C2->data->llb[j]) * (C1->data->uub[j] - C2->data->llb[j]) > 0 && - (C2->data->llb[j] - C1->data->llb[j]) * (C2->data->uub[j] - C1->data->llb[j]) > 0) - flag = false; - for (int j = i + 1; j < dim; j++) - if ((C1->data->llb[j] - C2->data->llb[j]) * (C1->data->uub[j] - C2->data->llb[j]) > 0 && - (C2->data->llb[j] - C1->data->llb[j]) * (C2->data->uub[j] - C1->data->llb[j]) > 0) - flag = false; - - if (flag) - { - // only add one ghost region - if (feq(A1->data->uub[i], C1->data->uub[i], DH[i])) - { - A1->data->uub[i] += ghost_width * 2 * DH[i]; - A1->data->shape[i] += ghost_width; - } - if (feq(A2->data->llb[i], C2->data->llb[i], DH[i])) - { - A2->data->llb[i] -= ghost_width * 2 * DH[i]; - A2->data->shape[i] += ghost_width; - } - } - } - } - C2 = C2->next; - A2 = A2->next; - } - C1 = C1->next; - A1 = A1->next; - } - - if (dc) - dc->destroyList(); -} -// According to overlap to cut the gsl into recular pices -void Parallel::cut_gsl(MyList *&A) -{ - if (!A) - return; - - MyList *B, *C, *D = A; - bool flag = false; - while (D->next) - { - B = D->next; - while (B) - { - flag = cut_gs(D, B, C); - if (flag) - break; - B = B->next; - } - if (flag) - break; - D = D->next; - } - - if (flag) - { - // delete D and B from A - MyList *E = A; - while (E->next) - { - MyList *tp = E->next; - if (D == tp || B == tp) - { - E->next = (tp->next) ? tp->next : 0; - delete tp->data; - delete tp; - } - if (E->next) - E = E->next; - } - - if (D == A) - { - MyList *tp = A; - A = (A->next) ? A->next : 0; - delete tp->data; - delete tp; - } - // cat C to A - if (A) - A->catList(C); - else - A = C; - - cut_gsl(A); - } -} -// when D and B have overlap, cut them into C and return true -// otherwise return false and C=0 -bool Parallel::cut_gs(MyList *D, MyList *B, MyList *&C) -{ - C = 0; - double llb[dim], uub[dim], DH[dim]; - for (int i = 0; i < dim; i++) - { - double tdh; -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - DH[i] = (D->data->uub[i] - D->data->llb[i]) / (D->data->shape[i] - 1); - tdh = (B->data->uub[i] - B->data->llb[i]) / (B->data->shape[i] - 1); -#else -#ifdef Cell - DH[i] = (D->data->uub[i] - D->data->llb[i]) / D->data->shape[i]; - tdh = (B->data->uub[i] - B->data->llb[i]) / B->data->shape[i]; -#else -#error Not define Vertex nor Cell -#endif -#endif - if (!feq(DH[i], tdh, DH[i] / 2)) - { - cout << "Parallel::cut_gs meets different grid segment " << DH[i] << " vs " << tdh << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - llb[i] = Mymax(D->data->llb[i], B->data->llb[i]); - uub[i] = Mymin(D->data->uub[i], B->data->uub[i]); - // for efficiency we ask the width of the patch at least 2(buffer+ghost+BD ghost) - if (uub[i] - llb[i] < DH[i] * 2 * (buffer_width + 2 * ghost_width)) - return false; // here this is valid for both vertex and cell - } - - // this part code results in 5 patches generally - - C = new MyList; - C->data = new gridseg; - for (int i = 0; i < dim; i++) - { - C->data->llb[i] = llb[i]; - C->data->uub[i] = uub[i]; -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / DH[i] + 0.4) + 1; -#else -#ifdef Cell - C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / DH[i] + 0.4); -#else -#error Not define Vertex nor Cell -#endif -#endif - } - if (D->data->Bg == B->data->Bg) - C->data->Bg = D->data->Bg; - else - C->data->Bg = 0; - - C->next = gs_subtract_virtual(D, C); - - MyList *E = C; - - while (E->next) - E = E->next; - - E->next = gs_subtract_virtual(B, C); - - // this part code results in 3 patches generally - /* - C = clone_gsl(D,true); - C->next = gs_subtract_virtual(B,C); - */ - - return true; -} -// note here it is different to real cut, we need leave the cutting edge for both vertex center and cell center -MyList *Parallel::gs_subtract_virtual(MyList *A, MyList *B) -{ - if (!A) - return 0; - if (!B) - return clone_gsl(A, true); - - double cut_plane[2 * dim], DH[dim]; - - for (int i = 0; i < dim; i++) - { - double tdh; -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - DH[i] = (A->data->uub[i] - A->data->llb[i]) / (A->data->shape[i] - 1); - tdh = (B->data->uub[i] - B->data->llb[i]) / (B->data->shape[i] - 1); -#else -#ifdef Cell - DH[i] = (A->data->uub[i] - A->data->llb[i]) / A->data->shape[i]; - tdh = (B->data->uub[i] - B->data->llb[i]) / B->data->shape[i]; -#else -#error Not define Vertex nor Cell -#endif -#endif - if (!feq(DH[i], tdh, DH[i] / 2)) - { - cout << "Parallel::gs_subtract_virtual meets different grid segment " << DH[i] << " vs " << tdh << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - } - - MyList *C = 0, *q; - for (int i = 0; i < dim; i++) - { - if (B->data->llb[i] > A->data->uub[i] || B->data->uub[i] < A->data->llb[i]) - return clone_gsl(A, true); - cut_plane[i] = A->data->llb[i]; - cut_plane[i + dim] = A->data->uub[i]; - } - - for (int i = 0; i < dim; i++) - { - cut_plane[i] = Mymax(A->data->llb[i], B->data->llb[i]); - if (cut_plane[i] > A->data->llb[i]) - { - q = clone_gsl(A, true); - // prolong the list from head - if (C) - q->next = C; - C = q; - for (int j = 0; j < dim; j++) - { - if (i == j) - { - C->data->llb[i] = A->data->llb[i]; - // **note here it is different to real cut, we need leave the cutting edge for both vertex center and cell center** - C->data->uub[i] = Mymax(C->data->llb[i], cut_plane[i]); - } - else - { - C->data->llb[j] = cut_plane[j]; - C->data->uub[j] = cut_plane[j + dim]; - } -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4) + 1; -#else -#ifdef Cell - C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4); -#else -#error Not define Vertex nor Cell -#endif -#endif - } - } - - cut_plane[i + dim] = Mymin(A->data->uub[i], B->data->uub[i]); - if (cut_plane[i + dim] < A->data->uub[i]) - { - q = clone_gsl(A, true); - if (C) - q->next = C; - C = q; - for (int j = 0; j < dim; j++) - { - if (i == j) - { - C->data->uub[i] = A->data->uub[i]; - // note here it is different to real cut, we need leave the cutting edge for both vertex center and cell center - C->data->llb[i] = Mymin(C->data->uub[i], cut_plane[i + dim]); - } - else - { - C->data->llb[j] = cut_plane[j]; - C->data->uub[j] = cut_plane[j + dim]; - } -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4) + 1; -#else -#ifdef Cell - C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4); -#else -#error Not define Vertex nor Cell -#endif -#endif - } - } - } - return C; -} -// note the data structure -// if CC is true -// 1 ----------- 1 ------ ^ -// 0 ------ | t -// 0 ----------- old ------ | -// -// old ----------- -// if CC is false -// 1 ----------- 1 ------ ^ -// 0 ----------- 0 ------ | t -// old ----------- old ------ | -void Parallel::fill_level_data(MyList *PatLd, MyList *PatLs, MyList *PatcL, - MyList *OldList, MyList *StateList, MyList *FutureList, - MyList *tmList, int Symmetry, bool BB, bool CC) -{ - if (PatLd->data->lev != PatLs->data->lev) - { - cout << "Parallel::fill_level_data: meet requst from lev#" << PatLs->data->lev << " to lev#" << PatLd->data->lev << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - if (PatLd->data->lev <= PatcL->data->lev) - { - cout << "Parallel::fill_level_data: meet prolong requst from lev#" << PatcL->data->lev << " to lev#" << PatLd->data->lev << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - - int cpusize; - MPI_Comm_size(MPI_COMM_WORLD, &cpusize); - - MyList *VarList = 0; - MyList *p; - p = StateList; - while (p) - { - if (VarList) - VarList->insert(p->data); - else - VarList = new MyList(p->data); - p = p->next; - } - p = FutureList; - while (p) - { - if (VarList) - VarList->insert(p->data); - else - VarList = new MyList(p->data); - p = p->next; - } - - MyList *dst; - MyList **src, **transfer_src, **transfer_dst; - src = new MyList *[cpusize]; - transfer_src = new MyList *[cpusize]; - transfer_dst = new MyList *[cpusize]; - - dst = build_complete_gsl(PatLd); // including ghost - // copy part - for (int node = 0; node < cpusize; node++) - { - src[node] = build_owned_gsl(PatLs, node, 0, Symmetry); // similar to Sync - build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node - } - - transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry); - - for (int node = 0; node < cpusize; node++) - { - if (src[node]) - src[node]->destroyList(); - if (transfer_src[node]) - transfer_src[node]->destroyList(); - if (transfer_dst[node]) - transfer_dst[node]->destroyList(); - } - - MyList *dsts, *dstd; - dsts = build_complete_gsl_virtual(PatLs); - dstd = dst; - dst = gsl_subtract(dstd, dsts); - if (dstd) - dstd->destroyList(); - if (dsts) - dsts->destroyList(); - - if (dst) - { - // prolongation part - for (int node = 0; node < cpusize; node++) - { - src[node] = build_owned_gsl(PatcL, node, 4, Symmetry); // - buffer - ghost - BD ghost - build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node - } - - if (CC) - { - // for FutureList - // restrict first~~~> - { - Restrict(PatcL, PatLs, FutureList, FutureList, Symmetry); - Sync(PatcL, FutureList, Symmetry); - } - //<~~~prolong then - transfer(transfer_src, transfer_dst, FutureList, FutureList, Symmetry); - - // for StateList - // time interpolation part - if (BB) - prepare_inter_time_level(PatcL, FutureList, StateList, OldList, - tmList, 0); // use SynchList_pre as temporal storage space - else - prepare_inter_time_level(PatcL, FutureList, StateList, - tmList, 0); // use SynchList_pre as temporal storage space - // restrict first~~~> - { - Restrict(PatcL, PatLs, StateList, tmList, Symmetry); - Sync(PatcL, tmList, Symmetry); - } - //<~~~prolong then - transfer(transfer_src, transfer_dst, tmList, StateList, Symmetry); - } - else - { - // for both FutureList and StateList - // restrict first~~~> - { - Restrict(PatcL, PatLs, VarList, VarList, Symmetry); - Sync(PatcL, VarList, Symmetry); - } - //<~~~prolong then - transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry); - } - - for (int node = 0; node < cpusize; node++) - { - if (src[node]) - src[node]->destroyList(); - if (transfer_src[node]) - transfer_src[node]->destroyList(); - if (transfer_dst[node]) - transfer_dst[node]->destroyList(); - } - - dst->destroyList(); - } - - delete[] src; - delete[] transfer_src; - delete[] transfer_dst; - - VarList->clearList(); -} -void Parallel::KillBlocks(MyList *PatchLIST) -{ - while (PatchLIST) - { - Patch *Pp = PatchLIST->data; - MyList *bg; - while (Pp->blb) - { - if (Pp->blb == Pp->ble) - break; - bg = (Pp->blb->next) ? Pp->blb->next : 0; - delete Pp->blb->data; - delete Pp->blb; - Pp->blb = bg; - } - if (Pp->ble) - { - delete Pp->ble->data; - delete Pp->ble; - } - Pp->blb = Pp->ble = 0; - PatchLIST = PatchLIST->next; - } -} -bool Parallel::PatList_Interp_Points(MyList *PatL, MyList *VarList, - int NN, double **XX, - double *Shellf, int Symmetry) -{ - MyList *varl; - int num_var = 0; - varl = VarList; - while (varl) - { - num_var++; - varl = varl->next; - } - - double lld[dim], uud[dim]; - double **pox; - pox = new double *[dim]; - for (int j = 0; j < dim; j++) - pox[j] = new double[1]; - for (int i = 0; i < NN; i++) - { - MyList *PL = PatL; - while (PL) - { - bool flag = true; - for (int j = 0; j < dim; j++) - { - double h = PL->data->getdX(j); - lld[j] = PL->data->lli[j] * h; - uud[j] = PL->data->uui[j] * h; - if (XX[j][i] < PL->data->bbox[j] + lld[j] || XX[j][i] > PL->data->bbox[j + dim] - uud[j]) - { - flag = false; - break; - } - pox[j][0] = XX[j][i]; - } - if (flag) - { - PL->data->Interp_Points(VarList, 1, pox, Shellf + i * num_var, Symmetry); - break; - } - PL = PL->next; - } - if (!PL) - { - checkpatchlist(PatL, false); - return false; - } - } - for (int j = 0; j < dim; j++) - delete[] pox[j]; - delete[] pox; - - return true; -} -bool Parallel::PatList_Interp_Points(MyList *PatL, MyList *VarList, - int NN, double **XX, - double *Shellf, int Symmetry, MPI_Comm Comm_here) -{ - MyList *varl; - int num_var = 0; - varl = VarList; - while (varl) - { - num_var++; - varl = varl->next; - } - - double lld[dim], uud[dim]; - double **pox; - pox = new double *[dim]; - for (int j = 0; j < dim; j++) - pox[j] = new double[1]; - for (int i = 0; i < NN; i++) - { - MyList *PL = PatL; - while (PL) - { - bool flag = true; - for (int j = 0; j < dim; j++) - { - double h = PL->data->getdX(j); - lld[j] = PL->data->lli[j] * h; - uud[j] = PL->data->uui[j] * h; - if (XX[j][i] < PL->data->bbox[j] + lld[j] || XX[j][i] > PL->data->bbox[j + dim] - uud[j]) - { - flag = false; - break; - } - pox[j][0] = XX[j][i]; - } - if (flag) - { - PL->data->Interp_Points(VarList, 1, pox, Shellf + i * num_var, Symmetry, Comm_here); - break; - } - PL = PL->next; - } - if (!PL) - { - checkpatchlist(PatL, false); - return false; - } - } - for (int j = 0; j < dim; j++) - delete[] pox[j]; - delete[] pox; - - return true; -} -void Parallel::aligncheck(double *bbox0, double *bboxl, int lev, double *DH0, int *shape) -{ - const double aligntiny = 0.1; - double DHl, rr; - int NN; - for (int i = 0; i < dim; i++) - { - DHl = DH0[i] * pow(0.5, lev); - rr = bboxl[i] - bbox0[i]; - bboxl[i] = bbox0[i] + int(rr / DHl + 0.4) * DHl; - rr = bbox0[i + dim] - bboxl[i + dim]; - bboxl[i + dim] = bbox0[i + dim] - int(rr / DHl + 0.4) * DHl; -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - NN = int((bboxl[i + dim] - bboxl[i]) / DHl + 0.4) + 1; -#else -#ifdef Cell - NN = int((bboxl[i + dim] - bboxl[i]) / DHl + 0.4); -#else -#error Not define Vertex nor Cell -#endif -#endif - if (NN != shape[i]) - { - int myrank; - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - if (myrank == 0) - { - cout << "Parallel::aligncheck want shape " << NN << " for lev#" << lev << ", but " << shape[i] << endl; - cout << "i = " << i << ", low = " << bboxl[i] << ", up = " << bboxl[i + dim] << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - } - } -} -bool Parallel::point_locat_gsl(double *pox, MyList *gsl) -{ - bool flag = false; - while (gsl) - { - for (int i = 0; i < dim; i++) - { - if (pox[i] > gsl->data->llb[i] && pox[i] < gsl->data->uub[i]) - flag = true; - else - { - flag = false; - break; - } - } - if (flag) - break; - gsl = gsl->next; - } - - return flag; -} -void Parallel::checkpatchlist(MyList *PatL, bool buflog) -{ - MyList *PL = PatL; - while (PL) - { - PL->data->checkPatch(buflog); - PL = PL->next; - } -} + MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); + req_node[req_no] = node; + req_is_recv[req_no] = 0; + req_no++; + } + } + + // Unpack as soon as receive completes to reduce pure wait time. + while (pending_recv > 0) + { + int outcount = 0; + MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats); + if (outcount == MPI_UNDEFINED) break; + + for (int i = 0; i < outcount; i++) + { + int idx = completed[i]; + if (idx >= 0 && req_is_recv[idx]) + { + int recv_node_i = req_node[idx]; + data_packermix(cache.recv_bufs[recv_node_i], cache.combined_src[recv_node_i], cache.combined_dst[recv_node_i], recv_node_i, UNPACK, VarList1, VarList2, Symmetry); + pending_recv--; + } + } + } + + if (req_no > 0) MPI_Waitall(req_no, cache.reqs, cache.stats); + + if (self_len > 0) + data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); + + delete[] req_node; + delete[] req_is_recv; + delete[] completed; +} + +// collect all buffer grid segments or blocks for given patch +MyList *Parallel::build_buffer_gsl(Patch *Pat) +{ + MyList *cgsl, *gsc, *gsb; + + gsc = build_complete_gsl(Pat); // including ghost + + gsb = new MyList; + gsb->data = new Parallel::gridseg; + + for (int i = 0; i < dim; i++) + { + double DH = Pat->blb->data->getdX(i); + gsb->data->uub[i] = Pat->bbox[dim + i] - Pat->uui[i] * DH; + gsb->data->llb[i] = Pat->bbox[i] + Pat->lli[i] * DH; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + gsb->data->shape[i] = int((gsb->data->uub[i] - gsb->data->llb[i]) / DH + 0.4) + 1; +#else +#ifdef Cell + gsb->data->shape[i] = int((gsb->data->uub[i] - gsb->data->llb[i]) / DH + 0.4); +#else +#error Not define Vertex nor Cell +#endif +#endif + } + gsb->data->Bg = 0; + gsb->next = 0; + + cgsl = gsl_subtract(gsc, gsb); + + gsc->destroyList(); + gsb->destroyList(); + + // set illb and iuub + gsb = cgsl; + while (gsb) + { + for (int i = 0; i < dim; i++) + { + double DH = Pat->blb->data->getdX(i); + gsb->data->iuub[i] = Pat->bbox[dim + i] - Pat->uui[i] * DH; + gsb->data->illb[i] = Pat->bbox[i] + Pat->lli[i] * DH; + } + gsb = gsb->next; + } + + return cgsl; +} +MyList *Parallel::build_buffer_gsl(MyList *PatL) +{ + MyList *cgsl = 0, *gs; + while (PatL) + { + if (cgsl) + { + gs->next = build_buffer_gsl(PatL->data); + gs = gs->next; + if (gs) + while (gs->next) + gs = gs->next; + } + else + { + cgsl = build_buffer_gsl(PatL->data); + gs = cgsl; + if (gs) + while (gs->next) + gs = gs->next; + } + PatL = PatL->next; + } + + return cgsl; +} +void Parallel::Prolongint(Patch *Patc, Patch *Patf, + MyList *VarList1 /* source */, MyList *VarList2 /* target */, + int Symmetry) +{ + if (Patc->lev >= Patf->lev) + { + cout << "Parallel::Prolong: meet requst of Prolong from lev#" << Patc->lev << " to lev#" << Patf->lev << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + int myrank = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + + int num_var = 0; + MyList *varl; + varl = VarList1; + while (varl) + { + num_var++; + varl = varl->next; + } + + MyList *BP = Patf->blb; + while (BP) + { + int Npts; + if (myrank == BP->data->rank) + Npts = BP->data->shape[0] * BP->data->shape[1] * BP->data->shape[2]; + MPI_Bcast(&Npts, 1, MPI_INT, BP->data->rank, MPI_COMM_WORLD); + double *pox[3]; + for (int i = 0; i < 3; i++) + pox[i] = new double[Npts]; + if (myrank == BP->data->rank) + { + for (int i = 0; i < Npts; i++) + { + int ind[3]; + Parallel::getarrayindex(3, BP->data->shape, ind, i); + pox[0][i] = BP->data->X[0][ind[0]]; + pox[1][i] = BP->data->X[1][ind[1]]; + pox[2][i] = BP->data->X[2][ind[2]]; + } + } + for (int i = 0; i < 3; i++) + MPI_Bcast(pox[i], Npts, MPI_DOUBLE, BP->data->rank, MPI_COMM_WORLD); + double *res; + res = new double[num_var * Npts]; + Patc->Interp_Points(VarList1, Npts, pox, res, Symmetry); // because this operation is a global operation (for all processors) + // we have to isolate it out of myrank==BP->data->rank + if (myrank == BP->data->rank) + { + for (int i = 0; i < Npts; i++) + { + varl = VarList2; + int j = 0; + while (varl) + { + (BP->data->fgfs[varl->data->sgfn])[i] = res[j + i * num_var]; + j++; + varl = varl->next; + } + } + } + delete[] pox[0]; + delete[] pox[1]; + delete[] pox[2]; + delete[] res; + BP = BP->next; + } +} +// +void Parallel::merge_gsl(MyList *&A, const double ratio) +{ + if (!A) + return; + + MyList *B, *C, *D = A; + bool flag = false; + while (D->next) + { + B = D->next; + while (B) + { + flag = merge_gs(D, B, C, ratio); + if (flag) + break; + B = B->next; + } + if (flag) + break; + D = D->next; + } + + if (flag) + { + // delete D and B from A + MyList *E = A; + while (E->next) + { + MyList *tp = E->next; + if (D == tp || B == tp) + { + E->next = (tp->next) ? tp->next : 0; + delete tp->data; + delete tp; + } + if (E->next) + E = E->next; + } + + if (D == A) + { + MyList *tp = A; + A = (A->next) ? A->next : 0; + delete tp->data; + delete tp; + } + // cat C to A + if (A) + A->catList(C); + else + A = C; + + merge_gsl(A, ratio); + } +} +// +bool Parallel::merge_gs(MyList *D, MyList *B, MyList *&C, const double ratio) +{ + if (!B || !D) + return false; + + C = 0; + double llb[dim], uub[dim], DH[dim]; + for (int i = 0; i < dim; i++) + { + double tdh; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + DH[i] = (D->data->uub[i] - D->data->llb[i]) / (D->data->shape[i] - 1); + tdh = (B->data->uub[i] - B->data->llb[i]) / (B->data->shape[i] - 1); +#else +#ifdef Cell + DH[i] = (D->data->uub[i] - D->data->llb[i]) / D->data->shape[i]; + tdh = (B->data->uub[i] - B->data->llb[i]) / B->data->shape[i]; +#else +#error Not define Vertex nor Cell +#endif +#endif + if (!feq(DH[i], tdh, DH[i] / 2)) + { + cout << "Parallel::merge_gs meets different grid segment " << DH[i] << " vs " << tdh << endl; + checkgsl(B, true); + checkgsl(D, true); + MPI_Abort(MPI_COMM_WORLD, 1); + } + llb[i] = Mymax(D->data->llb[i], B->data->llb[i]); + uub[i] = Mymin(D->data->uub[i], B->data->uub[i]); + // if(uub[i]-llb[i] < DH[i]/2) return false; //here this is valid for both vertex and cell + + // use 0 instead of DH[i]/2, we consider contact case, 2012 Aug 8 + if (uub[i] - llb[i] < 0) + return false; // here this is valid for both vertex and cell + } + + // vb: volume of B + // vd: volume of D + // vo: volume of overlap + // vt: volume of smallest common box (virtual merged box) + double vd = 1, vb = 1, vt = 1, vo = 1; + for (int i = 0; i < dim; i++) + { + vt = vt * (Mymax(D->data->uub[i], B->data->uub[i]) - Mymin(D->data->llb[i], B->data->llb[i])); + vo = vo * (uub[i] - llb[i]); + vd = vd * (D->data->uub[i] - D->data->llb[i]); + vb = vb * (B->data->uub[i] - B->data->llb[i]); + } + + // smller ratio, more possible to merge + if ((vd + vb - vo) / vt > ratio) + { + C = new MyList; + C->data = new gridseg; + for (int i = 0; i < dim; i++) + { + C->data->uub[i] = Mymax(D->data->uub[i], B->data->uub[i]); + C->data->llb[i] = Mymin(D->data->llb[i], B->data->llb[i]); +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / DH[i] + 0.4) + 1; +#else +#ifdef Cell + C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / DH[i] + 0.4); +#else +#error Not define Vertex nor Cell +#endif +#endif + } + if (D->data->Bg == B->data->Bg) + C->data->Bg = D->data->Bg; + else + C->data->Bg = 0; + + C->next = 0; + + return true; + } + else + { + return false; + } +} +// Add ghost region to tangent plane +// we assume the grids have the same resolution +void Parallel::add_ghost_touch(MyList *&A) +{ + if (!A || !(A->next)) + return; + + double DH[dim]; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + for (int i = 0; i < dim; i++) + DH[i] = (A->data->uub[i] - A->data->llb[i]) / (A->data->shape[i] - 1) / 2; +#else +#ifdef Cell + for (int i = 0; i < dim; i++) + DH[i] = (A->data->uub[i] - A->data->llb[i]) / A->data->shape[i] / 2; +#else +#error Not define Vertex nor Cell +#endif +#endif + + MyList *C1, *C2, *A1 = A, *A2, *dc; + dc = C1 = clone_gsl(A, false); + while (C1) + { + C2 = C1->next; + A2 = A1->next; + while (C2) + { + for (int i = 0; i < dim; i++) + { + if (feq(C1->data->llb[i], C2->data->uub[i], DH[i])) + { + // direction i touch, other directions overlap + bool flag = true; + for (int j = 0; j < i; j++) + if ((C1->data->llb[j] - C2->data->llb[j]) * (C1->data->uub[j] - C2->data->llb[j]) > 0 && + (C2->data->llb[j] - C1->data->llb[j]) * (C2->data->uub[j] - C1->data->llb[j]) > 0) + flag = false; + for (int j = i + 1; j < dim; j++) + if ((C1->data->llb[j] - C2->data->llb[j]) * (C1->data->uub[j] - C2->data->llb[j]) > 0 && + (C2->data->llb[j] - C1->data->llb[j]) * (C2->data->uub[j] - C1->data->llb[j]) > 0) + flag = false; + + if (flag) + { + // only add one ghost region + if (feq(A1->data->llb[i], C1->data->llb[i], DH[i])) + { + A1->data->llb[i] -= ghost_width * 2 * DH[i]; + A1->data->shape[i] += ghost_width; + } + if (feq(A2->data->uub[i], C2->data->uub[i], DH[i])) + { + A2->data->uub[i] += ghost_width * 2 * DH[i]; + A2->data->shape[i] += ghost_width; + } + } + } + if (feq(C1->data->uub[i], C2->data->llb[i], DH[i])) + { + // direction i touch, other directions overlap + bool flag = true; + for (int j = 0; j < i; j++) + if ((C1->data->llb[j] - C2->data->llb[j]) * (C1->data->uub[j] - C2->data->llb[j]) > 0 && + (C2->data->llb[j] - C1->data->llb[j]) * (C2->data->uub[j] - C1->data->llb[j]) > 0) + flag = false; + for (int j = i + 1; j < dim; j++) + if ((C1->data->llb[j] - C2->data->llb[j]) * (C1->data->uub[j] - C2->data->llb[j]) > 0 && + (C2->data->llb[j] - C1->data->llb[j]) * (C2->data->uub[j] - C1->data->llb[j]) > 0) + flag = false; + + if (flag) + { + // only add one ghost region + if (feq(A1->data->uub[i], C1->data->uub[i], DH[i])) + { + A1->data->uub[i] += ghost_width * 2 * DH[i]; + A1->data->shape[i] += ghost_width; + } + if (feq(A2->data->llb[i], C2->data->llb[i], DH[i])) + { + A2->data->llb[i] -= ghost_width * 2 * DH[i]; + A2->data->shape[i] += ghost_width; + } + } + } + } + C2 = C2->next; + A2 = A2->next; + } + C1 = C1->next; + A1 = A1->next; + } + + if (dc) + dc->destroyList(); +} +// According to overlap to cut the gsl into recular pices +void Parallel::cut_gsl(MyList *&A) +{ + if (!A) + return; + + MyList *B, *C, *D = A; + bool flag = false; + while (D->next) + { + B = D->next; + while (B) + { + flag = cut_gs(D, B, C); + if (flag) + break; + B = B->next; + } + if (flag) + break; + D = D->next; + } + + if (flag) + { + // delete D and B from A + MyList *E = A; + while (E->next) + { + MyList *tp = E->next; + if (D == tp || B == tp) + { + E->next = (tp->next) ? tp->next : 0; + delete tp->data; + delete tp; + } + if (E->next) + E = E->next; + } + + if (D == A) + { + MyList *tp = A; + A = (A->next) ? A->next : 0; + delete tp->data; + delete tp; + } + // cat C to A + if (A) + A->catList(C); + else + A = C; + + cut_gsl(A); + } +} +// when D and B have overlap, cut them into C and return true +// otherwise return false and C=0 +bool Parallel::cut_gs(MyList *D, MyList *B, MyList *&C) +{ + C = 0; + double llb[dim], uub[dim], DH[dim]; + for (int i = 0; i < dim; i++) + { + double tdh; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + DH[i] = (D->data->uub[i] - D->data->llb[i]) / (D->data->shape[i] - 1); + tdh = (B->data->uub[i] - B->data->llb[i]) / (B->data->shape[i] - 1); +#else +#ifdef Cell + DH[i] = (D->data->uub[i] - D->data->llb[i]) / D->data->shape[i]; + tdh = (B->data->uub[i] - B->data->llb[i]) / B->data->shape[i]; +#else +#error Not define Vertex nor Cell +#endif +#endif + if (!feq(DH[i], tdh, DH[i] / 2)) + { + cout << "Parallel::cut_gs meets different grid segment " << DH[i] << " vs " << tdh << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + llb[i] = Mymax(D->data->llb[i], B->data->llb[i]); + uub[i] = Mymin(D->data->uub[i], B->data->uub[i]); + // for efficiency we ask the width of the patch at least 2(buffer+ghost+BD ghost) + if (uub[i] - llb[i] < DH[i] * 2 * (buffer_width + 2 * ghost_width)) + return false; // here this is valid for both vertex and cell + } + + // this part code results in 5 patches generally + + C = new MyList; + C->data = new gridseg; + for (int i = 0; i < dim; i++) + { + C->data->llb[i] = llb[i]; + C->data->uub[i] = uub[i]; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / DH[i] + 0.4) + 1; +#else +#ifdef Cell + C->data->shape[i] = int((C->data->uub[i] - C->data->llb[i]) / DH[i] + 0.4); +#else +#error Not define Vertex nor Cell +#endif +#endif + } + if (D->data->Bg == B->data->Bg) + C->data->Bg = D->data->Bg; + else + C->data->Bg = 0; + + C->next = gs_subtract_virtual(D, C); + + MyList *E = C; + + while (E->next) + E = E->next; + + E->next = gs_subtract_virtual(B, C); + + // this part code results in 3 patches generally + /* + C = clone_gsl(D,true); + C->next = gs_subtract_virtual(B,C); + */ + + return true; +} +// note here it is different to real cut, we need leave the cutting edge for both vertex center and cell center +MyList *Parallel::gs_subtract_virtual(MyList *A, MyList *B) +{ + if (!A) + return 0; + if (!B) + return clone_gsl(A, true); + + double cut_plane[2 * dim], DH[dim]; + + for (int i = 0; i < dim; i++) + { + double tdh; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + DH[i] = (A->data->uub[i] - A->data->llb[i]) / (A->data->shape[i] - 1); + tdh = (B->data->uub[i] - B->data->llb[i]) / (B->data->shape[i] - 1); +#else +#ifdef Cell + DH[i] = (A->data->uub[i] - A->data->llb[i]) / A->data->shape[i]; + tdh = (B->data->uub[i] - B->data->llb[i]) / B->data->shape[i]; +#else +#error Not define Vertex nor Cell +#endif +#endif + if (!feq(DH[i], tdh, DH[i] / 2)) + { + cout << "Parallel::gs_subtract_virtual meets different grid segment " << DH[i] << " vs " << tdh << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + + MyList *C = 0, *q; + for (int i = 0; i < dim; i++) + { + if (B->data->llb[i] > A->data->uub[i] || B->data->uub[i] < A->data->llb[i]) + return clone_gsl(A, true); + cut_plane[i] = A->data->llb[i]; + cut_plane[i + dim] = A->data->uub[i]; + } + + for (int i = 0; i < dim; i++) + { + cut_plane[i] = Mymax(A->data->llb[i], B->data->llb[i]); + if (cut_plane[i] > A->data->llb[i]) + { + q = clone_gsl(A, true); + // prolong the list from head + if (C) + q->next = C; + C = q; + for (int j = 0; j < dim; j++) + { + if (i == j) + { + C->data->llb[i] = A->data->llb[i]; + // **note here it is different to real cut, we need leave the cutting edge for both vertex center and cell center** + C->data->uub[i] = Mymax(C->data->llb[i], cut_plane[i]); + } + else + { + C->data->llb[j] = cut_plane[j]; + C->data->uub[j] = cut_plane[j + dim]; + } +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4) + 1; +#else +#ifdef Cell + C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4); +#else +#error Not define Vertex nor Cell +#endif +#endif + } + } + + cut_plane[i + dim] = Mymin(A->data->uub[i], B->data->uub[i]); + if (cut_plane[i + dim] < A->data->uub[i]) + { + q = clone_gsl(A, true); + if (C) + q->next = C; + C = q; + for (int j = 0; j < dim; j++) + { + if (i == j) + { + C->data->uub[i] = A->data->uub[i]; + // note here it is different to real cut, we need leave the cutting edge for both vertex center and cell center + C->data->llb[i] = Mymin(C->data->uub[i], cut_plane[i + dim]); + } + else + { + C->data->llb[j] = cut_plane[j]; + C->data->uub[j] = cut_plane[j + dim]; + } +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4) + 1; +#else +#ifdef Cell + C->data->shape[j] = int((C->data->uub[j] - C->data->llb[j]) / DH[j] + 0.4); +#else +#error Not define Vertex nor Cell +#endif +#endif + } + } + } + return C; +} +// note the data structure +// if CC is true +// 1 ----------- 1 ------ ^ +// 0 ------ | t +// 0 ----------- old ------ | +// +// old ----------- +// if CC is false +// 1 ----------- 1 ------ ^ +// 0 ----------- 0 ------ | t +// old ----------- old ------ | +void Parallel::fill_level_data(MyList *PatLd, MyList *PatLs, MyList *PatcL, + MyList *OldList, MyList *StateList, MyList *FutureList, + MyList *tmList, int Symmetry, bool BB, bool CC) +{ + if (PatLd->data->lev != PatLs->data->lev) + { + cout << "Parallel::fill_level_data: meet requst from lev#" << PatLs->data->lev << " to lev#" << PatLd->data->lev << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + if (PatLd->data->lev <= PatcL->data->lev) + { + cout << "Parallel::fill_level_data: meet prolong requst from lev#" << PatcL->data->lev << " to lev#" << PatLd->data->lev << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + int cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); + + MyList *VarList = 0; + MyList *p; + p = StateList; + while (p) + { + if (VarList) + VarList->insert(p->data); + else + VarList = new MyList(p->data); + p = p->next; + } + p = FutureList; + while (p) + { + if (VarList) + VarList->insert(p->data); + else + VarList = new MyList(p->data); + p = p->next; + } + + MyList *dst; + MyList **src, **transfer_src, **transfer_dst; + src = new MyList *[cpusize]; + transfer_src = new MyList *[cpusize]; + transfer_dst = new MyList *[cpusize]; + + dst = build_complete_gsl(PatLd); // including ghost + // copy part + for (int node = 0; node < cpusize; node++) + { + src[node] = build_owned_gsl(PatLs, node, 0, Symmetry); // similar to Sync + build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node + } + + transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry); + + for (int node = 0; node < cpusize; node++) + { + if (src[node]) + src[node]->destroyList(); + if (transfer_src[node]) + transfer_src[node]->destroyList(); + if (transfer_dst[node]) + transfer_dst[node]->destroyList(); + } + + MyList *dsts, *dstd; + dsts = build_complete_gsl_virtual(PatLs); + dstd = dst; + dst = gsl_subtract(dstd, dsts); + if (dstd) + dstd->destroyList(); + if (dsts) + dsts->destroyList(); + + if (dst) + { + // prolongation part + for (int node = 0; node < cpusize; node++) + { + src[node] = build_owned_gsl(PatcL, node, 4, Symmetry); // - buffer - ghost - BD ghost + build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node + } + + if (CC) + { + // for FutureList + // restrict first~~~> + { + Restrict(PatcL, PatLs, FutureList, FutureList, Symmetry); + Sync(PatcL, FutureList, Symmetry); + } + //<~~~prolong then + transfer(transfer_src, transfer_dst, FutureList, FutureList, Symmetry); + + // for StateList + // time interpolation part + if (BB) + prepare_inter_time_level(PatcL, FutureList, StateList, OldList, + tmList, 0); // use SynchList_pre as temporal storage space + else + prepare_inter_time_level(PatcL, FutureList, StateList, + tmList, 0); // use SynchList_pre as temporal storage space + // restrict first~~~> + { + Restrict(PatcL, PatLs, StateList, tmList, Symmetry); + Sync(PatcL, tmList, Symmetry); + } + //<~~~prolong then + transfer(transfer_src, transfer_dst, tmList, StateList, Symmetry); + } + else + { + // for both FutureList and StateList + // restrict first~~~> + { + Restrict(PatcL, PatLs, VarList, VarList, Symmetry); + Sync(PatcL, VarList, Symmetry); + } + //<~~~prolong then + transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry); + } + + for (int node = 0; node < cpusize; node++) + { + if (src[node]) + src[node]->destroyList(); + if (transfer_src[node]) + transfer_src[node]->destroyList(); + if (transfer_dst[node]) + transfer_dst[node]->destroyList(); + } + + dst->destroyList(); + } + + delete[] src; + delete[] transfer_src; + delete[] transfer_dst; + + VarList->clearList(); +} +void Parallel::KillBlocks(MyList *PatchLIST) +{ + while (PatchLIST) + { + Patch *Pp = PatchLIST->data; + MyList *bg; + while (Pp->blb) + { + if (Pp->blb == Pp->ble) + break; + bg = (Pp->blb->next) ? Pp->blb->next : 0; + delete Pp->blb->data; + delete Pp->blb; + Pp->blb = bg; + } + if (Pp->ble) + { + delete Pp->ble->data; + delete Pp->ble; + } + Pp->blb = Pp->ble = 0; + PatchLIST = PatchLIST->next; + } +} +bool Parallel::PatList_Interp_Points(MyList *PatL, MyList *VarList, + int NN, double **XX, + double *Shellf, int Symmetry) +{ + MyList *varl; + int num_var = 0; + varl = VarList; + while (varl) + { + num_var++; + varl = varl->next; + } + + double lld[dim], uud[dim]; + double **pox; + pox = new double *[dim]; + for (int j = 0; j < dim; j++) + pox[j] = new double[1]; + for (int i = 0; i < NN; i++) + { + MyList *PL = PatL; + while (PL) + { + bool flag = true; + for (int j = 0; j < dim; j++) + { + double h = PL->data->getdX(j); + lld[j] = PL->data->lli[j] * h; + uud[j] = PL->data->uui[j] * h; + if (XX[j][i] < PL->data->bbox[j] + lld[j] || XX[j][i] > PL->data->bbox[j + dim] - uud[j]) + { + flag = false; + break; + } + pox[j][0] = XX[j][i]; + } + if (flag) + { + PL->data->Interp_Points(VarList, 1, pox, Shellf + i * num_var, Symmetry); + break; + } + PL = PL->next; + } + if (!PL) + { + checkpatchlist(PatL, false); + return false; + } + } + for (int j = 0; j < dim; j++) + delete[] pox[j]; + delete[] pox; + + return true; +} +bool Parallel::PatList_Interp_Points(MyList *PatL, MyList *VarList, + int NN, double **XX, + double *Shellf, int Symmetry, MPI_Comm Comm_here) +{ + MyList *varl; + int num_var = 0; + varl = VarList; + while (varl) + { + num_var++; + varl = varl->next; + } + + double lld[dim], uud[dim]; + double **pox; + pox = new double *[dim]; + for (int j = 0; j < dim; j++) + pox[j] = new double[1]; + for (int i = 0; i < NN; i++) + { + MyList *PL = PatL; + while (PL) + { + bool flag = true; + for (int j = 0; j < dim; j++) + { + double h = PL->data->getdX(j); + lld[j] = PL->data->lli[j] * h; + uud[j] = PL->data->uui[j] * h; + if (XX[j][i] < PL->data->bbox[j] + lld[j] || XX[j][i] > PL->data->bbox[j + dim] - uud[j]) + { + flag = false; + break; + } + pox[j][0] = XX[j][i]; + } + if (flag) + { + PL->data->Interp_Points(VarList, 1, pox, Shellf + i * num_var, Symmetry, Comm_here); + break; + } + PL = PL->next; + } + if (!PL) + { + checkpatchlist(PatL, false); + return false; + } + } + for (int j = 0; j < dim; j++) + delete[] pox[j]; + delete[] pox; + + return true; +} +void Parallel::aligncheck(double *bbox0, double *bboxl, int lev, double *DH0, int *shape) +{ + const double aligntiny = 0.1; + double DHl, rr; + int NN; + for (int i = 0; i < dim; i++) + { + DHl = DH0[i] * pow(0.5, lev); + rr = bboxl[i] - bbox0[i]; + bboxl[i] = bbox0[i] + int(rr / DHl + 0.4) * DHl; + rr = bbox0[i + dim] - bboxl[i + dim]; + bboxl[i + dim] = bbox0[i + dim] - int(rr / DHl + 0.4) * DHl; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + NN = int((bboxl[i + dim] - bboxl[i]) / DHl + 0.4) + 1; +#else +#ifdef Cell + NN = int((bboxl[i + dim] - bboxl[i]) / DHl + 0.4); +#else +#error Not define Vertex nor Cell +#endif +#endif + if (NN != shape[i]) + { + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + if (myrank == 0) + { + cout << "Parallel::aligncheck want shape " << NN << " for lev#" << lev << ", but " << shape[i] << endl; + cout << "i = " << i << ", low = " << bboxl[i] << ", up = " << bboxl[i + dim] << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + } +} +bool Parallel::point_locat_gsl(double *pox, MyList *gsl) +{ + bool flag = false; + while (gsl) + { + for (int i = 0; i < dim; i++) + { + if (pox[i] > gsl->data->llb[i] && pox[i] < gsl->data->uub[i]) + flag = true; + else + { + flag = false; + break; + } + } + if (flag) + break; + gsl = gsl->next; + } + + return flag; +} +void Parallel::checkpatchlist(MyList *PatL, bool buflog) +{ + MyList *PL = PatL; + while (PL) + { + PL->data->checkPatch(buflog); + PL = PL->next; + } +} diff --git a/AMSS_NCKU_source/Parallel.h b/AMSS_NCKU_source/Parallel.h index 5179786..5712a15 100644 --- a/AMSS_NCKU_source/Parallel.h +++ b/AMSS_NCKU_source/Parallel.h @@ -100,29 +100,36 @@ namespace Parallel MyList **combined_dst; int *send_lengths; int *recv_lengths; - double **send_bufs; - double **recv_bufs; - int *send_buf_caps; - int *recv_buf_caps; - unsigned char *send_buf_pinned; - unsigned char *recv_buf_pinned; - MPI_Request *reqs; - MPI_Status *stats; + double **send_bufs; + double **recv_bufs; + int *send_buf_caps; + int *recv_buf_caps; + unsigned char *send_buf_pinned; + unsigned char *recv_buf_pinned; + unsigned char *send_buf_is_dev; + unsigned char *recv_buf_is_dev; + int *send_buf_caps_dev; + int *recv_buf_caps_dev; + double **send_bufs_dev; + double **recv_bufs_dev; + MPI_Request *reqs; + MPI_Status *stats; int max_reqs; bool lengths_valid; int *tc_req_node; int *tc_req_is_recv; int *tc_completed; + bool cuda_aware_mode; SyncCache(); void invalidate(); void destroy(); }; - void Sync_cached(MyList *PatL, MyList *VarList, int Symmetry, SyncCache &cache); - void Sync_ensure_cache(MyList *PatL, int Symmetry, SyncCache &cache); - void transfer_cached(MyList **src, MyList **dst, - MyList *VarList1, MyList *VarList2, - int Symmetry, SyncCache &cache); + void Sync_cached(MyList *PatL, MyList *VarList, int Symmetry, SyncCache &cache); + void Sync_ensure_cache(MyList *PatL, int Symmetry, SyncCache &cache); + void transfer_cached(MyList **src, MyList **dst, + MyList *VarList1, MyList *VarList2, + int Symmetry, SyncCache &cache); struct AsyncSyncState { int req_no; @@ -182,13 +189,13 @@ namespace Parallel MyList *clone_gsl(MyList *p, bool first_only); MyList *build_bulk_gsl(Patch *Pat); // similar to build_owned_gsl0 but does not care rank issue MyList *build_bulk_gsl(Block *bp, Patch *Pat); - void build_PhysBD_gstl(Patch *Pat, MyList *srci, MyList *dsti, - MyList **out_src, MyList **out_dst); - void PeriodicBD(Patch *Pat, MyList *VarList, int Symmetry); - double L2Norm(Patch *Pat, var *vf); - void L2Norm7(Patch *Pat, var **vf, double *norms); - void checkgsl(MyList *pp, bool first_only); - void checkvarl(MyList *pp, bool first_only); + void build_PhysBD_gstl(Patch *Pat, MyList *srci, MyList *dsti, + MyList **out_src, MyList **out_dst); + void PeriodicBD(Patch *Pat, MyList *VarList, int Symmetry); + double L2Norm(Patch *Pat, var *vf); + void L2Norm7(Patch *Pat, var **vf, double *norms); + void checkgsl(MyList *pp, bool first_only); + void checkvarl(MyList *pp, bool first_only); MyList *divide_gsl(MyList *p, Patch *Pat); MyList *divide_gs(MyList *p, Patch *Pat); void prepare_inter_time_level(Patch *Pat, @@ -220,12 +227,12 @@ namespace Parallel void aligncheck(double *bbox0, double *bboxl, int lev, double *DH0, int *shape); bool point_locat_gsl(double *pox, MyList *gsl); void checkpatchlist(MyList *PatL, bool buflog); - - double L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here); - void L2Norm7(Patch *Pat, var **vf, double *norms, MPI_Comm Comm_here); - bool PatList_Interp_Points(MyList *PatL, MyList *VarList, - int NN, double **XX, - double *Shellf, int Symmetry, MPI_Comm Comm_here); + + double L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here); + void L2Norm7(Patch *Pat, var **vf, double *norms, MPI_Comm Comm_here); + bool PatList_Interp_Points(MyList *PatL, MyList *VarList, + int NN, double **XX, + double *Shellf, int Symmetry, MPI_Comm Comm_here); #if (PSTR == 1 || PSTR == 2 || PSTR == 3) MyList *distribute(MyList *PatchLIST, int cpusize, int ingfsi, int fngfsi, bool periodic, int start_rank, int end_rank, int nodes = 0); diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index ad31c7f..c818792 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -76,11 +76,36 @@ struct CudaProfileStats { double output_ms; }; +enum RhsStageId { + RHS_STAGE_PREP = 0, + RHS_STAGE_DERIV1, + RHS_STAGE_METRIC, + RHS_STAGE_GAUGE_DERIV, + RHS_STAGE_GAMMA_CONTRACT, + RHS_STAGE_RICCI_DIFF, + RHS_STAGE_RICCI_FUSED, + RHS_STAGE_CHI, + RHS_STAGE_GAUGE_RHS, + RHS_STAGE_KODIS, + RHS_STAGE_CONSTRAINTS, + RHS_STAGE_COUNT +}; + +struct RhsStageProfileStats { + long long calls; + double ms[RHS_STAGE_COUNT]; +}; + static CudaProfileStats &cuda_profile_stats() { static CudaProfileStats stats = {0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; return stats; } +static RhsStageProfileStats &rhs_stage_profile_stats() { + static RhsStageProfileStats stats = {}; + return stats; +} + static bool cuda_profile_enabled() { static int enabled = -1; if (enabled < 0) { @@ -99,6 +124,24 @@ static int cuda_profile_every() { return every; } +static bool rhs_stage_timing_enabled() { + static int enabled = -1; + if (enabled < 0) { + const char *env = getenv("AMSS_GPU_STAGE_TIMING"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +static int rhs_stage_timing_every() { + static int every = -1; + if (every < 0) { + const char *env = getenv("AMSS_GPU_STAGE_TIMING_EVERY"); + every = (env && atoi(env) > 0) ? atoi(env) : cuda_profile_every(); + } + return every; +} + static double cuda_profile_now_ms() { using clock = std::chrono::steady_clock; return std::chrono::duration( @@ -131,6 +174,36 @@ static void cuda_profile_maybe_log() { fflush(stderr); } +static void rhs_stage_profile_accumulate(const double *stage_ms) { + if (!rhs_stage_timing_enabled()) return; + + RhsStageProfileStats &stats = rhs_stage_profile_stats(); + stats.calls++; + for (int i = 0; i < RHS_STAGE_COUNT; ++i) { + stats.ms[i] += stage_ms[i]; + } + if (stats.calls <= 0 || stats.calls % rhs_stage_timing_every() != 0) return; + + fprintf(stderr, + "[AMSS-CUDA-STAGE][rank %d][dev %d] calls=%lld" + " prep=%.3f deriv1=%.3f metric=%.3f gauge_deriv=%.3f" + " gamma_contract=%.3f ricci_diff=%.3f ricci_fused=%.3f" + " chi=%.3f gauge_rhs=%.3f kodis=%.3f constraints=%.3f ms\n", + g_dispatch.my_rank, g_dispatch.my_device, stats.calls, + stats.ms[RHS_STAGE_PREP] / (double)stats.calls, + stats.ms[RHS_STAGE_DERIV1] / (double)stats.calls, + stats.ms[RHS_STAGE_METRIC] / (double)stats.calls, + stats.ms[RHS_STAGE_GAUGE_DERIV] / (double)stats.calls, + stats.ms[RHS_STAGE_GAMMA_CONTRACT] / (double)stats.calls, + stats.ms[RHS_STAGE_RICCI_DIFF] / (double)stats.calls, + stats.ms[RHS_STAGE_RICCI_FUSED] / (double)stats.calls, + stats.ms[RHS_STAGE_CHI] / (double)stats.calls, + stats.ms[RHS_STAGE_GAUGE_RHS] / (double)stats.calls, + stats.ms[RHS_STAGE_KODIS] / (double)stats.calls, + stats.ms[RHS_STAGE_CONSTRAINTS] / (double)stats.calls); + fflush(stderr); +} + /* ------------------------------------------------------------------ */ /* Error checking */ /* ------------------------------------------------------------------ */ @@ -4643,6 +4716,20 @@ static void compute_patch_boundary_flags(int *ex, static void upload_state_inputs(double **state_host, size_t all) { const size_t bytes = all * sizeof(double); + static int direct_upload = -1; + if (direct_upload < 0) { + const char *env = getenv("AMSS_CUDA_DIRECT_STATE_UPLOAD"); + const char *pin_env = getenv("AMSS_CUDA_PIN_GRIDFUNCS"); + direct_upload = env ? ((atoi(env) != 0) ? 1 : 0) + : ((pin_env && atoi(pin_env) != 0) ? 1 : 0); + } + if (direct_upload) { + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[k_state_input_slots[i]], state_host[i], + bytes, cudaMemcpyHostToDevice)); + } + return; + } for (int i = 0; i < BSSN_STATE_COUNT; ++i) { std::memcpy(g_buf.h_stage + (size_t)i * all, state_host[i], bytes); } @@ -4697,11 +4784,24 @@ static void launch_rhs_pipeline(int all, double eps, int co) { const double SYM = 1.0; const double ANTI = -1.0; + const bool stage_timing = rhs_stage_timing_enabled(); + double stage_ms[RHS_STAGE_COUNT] = {}; + double stage_t0 = stage_timing ? cuda_profile_now_ms() : 0.0; #define D(s) g_buf.slot[s] + #define MARK_RHS_STAGE(stage_id) do { \ + if (stage_timing) { \ + cuda_profile_sync(); \ + const double stage_t1 = cuda_profile_now_ms(); \ + stage_ms[(stage_id)] += stage_t1 - stage_t0; \ + stage_t0 = stage_t1; \ + } \ + } while (0) + kern_phase1_prep<<>>( D(S_Lap), D(S_chi), D(S_dxx), D(S_dyy), D(S_dzz), D(S_alpn1), D(S_chin1), D(S_gxx), D(S_gyy), D(S_gzz)); + MARK_RHS_STAGE(RHS_STAGE_PREP); { double *src_fields[] = { @@ -4742,6 +4842,7 @@ static void launch_rhs_pipeline(int all, double eps, int co) src_fields, fx_fields, fy_fields, fz_fields, soa_signs, all); } + MARK_RHS_STAGE(RHS_STAGE_DERIV1); kern_phase2_metric_rhs<<>>( D(S_alpn1), D(S_chin1), @@ -4799,6 +4900,7 @@ static void launch_rhs_pipeline(int all, double eps, int co) D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz), D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz), D(S_Gamx_rhs), D(S_Gamy_rhs), D(S_Gamz_rhs)); + MARK_RHS_STAGE(RHS_STAGE_METRIC); { double *src_fields[] = {D(S_betax), D(S_betay), D(S_betaz)}; @@ -4832,6 +4934,7 @@ static void launch_rhs_pipeline(int all, double eps, int co) src_fields, fx_fields, fy_fields, fz_fields, soa_signs, all); } + MARK_RHS_STAGE(RHS_STAGE_GAUGE_DERIV); kern_phase8_9_gamma_rhs_contract_fused<<>>( D(S_gupxx), D(S_gupxy), D(S_gupxz), @@ -4854,6 +4957,7 @@ static void launch_rhs_pipeline(int all, double eps, int co) D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx), D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy), D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz)); + MARK_RHS_STAGE(RHS_STAGE_GAMMA_CONTRACT); { double *src_fields[] = {D(S_dxx), D(S_dyy), D(S_dzz), D(S_gxy), D(S_gxz), D(S_gyz)}; @@ -4870,6 +4974,7 @@ static void launch_rhs_pipeline(int all, double eps, int co) D(S_gupyy), D(S_gupyz), D(S_gupzz), src_fields, dst_fields, soa_signs, all); } + MARK_RHS_STAGE(RHS_STAGE_RICCI_DIFF); kern_phase11_ricci_fused<<>>( D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz), @@ -4889,6 +4994,7 @@ static void launch_rhs_pipeline(int all, double eps, int co) D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz), D(S_Rxx),D(S_Rxy),D(S_Rxz), D(S_Ryy),D(S_Ryz),D(S_Rzz)); + MARK_RHS_STAGE(RHS_STAGE_RICCI_FUSED); kern_phase12_13_chi_correction_fused<<>>( D(S_chi), D(S_chin1), @@ -4904,6 +5010,7 @@ static void launch_rhs_pipeline(int all, double eps, int co) D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz), D(S_Rxx), D(S_Rxy), D(S_Rxz), D(S_Ryy), D(S_Ryz), D(S_Rzz)); + MARK_RHS_STAGE(RHS_STAGE_CHI); kern_phase15_trK_Aij_gauge<<>>( D(S_alpn1), D(S_chin1), @@ -4936,8 +5043,10 @@ static void launch_rhs_pipeline(int all, double eps, int co) D(S_betax_rhs), D(S_betay_rhs), D(S_betaz_rhs), D(S_Gamx_rhs), D(S_Gamy_rhs), D(S_Gamz_rhs), D(S_f_arr), D(S_S_arr)); + MARK_RHS_STAGE(RHS_STAGE_GAUGE_RHS); gpu_lopsided_kodis_state_batch(eps, all); + MARK_RHS_STAGE(RHS_STAGE_KODIS); if (co == 0) { { @@ -4982,7 +5091,10 @@ static void launch_rhs_pipeline(int all, double eps, int co) D(S_gzzx), D(S_gzzy), D(S_gzzz), D(S_ham_Res), D(S_movx_Res), D(S_movy_Res), D(S_movz_Res)); } + MARK_RHS_STAGE(RHS_STAGE_CONSTRAINTS); + rhs_stage_profile_accumulate(stage_ms); + #undef MARK_RHS_STAGE #undef D } @@ -5196,6 +5308,21 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos const size_t all = (size_t)ex[0] * ex[1] * ex[2]; const size_t bytes = all * sizeof(double); StepContext &ctx = ensure_step_ctx(block_tag, all); + static int direct_download = -1; + if (direct_download < 0) { + const char *env = getenv("AMSS_CUDA_DIRECT_STATE_DOWNLOAD"); + const char *pin_env = getenv("AMSS_CUDA_PIN_GRIDFUNCS"); + direct_download = env ? ((atoi(env) != 0) ? 1 : 0) + : ((pin_env && atoi(pin_env) != 0) ? 1 : 0); + } + if (direct_download) { + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + CUDA_CHECK(cudaMemcpyAsync(state_host_out[i], ctx.d_state_curr[i], + bytes, cudaMemcpyDeviceToHost)); + } + CUDA_CHECK(cudaDeviceSynchronize()); + return; + } CUDA_CHECK(cudaMemcpy(g_buf.h_stage, ctx.d_state_curr_mem, (size_t)BSSN_STATE_COUNT * bytes, cudaMemcpyDeviceToHost)); @@ -5902,6 +6029,67 @@ int bssn_cuda_unpack_state_batch_from_host_buffer(void *block_tag, return 0; } +static void copy_state_device_batch(void *block_tag, + int state_count, + double *device_buffer, + const int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz, + int pack_not_unpack) +{ + if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return; + if (sx <= 0 || sy <= 0 || sz <= 0) return; + + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + const int region_all = sx * sy * sz; + dim3 launch_grid((unsigned int)grid((size_t)region_all), + (unsigned int)state_count); + + if (pack_not_unpack) { + kern_pack_state_region_batch<<>>( + ctx.d_state_curr_mem, device_buffer, + ex[0], ex[1], i0, j0, k0, sx, sy, sz, + region_all, state_count, + ex[0] * ex[1] * ex[2]); + } else { + kern_unpack_state_region_batch<<>>( + ctx.d_state_curr_mem, device_buffer, + ex[0], ex[1], i0, j0, k0, sx, sy, sz, + region_all, state_count, + ex[0] * ex[1] * ex[2]); + } +} + +extern "C" +int bssn_cuda_pack_state_batch_to_device_buffer(void *block_tag, + int state_count, + double *device_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + copy_state_device_batch(block_tag, state_count, device_buffer, ex, + i0, j0, k0, sx, sy, sz, 1); + return 0; +} + +extern "C" +int bssn_cuda_unpack_state_batch_from_device_buffer(void *block_tag, + int state_count, + double *device_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + copy_state_device_batch(block_tag, state_count, device_buffer, ex, + i0, j0, k0, sx, sy, sz, 0); + return 0; +} + extern "C" int bssn_cuda_download_state_subset(void *block_tag, int *ex, diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.h b/AMSS_NCKU_source/bssn_rhs_cuda.h index 55b6380..12e190a 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.h +++ b/AMSS_NCKU_source/bssn_rhs_cuda.h @@ -1,6 +1,6 @@ -#ifndef BSSN_RHS_CUDA_H -#define BSSN_RHS_CUDA_H - +#ifndef BSSN_RHS_CUDA_H +#define BSSN_RHS_CUDA_H + #ifdef __cplusplus extern "C" { #endif @@ -9,28 +9,28 @@ enum { BSSN_CUDA_STATE_COUNT = 24, BSSN_CUDA_MATTER_COUNT = 10 }; - + int f_compute_rhs_bssn(int *ex, double &T, double *X, double *Y, double *Z, double *chi, double *trK, - double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz, - double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz, - double *Gamx, double *Gamy, double *Gamz, - double *Lap, double *betax, double *betay, double *betaz, - double *dtSfx, double *dtSfy, double *dtSfz, - double *chi_rhs, double *trK_rhs, - double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs, - double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs, - double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs, - double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs, - double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs, - double *rho, double *Sx, double *Sy, double *Sz, - double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz, - double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz, - double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz, - double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz, - double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz, - double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res, + double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz, + double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz, + double *Gamx, double *Gamy, double *Gamz, + double *Lap, double *betax, double *betay, double *betaz, + double *dtSfx, double *dtSfy, double *dtSfz, + double *chi_rhs, double *trK_rhs, + double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs, + double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs, + double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs, + double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs, + double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs, + double *rho, double *Sx, double *Sy, double *Sz, + double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz, + double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz, + double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz, + double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz, + double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz, + double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res, double *Gmx_Res, double *Gmy_Res, double *Gmz_Res, int &Symmetry, int &Lev, double &eps, int &co); @@ -104,6 +104,20 @@ int bssn_cuda_unpack_state_batch_from_host_buffer(void *block_tag, int i0, int j0, int k0, int sx, int sy, int sz); +int bssn_cuda_pack_state_batch_to_device_buffer(void *block_tag, + int state_count, + double *device_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz); + +int bssn_cuda_unpack_state_batch_from_device_buffer(void *block_tag, + int state_count, + double *device_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz); + int bssn_cuda_download_state_subset(void *block_tag, int *ex, int subset_count, @@ -122,6 +136,6 @@ void bssn_cuda_release_step_ctx(void *block_tag); #ifdef __cplusplus } -#endif - -#endif +#endif + +#endif