Compare commits

...

36 Commits

Author SHA1 Message Date
6fd7ef2b55 Cache GPU RHS symbols and zero vacuum sources once 2026-04-12 22:42:58 +08:00
7064ebd5b4 Batch GPU stage downloads 2026-04-12 21:06:41 +08:00
87c581ea7c Checkpoint stable GPU optimization baseline 2026-04-12 20:26:27 +08:00
d702aa06b9 Trim GPU restrict sync overhead 2026-04-12 19:45:34 +08:00
ce88c18265 Tune GPU RHS launch geometry 2026-04-12 18:59:59 +08:00
db2d6978b2 Reduce final GPU host downloads 2026-04-12 18:46:42 +08:00
c8977d8356 Optimize GPU RK4 stage sync path 2026-04-12 18:36:05 +08:00
d9287ea530 Fix GPU RK4 boundary and sync correctness 2026-04-12 12:13:47 +08:00
b78874ef21 Refine stable GPU AMR staging path 2026-04-10 23:37:36 +08:00
a089041c3b Stabilize GPU AMR prolong/restrict paths 2026-04-10 21:57:58 +08:00
c578a15ecd Fix GPU interpolation cache lifetime leaks 2026-04-10 10:29:04 +08:00
e1a0bff43c Reduce redundant GPU host buffer preparation 2026-04-09 21:20:45 +08:00
cf3c6d6218 Stabilize GPU buffer lifecycle around regrid 2026-04-09 20:48:06 +08:00
46e94d1248 Trim constraint-only GPU downloads 2026-04-09 19:36:19 +08:00
7cd2414faa Move constraint recomputation onto GPU path 2026-04-09 19:17:39 +08:00
4463f1d23e Unpack intermediate sync stages directly to GPU 2026-04-09 19:01:12 +08:00
4484635f0d Pack sync send buffers directly from GPU state 2026-04-09 18:49:11 +08:00
b0dd069a2b Register GPU transfer buffers as pinned host memory 2026-04-09 18:36:10 +08:00
5bc67ded06 Download staged GPU sync regions incrementally 2026-04-09 18:23:05 +08:00
3b16795e78 Refresh synced GPU regions incrementally 2026-04-09 17:07:31 +08:00
5b00d49070 Reduce staged GPU host-device copies 2026-04-09 16:44:08 +08:00
42e851d19a Cache repeated interpolation plans 2026-04-09 15:21:01 +08:00
06fa643365 Refine batched CUDA interpolation kernel 2026-04-09 15:06:11 +08:00
c47349b7a9 Add batched CUDA patch interpolation path 2026-04-09 14:56:01 +08:00
ad999e4c5a Add guarded GPU prolong3 path scaffold 2026-04-09 14:28:36 +08:00
e1e3b4a448 Reduce GPU RK4 transfer overhead 2026-04-09 12:11:40 +08:00
49409645c0 Stabilize GPU output path and MPI sync 2026-04-09 10:57:49 +08:00
4e3946a4f0 Persist GPU RK4 stage caches 2026-04-08 20:59:15 +08:00
a0af9b8804 Trim GPU main-path transfer overhead 2026-04-08 20:16:25 +08:00
01ac1f9250 Cache GPU main-path device buffers 2026-04-08 19:43:17 +08:00
ea470737db Add runnable GPU main-path prototype 2026-04-08 19:14:37 +08:00
8c1f4d8108 迁移C算子的循环融合和临时量消除 2026-03-03 16:20:15 +08:00
d310ef918b bssn_rhs(fortran): migrate C kernel loop-fusion optimizations 2026-03-03 16:20:15 +08:00
b35e1b289f 设置开关关闭内存打印统计 2026-03-03 16:17:47 +08:00
05851b2c59 关闭静态负载 2026-03-03 16:17:47 +08:00
3b39583d67 fix(bssn_rhs) 2026-03-03 16:06:33 +08:00
22 changed files with 7376 additions and 1726 deletions

View File

@@ -23,22 +23,20 @@ using namespace std;
#include <mpi.h> #include <mpi.h>
#include "misc.h" #include "misc.h"
#include "macrodef.h" #include "macrodef.h"
#ifdef USE_GPU
extern void bssn_cuda_dump_stage_profile();
#endif
#ifndef ABEtype #ifndef ABEtype
#error "not define ABEtype" #error "not define ABEtype"
#endif #endif
#if (ABEtype == 0) #if (ABEtype == 0)
#include "bssn_class.h"
#ifdef USE_GPU
#include "bssn_gpu_class.h" #elif (ABEtype == 1)
#else #include "bssnEScalar_class.h"
#include "bssn_class.h"
#endif
#elif (ABEtype == 1)
#include "bssnEScalar_class.h"
#elif (ABEtype == 2) #elif (ABEtype == 2)
#include "Z4c_class.h" #include "Z4c_class.h"
@@ -474,10 +472,13 @@ int main(int argc, char *argv[])
cout << endl; cout << endl;
} }
ADM->Evolve(Steps); ADM->Evolve(Steps);
#ifdef USE_GPU
if (myrank == 0) bssn_cuda_dump_stage_profile();
{ #endif
if (myrank == 0)
{
cout << endl; cout << endl;
cout << " Total Evolve Time: " << MPI_Wtime() - End_clock << " seconds!" << endl; cout << " Total Evolve Time: " << MPI_Wtime() - End_clock << " seconds!" << endl;
cout << " Total Running Time: " << MPI_Wtime() - Begin_clock << " seconds!" << endl; cout << " Total Running Time: " << MPI_Wtime() - Begin_clock << " seconds!" << endl;

View File

@@ -9,8 +9,12 @@
#include <new> #include <new>
using namespace std; using namespace std;
#include "Block.h" #include "Block.h"
#include "misc.h" #include "misc.h"
#ifdef USE_GPU
#include "bssn_gpu.h"
#include "bssn_cuda_ops.h"
#endif
Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fngfsi, int levi, const int cgpui) : rank(ranki), ingfs(ingfsi), fngfs(fngfsi), lev(levi), cgpu(cgpui) Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fngfsi, int levi, const int cgpui) : rank(ranki), ingfs(ingfsi), fngfs(fngfsi), lev(levi), cgpu(cgpui)
{ {
@@ -95,14 +99,19 @@ Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fng
} }
#endif #endif
} }
Block::~Block() Block::~Block()
{ {
int myrank; int myrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
if (myrank == rank) if (myrank == rank)
{ {
for (int i = 0; i < dim; i++) #ifdef USE_GPU
delete[] X[i]; bssn_gpu_clear_cached_device_buffers();
bssn_cuda_release_rk4_caches();
bssn_cuda_release_interp_caches();
#endif
for (int i = 0; i < dim; i++)
delete[] X[i];
for (int i = 0; i < ingfs; i++) for (int i = 0; i < ingfs; i++)
free(igfs[i]); free(igfs[i]);
delete[] igfs; delete[] igfs;

View File

@@ -2,29 +2,100 @@
#include <iostream> #include <iostream>
#include <iomanip> #include <iomanip>
#include <fstream> #include <fstream>
#include <cstdlib> #include <cstdlib>
#include <cstdio> #include <cstdio>
#include <string> #include <string>
#include <cmath> #include <cmath>
#include <new> #include <new>
#include <vector> #include <map>
using namespace std; #include <vector>
using namespace std;
#include "misc.h" #include "misc.h"
#include "MPatch.h" #include "MPatch.h"
#include "Parallel.h" #include "Parallel.h"
#include "fmisc.h" #include "fmisc.h"
#ifdef INTERP_LB_PROFILE #include "bssn_cuda_ops.h"
#include "interp_lb_profile.h" #ifdef INTERP_LB_PROFILE
#endif #include "interp_lb_profile.h"
#endif
namespace
{ #if defined(__GNUC__) || defined(__clang__)
struct InterpBlockView extern int bssn_cuda_interp_points_batch(const int *ex,
{ const double *X, const double *Y, const double *Z,
Block *bp; const double *const *fields,
double llb[dim]; const double *soa_flat,
double uub[dim]; int num_var,
const double *px, const double *py, const double *pz,
int num_points,
int ordn,
int symmetry,
double *out) __attribute__((weak));
#endif
namespace
{
struct InterpVarDesc
{
int sgfn;
double soa[dim];
};
struct InterpPlanKey
{
const Patch *patch;
const double *x;
const double *y;
const double *z;
int NN;
int Symmetry;
int myrank;
};
struct InterpPlanKeyLess
{
bool operator()(const InterpPlanKey &lhs, const InterpPlanKey &rhs) const
{
if (lhs.patch != rhs.patch) return lhs.patch < rhs.patch;
if (lhs.x != rhs.x) return lhs.x < rhs.x;
if (lhs.y != rhs.y) return lhs.y < rhs.y;
if (lhs.z != rhs.z) return lhs.z < rhs.z;
if (lhs.NN != rhs.NN) return lhs.NN < rhs.NN;
if (lhs.Symmetry != rhs.Symmetry) return lhs.Symmetry < rhs.Symmetry;
return lhs.myrank < rhs.myrank;
}
};
struct CachedInterpPlan
{
int nblocks;
vector<int> owner_rank;
vector<int> owner_block;
vector<vector<int> > block_points;
vector<vector<double> > block_px;
vector<vector<double> > block_py;
vector<vector<double> > block_pz;
CachedInterpPlan() : nblocks(0) {}
};
struct CachedInterpPlanEntry
{
bool valid;
InterpPlanKey key;
vector<double> xvals;
vector<double> yvals;
vector<double> zvals;
CachedInterpPlan plan;
CachedInterpPlanEntry() : valid(false) {}
};
struct InterpBlockView
{
Block *bp;
double llb[dim];
double uub[dim];
}; };
struct BlockBinIndex struct BlockBinIndex
@@ -154,10 +225,10 @@ void build_block_bin_index(Patch *patch, const double *DH, BlockBinIndex &index)
index.valid = true; index.valid = true;
} }
int find_block_index_for_point(const BlockBinIndex &index, const double *pox, const double *DH) int find_block_index_for_point(const BlockBinIndex &index, const double *pox, const double *DH)
{ {
if (!index.valid) if (!index.valid)
return -1; return -1;
const int bx = coord_to_bin(pox[0], index.lo[0], index.inv[0], index.bins[0]); const int bx = coord_to_bin(pox[0], index.lo[0], index.inv[0], index.bins[0]);
const int by = coord_to_bin(pox[1], index.lo[1], index.inv[1], index.bins[1]); const int by = coord_to_bin(pox[1], index.lo[1], index.inv[1], index.bins[1]);
@@ -175,13 +246,314 @@ int find_block_index_for_point(const BlockBinIndex &index, const double *pox, co
for (size_t bi = 0; bi < index.views.size(); bi++) for (size_t bi = 0; bi < index.views.size(); bi++)
if (point_in_block_view(index.views[bi], pox, DH)) if (point_in_block_view(index.views[bi], pox, DH))
return int(bi); return int(bi);
return -1; return -1;
} }
} // namespace
void collect_interp_vars(MyList<var> *VarList, vector<InterpVarDesc> &vars)
Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi) {
{ vars.clear();
MyList<var> *varl = VarList;
while (varl)
{
InterpVarDesc desc;
desc.sgfn = varl->data->sgfn;
for (int d = 0; d < dim; ++d)
desc.soa[d] = varl->data->SoA[d];
vars.push_back(desc);
varl = varl->next;
}
}
bool should_try_cuda_interp(int ordn, int num_points, int num_var)
{
#if defined(__GNUC__) || defined(__clang__)
if (!bssn_cuda_interp_points_batch)
return false;
#else
return false;
#endif
if (ordn != 6)
return false;
if (num_points < 32)
return false;
return num_points * num_var >= 256;
}
CachedInterpPlanEntry &interp_plan_cache_entry()
{
static CachedInterpPlanEntry cache;
return cache;
}
bool same_interp_plan_key(const InterpPlanKey &lhs, const InterpPlanKey &rhs)
{
return lhs.patch == rhs.patch &&
lhs.NN == rhs.NN &&
lhs.Symmetry == rhs.Symmetry &&
lhs.myrank == rhs.myrank;
}
bool same_interp_plan_points(const CachedInterpPlanEntry &cache, int NN, double **XX)
{
if (static_cast<int>(cache.xvals.size()) != NN ||
static_cast<int>(cache.yvals.size()) != NN ||
static_cast<int>(cache.zvals.size()) != NN)
return false;
for (int j = 0; j < NN; ++j)
{
if (cache.xvals[j] != XX[0][j] ||
cache.yvals[j] != XX[1][j] ||
cache.zvals[j] != XX[2][j])
return false;
}
return true;
}
CachedInterpPlan &get_cached_interp_plan(Patch *patch,
int NN, double **XX,
int Symmetry, int myrank,
const double *DH,
const BlockBinIndex &block_index,
bool report_bounds_here,
bool allow_missing_points)
{
InterpPlanKey key;
key.patch = patch;
key.x = XX[0];
key.y = XX[1];
key.z = XX[2];
key.NN = NN;
key.Symmetry = Symmetry;
key.myrank = myrank;
CachedInterpPlanEntry &cache = interp_plan_cache_entry();
if (cache.valid &&
same_interp_plan_key(cache.key, key) &&
same_interp_plan_points(cache, NN, XX) &&
cache.plan.nblocks == static_cast<int>(block_index.views.size()))
return cache.plan;
cache.valid = true;
cache.key = key;
cache.xvals.assign(XX[0], XX[0] + NN);
cache.yvals.assign(XX[1], XX[1] + NN);
cache.zvals.assign(XX[2], XX[2] + NN);
cache.plan = CachedInterpPlan();
CachedInterpPlan &plan = cache.plan;
plan.nblocks = static_cast<int>(block_index.views.size());
plan.owner_rank.assign(NN, -1);
plan.owner_block.assign(NN, -1);
plan.block_points.resize(plan.nblocks);
plan.block_px.resize(plan.nblocks);
plan.block_py.resize(plan.nblocks);
plan.block_pz.resize(plan.nblocks);
for (int j = 0; j < NN; ++j)
{
double pox[dim];
for (int i = 0; i < dim; ++i)
{
pox[i] = XX[i][j];
if (report_bounds_here &&
(XX[i][j] < patch->bbox[i] + patch->lli[i] * DH[i] ||
XX[i][j] > patch->bbox[dim + i] - patch->uui[i] * DH[i]))
{
cout << "Patch::Interp_Points: point (";
for (int k = 0; k < dim; ++k)
{
cout << XX[k][j];
if (k < dim - 1)
cout << ",";
else
cout << ") is out of current Patch." << endl;
}
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
const int block_i = find_block_index_for_point(block_index, pox, DH);
if (block_i >= 0)
{
Block *BP = block_index.views[block_i].bp;
plan.owner_rank[j] = BP->rank;
plan.owner_block[j] = block_i;
if (BP->rank == myrank)
{
plan.block_points[block_i].push_back(j);
plan.block_px[block_i].push_back(XX[0][j]);
plan.block_py[block_i].push_back(XX[1][j]);
plan.block_pz[block_i].push_back(XX[2][j]);
}
}
}
if (!allow_missing_points && report_bounds_here)
{
for (int j = 0; j < NN; ++j)
{
if (plan.owner_rank[j] >= 0)
continue;
cout << "ERROR: Patch::Interp_Points fails to find point (";
for (int d = 0; d < dim; ++d)
{
cout << XX[d][j];
if (d < dim - 1)
cout << ",";
else
cout << ")";
}
cout << " on Patch (";
for (int d = 0; d < dim; ++d)
{
cout << patch->bbox[d] << "+" << patch->lli[d] * DH[d];
if (d < dim - 1)
cout << ",";
else
cout << ")--";
}
cout << "(";
for (int d = 0; d < dim; ++d)
{
cout << patch->bbox[dim + d] << "-" << patch->uui[d] * DH[d];
if (d < dim - 1)
cout << ",";
else
cout << ")" << endl;
}
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
return plan;
}
void release_interp_plan_cache_internal()
{
CachedInterpPlanEntry &cache = interp_plan_cache_entry();
cache.valid = false;
cache.xvals.clear();
cache.yvals.clear();
cache.zvals.clear();
cache.plan = CachedInterpPlan();
}
bool run_cuda_interp_for_block(Block *BP,
const vector<InterpVarDesc> &vars,
const vector<int> &point_ids,
const vector<double> &px,
const vector<double> &py,
const vector<double> &pz,
double *Shellf,
int num_var,
int ordn,
int Symmetry)
{
if (!should_try_cuda_interp(ordn, static_cast<int>(point_ids.size()), num_var))
return false;
vector<const double *> field_ptrs(num_var);
vector<double> soa_flat(3 * num_var);
for (int v = 0; v < num_var; ++v)
{
field_ptrs[v] = BP->fgfs[vars[v].sgfn];
for (int d = 0; d < dim; ++d)
soa_flat[3 * v + d] = vars[v].soa[d];
}
const int npts = static_cast<int>(point_ids.size());
vector<double> out(static_cast<size_t>(npts) * static_cast<size_t>(num_var));
if (bssn_cuda_interp_points_batch(BP->shape,
BP->X[0], BP->X[1], BP->X[2],
field_ptrs.data(),
soa_flat.data(),
num_var,
px.data(), py.data(), pz.data(),
npts,
ordn,
Symmetry,
out.data()) != 0)
{
return false;
}
for (int p = 0; p < npts; ++p)
{
const int j = point_ids[p];
memcpy(Shellf + j * num_var, out.data() + p * num_var, sizeof(double) * num_var);
}
return true;
}
void run_cpu_interp_for_block(Block *BP,
const vector<InterpVarDesc> &vars,
const vector<int> &point_ids,
const vector<double> &px,
const vector<double> &py,
const vector<double> &pz,
double *Shellf,
int num_var,
int ordn,
int Symmetry)
{
for (size_t p = 0; p < point_ids.size(); ++p)
{
const int j = point_ids[p];
double x = px[p];
double y = py[p];
double z = pz[p];
int ordn_local = ordn;
int symmetry_local = Symmetry;
for (int v = 0; v < num_var; ++v)
{
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2],
BP->fgfs[vars[v].sgfn], Shellf[j * num_var + v],
x, y, z, ordn_local, const_cast<double *>(vars[v].soa), symmetry_local);
}
}
}
void interpolate_owned_points(MyList<var> *VarList,
double *Shellf, int Symmetry,
int ordn,
const BlockBinIndex &block_index,
const CachedInterpPlan &plan)
{
vector<InterpVarDesc> vars;
collect_interp_vars(VarList, vars);
const int num_var = static_cast<int>(vars.size());
for (size_t bi = 0; bi < plan.block_points.size(); ++bi)
{
if (plan.block_points[bi].empty())
continue;
Block *BP = block_index.views[bi].bp;
bool done = run_cuda_interp_for_block(BP, vars,
plan.block_points[bi],
plan.block_px[bi],
plan.block_py[bi],
plan.block_pz[bi],
Shellf, num_var, ordn, Symmetry);
if (!done)
run_cpu_interp_for_block(BP, vars,
plan.block_points[bi],
plan.block_px[bi],
plan.block_py[bi],
plan.block_pz[bi],
Shellf, num_var, ordn, Symmetry);
}
}
} // namespace
void patch_release_interp_plan_cache()
{
release_interp_plan_cache_internal();
}
Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
{
int hbuffer_width = buffer_width; int hbuffer_width = buffer_width;
if (lev == 0) if (lev == 0)
@@ -523,60 +895,15 @@ void Patch::Interp_Points(MyList<var> *VarList,
memset(Shellf, 0, sizeof(double) * NN * num_var); memset(Shellf, 0, sizeof(double) * NN * num_var);
// owner_rank[j] records which MPI rank owns point j double DH[dim];
// All ranks traverse the same block list so they all agree on ownership for (int i = 0; i < dim; i++)
int *owner_rank; DH[i] = getdX(i);
owner_rank = new int[NN]; BlockBinIndex block_index;
for (int j = 0; j < NN; j++) build_block_bin_index(this, DH, block_index);
owner_rank[j] = -1; CachedInterpPlan &plan = get_cached_interp_plan(this, NN, XX, Symmetry, myrank, DH, block_index, myrank == 0, false);
const int *owner_rank = plan.owner_rank.data();
double DH[dim];
for (int i = 0; i < dim; i++) interpolate_owned_points(VarList, Shellf, Symmetry, ordn, block_index, plan);
DH[i] = getdX(i);
BlockBinIndex block_index;
build_block_bin_index(this, DH, block_index);
for (int j = 0; j < NN; j++) // run along points
{
double pox[dim];
for (int i = 0; i < dim; i++)
{
pox[i] = XX[i][j];
if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
{
cout << "Patch::Interp_Points: point (";
for (int k = 0; k < dim; k++)
{
cout << XX[k][j];
if (k < dim - 1)
cout << ",";
else
cout << ") is out of current Patch." << endl;
}
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
const int block_i = find_block_index_for_point(block_index, pox, DH);
if (block_i >= 0)
{
Block *BP = block_index.views[block_i].bp;
owner_rank[j] = BP->rank;
if (myrank == BP->rank)
{
//---> interpolation
varl = VarList;
int k = 0;
while (varl) // run along variables
{
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next;
k++;
}
}
}
}
// Replace MPI_Allreduce with per-owner MPI_Bcast: // Replace MPI_Allreduce with per-owner MPI_Bcast:
// Group consecutive points by owner rank and broadcast each group. // Group consecutive points by owner rank and broadcast each group.
@@ -631,9 +958,8 @@ void Patch::Interp_Points(MyList<var> *VarList,
MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner, MPI_COMM_WORLD); MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner, MPI_COMM_WORLD);
} }
} }
delete[] owner_rank; }
}
void Patch::Interp_Points(MyList<var> *VarList, void Patch::Interp_Points(MyList<var> *VarList,
int NN, double **XX, int NN, double **XX,
double *Shellf, int Symmetry, double *Shellf, int Symmetry,
@@ -661,102 +987,22 @@ void Patch::Interp_Points(MyList<var> *VarList,
memset(Shellf, 0, sizeof(double) * NN * num_var); memset(Shellf, 0, sizeof(double) * NN * num_var);
// owner_rank[j] records which MPI rank owns point j double DH[dim];
int *owner_rank; for (int i = 0; i < dim; i++)
owner_rank = new int[NN]; DH[i] = getdX(i);
for (int j = 0; j < NN; j++) BlockBinIndex block_index;
owner_rank[j] = -1; build_block_bin_index(this, DH, block_index);
CachedInterpPlan &plan = get_cached_interp_plan(this, NN, XX, Symmetry, myrank, DH, block_index, myrank == 0, false);
double DH[dim]; const int *owner_rank = plan.owner_rank.data();
for (int i = 0; i < dim; i++)
DH[i] = getdX(i); interpolate_owned_points(VarList, Shellf, Symmetry, ordn, block_index, plan);
BlockBinIndex block_index;
build_block_bin_index(this, DH, block_index);
// --- Interpolation phase (identical to original) ---
for (int j = 0; j < NN; j++)
{
double pox[dim];
for (int i = 0; i < dim; i++)
{
pox[i] = XX[i][j];
if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
{
cout << "Patch::Interp_Points: point (";
for (int k = 0; k < dim; k++)
{
cout << XX[k][j];
if (k < dim - 1)
cout << ",";
else
cout << ") is out of current Patch." << endl;
}
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
const int block_i = find_block_index_for_point(block_index, pox, DH);
if (block_i >= 0)
{
Block *BP = block_index.views[block_i].bp;
owner_rank[j] = BP->rank;
if (myrank == BP->rank)
{
varl = VarList;
int k = 0;
while (varl)
{
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next;
k++;
}
}
}
}
#ifdef INTERP_LB_PROFILE #ifdef INTERP_LB_PROFILE
double t_interp_end = MPI_Wtime(); double t_interp_end = MPI_Wtime();
double t_interp_local = t_interp_end - t_interp_start; double t_interp_local = t_interp_end - t_interp_start;
#endif #endif
// --- Error check for unfound points --- // --- Targeted point-to-point communication phase ---
for (int j = 0; j < NN; j++)
{
if (owner_rank[j] < 0 && myrank == 0)
{
cout << "ERROR: Patch::Interp_Points fails to find point (";
for (int d = 0; d < dim; d++)
{
cout << XX[d][j];
if (d < dim - 1)
cout << ",";
else
cout << ")";
}
cout << " on Patch (";
for (int d = 0; d < dim; d++)
{
cout << bbox[d] << "+" << lli[d] * DH[d];
if (d < dim - 1)
cout << ",";
else
cout << ")--";
}
cout << "(";
for (int d = 0; d < dim; d++)
{
cout << bbox[dim + d] << "-" << uui[d] * DH[d];
if (d < dim - 1)
cout << ",";
else
cout << ")" << endl;
}
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
// --- Targeted point-to-point communication phase ---
// Compute consumer_rank[j] using the same deterministic formula as surface_integral // Compute consumer_rank[j] using the same deterministic formula as surface_integral
int *consumer_rank = new int[NN]; int *consumer_rank = new int[NN];
{ {
@@ -873,9 +1119,8 @@ void Patch::Interp_Points(MyList<var> *VarList,
delete[] send_offset; delete[] send_offset;
delete[] recv_offset; delete[] recv_offset;
delete[] send_count; delete[] send_count;
delete[] recv_count; delete[] recv_count;
delete[] consumer_rank; delete[] consumer_rank;
delete[] owner_rank;
#ifdef INTERP_LB_PROFILE #ifdef INTERP_LB_PROFILE
{ {
@@ -923,64 +1168,20 @@ void Patch::Interp_Points(MyList<var> *VarList,
memset(Shellf, 0, sizeof(double) * NN * num_var); memset(Shellf, 0, sizeof(double) * NN * num_var);
// owner_rank[j] stores the global rank that owns point j // Build global-to-local rank translation for Comm_here
int *owner_rank; MPI_Group world_group, local_group;
owner_rank = new int[NN]; MPI_Comm_group(MPI_COMM_WORLD, &world_group);
for (int j = 0; j < NN; j++) MPI_Comm_group(Comm_here, &local_group);
owner_rank[j] = -1;
// Build global-to-local rank translation for Comm_here double DH[dim];
MPI_Group world_group, local_group; for (int i = 0; i < dim; i++)
MPI_Comm_group(MPI_COMM_WORLD, &world_group); DH[i] = getdX(i);
MPI_Comm_group(Comm_here, &local_group); BlockBinIndex block_index;
build_block_bin_index(this, DH, block_index);
double DH[dim]; CachedInterpPlan &plan = get_cached_interp_plan(this, NN, XX, Symmetry, myrank, DH, block_index, lmyrank == 0, true);
for (int i = 0; i < dim; i++) const int *owner_rank = plan.owner_rank.data();
DH[i] = getdX(i);
BlockBinIndex block_index; interpolate_owned_points(VarList, Shellf, Symmetry, ordn, block_index, plan);
build_block_bin_index(this, DH, block_index);
for (int j = 0; j < NN; j++) // run along points
{
double pox[dim];
for (int i = 0; i < dim; i++)
{
pox[i] = XX[i][j];
if (lmyrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
{
cout << "Patch::Interp_Points: point (";
for (int k = 0; k < dim; k++)
{
cout << XX[k][j];
if (k < dim - 1)
cout << ",";
else
cout << ") is out of current Patch." << endl;
}
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
const int block_i = find_block_index_for_point(block_index, pox, DH);
if (block_i >= 0)
{
Block *BP = block_index.views[block_i].bp;
owner_rank[j] = BP->rank;
if (myrank == BP->rank)
{
//---> interpolation
varl = VarList;
int k = 0;
while (varl) // run along variables
{
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next;
k++;
}
}
}
}
// Collect unique global owner ranks and translate to local ranks in Comm_here // Collect unique global owner ranks and translate to local ranks in Comm_here
// Then broadcast each owner's points via MPI_Bcast on Comm_here // Then broadcast each owner's points via MPI_Bcast on Comm_here
@@ -1008,10 +1209,9 @@ void Patch::Interp_Points(MyList<var> *VarList,
} }
} }
MPI_Group_free(&world_group); MPI_Group_free(&world_group);
MPI_Group_free(&local_group); MPI_Group_free(&local_group);
delete[] owner_rank; }
}
void Patch::checkBlock() void Patch::checkBlock()
{ {
int myrank; int myrank;

View File

@@ -8,7 +8,7 @@
#include "var.h" #include "var.h"
#include "macrodef.h" //need dim here; Vertex or Cell; ghost_width #include "macrodef.h" //need dim here; Vertex or Cell; ghost_width
class Patch class Patch
{ {
public: public:
@@ -50,6 +50,8 @@ public:
double *Shellf, int Symmetry, MPI_Comm Comm_here); double *Shellf, int Symmetry, MPI_Comm Comm_here);
void Find_Maximum(MyList<var> *VarList, double *XX, void Find_Maximum(MyList<var> *VarList, double *XX,
double *Shellf, MPI_Comm Comm_here); double *Shellf, MPI_Comm Comm_here);
}; };
#endif /* PATCH_H */ void patch_release_interp_plan_cache();
#endif /* PATCH_H */

File diff suppressed because it is too large Load Diff

View File

@@ -89,9 +89,12 @@ namespace Parallel
void transfermix(MyList<gridseg> **src, MyList<gridseg> **dst, void transfermix(MyList<gridseg> **src, MyList<gridseg> **dst,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */, MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
int Symmetry); int Symmetry);
void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry); void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry);
void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry); void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry, const char *context);
void Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry); void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, const char *context);
void Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
void Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, const char *context);
struct SyncCache { struct SyncCache {
bool valid; bool valid;
@@ -105,12 +108,13 @@ namespace Parallel
int *send_buf_caps; int *send_buf_caps;
int *recv_buf_caps; int *recv_buf_caps;
MPI_Request *reqs; MPI_Request *reqs;
MPI_Status *stats; MPI_Status *stats;
int max_reqs; int max_reqs;
bool lengths_valid; bool lengths_valid;
int *tc_req_node; int lengths_var_count;
int *tc_req_is_recv; int *tc_req_node;
int *tc_completed; int *tc_req_is_recv;
int *tc_completed;
SyncCache(); SyncCache();
void invalidate(); void invalidate();
void destroy(); void destroy();
@@ -121,19 +125,20 @@ namespace Parallel
MyList<var> *VarList1, MyList<var> *VarList2, MyList<var> *VarList1, MyList<var> *VarList2,
int Symmetry, SyncCache &cache); int Symmetry, SyncCache &cache);
struct AsyncSyncState { struct AsyncSyncState {
int req_no; int req_no;
bool active; bool active;
int *req_node; int mpi_tag;
int *req_is_recv; int *req_node;
int pending_recv; int *req_is_recv;
AsyncSyncState() : req_no(0), active(false), req_node(0), req_is_recv(0), pending_recv(0) {} int pending_recv;
}; AsyncSyncState() : req_no(0), active(false), mpi_tag(0), req_node(0), req_is_recv(0), pending_recv(0) {}
};
void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
SyncCache &cache, AsyncSyncState &state); SyncCache &cache, AsyncSyncState &state);
void Sync_finish(SyncCache &cache, AsyncSyncState &state, void Sync_finish(SyncCache &cache, AsyncSyncState &state,
MyList<var> *VarList, int Symmetry); MyList<var> *VarList, int Symmetry, bool unpack_to_host = true);
void OutBdLow2Hi(Patch *Patc, Patch *Patf, void OutBdLow2Hi(Patch *Patc, Patch *Patf,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */, MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
int Symmetry); int Symmetry);

View File

@@ -14,7 +14,8 @@ using namespace std;
#include <string.h> #include <string.h>
#endif #endif
#include <time.h> #include <time.h>
#include <unistd.h>
#include "macrodef.h" #include "macrodef.h"
#include "misc.h" #include "misc.h"
@@ -28,20 +29,30 @@ using namespace std;
#include "rungekutta4_rout.h" #include "rungekutta4_rout.h"
#include "sommerfeld_rout.h" #include "sommerfeld_rout.h"
#include "getnp4.h" #include "getnp4.h"
#include "shellfunctions.h" #include "shellfunctions.h"
#include "parameters.h" #include "parameters.h"
#ifdef USE_GPU
#include "bssn_macro.h"
#include "bssn_gpu.h"
#endif
#ifdef With_AHF #ifdef With_AHF
#include "derivatives.h" #include "derivatives.h"
#include "myglobal.h" #include "myglobal.h"
#endif #endif
#include "perf.h" #include "perf.h"
#include "derivatives.h" #include "derivatives.h"
#include "ricci_gamma.h" #include "ricci_gamma.h"
//================================================================================================ // Compile-time switch for per-timestep memory usage collection/printing.
// Default is OFF to reduce overhead in production runs.
#ifndef BSSN_ENABLE_MEM_USAGE_LOG
#define BSSN_ENABLE_MEM_USAGE_LOG 0
#endif
//================================================================================================
// define bssn_class // define bssn_class
@@ -734,11 +745,12 @@ void bssn_class::Initialize()
// Initialize sync caches (per-level, for predictor and corrector) // Initialize sync caches (per-level, for predictor and corrector)
sync_cache_pre = new Parallel::SyncCache[GH->levels]; sync_cache_pre = new Parallel::SyncCache[GH->levels];
sync_cache_cor = new Parallel::SyncCache[GH->levels]; sync_cache_cor = new Parallel::SyncCache[GH->levels];
sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels]; sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
sync_cache_rp_fine = new Parallel::SyncCache[GH->levels]; sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
sync_cache_restrict = new Parallel::SyncCache[GH->levels]; sync_cache_restrict = new Parallel::SyncCache[GH->levels];
sync_cache_outbd = new Parallel::SyncCache[GH->levels]; sync_cache_outbd = new Parallel::SyncCache[GH->levels];
} sync_cache_psi4 = new Parallel::SyncCache[GH->levels];
}
//================================================================================================ //================================================================================================
@@ -750,8 +762,8 @@ void bssn_class::Initialize()
//================================================================================================ //================================================================================================
bssn_class::~bssn_class() bssn_class::~bssn_class()
{ {
#ifdef With_AHF #ifdef With_AHF
AHList->clearList(); AHList->clearList();
AHDList->clearList(); AHDList->clearList();
@@ -1008,12 +1020,30 @@ bssn_class::~bssn_class()
sync_cache_rp_coarse[i].destroy(); sync_cache_rp_coarse[i].destroy();
delete[] sync_cache_rp_coarse; delete[] sync_cache_rp_coarse;
} }
if (sync_cache_rp_fine) if (sync_cache_rp_fine)
{ {
for (int i = 0; i < GH->levels; i++) for (int i = 0; i < GH->levels; i++)
sync_cache_rp_fine[i].destroy(); sync_cache_rp_fine[i].destroy();
delete[] sync_cache_rp_fine; delete[] sync_cache_rp_fine;
} }
if (sync_cache_restrict)
{
for (int i = 0; i < GH->levels; i++)
sync_cache_restrict[i].destroy();
delete[] sync_cache_restrict;
}
if (sync_cache_outbd)
{
for (int i = 0; i < GH->levels; i++)
sync_cache_outbd[i].destroy();
delete[] sync_cache_outbd;
}
if (sync_cache_psi4)
{
for (int i = 0; i < GH->levels; i++)
sync_cache_psi4[i].destroy();
delete[] sync_cache_psi4;
}
delete GH; delete GH;
#ifdef WithShell #ifdef WithShell
@@ -1046,8 +1076,25 @@ bssn_class::~bssn_class()
delete ConVMonitor; delete ConVMonitor;
delete Waveshell; delete Waveshell;
delete CheckPoint; delete CheckPoint;
} }
void bssn_class::InvalidateSyncCaches()
{
if (!GH)
return;
for (int il = 0; il < GH->levels; il++)
{
sync_cache_pre[il].invalidate();
sync_cache_cor[il].invalidate();
sync_cache_rp_coarse[il].invalidate();
sync_cache_rp_fine[il].invalidate();
sync_cache_restrict[il].invalidate();
sync_cache_outbd[il].invalidate();
sync_cache_psi4[il].invalidate();
}
}
//================================================================================================ //================================================================================================
@@ -2030,9 +2077,10 @@ void bssn_class::Read_Ansorg()
void bssn_class::Evolve(int Steps) void bssn_class::Evolve(int Steps)
{ {
clock_t prev_clock, curr_clock; clock_t prev_clock, curr_clock;
double LastDump = 0.0, LastCheck = 0.0, Last2dDump = 0.0; double LastDump = 0.0, LastCheck = 0.0, Last2dDump = 0.0;
LastAnas = 0; LastAnas = 0;
LastConsOut = 0;
#if 0 #if 0
//initial checkpoint for special uasge //initial checkpoint for special uasge
{ {
@@ -2129,8 +2177,10 @@ void bssn_class::Evolve(int Steps)
#endif #endif
*/ */
perf bssn_perf; #if BSSN_ENABLE_MEM_USAGE_LOG
size_t current_min, current_avg, current_max, peak_min, peak_avg, peak_max; perf bssn_perf;
size_t current_min, current_avg, current_max, peak_min, peak_avg, peak_max;
#endif
for (int lev = 0; lev < GH->levels; lev++) for (int lev = 0; lev < GH->levels; lev++)
GH->Lt[lev] = PhysTime; GH->Lt[lev] = PhysTime;
@@ -2215,7 +2265,7 @@ void bssn_class::Evolve(int Steps)
GH->Regrid(Symmetry, BH_num, Porgbr, Porg0, GH->Regrid(Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor); fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } InvalidateSyncCaches();
#endif #endif
#if (REGLEV == 0 && (PSTR == 1 || PSTR == 2)) #if (REGLEV == 0 && (PSTR == 1 || PSTR == 2))
@@ -2224,21 +2274,23 @@ void bssn_class::Evolve(int Steps)
// fgt(PhysTime-dT_mon,StartTime,dT_mon/2),ErrorMonitor); // fgt(PhysTime-dT_mon,StartTime,dT_mon/2),ErrorMonitor);
#endif #endif
// Retrieve memory usage information used during computation; master process prints it #if BSSN_ENABLE_MEM_USAGE_LOG
bssn_perf.MemoryUsage(&current_min, &current_avg, &current_max, // Retrieve memory usage information used during computation; master process prints it
&peak_min, &peak_avg, &peak_max, nprocs); bssn_perf.MemoryUsage(&current_min, &current_avg, &current_max,
if (myrank == 0) &peak_min, &peak_avg, &peak_max, nprocs);
{ if (myrank == 0)
printf(" Memory usage: current %0.4lg/%0.4lg/%0.4lgMB, " {
"peak %0.4lg/%0.4lg/%0.4lgMB\n", printf(" Memory usage: current %0.4lg/%0.4lg/%0.4lgMB, "
(double)current_min / (1024.0 * 1024.0), "peak %0.4lg/%0.4lg/%0.4lgMB\n",
(double)current_avg / (1024.0 * 1024.0), (double)current_min / (1024.0 * 1024.0),
(double)current_max / (1024.0 * 1024.0), (double)current_avg / (1024.0 * 1024.0),
(double)peak_min / (1024.0 * 1024.0), (double)current_max / (1024.0 * 1024.0),
(double)peak_avg / (1024.0 * 1024.0), (double)peak_min / (1024.0 * 1024.0),
(double)peak_max / (1024.0 * 1024.0)); (double)peak_avg / (1024.0 * 1024.0),
cout << endl; (double)peak_max / (1024.0 * 1024.0));
} cout << endl;
}
#endif
// Output puncture positions at each step // Output puncture positions at each step
if (myrank == 0) if (myrank == 0)
@@ -2286,18 +2338,21 @@ void bssn_class::Evolve(int Steps)
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
// When LastCheck >= CheckTime, perform runtime checks and output status data // When LastCheck >= CheckTime, perform runtime checks and output status data
if (LastCheck >= CheckTime) if (LastCheck >= CheckTime)
{ {
LastCheck = 0; LastCheck = 0;
CheckPoint->write_Black_Hole_position(BH_num_input, BH_num, Porg0, Porgbr, Mass); CheckPoint->write_Black_Hole_position(BH_num_input, BH_num, Porg0, Porgbr, Mass);
CheckPoint->writecheck_cgh(PhysTime, GH); CheckPoint->writecheck_cgh(PhysTime, GH);
#ifdef WithShell #ifdef WithShell
CheckPoint->writecheck_sh(PhysTime, SH); CheckPoint->writecheck_sh(PhysTime, SH);
#endif #endif
CheckPoint->write_bssn(LastDump, Last2dDump, LastAnas); CheckPoint->write_bssn(LastDump, Last2dDump, LastAnas);
} }
}
// Keep output/analysis phases aligned across ranks before the next coarse step.
MPI_Barrier(MPI_COMM_WORLD);
}
/* /*
#ifdef With_AHF #ifdef With_AHF
// final apparent horizon finding // final apparent horizon finding
@@ -2431,7 +2486,7 @@ void bssn_class::RecursiveStep(int lev)
if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } InvalidateSyncCaches();
#endif #endif
} }
@@ -2610,7 +2665,7 @@ void bssn_class::ParallelStep()
if (GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0, if (GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } InvalidateSyncCaches();
#endif #endif
} }
@@ -2777,7 +2832,7 @@ void bssn_class::ParallelStep()
if (GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0, if (GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor)) fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor))
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } InvalidateSyncCaches();
// a_stream.clear(); // a_stream.clear();
// a_stream.str(""); // a_stream.str("");
@@ -2792,7 +2847,7 @@ void bssn_class::ParallelStep()
if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } InvalidateSyncCaches();
// a_stream.clear(); // a_stream.clear();
// a_stream.str(""); // a_stream.str("");
@@ -2811,7 +2866,7 @@ void bssn_class::ParallelStep()
if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor)) fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } InvalidateSyncCaches();
// a_stream.clear(); // a_stream.clear();
// a_stream.str(""); // a_stream.str("");
@@ -2827,7 +2882,7 @@ void bssn_class::ParallelStep()
if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor)) fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } InvalidateSyncCaches();
// a_stream.clear(); // a_stream.clear();
// a_stream.str(""); // a_stream.str("");
@@ -3016,9 +3071,14 @@ void bssn_class::RecursiveStep(int lev, int num) // in all 2^(lev+1)-1 steps
#if (PSTR == 0) #if (PSTR == 0)
#if 1 #if 1
void bssn_class::Step(int lev, int YN) void bssn_class::Step(int lev, int YN)
{ {
setpbh(BH_num, Porg0, Mass, BH_num_input); #ifdef USE_GPU
Step_MainPath_GPU(lev, YN);
return;
#endif
setpbh(BH_num, Porg0, Mass, BH_num_input);
double dT_lev = dT * pow(0.5, Mymax(lev, trfls)); double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
@@ -6238,7 +6298,7 @@ for(int ilev = GH->levels-1;ilev>=lev;ilev--)
for(int ilev=GH->levels-1;ilev>lev;ilev--) for(int ilev=GH->levels-1;ilev>lev;ilev--)
RestrictProlong(ilev,1,false,DG_List,DG_List,DG_List); RestrictProlong(ilev,1,false,DG_List,DG_List,DG_List);
#else #else
Parallel::Sync(GH->PatL[lev], DG_List, Symmetry); Parallel::Sync_cached(GH->PatL[lev], DG_List, Symmetry, sync_cache_psi4[lev]);
#endif #endif
#ifdef WithShell #ifdef WithShell
@@ -6893,10 +6953,10 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev)
{ {
LastAnas += dT_lev; LastAnas += dT_lev;
if (LastAnas >= AnasTime) if (LastAnas >= AnasTime)
{ {
#ifdef Point_Psi4 #ifdef Point_Psi4
#error "not support parallel levels yet" #error "not support parallel levels yet"
// Gam_ijk and R_ij have been calculated in Interp_Constraint() // Gam_ijk and R_ij have been calculated in Interp_Constraint()
double SYM = 1, ANT = -1; double SYM = 1, ANT = -1;
for (int levh = lev; levh < GH->levels; levh++) for (int levh = lev; levh < GH->levels; levh++)
@@ -7240,9 +7300,9 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev)
//================================================================================================ //================================================================================================
void bssn_class::Constraint_Out() void bssn_class::Constraint_Out()
{ {
LastConsOut += dT * pow(0.5, Mymax(0, trfls)); LastConsOut += dT * pow(0.5, Mymax(0, trfls));
if (LastConsOut >= AnasTime) if (LastConsOut >= AnasTime)
// Constraint violation // Constraint violation
@@ -7262,12 +7322,15 @@ void bssn_class::Constraint_Out()
MyList<Block> *BP = Pp->data->blb; MyList<Block> *BP = Pp->data->blb;
while (BP) while (BP)
{ {
Block *cg = BP->data; Block *cg = BP->data;
if (myrank == cg->rank) if (myrank == cg->rank)
{ {
f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], #ifdef USE_GPU
cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn], gpu_rhs(CALLED_BY_CONSTRAINT_CONS_ONLY, myrank, RHS_PARA_CALLED_Constraint_Out);
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], #else
f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn], cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn],
@@ -7295,11 +7358,12 @@ void bssn_class::Constraint_Out()
cg->fgfs[Gamzyy->sgfn], cg->fgfs[Gamzyz->sgfn], cg->fgfs[Gamzzz->sgfn], cg->fgfs[Gamzyy->sgfn], cg->fgfs[Gamzyz->sgfn], cg->fgfs[Gamzzz->sgfn],
cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn], cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn],
cg->fgfs[Ryy->sgfn], cg->fgfs[Ryz->sgfn], cg->fgfs[Rzz->sgfn], cg->fgfs[Ryy->sgfn], cg->fgfs[Ryz->sgfn], cg->fgfs[Rzz->sgfn],
cg->fgfs[Cons_Ham->sgfn], cg->fgfs[Cons_Ham->sgfn],
cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn], cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn], cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
Symmetry, lev, ndeps, pre); Symmetry, lev, ndeps, pre);
} #endif
}
if (BP == Pp->data->ble) if (BP == Pp->data->ble)
break; break;
BP = BP->next; BP = BP->next;
@@ -7307,7 +7371,7 @@ void bssn_class::Constraint_Out()
Pp = Pp->next; Pp = Pp->next;
} }
} }
Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry); Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry, "bssn_class::Constraint_Out[level]");
} }
#ifdef WithShell #ifdef WithShell
if (0) // if the constrait quantities can be reused from the step rhs calculation if (0) // if the constrait quantities can be reused from the step rhs calculation
@@ -7529,7 +7593,7 @@ void bssn_class::AH_Prepare_derivatives()
} }
Pp = Pp->next; Pp = Pp->next;
} }
Parallel::Sync(GH->PatL[lev], AHDList, Symmetry); Parallel::Sync(GH->PatL[lev], AHDList, Symmetry, "bssn_class::AH_Prepare_derivatives");
} }
} }
@@ -7765,12 +7829,15 @@ void bssn_class::Interp_Constraint(bool infg)
MyList<Block> *BP = Pp->data->blb; MyList<Block> *BP = Pp->data->blb;
while (BP) while (BP)
{ {
Block *cg = BP->data; Block *cg = BP->data;
if (myrank == cg->rank) if (myrank == cg->rank)
{ {
f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], #ifdef USE_GPU
cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn], gpu_rhs(CALLED_BY_CONSTRAINT_CONS_ONLY, myrank, RHS_PARA_CALLED_Interp_Constraint);
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], #else
f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn], cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn],
@@ -7798,11 +7865,12 @@ void bssn_class::Interp_Constraint(bool infg)
cg->fgfs[Gamzyy->sgfn], cg->fgfs[Gamzyz->sgfn], cg->fgfs[Gamzzz->sgfn], cg->fgfs[Gamzyy->sgfn], cg->fgfs[Gamzyz->sgfn], cg->fgfs[Gamzzz->sgfn],
cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn], cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn],
cg->fgfs[Ryy->sgfn], cg->fgfs[Ryz->sgfn], cg->fgfs[Rzz->sgfn], cg->fgfs[Ryy->sgfn], cg->fgfs[Ryz->sgfn], cg->fgfs[Rzz->sgfn],
cg->fgfs[Cons_Ham->sgfn], cg->fgfs[Cons_Ham->sgfn],
cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn], cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn], cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
Symmetry, lev, ndeps, pre); Symmetry, lev, ndeps, pre);
} #endif
}
if (BP == Pp->data->ble) if (BP == Pp->data->ble)
break; break;
BP = BP->next; BP = BP->next;
@@ -7810,7 +7878,7 @@ void bssn_class::Interp_Constraint(bool infg)
Pp = Pp->next; Pp = Pp->next;
} }
} }
Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry); Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry, "bssn_class::Interp_Constraint[level]");
} }
#ifdef WithShell #ifdef WithShell
if (0) // if the constrait quantities can be reused from the step rhs calculation if (0) // if the constrait quantities can be reused from the step rhs calculation
@@ -8068,7 +8136,7 @@ void bssn_class::Compute_Constraint()
Pp = Pp->next; Pp = Pp->next;
} }
} }
Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry); Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry, "bssn_class::Compute_Constraint[level]");
} }
// prolong restrict constraint quantities // prolong restrict constraint quantities
for (lev = GH->levels - 1; lev > 0; lev--) for (lev = GH->levels - 1; lev > 0; lev--)
@@ -8381,12 +8449,18 @@ void bssn_class::Enforce_algcon(int lev, int fg)
//================================================================================================ //================================================================================================
bool bssn_class::check_Stdin_Abort() bool bssn_class::check_Stdin_Abort()
{ {
// Non-interactive launches (mpirun via Python/subprocess, batch jobs, redirected stdin)
fd_set readfds; // should not probe stdin. Some MPI runtimes treat stdin as a managed channel and can
// fail when rank 0 polls/consumes it.
struct timeval timeout; if (!isatty(STDIN_FILENO)) {
return false;
}
fd_set readfds;
struct timeval timeout;
FD_ZERO(&readfds); FD_ZERO(&readfds);
FD_SET(STDIN_FILENO, &readfds); FD_SET(STDIN_FILENO, &readfds);
@@ -8395,14 +8469,17 @@ bool bssn_class::check_Stdin_Abort()
timeout.tv_sec = 0; timeout.tv_sec = 0;
timeout.tv_usec = 0; timeout.tv_usec = 0;
int activity = select(STDIN_FILENO + 1, &readfds, nullptr, nullptr, &timeout); int activity = select(STDIN_FILENO + 1, &readfds, nullptr, nullptr, &timeout);
if (activity <= 0) {
if (activity > 0 && FD_ISSET(STDIN_FILENO, &readfds)) { return false;
string input_abort; }
if (cin >> input_abort) {
if (input_abort == "stop") { if (FD_ISSET(STDIN_FILENO, &readfds)) {
return true; string input_abort;
} if (cin >> input_abort) {
if (input_abort == "stop") {
return true;
}
} }
} }

View File

@@ -128,10 +128,11 @@ public:
Parallel::SyncCache *sync_cache_pre; // per-level cache for predictor sync Parallel::SyncCache *sync_cache_pre; // per-level cache for predictor sync
Parallel::SyncCache *sync_cache_cor; // per-level cache for corrector sync Parallel::SyncCache *sync_cache_cor; // per-level cache for corrector sync
Parallel::SyncCache *sync_cache_rp_coarse; // RestrictProlong sync on PatL[lev-1] Parallel::SyncCache *sync_cache_rp_coarse; // RestrictProlong sync on PatL[lev-1]
Parallel::SyncCache *sync_cache_rp_fine; // RestrictProlong sync on PatL[lev] Parallel::SyncCache *sync_cache_rp_fine; // RestrictProlong sync on PatL[lev]
Parallel::SyncCache *sync_cache_restrict; // cached Restrict in RestrictProlong Parallel::SyncCache *sync_cache_restrict; // cached Restrict in RestrictProlong
Parallel::SyncCache *sync_cache_outbd; // cached OutBdLow2Hi in RestrictProlong Parallel::SyncCache *sync_cache_outbd; // cached OutBdLow2Hi in RestrictProlong
Parallel::SyncCache *sync_cache_psi4; // cached Psi4 sync on PatL[lev]
monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor; monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
monitor *ConVMonitor; monitor *ConVMonitor;
@@ -171,16 +172,20 @@ public:
bool check_Stdin_Abort(); bool check_Stdin_Abort();
virtual void Setup_Initial_Data_Cao(); virtual void Setup_Initial_Data_Cao();
virtual void Setup_Initial_Data_Lousto(); virtual void Setup_Initial_Data_Lousto();
virtual void Initialize(); virtual void Initialize();
virtual void Read_Ansorg(); virtual void Read_Ansorg();
virtual void Read_Pablo() {}; virtual void Read_Pablo() {};
virtual void Compute_Psi4(int lev); void InvalidateSyncCaches();
virtual void Step(int lev, int YN); virtual void Compute_Psi4(int lev);
virtual void Interp_Constraint(bool infg); virtual void Step(int lev, int YN);
virtual void Constraint_Out(); #ifdef USE_GPU
virtual void Compute_Constraint(); void Step_MainPath_GPU(int lev, int YN);
#endif
virtual void Interp_Constraint(bool infg);
virtual void Constraint_Out();
virtual void Compute_Constraint();
#ifdef With_AHF #ifdef With_AHF
protected: protected:

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,68 @@
#ifndef BSSN_CUDA_OPS_H
#define BSSN_CUDA_OPS_H
int bssn_cuda_enforce_ga(int *ex,
double *dxx, double *gxy, double *gxz,
double *dyy, double *gyz, double *dzz,
double *Axx, double *Axy, double *Axz,
double *Ayy, double *Ayz, double *Azz);
int bssn_cuda_rk4_boundary_var(int *ex, double dT,
const double *X, const double *Y, const double *Z,
double xmin, double ymin, double zmin,
double xmax, double ymax, double zmax,
const double *state0,
const double *phi_field,
const double *lap_field,
const double *boundary_src,
double *stage_data,
double *rhs_accum,
double propspeed,
const double SoA[3],
int symmetry,
int lev,
int rk_stage,
bool force_host_boundary_fix,
bool download_to_host = true);
int bssn_cuda_rk4_boundary_batch(int *ex, double dT,
const double *X, const double *Y, const double *Z,
double xmin, double ymin, double zmin,
double xmax, double ymax, double zmax,
int symmetry,
const double *const *state0_list,
double *const *stage_data_list,
double *const *rhs_accum_list,
int num_var,
int rk_stage,
bool download_to_host = false);
int bssn_cuda_lowerbound(int *ex, double *chi, double tinny, bool download_to_host = true);
int bssn_cuda_download_buffer(int *ex, double *host_ptr);
void bssn_cuda_release_rk4_caches();
void bssn_cuda_release_interp_caches();
int bssn_cuda_prolong3_pack(int wei,
const double *llbc, const double *uubc, const int *extc, const double *func,
const double *llbf, const double *uubf, const int *extf, double *funf,
const double *llbp, const double *uubp,
const double *SoA, int symmetry);
int bssn_cuda_restrict3_pack(int wei,
const double *llbc, const double *uubc, const int *extc, double *func,
const double *llbf, const double *uubf, const int *extf, const double *funf,
const double *llbr, const double *uubr,
const double *SoA, int symmetry);
int bssn_cuda_interp_points_batch(const int *ex,
const double *X, const double *Y, const double *Z,
const double *const *fields,
const double *soa_flat,
int num_var,
const double *px, const double *py, const double *pz,
int num_points,
int ordn,
int symmetry,
double *out);
#endif

View File

@@ -0,0 +1,936 @@
#include "macrodef.h"
#ifdef USE_GPU
#include <algorithm>
#include <cmath>
#include <cstring>
#include <cstdlib>
#include <iomanip>
#include <vector>
#include "bssn_class.h"
#include "bssn_cuda_ops.h"
#include "bssn_gpu.h"
#include "bssn_macro.h"
namespace
{
enum StageProfileMetric
{
STAGE_PROFILE_TOTAL = 0,
STAGE_PROFILE_RHS,
STAGE_PROFILE_RUN_STAGE,
STAGE_PROFILE_RUN_STAGE_DEVICE,
STAGE_PROFILE_RUN_STAGE_HOST_FIX,
STAGE_PROFILE_LOWERBOUND,
STAGE_PROFILE_ENSURE,
STAGE_PROFILE_DOWNLOAD,
STAGE_PROFILE_CLEAR_CACHE,
STAGE_PROFILE_SYNC_START,
STAGE_PROFILE_SYNC_FINISH,
STAGE_PROFILE_REFRESH,
STAGE_PROFILE_COUNT
};
static const int kStageProfileMaxLevels = 32;
struct StageProfileStore
{
bool env_checked;
bool enabled;
int calls[kStageProfileMaxLevels];
double metric[kStageProfileMaxLevels][STAGE_PROFILE_COUNT];
};
StageProfileStore &stage_profile_store()
{
static StageProfileStore store = {};
return store;
}
bool stage_profile_enabled()
{
StageProfileStore &store = stage_profile_store();
if (!store.env_checked)
{
const char *env = getenv("AMSS_GPU_STAGE_TIMING");
store.enabled = (env && env[0] && strcmp(env, "0") != 0);
store.env_checked = true;
}
return store.enabled;
}
void stage_profile_note_call(int lev)
{
if (lev >= 0 && lev < kStageProfileMaxLevels)
stage_profile_store().calls[lev]++;
}
void stage_profile_add(int lev, StageProfileMetric metric, double seconds)
{
if (lev >= 0 && lev < kStageProfileMaxLevels)
stage_profile_store().metric[lev][metric] += seconds;
}
const char *stage_profile_metric_name(StageProfileMetric metric)
{
switch (metric)
{
case STAGE_PROFILE_TOTAL:
return "total";
case STAGE_PROFILE_RHS:
return "rhs";
case STAGE_PROFILE_RUN_STAGE:
return "run_stage";
case STAGE_PROFILE_RUN_STAGE_DEVICE:
return "run_stage_dev";
case STAGE_PROFILE_RUN_STAGE_HOST_FIX:
return "run_stage_host";
case STAGE_PROFILE_LOWERBOUND:
return "lower";
case STAGE_PROFILE_ENSURE:
return "ensure";
case STAGE_PROFILE_DOWNLOAD:
return "download";
case STAGE_PROFILE_CLEAR_CACHE:
return "clear_cache";
case STAGE_PROFILE_SYNC_START:
return "sync_start";
case STAGE_PROFILE_SYNC_FINISH:
return "sync_finish";
case STAGE_PROFILE_REFRESH:
return "refresh";
default:
return "unknown";
}
}
} // namespace
void bssn_cuda_dump_stage_profile()
{
if (!stage_profile_enabled())
return;
int myrank = 0;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
StageProfileStore &store = stage_profile_store();
int global_calls_sum[kStageProfileMaxLevels] = {};
double global_metric_sum[kStageProfileMaxLevels][STAGE_PROFILE_COUNT] = {};
double global_metric_max[kStageProfileMaxLevels][STAGE_PROFILE_COUNT] = {};
MPI_Reduce(store.calls, global_calls_sum, kStageProfileMaxLevels, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
MPI_Reduce(store.metric[0], global_metric_sum[0],
kStageProfileMaxLevels * STAGE_PROFILE_COUNT,
MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
MPI_Reduce(store.metric[0], global_metric_max[0],
kStageProfileMaxLevels * STAGE_PROFILE_COUNT,
MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
if (myrank != 0)
return;
cout << endl;
cout << " GPU stage timing summary (sum/max over MPI ranks) " << endl;
cout << " lev calls";
for (int metric = 0; metric < STAGE_PROFILE_COUNT; ++metric)
cout << " " << setw(22) << stage_profile_metric_name(static_cast<StageProfileMetric>(metric));
cout << endl;
for (int lev = 0; lev < kStageProfileMaxLevels; ++lev)
{
if (global_calls_sum[lev] == 0)
continue;
cout << setw(4) << lev << " " << setw(5) << global_calls_sum[lev];
for (int metric = 0; metric < STAGE_PROFILE_COUNT; ++metric)
{
cout << " "
<< setw(10) << setprecision(6) << fixed << global_metric_sum[lev][metric]
<< "/"
<< setw(10) << setprecision(6) << fixed << global_metric_max[lev][metric];
}
cout << endl;
}
cout << endl;
}
void bssn_class::Step_MainPath_GPU(int lev, int YN)
{
#ifdef WithShell
#error "Step_MainPath_GPU currently supports Patch grids only."
#endif
const bool profile_enabled = stage_profile_enabled();
const double step_total_begin = profile_enabled ? MPI_Wtime() : 0.0;
if (profile_enabled)
stage_profile_note_call(lev);
if (bssn_gpu_bind_process_device(myrank))
{
cerr << "GPU device bind failure on MPI rank " << myrank << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
if (profile_enabled)
{
const double t0 = MPI_Wtime();
bssn_gpu_clear_cached_device_buffers();
stage_profile_add(lev, STAGE_PROFILE_CLEAR_CACHE, MPI_Wtime() - t0);
}
else
bssn_gpu_clear_cached_device_buffers();
setpbh(BH_num, Porg0, Mass, BH_num_input);
const double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
#if (MAPBH == 1)
if (BH_num > 0 && lev == GH->levels - 1)
{
compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
for (int ithBH = 0; ithBH < BH_num; ithBH++)
{
for (int ith = 0; ith < 3; ith++)
Porg1[ithBH][ith] = Porg0[ithBH][ith] + Porg_rhs[ithBH][ith] * dT_lev;
if (Symmetry > 0)
Porg1[ithBH][2] = fabs(Porg1[ithBH][2]);
if (Symmetry == 2)
{
Porg1[ithBH][0] = fabs(Porg1[ithBH][0]);
Porg1[ithBH][1] = fabs(Porg1[ithBH][1]);
}
}
}
if (lev == a_lev)
AnalysisStuff(lev, dT_lev);
#endif
#ifdef With_AHF
AH_Step_Find(lev, dT_lev);
#endif
const bool BB = fgt(PhysTime, StartTime, dT_lev / 2);
(void)BB;
double ndeps = (lev < GH->movls) ? numepsb : numepss;
double TRK4 = PhysTime;
int iter_count = 0;
int pre = 0, cor = 1;
int ERROR = 0;
const bool keep_stage_sync_on_device = (RPS == 1) && (MAPBH == 1) && (REGLEV == 0);
auto run_stage_on_block =
[&](Block *cg, Patch *patch, MyList<var> *state0_list,
MyList<var> *boundary_src_list, MyList<var> *stage_data_list,
MyList<var> *rhs_list, int rk_stage) {
MyList<var> *varl0 = state0_list;
MyList<var> *varlb = boundary_src_list;
MyList<var> *varls = stage_data_list;
MyList<var> *varlr = rhs_list;
std::vector<const double *> batch_state0;
std::vector<double *> batch_stage;
std::vector<double *> batch_rhs;
while (varl0)
{
const bool force_host_boundary_fix = false;
const bool can_batch_device_path = (lev > 0) && !force_host_boundary_fix;
if (can_batch_device_path)
{
batch_state0.push_back(cg->fgfs[varl0->data->sgfn]);
batch_stage.push_back(cg->fgfs[varls->data->sgfn]);
batch_rhs.push_back(cg->fgfs[varlr->data->sgfn]);
varl0 = varl0->next;
varlb = varlb->next;
varls = varls->next;
varlr = varlr->next;
continue;
}
const double var_begin = profile_enabled ? MPI_Wtime() : 0.0;
if (bssn_cuda_rk4_boundary_var(cg->shape, dT_lev,
cg->X[0], cg->X[1], cg->X[2],
patch->bbox[0], patch->bbox[1], patch->bbox[2],
patch->bbox[3], patch->bbox[4], patch->bbox[5],
cg->fgfs[varl0->data->sgfn],
cg->fgfs[phi0->sgfn],
cg->fgfs[Lap0->sgfn],
cg->fgfs[varlb->data->sgfn],
cg->fgfs[varls->data->sgfn],
cg->fgfs[varlr->data->sgfn],
varl0->data->propspeed,
varl0->data->SoA,
Symmetry, lev, rk_stage,
force_host_boundary_fix, false))
{
cerr << "GPU rk4/boundary failure: lev=" << lev
<< " rk_stage=" << rk_stage
<< " var=" << varl0->data->name
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
break;
}
if (profile_enabled)
{
stage_profile_add(lev,
force_host_boundary_fix ? STAGE_PROFILE_RUN_STAGE_HOST_FIX
: STAGE_PROFILE_RUN_STAGE_DEVICE,
MPI_Wtime() - var_begin);
}
varl0 = varl0->next;
varlb = varlb->next;
varls = varls->next;
varlr = varlr->next;
}
if (!ERROR && !batch_state0.empty())
{
const double batch_begin = profile_enabled ? MPI_Wtime() : 0.0;
if (bssn_cuda_rk4_boundary_batch(cg->shape, dT_lev,
cg->X[0], cg->X[1], cg->X[2],
patch->bbox[0], patch->bbox[1], patch->bbox[2],
patch->bbox[3], patch->bbox[4], patch->bbox[5],
Symmetry,
&batch_state0[0],
&batch_stage[0],
&batch_rhs[0],
static_cast<int>(batch_state0.size()),
rk_stage, false))
{
cerr << "GPU rk4/boundary batch failure: lev=" << lev
<< " rk_stage=" << rk_stage
<< " vars=" << batch_state0.size()
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
}
else if (profile_enabled)
{
stage_profile_add(lev, STAGE_PROFILE_RUN_STAGE_DEVICE, MPI_Wtime() - batch_begin);
}
}
};
auto stage_download_var_list =
[&](Block *cg, MyList<var> *var_list, bool skip_unmapped) {
std::vector<double *> batch_host_ptrs;
std::vector<MyList<var> *> batch_vars;
while (var_list)
{
double *host_ptr = cg->fgfs[var_list->data->sgfn];
if (skip_unmapped && !bssn_gpu_find_device_buffer(host_ptr))
{
var_list = var_list->next;
continue;
}
batch_host_ptrs.push_back(host_ptr);
batch_vars.push_back(var_list);
var_list = var_list->next;
}
if (!batch_host_ptrs.empty() &&
bssn_gpu_download_buffer_batch(cg->shape, &batch_host_ptrs[0],
static_cast<int>(batch_host_ptrs.size())))
{
for (size_t i = 0; i < batch_host_ptrs.size(); ++i)
{
if (bssn_cuda_download_buffer(cg->shape, batch_host_ptrs[i]))
{
cerr << "GPU stage download failure: lev=" << lev
<< " var=" << batch_vars[i]->data->name
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
break;
}
}
}
};
auto stage_download_patch_list =
[&](MyList<var> *var_list, bool skip_unmapped) {
MyList<Patch> *patch_it = GH->PatL[lev];
while (patch_it)
{
MyList<Block> *block_it = patch_it->data->blb;
while (block_it)
{
Block *cg = block_it->data;
if (myrank == cg->rank)
stage_download_var_list(cg, var_list, skip_unmapped);
if (block_it == patch_it->data->ble)
break;
block_it = block_it->next;
}
if (ERROR)
break;
patch_it = patch_it->next;
}
};
auto ensure_stage_device_var_list =
[&](Block *cg, MyList<var> *var_list) {
const int n = cg->shape[0] * cg->shape[1] * cg->shape[2];
while (var_list)
{
double *host_ptr = cg->fgfs[var_list->data->sgfn];
if (!bssn_gpu_find_device_buffer(host_ptr) &&
bssn_gpu_stage_upload_buffer(host_ptr, n))
{
cerr << "GPU state ensure failure: lev=" << lev
<< " var=" << var_list->data->name
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
break;
}
var_list = var_list->next;
}
};
auto refresh_synced_device_regions =
[&](Block *cg, MyList<var> *var_list, Parallel::SyncCache &cache) {
std::vector<Parallel::gridseg *> local_segments;
for (int node = 0; node < cache.cpusize; ++node)
{
MyList<Parallel::gridseg> *seg = cache.combined_dst[node];
while (seg)
{
if (seg->data && seg->data->Bg == cg)
local_segments.push_back(seg->data);
seg = seg->next;
}
}
if (local_segments.empty())
return;
const int n = cg->shape[0] * cg->shape[1] * cg->shape[2];
while (var_list)
{
double *host_ptr = cg->fgfs[var_list->data->sgfn];
if (!bssn_gpu_find_device_buffer(host_ptr))
{
if (bssn_gpu_stage_upload_buffer(host_ptr, n))
{
cerr << "GPU sync refresh upload failure: lev=" << lev
<< " var=" << var_list->data->name
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
break;
}
}
else
{
for (size_t i = 0; i < local_segments.size(); ++i)
{
Parallel::gridseg *seg = local_segments[i];
if (bssn_gpu_stage_upload_region(host_ptr,
cg->shape,
cg->bbox,
cg->bbox + dim,
seg->shape,
seg->llb))
{
cerr << "GPU sync region refresh failure: lev=" << lev
<< " var=" << var_list->data->name
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
break;
}
}
if (ERROR)
break;
}
var_list = var_list->next;
}
};
auto refresh_stage_device_after_sync =
[&](MyList<var> *var_list, Parallel::SyncCache &cache) {
MyList<Patch> *patch_it = GH->PatL[lev];
while (patch_it)
{
MyList<Block> *block_it = patch_it->data->blb;
while (block_it)
{
Block *cg = block_it->data;
if (myrank == cg->rank)
refresh_synced_device_regions(cg, var_list, cache);
if (block_it == patch_it->data->ble)
break;
block_it = block_it->next;
}
if (ERROR)
break;
patch_it = patch_it->next;
}
};
auto refresh_stage_host_before_sync =
[&](MyList<var> *var_list, Parallel::SyncCache &cache) -> bool {
if (!cache.valid || !cache.combined_src || myrank < 0 || myrank >= cache.cpusize)
return false;
MyList<Patch> *patch_it = GH->PatL[lev];
while (patch_it)
{
MyList<Block> *block_it = patch_it->data->blb;
while (block_it)
{
Block *cg = block_it->data;
if (myrank == cg->rank)
{
std::vector<Parallel::gridseg *> local_segments;
MyList<Parallel::gridseg> *seg = cache.combined_src[myrank];
while (seg)
{
if (seg->data && seg->data->Bg == cg)
local_segments.push_back(seg->data);
seg = seg->next;
}
if (!local_segments.empty())
{
MyList<var> *var_it = var_list;
while (var_it)
{
double *host_ptr = cg->fgfs[var_it->data->sgfn];
for (size_t i = 0; i < local_segments.size(); ++i)
{
Parallel::gridseg *src_seg = local_segments[i];
if (bssn_gpu_stage_download_region(host_ptr,
cg->shape,
cg->bbox,
cg->bbox + dim,
src_seg->shape,
src_seg->llb))
{
cerr << "GPU sync region download failure: lev=" << lev
<< " var=" << var_it->data->name
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
return true;
}
}
var_it = var_it->next;
}
}
}
if (block_it == patch_it->data->ble)
break;
block_it = block_it->next;
}
patch_it = patch_it->next;
}
return true;
};
auto can_pack_sync_from_device =
[&](MyList<var> *var_list, Parallel::SyncCache &cache) -> bool {
if (!cache.valid || !cache.combined_src || myrank < 0 || myrank >= cache.cpusize)
return false;
MyList<Parallel::gridseg> *seg = cache.combined_src[myrank];
while (seg)
{
MyList<var> *var_it = var_list;
while (var_it)
{
if (!bssn_gpu_find_device_buffer(seg->data->Bg->fgfs[var_it->data->sgfn]))
return false;
var_it = var_it->next;
}
seg = seg->next;
}
return true;
};
MyList<Patch> *Pp = GH->PatL[lev];
while (Pp)
{
MyList<Block> *BP = Pp->data->blb;
while (BP)
{
Block *cg = BP->data;
if (myrank == cg->rank)
{
double t0 = 0.0;
if (profile_enabled)
t0 = MPI_Wtime();
if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_FIRST_TIME))
ERROR = 1;
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_RHS, MPI_Wtime() - t0);
if (profile_enabled)
t0 = MPI_Wtime();
run_stage_on_block(cg, Pp->data, StateList, StateList, SynchList_pre, RHSList, iter_count);
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_RUN_STAGE, MPI_Wtime() - t0);
if (profile_enabled)
t0 = MPI_Wtime();
if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi->sgfn], chitiny, false))
{
cerr << "GPU lowerbound failure: lev=" << lev
<< " rk_stage=" << iter_count
<< " var=" << phi->name
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
}
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_LOWERBOUND, MPI_Wtime() - t0);
}
if (BP == Pp->data->ble)
break;
BP = BP->next;
}
Pp = Pp->next;
}
if (!ERROR)
{
if (!keep_stage_sync_on_device)
{
double t0 = 0.0;
if (profile_enabled)
t0 = MPI_Wtime();
stage_download_patch_list(SynchList_pre, false);
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_DOWNLOAD, MPI_Wtime() - t0);
if (!ERROR)
{
if (profile_enabled)
t0 = MPI_Wtime();
bssn_gpu_clear_cached_device_buffers();
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_CLEAR_CACHE, MPI_Wtime() - t0);
}
}
}
MPI_Request err_req_pre;
{
int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_pre);
}
Parallel::AsyncSyncState async_pre;
if (profile_enabled)
{
const double t0 = MPI_Wtime();
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
stage_profile_add(lev, STAGE_PROFILE_SYNC_START, MPI_Wtime() - t0);
}
else
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
if (profile_enabled)
{
const double t0 = MPI_Wtime();
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry,
!keep_stage_sync_on_device);
stage_profile_add(lev, STAGE_PROFILE_SYNC_FINISH, MPI_Wtime() - t0);
}
else
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry,
!keep_stage_sync_on_device);
if (!ERROR && !keep_stage_sync_on_device)
{
if (profile_enabled)
{
const double t0 = MPI_Wtime();
refresh_stage_device_after_sync(SynchList_pre, sync_cache_pre[lev]);
stage_profile_add(lev, STAGE_PROFILE_REFRESH, MPI_Wtime() - t0);
}
else
refresh_stage_device_after_sync(SynchList_pre, sync_cache_pre[lev]);
}
MPI_Wait(&err_req_pre, MPI_STATUS_IGNORE);
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#if (MAPBH == 0)
if (BH_num > 0 && lev == GH->levels - 1)
{
compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
for (int ithBH = 0; ithBH < BH_num; ithBH++)
{
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg[ithBH][0], Porg_rhs[ithBH][0], iter_count);
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg[ithBH][1], Porg_rhs[ithBH][1], iter_count);
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg[ithBH][2], Porg_rhs[ithBH][2], iter_count);
if (Symmetry > 0)
Porg[ithBH][2] = fabs(Porg[ithBH][2]);
if (Symmetry == 2)
{
Porg[ithBH][0] = fabs(Porg[ithBH][0]);
Porg[ithBH][1] = fabs(Porg[ithBH][1]);
}
}
}
if (lev == a_lev)
AnalysisStuff(lev, dT_lev);
#endif
for (iter_count = 1; iter_count < 4; iter_count++)
{
if (iter_count == 1 || iter_count == 3)
TRK4 += dT_lev / 2;
Pp = GH->PatL[lev];
while (Pp)
{
MyList<Block> *BP = Pp->data->blb;
while (BP)
{
Block *cg = BP->data;
if (myrank == cg->rank)
{
double t0 = 0.0;
if (profile_enabled)
t0 = MPI_Wtime();
ensure_stage_device_var_list(cg, SynchList_pre);
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_ENSURE, MPI_Wtime() - t0);
if (profile_enabled)
t0 = MPI_Wtime();
if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_THEN))
ERROR = 1;
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_RHS, MPI_Wtime() - t0);
if (profile_enabled)
t0 = MPI_Wtime();
run_stage_on_block(cg, Pp->data, StateList, SynchList_pre, SynchList_cor, RHSList, iter_count);
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_RUN_STAGE, MPI_Wtime() - t0);
if (profile_enabled)
t0 = MPI_Wtime();
if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi1->sgfn], chitiny, false))
{
cerr << "GPU lowerbound failure: lev=" << lev
<< " rk_stage=" << iter_count
<< " var=" << phi1->name
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
}
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_LOWERBOUND, MPI_Wtime() - t0);
}
if (BP == Pp->data->ble)
break;
BP = BP->next;
}
Pp = Pp->next;
}
if (!ERROR)
{
if (!keep_stage_sync_on_device)
{
double t0 = 0.0;
if (profile_enabled)
t0 = MPI_Wtime();
stage_download_patch_list(SynchList_cor, false);
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_DOWNLOAD, MPI_Wtime() - t0);
if (!ERROR)
{
if (profile_enabled)
t0 = MPI_Wtime();
bssn_gpu_clear_cached_device_buffers();
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_CLEAR_CACHE, MPI_Wtime() - t0);
}
}
}
MPI_Request err_req_cor;
{
int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
}
Parallel::AsyncSyncState async_cor;
if (profile_enabled)
{
const double t0 = MPI_Wtime();
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
stage_profile_add(lev, STAGE_PROFILE_SYNC_START, MPI_Wtime() - t0);
}
else
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
if (profile_enabled)
{
const double t0 = MPI_Wtime();
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry,
!keep_stage_sync_on_device);
stage_profile_add(lev, STAGE_PROFILE_SYNC_FINISH, MPI_Wtime() - t0);
}
else
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry,
!keep_stage_sync_on_device);
if (!ERROR && !keep_stage_sync_on_device && iter_count < 3)
{
if (profile_enabled)
{
const double t0 = MPI_Wtime();
refresh_stage_device_after_sync(SynchList_cor, sync_cache_cor[lev]);
stage_profile_add(lev, STAGE_PROFILE_REFRESH, MPI_Wtime() - t0);
}
else
refresh_stage_device_after_sync(SynchList_cor, sync_cache_cor[lev]);
}
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#if (MAPBH == 0)
if (BH_num > 0 && lev == GH->levels - 1)
{
compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev);
for (int ithBH = 0; ithBH < BH_num; ithBH++)
{
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg1[ithBH][0], Porg_rhs[ithBH][0], iter_count);
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg1[ithBH][1], Porg_rhs[ithBH][1], iter_count);
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg1[ithBH][2], Porg_rhs[ithBH][2], iter_count);
if (Symmetry > 0)
Porg1[ithBH][2] = fabs(Porg1[ithBH][2]);
if (Symmetry == 2)
{
Porg1[ithBH][0] = fabs(Porg1[ithBH][0]);
Porg1[ithBH][1] = fabs(Porg1[ithBH][1]);
}
}
}
#endif
if (iter_count < 3)
{
Pp = GH->PatL[lev];
while (Pp)
{
MyList<Block> *BP = Pp->data->blb;
while (BP)
{
BP->data->swapList(SynchList_pre, SynchList_cor, myrank);
if (BP == Pp->data->ble)
break;
BP = BP->next;
}
Pp = Pp->next;
}
#if (MAPBH == 0)
if (BH_num > 0 && lev == GH->levels - 1)
{
for (int ithBH = 0; ithBH < BH_num; ithBH++)
{
Porg[ithBH][0] = Porg1[ithBH][0];
Porg[ithBH][1] = Porg1[ithBH][1];
Porg[ithBH][2] = Porg1[ithBH][2];
}
}
#endif
}
}
#if (RPS == 0)
RestrictProlong(lev, YN, BB);
#endif
Pp = GH->PatL[lev];
while (Pp)
{
MyList<Block> *BP = Pp->data->blb;
while (BP)
{
Block *cg = BP->data;
cg->swapList(StateList, SynchList_cor, myrank);
cg->swapList(OldStateList, SynchList_cor, myrank);
if (BP == Pp->data->ble)
break;
BP = BP->next;
}
Pp = Pp->next;
}
if (!ERROR && keep_stage_sync_on_device)
{
// After the swaps above, only StateList points at arrays updated during this step.
// OldStateList/SynchList_cor remain valid on host because their backing arrays were
// read-only during the RK step, and SynchList_pre is reused only as scratch later.
const double t0 = profile_enabled ? MPI_Wtime() : 0.0;
stage_download_patch_list(StateList, true);
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_DOWNLOAD, MPI_Wtime() - t0);
}
if (profile_enabled)
{
const double t0 = MPI_Wtime();
bssn_gpu_clear_cached_device_buffers();
stage_profile_add(lev, STAGE_PROFILE_CLEAR_CACHE, MPI_Wtime() - t0);
}
else
bssn_gpu_clear_cached_device_buffers();
if (BH_num > 0 && lev == GH->levels - 1)
{
for (int ithBH = 0; ithBH < BH_num; ithBH++)
{
Porg0[ithBH][0] = Porg1[ithBH][0];
Porg0[ithBH][1] = Porg1[ithBH][1];
Porg0[ithBH][2] = Porg1[ithBH][2];
}
}
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_TOTAL, MPI_Wtime() - step_total_begin);
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -4,10 +4,8 @@
#include "bssn_macro.h" #include "bssn_macro.h"
#include "macrodef.fh" #include "macrodef.fh"
#define DEVICE_ID 0 #define GRID_DIM 256
// #define DEVICE_ID_BY_MPI_RANK #define BLOCK_DIM 128
#define GRID_DIM 256
#define BLOCK_DIM 128
#define _FH2_(i, j, k) fh[(i) + (j) * _1D_SIZE[2] + (k) * _2D_SIZE[2]] #define _FH2_(i, j, k) fh[(i) + (j) * _1D_SIZE[2] + (k) * _2D_SIZE[2]]
#define _FH3_(i, j, k) fh[(i) + (j) * _1D_SIZE[3] + (k) * _2D_SIZE[3]] #define _FH3_(i, j, k) fh[(i) + (j) * _1D_SIZE[3] + (k) * _2D_SIZE[3]]
@@ -65,9 +63,45 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,
double *Gmx_Res, double *Gmy_Res, double *Gmz_Res, double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
int &Symmetry, int &Lev, double &eps, int &co); int &Symmetry, int &Lev, double &eps, int &co);
int gpu_rhs_ss(RHS_SS_PARA); int gpu_rhs_ss(RHS_SS_PARA);
/** Init GPU side data in GPUMeta. */ int bssn_gpu_bind_process_device(int mpi_rank);
// void init_fluid_meta_gpu(GPUMeta *gpu_meta); void bssn_gpu_clear_cached_device_buffers();
void bssn_gpu_release_pinned_host_buffers();
const double *bssn_gpu_find_device_buffer(const double *host_ptr);
void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr);
void bssn_gpu_prepare_host_buffer(const double *host_ptr, int count);
int bssn_gpu_stage_upload_buffer(const double *host_ptr, int count);
int bssn_gpu_stage_zero_buffer(const double *host_ptr, int count);
int bssn_gpu_stage_upload_region(const double *host_ptr,
const int *full_shape,
const double *full_llb,
const double *full_uub,
const int *region_shape,
const double *region_llb);
int bssn_gpu_stage_download_region(double *host_ptr,
const int *full_shape,
const double *full_llb,
const double *full_uub,
const int *region_shape,
const double *region_llb);
int bssn_gpu_stage_download_region_to_buffer(const double *host_src_ptr,
const int *full_shape,
const double *full_llb,
const double *full_uub,
const int *region_shape,
const double *region_llb,
double *host_dst_ptr);
int bssn_gpu_stage_upload_buffer_to_region(const double *host_src_ptr,
double *host_dst_ptr,
const int *full_shape,
const double *full_llb,
const double *full_uub,
const int *region_shape,
const double *region_llb);
int bssn_gpu_download_buffer_batch(const int *ex, double **host_ptrs, int num_buffers);
/** Init GPU side data in GPUMeta. */
// void init_fluid_meta_gpu(GPUMeta *gpu_meta);
#endif #endif

View File

@@ -65,9 +65,10 @@ if(TIME_COUNT_EACH_RANK == 1){\
}\ }\
} }
//3---------------------GPU--------------------- //3---------------------GPU---------------------
#define CALLED_BY_STEP 0 #define CALLED_BY_STEP 0
#define CALLED_BY_CONSTRAINT 1 #define CALLED_BY_CONSTRAINT 1
#define CALLED_BY_CONSTRAINT_CONS_ONLY 2
#define RHS_PARA_CALLED_FIRST_TIME cg->shape,TRK4,cg->X[0],cg->X[1],cg->X[2],cg->fgfs[phi0->sgfn],cg->fgfs[trK0->sgfn],cg->fgfs[gxx0->sgfn],cg->fgfs[gxy0->sgfn],cg->fgfs[gxz0->sgfn],cg->fgfs[gyy0->sgfn],cg->fgfs[gyz0->sgfn],cg->fgfs[gzz0->sgfn],cg->fgfs[Axx0->sgfn],cg->fgfs[Axy0->sgfn],cg->fgfs[Axz0->sgfn],cg->fgfs[Ayy0->sgfn],cg->fgfs[Ayz0->sgfn],cg->fgfs[Azz0->sgfn],cg->fgfs[Gmx0->sgfn],cg->fgfs[Gmy0->sgfn],cg->fgfs[Gmz0->sgfn],cg->fgfs[Lap0->sgfn],cg->fgfs[Sfx0->sgfn],cg->fgfs[Sfy0->sgfn],cg->fgfs[Sfz0->sgfn],cg->fgfs[dtSfx0->sgfn],cg->fgfs[dtSfy0->sgfn],cg->fgfs[dtSfz0->sgfn],cg->fgfs[phi_rhs->sgfn],cg->fgfs[trK_rhs->sgfn],cg->fgfs[gxx_rhs->sgfn],cg->fgfs[gxy_rhs->sgfn],cg->fgfs[gxz_rhs->sgfn],cg->fgfs[gyy_rhs->sgfn],cg->fgfs[gyz_rhs->sgfn],cg->fgfs[gzz_rhs->sgfn],cg->fgfs[Axx_rhs->sgfn],cg->fgfs[Axy_rhs->sgfn],cg->fgfs[Axz_rhs->sgfn],cg->fgfs[Ayy_rhs->sgfn],cg->fgfs[Ayz_rhs->sgfn],cg->fgfs[Azz_rhs->sgfn],cg->fgfs[Gmx_rhs->sgfn],cg->fgfs[Gmy_rhs->sgfn],cg->fgfs[Gmz_rhs->sgfn],cg->fgfs[Lap_rhs->sgfn],cg->fgfs[Sfx_rhs->sgfn],cg->fgfs[Sfy_rhs->sgfn],cg->fgfs[Sfz_rhs->sgfn],cg->fgfs[dtSfx_rhs->sgfn],cg->fgfs[dtSfy_rhs->sgfn],cg->fgfs[dtSfz_rhs->sgfn],cg->fgfs[rho->sgfn],cg->fgfs[Sx->sgfn],cg->fgfs[Sy->sgfn],cg->fgfs[Sz->sgfn],cg->fgfs[Sxx->sgfn],cg->fgfs[Sxy->sgfn],cg->fgfs[Sxz->sgfn],cg->fgfs[Syy->sgfn],cg->fgfs[Syz->sgfn],cg->fgfs[Szz->sgfn],cg->fgfs[Gamxxx->sgfn],cg->fgfs[Gamxxy->sgfn],cg->fgfs[Gamxxz->sgfn],cg->fgfs[Gamxyy->sgfn],cg->fgfs[Gamxyz->sgfn],cg->fgfs[Gamxzz->sgfn],cg->fgfs[Gamyxx->sgfn],cg->fgfs[Gamyxy->sgfn],cg->fgfs[Gamyxz->sgfn],cg->fgfs[Gamyyy->sgfn],cg->fgfs[Gamyyz->sgfn],cg->fgfs[Gamyzz->sgfn],cg->fgfs[Gamzxx->sgfn],cg->fgfs[Gamzxy->sgfn],cg->fgfs[Gamzxz->sgfn],cg->fgfs[Gamzyy->sgfn],cg->fgfs[Gamzyz->sgfn],cg->fgfs[Gamzzz->sgfn],cg->fgfs[Rxx->sgfn],cg->fgfs[Rxy->sgfn],cg->fgfs[Rxz->sgfn],cg->fgfs[Ryy->sgfn],cg->fgfs[Ryz->sgfn],cg->fgfs[Rzz->sgfn],cg->fgfs[Cons_Ham->sgfn],cg->fgfs[Cons_Px->sgfn],cg->fgfs[Cons_Py->sgfn],cg->fgfs[Cons_Pz->sgfn],cg->fgfs[Cons_Gx->sgfn],cg->fgfs[Cons_Gy->sgfn],cg->fgfs[Cons_Gz->sgfn],Symmetry,lev,ndeps,pre #define RHS_PARA_CALLED_FIRST_TIME cg->shape,TRK4,cg->X[0],cg->X[1],cg->X[2],cg->fgfs[phi0->sgfn],cg->fgfs[trK0->sgfn],cg->fgfs[gxx0->sgfn],cg->fgfs[gxy0->sgfn],cg->fgfs[gxz0->sgfn],cg->fgfs[gyy0->sgfn],cg->fgfs[gyz0->sgfn],cg->fgfs[gzz0->sgfn],cg->fgfs[Axx0->sgfn],cg->fgfs[Axy0->sgfn],cg->fgfs[Axz0->sgfn],cg->fgfs[Ayy0->sgfn],cg->fgfs[Ayz0->sgfn],cg->fgfs[Azz0->sgfn],cg->fgfs[Gmx0->sgfn],cg->fgfs[Gmy0->sgfn],cg->fgfs[Gmz0->sgfn],cg->fgfs[Lap0->sgfn],cg->fgfs[Sfx0->sgfn],cg->fgfs[Sfy0->sgfn],cg->fgfs[Sfz0->sgfn],cg->fgfs[dtSfx0->sgfn],cg->fgfs[dtSfy0->sgfn],cg->fgfs[dtSfz0->sgfn],cg->fgfs[phi_rhs->sgfn],cg->fgfs[trK_rhs->sgfn],cg->fgfs[gxx_rhs->sgfn],cg->fgfs[gxy_rhs->sgfn],cg->fgfs[gxz_rhs->sgfn],cg->fgfs[gyy_rhs->sgfn],cg->fgfs[gyz_rhs->sgfn],cg->fgfs[gzz_rhs->sgfn],cg->fgfs[Axx_rhs->sgfn],cg->fgfs[Axy_rhs->sgfn],cg->fgfs[Axz_rhs->sgfn],cg->fgfs[Ayy_rhs->sgfn],cg->fgfs[Ayz_rhs->sgfn],cg->fgfs[Azz_rhs->sgfn],cg->fgfs[Gmx_rhs->sgfn],cg->fgfs[Gmy_rhs->sgfn],cg->fgfs[Gmz_rhs->sgfn],cg->fgfs[Lap_rhs->sgfn],cg->fgfs[Sfx_rhs->sgfn],cg->fgfs[Sfy_rhs->sgfn],cg->fgfs[Sfz_rhs->sgfn],cg->fgfs[dtSfx_rhs->sgfn],cg->fgfs[dtSfy_rhs->sgfn],cg->fgfs[dtSfz_rhs->sgfn],cg->fgfs[rho->sgfn],cg->fgfs[Sx->sgfn],cg->fgfs[Sy->sgfn],cg->fgfs[Sz->sgfn],cg->fgfs[Sxx->sgfn],cg->fgfs[Sxy->sgfn],cg->fgfs[Sxz->sgfn],cg->fgfs[Syy->sgfn],cg->fgfs[Syz->sgfn],cg->fgfs[Szz->sgfn],cg->fgfs[Gamxxx->sgfn],cg->fgfs[Gamxxy->sgfn],cg->fgfs[Gamxxz->sgfn],cg->fgfs[Gamxyy->sgfn],cg->fgfs[Gamxyz->sgfn],cg->fgfs[Gamxzz->sgfn],cg->fgfs[Gamyxx->sgfn],cg->fgfs[Gamyxy->sgfn],cg->fgfs[Gamyxz->sgfn],cg->fgfs[Gamyyy->sgfn],cg->fgfs[Gamyyz->sgfn],cg->fgfs[Gamyzz->sgfn],cg->fgfs[Gamzxx->sgfn],cg->fgfs[Gamzxy->sgfn],cg->fgfs[Gamzxz->sgfn],cg->fgfs[Gamzyy->sgfn],cg->fgfs[Gamzyz->sgfn],cg->fgfs[Gamzzz->sgfn],cg->fgfs[Rxx->sgfn],cg->fgfs[Rxy->sgfn],cg->fgfs[Rxz->sgfn],cg->fgfs[Ryy->sgfn],cg->fgfs[Ryz->sgfn],cg->fgfs[Rzz->sgfn],cg->fgfs[Cons_Ham->sgfn],cg->fgfs[Cons_Px->sgfn],cg->fgfs[Cons_Py->sgfn],cg->fgfs[Cons_Pz->sgfn],cg->fgfs[Cons_Gx->sgfn],cg->fgfs[Cons_Gy->sgfn],cg->fgfs[Cons_Gz->sgfn],Symmetry,lev,ndeps,pre

View File

@@ -59,9 +59,10 @@
real*8, dimension(ex(1),ex(2),ex(3)),intent(out) :: Rxx,Rxy,Rxz,Ryy,Ryz,Rzz real*8, dimension(ex(1),ex(2),ex(3)),intent(out) :: Rxx,Rxy,Rxz,Ryy,Ryz,Rzz
real*8,intent(in) :: eps real*8,intent(in) :: eps
real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: ham_Res, movx_Res, movy_Res, movz_Res real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: ham_Res, movx_Res, movy_Res, movz_Res
real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: Gmx_Res, Gmy_Res, Gmz_Res real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: Gmx_Res, Gmy_Res, Gmz_Res
! gont = 0: success; gont = 1: something wrong ! gont = 0: success; gont = 1: something wrong
integer::gont integer::gont
integer :: i,j,k
!~~~~~~> Other variables: !~~~~~~> Other variables:
@@ -83,11 +84,18 @@
real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz
real*8, dimension(ex(1),ex(2),ex(3)) :: gupyy,gupyz,gupzz real*8, dimension(ex(1),ex(2),ex(3)) :: gupyy,gupyz,gupzz
real*8,dimension(3) ::SSS,AAS,ASA,SAA,ASS,SAS,SSA real*8,dimension(3) ::SSS,AAS,ASA,SAA,ASS,SAS,SSA
real*8 :: dX, dY, dZ, PI real*8 :: dX, dY, dZ, PI
real*8, parameter :: ZEO = 0.d0,ONE = 1.D0, TWO = 2.D0, FOUR = 4.D0 real*8 :: divb_loc,det_loc
real*8, parameter :: EIGHT = 8.D0, HALF = 0.5D0, THR = 3.d0 real*8 :: gupxx_loc,gupxy_loc,gupxz_loc,gupyy_loc,gupyz_loc,gupzz_loc
real*8, parameter :: SYM = 1.D0, ANTI= - 1.D0 real*8 :: Rxx_loc,Rxy_loc,Rxz_loc,Ryy_loc,Ryz_loc,Rzz_loc
real*8 :: fxx_loc,fxy_loc,fxz_loc
real*8 :: Gamxa_loc,Gamya_loc,Gamza_loc
real*8 :: f_loc,chin_loc
real*8 :: l_fxx,l_fxy,l_fxz,l_fyy,l_fyz,l_fzz,S_loc
real*8, parameter :: ZEO = 0.d0,ONE = 1.D0, TWO = 2.D0, FOUR = 4.D0
real*8, parameter :: EIGHT = 8.D0, HALF = 0.5D0, THR = 3.d0
real*8, parameter :: SYM = 1.D0, ANTI= - 1.D0
double precision,parameter::FF = 0.75d0,eta=2.d0 double precision,parameter::FF = 0.75d0,eta=2.d0
real*8, parameter :: F1o3 = 1.D0/3.D0, F2o3 = 2.D0/3.D0,F3o2=1.5d0, F1o6 = 1.D0/6.D0 real*8, parameter :: F1o3 = 1.D0/3.D0, F2o3 = 2.D0/3.D0,F3o2=1.5d0, F1o6 = 1.D0/6.D0
real*8, parameter :: F16=1.6d1,F8=8.d0 real*8, parameter :: F16=1.6d1,F8=8.d0
@@ -96,11 +104,11 @@
real*8, dimension(ex(1),ex(2),ex(3)) :: reta real*8, dimension(ex(1),ex(2),ex(3)) :: reta
#endif #endif
#if (GAUGE == 6 || GAUGE == 7) #if (GAUGE == 6 || GAUGE == 7)
integer :: BHN,i,j,k integer :: BHN
real*8, dimension(9) :: Porg real*8, dimension(9) :: Porg
real*8, dimension(3) :: Mass real*8, dimension(3) :: Mass
real*8 :: r1,r2,M,A,w1,w2,C1,C2 real*8 :: r1,r2,M,A,w1,w2,C1,C2
real*8, dimension(ex(1),ex(2),ex(3)) :: reta real*8, dimension(ex(1),ex(2),ex(3)) :: reta
call getpbh(BHN,Porg,Mass) call getpbh(BHN,Porg,Mass)
@@ -145,174 +153,204 @@
dY = Y(2) - Y(1) dY = Y(2) - Y(1)
dZ = Z(2) - Z(1) dZ = Z(2) - Z(1)
alpn1 = Lap + ONE do k=1,ex(3)
chin1 = chi + ONE do j=1,ex(2)
gxx = dxx + ONE do i=1,ex(1)
gyy = dyy + ONE alpn1(i,j,k) = Lap(i,j,k) + ONE
gzz = dzz + ONE chin1(i,j,k) = chi(i,j,k) + ONE
gxx(i,j,k) = dxx(i,j,k) + ONE
gyy(i,j,k) = dyy(i,j,k) + ONE
gzz(i,j,k) = dzz(i,j,k) + ONE
enddo
enddo
enddo
call fderivs(ex,betax,betaxx,betaxy,betaxz,X,Y,Z,ANTI, SYM, SYM,Symmetry,Lev) call fderivs(ex,betax,betaxx,betaxy,betaxz,X,Y,Z,ANTI, SYM, SYM,Symmetry,Lev)
call fderivs(ex,betay,betayx,betayy,betayz,X,Y,Z, SYM,ANTI, SYM,Symmetry,Lev) call fderivs(ex,betay,betayx,betayy,betayz,X,Y,Z, SYM,ANTI, SYM,Symmetry,Lev)
call fderivs(ex,betaz,betazx,betazy,betazz,X,Y,Z, SYM, SYM,ANTI,Symmetry,Lev) call fderivs(ex,betaz,betazx,betazy,betazz,X,Y,Z, SYM, SYM,ANTI,Symmetry,Lev)
div_beta = betaxx + betayy + betazz call fderivs(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)
call fderivs(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)
chi_rhs = F2o3 *chin1*( alpn1 * trK - div_beta ) !rhs for chi call fderivs(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
call fderivs(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev)
call fderivs(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev) call fderivs(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev)
call fderivs(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev) call fderivs(ex,dyy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
call fderivs(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev) call fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev)
call fderivs(ex,dyy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev) call fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
call fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev)
call fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev) do k=1,ex(3)
do j=1,ex(2)
gxx_rhs = - TWO * alpn1 * Axx - F2o3 * gxx * div_beta + & do i=1,ex(1)
TWO *( gxx * betaxx + gxy * betayx + gxz * betazx) divb_loc = betaxx(i,j,k) + betayy(i,j,k) + betazz(i,j,k)
div_beta(i,j,k) = divb_loc
gyy_rhs = - TWO * alpn1 * Ayy - F2o3 * gyy * div_beta + &
TWO *( gxy * betaxy + gyy * betayy + gyz * betazy) chi_rhs(i,j,k) = F2o3 * chin1(i,j,k) * (alpn1(i,j,k) * trK(i,j,k) - divb_loc)
gzz_rhs = - TWO * alpn1 * Azz - F2o3 * gzz * div_beta + & gxx_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Axx(i,j,k) - F2o3 * gxx(i,j,k) * divb_loc + &
TWO *( gxz * betaxz + gyz * betayz + gzz * betazz) TWO * ( gxx(i,j,k) * betaxx(i,j,k) + gxy(i,j,k) * betayx(i,j,k) + gxz(i,j,k) * betazx(i,j,k) )
gxy_rhs = - TWO * alpn1 * Axy + F1o3 * gxy * div_beta + & gyy_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Ayy(i,j,k) - F2o3 * gyy(i,j,k) * divb_loc + &
gxx * betaxy + gxz * betazy + & TWO * ( gxy(i,j,k) * betaxy(i,j,k) + gyy(i,j,k) * betayy(i,j,k) + gyz(i,j,k) * betazy(i,j,k) )
gyy * betayx + gyz * betazx &
- gxy * betazz gzz_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Azz(i,j,k) - F2o3 * gzz(i,j,k) * divb_loc + &
TWO * ( gxz(i,j,k) * betaxz(i,j,k) + gyz(i,j,k) * betayz(i,j,k) + gzz(i,j,k) * betazz(i,j,k) )
gyz_rhs = - TWO * alpn1 * Ayz + F1o3 * gyz * div_beta + &
gxy * betaxz + gyy * betayz + & gxy_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Axy(i,j,k) + F1o3 * gxy(i,j,k) * divb_loc + &
gxz * betaxy + gzz * betazy & gxx(i,j,k) * betaxy(i,j,k) + gxz(i,j,k) * betazy(i,j,k) + gyy(i,j,k) * betayx(i,j,k) + &
- gyz * betaxx gyz(i,j,k) * betazx(i,j,k) - gxy(i,j,k) * betazz(i,j,k)
gxz_rhs = - TWO * alpn1 * Axz + F1o3 * gxz * div_beta + & gyz_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Ayz(i,j,k) + F1o3 * gyz(i,j,k) * divb_loc + &
gxx * betaxz + gxy * betayz + & gxy(i,j,k) * betaxz(i,j,k) + gyy(i,j,k) * betayz(i,j,k) + gxz(i,j,k) * betaxy(i,j,k) + &
gyz * betayx + gzz * betazx & gzz(i,j,k) * betazy(i,j,k) - gyz(i,j,k) * betaxx(i,j,k)
- gxz * betayy !rhs for gij
gxz_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Axz(i,j,k) + F1o3 * gxz(i,j,k) * divb_loc + &
! invert tilted metric gxx(i,j,k) * betaxz(i,j,k) + gxy(i,j,k) * betayz(i,j,k) + gyz(i,j,k) * betayx(i,j,k) + &
gupzz = gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - & gzz(i,j,k) * betazx(i,j,k) - gxz(i,j,k) * betayy(i,j,k)
gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz
gupxx = ( gyy * gzz - gyz * gyz ) / gupzz det_loc = gxx(i,j,k) * gyy(i,j,k) * gzz(i,j,k) + gxy(i,j,k) * gyz(i,j,k) * gxz(i,j,k) + &
gupxy = - ( gxy * gzz - gyz * gxz ) / gupzz gxz(i,j,k) * gxy(i,j,k) * gyz(i,j,k) - gxz(i,j,k) * gyy(i,j,k) * gxz(i,j,k) - &
gupxz = ( gxy * gyz - gyy * gxz ) / gupzz gxy(i,j,k) * gxy(i,j,k) * gzz(i,j,k) - gxx(i,j,k) * gyz(i,j,k) * gyz(i,j,k)
gupyy = ( gxx * gzz - gxz * gxz ) / gupzz gupxx_loc = ( gyy(i,j,k) * gzz(i,j,k) - gyz(i,j,k) * gyz(i,j,k) ) / det_loc
gupyz = - ( gxx * gyz - gxy * gxz ) / gupzz gupxy_loc = - ( gxy(i,j,k) * gzz(i,j,k) - gyz(i,j,k) * gxz(i,j,k) ) / det_loc
gupzz = ( gxx * gyy - gxy * gxy ) / gupzz gupxz_loc = ( gxy(i,j,k) * gyz(i,j,k) - gyy(i,j,k) * gxz(i,j,k) ) / det_loc
gupyy_loc = ( gxx(i,j,k) * gzz(i,j,k) - gxz(i,j,k) * gxz(i,j,k) ) / det_loc
if(co == 0)then gupyz_loc = - ( gxx(i,j,k) * gyz(i,j,k) - gxy(i,j,k) * gxz(i,j,k) ) / det_loc
! Gam^i_Res = Gam^i + gup^ij_,j gupzz_loc = ( gxx(i,j,k) * gyy(i,j,k) - gxy(i,j,k) * gxy(i,j,k) ) / det_loc
Gmx_Res = Gamx - (gupxx*(gupxx*gxxx+gupxy*gxyx+gupxz*gxzx)& gupxx(i,j,k) = gupxx_loc
+gupxy*(gupxx*gxyx+gupxy*gyyx+gupxz*gyzx)& gupxy(i,j,k) = gupxy_loc
+gupxz*(gupxx*gxzx+gupxy*gyzx+gupxz*gzzx)& gupxz(i,j,k) = gupxz_loc
+gupxx*(gupxy*gxxy+gupyy*gxyy+gupyz*gxzy)& gupyy(i,j,k) = gupyy_loc
+gupxy*(gupxy*gxyy+gupyy*gyyy+gupyz*gyzy)& gupyz(i,j,k) = gupyz_loc
+gupxz*(gupxy*gxzy+gupyy*gyzy+gupyz*gzzy)& gupzz(i,j,k) = gupzz_loc
+gupxx*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)&
+gupxy*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)& if(co == 0)then
+gupxz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz)) Gmx_Res(i,j,k) = Gamx(i,j,k) - ( &
Gmy_Res = Gamy - (gupxx*(gupxy*gxxx+gupyy*gxyx+gupyz*gxzx)& gupxx_loc*(gupxx_loc*gxxx(i,j,k)+gupxy_loc*gxyx(i,j,k)+gupxz_loc*gxzx(i,j,k)) + &
+gupxy*(gupxy*gxyx+gupyy*gyyx+gupyz*gyzx)& gupxy_loc*(gupxx_loc*gxyx(i,j,k)+gupxy_loc*gyyx(i,j,k)+gupxz_loc*gyzx(i,j,k)) + &
+gupxz*(gupxy*gxzx+gupyy*gyzx+gupyz*gzzx)& gupxz_loc*(gupxx_loc*gxzx(i,j,k)+gupxy_loc*gyzx(i,j,k)+gupxz_loc*gzzx(i,j,k)) + &
+gupxy*(gupxy*gxxy+gupyy*gxyy+gupyz*gxzy)& gupxx_loc*(gupxy_loc*gxxy(i,j,k)+gupyy_loc*gxyy(i,j,k)+gupyz_loc*gxzy(i,j,k)) + &
+gupyy*(gupxy*gxyy+gupyy*gyyy+gupyz*gyzy)& gupxy_loc*(gupxy_loc*gxyy(i,j,k)+gupyy_loc*gyyy(i,j,k)+gupyz_loc*gyzy(i,j,k)) + &
+gupyz*(gupxy*gxzy+gupyy*gyzy+gupyz*gzzy)& gupxz_loc*(gupxy_loc*gxzy(i,j,k)+gupyy_loc*gyzy(i,j,k)+gupyz_loc*gzzy(i,j,k)) + &
+gupxy*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)& gupxx_loc*(gupxz_loc*gxxz(i,j,k)+gupyz_loc*gxyz(i,j,k)+gupzz_loc*gxzz(i,j,k)) + &
+gupyy*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)& gupxy_loc*(gupxz_loc*gxyz(i,j,k)+gupyz_loc*gyyz(i,j,k)+gupzz_loc*gyzz(i,j,k)) + &
+gupyz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz)) gupxz_loc*(gupxz_loc*gxzz(i,j,k)+gupyz_loc*gyzz(i,j,k)+gupzz_loc*gzzz(i,j,k)))
Gmz_Res = Gamz - (gupxx*(gupxz*gxxx+gupyz*gxyx+gupzz*gxzx)& Gmy_Res(i,j,k) = Gamy(i,j,k) - ( &
+gupxy*(gupxz*gxyx+gupyz*gyyx+gupzz*gyzx)& gupxx_loc*(gupxy_loc*gxxx(i,j,k)+gupyy_loc*gxyx(i,j,k)+gupyz_loc*gxzx(i,j,k)) + &
+gupxz*(gupxz*gxzx+gupyz*gyzx+gupzz*gzzx)& gupxy_loc*(gupxy_loc*gxyx(i,j,k)+gupyy_loc*gyyx(i,j,k)+gupyz_loc*gyzx(i,j,k)) + &
+gupxy*(gupxz*gxxy+gupyz*gxyy+gupzz*gxzy)& gupxz_loc*(gupxy_loc*gxzx(i,j,k)+gupyy_loc*gyzx(i,j,k)+gupyz_loc*gzzx(i,j,k)) + &
+gupyy*(gupxz*gxyy+gupyz*gyyy+gupzz*gyzy)& gupxy_loc*(gupxy_loc*gxxy(i,j,k)+gupyy_loc*gxyy(i,j,k)+gupyz_loc*gxzy(i,j,k)) + &
+gupyz*(gupxz*gxzy+gupyz*gyzy+gupzz*gzzy)& gupyy_loc*(gupxy_loc*gxyy(i,j,k)+gupyy_loc*gyyy(i,j,k)+gupyz_loc*gyzy(i,j,k)) + &
+gupxz*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)& gupyz_loc*(gupxy_loc*gxzy(i,j,k)+gupyy_loc*gyzy(i,j,k)+gupyz_loc*gzzy(i,j,k)) + &
+gupyz*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)& gupxy_loc*(gupxz_loc*gxxz(i,j,k)+gupyz_loc*gxyz(i,j,k)+gupzz_loc*gxzz(i,j,k)) + &
+gupzz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz)) gupyy_loc*(gupxz_loc*gxyz(i,j,k)+gupyz_loc*gyyz(i,j,k)+gupzz_loc*gyzz(i,j,k)) + &
endif gupyz_loc*(gupxz_loc*gxzz(i,j,k)+gupyz_loc*gyzz(i,j,k)+gupzz_loc*gzzz(i,j,k)))
Gmz_Res(i,j,k) = Gamz(i,j,k) - ( &
! second kind of connection gupxx_loc*(gupxz_loc*gxxx(i,j,k)+gupyz_loc*gxyx(i,j,k)+gupzz_loc*gxzx(i,j,k)) + &
Gamxxx =HALF*( gupxx*gxxx + gupxy*(TWO*gxyx - gxxy ) + gupxz*(TWO*gxzx - gxxz )) gupxy_loc*(gupxz_loc*gxyx(i,j,k)+gupyz_loc*gyyx(i,j,k)+gupzz_loc*gyzx(i,j,k)) + &
Gamyxx =HALF*( gupxy*gxxx + gupyy*(TWO*gxyx - gxxy ) + gupyz*(TWO*gxzx - gxxz )) gupxz_loc*(gupxz_loc*gxzx(i,j,k)+gupyz_loc*gyzx(i,j,k)+gupzz_loc*gzzx(i,j,k)) + &
Gamzxx =HALF*( gupxz*gxxx + gupyz*(TWO*gxyx - gxxy ) + gupzz*(TWO*gxzx - gxxz )) gupxy_loc*(gupxz_loc*gxxy(i,j,k)+gupyz_loc*gxyy(i,j,k)+gupzz_loc*gxzy(i,j,k)) + &
gupyy_loc*(gupxz_loc*gxyy(i,j,k)+gupyz_loc*gyyy(i,j,k)+gupzz_loc*gyzy(i,j,k)) + &
Gamxyy =HALF*( gupxx*(TWO*gxyy - gyyx ) + gupxy*gyyy + gupxz*(TWO*gyzy - gyyz )) gupyz_loc*(gupxz_loc*gxzy(i,j,k)+gupyz_loc*gyzy(i,j,k)+gupzz_loc*gzzy(i,j,k)) + &
Gamyyy =HALF*( gupxy*(TWO*gxyy - gyyx ) + gupyy*gyyy + gupyz*(TWO*gyzy - gyyz )) gupxz_loc*(gupxz_loc*gxxz(i,j,k)+gupyz_loc*gxyz(i,j,k)+gupzz_loc*gxzz(i,j,k)) + &
Gamzyy =HALF*( gupxz*(TWO*gxyy - gyyx ) + gupyz*gyyy + gupzz*(TWO*gyzy - gyyz )) gupyz_loc*(gupxz_loc*gxyz(i,j,k)+gupyz_loc*gyyz(i,j,k)+gupzz_loc*gyzz(i,j,k)) + &
gupzz_loc*(gupxz_loc*gxzz(i,j,k)+gupyz_loc*gyzz(i,j,k)+gupzz_loc*gzzz(i,j,k)))
Gamxzz =HALF*( gupxx*(TWO*gxzz - gzzx ) + gupxy*(TWO*gyzz - gzzy ) + gupxz*gzzz) endif
Gamyzz =HALF*( gupxy*(TWO*gxzz - gzzx ) + gupyy*(TWO*gyzz - gzzy ) + gupyz*gzzz)
Gamzzz =HALF*( gupxz*(TWO*gxzz - gzzx ) + gupyz*(TWO*gyzz - gzzy ) + gupzz*gzzz) Gamxxx(i,j,k)=HALF*( gupxx_loc*gxxx(i,j,k) + gupxy_loc*(TWO*gxyx(i,j,k) - gxxy(i,j,k)) + gupxz_loc*(TWO*gxzx(i,j,k) - gxxz(i,j,k)))
Gamyxx(i,j,k)=HALF*( gupxy_loc*gxxx(i,j,k) + gupyy_loc*(TWO*gxyx(i,j,k) - gxxy(i,j,k)) + gupyz_loc*(TWO*gxzx(i,j,k) - gxxz(i,j,k)))
Gamxxy =HALF*( gupxx*gxxy + gupxy*gyyx + gupxz*( gxzy + gyzx - gxyz ) ) Gamzxx(i,j,k)=HALF*( gupxz_loc*gxxx(i,j,k) + gupyz_loc*(TWO*gxyx(i,j,k) - gxxy(i,j,k)) + gupzz_loc*(TWO*gxzx(i,j,k) - gxxz(i,j,k)))
Gamyxy =HALF*( gupxy*gxxy + gupyy*gyyx + gupyz*( gxzy + gyzx - gxyz ) )
Gamzxy =HALF*( gupxz*gxxy + gupyz*gyyx + gupzz*( gxzy + gyzx - gxyz ) ) Gamxyy(i,j,k)=HALF*( gupxx_loc*(TWO*gxyy(i,j,k) - gyyx(i,j,k)) + gupxy_loc*gyyy(i,j,k) + gupxz_loc*(TWO*gyzy(i,j,k) - gyyz(i,j,k)))
Gamyyy(i,j,k)=HALF*( gupxy_loc*(TWO*gxyy(i,j,k) - gyyx(i,j,k)) + gupyy_loc*gyyy(i,j,k) + gupyz_loc*(TWO*gyzy(i,j,k) - gyyz(i,j,k)))
Gamxxz =HALF*( gupxx*gxxz + gupxy*( gxyz + gyzx - gxzy ) + gupxz*gzzx ) Gamzyy(i,j,k)=HALF*( gupxz_loc*(TWO*gxyy(i,j,k) - gyyx(i,j,k)) + gupyz_loc*gyyy(i,j,k) + gupzz_loc*(TWO*gyzy(i,j,k) - gyyz(i,j,k)))
Gamyxz =HALF*( gupxy*gxxz + gupyy*( gxyz + gyzx - gxzy ) + gupyz*gzzx )
Gamzxz =HALF*( gupxz*gxxz + gupyz*( gxyz + gyzx - gxzy ) + gupzz*gzzx ) Gamxzz(i,j,k)=HALF*( gupxx_loc*(TWO*gxzz(i,j,k) - gzzx(i,j,k)) + gupxy_loc*(TWO*gyzz(i,j,k) - gzzy(i,j,k)) + gupxz_loc*gzzz(i,j,k))
Gamyzz(i,j,k)=HALF*( gupxy_loc*(TWO*gxzz(i,j,k) - gzzx(i,j,k)) + gupyy_loc*(TWO*gyzz(i,j,k) - gzzy(i,j,k)) + gupyz_loc*gzzz(i,j,k))
Gamxyz =HALF*( gupxx*( gxyz + gxzy - gyzx ) + gupxy*gyyz + gupxz*gzzy ) Gamzzz(i,j,k)=HALF*( gupxz_loc*(TWO*gxzz(i,j,k) - gzzx(i,j,k)) + gupyz_loc*(TWO*gyzz(i,j,k) - gzzy(i,j,k)) + gupzz_loc*gzzz(i,j,k))
Gamyyz =HALF*( gupxy*( gxyz + gxzy - gyzx ) + gupyy*gyyz + gupyz*gzzy )
Gamzyz =HALF*( gupxz*( gxyz + gxzy - gyzx ) + gupyz*gyyz + gupzz*gzzy ) Gamxxy(i,j,k)=HALF*( gupxx_loc*gxxy(i,j,k) + gupxy_loc*gyyx(i,j,k) + gupxz_loc*(gxzy(i,j,k) + gyzx(i,j,k) - gxyz(i,j,k)) )
! Raise indices of \tilde A_{ij} and store in R_ij Gamyxy(i,j,k)=HALF*( gupxy_loc*gxxy(i,j,k) + gupyy_loc*gyyx(i,j,k) + gupyz_loc*(gxzy(i,j,k) + gyzx(i,j,k) - gxyz(i,j,k)) )
Gamzxy(i,j,k)=HALF*( gupxz_loc*gxxy(i,j,k) + gupyz_loc*gyyx(i,j,k) + gupzz_loc*(gxzy(i,j,k) + gyzx(i,j,k) - gxyz(i,j,k)) )
Rxx = gupxx * gupxx * Axx + gupxy * gupxy * Ayy + gupxz * gupxz * Azz + &
TWO*(gupxx * gupxy * Axy + gupxx * gupxz * Axz + gupxy * gupxz * Ayz) Gamxxz(i,j,k)=HALF*( gupxx_loc*gxxz(i,j,k) + gupxy_loc*(gxyz(i,j,k) + gyzx(i,j,k) - gxzy(i,j,k)) + gupxz_loc*gzzx(i,j,k) )
Gamyxz(i,j,k)=HALF*( gupxy_loc*gxxz(i,j,k) + gupyy_loc*(gxyz(i,j,k) + gyzx(i,j,k) - gxzy(i,j,k)) + gupyz_loc*gzzx(i,j,k) )
Ryy = gupxy * gupxy * Axx + gupyy * gupyy * Ayy + gupyz * gupyz * Azz + & Gamzxz(i,j,k)=HALF*( gupxz_loc*gxxz(i,j,k) + gupyz_loc*(gxyz(i,j,k) + gyzx(i,j,k) - gxzy(i,j,k)) + gupzz_loc*gzzx(i,j,k) )
TWO*(gupxy * gupyy * Axy + gupxy * gupyz * Axz + gupyy * gupyz * Ayz)
Gamxyz(i,j,k)=HALF*( gupxx_loc*(gxyz(i,j,k) + gxzy(i,j,k) - gyzx(i,j,k)) + gupxy_loc*gyyz(i,j,k) + gupxz_loc*gzzy(i,j,k) )
Rzz = gupxz * gupxz * Axx + gupyz * gupyz * Ayy + gupzz * gupzz * Azz + & Gamyyz(i,j,k)=HALF*( gupxy_loc*(gxyz(i,j,k) + gxzy(i,j,k) - gyzx(i,j,k)) + gupyy_loc*gyyz(i,j,k) + gupyz_loc*gzzy(i,j,k) )
TWO*(gupxz * gupyz * Axy + gupxz * gupzz * Axz + gupyz * gupzz * Ayz) Gamzyz(i,j,k)=HALF*( gupxz_loc*(gxyz(i,j,k) + gxzy(i,j,k) - gyzx(i,j,k)) + gupyz_loc*gyyz(i,j,k) + gupzz_loc*gzzy(i,j,k) )
enddo
Rxy = gupxx * gupxy * Axx + gupxy * gupyy * Ayy + gupxz * gupyz * Azz + & enddo
(gupxx * gupyy + gupxy * gupxy)* Axy + & enddo
(gupxx * gupyz + gupxz * gupxy)* Axz + & ! Raise indices of \tilde A_{ij} and store in R_ij
(gupxy * gupyz + gupxz * gupyy)* Ayz
! Right hand side for Gam^i without shift terms...
Rxz = gupxx * gupxz * Axx + gupxy * gupyz * Ayy + gupxz * gupzz * Azz + & call fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
(gupxx * gupyz + gupxy * gupxz)* Axy + & call fderivs(ex,trK,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)
(gupxx * gupzz + gupxz * gupxz)* Axz + & do k=1,ex(3)
(gupxy * gupzz + gupxz * gupyz)* Ayz do j=1,ex(2)
do i=1,ex(1)
Ryz = gupxy * gupxz * Axx + gupyy * gupyz * Ayy + gupyz * gupzz * Azz + & gupxx_loc = gupxx(i,j,k)
(gupxy * gupyz + gupyy * gupxz)* Axy + & gupxy_loc = gupxy(i,j,k)
(gupxy * gupzz + gupyz * gupxz)* Axz + & gupxz_loc = gupxz(i,j,k)
(gupyy * gupzz + gupyz * gupyz)* Ayz gupyy_loc = gupyy(i,j,k)
gupyz_loc = gupyz(i,j,k)
! Right hand side for Gam^i without shift terms... gupzz_loc = gupzz(i,j,k)
call fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
call fderivs(ex,trK,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev) Rxx_loc = gupxx_loc * gupxx_loc * Axx(i,j,k) + gupxy_loc * gupxy_loc * Ayy(i,j,k) + gupxz_loc * gupxz_loc * Azz(i,j,k) + &
TWO * (gupxx_loc * gupxy_loc * Axy(i,j,k) + gupxx_loc * gupxz_loc * Axz(i,j,k) + gupxy_loc * gupxz_loc * Ayz(i,j,k))
Gamx_rhs = - TWO * ( Lapx * Rxx + Lapy * Rxy + Lapz * Rxz ) + & Ryy_loc = gupxy_loc * gupxy_loc * Axx(i,j,k) + gupyy_loc * gupyy_loc * Ayy(i,j,k) + gupyz_loc * gupyz_loc * Azz(i,j,k) + &
TWO * alpn1 * ( & TWO * (gupxy_loc * gupyy_loc * Axy(i,j,k) + gupxy_loc * gupyz_loc * Axz(i,j,k) + gupyy_loc * gupyz_loc * Ayz(i,j,k))
-F3o2/chin1 * ( chix * Rxx + chiy * Rxy + chiz * Rxz ) - & Rzz_loc = gupxz_loc * gupxz_loc * Axx(i,j,k) + gupyz_loc * gupyz_loc * Ayy(i,j,k) + gupzz_loc * gupzz_loc * Azz(i,j,k) + &
gupxx * ( F2o3 * Kx + EIGHT * PI * Sx ) - & TWO * (gupxz_loc * gupyz_loc * Axy(i,j,k) + gupxz_loc * gupzz_loc * Axz(i,j,k) + gupyz_loc * gupzz_loc * Ayz(i,j,k))
gupxy * ( F2o3 * Ky + EIGHT * PI * Sy ) - & Rxy_loc = gupxx_loc * gupxy_loc * Axx(i,j,k) + gupxy_loc * gupyy_loc * Ayy(i,j,k) + gupxz_loc * gupyz_loc * Azz(i,j,k) + &
gupxz * ( F2o3 * Kz + EIGHT * PI * Sz ) + & (gupxx_loc * gupyy_loc + gupxy_loc * gupxy_loc) * Axy(i,j,k) + &
Gamxxx * Rxx + Gamxyy * Ryy + Gamxzz * Rzz + & (gupxx_loc * gupyz_loc + gupxz_loc * gupxy_loc) * Axz(i,j,k) + &
TWO * ( Gamxxy * Rxy + Gamxxz * Rxz + Gamxyz * Ryz ) ) (gupxy_loc * gupyz_loc + gupxz_loc * gupyy_loc) * Ayz(i,j,k)
Rxz_loc = gupxx_loc * gupxz_loc * Axx(i,j,k) + gupxy_loc * gupyz_loc * Ayy(i,j,k) + gupxz_loc * gupzz_loc * Azz(i,j,k) + &
Gamy_rhs = - TWO * ( Lapx * Rxy + Lapy * Ryy + Lapz * Ryz ) + & (gupxx_loc * gupyz_loc + gupxy_loc * gupxz_loc) * Axy(i,j,k) + &
TWO * alpn1 * ( & (gupxx_loc * gupzz_loc + gupxz_loc * gupxz_loc) * Axz(i,j,k) + &
-F3o2/chin1 * ( chix * Rxy + chiy * Ryy + chiz * Ryz ) - & (gupxy_loc * gupzz_loc + gupxz_loc * gupyz_loc) * Ayz(i,j,k)
gupxy * ( F2o3 * Kx + EIGHT * PI * Sx ) - & Ryz_loc = gupxy_loc * gupxz_loc * Axx(i,j,k) + gupyy_loc * gupyz_loc * Ayy(i,j,k) + gupyz_loc * gupzz_loc * Azz(i,j,k) + &
gupyy * ( F2o3 * Ky + EIGHT * PI * Sy ) - & (gupxy_loc * gupyz_loc + gupyy_loc * gupxz_loc) * Axy(i,j,k) + &
gupyz * ( F2o3 * Kz + EIGHT * PI * Sz ) + & (gupxy_loc * gupzz_loc + gupyz_loc * gupxz_loc) * Axz(i,j,k) + &
Gamyxx * Rxx + Gamyyy * Ryy + Gamyzz * Rzz + & (gupyy_loc * gupzz_loc + gupyz_loc * gupyz_loc) * Ayz(i,j,k)
TWO * ( Gamyxy * Rxy + Gamyxz * Rxz + Gamyyz * Ryz ) ) Rxx(i,j,k) = Rxx_loc
Ryy(i,j,k) = Ryy_loc
Gamz_rhs = - TWO * ( Lapx * Rxz + Lapy * Ryz + Lapz * Rzz ) + & Rzz(i,j,k) = Rzz_loc
TWO * alpn1 * ( & Rxy(i,j,k) = Rxy_loc
-F3o2/chin1 * ( chix * Rxz + chiy * Ryz + chiz * Rzz ) - & Rxz(i,j,k) = Rxz_loc
gupxz * ( F2o3 * Kx + EIGHT * PI * Sx ) - & Ryz(i,j,k) = Ryz_loc
gupyz * ( F2o3 * Ky + EIGHT * PI * Sy ) - &
gupzz * ( F2o3 * Kz + EIGHT * PI * Sz ) + & Gamx_rhs(i,j,k) = - TWO * (Lapx(i,j,k) * Rxx_loc + Lapy(i,j,k) * Rxy_loc + Lapz(i,j,k) * Rxz_loc) + &
Gamzxx * Rxx + Gamzyy * Ryy + Gamzzz * Rzz + & TWO * alpn1(i,j,k) * ( &
TWO * ( Gamzxy * Rxy + Gamzxz * Rxz + Gamzyz * Ryz ) ) -F3o2/chin1(i,j,k) * (chix(i,j,k) * Rxx_loc + chiy(i,j,k) * Rxy_loc + chiz(i,j,k) * Rxz_loc) - &
gupxx_loc * (F2o3 * Kx(i,j,k) + EIGHT * PI * Sx(i,j,k)) - &
gupxy_loc * (F2o3 * Ky(i,j,k) + EIGHT * PI * Sy(i,j,k)) - &
gupxz_loc * (F2o3 * Kz(i,j,k) + EIGHT * PI * Sz(i,j,k)) + &
Gamxxx(i,j,k) * Rxx_loc + Gamxyy(i,j,k) * Ryy_loc + Gamxzz(i,j,k) * Rzz_loc + &
TWO * (Gamxxy(i,j,k) * Rxy_loc + Gamxxz(i,j,k) * Rxz_loc + Gamxyz(i,j,k) * Ryz_loc))
Gamy_rhs(i,j,k) = - TWO * (Lapx(i,j,k) * Rxy_loc + Lapy(i,j,k) * Ryy_loc + Lapz(i,j,k) * Ryz_loc) + &
TWO * alpn1(i,j,k) * ( &
-F3o2/chin1(i,j,k) * (chix(i,j,k) * Rxy_loc + chiy(i,j,k) * Ryy_loc + chiz(i,j,k) * Ryz_loc) - &
gupxy_loc * (F2o3 * Kx(i,j,k) + EIGHT * PI * Sx(i,j,k)) - &
gupyy_loc * (F2o3 * Ky(i,j,k) + EIGHT * PI * Sy(i,j,k)) - &
gupyz_loc * (F2o3 * Kz(i,j,k) + EIGHT * PI * Sz(i,j,k)) + &
Gamyxx(i,j,k) * Rxx_loc + Gamyyy(i,j,k) * Ryy_loc + Gamyzz(i,j,k) * Rzz_loc + &
TWO * (Gamyxy(i,j,k) * Rxy_loc + Gamyxz(i,j,k) * Rxz_loc + Gamyyz(i,j,k) * Ryz_loc))
Gamz_rhs(i,j,k) = - TWO * (Lapx(i,j,k) * Rxz_loc + Lapy(i,j,k) * Ryz_loc + Lapz(i,j,k) * Rzz_loc) + &
TWO * alpn1(i,j,k) * ( &
-F3o2/chin1(i,j,k) * (chix(i,j,k) * Rxz_loc + chiy(i,j,k) * Ryz_loc + chiz(i,j,k) * Rzz_loc) - &
gupxz_loc * (F2o3 * Kx(i,j,k) + EIGHT * PI * Sx(i,j,k)) - &
gupyz_loc * (F2o3 * Ky(i,j,k) + EIGHT * PI * Sy(i,j,k)) - &
gupzz_loc * (F2o3 * Kz(i,j,k) + EIGHT * PI * Sz(i,j,k)) + &
Gamzxx(i,j,k) * Rxx_loc + Gamzyy(i,j,k) * Ryy_loc + Gamzzz(i,j,k) * Rzz_loc + &
TWO * (Gamzxy(i,j,k) * Rxy_loc + Gamzxz(i,j,k) * Rxz_loc + Gamzyz(i,j,k) * Ryz_loc))
enddo
enddo
enddo
call fdderivs(ex,betax,gxxx,gxyx,gxzx,gyyx,gyzx,gzzx,& call fdderivs(ex,betax,gxxx,gxyx,gxzx,gyyx,gyzx,gzzx,&
X,Y,Z,ANTI,SYM, SYM ,Symmetry,Lev) X,Y,Z,ANTI,SYM, SYM ,Symmetry,Lev)
@@ -321,38 +359,54 @@
call fdderivs(ex,betaz,gxxz,gxyz,gxzz,gyyz,gyzz,gzzz,& call fdderivs(ex,betaz,gxxz,gxyz,gxzz,gyyz,gyzz,gzzz,&
X,Y,Z,SYM ,SYM, ANTI,Symmetry,Lev) X,Y,Z,SYM ,SYM, ANTI,Symmetry,Lev)
fxx = gxxx + gxyy + gxzz call fderivs(ex,Gamx,Gamxx,Gamxy,Gamxz,X,Y,Z,ANTI,SYM ,SYM ,Symmetry,Lev)
fxy = gxyx + gyyy + gyzz call fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev)
fxz = gxzx + gyzy + gzzz call fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev)
do k=1,ex(3)
Gamxa = gupxx * Gamxxx + gupyy * Gamxyy + gupzz * Gamxzz + & do j=1,ex(2)
TWO*( gupxy * Gamxxy + gupxz * Gamxxz + gupyz * Gamxyz ) do i=1,ex(1)
Gamya = gupxx * Gamyxx + gupyy * Gamyyy + gupzz * Gamyzz + & divb_loc = div_beta(i,j,k)
TWO*( gupxy * Gamyxy + gupxz * Gamyxz + gupyz * Gamyyz ) fxx_loc = gxxx(i,j,k) + gxyy(i,j,k) + gxzz(i,j,k)
Gamza = gupxx * Gamzxx + gupyy * Gamzyy + gupzz * Gamzzz + & fxy_loc = gxyx(i,j,k) + gyyy(i,j,k) + gyzz(i,j,k)
TWO*( gupxy * Gamzxy + gupxz * Gamzxz + gupyz * Gamzyz ) fxz_loc = gxzx(i,j,k) + gyzy(i,j,k) + gzzz(i,j,k)
call fderivs(ex,Gamx,Gamxx,Gamxy,Gamxz,X,Y,Z,ANTI,SYM ,SYM ,Symmetry,Lev) gupxx_loc = gupxx(i,j,k)
call fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev) gupxy_loc = gupxy(i,j,k)
call fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev) gupxz_loc = gupxz(i,j,k)
gupyy_loc = gupyy(i,j,k)
Gamx_rhs = Gamx_rhs + F2o3 * Gamxa * div_beta - & gupyz_loc = gupyz(i,j,k)
Gamxa * betaxx - Gamya * betaxy - Gamza * betaxz + & gupzz_loc = gupzz(i,j,k)
F1o3 * (gupxx * fxx + gupxy * fxy + gupxz * fxz ) + &
gupxx * gxxx + gupyy * gyyx + gupzz * gzzx + & Gamxa_loc = gupxx_loc * Gamxxx(i,j,k) + gupyy_loc * Gamxyy(i,j,k) + gupzz_loc * Gamxzz(i,j,k) + &
TWO * (gupxy * gxyx + gupxz * gxzx + gupyz * gyzx ) TWO * (gupxy_loc * Gamxxy(i,j,k) + gupxz_loc * Gamxxz(i,j,k) + gupyz_loc * Gamxyz(i,j,k))
Gamya_loc = gupxx_loc * Gamyxx(i,j,k) + gupyy_loc * Gamyyy(i,j,k) + gupzz_loc * Gamyzz(i,j,k) + &
Gamy_rhs = Gamy_rhs + F2o3 * Gamya * div_beta - & TWO * (gupxy_loc * Gamyxy(i,j,k) + gupxz_loc * Gamyxz(i,j,k) + gupyz_loc * Gamyyz(i,j,k))
Gamxa * betayx - Gamya * betayy - Gamza * betayz + & Gamza_loc = gupxx_loc * Gamzxx(i,j,k) + gupyy_loc * Gamzyy(i,j,k) + gupzz_loc * Gamzzz(i,j,k) + &
F1o3 * (gupxy * fxx + gupyy * fxy + gupyz * fxz ) + & TWO * (gupxy_loc * Gamzxy(i,j,k) + gupxz_loc * Gamzxz(i,j,k) + gupyz_loc * Gamzyz(i,j,k))
gupxx * gxxy + gupyy * gyyy + gupzz * gzzy + & Gamxa(i,j,k) = Gamxa_loc
TWO * (gupxy * gxyy + gupxz * gxzy + gupyz * gyzy ) Gamya(i,j,k) = Gamya_loc
Gamza(i,j,k) = Gamza_loc
Gamz_rhs = Gamz_rhs + F2o3 * Gamza * div_beta - &
Gamxa * betazx - Gamya * betazy - Gamza * betazz + & Gamx_rhs(i,j,k) = Gamx_rhs(i,j,k) + F2o3 * Gamxa_loc * divb_loc - &
F1o3 * (gupxz * fxx + gupyz * fxy + gupzz * fxz ) + & Gamxa_loc * betaxx(i,j,k) - Gamya_loc * betaxy(i,j,k) - Gamza_loc * betaxz(i,j,k) + &
gupxx * gxxz + gupyy * gyyz + gupzz * gzzz + & F1o3 * (gupxx_loc * fxx_loc + gupxy_loc * fxy_loc + gupxz_loc * fxz_loc) + &
TWO * (gupxy * gxyz + gupxz * gxzz + gupyz * gyzz ) !rhs for Gam^i gupxx_loc * gxxx(i,j,k) + gupyy_loc * gyyx(i,j,k) + gupzz_loc * gzzx(i,j,k) + &
TWO * (gupxy_loc * gxyx(i,j,k) + gupxz_loc * gxzx(i,j,k) + gupyz_loc * gyzx(i,j,k))
Gamy_rhs(i,j,k) = Gamy_rhs(i,j,k) + F2o3 * Gamya_loc * divb_loc - &
Gamxa_loc * betayx(i,j,k) - Gamya_loc * betayy(i,j,k) - Gamza_loc * betayz(i,j,k) + &
F1o3 * (gupxy_loc * fxx_loc + gupyy_loc * fxy_loc + gupyz_loc * fxz_loc) + &
gupxx_loc * gxxy(i,j,k) + gupyy_loc * gyyy(i,j,k) + gupzz_loc * gzzy(i,j,k) + &
TWO * (gupxy_loc * gxyy(i,j,k) + gupxz_loc * gxzy(i,j,k) + gupyz_loc * gyzy(i,j,k))
Gamz_rhs(i,j,k) = Gamz_rhs(i,j,k) + F2o3 * Gamza_loc * divb_loc - &
Gamxa_loc * betazx(i,j,k) - Gamya_loc * betazy(i,j,k) - Gamza_loc * betazz(i,j,k) + &
F1o3 * (gupxz_loc * fxx_loc + gupyz_loc * fxy_loc + gupzz_loc * fxz_loc) + &
gupxx_loc * gxxz(i,j,k) + gupyy_loc * gyyz(i,j,k) + gupzz_loc * gzzz(i,j,k) + &
TWO * (gupxy_loc * gxyz(i,j,k) + gupxz_loc * gxzz(i,j,k) + gupyz_loc * gyzz(i,j,k))
enddo
enddo
enddo
!first kind of connection stored in gij,k !first kind of connection stored in gij,k
gxxx = gxx * Gamxxx + gxy * Gamyxx + gxz * Gamzxx gxxx = gxx * Gamxxx + gxy * Gamyxx + gxz * Gamzxx
@@ -601,192 +655,190 @@
Gamxyz * gxzz + Gamyyz * gyzz + Gamzyz * gzzz + & Gamxyz * gxzz + Gamyyz * gyzz + Gamzyz * gzzz + &
Gamxzz * gxzy + Gamyzz * gyzy + Gamzzz * gzzy + & Gamxzz * gxzy + Gamyzz * gyzy + Gamzzz * gzzy + &
Gamxyz * gzzx + Gamyyz * gzzy + Gamzyz * gzzz ) Gamxyz * gzzx + Gamyyz * gzzy + Gamzyz * gzzz )
!covariant second derivative of chi respect to tilted metric !covariant second derivative of chi respect to tilted metric
call fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev) call fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
fxx = fxx - Gamxxx * chix - Gamyxx * chiy - Gamzxx * chiz do k=1,ex(3)
fxy = fxy - Gamxxy * chix - Gamyxy * chiy - Gamzxy * chiz do j=1,ex(2)
fxz = fxz - Gamxxz * chix - Gamyxz * chiy - Gamzxz * chiz do i=1,ex(1)
fyy = fyy - Gamxyy * chix - Gamyyy * chiy - Gamzyy * chiz fxx(i,j,k) = fxx(i,j,k) - Gamxxx(i,j,k) * chix(i,j,k) - Gamyxx(i,j,k) * chiy(i,j,k) - Gamzxx(i,j,k) * chiz(i,j,k)
fyz = fyz - Gamxyz * chix - Gamyyz * chiy - Gamzyz * chiz fxy(i,j,k) = fxy(i,j,k) - Gamxxy(i,j,k) * chix(i,j,k) - Gamyxy(i,j,k) * chiy(i,j,k) - Gamzxy(i,j,k) * chiz(i,j,k)
fzz = fzz - Gamxzz * chix - Gamyzz * chiy - Gamzzz * chiz fxz(i,j,k) = fxz(i,j,k) - Gamxxz(i,j,k) * chix(i,j,k) - Gamyxz(i,j,k) * chiy(i,j,k) - Gamzxz(i,j,k) * chiz(i,j,k)
! Store D^l D_l chi - 3/(2*chi) D^l chi D_l chi in f fyy(i,j,k) = fyy(i,j,k) - Gamxyy(i,j,k) * chix(i,j,k) - Gamyyy(i,j,k) * chiy(i,j,k) - Gamzyy(i,j,k) * chiz(i,j,k)
fyz(i,j,k) = fyz(i,j,k) - Gamxyz(i,j,k) * chix(i,j,k) - Gamyyz(i,j,k) * chiy(i,j,k) - Gamzyz(i,j,k) * chiz(i,j,k)
f = gupxx * ( fxx - F3o2/chin1 * chix * chix ) + & fzz(i,j,k) = fzz(i,j,k) - Gamxzz(i,j,k) * chix(i,j,k) - Gamyzz(i,j,k) * chiy(i,j,k) - Gamzzz(i,j,k) * chiz(i,j,k)
gupyy * ( fyy - F3o2/chin1 * chiy * chiy ) + &
gupzz * ( fzz - F3o2/chin1 * chiz * chiz ) + & chin_loc = chin1(i,j,k)
TWO * gupxy * ( fxy - F3o2/chin1 * chix * chiy ) + & f_loc = gupxx(i,j,k) * (fxx(i,j,k) - F3o2/chin_loc * chix(i,j,k) * chix(i,j,k)) + &
TWO * gupxz * ( fxz - F3o2/chin1 * chix * chiz ) + & gupyy(i,j,k) * (fyy(i,j,k) - F3o2/chin_loc * chiy(i,j,k) * chiy(i,j,k)) + &
TWO * gupyz * ( fyz - F3o2/chin1 * chiy * chiz ) gupzz(i,j,k) * (fzz(i,j,k) - F3o2/chin_loc * chiz(i,j,k) * chiz(i,j,k)) + &
! Add chi part to Ricci tensor: TWO * gupxy(i,j,k) * (fxy(i,j,k) - F3o2/chin_loc * chix(i,j,k) * chiy(i,j,k)) + &
TWO * gupxz(i,j,k) * (fxz(i,j,k) - F3o2/chin_loc * chix(i,j,k) * chiz(i,j,k)) + &
Rxx = Rxx + (fxx - chix*chix/chin1/TWO + gxx * f)/chin1/TWO TWO * gupyz(i,j,k) * (fyz(i,j,k) - F3o2/chin_loc * chiy(i,j,k) * chiz(i,j,k))
Ryy = Ryy + (fyy - chiy*chiy/chin1/TWO + gyy * f)/chin1/TWO f(i,j,k) = f_loc
Rzz = Rzz + (fzz - chiz*chiz/chin1/TWO + gzz * f)/chin1/TWO
Rxy = Rxy + (fxy - chix*chiy/chin1/TWO + gxy * f)/chin1/TWO Rxx(i,j,k) = Rxx(i,j,k) + (fxx(i,j,k) - chix(i,j,k)*chix(i,j,k)/chin_loc/TWO + gxx(i,j,k) * f_loc)/chin_loc/TWO
Rxz = Rxz + (fxz - chix*chiz/chin1/TWO + gxz * f)/chin1/TWO Ryy(i,j,k) = Ryy(i,j,k) + (fyy(i,j,k) - chiy(i,j,k)*chiy(i,j,k)/chin_loc/TWO + gyy(i,j,k) * f_loc)/chin_loc/TWO
Ryz = Ryz + (fyz - chiy*chiz/chin1/TWO + gyz * f)/chin1/TWO Rzz(i,j,k) = Rzz(i,j,k) + (fzz(i,j,k) - chiz(i,j,k)*chiz(i,j,k)/chin_loc/TWO + gzz(i,j,k) * f_loc)/chin_loc/TWO
Rxy(i,j,k) = Rxy(i,j,k) + (fxy(i,j,k) - chix(i,j,k)*chiy(i,j,k)/chin_loc/TWO + gxy(i,j,k) * f_loc)/chin_loc/TWO
! covariant second derivatives of the lapse respect to physical metric Rxz(i,j,k) = Rxz(i,j,k) + (fxz(i,j,k) - chix(i,j,k)*chiz(i,j,k)/chin_loc/TWO + gxz(i,j,k) * f_loc)/chin_loc/TWO
call fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z, & Ryz(i,j,k) = Ryz(i,j,k) + (fyz(i,j,k) - chiy(i,j,k)*chiz(i,j,k)/chin_loc/TWO + gyz(i,j,k) * f_loc)/chin_loc/TWO
SYM,SYM,SYM,symmetry,Lev) enddo
enddo
gxxx = (gupxx * chix + gupxy * chiy + gupxz * chiz)/chin1 enddo
gxxy = (gupxy * chix + gupyy * chiy + gupyz * chiz)/chin1
gxxz = (gupxz * chix + gupyz * chiy + gupzz * chiz)/chin1 ! covariant second derivatives of the lapse respect to physical metric
! now get physical second kind of connection call fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z, &
Gamxxx = Gamxxx - ( (chix + chix)/chin1 - gxx * gxxx )*HALF SYM,SYM,SYM,symmetry,Lev)
Gamyxx = Gamyxx - ( - gxx * gxxy )*HALF
Gamzxx = Gamzxx - ( - gxx * gxxz )*HALF do k=1,ex(3)
Gamxyy = Gamxyy - ( - gyy * gxxx )*HALF do j=1,ex(2)
Gamyyy = Gamyyy - ( (chiy + chiy)/chin1 - gyy * gxxy )*HALF do i=1,ex(1)
Gamzyy = Gamzyy - ( - gyy * gxxz )*HALF chin_loc = chin1(i,j,k)
Gamxzz = Gamxzz - ( - gzz * gxxx )*HALF gxxx(i,j,k) = (gupxx(i,j,k) * chix(i,j,k) + gupxy(i,j,k) * chiy(i,j,k) + gupxz(i,j,k) * chiz(i,j,k)) / chin_loc
Gamyzz = Gamyzz - ( - gzz * gxxy )*HALF gxxy(i,j,k) = (gupxy(i,j,k) * chix(i,j,k) + gupyy(i,j,k) * chiy(i,j,k) + gupyz(i,j,k) * chiz(i,j,k)) / chin_loc
Gamzzz = Gamzzz - ( (chiz + chiz)/chin1 - gzz * gxxz )*HALF gxxz(i,j,k) = (gupxz(i,j,k) * chix(i,j,k) + gupyz(i,j,k) * chiy(i,j,k) + gupzz(i,j,k) * chiz(i,j,k)) / chin_loc
Gamxxy = Gamxxy - ( chiy /chin1 - gxy * gxxx )*HALF
Gamyxy = Gamyxy - ( chix /chin1 - gxy * gxxy )*HALF Gamxxx(i,j,k) = Gamxxx(i,j,k) - ( (chix(i,j,k) + chix(i,j,k))/chin_loc - gxx(i,j,k) * gxxx(i,j,k) )*HALF
Gamzxy = Gamzxy - ( - gxy * gxxz )*HALF Gamyxx(i,j,k) = Gamyxx(i,j,k) - ( - gxx(i,j,k) * gxxy(i,j,k) )*HALF
Gamxxz = Gamxxz - ( chiz /chin1 - gxz * gxxx )*HALF Gamzxx(i,j,k) = Gamzxx(i,j,k) - ( - gxx(i,j,k) * gxxz(i,j,k) )*HALF
Gamyxz = Gamyxz - ( - gxz * gxxy )*HALF Gamxyy(i,j,k) = Gamxyy(i,j,k) - ( - gyy(i,j,k) * gxxx(i,j,k) )*HALF
Gamzxz = Gamzxz - ( chix /chin1 - gxz * gxxz )*HALF Gamyyy(i,j,k) = Gamyyy(i,j,k) - ( (chiy(i,j,k) + chiy(i,j,k))/chin_loc - gyy(i,j,k) * gxxy(i,j,k) )*HALF
Gamxyz = Gamxyz - ( - gyz * gxxx )*HALF Gamzyy(i,j,k) = Gamzyy(i,j,k) - ( - gyy(i,j,k) * gxxz(i,j,k) )*HALF
Gamyyz = Gamyyz - ( chiz /chin1 - gyz * gxxy )*HALF Gamxzz(i,j,k) = Gamxzz(i,j,k) - ( - gzz(i,j,k) * gxxx(i,j,k) )*HALF
Gamzyz = Gamzyz - ( chiy /chin1 - gyz * gxxz )*HALF Gamyzz(i,j,k) = Gamyzz(i,j,k) - ( - gzz(i,j,k) * gxxy(i,j,k) )*HALF
Gamzzz(i,j,k) = Gamzzz(i,j,k) - ( (chiz(i,j,k) + chiz(i,j,k))/chin_loc - gzz(i,j,k) * gxxz(i,j,k) )*HALF
fxx = fxx - Gamxxx*Lapx - Gamyxx*Lapy - Gamzxx*Lapz Gamxxy(i,j,k) = Gamxxy(i,j,k) - ( chiy(i,j,k) /chin_loc - gxy(i,j,k) * gxxx(i,j,k) )*HALF
fyy = fyy - Gamxyy*Lapx - Gamyyy*Lapy - Gamzyy*Lapz Gamyxy(i,j,k) = Gamyxy(i,j,k) - ( chix(i,j,k) /chin_loc - gxy(i,j,k) * gxxy(i,j,k) )*HALF
fzz = fzz - Gamxzz*Lapx - Gamyzz*Lapy - Gamzzz*Lapz Gamzxy(i,j,k) = Gamzxy(i,j,k) - ( - gxy(i,j,k) * gxxz(i,j,k) )*HALF
fxy = fxy - Gamxxy*Lapx - Gamyxy*Lapy - Gamzxy*Lapz Gamxxz(i,j,k) = Gamxxz(i,j,k) - ( chiz(i,j,k) /chin_loc - gxz(i,j,k) * gxxx(i,j,k) )*HALF
fxz = fxz - Gamxxz*Lapx - Gamyxz*Lapy - Gamzxz*Lapz Gamyxz(i,j,k) = Gamyxz(i,j,k) - ( - gxz(i,j,k) * gxxy(i,j,k) )*HALF
fyz = fyz - Gamxyz*Lapx - Gamyyz*Lapy - Gamzyz*Lapz Gamzxz(i,j,k) = Gamzxz(i,j,k) - ( chix(i,j,k) /chin_loc - gxz(i,j,k) * gxxz(i,j,k) )*HALF
Gamxyz(i,j,k) = Gamxyz(i,j,k) - ( - gyz(i,j,k) * gxxx(i,j,k) )*HALF
! store D^i D_i Lap in trK_rhs upto chi Gamyyz(i,j,k) = Gamyyz(i,j,k) - ( chiz(i,j,k) /chin_loc - gyz(i,j,k) * gxxy(i,j,k) )*HALF
trK_rhs = gupxx * fxx + gupyy * fyy + gupzz * fzz + & Gamzyz(i,j,k) = Gamzyz(i,j,k) - ( chiy(i,j,k) /chin_loc - gyz(i,j,k) * gxxz(i,j,k) )*HALF
TWO* ( gupxy * fxy + gupxz * fxz + gupyz * fyz )
#if 1 fxx(i,j,k) = fxx(i,j,k) - Gamxxx(i,j,k)*Lapx(i,j,k) - Gamyxx(i,j,k)*Lapy(i,j,k) - Gamzxx(i,j,k)*Lapz(i,j,k)
!! follow bam code fyy(i,j,k) = fyy(i,j,k) - Gamxyy(i,j,k)*Lapx(i,j,k) - Gamyyy(i,j,k)*Lapy(i,j,k) - Gamzyy(i,j,k)*Lapz(i,j,k)
S = chin1 * ( gupxx * Sxx + gupyy * Syy + gupzz * Szz + & fzz(i,j,k) = fzz(i,j,k) - Gamxzz(i,j,k)*Lapx(i,j,k) - Gamyzz(i,j,k)*Lapy(i,j,k) - Gamzzz(i,j,k)*Lapz(i,j,k)
TWO * ( gupxy * Sxy + gupxz * Sxz + gupyz * Syz ) ) fxy(i,j,k) = fxy(i,j,k) - Gamxxy(i,j,k)*Lapx(i,j,k) - Gamyxy(i,j,k)*Lapy(i,j,k) - Gamzxy(i,j,k)*Lapz(i,j,k)
f = F2o3 * trK * trK -(& fxz(i,j,k) = fxz(i,j,k) - Gamxxz(i,j,k)*Lapx(i,j,k) - Gamyxz(i,j,k)*Lapy(i,j,k) - Gamzxz(i,j,k)*Lapz(i,j,k)
gupxx * ( & fyz(i,j,k) = fyz(i,j,k) - Gamxyz(i,j,k)*Lapx(i,j,k) - Gamyyz(i,j,k)*Lapy(i,j,k) - Gamzyz(i,j,k)*Lapz(i,j,k)
gupxx * Axx * Axx + gupyy * Axy * Axy + gupzz * Axz * Axz + &
TWO * (gupxy * Axx * Axy + gupxz * Axx * Axz + gupyz * Axy * Axz) ) + & trK_rhs(i,j,k) = gupxx(i,j,k) * fxx(i,j,k) + gupyy(i,j,k) * fyy(i,j,k) + gupzz(i,j,k) * fzz(i,j,k) + &
gupyy * ( & TWO * (gupxy(i,j,k) * fxy(i,j,k) + gupxz(i,j,k) * fxz(i,j,k) + gupyz(i,j,k) * fyz(i,j,k))
gupxx * Axy * Axy + gupyy * Ayy * Ayy + gupzz * Ayz * Ayz + & enddo
TWO * (gupxy * Axy * Ayy + gupxz * Axy * Ayz + gupyz * Ayy * Ayz) ) + & enddo
gupzz * ( & enddo
gupxx * Axz * Axz + gupyy * Ayz * Ayz + gupzz * Azz * Azz + & do k=1,ex(3)
TWO * (gupxy * Axz * Ayz + gupxz * Axz * Azz + gupyz * Ayz * Azz) ) + & do j=1,ex(2)
TWO * ( & do i=1,ex(1)
gupxy * ( & divb_loc = div_beta(i,j,k)
gupxx * Axx * Axy + gupyy * Axy * Ayy + gupzz * Axz * Ayz + & chin_loc = chin1(i,j,k)
gupxy * (Axx * Ayy + Axy * Axy) + &
gupxz * (Axx * Ayz + Axz * Axy) + & S_loc = chin_loc * ( gupxx(i,j,k) * Sxx(i,j,k) + gupyy(i,j,k) * Syy(i,j,k) + gupzz(i,j,k) * Szz(i,j,k) + &
gupyz * (Axy * Ayz + Axz * Ayy) ) + & TWO * (gupxy(i,j,k) * Sxy(i,j,k) + gupxz(i,j,k) * Sxz(i,j,k) + gupyz(i,j,k) * Syz(i,j,k)) )
gupxz * ( & S(i,j,k) = S_loc
gupxx * Axx * Axz + gupyy * Axy * Ayz + gupzz * Axz * Azz + &
gupxy * (Axx * Ayz + Axy * Axz) + & f_loc = F2o3 * trK(i,j,k) * trK(i,j,k) - ( &
gupxz * (Axx * Azz + Axz * Axz) + & gupxx(i,j,k) * ( gupxx(i,j,k) * Axx(i,j,k) * Axx(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Axy(i,j,k) + &
gupyz * (Axy * Azz + Axz * Ayz) ) + & gupzz(i,j,k) * Axz(i,j,k) * Axz(i,j,k) + &
gupyz * ( & TWO * (gupxy(i,j,k) * Axx(i,j,k) * Axy(i,j,k) + gupxz(i,j,k) * Axx(i,j,k) * Axz(i,j,k) + &
gupxx * Axy * Axz + gupyy * Ayy * Ayz + gupzz * Ayz * Azz + & gupyz(i,j,k) * Axy(i,j,k) * Axz(i,j,k)) ) + &
gupxy * (Axy * Ayz + Ayy * Axz) + & gupyy(i,j,k) * ( gupxx(i,j,k) * Axy(i,j,k) * Axy(i,j,k) + gupyy(i,j,k) * Ayy(i,j,k) * Ayy(i,j,k) + &
gupxz * (Axy * Azz + Ayz * Axz) + & gupzz(i,j,k) * Ayz(i,j,k) * Ayz(i,j,k) + &
gupyz * (Ayy * Azz + Ayz * Ayz) ) )) -1.6d1*PI*rho + EIGHT * PI * S TWO * (gupxy(i,j,k) * Axy(i,j,k) * Ayy(i,j,k) + gupxz(i,j,k) * Axy(i,j,k) * Ayz(i,j,k) + &
f = - F1o3 *( gupxx * fxx + gupyy * fyy + gupzz * fzz + & gupyz(i,j,k) * Ayy(i,j,k) * Ayz(i,j,k)) ) + &
TWO* ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) + alpn1/chin1*f) gupzz(i,j,k) * ( gupxx(i,j,k) * Axz(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Ayz(i,j,k) * Ayz(i,j,k) + &
gupzz(i,j,k) * Azz(i,j,k) * Azz(i,j,k) + &
fxx = alpn1 * (Rxx - EIGHT * PI * Sxx) - fxx TWO * (gupxy(i,j,k) * Axz(i,j,k) * Ayz(i,j,k) + gupxz(i,j,k) * Axz(i,j,k) * Azz(i,j,k) + &
fxy = alpn1 * (Rxy - EIGHT * PI * Sxy) - fxy gupyz(i,j,k) * Ayz(i,j,k) * Azz(i,j,k)) ) + &
fxz = alpn1 * (Rxz - EIGHT * PI * Sxz) - fxz TWO * ( gupxy(i,j,k) * ( gupxx(i,j,k) * Axx(i,j,k) * Axy(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Ayy(i,j,k) + &
fyy = alpn1 * (Ryy - EIGHT * PI * Syy) - fyy gupzz(i,j,k) * Axz(i,j,k) * Ayz(i,j,k) + &
fyz = alpn1 * (Ryz - EIGHT * PI * Syz) - fyz gupxy(i,j,k) * (Axx(i,j,k) * Ayy(i,j,k) + Axy(i,j,k) * Axy(i,j,k)) + &
fzz = alpn1 * (Rzz - EIGHT * PI * Szz) - fzz gupxz(i,j,k) * (Axx(i,j,k) * Ayz(i,j,k) + Axz(i,j,k) * Axy(i,j,k)) + &
#else gupyz(i,j,k) * (Axy(i,j,k) * Ayz(i,j,k) + Axz(i,j,k) * Ayy(i,j,k)) ) + &
! Add lapse and S_ij parts to Ricci tensor: gupxz(i,j,k) * ( gupxx(i,j,k) * Axx(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Ayz(i,j,k) + &
gupzz(i,j,k) * Axz(i,j,k) * Azz(i,j,k) + &
fxx = alpn1 * (Rxx - EIGHT * PI * Sxx) - fxx gupxy(i,j,k) * (Axx(i,j,k) * Ayz(i,j,k) + Axy(i,j,k) * Axz(i,j,k)) + &
fxy = alpn1 * (Rxy - EIGHT * PI * Sxy) - fxy gupxz(i,j,k) * (Axx(i,j,k) * Azz(i,j,k) + Axz(i,j,k) * Axz(i,j,k)) + &
fxz = alpn1 * (Rxz - EIGHT * PI * Sxz) - fxz gupyz(i,j,k) * (Axy(i,j,k) * Azz(i,j,k) + Axz(i,j,k) * Ayz(i,j,k)) ) + &
fyy = alpn1 * (Ryy - EIGHT * PI * Syy) - fyy gupyz(i,j,k) * ( gupxx(i,j,k) * Axy(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Ayy(i,j,k) * Ayz(i,j,k) + &
fyz = alpn1 * (Ryz - EIGHT * PI * Syz) - fyz gupzz(i,j,k) * Ayz(i,j,k) * Azz(i,j,k) + &
fzz = alpn1 * (Rzz - EIGHT * PI * Szz) - fzz gupxy(i,j,k) * (Axy(i,j,k) * Ayz(i,j,k) + Ayy(i,j,k) * Axz(i,j,k)) + &
gupxz(i,j,k) * (Axy(i,j,k) * Azz(i,j,k) + Ayz(i,j,k) * Axz(i,j,k)) + &
! Compute trace-free part (note: chi^-1 and chi cancel!): gupyz(i,j,k) * (Ayy(i,j,k) * Azz(i,j,k) + Ayz(i,j,k) * Ayz(i,j,k)) ) ) ) - &
F16 * PI * rho(i,j,k) + EIGHT * PI * S_loc
f = F1o3 *( gupxx * fxx + gupyy * fyy + gupzz * fzz + &
TWO* ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) ) f_loc = -F1o3 * ( gupxx(i,j,k) * fxx(i,j,k) + gupyy(i,j,k) * fyy(i,j,k) + gupzz(i,j,k) * fzz(i,j,k) + &
#endif TWO * (gupxy(i,j,k) * fxy(i,j,k) + gupxz(i,j,k) * fxz(i,j,k) + gupyz(i,j,k) * fyz(i,j,k)) + &
alpn1(i,j,k)/chin_loc * f_loc )
Axx_rhs = fxx - gxx * f f(i,j,k) = f_loc
Ayy_rhs = fyy - gyy * f
Azz_rhs = fzz - gzz * f l_fxx = alpn1(i,j,k) * (Rxx(i,j,k) - EIGHT * PI * Sxx(i,j,k)) - fxx(i,j,k)
Axy_rhs = fxy - gxy * f l_fxy = alpn1(i,j,k) * (Rxy(i,j,k) - EIGHT * PI * Sxy(i,j,k)) - fxy(i,j,k)
Axz_rhs = fxz - gxz * f l_fxz = alpn1(i,j,k) * (Rxz(i,j,k) - EIGHT * PI * Sxz(i,j,k)) - fxz(i,j,k)
Ayz_rhs = fyz - gyz * f l_fyy = alpn1(i,j,k) * (Ryy(i,j,k) - EIGHT * PI * Syy(i,j,k)) - fyy(i,j,k)
l_fyz = alpn1(i,j,k) * (Ryz(i,j,k) - EIGHT * PI * Syz(i,j,k)) - fyz(i,j,k)
! Now: store A_il A^l_j into fij: l_fzz = alpn1(i,j,k) * (Rzz(i,j,k) - EIGHT * PI * Szz(i,j,k)) - fzz(i,j,k)
fxx = gupxx * Axx * Axx + gupyy * Axy * Axy + gupzz * Axz * Axz + & Axx_rhs(i,j,k) = l_fxx - gxx(i,j,k) * f_loc
TWO * (gupxy * Axx * Axy + gupxz * Axx * Axz + gupyz * Axy * Axz) Ayy_rhs(i,j,k) = l_fyy - gyy(i,j,k) * f_loc
fyy = gupxx * Axy * Axy + gupyy * Ayy * Ayy + gupzz * Ayz * Ayz + & Azz_rhs(i,j,k) = l_fzz - gzz(i,j,k) * f_loc
TWO * (gupxy * Axy * Ayy + gupxz * Axy * Ayz + gupyz * Ayy * Ayz) Axy_rhs(i,j,k) = l_fxy - gxy(i,j,k) * f_loc
fzz = gupxx * Axz * Axz + gupyy * Ayz * Ayz + gupzz * Azz * Azz + & Axz_rhs(i,j,k) = l_fxz - gxz(i,j,k) * f_loc
TWO * (gupxy * Axz * Ayz + gupxz * Axz * Azz + gupyz * Ayz * Azz) Ayz_rhs(i,j,k) = l_fyz - gyz(i,j,k) * f_loc
fxy = gupxx * Axx * Axy + gupyy * Axy * Ayy + gupzz * Axz * Ayz + &
gupxy *(Axx * Ayy + Axy * Axy) + & fxx(i,j,k) = gupxx(i,j,k) * Axx(i,j,k) * Axx(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Axy(i,j,k) + &
gupxz *(Axx * Ayz + Axz * Axy) + & gupzz(i,j,k) * Axz(i,j,k) * Axz(i,j,k) + TWO * (gupxy(i,j,k) * Axx(i,j,k) * Axy(i,j,k) + &
gupyz *(Axy * Ayz + Axz * Ayy) gupxz(i,j,k) * Axx(i,j,k) * Axz(i,j,k) + gupyz(i,j,k) * Axy(i,j,k) * Axz(i,j,k))
fxz = gupxx * Axx * Axz + gupyy * Axy * Ayz + gupzz * Axz * Azz + & fyy(i,j,k) = gupxx(i,j,k) * Axy(i,j,k) * Axy(i,j,k) + gupyy(i,j,k) * Ayy(i,j,k) * Ayy(i,j,k) + &
gupxy *(Axx * Ayz + Axy * Axz) + & gupzz(i,j,k) * Ayz(i,j,k) * Ayz(i,j,k) + TWO * (gupxy(i,j,k) * Axy(i,j,k) * Ayy(i,j,k) + &
gupxz *(Axx * Azz + Axz * Axz) + & gupxz(i,j,k) * Axy(i,j,k) * Ayz(i,j,k) + gupyz(i,j,k) * Ayy(i,j,k) * Ayz(i,j,k))
gupyz *(Axy * Azz + Axz * Ayz) fzz(i,j,k) = gupxx(i,j,k) * Axz(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Ayz(i,j,k) * Ayz(i,j,k) + &
fyz = gupxx * Axy * Axz + gupyy * Ayy * Ayz + gupzz * Ayz * Azz + & gupzz(i,j,k) * Azz(i,j,k) * Azz(i,j,k) + TWO * (gupxy(i,j,k) * Axz(i,j,k) * Ayz(i,j,k) + &
gupxy *(Axy * Ayz + Ayy * Axz) + & gupxz(i,j,k) * Axz(i,j,k) * Azz(i,j,k) + gupyz(i,j,k) * Ayz(i,j,k) * Azz(i,j,k))
gupxz *(Axy * Azz + Ayz * Axz) + & fxy(i,j,k) = gupxx(i,j,k) * Axx(i,j,k) * Axy(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Ayy(i,j,k) + &
gupyz *(Ayy * Azz + Ayz * Ayz) gupzz(i,j,k) * Axz(i,j,k) * Ayz(i,j,k) + gupxy(i,j,k) * (Axx(i,j,k) * Ayy(i,j,k) + Axy(i,j,k) * Axy(i,j,k)) + &
gupxz(i,j,k) * (Axx(i,j,k) * Ayz(i,j,k) + Axz(i,j,k) * Axy(i,j,k)) + &
f = chin1 gupyz(i,j,k) * (Axy(i,j,k) * Ayz(i,j,k) + Axz(i,j,k) * Ayy(i,j,k))
! store D^i D_i Lap in trK_rhs fxz(i,j,k) = gupxx(i,j,k) * Axx(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Ayz(i,j,k) + &
trK_rhs = f*trK_rhs gupzz(i,j,k) * Axz(i,j,k) * Azz(i,j,k) + gupxy(i,j,k) * (Axx(i,j,k) * Ayz(i,j,k) + Axy(i,j,k) * Axz(i,j,k)) + &
gupxz(i,j,k) * (Axx(i,j,k) * Azz(i,j,k) + Axz(i,j,k) * Axz(i,j,k)) + &
Axx_rhs = f * Axx_rhs+ alpn1 * (trK * Axx - TWO * fxx) + & gupyz(i,j,k) * (Axy(i,j,k) * Azz(i,j,k) + Axz(i,j,k) * Ayz(i,j,k))
TWO * ( Axx * betaxx + Axy * betayx + Axz * betazx )- & fyz(i,j,k) = gupxx(i,j,k) * Axy(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Ayy(i,j,k) * Ayz(i,j,k) + &
F2o3 * Axx * div_beta gupzz(i,j,k) * Ayz(i,j,k) * Azz(i,j,k) + gupxy(i,j,k) * (Axy(i,j,k) * Ayz(i,j,k) + Ayy(i,j,k) * Axz(i,j,k)) + &
gupxz(i,j,k) * (Axy(i,j,k) * Azz(i,j,k) + Ayz(i,j,k) * Axz(i,j,k)) + &
Ayy_rhs = f * Ayy_rhs+ alpn1 * (trK * Ayy - TWO * fyy) + & gupyz(i,j,k) * (Ayy(i,j,k) * Azz(i,j,k) + Ayz(i,j,k) * Ayz(i,j,k))
TWO * ( Axy * betaxy + Ayy * betayy + Ayz * betazy )- &
F2o3 * Ayy * div_beta trK_rhs(i,j,k) = chin_loc * trK_rhs(i,j,k)
Azz_rhs = f * Azz_rhs+ alpn1 * (trK * Azz - TWO * fzz) + & Axx_rhs(i,j,k) = chin_loc * Axx_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Axx(i,j,k) - TWO * fxx(i,j,k)) + &
TWO * ( Axz * betaxz + Ayz * betayz + Azz * betazz )- & TWO * (Axx(i,j,k) * betaxx(i,j,k) + Axy(i,j,k) * betayx(i,j,k) + Axz(i,j,k) * betazx(i,j,k)) - &
F2o3 * Azz * div_beta F2o3 * Axx(i,j,k) * divb_loc
Ayy_rhs(i,j,k) = chin_loc * Ayy_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Ayy(i,j,k) - TWO * fyy(i,j,k)) + &
Axy_rhs = f * Axy_rhs+ alpn1 *( trK * Axy - TWO * fxy )+ & TWO * (Axy(i,j,k) * betaxy(i,j,k) + Ayy(i,j,k) * betayy(i,j,k) + Ayz(i,j,k) * betazy(i,j,k)) - &
Axx * betaxy + Axz * betazy + & F2o3 * Ayy(i,j,k) * divb_loc
Ayy * betayx + Ayz * betazx + & Azz_rhs(i,j,k) = chin_loc * Azz_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Azz(i,j,k) - TWO * fzz(i,j,k)) + &
F1o3 * Axy * div_beta - Axy * betazz TWO * (Axz(i,j,k) * betaxz(i,j,k) + Ayz(i,j,k) * betayz(i,j,k) + Azz(i,j,k) * betazz(i,j,k)) - &
F2o3 * Azz(i,j,k) * divb_loc
Ayz_rhs = f * Ayz_rhs+ alpn1 *( trK * Ayz - TWO * fyz )+ & Axy_rhs(i,j,k) = chin_loc * Axy_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Axy(i,j,k) - TWO * fxy(i,j,k)) + &
Axy * betaxz + Ayy * betayz + & Axx(i,j,k) * betaxy(i,j,k) + Axz(i,j,k) * betazy(i,j,k) + Ayy(i,j,k) * betayx(i,j,k) + &
Axz * betaxy + Azz * betazy + & Ayz(i,j,k) * betazx(i,j,k) + F1o3 * Axy(i,j,k) * divb_loc - Axy(i,j,k) * betazz(i,j,k)
F1o3 * Ayz * div_beta - Ayz * betaxx Ayz_rhs(i,j,k) = chin_loc * Ayz_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Ayz(i,j,k) - TWO * fyz(i,j,k)) + &
Axy(i,j,k) * betaxz(i,j,k) + Ayy(i,j,k) * betayz(i,j,k) + Axz(i,j,k) * betaxy(i,j,k) + &
Axz_rhs = f * Axz_rhs+ alpn1 *( trK * Axz - TWO * fxz )+ & Azz(i,j,k) * betazy(i,j,k) + F1o3 * Ayz(i,j,k) * divb_loc - Ayz(i,j,k) * betaxx(i,j,k)
Axx * betaxz + Axy * betayz + & Axz_rhs(i,j,k) = chin_loc * Axz_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Axz(i,j,k) - TWO * fxz(i,j,k)) + &
Ayz * betayx + Azz * betazx + & Axx(i,j,k) * betaxz(i,j,k) + Axy(i,j,k) * betayz(i,j,k) + Ayz(i,j,k) * betayx(i,j,k) + &
F1o3 * Axz * div_beta - Axz * betayy !rhs for Aij Azz(i,j,k) * betazx(i,j,k) + F1o3 * Axz(i,j,k) * divb_loc - Axz(i,j,k) * betayy(i,j,k)
! Compute trace of S_ij trK_rhs(i,j,k) = - trK_rhs(i,j,k) + alpn1(i,j,k) * ( F1o3 * trK(i,j,k) * trK(i,j,k) + &
gupxx(i,j,k) * fxx(i,j,k) + gupyy(i,j,k) * fyy(i,j,k) + gupzz(i,j,k) * fzz(i,j,k) + &
S = f * ( gupxx * Sxx + gupyy * Syy + gupzz * Szz + & TWO * (gupxy(i,j,k) * fxy(i,j,k) + gupxz(i,j,k) * fxz(i,j,k) + gupyz(i,j,k) * fyz(i,j,k)) + &
TWO * ( gupxy * Sxy + gupxz * Sxz + gupyz * Syz ) ) FOUR * PI * (rho(i,j,k) + S_loc) )
enddo
trK_rhs = - trK_rhs + alpn1 *( F1o3 * trK * trK + & enddo
gupxx * fxx + gupyy * fyy + gupzz * fzz + & enddo
TWO * ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) + &
FOUR * PI * ( rho + S )) !rhs for trK
!!!! gauge variable part !!!! gauge variable part
@@ -948,15 +1000,15 @@
!!!!!!!!!advection term + Kreiss-Oliger dissipation (merged for cache efficiency) !!!!!!!!!advection term + Kreiss-Oliger dissipation (merged for cache efficiency)
! lopsided_kodis shares the symmetry_bd buffer between advection and ! lopsided_kodis shares the symmetry_bd buffer between advection and
! dissipation, eliminating redundant full-grid copies. For metric variables ! dissipation, eliminating redundant full-grid copies. For metric variables
! gxx/gyy/gzz (=dxx/dyy/dzz+1): kodis stencil coefficients sum to zero, ! gxx/gyy/gzz (=dxx/dyy/dzz+1): stencil coefficients sum to zero,
! so the constant offset has no effect on dissipation. ! so the constant offset has no effect on dissipation.
call lopsided_kodis(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided_kodis(ex,X,Y,Z,dxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps) call lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
call lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps) call lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps)
call lopsided_kodis(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided_kodis(ex,X,Y,Z,dyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps) call lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps)
call lopsided_kodis(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided_kodis(ex,X,Y,Z,dzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps) call lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps)

View File

@@ -1022,9 +1022,16 @@ int f_compute_rhs_bssn(int *ex, double &T,
+ gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] ); + gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] );
#if (GAUGE == 2) #if (GAUGE == 2)
reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - sqrt(chin1[i])), 2.0 ); {
const double chi_sqrt = sqrt(chin1[i]);
const double damping = ONE - chi_sqrt;
reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / (damping * damping);
}
#else #else
reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - chin1[i]), 2.0 ); {
const double damping = ONE - chin1[i];
reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / (damping * damping);
}
#endif #endif
dtSfx_rhs[i] = Gamx_rhs[i] - reta[i] * dtSfx[i]; dtSfx_rhs[i] = Gamx_rhs[i] - reta[i] * dtSfx[i];
@@ -1040,9 +1047,16 @@ int f_compute_rhs_bssn(int *ex, double &T,
+ gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] ); + gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] );
#if (GAUGE == 4) #if (GAUGE == 4)
reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - sqrt(chin1[i])), 2.0 ); {
const double chi_sqrt = sqrt(chin1[i]);
const double damping = ONE - chi_sqrt;
reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / (damping * damping);
}
#else #else
reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - chin1[i]), 2.0 ); {
const double damping = ONE - chin1[i];
reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / (damping * damping);
}
#endif #endif
betax_rhs[i] = FF * Gamx[i] - reta[i] * betax[i]; betax_rhs[i] = FF * Gamx[i] - reta[i] * betax[i];
@@ -1139,59 +1153,59 @@ int f_compute_rhs_bssn(int *ex, double &T,
fderivs(ex,Ayy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0); fderivs(ex,Ayy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0);
fderivs(ex,Ayz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,0); fderivs(ex,Ayz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,0);
fderivs(ex,Azz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0); fderivs(ex,Azz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0);
} // 7ms //
// 7ms // for (int i=0;i<all;i+=1) {
for (int i=0;i<all;i+=1) { gxxx[i] = gxxx[i] - ( Gamxxx[i] * Axx[i] + Gamyxx[i] * Axy[i] + Gamzxx[i] * Axz[i]
gxxx[i] = gxxx[i] - ( Gamxxx[i] * Axx[i] + Gamyxx[i] * Axy[i] + Gamzxx[i] * Axz[i] + Gamxxx[i] * Axx[i] + Gamyxx[i] * Axy[i] + Gamzxx[i] * Axz[i]) - chix[i]*Axx[i]/chin1[i];
+ Gamxxx[i] * Axx[i] + Gamyxx[i] * Axy[i] + Gamzxx[i] * Axz[i]) - chix[i]*Axx[i]/chin1[i]; gxyx[i] = gxyx[i] - ( Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i]
gxyx[i] = gxyx[i] - ( Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i] + Gamxxx[i] * Axy[i] + Gamyxx[i] * Ayy[i] + Gamzxx[i] * Ayz[i]) - chix[i]*Axy[i]/chin1[i];
+ Gamxxx[i] * Axy[i] + Gamyxx[i] * Ayy[i] + Gamzxx[i] * Ayz[i]) - chix[i]*Axy[i]/chin1[i]; gxzx[i] = gxzx[i] - ( Gamxxz[i] * Axx[i] + Gamyxz[i] * Axy[i] + Gamzxz[i] * Axz[i]
gxzx[i] = gxzx[i] - ( Gamxxz[i] * Axx[i] + Gamyxz[i] * Axy[i] + Gamzxz[i] * Axz[i] + Gamxxx[i] * Axz[i] + Gamyxx[i] * Ayz[i] + Gamzxx[i] * Azz[i]) - chix[i]*Axz[i]/chin1[i];
+ Gamxxx[i] * Axz[i] + Gamyxx[i] * Ayz[i] + Gamzxx[i] * Azz[i]) - chix[i]*Axz[i]/chin1[i]; gyyx[i] = gyyx[i] - ( Gamxxy[i] * Axy[i] + Gamyxy[i] * Ayy[i] + Gamzxy[i] * Ayz[i]
gyyx[i] = gyyx[i] - ( Gamxxy[i] * Axy[i] + Gamyxy[i] * Ayy[i] + Gamzxy[i] * Ayz[i] + Gamxxy[i] * Axy[i] + Gamyxy[i] * Ayy[i] + Gamzxy[i] * Ayz[i]) - chix[i]*Ayy[i]/chin1[i];
+ Gamxxy[i] * Axy[i] + Gamyxy[i] * Ayy[i] + Gamzxy[i] * Ayz[i]) - chix[i]*Ayy[i]/chin1[i]; gyzx[i] = gyzx[i] - ( Gamxxz[i] * Axy[i] + Gamyxz[i] * Ayy[i] + Gamzxz[i] * Ayz[i]
gyzx[i] = gyzx[i] - ( Gamxxz[i] * Axy[i] + Gamyxz[i] * Ayy[i] + Gamzxz[i] * Ayz[i] + Gamxxy[i] * Axz[i] + Gamyxy[i] * Ayz[i] + Gamzxy[i] * Azz[i]) - chix[i]*Ayz[i]/chin1[i];
+ Gamxxy[i] * Axz[i] + Gamyxy[i] * Ayz[i] + Gamzxy[i] * Azz[i]) - chix[i]*Ayz[i]/chin1[i]; gzzx[i] = gzzx[i] - ( Gamxxz[i] * Axz[i] + Gamyxz[i] * Ayz[i] + Gamzxz[i] * Azz[i]
gzzx[i] = gzzx[i] - ( Gamxxz[i] * Axz[i] + Gamyxz[i] * Ayz[i] + Gamzxz[i] * Azz[i] + Gamxxz[i] * Axz[i] + Gamyxz[i] * Ayz[i] + Gamzxz[i] * Azz[i]) - chix[i]*Azz[i]/chin1[i];
+ Gamxxz[i] * Axz[i] + Gamyxz[i] * Ayz[i] + Gamzxz[i] * Azz[i]) - chix[i]*Azz[i]/chin1[i]; gxxy[i] = gxxy[i] - ( Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i]
gxxy[i] = gxxy[i] - ( Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i] + Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i]) - chiy[i]*Axx[i]/chin1[i];
+ Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i]) - chiy[i]*Axx[i]/chin1[i]; gxyy[i] = gxyy[i] - ( Gamxyy[i] * Axx[i] + Gamyyy[i] * Axy[i] + Gamzyy[i] * Axz[i]
gxyy[i] = gxyy[i] - ( Gamxyy[i] * Axx[i] + Gamyyy[i] * Axy[i] + Gamzyy[i] * Axz[i] + Gamxxy[i] * Axy[i] + Gamyxy[i] * Ayy[i] + Gamzxy[i] * Ayz[i]) - chiy[i]*Axy[i]/chin1[i];
+ Gamxxy[i] * Axy[i] + Gamyxy[i] * Ayy[i] + Gamzxy[i] * Ayz[i]) - chiy[i]*Axy[i]/chin1[i]; gxzy[i] = gxzy[i] - ( Gamxyz[i] * Axx[i] + Gamyyz[i] * Axy[i] + Gamzyz[i] * Axz[i]
gxzy[i] = gxzy[i] - ( Gamxyz[i] * Axx[i] + Gamyyz[i] * Axy[i] + Gamzyz[i] * Axz[i] + Gamxxy[i] * Axz[i] + Gamyxy[i] * Ayz[i] + Gamzxy[i] * Azz[i]) - chiy[i]*Axz[i]/chin1[i];
+ Gamxxy[i] * Axz[i] + Gamyxy[i] * Ayz[i] + Gamzxy[i] * Azz[i]) - chiy[i]*Axz[i]/chin1[i]; gyyy[i] = gyyy[i] - ( Gamxyy[i] * Axy[i] + Gamyyy[i] * Ayy[i] + Gamzyy[i] * Ayz[i]
gyyy[i] = gyyy[i] - ( Gamxyy[i] * Axy[i] + Gamyyy[i] * Ayy[i] + Gamzyy[i] * Ayz[i] + Gamxyy[i] * Axy[i] + Gamyyy[i] * Ayy[i] + Gamzyy[i] * Ayz[i]) - chiy[i]*Ayy[i]/chin1[i];
+ Gamxyy[i] * Axy[i] + Gamyyy[i] * Ayy[i] + Gamzyy[i] * Ayz[i]) - chiy[i]*Ayy[i]/chin1[i]; gyzy[i] = gyzy[i] - ( Gamxyz[i] * Axy[i] + Gamyyz[i] * Ayy[i] + Gamzyz[i] * Ayz[i]
gyzy[i] = gyzy[i] - ( Gamxyz[i] * Axy[i] + Gamyyz[i] * Ayy[i] + Gamzyz[i] * Ayz[i] + Gamxyy[i] * Axz[i] + Gamyyy[i] * Ayz[i] + Gamzyy[i] * Azz[i]) - chiy[i]*Ayz[i]/chin1[i];
+ Gamxyy[i] * Axz[i] + Gamyyy[i] * Ayz[i] + Gamzyy[i] * Azz[i]) - chiy[i]*Ayz[i]/chin1[i]; gzzy[i] = gzzy[i] - ( Gamxyz[i] * Axz[i] + Gamyyz[i] * Ayz[i] + Gamzyz[i] * Azz[i]
gzzy[i] = gzzy[i] - ( Gamxyz[i] * Axz[i] + Gamyyz[i] * Ayz[i] + Gamzyz[i] * Azz[i] + Gamxyz[i] * Axz[i] + Gamyyz[i] * Ayz[i] + Gamzyz[i] * Azz[i]) - chiy[i]*Azz[i]/chin1[i];
+ Gamxyz[i] * Axz[i] + Gamyyz[i] * Ayz[i] + Gamzyz[i] * Azz[i]) - chiy[i]*Azz[i]/chin1[i]; gxxz[i] = gxxz[i] - ( Gamxxz[i] * Axx[i] + Gamyxz[i] * Axy[i] + Gamzxz[i] * Axz[i]
gxxz[i] = gxxz[i] - ( Gamxxz[i] * Axx[i] + Gamyxz[i] * Axy[i] + Gamzxz[i] * Axz[i] + Gamxxz[i] * Axx[i] + Gamyxz[i] * Axy[i] + Gamzxz[i] * Axz[i]) - chiz[i]*Axx[i]/chin1[i];
+ Gamxxz[i] * Axx[i] + Gamyxz[i] * Axy[i] + Gamzxz[i] * Axz[i]) - chiz[i]*Axx[i]/chin1[i]; gxyz[i] = gxyz[i] - ( Gamxyz[i] * Axx[i] + Gamyyz[i] * Axy[i] + Gamzyz[i] * Axz[i]
gxyz[i] = gxyz[i] - ( Gamxyz[i] * Axx[i] + Gamyyz[i] * Axy[i] + Gamzyz[i] * Axz[i] + Gamxxz[i] * Axy[i] + Gamyxz[i] * Ayy[i] + Gamzxz[i] * Ayz[i]) - chiz[i]*Axy[i]/chin1[i];
+ Gamxxz[i] * Axy[i] + Gamyxz[i] * Ayy[i] + Gamzxz[i] * Ayz[i]) - chiz[i]*Axy[i]/chin1[i]; gxzz[i] = gxzz[i] - ( Gamxzz[i] * Axx[i] + Gamyzz[i] * Axy[i] + Gamzzz[i] * Axz[i]
gxzz[i] = gxzz[i] - ( Gamxzz[i] * Axx[i] + Gamyzz[i] * Axy[i] + Gamzzz[i] * Axz[i] + Gamxxz[i] * Axz[i] + Gamyxz[i] * Ayz[i] + Gamzxz[i] * Azz[i]) - chiz[i]*Axz[i]/chin1[i];
+ Gamxxz[i] * Axz[i] + Gamyxz[i] * Ayz[i] + Gamzxz[i] * Azz[i]) - chiz[i]*Axz[i]/chin1[i]; gyyz[i] = gyyz[i] - ( Gamxyz[i] * Axy[i] + Gamyyz[i] * Ayy[i] + Gamzyz[i] * Ayz[i]
gyyz[i] = gyyz[i] - ( Gamxyz[i] * Axy[i] + Gamyyz[i] * Ayy[i] + Gamzyz[i] * Ayz[i] + Gamxyz[i] * Axy[i] + Gamyyz[i] * Ayy[i] + Gamzyz[i] * Ayz[i]) - chiz[i]*Ayy[i]/chin1[i];
+ Gamxyz[i] * Axy[i] + Gamyyz[i] * Ayy[i] + Gamzyz[i] * Ayz[i]) - chiz[i]*Ayy[i]/chin1[i]; gyzz[i] = gyzz[i] - ( Gamxzz[i] * Axy[i] + Gamyzz[i] * Ayy[i] + Gamzzz[i] * Ayz[i]
gyzz[i] = gyzz[i] - ( Gamxzz[i] * Axy[i] + Gamyzz[i] * Ayy[i] + Gamzzz[i] * Ayz[i] + Gamxyz[i] * Axz[i] + Gamyyz[i] * Ayz[i] + Gamzyz[i] * Azz[i]) - chiz[i]*Ayz[i]/chin1[i];
+ Gamxyz[i] * Axz[i] + Gamyyz[i] * Ayz[i] + Gamzyz[i] * Azz[i]) - chiz[i]*Ayz[i]/chin1[i]; gzzz[i] = gzzz[i] - ( Gamxzz[i] * Axz[i] + Gamyzz[i] * Ayz[i] + Gamzzz[i] * Azz[i]
gzzz[i] = gzzz[i] - ( Gamxzz[i] * Axz[i] + Gamyzz[i] * Ayz[i] + Gamzzz[i] * Azz[i] + Gamxzz[i] * Axz[i] + Gamyzz[i] * Ayz[i] + Gamzzz[i] * Azz[i]) - chiz[i]*Azz[i]/chin1[i];
+ Gamxzz[i] * Axz[i] + Gamyzz[i] * Ayz[i] + Gamzzz[i] * Azz[i]) - chiz[i]*Azz[i]/chin1[i];
movx_Res[i] = gupxx[i]*gxxx[i] + gupyy[i]*gxyy[i] + gupzz[i]*gxzz[i] movx_Res[i] = gupxx[i]*gxxx[i] + gupyy[i]*gxyy[i] + gupzz[i]*gxzz[i]
+ gupxy[i]*gxyx[i] + gupxz[i]*gxzx[i] + gupyz[i]*gxzy[i] + gupxy[i]*gxyx[i] + gupxz[i]*gxzx[i] + gupyz[i]*gxzy[i]
+ gupxy[i]*gxxy[i] + gupxz[i]*gxxz[i] + gupyz[i]*gxyz[i]; + gupxy[i]*gxxy[i] + gupxz[i]*gxxz[i] + gupyz[i]*gxyz[i];
movy_Res[i] = gupxx[i]*gxyx[i] + gupyy[i]*gyyy[i] + gupzz[i]*gyzz[i] movy_Res[i] = gupxx[i]*gxyx[i] + gupyy[i]*gyyy[i] + gupzz[i]*gyzz[i]
+ gupxy[i]*gyyx[i] + gupxz[i]*gyzx[i] + gupyz[i]*gyzy[i] + gupxy[i]*gyyx[i] + gupxz[i]*gyzx[i] + gupyz[i]*gyzy[i]
+ gupxy[i]*gxyy[i] + gupxz[i]*gxyz[i] + gupyz[i]*gyyz[i]; + gupxy[i]*gxyy[i] + gupxz[i]*gxyz[i] + gupyz[i]*gyyz[i];
movz_Res[i] = gupxx[i]*gxzx[i] + gupyy[i]*gyzy[i] + gupzz[i]*gzzz[i] movz_Res[i] = gupxx[i]*gxzx[i] + gupyy[i]*gyzy[i] + gupzz[i]*gzzz[i]
+ gupxy[i]*gyzx[i] + gupxz[i]*gzzx[i] + gupyz[i]*gzzy[i] + gupxy[i]*gyzx[i] + gupxz[i]*gzzx[i] + gupyz[i]*gzzy[i]
+ gupxy[i]*gxzy[i] + gupxz[i]*gxzz[i] + gupyz[i]*gyzz[i]; + gupxy[i]*gxzy[i] + gupxz[i]*gxzz[i] + gupyz[i]*gyzz[i];
movx_Res[i] = movx_Res[i] - F2o3*Kx[i] - F8*PI*Sx[i]; movx_Res[i] = movx_Res[i] - F2o3*Kx[i] - F8*PI*Sx[i];
movy_Res[i] = movy_Res[i] - F2o3*Ky[i] - F8*PI*Sy[i]; movy_Res[i] = movy_Res[i] - F2o3*Ky[i] - F8*PI*Sy[i];
movz_Res[i] = movz_Res[i] - F2o3*Kz[i] - F8*PI*Sz[i]; movz_Res[i] = movz_Res[i] - F2o3*Kz[i] - F8*PI*Sz[i];
}
} }

View File

@@ -23,10 +23,14 @@ using namespace std;
#include <mpi.h> #include <mpi.h>
#include "macrodef.h" #include "macrodef.h"
#include "misc.h" #include "misc.h"
#include "cgh.h" #include "cgh.h"
#include "Parallel.h" #include "Parallel.h"
#include "parameters.h" #include "parameters.h"
#ifdef USE_GPU
#include "bssn_gpu.h"
#include "bssn_cuda_ops.h"
#endif
//================================================================================================ //================================================================================================
@@ -881,13 +885,20 @@ void cgh::recompose_cgh(int nprocs, bool *lev_flag,
tmPat = construct_patchlist(lev, Symmetry); tmPat = construct_patchlist(lev, Symmetry);
// tmPat construction completes // tmPat construction completes
Parallel::distribute(tmPat, nprocs, ingfs, fngfs, false); Parallel::distribute(tmPat, nprocs, ingfs, fngfs, false);
// checkPatchList(tmPat,true); // checkPatchList(tmPat,true);
bool CC = (lev > trfls); bool CC = (lev > trfls);
Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC); Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
Parallel::KillBlocks(PatL[lev]); #ifdef USE_GPU
PatL[lev]->destroyList(); bssn_gpu_clear_cached_device_buffers();
PatL[lev] = tmPat; bssn_gpu_release_pinned_host_buffers();
bssn_cuda_release_rk4_caches();
bssn_cuda_release_interp_caches();
patch_release_interp_plan_cache();
#endif
Parallel::KillBlocks(PatL[lev]);
PatL[lev]->destroyList();
PatL[lev] = tmPat;
#if (RPB == 1) #if (RPB == 1)
Parallel::destroypsuList_bam(bdsul[lev]); Parallel::destroypsuList_bam(bdsul[lev]);
Parallel::destroypsuList_bam(rsul[lev]); Parallel::destroypsuList_bam(rsul[lev]);
@@ -910,13 +921,20 @@ void cgh::recompose_cgh(int nprocs, bool *lev_flag,
tmPat = construct_patchlist(lev, Symmetry); tmPat = construct_patchlist(lev, Symmetry);
// tmPat construction completes // tmPat construction completes
Parallel::distribute(tmPat, end_rank[lev] - start_rank[lev] + 1, ingfs, fngfs, false, start_rank[lev], end_rank[lev]); Parallel::distribute(tmPat, end_rank[lev] - start_rank[lev] + 1, ingfs, fngfs, false, start_rank[lev], end_rank[lev]);
// checkPatchList(tmPat,true); // checkPatchList(tmPat,true);
bool CC = (lev > trfls); bool CC = (lev > trfls);
Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC); Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
Parallel::KillBlocks(PatL[lev]); #ifdef USE_GPU
PatL[lev]->destroyList(); bssn_gpu_clear_cached_device_buffers();
PatL[lev] = tmPat; bssn_gpu_release_pinned_host_buffers();
bssn_cuda_release_rk4_caches();
bssn_cuda_release_interp_caches();
patch_release_interp_plan_cache();
#endif
Parallel::KillBlocks(PatL[lev]);
PatL[lev]->destroyList();
PatL[lev] = tmPat;
#if (RPB == 1) #if (RPB == 1)
#error "not support yet" #error "not support yet"
#endif #endif
@@ -1518,13 +1536,20 @@ void cgh::recompose_cgh_Onelevel(int nprocs, int lev,
tmPat = construct_patchlist(lev, Symmetry); tmPat = construct_patchlist(lev, Symmetry);
// tmPat construction completes // tmPat construction completes
Parallel::distribute(tmPat, nprocs, ingfs, fngfs, false); Parallel::distribute(tmPat, nprocs, ingfs, fngfs, false);
// checkPatchList(tmPat,true); // checkPatchList(tmPat,true);
bool CC = (lev > trfls); bool CC = (lev > trfls);
Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC); Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
Parallel::KillBlocks(PatL[lev]); #ifdef USE_GPU
PatL[lev]->destroyList(); bssn_gpu_clear_cached_device_buffers();
PatL[lev] = tmPat; bssn_gpu_release_pinned_host_buffers();
bssn_cuda_release_rk4_caches();
bssn_cuda_release_interp_caches();
patch_release_interp_plan_cache();
#endif
Parallel::KillBlocks(PatL[lev]);
PatL[lev]->destroyList();
PatL[lev] = tmPat;
} }
#elif (PSTR == 1 || PSTR == 2 || PSTR == 3) #elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
#warning "recompose_cgh_Onelevel is not implimented yet" #warning "recompose_cgh_Onelevel is not implimented yet"
@@ -1540,14 +1565,21 @@ void cgh::recompose_cgh_Onelevel(int nprocs, int lev,
// tmPat construction completes // tmPat construction completes
Parallel::distribute(tmPat, end_rank[lev] - start_rank[lev] + 1, ingfs, fngfs, false, start_rank[lev], end_rank[lev]); Parallel::distribute(tmPat, end_rank[lev] - start_rank[lev] + 1, ingfs, fngfs, false, start_rank[lev], end_rank[lev]);
misc::tillherecheck(Commlev[lev], start_rank[lev], "after distribute"); misc::tillherecheck(Commlev[lev], start_rank[lev], "after distribute");
// checkPatchList(tmPat,true); // checkPatchList(tmPat,true);
bool CC = (lev > trfls); bool CC = (lev > trfls);
Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC); Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
misc::tillherecheck(Commlev[lev], start_rank[lev], "after fill_level_data"); misc::tillherecheck(Commlev[lev], start_rank[lev], "after fill_level_data");
Parallel::KillBlocks(PatL[lev]); #ifdef USE_GPU
PatL[lev]->destroyList(); bssn_gpu_clear_cached_device_buffers();
PatL[lev] = tmPat; bssn_gpu_release_pinned_host_buffers();
bssn_cuda_release_rk4_caches();
bssn_cuda_release_interp_caches();
patch_release_interp_plan_cache();
#endif
Parallel::KillBlocks(PatL[lev]);
PatL[lev]->destroyList();
PatL[lev] = tmPat;
} }

View File

@@ -30,8 +30,8 @@ CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \ f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG) -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
endif endif
.SUFFIXES: .o .f90 .C .for .cu .SUFFIXES: .o .f90 .C .for .cu
.f90.o: .f90.o:
$(f90) $(f90appflags) -c $< -o $@ $(f90) $(f90appflags) -c $< -o $@
@@ -64,8 +64,8 @@ lopsided_c.o: lopsided_c.C
lopsided_kodis_c.o: lopsided_kodis_c.C lopsided_kodis_c.o: lopsided_kodis_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h #interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ # ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS ## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
@@ -105,13 +105,12 @@ C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\ Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o
C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
cgh.o surface_integral.o ShellPatch.o\ cgh.o bssn_class.o surface_integral.o ShellPatch.o\
bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\ bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\ bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\ Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
NullShellPatch2_Evo.o \ NullShellPatch2_Evo.o bssn_cuda_step.o writefile_f.o
bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o
F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\ F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
prolongrestrict_cell.o prolongrestrict_vertex.o\ prolongrestrict_cell.o prolongrestrict_vertex.o\
@@ -143,7 +142,7 @@ initial_guess.o Newton.o Jacobian.o ilucg.o IntPnts0.o IntPnts.o
TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o
CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o CUDAFILES = bssn_gpu.o bssn_cuda_ops.o
# file dependences # file dependences
$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh $(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh

View File

@@ -9,6 +9,7 @@ filein = -I/usr/include/ -I${MKLROOT}/include
## Using sequential MKL (OpenMP disabled for better single-threaded performance) ## Using sequential MKL (OpenMP disabled for better single-threaded performance)
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library ## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5 LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
CUDA_LDLIBS = -L/usr/local/cuda-12.9/targets/x86_64-linux/lib -lcudart
## Memory allocator switch ## Memory allocator switch
## 1 (default) : link Intel oneTBB allocator (libtbbmalloc) ## 1 (default) : link Intel oneTBB allocator (libtbbmalloc)
@@ -24,6 +25,8 @@ ifeq ($(USE_TBBMALLOC),1)
LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS) LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
endif endif
LDLIBS := $(CUDA_LDLIBS) $(LDLIBS)
## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags) ## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
## opt : (default) maximum performance with PGO profile-guided optimization ## opt : (default) maximum performance with PGO profile-guided optimization
## instrument : PGO Phase 1 instrumentation to collect fresh profile data ## instrument : PGO Phase 1 instrumentation to collect fresh profile data

View File

@@ -180,19 +180,64 @@ surface_integral::surface_integral(int iSymmetry) : Symmetry(iSymmetry)
//|============================================================================ //|============================================================================
//| Destructor //| Destructor
//|============================================================================ //|============================================================================
surface_integral::~surface_integral() surface_integral::~surface_integral()
{ {
delete[] nx_g; release_cached_buffers();
delete[] ny_g; delete[] nx_g;
delete[] nz_g; delete[] ny_g;
delete[] arcostheta; delete[] nz_g;
#ifdef GaussInt delete[] arcostheta;
delete[] wtcostheta; #ifdef GaussInt
#endif delete[] wtcostheta;
} #endif
//|---------------------------------------------------------------- }
// spin weighted spinw component of psi4, general routine
// l takes from spinw to maxl; m takes from -l to l void surface_integral::get_surface_points(double rex, double **pox)
{
SpherePointCache &cache = sphere_point_cache[rex];
if (!cache.pox[0])
{
for (int i = 0; i < 3; ++i)
cache.pox[i] = new double[n_tot];
for (int n = 0; n < n_tot; ++n)
{
cache.pox[0][n] = rex * nx_g[n];
cache.pox[1][n] = rex * ny_g[n];
cache.pox[2][n] = rex * nz_g[n];
}
}
pox[0] = cache.pox[0];
pox[1] = cache.pox[1];
pox[2] = cache.pox[2];
}
double *surface_integral::get_shellf_buffer(int num_var)
{
double *&buffer = shellf_cache[num_var];
if (!buffer)
buffer = new double[n_tot * num_var];
return buffer;
}
void surface_integral::release_cached_buffers()
{
for (map<double, SpherePointCache>::iterator it = sphere_point_cache.begin(); it != sphere_point_cache.end(); ++it)
{
delete[] it->second.pox[0];
delete[] it->second.pox[1];
delete[] it->second.pox[2];
it->second.pox[0] = it->second.pox[1] = it->second.pox[2] = 0;
}
sphere_point_cache.clear();
for (map<int, double *>::iterator it = shellf_cache.begin(); it != shellf_cache.end(); ++it)
delete[] it->second;
shellf_cache.clear();
}
//|----------------------------------------------------------------
// spin weighted spinw component of psi4, general routine
// l takes from spinw to maxl; m takes from -l to l
//|---------------------------------------------------------------- //|----------------------------------------------------------------
void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4, void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,
int spinw, int maxl, int NN, double *RP, double *IP, int spinw, int maxl, int NN, double *RP, double *IP,
@@ -209,16 +254,9 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
MyList<var> *DG_List = new MyList<var>(Rpsi4); MyList<var> *DG_List = new MyList<var>(Rpsi4);
DG_List->insert(Ipsi4); DG_List->insert(Ipsi4);
int n; int n;
double *pox[3]; double *pox[3];
for (int i = 0; i < 3; i++) get_surface_points(rex, pox);
pox[i] = new double[n_tot];
for (n = 0; n < n_tot; n++)
{
pox[0][n] = rex * nx_g[n];
pox[1][n] = rex * ny_g[n];
pox[2][n] = rex * nz_g[n];
}
int mp, Lp, Nmin, Nmax; int mp, Lp, Nmin, Nmax;
mp = n_tot / cpusize; mp = n_tot / cpusize;
@@ -234,8 +272,7 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
Nmax = Nmin + mp - 1; Nmax = Nmin + mp - 1;
} }
double *shellf; double *shellf = get_shellf_buffer(InList);
shellf = new double[n_tot * InList];
GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax); GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);
@@ -375,14 +412,10 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
//|------= Free memory. //|------= Free memory.
delete[] pox[0]; delete[] RP_out;
delete[] pox[1]; delete[] IP_out;
delete[] pox[2]; DG_List->clearList();
delete[] shellf; }
delete[] RP_out;
delete[] IP_out;
DG_List->clearList();
}
void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4, void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,
int spinw, int maxl, int NN, double *RP, double *IP, int spinw, int maxl, int NN, double *RP, double *IP,
monitor *Monitor, MPI_Comm Comm_here) // NN is the length of RP and IP monitor *Monitor, MPI_Comm Comm_here) // NN is the length of RP and IP
@@ -402,19 +435,11 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
MyList<var> *DG_List = new MyList<var>(Rpsi4); MyList<var> *DG_List = new MyList<var>(Rpsi4);
DG_List->insert(Ipsi4); DG_List->insert(Ipsi4);
int n; int n;
double *pox[3]; double *pox[3];
for (int i = 0; i < 3; i++) get_surface_points(rex, pox);
pox[i] = new double[n_tot];
for (n = 0; n < n_tot; n++) double *shellf = get_shellf_buffer(InList);
{
pox[0][n] = rex * nx_g[n];
pox[1][n] = rex * ny_g[n];
pox[2][n] = rex * nz_g[n];
}
double *shellf;
shellf = new double[n_tot * InList];
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Interp_Points"); // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Interp_Points");
@@ -577,14 +602,10 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
//|------= Free memory. //|------= Free memory.
delete[] pox[0]; delete[] RP_out;
delete[] pox[1]; delete[] IP_out;
delete[] pox[2]; DG_List->clearList();
delete[] shellf; }
delete[] RP_out;
delete[] IP_out;
DG_List->clearList();
}
//|---------------------------------------------------------------- //|----------------------------------------------------------------
// for shell patch // for shell patch
//|---------------------------------------------------------------- //|----------------------------------------------------------------
@@ -597,19 +618,11 @@ void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH, var *Rpsi4
MyList<var> *DG_List = new MyList<var>(Rpsi4); MyList<var> *DG_List = new MyList<var>(Rpsi4);
DG_List->insert(Ipsi4); DG_List->insert(Ipsi4);
int n; int n;
double *pox[3]; double *pox[3];
for (int i = 0; i < 3; i++) get_surface_points(rex, pox);
pox[i] = new double[n_tot];
for (n = 0; n < n_tot; n++)
{
pox[0][n] = rex * nx_g[n];
pox[1][n] = rex * ny_g[n];
pox[2][n] = rex * nz_g[n];
}
double *shellf; double *shellf = get_shellf_buffer(InList);
shellf = new double[n_tot * InList];
GH->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry); GH->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry);
@@ -2570,12 +2583,8 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
Rout[5] = sy; Rout[5] = sy;
Rout[6] = sz; Rout[6] = sz;
delete[] pox[0]; DG_List->clearList();
delete[] pox[1]; }
delete[] pox[2];
delete[] shellf;
DG_List->clearList();
}
void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var *trK, void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz, var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz, var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
@@ -2637,19 +2646,11 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
DG_List->insert(Ayz); DG_List->insert(Ayz);
DG_List->insert(Azz); DG_List->insert(Azz);
int n; int n;
double *pox[3]; double *pox[3];
for (int i = 0; i < 3; i++) get_surface_points(rex, pox);
pox[i] = new double[n_tot];
for (n = 0; n < n_tot; n++) double *shellf = get_shellf_buffer(InList);
{
pox[0][n] = rex * nx_g[n];
pox[1][n] = rex * ny_g[n];
pox[2][n] = rex * nz_g[n];
}
double *shellf;
shellf = new double[n_tot * InList];
// we have assumed there is only one box on this level, // we have assumed there is only one box on this level,
// so we do not need loop boxes // so we do not need loop boxes
@@ -2839,12 +2840,8 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
Rout[5] = sy; Rout[5] = sy;
Rout[6] = sz; Rout[6] = sz;
delete[] pox[0]; DG_List->clearList();
delete[] pox[1]; }
delete[] pox[2];
delete[] shellf;
DG_List->clearList();
}
//|---------------------------------------------------------------- //|----------------------------------------------------------------
// for shell patch // for shell patch
//|---------------------------------------------------------------- //|----------------------------------------------------------------

View File

@@ -20,25 +20,41 @@ using namespace std;
#include "cgh.h" #include "cgh.h"
#include "ShellPatch.h" #include "ShellPatch.h"
#include "NullShellPatch.h" #include "NullShellPatch.h"
#include "NullShellPatch2.h" #include "NullShellPatch2.h"
#include "var.h" #include "var.h"
#include "monitor.h" #include "monitor.h"
#include <map>
class surface_integral class surface_integral
{ {
private: private:
int Symmetry, factor; struct SpherePointCache
int N_theta, N_phi; // Number of points in Theta & Phi directions {
double dphi, dcostheta; double *pox[3];
double *arcostheta, *wtcostheta; SpherePointCache()
int n_tot; // size of arrays {
pox[0] = pox[1] = pox[2] = 0;
double *nx_g, *ny_g, *nz_g; // global list of unit normals }
int myrank, cpusize; };
public: int Symmetry, factor;
surface_integral(int iSymmetry); int N_theta, N_phi; // Number of points in Theta & Phi directions
double dphi, dcostheta;
double *arcostheta, *wtcostheta;
int n_tot; // size of arrays
double *nx_g, *ny_g, *nz_g; // global list of unit normals
int myrank, cpusize;
map<double, SpherePointCache> sphere_point_cache;
map<int, double *> shellf_cache;
void get_surface_points(double rex, double **pox);
double *get_shellf_buffer(int num_var);
void release_cached_buffers();
public:
surface_integral(int iSymmetry);
~surface_integral(); ~surface_integral();
void surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4, void surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,

View File

@@ -9,6 +9,7 @@
import AMSS_NCKU_Input as input_data import AMSS_NCKU_Input as input_data
import os
import subprocess import subprocess
import time import time
@@ -57,6 +58,48 @@ BUILD_JOBS = 64
################################################################## ##################################################################
##################################################################
def prepare_gpu_runtime_env():
"""
Create a user-private CUDA MPS environment for GPU runs.
On shared machines another user's daemon may already occupy the default
/tmp/nvidia-mps pipe directory, which makes plain cudaSetDevice/cudaMalloc
fail with cudaErrorMpsConnectionFailed. Binding AMSS-NCKU to a private
pipe directory avoids cross-user interference.
"""
env = os.environ.copy()
pipe_dir = env.get("CUDA_MPS_PIPE_DIRECTORY", f"/tmp/amss-ncku-mps-{os.getuid()}")
log_dir = env.get("CUDA_MPS_LOG_DIRECTORY", f"/tmp/amss-ncku-mps-log-{os.getuid()}")
os.makedirs(pipe_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)
env["CUDA_MPS_PIPE_DIRECTORY"] = pipe_dir
env["CUDA_MPS_LOG_DIRECTORY"] = log_dir
control_socket = os.path.join(pipe_dir, "control")
if not os.path.exists(control_socket):
start = subprocess.run(
["nvidia-cuda-mps-control", "-d"],
env=env,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
if start.returncode != 0:
print(f" Warning: failed to start private CUDA MPS daemon in {pipe_dir}")
else:
print(f" Using private CUDA MPS pipe directory: {pipe_dir}")
else:
print(f" Using existing private CUDA MPS pipe directory: {pipe_dir}")
return env
##################################################################
################################################################## ##################################################################
@@ -146,16 +189,29 @@ def run_ABE():
## Define the command to run; cast other values to strings as needed ## Define the command to run; cast other values to strings as needed
run_env = None
if (input_data.GPU_Calculation == "no"): if (input_data.GPU_Calculation == "no"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
#mpi_command = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" #mpi_command = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
mpi_command_outfile = "ABE_out.log" mpi_command_outfile = "ABE_out.log"
elif (input_data.GPU_Calculation == "yes"): elif (input_data.GPU_Calculation == "yes"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU" run_env = prepare_gpu_runtime_env()
if int(input_data.MPI_processes) == 1:
mpi_command = "./ABEGPU"
else:
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
mpi_command_outfile = "ABEGPU_out.log" mpi_command_outfile = "ABEGPU_out.log"
## Execute the MPI command and stream output ## Execute the MPI command and stream output
mpi_process = subprocess.Popen(mpi_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) mpi_process = subprocess.Popen(
mpi_command,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
env=run_env,
)
## Write ABE run output to file while printing to stdout ## Write ABE run output to file while printing to stdout
with open(mpi_command_outfile, 'w') as file0: with open(mpi_command_outfile, 'w') as file0: