Compare commits

..

3 Commits

Author SHA1 Message Date
9c44d1c885 fix(bssn_rhs) 2026-03-03 16:00:45 +08:00
4b9de28feb 将 Restrict/Prolong 链路里的 coarse-level Sync_cached 改为可选(默认跳过)
OutBdLow2Hi_cached 读的是 coarse owned 区域(非 coarse ghost/buffer)
回退旧行为:编译时定义 RP_SYNC_COARSE_AFTER_RESTRICT=1
2026-03-03 14:25:27 +08:00
4eb5dc4ddb 删除重复的一次 chi 一阶导计算 2026-03-03 14:23:56 +08:00
22 changed files with 1725 additions and 7355 deletions

View File

@@ -23,20 +23,22 @@ using namespace std;
#include <mpi.h> #include <mpi.h>
#include "misc.h" #include "misc.h"
#include "macrodef.h" #include "macrodef.h"
#ifdef USE_GPU
extern void bssn_cuda_dump_stage_profile();
#endif
#ifndef ABEtype #ifndef ABEtype
#error "not define ABEtype" #error "not define ABEtype"
#endif #endif
#if (ABEtype == 0) #if (ABEtype == 0)
#include "bssn_class.h"
#ifdef USE_GPU
#elif (ABEtype == 1) #include "bssn_gpu_class.h"
#include "bssnEScalar_class.h" #else
#include "bssn_class.h"
#endif
#elif (ABEtype == 1)
#include "bssnEScalar_class.h"
#elif (ABEtype == 2) #elif (ABEtype == 2)
#include "Z4c_class.h" #include "Z4c_class.h"
@@ -472,13 +474,10 @@ int main(int argc, char *argv[])
cout << endl; cout << endl;
} }
ADM->Evolve(Steps); ADM->Evolve(Steps);
#ifdef USE_GPU
bssn_cuda_dump_stage_profile(); if (myrank == 0)
#endif {
if (myrank == 0)
{
cout << endl; cout << endl;
cout << " Total Evolve Time: " << MPI_Wtime() - End_clock << " seconds!" << endl; cout << " Total Evolve Time: " << MPI_Wtime() - End_clock << " seconds!" << endl;
cout << " Total Running Time: " << MPI_Wtime() - Begin_clock << " seconds!" << endl; cout << " Total Running Time: " << MPI_Wtime() - Begin_clock << " seconds!" << endl;

View File

@@ -9,12 +9,8 @@
#include <new> #include <new>
using namespace std; using namespace std;
#include "Block.h" #include "Block.h"
#include "misc.h" #include "misc.h"
#ifdef USE_GPU
#include "bssn_gpu.h"
#include "bssn_cuda_ops.h"
#endif
Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fngfsi, int levi, const int cgpui) : rank(ranki), ingfs(ingfsi), fngfs(fngfsi), lev(levi), cgpu(cgpui) Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fngfsi, int levi, const int cgpui) : rank(ranki), ingfs(ingfsi), fngfs(fngfsi), lev(levi), cgpu(cgpui)
{ {
@@ -99,19 +95,14 @@ Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fng
} }
#endif #endif
} }
Block::~Block() Block::~Block()
{ {
int myrank; int myrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
if (myrank == rank) if (myrank == rank)
{ {
#ifdef USE_GPU for (int i = 0; i < dim; i++)
bssn_gpu_clear_cached_device_buffers(); delete[] X[i];
bssn_cuda_release_rk4_caches();
bssn_cuda_release_interp_caches();
#endif
for (int i = 0; i < dim; i++)
delete[] X[i];
for (int i = 0; i < ingfs; i++) for (int i = 0; i < ingfs; i++)
free(igfs[i]); free(igfs[i]);
delete[] igfs; delete[] igfs;

View File

@@ -2,100 +2,29 @@
#include <iostream> #include <iostream>
#include <iomanip> #include <iomanip>
#include <fstream> #include <fstream>
#include <cstdlib> #include <cstdlib>
#include <cstdio> #include <cstdio>
#include <string> #include <string>
#include <cmath> #include <cmath>
#include <new> #include <new>
#include <map> #include <vector>
#include <vector> using namespace std;
using namespace std;
#include "misc.h" #include "misc.h"
#include "MPatch.h" #include "MPatch.h"
#include "Parallel.h" #include "Parallel.h"
#include "fmisc.h" #include "fmisc.h"
#include "bssn_cuda_ops.h" #ifdef INTERP_LB_PROFILE
#ifdef INTERP_LB_PROFILE #include "interp_lb_profile.h"
#include "interp_lb_profile.h" #endif
#endif
namespace
#if defined(__GNUC__) || defined(__clang__) {
extern int bssn_cuda_interp_points_batch(const int *ex, struct InterpBlockView
const double *X, const double *Y, const double *Z, {
const double *const *fields, Block *bp;
const double *soa_flat, double llb[dim];
int num_var, double uub[dim];
const double *px, const double *py, const double *pz,
int num_points,
int ordn,
int symmetry,
double *out) __attribute__((weak));
#endif
namespace
{
struct InterpVarDesc
{
int sgfn;
double soa[dim];
};
struct InterpPlanKey
{
const Patch *patch;
const double *x;
const double *y;
const double *z;
int NN;
int Symmetry;
int myrank;
};
struct InterpPlanKeyLess
{
bool operator()(const InterpPlanKey &lhs, const InterpPlanKey &rhs) const
{
if (lhs.patch != rhs.patch) return lhs.patch < rhs.patch;
if (lhs.x != rhs.x) return lhs.x < rhs.x;
if (lhs.y != rhs.y) return lhs.y < rhs.y;
if (lhs.z != rhs.z) return lhs.z < rhs.z;
if (lhs.NN != rhs.NN) return lhs.NN < rhs.NN;
if (lhs.Symmetry != rhs.Symmetry) return lhs.Symmetry < rhs.Symmetry;
return lhs.myrank < rhs.myrank;
}
};
struct CachedInterpPlan
{
int nblocks;
vector<int> owner_rank;
vector<int> owner_block;
vector<vector<int> > block_points;
vector<vector<double> > block_px;
vector<vector<double> > block_py;
vector<vector<double> > block_pz;
CachedInterpPlan() : nblocks(0) {}
};
struct CachedInterpPlanEntry
{
bool valid;
InterpPlanKey key;
vector<double> xvals;
vector<double> yvals;
vector<double> zvals;
CachedInterpPlan plan;
CachedInterpPlanEntry() : valid(false) {}
};
struct InterpBlockView
{
Block *bp;
double llb[dim];
double uub[dim];
}; };
struct BlockBinIndex struct BlockBinIndex
@@ -225,10 +154,10 @@ void build_block_bin_index(Patch *patch, const double *DH, BlockBinIndex &index)
index.valid = true; index.valid = true;
} }
int find_block_index_for_point(const BlockBinIndex &index, const double *pox, const double *DH) int find_block_index_for_point(const BlockBinIndex &index, const double *pox, const double *DH)
{ {
if (!index.valid) if (!index.valid)
return -1; return -1;
const int bx = coord_to_bin(pox[0], index.lo[0], index.inv[0], index.bins[0]); const int bx = coord_to_bin(pox[0], index.lo[0], index.inv[0], index.bins[0]);
const int by = coord_to_bin(pox[1], index.lo[1], index.inv[1], index.bins[1]); const int by = coord_to_bin(pox[1], index.lo[1], index.inv[1], index.bins[1]);
@@ -246,314 +175,13 @@ int find_block_index_for_point(const BlockBinIndex &index, const double *pox, co
for (size_t bi = 0; bi < index.views.size(); bi++) for (size_t bi = 0; bi < index.views.size(); bi++)
if (point_in_block_view(index.views[bi], pox, DH)) if (point_in_block_view(index.views[bi], pox, DH))
return int(bi); return int(bi);
return -1; return -1;
} }
} // namespace
void collect_interp_vars(MyList<var> *VarList, vector<InterpVarDesc> &vars)
{ Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
vars.clear(); {
MyList<var> *varl = VarList;
while (varl)
{
InterpVarDesc desc;
desc.sgfn = varl->data->sgfn;
for (int d = 0; d < dim; ++d)
desc.soa[d] = varl->data->SoA[d];
vars.push_back(desc);
varl = varl->next;
}
}
bool should_try_cuda_interp(int ordn, int num_points, int num_var)
{
#if defined(__GNUC__) || defined(__clang__)
if (!bssn_cuda_interp_points_batch)
return false;
#else
return false;
#endif
if (ordn != 6)
return false;
if (num_points < 32)
return false;
return num_points * num_var >= 256;
}
CachedInterpPlanEntry &interp_plan_cache_entry()
{
static CachedInterpPlanEntry cache;
return cache;
}
bool same_interp_plan_key(const InterpPlanKey &lhs, const InterpPlanKey &rhs)
{
return lhs.patch == rhs.patch &&
lhs.NN == rhs.NN &&
lhs.Symmetry == rhs.Symmetry &&
lhs.myrank == rhs.myrank;
}
bool same_interp_plan_points(const CachedInterpPlanEntry &cache, int NN, double **XX)
{
if (static_cast<int>(cache.xvals.size()) != NN ||
static_cast<int>(cache.yvals.size()) != NN ||
static_cast<int>(cache.zvals.size()) != NN)
return false;
for (int j = 0; j < NN; ++j)
{
if (cache.xvals[j] != XX[0][j] ||
cache.yvals[j] != XX[1][j] ||
cache.zvals[j] != XX[2][j])
return false;
}
return true;
}
CachedInterpPlan &get_cached_interp_plan(Patch *patch,
int NN, double **XX,
int Symmetry, int myrank,
const double *DH,
const BlockBinIndex &block_index,
bool report_bounds_here,
bool allow_missing_points)
{
InterpPlanKey key;
key.patch = patch;
key.x = XX[0];
key.y = XX[1];
key.z = XX[2];
key.NN = NN;
key.Symmetry = Symmetry;
key.myrank = myrank;
CachedInterpPlanEntry &cache = interp_plan_cache_entry();
if (cache.valid &&
same_interp_plan_key(cache.key, key) &&
same_interp_plan_points(cache, NN, XX) &&
cache.plan.nblocks == static_cast<int>(block_index.views.size()))
return cache.plan;
cache.valid = true;
cache.key = key;
cache.xvals.assign(XX[0], XX[0] + NN);
cache.yvals.assign(XX[1], XX[1] + NN);
cache.zvals.assign(XX[2], XX[2] + NN);
cache.plan = CachedInterpPlan();
CachedInterpPlan &plan = cache.plan;
plan.nblocks = static_cast<int>(block_index.views.size());
plan.owner_rank.assign(NN, -1);
plan.owner_block.assign(NN, -1);
plan.block_points.resize(plan.nblocks);
plan.block_px.resize(plan.nblocks);
plan.block_py.resize(plan.nblocks);
plan.block_pz.resize(plan.nblocks);
for (int j = 0; j < NN; ++j)
{
double pox[dim];
for (int i = 0; i < dim; ++i)
{
pox[i] = XX[i][j];
if (report_bounds_here &&
(XX[i][j] < patch->bbox[i] + patch->lli[i] * DH[i] ||
XX[i][j] > patch->bbox[dim + i] - patch->uui[i] * DH[i]))
{
cout << "Patch::Interp_Points: point (";
for (int k = 0; k < dim; ++k)
{
cout << XX[k][j];
if (k < dim - 1)
cout << ",";
else
cout << ") is out of current Patch." << endl;
}
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
const int block_i = find_block_index_for_point(block_index, pox, DH);
if (block_i >= 0)
{
Block *BP = block_index.views[block_i].bp;
plan.owner_rank[j] = BP->rank;
plan.owner_block[j] = block_i;
if (BP->rank == myrank)
{
plan.block_points[block_i].push_back(j);
plan.block_px[block_i].push_back(XX[0][j]);
plan.block_py[block_i].push_back(XX[1][j]);
plan.block_pz[block_i].push_back(XX[2][j]);
}
}
}
if (!allow_missing_points && report_bounds_here)
{
for (int j = 0; j < NN; ++j)
{
if (plan.owner_rank[j] >= 0)
continue;
cout << "ERROR: Patch::Interp_Points fails to find point (";
for (int d = 0; d < dim; ++d)
{
cout << XX[d][j];
if (d < dim - 1)
cout << ",";
else
cout << ")";
}
cout << " on Patch (";
for (int d = 0; d < dim; ++d)
{
cout << patch->bbox[d] << "+" << patch->lli[d] * DH[d];
if (d < dim - 1)
cout << ",";
else
cout << ")--";
}
cout << "(";
for (int d = 0; d < dim; ++d)
{
cout << patch->bbox[dim + d] << "-" << patch->uui[d] * DH[d];
if (d < dim - 1)
cout << ",";
else
cout << ")" << endl;
}
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
return plan;
}
void release_interp_plan_cache_internal()
{
CachedInterpPlanEntry &cache = interp_plan_cache_entry();
cache.valid = false;
cache.xvals.clear();
cache.yvals.clear();
cache.zvals.clear();
cache.plan = CachedInterpPlan();
}
bool run_cuda_interp_for_block(Block *BP,
const vector<InterpVarDesc> &vars,
const vector<int> &point_ids,
const vector<double> &px,
const vector<double> &py,
const vector<double> &pz,
double *Shellf,
int num_var,
int ordn,
int Symmetry)
{
if (!should_try_cuda_interp(ordn, static_cast<int>(point_ids.size()), num_var))
return false;
vector<const double *> field_ptrs(num_var);
vector<double> soa_flat(3 * num_var);
for (int v = 0; v < num_var; ++v)
{
field_ptrs[v] = BP->fgfs[vars[v].sgfn];
for (int d = 0; d < dim; ++d)
soa_flat[3 * v + d] = vars[v].soa[d];
}
const int npts = static_cast<int>(point_ids.size());
vector<double> out(static_cast<size_t>(npts) * static_cast<size_t>(num_var));
if (bssn_cuda_interp_points_batch(BP->shape,
BP->X[0], BP->X[1], BP->X[2],
field_ptrs.data(),
soa_flat.data(),
num_var,
px.data(), py.data(), pz.data(),
npts,
ordn,
Symmetry,
out.data()) != 0)
{
return false;
}
for (int p = 0; p < npts; ++p)
{
const int j = point_ids[p];
memcpy(Shellf + j * num_var, out.data() + p * num_var, sizeof(double) * num_var);
}
return true;
}
void run_cpu_interp_for_block(Block *BP,
const vector<InterpVarDesc> &vars,
const vector<int> &point_ids,
const vector<double> &px,
const vector<double> &py,
const vector<double> &pz,
double *Shellf,
int num_var,
int ordn,
int Symmetry)
{
for (size_t p = 0; p < point_ids.size(); ++p)
{
const int j = point_ids[p];
double x = px[p];
double y = py[p];
double z = pz[p];
int ordn_local = ordn;
int symmetry_local = Symmetry;
for (int v = 0; v < num_var; ++v)
{
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2],
BP->fgfs[vars[v].sgfn], Shellf[j * num_var + v],
x, y, z, ordn_local, const_cast<double *>(vars[v].soa), symmetry_local);
}
}
}
void interpolate_owned_points(MyList<var> *VarList,
double *Shellf, int Symmetry,
int ordn,
const BlockBinIndex &block_index,
const CachedInterpPlan &plan)
{
vector<InterpVarDesc> vars;
collect_interp_vars(VarList, vars);
const int num_var = static_cast<int>(vars.size());
for (size_t bi = 0; bi < plan.block_points.size(); ++bi)
{
if (plan.block_points[bi].empty())
continue;
Block *BP = block_index.views[bi].bp;
bool done = run_cuda_interp_for_block(BP, vars,
plan.block_points[bi],
plan.block_px[bi],
plan.block_py[bi],
plan.block_pz[bi],
Shellf, num_var, ordn, Symmetry);
if (!done)
run_cpu_interp_for_block(BP, vars,
plan.block_points[bi],
plan.block_px[bi],
plan.block_py[bi],
plan.block_pz[bi],
Shellf, num_var, ordn, Symmetry);
}
}
} // namespace
void patch_release_interp_plan_cache()
{
release_interp_plan_cache_internal();
}
Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
{
int hbuffer_width = buffer_width; int hbuffer_width = buffer_width;
if (lev == 0) if (lev == 0)
@@ -895,15 +523,60 @@ void Patch::Interp_Points(MyList<var> *VarList,
memset(Shellf, 0, sizeof(double) * NN * num_var); memset(Shellf, 0, sizeof(double) * NN * num_var);
double DH[dim]; // owner_rank[j] records which MPI rank owns point j
for (int i = 0; i < dim; i++) // All ranks traverse the same block list so they all agree on ownership
DH[i] = getdX(i); int *owner_rank;
BlockBinIndex block_index; owner_rank = new int[NN];
build_block_bin_index(this, DH, block_index); for (int j = 0; j < NN; j++)
CachedInterpPlan &plan = get_cached_interp_plan(this, NN, XX, Symmetry, myrank, DH, block_index, myrank == 0, false); owner_rank[j] = -1;
const int *owner_rank = plan.owner_rank.data();
double DH[dim];
interpolate_owned_points(VarList, Shellf, Symmetry, ordn, block_index, plan); for (int i = 0; i < dim; i++)
DH[i] = getdX(i);
BlockBinIndex block_index;
build_block_bin_index(this, DH, block_index);
for (int j = 0; j < NN; j++) // run along points
{
double pox[dim];
for (int i = 0; i < dim; i++)
{
pox[i] = XX[i][j];
if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
{
cout << "Patch::Interp_Points: point (";
for (int k = 0; k < dim; k++)
{
cout << XX[k][j];
if (k < dim - 1)
cout << ",";
else
cout << ") is out of current Patch." << endl;
}
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
const int block_i = find_block_index_for_point(block_index, pox, DH);
if (block_i >= 0)
{
Block *BP = block_index.views[block_i].bp;
owner_rank[j] = BP->rank;
if (myrank == BP->rank)
{
//---> interpolation
varl = VarList;
int k = 0;
while (varl) // run along variables
{
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next;
k++;
}
}
}
}
// Replace MPI_Allreduce with per-owner MPI_Bcast: // Replace MPI_Allreduce with per-owner MPI_Bcast:
// Group consecutive points by owner rank and broadcast each group. // Group consecutive points by owner rank and broadcast each group.
@@ -958,8 +631,9 @@ void Patch::Interp_Points(MyList<var> *VarList,
MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner, MPI_COMM_WORLD); MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner, MPI_COMM_WORLD);
} }
} }
} delete[] owner_rank;
}
void Patch::Interp_Points(MyList<var> *VarList, void Patch::Interp_Points(MyList<var> *VarList,
int NN, double **XX, int NN, double **XX,
double *Shellf, int Symmetry, double *Shellf, int Symmetry,
@@ -987,22 +661,102 @@ void Patch::Interp_Points(MyList<var> *VarList,
memset(Shellf, 0, sizeof(double) * NN * num_var); memset(Shellf, 0, sizeof(double) * NN * num_var);
double DH[dim]; // owner_rank[j] records which MPI rank owns point j
for (int i = 0; i < dim; i++) int *owner_rank;
DH[i] = getdX(i); owner_rank = new int[NN];
BlockBinIndex block_index; for (int j = 0; j < NN; j++)
build_block_bin_index(this, DH, block_index); owner_rank[j] = -1;
CachedInterpPlan &plan = get_cached_interp_plan(this, NN, XX, Symmetry, myrank, DH, block_index, myrank == 0, false);
const int *owner_rank = plan.owner_rank.data(); double DH[dim];
for (int i = 0; i < dim; i++)
interpolate_owned_points(VarList, Shellf, Symmetry, ordn, block_index, plan); DH[i] = getdX(i);
BlockBinIndex block_index;
build_block_bin_index(this, DH, block_index);
// --- Interpolation phase (identical to original) ---
for (int j = 0; j < NN; j++)
{
double pox[dim];
for (int i = 0; i < dim; i++)
{
pox[i] = XX[i][j];
if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
{
cout << "Patch::Interp_Points: point (";
for (int k = 0; k < dim; k++)
{
cout << XX[k][j];
if (k < dim - 1)
cout << ",";
else
cout << ") is out of current Patch." << endl;
}
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
const int block_i = find_block_index_for_point(block_index, pox, DH);
if (block_i >= 0)
{
Block *BP = block_index.views[block_i].bp;
owner_rank[j] = BP->rank;
if (myrank == BP->rank)
{
varl = VarList;
int k = 0;
while (varl)
{
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next;
k++;
}
}
}
}
#ifdef INTERP_LB_PROFILE #ifdef INTERP_LB_PROFILE
double t_interp_end = MPI_Wtime(); double t_interp_end = MPI_Wtime();
double t_interp_local = t_interp_end - t_interp_start; double t_interp_local = t_interp_end - t_interp_start;
#endif #endif
// --- Targeted point-to-point communication phase --- // --- Error check for unfound points ---
for (int j = 0; j < NN; j++)
{
if (owner_rank[j] < 0 && myrank == 0)
{
cout << "ERROR: Patch::Interp_Points fails to find point (";
for (int d = 0; d < dim; d++)
{
cout << XX[d][j];
if (d < dim - 1)
cout << ",";
else
cout << ")";
}
cout << " on Patch (";
for (int d = 0; d < dim; d++)
{
cout << bbox[d] << "+" << lli[d] * DH[d];
if (d < dim - 1)
cout << ",";
else
cout << ")--";
}
cout << "(";
for (int d = 0; d < dim; d++)
{
cout << bbox[dim + d] << "-" << uui[d] * DH[d];
if (d < dim - 1)
cout << ",";
else
cout << ")" << endl;
}
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
// --- Targeted point-to-point communication phase ---
// Compute consumer_rank[j] using the same deterministic formula as surface_integral // Compute consumer_rank[j] using the same deterministic formula as surface_integral
int *consumer_rank = new int[NN]; int *consumer_rank = new int[NN];
{ {
@@ -1119,8 +873,9 @@ void Patch::Interp_Points(MyList<var> *VarList,
delete[] send_offset; delete[] send_offset;
delete[] recv_offset; delete[] recv_offset;
delete[] send_count; delete[] send_count;
delete[] recv_count; delete[] recv_count;
delete[] consumer_rank; delete[] consumer_rank;
delete[] owner_rank;
#ifdef INTERP_LB_PROFILE #ifdef INTERP_LB_PROFILE
{ {
@@ -1168,20 +923,64 @@ void Patch::Interp_Points(MyList<var> *VarList,
memset(Shellf, 0, sizeof(double) * NN * num_var); memset(Shellf, 0, sizeof(double) * NN * num_var);
// Build global-to-local rank translation for Comm_here // owner_rank[j] stores the global rank that owns point j
MPI_Group world_group, local_group; int *owner_rank;
MPI_Comm_group(MPI_COMM_WORLD, &world_group); owner_rank = new int[NN];
MPI_Comm_group(Comm_here, &local_group); for (int j = 0; j < NN; j++)
owner_rank[j] = -1;
double DH[dim]; // Build global-to-local rank translation for Comm_here
for (int i = 0; i < dim; i++) MPI_Group world_group, local_group;
DH[i] = getdX(i); MPI_Comm_group(MPI_COMM_WORLD, &world_group);
BlockBinIndex block_index; MPI_Comm_group(Comm_here, &local_group);
build_block_bin_index(this, DH, block_index);
CachedInterpPlan &plan = get_cached_interp_plan(this, NN, XX, Symmetry, myrank, DH, block_index, lmyrank == 0, true); double DH[dim];
const int *owner_rank = plan.owner_rank.data(); for (int i = 0; i < dim; i++)
DH[i] = getdX(i);
interpolate_owned_points(VarList, Shellf, Symmetry, ordn, block_index, plan); BlockBinIndex block_index;
build_block_bin_index(this, DH, block_index);
for (int j = 0; j < NN; j++) // run along points
{
double pox[dim];
for (int i = 0; i < dim; i++)
{
pox[i] = XX[i][j];
if (lmyrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
{
cout << "Patch::Interp_Points: point (";
for (int k = 0; k < dim; k++)
{
cout << XX[k][j];
if (k < dim - 1)
cout << ",";
else
cout << ") is out of current Patch." << endl;
}
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
const int block_i = find_block_index_for_point(block_index, pox, DH);
if (block_i >= 0)
{
Block *BP = block_index.views[block_i].bp;
owner_rank[j] = BP->rank;
if (myrank == BP->rank)
{
//---> interpolation
varl = VarList;
int k = 0;
while (varl) // run along variables
{
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next;
k++;
}
}
}
}
// Collect unique global owner ranks and translate to local ranks in Comm_here // Collect unique global owner ranks and translate to local ranks in Comm_here
// Then broadcast each owner's points via MPI_Bcast on Comm_here // Then broadcast each owner's points via MPI_Bcast on Comm_here
@@ -1209,9 +1008,10 @@ void Patch::Interp_Points(MyList<var> *VarList,
} }
} }
MPI_Group_free(&world_group); MPI_Group_free(&world_group);
MPI_Group_free(&local_group); MPI_Group_free(&local_group);
} delete[] owner_rank;
}
void Patch::checkBlock() void Patch::checkBlock()
{ {
int myrank; int myrank;

View File

@@ -8,7 +8,7 @@
#include "var.h" #include "var.h"
#include "macrodef.h" //need dim here; Vertex or Cell; ghost_width #include "macrodef.h" //need dim here; Vertex or Cell; ghost_width
class Patch class Patch
{ {
public: public:
@@ -50,8 +50,6 @@ public:
double *Shellf, int Symmetry, MPI_Comm Comm_here); double *Shellf, int Symmetry, MPI_Comm Comm_here);
void Find_Maximum(MyList<var> *VarList, double *XX, void Find_Maximum(MyList<var> *VarList, double *XX,
double *Shellf, MPI_Comm Comm_here); double *Shellf, MPI_Comm Comm_here);
}; };
void patch_release_interp_plan_cache(); #endif /* PATCH_H */
#endif /* PATCH_H */

File diff suppressed because it is too large Load Diff

View File

@@ -89,12 +89,9 @@ namespace Parallel
void transfermix(MyList<gridseg> **src, MyList<gridseg> **dst, void transfermix(MyList<gridseg> **src, MyList<gridseg> **dst,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */, MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
int Symmetry); int Symmetry);
void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry); void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry);
void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry, const char *context); void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry); void Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, const char *context);
void Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
void Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, const char *context);
struct SyncCache { struct SyncCache {
bool valid; bool valid;
@@ -108,13 +105,12 @@ namespace Parallel
int *send_buf_caps; int *send_buf_caps;
int *recv_buf_caps; int *recv_buf_caps;
MPI_Request *reqs; MPI_Request *reqs;
MPI_Status *stats; MPI_Status *stats;
int max_reqs; int max_reqs;
bool lengths_valid; bool lengths_valid;
int lengths_var_count; int *tc_req_node;
int *tc_req_node; int *tc_req_is_recv;
int *tc_req_is_recv; int *tc_completed;
int *tc_completed;
SyncCache(); SyncCache();
void invalidate(); void invalidate();
void destroy(); void destroy();
@@ -125,20 +121,19 @@ namespace Parallel
MyList<var> *VarList1, MyList<var> *VarList2, MyList<var> *VarList1, MyList<var> *VarList2,
int Symmetry, SyncCache &cache); int Symmetry, SyncCache &cache);
struct AsyncSyncState { struct AsyncSyncState {
int req_no; int req_no;
bool active; bool active;
int mpi_tag; int *req_node;
int *req_node; int *req_is_recv;
int *req_is_recv; int pending_recv;
int pending_recv; AsyncSyncState() : req_no(0), active(false), req_node(0), req_is_recv(0), pending_recv(0) {}
AsyncSyncState() : req_no(0), active(false), mpi_tag(0), req_node(0), req_is_recv(0), pending_recv(0) {} };
};
void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
SyncCache &cache, AsyncSyncState &state); SyncCache &cache, AsyncSyncState &state);
void Sync_finish(SyncCache &cache, AsyncSyncState &state, void Sync_finish(SyncCache &cache, AsyncSyncState &state,
MyList<var> *VarList, int Symmetry, bool unpack_to_host = true); MyList<var> *VarList, int Symmetry);
void OutBdLow2Hi(Patch *Patc, Patch *Patf, void OutBdLow2Hi(Patch *Patc, Patch *Patf,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */, MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
int Symmetry); int Symmetry);

View File

@@ -14,8 +14,7 @@ using namespace std;
#include <string.h> #include <string.h>
#endif #endif
#include <time.h> #include <time.h>
#include <unistd.h>
#include "macrodef.h" #include "macrodef.h"
#include "misc.h" #include "misc.h"
@@ -29,30 +28,20 @@ using namespace std;
#include "rungekutta4_rout.h" #include "rungekutta4_rout.h"
#include "sommerfeld_rout.h" #include "sommerfeld_rout.h"
#include "getnp4.h" #include "getnp4.h"
#include "shellfunctions.h" #include "shellfunctions.h"
#include "parameters.h" #include "parameters.h"
#ifdef USE_GPU
#include "bssn_macro.h"
#include "bssn_gpu.h"
#endif
#ifdef With_AHF #ifdef With_AHF
#include "derivatives.h" #include "derivatives.h"
#include "myglobal.h" #include "myglobal.h"
#endif #endif
#include "perf.h" #include "perf.h"
#include "derivatives.h" #include "derivatives.h"
#include "ricci_gamma.h" #include "ricci_gamma.h"
// Compile-time switch for per-timestep memory usage collection/printing. //================================================================================================
// Default is OFF to reduce overhead in production runs.
#ifndef BSSN_ENABLE_MEM_USAGE_LOG
#define BSSN_ENABLE_MEM_USAGE_LOG 0
#endif
//================================================================================================
// define bssn_class // define bssn_class
@@ -745,12 +734,11 @@ void bssn_class::Initialize()
// Initialize sync caches (per-level, for predictor and corrector) // Initialize sync caches (per-level, for predictor and corrector)
sync_cache_pre = new Parallel::SyncCache[GH->levels]; sync_cache_pre = new Parallel::SyncCache[GH->levels];
sync_cache_cor = new Parallel::SyncCache[GH->levels]; sync_cache_cor = new Parallel::SyncCache[GH->levels];
sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels]; sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
sync_cache_rp_fine = new Parallel::SyncCache[GH->levels]; sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
sync_cache_restrict = new Parallel::SyncCache[GH->levels]; sync_cache_restrict = new Parallel::SyncCache[GH->levels];
sync_cache_outbd = new Parallel::SyncCache[GH->levels]; sync_cache_outbd = new Parallel::SyncCache[GH->levels];
sync_cache_psi4 = new Parallel::SyncCache[GH->levels]; }
}
//================================================================================================ //================================================================================================
@@ -762,8 +750,8 @@ void bssn_class::Initialize()
//================================================================================================ //================================================================================================
bssn_class::~bssn_class() bssn_class::~bssn_class()
{ {
#ifdef With_AHF #ifdef With_AHF
AHList->clearList(); AHList->clearList();
AHDList->clearList(); AHDList->clearList();
@@ -1020,30 +1008,12 @@ bssn_class::~bssn_class()
sync_cache_rp_coarse[i].destroy(); sync_cache_rp_coarse[i].destroy();
delete[] sync_cache_rp_coarse; delete[] sync_cache_rp_coarse;
} }
if (sync_cache_rp_fine) if (sync_cache_rp_fine)
{ {
for (int i = 0; i < GH->levels; i++) for (int i = 0; i < GH->levels; i++)
sync_cache_rp_fine[i].destroy(); sync_cache_rp_fine[i].destroy();
delete[] sync_cache_rp_fine; delete[] sync_cache_rp_fine;
} }
if (sync_cache_restrict)
{
for (int i = 0; i < GH->levels; i++)
sync_cache_restrict[i].destroy();
delete[] sync_cache_restrict;
}
if (sync_cache_outbd)
{
for (int i = 0; i < GH->levels; i++)
sync_cache_outbd[i].destroy();
delete[] sync_cache_outbd;
}
if (sync_cache_psi4)
{
for (int i = 0; i < GH->levels; i++)
sync_cache_psi4[i].destroy();
delete[] sync_cache_psi4;
}
delete GH; delete GH;
#ifdef WithShell #ifdef WithShell
@@ -1076,25 +1046,8 @@ bssn_class::~bssn_class()
delete ConVMonitor; delete ConVMonitor;
delete Waveshell; delete Waveshell;
delete CheckPoint; delete CheckPoint;
} }
void bssn_class::InvalidateSyncCaches()
{
if (!GH)
return;
for (int il = 0; il < GH->levels; il++)
{
sync_cache_pre[il].invalidate();
sync_cache_cor[il].invalidate();
sync_cache_rp_coarse[il].invalidate();
sync_cache_rp_fine[il].invalidate();
sync_cache_restrict[il].invalidate();
sync_cache_outbd[il].invalidate();
sync_cache_psi4[il].invalidate();
}
}
//================================================================================================ //================================================================================================
@@ -2077,10 +2030,9 @@ void bssn_class::Read_Ansorg()
void bssn_class::Evolve(int Steps) void bssn_class::Evolve(int Steps)
{ {
clock_t prev_clock, curr_clock; clock_t prev_clock, curr_clock;
double LastDump = 0.0, LastCheck = 0.0, Last2dDump = 0.0; double LastDump = 0.0, LastCheck = 0.0, Last2dDump = 0.0;
LastAnas = 0; LastAnas = 0;
LastConsOut = 0;
#if 0 #if 0
//initial checkpoint for special uasge //initial checkpoint for special uasge
{ {
@@ -2177,10 +2129,8 @@ void bssn_class::Evolve(int Steps)
#endif #endif
*/ */
#if BSSN_ENABLE_MEM_USAGE_LOG perf bssn_perf;
perf bssn_perf; size_t current_min, current_avg, current_max, peak_min, peak_avg, peak_max;
size_t current_min, current_avg, current_max, peak_min, peak_avg, peak_max;
#endif
for (int lev = 0; lev < GH->levels; lev++) for (int lev = 0; lev < GH->levels; lev++)
GH->Lt[lev] = PhysTime; GH->Lt[lev] = PhysTime;
@@ -2265,7 +2215,7 @@ void bssn_class::Evolve(int Steps)
GH->Regrid(Symmetry, BH_num, Porgbr, Porg0, GH->Regrid(Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor); fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor);
InvalidateSyncCaches(); for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
#endif #endif
#if (REGLEV == 0 && (PSTR == 1 || PSTR == 2)) #if (REGLEV == 0 && (PSTR == 1 || PSTR == 2))
@@ -2274,23 +2224,21 @@ void bssn_class::Evolve(int Steps)
// fgt(PhysTime-dT_mon,StartTime,dT_mon/2),ErrorMonitor); // fgt(PhysTime-dT_mon,StartTime,dT_mon/2),ErrorMonitor);
#endif #endif
#if BSSN_ENABLE_MEM_USAGE_LOG // Retrieve memory usage information used during computation; master process prints it
// Retrieve memory usage information used during computation; master process prints it bssn_perf.MemoryUsage(&current_min, &current_avg, &current_max,
bssn_perf.MemoryUsage(&current_min, &current_avg, &current_max, &peak_min, &peak_avg, &peak_max, nprocs);
&peak_min, &peak_avg, &peak_max, nprocs); if (myrank == 0)
if (myrank == 0) {
{ printf(" Memory usage: current %0.4lg/%0.4lg/%0.4lgMB, "
printf(" Memory usage: current %0.4lg/%0.4lg/%0.4lgMB, " "peak %0.4lg/%0.4lg/%0.4lgMB\n",
"peak %0.4lg/%0.4lg/%0.4lgMB\n", (double)current_min / (1024.0 * 1024.0),
(double)current_min / (1024.0 * 1024.0), (double)current_avg / (1024.0 * 1024.0),
(double)current_avg / (1024.0 * 1024.0), (double)current_max / (1024.0 * 1024.0),
(double)current_max / (1024.0 * 1024.0), (double)peak_min / (1024.0 * 1024.0),
(double)peak_min / (1024.0 * 1024.0), (double)peak_avg / (1024.0 * 1024.0),
(double)peak_avg / (1024.0 * 1024.0), (double)peak_max / (1024.0 * 1024.0));
(double)peak_max / (1024.0 * 1024.0)); cout << endl;
cout << endl; }
}
#endif
// Output puncture positions at each step // Output puncture positions at each step
if (myrank == 0) if (myrank == 0)
@@ -2338,21 +2286,18 @@ void bssn_class::Evolve(int Steps)
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
// When LastCheck >= CheckTime, perform runtime checks and output status data // When LastCheck >= CheckTime, perform runtime checks and output status data
if (LastCheck >= CheckTime) if (LastCheck >= CheckTime)
{ {
LastCheck = 0; LastCheck = 0;
CheckPoint->write_Black_Hole_position(BH_num_input, BH_num, Porg0, Porgbr, Mass); CheckPoint->write_Black_Hole_position(BH_num_input, BH_num, Porg0, Porgbr, Mass);
CheckPoint->writecheck_cgh(PhysTime, GH); CheckPoint->writecheck_cgh(PhysTime, GH);
#ifdef WithShell #ifdef WithShell
CheckPoint->writecheck_sh(PhysTime, SH); CheckPoint->writecheck_sh(PhysTime, SH);
#endif #endif
CheckPoint->write_bssn(LastDump, Last2dDump, LastAnas); CheckPoint->write_bssn(LastDump, Last2dDump, LastAnas);
} }
}
// Keep output/analysis phases aligned across ranks before the next coarse step.
MPI_Barrier(MPI_COMM_WORLD);
}
/* /*
#ifdef With_AHF #ifdef With_AHF
// final apparent horizon finding // final apparent horizon finding
@@ -2486,7 +2431,7 @@ void bssn_class::RecursiveStep(int lev)
if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
InvalidateSyncCaches(); for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
#endif #endif
} }
@@ -2665,7 +2610,7 @@ void bssn_class::ParallelStep()
if (GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0, if (GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
InvalidateSyncCaches(); for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
#endif #endif
} }
@@ -2832,7 +2777,7 @@ void bssn_class::ParallelStep()
if (GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0, if (GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor)) fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor))
InvalidateSyncCaches(); for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
// a_stream.clear(); // a_stream.clear();
// a_stream.str(""); // a_stream.str("");
@@ -2847,7 +2792,7 @@ void bssn_class::ParallelStep()
if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
InvalidateSyncCaches(); for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
// a_stream.clear(); // a_stream.clear();
// a_stream.str(""); // a_stream.str("");
@@ -2866,7 +2811,7 @@ void bssn_class::ParallelStep()
if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor)) fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
InvalidateSyncCaches(); for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
// a_stream.clear(); // a_stream.clear();
// a_stream.str(""); // a_stream.str("");
@@ -2882,7 +2827,7 @@ void bssn_class::ParallelStep()
if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor)) fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
InvalidateSyncCaches(); for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
// a_stream.clear(); // a_stream.clear();
// a_stream.str(""); // a_stream.str("");
@@ -3071,14 +3016,9 @@ void bssn_class::RecursiveStep(int lev, int num) // in all 2^(lev+1)-1 steps
#if (PSTR == 0) #if (PSTR == 0)
#if 1 #if 1
void bssn_class::Step(int lev, int YN) void bssn_class::Step(int lev, int YN)
{ {
#ifdef USE_GPU setpbh(BH_num, Porg0, Mass, BH_num_input);
Step_MainPath_GPU(lev, YN);
return;
#endif
setpbh(BH_num, Porg0, Mass, BH_num_input);
double dT_lev = dT * pow(0.5, Mymax(lev, trfls)); double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
@@ -5805,11 +5745,17 @@ void bssn_class::SHStep()
//================================================================================================ //================================================================================================
// 0: do not use mixing two levels data for OutBD; 1: do use // 0: do not use mixing two levels data for OutBD; 1: do use
#define MIXOUTB 0 #define MIXOUTB 0
void bssn_class::RestrictProlong(int lev, int YN, bool BB, // In the cached Restrict->OutBdLow2Hi path, coarse Sync is usually redundant:
MyList<var> *SL, MyList<var> *OL, MyList<var> *corL) // OutBdLow2Hi_cached reads coarse owned cells (build_owned_gsl type-4), not coarse ghost/buffer cells.
// Keep a switch to restore the old behavior if needed for debugging.
#ifndef RP_SYNC_COARSE_AFTER_RESTRICT
#define RP_SYNC_COARSE_AFTER_RESTRICT 0
#endif
void bssn_class::RestrictProlong(int lev, int YN, bool BB,
MyList<var> *SL, MyList<var> *OL, MyList<var> *corL)
// we assume // we assume
// StateList 1 ----------- // StateList 1 -----------
// //
@@ -5871,7 +5817,9 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
// misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str()); // misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
#endif #endif
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]); #if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
#endif
#if (PSTR == 1 || PSTR == 2) #if (PSTR == 1 || PSTR == 2)
// a_stream.clear(); // a_stream.clear();
@@ -5922,7 +5870,9 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
// misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str()); // misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
#endif #endif
Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]); #if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]);
#endif
#if (PSTR == 1 || PSTR == 2) #if (PSTR == 1 || PSTR == 2)
// a_stream.clear(); // a_stream.clear();
@@ -6008,7 +5958,9 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
#endif #endif
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]); #if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
#endif
#if (RPB == 0) #if (RPB == 0)
#if (MIXOUTB == 0) #if (MIXOUTB == 0)
@@ -6030,7 +5982,9 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
#endif #endif
Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]); #if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]);
#endif
#if (RPB == 0) #if (RPB == 0)
#if (MIXOUTB == 0) #if (MIXOUTB == 0)
@@ -6095,7 +6049,9 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, GH->rsul[lev], Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, GH->rsul[lev], Symmetry);
#endif #endif
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]); #if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
#endif
#if (RPB == 0) #if (RPB == 0)
#if (MIXOUTB == 0) #if (MIXOUTB == 0)
@@ -6119,7 +6075,9 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, GH->rsul[lev], Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, GH->rsul[lev], Symmetry);
#endif #endif
Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]); #if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
#endif
#if (RPB == 0) #if (RPB == 0)
#if (MIXOUTB == 0) #if (MIXOUTB == 0)
@@ -6143,7 +6101,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
//================================================================================================ //================================================================================================
void bssn_class::ProlongRestrict(int lev, int YN, bool BB) void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
{ {
if (lev > 0) if (lev > 0)
{ {
@@ -6196,15 +6154,18 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
#else #else
Parallel::Restrict_after(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry); Parallel::Restrict_after(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry);
#endif #endif
Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]); #if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
#endif
} }
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]); Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
} }
} }
#undef MIXOUTB #undef MIXOUTB
#undef RP_SYNC_COARSE_AFTER_RESTRICT
//================================================================================================
//================================================================================================
@@ -6298,7 +6259,7 @@ for(int ilev = GH->levels-1;ilev>=lev;ilev--)
for(int ilev=GH->levels-1;ilev>lev;ilev--) for(int ilev=GH->levels-1;ilev>lev;ilev--)
RestrictProlong(ilev,1,false,DG_List,DG_List,DG_List); RestrictProlong(ilev,1,false,DG_List,DG_List,DG_List);
#else #else
Parallel::Sync_cached(GH->PatL[lev], DG_List, Symmetry, sync_cache_psi4[lev]); Parallel::Sync(GH->PatL[lev], DG_List, Symmetry);
#endif #endif
#ifdef WithShell #ifdef WithShell
@@ -6953,10 +6914,10 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev)
{ {
LastAnas += dT_lev; LastAnas += dT_lev;
if (LastAnas >= AnasTime) if (LastAnas >= AnasTime)
{ {
#ifdef Point_Psi4 #ifdef Point_Psi4
#error "not support parallel levels yet" #error "not support parallel levels yet"
// Gam_ijk and R_ij have been calculated in Interp_Constraint() // Gam_ijk and R_ij have been calculated in Interp_Constraint()
double SYM = 1, ANT = -1; double SYM = 1, ANT = -1;
for (int levh = lev; levh < GH->levels; levh++) for (int levh = lev; levh < GH->levels; levh++)
@@ -7300,9 +7261,9 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev)
//================================================================================================ //================================================================================================
void bssn_class::Constraint_Out() void bssn_class::Constraint_Out()
{ {
LastConsOut += dT * pow(0.5, Mymax(0, trfls)); LastConsOut += dT * pow(0.5, Mymax(0, trfls));
if (LastConsOut >= AnasTime) if (LastConsOut >= AnasTime)
// Constraint violation // Constraint violation
@@ -7322,15 +7283,12 @@ void bssn_class::Constraint_Out()
MyList<Block> *BP = Pp->data->blb; MyList<Block> *BP = Pp->data->blb;
while (BP) while (BP)
{ {
Block *cg = BP->data; Block *cg = BP->data;
if (myrank == cg->rank) if (myrank == cg->rank)
{ {
#ifdef USE_GPU f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
gpu_rhs(CALLED_BY_CONSTRAINT_CONS_ONLY, myrank, RHS_PARA_CALLED_Constraint_Out); cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
#else cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn], cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn],
@@ -7358,12 +7316,11 @@ void bssn_class::Constraint_Out()
cg->fgfs[Gamzyy->sgfn], cg->fgfs[Gamzyz->sgfn], cg->fgfs[Gamzzz->sgfn], cg->fgfs[Gamzyy->sgfn], cg->fgfs[Gamzyz->sgfn], cg->fgfs[Gamzzz->sgfn],
cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn], cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn],
cg->fgfs[Ryy->sgfn], cg->fgfs[Ryz->sgfn], cg->fgfs[Rzz->sgfn], cg->fgfs[Ryy->sgfn], cg->fgfs[Ryz->sgfn], cg->fgfs[Rzz->sgfn],
cg->fgfs[Cons_Ham->sgfn], cg->fgfs[Cons_Ham->sgfn],
cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn], cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn], cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
Symmetry, lev, ndeps, pre); Symmetry, lev, ndeps, pre);
#endif }
}
if (BP == Pp->data->ble) if (BP == Pp->data->ble)
break; break;
BP = BP->next; BP = BP->next;
@@ -7371,7 +7328,7 @@ void bssn_class::Constraint_Out()
Pp = Pp->next; Pp = Pp->next;
} }
} }
Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry, "bssn_class::Constraint_Out[level]"); Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry);
} }
#ifdef WithShell #ifdef WithShell
if (0) // if the constrait quantities can be reused from the step rhs calculation if (0) // if the constrait quantities can be reused from the step rhs calculation
@@ -7593,7 +7550,7 @@ void bssn_class::AH_Prepare_derivatives()
} }
Pp = Pp->next; Pp = Pp->next;
} }
Parallel::Sync(GH->PatL[lev], AHDList, Symmetry, "bssn_class::AH_Prepare_derivatives"); Parallel::Sync(GH->PatL[lev], AHDList, Symmetry);
} }
} }
@@ -7829,15 +7786,12 @@ void bssn_class::Interp_Constraint(bool infg)
MyList<Block> *BP = Pp->data->blb; MyList<Block> *BP = Pp->data->blb;
while (BP) while (BP)
{ {
Block *cg = BP->data; Block *cg = BP->data;
if (myrank == cg->rank) if (myrank == cg->rank)
{ {
#ifdef USE_GPU f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
gpu_rhs(CALLED_BY_CONSTRAINT_CONS_ONLY, myrank, RHS_PARA_CALLED_Interp_Constraint); cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
#else cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn], cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn],
@@ -7865,12 +7819,11 @@ void bssn_class::Interp_Constraint(bool infg)
cg->fgfs[Gamzyy->sgfn], cg->fgfs[Gamzyz->sgfn], cg->fgfs[Gamzzz->sgfn], cg->fgfs[Gamzyy->sgfn], cg->fgfs[Gamzyz->sgfn], cg->fgfs[Gamzzz->sgfn],
cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn], cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn],
cg->fgfs[Ryy->sgfn], cg->fgfs[Ryz->sgfn], cg->fgfs[Rzz->sgfn], cg->fgfs[Ryy->sgfn], cg->fgfs[Ryz->sgfn], cg->fgfs[Rzz->sgfn],
cg->fgfs[Cons_Ham->sgfn], cg->fgfs[Cons_Ham->sgfn],
cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn], cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn], cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
Symmetry, lev, ndeps, pre); Symmetry, lev, ndeps, pre);
#endif }
}
if (BP == Pp->data->ble) if (BP == Pp->data->ble)
break; break;
BP = BP->next; BP = BP->next;
@@ -7878,7 +7831,7 @@ void bssn_class::Interp_Constraint(bool infg)
Pp = Pp->next; Pp = Pp->next;
} }
} }
Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry, "bssn_class::Interp_Constraint[level]"); Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry);
} }
#ifdef WithShell #ifdef WithShell
if (0) // if the constrait quantities can be reused from the step rhs calculation if (0) // if the constrait quantities can be reused from the step rhs calculation
@@ -8136,7 +8089,7 @@ void bssn_class::Compute_Constraint()
Pp = Pp->next; Pp = Pp->next;
} }
} }
Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry, "bssn_class::Compute_Constraint[level]"); Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry);
} }
// prolong restrict constraint quantities // prolong restrict constraint quantities
for (lev = GH->levels - 1; lev > 0; lev--) for (lev = GH->levels - 1; lev > 0; lev--)
@@ -8449,18 +8402,12 @@ void bssn_class::Enforce_algcon(int lev, int fg)
//================================================================================================ //================================================================================================
bool bssn_class::check_Stdin_Abort() bool bssn_class::check_Stdin_Abort()
{ {
// Non-interactive launches (mpirun via Python/subprocess, batch jobs, redirected stdin)
// should not probe stdin. Some MPI runtimes treat stdin as a managed channel and can fd_set readfds;
// fail when rank 0 polls/consumes it.
if (!isatty(STDIN_FILENO)) { struct timeval timeout;
return false;
}
fd_set readfds;
struct timeval timeout;
FD_ZERO(&readfds); FD_ZERO(&readfds);
FD_SET(STDIN_FILENO, &readfds); FD_SET(STDIN_FILENO, &readfds);
@@ -8469,17 +8416,14 @@ bool bssn_class::check_Stdin_Abort()
timeout.tv_sec = 0; timeout.tv_sec = 0;
timeout.tv_usec = 0; timeout.tv_usec = 0;
int activity = select(STDIN_FILENO + 1, &readfds, nullptr, nullptr, &timeout); int activity = select(STDIN_FILENO + 1, &readfds, nullptr, nullptr, &timeout);
if (activity <= 0) {
return false; if (activity > 0 && FD_ISSET(STDIN_FILENO, &readfds)) {
} string input_abort;
if (cin >> input_abort) {
if (FD_ISSET(STDIN_FILENO, &readfds)) { if (input_abort == "stop") {
string input_abort; return true;
if (cin >> input_abort) { }
if (input_abort == "stop") {
return true;
}
} }
} }

View File

@@ -128,11 +128,10 @@ public:
Parallel::SyncCache *sync_cache_pre; // per-level cache for predictor sync Parallel::SyncCache *sync_cache_pre; // per-level cache for predictor sync
Parallel::SyncCache *sync_cache_cor; // per-level cache for corrector sync Parallel::SyncCache *sync_cache_cor; // per-level cache for corrector sync
Parallel::SyncCache *sync_cache_rp_coarse; // RestrictProlong sync on PatL[lev-1] Parallel::SyncCache *sync_cache_rp_coarse; // RestrictProlong sync on PatL[lev-1]
Parallel::SyncCache *sync_cache_rp_fine; // RestrictProlong sync on PatL[lev] Parallel::SyncCache *sync_cache_rp_fine; // RestrictProlong sync on PatL[lev]
Parallel::SyncCache *sync_cache_restrict; // cached Restrict in RestrictProlong Parallel::SyncCache *sync_cache_restrict; // cached Restrict in RestrictProlong
Parallel::SyncCache *sync_cache_outbd; // cached OutBdLow2Hi in RestrictProlong Parallel::SyncCache *sync_cache_outbd; // cached OutBdLow2Hi in RestrictProlong
Parallel::SyncCache *sync_cache_psi4; // cached Psi4 sync on PatL[lev]
monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor; monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
monitor *ConVMonitor; monitor *ConVMonitor;
@@ -172,20 +171,16 @@ public:
bool check_Stdin_Abort(); bool check_Stdin_Abort();
virtual void Setup_Initial_Data_Cao(); virtual void Setup_Initial_Data_Cao();
virtual void Setup_Initial_Data_Lousto(); virtual void Setup_Initial_Data_Lousto();
virtual void Initialize(); virtual void Initialize();
virtual void Read_Ansorg(); virtual void Read_Ansorg();
virtual void Read_Pablo() {}; virtual void Read_Pablo() {};
void InvalidateSyncCaches(); virtual void Compute_Psi4(int lev);
virtual void Compute_Psi4(int lev); virtual void Step(int lev, int YN);
virtual void Step(int lev, int YN); virtual void Interp_Constraint(bool infg);
#ifdef USE_GPU virtual void Constraint_Out();
void Step_MainPath_GPU(int lev, int YN); virtual void Compute_Constraint();
#endif
virtual void Interp_Constraint(bool infg);
virtual void Constraint_Out();
virtual void Compute_Constraint();
#ifdef With_AHF #ifdef With_AHF
protected: protected:

File diff suppressed because it is too large Load Diff

View File

@@ -1,68 +0,0 @@
#ifndef BSSN_CUDA_OPS_H
#define BSSN_CUDA_OPS_H
int bssn_cuda_enforce_ga(int *ex,
double *dxx, double *gxy, double *gxz,
double *dyy, double *gyz, double *dzz,
double *Axx, double *Axy, double *Axz,
double *Ayy, double *Ayz, double *Azz);
int bssn_cuda_rk4_boundary_var(int *ex, double dT,
const double *X, const double *Y, const double *Z,
double xmin, double ymin, double zmin,
double xmax, double ymax, double zmax,
const double *state0,
const double *phi_field,
const double *lap_field,
const double *boundary_src,
double *stage_data,
double *rhs_accum,
double propspeed,
const double SoA[3],
int symmetry,
int lev,
int rk_stage,
bool force_host_boundary_fix,
bool download_to_host = true);
int bssn_cuda_rk4_boundary_batch(int *ex, double dT,
const double *X, const double *Y, const double *Z,
double xmin, double ymin, double zmin,
double xmax, double ymax, double zmax,
int symmetry,
const double *const *state0_list,
double *const *stage_data_list,
double *const *rhs_accum_list,
int num_var,
int rk_stage,
bool download_to_host = false);
int bssn_cuda_lowerbound(int *ex, double *chi, double tinny, bool download_to_host = true);
int bssn_cuda_download_buffer(int *ex, double *host_ptr);
void bssn_cuda_release_rk4_caches();
void bssn_cuda_release_interp_caches();
int bssn_cuda_prolong3_pack(int wei,
const double *llbc, const double *uubc, const int *extc, const double *func,
const double *llbf, const double *uubf, const int *extf, double *funf,
const double *llbp, const double *uubp,
const double *SoA, int symmetry);
int bssn_cuda_restrict3_pack(int wei,
const double *llbc, const double *uubc, const int *extc, double *func,
const double *llbf, const double *uubf, const int *extf, const double *funf,
const double *llbr, const double *uubr,
const double *SoA, int symmetry);
int bssn_cuda_interp_points_batch(const int *ex,
const double *X, const double *Y, const double *Z,
const double *const *fields,
const double *soa_flat,
int num_var,
const double *px, const double *py, const double *pz,
int num_points,
int ordn,
int symmetry,
double *out);
#endif

View File

@@ -1,936 +0,0 @@
#include "macrodef.h"
#ifdef USE_GPU
#include <algorithm>
#include <cmath>
#include <cstring>
#include <cstdlib>
#include <iomanip>
#include <vector>
#include "bssn_class.h"
#include "bssn_cuda_ops.h"
#include "bssn_gpu.h"
#include "bssn_macro.h"
namespace
{
enum StageProfileMetric
{
STAGE_PROFILE_TOTAL = 0,
STAGE_PROFILE_RHS,
STAGE_PROFILE_RUN_STAGE,
STAGE_PROFILE_RUN_STAGE_DEVICE,
STAGE_PROFILE_RUN_STAGE_HOST_FIX,
STAGE_PROFILE_LOWERBOUND,
STAGE_PROFILE_ENSURE,
STAGE_PROFILE_DOWNLOAD,
STAGE_PROFILE_CLEAR_CACHE,
STAGE_PROFILE_SYNC_START,
STAGE_PROFILE_SYNC_FINISH,
STAGE_PROFILE_REFRESH,
STAGE_PROFILE_COUNT
};
static const int kStageProfileMaxLevels = 32;
struct StageProfileStore
{
bool env_checked;
bool enabled;
int calls[kStageProfileMaxLevels];
double metric[kStageProfileMaxLevels][STAGE_PROFILE_COUNT];
};
StageProfileStore &stage_profile_store()
{
static StageProfileStore store = {};
return store;
}
bool stage_profile_enabled()
{
StageProfileStore &store = stage_profile_store();
if (!store.env_checked)
{
const char *env = getenv("AMSS_GPU_STAGE_TIMING");
store.enabled = (env && env[0] && strcmp(env, "0") != 0);
store.env_checked = true;
}
return store.enabled;
}
void stage_profile_note_call(int lev)
{
if (lev >= 0 && lev < kStageProfileMaxLevels)
stage_profile_store().calls[lev]++;
}
void stage_profile_add(int lev, StageProfileMetric metric, double seconds)
{
if (lev >= 0 && lev < kStageProfileMaxLevels)
stage_profile_store().metric[lev][metric] += seconds;
}
const char *stage_profile_metric_name(StageProfileMetric metric)
{
switch (metric)
{
case STAGE_PROFILE_TOTAL:
return "total";
case STAGE_PROFILE_RHS:
return "rhs";
case STAGE_PROFILE_RUN_STAGE:
return "run_stage";
case STAGE_PROFILE_RUN_STAGE_DEVICE:
return "run_stage_dev";
case STAGE_PROFILE_RUN_STAGE_HOST_FIX:
return "run_stage_host";
case STAGE_PROFILE_LOWERBOUND:
return "lower";
case STAGE_PROFILE_ENSURE:
return "ensure";
case STAGE_PROFILE_DOWNLOAD:
return "download";
case STAGE_PROFILE_CLEAR_CACHE:
return "clear_cache";
case STAGE_PROFILE_SYNC_START:
return "sync_start";
case STAGE_PROFILE_SYNC_FINISH:
return "sync_finish";
case STAGE_PROFILE_REFRESH:
return "refresh";
default:
return "unknown";
}
}
} // namespace
void bssn_cuda_dump_stage_profile()
{
if (!stage_profile_enabled())
return;
int myrank = 0;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
StageProfileStore &store = stage_profile_store();
int global_calls_sum[kStageProfileMaxLevels] = {};
double global_metric_sum[kStageProfileMaxLevels][STAGE_PROFILE_COUNT] = {};
double global_metric_max[kStageProfileMaxLevels][STAGE_PROFILE_COUNT] = {};
MPI_Reduce(store.calls, global_calls_sum, kStageProfileMaxLevels, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
MPI_Reduce(store.metric[0], global_metric_sum[0],
kStageProfileMaxLevels * STAGE_PROFILE_COUNT,
MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
MPI_Reduce(store.metric[0], global_metric_max[0],
kStageProfileMaxLevels * STAGE_PROFILE_COUNT,
MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
if (myrank != 0)
return;
cout << endl;
cout << " GPU stage timing summary (sum/max over MPI ranks) " << endl;
cout << " lev calls";
for (int metric = 0; metric < STAGE_PROFILE_COUNT; ++metric)
cout << " " << setw(22) << stage_profile_metric_name(static_cast<StageProfileMetric>(metric));
cout << endl;
for (int lev = 0; lev < kStageProfileMaxLevels; ++lev)
{
if (global_calls_sum[lev] == 0)
continue;
cout << setw(4) << lev << " " << setw(5) << global_calls_sum[lev];
for (int metric = 0; metric < STAGE_PROFILE_COUNT; ++metric)
{
cout << " "
<< setw(10) << setprecision(6) << fixed << global_metric_sum[lev][metric]
<< "/"
<< setw(10) << setprecision(6) << fixed << global_metric_max[lev][metric];
}
cout << endl;
}
cout << endl;
}
void bssn_class::Step_MainPath_GPU(int lev, int YN)
{
#ifdef WithShell
#error "Step_MainPath_GPU currently supports Patch grids only."
#endif
const bool profile_enabled = stage_profile_enabled();
const double step_total_begin = profile_enabled ? MPI_Wtime() : 0.0;
if (profile_enabled)
stage_profile_note_call(lev);
if (bssn_gpu_bind_process_device(myrank))
{
cerr << "GPU device bind failure on MPI rank " << myrank << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
if (profile_enabled)
{
const double t0 = MPI_Wtime();
bssn_gpu_clear_cached_device_buffers();
stage_profile_add(lev, STAGE_PROFILE_CLEAR_CACHE, MPI_Wtime() - t0);
}
else
bssn_gpu_clear_cached_device_buffers();
setpbh(BH_num, Porg0, Mass, BH_num_input);
const double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
#if (MAPBH == 1)
if (BH_num > 0 && lev == GH->levels - 1)
{
compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
for (int ithBH = 0; ithBH < BH_num; ithBH++)
{
for (int ith = 0; ith < 3; ith++)
Porg1[ithBH][ith] = Porg0[ithBH][ith] + Porg_rhs[ithBH][ith] * dT_lev;
if (Symmetry > 0)
Porg1[ithBH][2] = fabs(Porg1[ithBH][2]);
if (Symmetry == 2)
{
Porg1[ithBH][0] = fabs(Porg1[ithBH][0]);
Porg1[ithBH][1] = fabs(Porg1[ithBH][1]);
}
}
}
if (lev == a_lev)
AnalysisStuff(lev, dT_lev);
#endif
#ifdef With_AHF
AH_Step_Find(lev, dT_lev);
#endif
const bool BB = fgt(PhysTime, StartTime, dT_lev / 2);
(void)BB;
double ndeps = (lev < GH->movls) ? numepsb : numepss;
double TRK4 = PhysTime;
int iter_count = 0;
int pre = 0, cor = 1;
int ERROR = 0;
const bool keep_stage_sync_on_device = (RPS == 1) && (MAPBH == 1) && (REGLEV == 0);
auto run_stage_on_block =
[&](Block *cg, Patch *patch, MyList<var> *state0_list,
MyList<var> *boundary_src_list, MyList<var> *stage_data_list,
MyList<var> *rhs_list, int rk_stage) {
MyList<var> *varl0 = state0_list;
MyList<var> *varlb = boundary_src_list;
MyList<var> *varls = stage_data_list;
MyList<var> *varlr = rhs_list;
std::vector<const double *> batch_state0;
std::vector<double *> batch_stage;
std::vector<double *> batch_rhs;
while (varl0)
{
const bool force_host_boundary_fix = false;
const bool can_batch_device_path = (lev > 0) && !force_host_boundary_fix;
if (can_batch_device_path)
{
batch_state0.push_back(cg->fgfs[varl0->data->sgfn]);
batch_stage.push_back(cg->fgfs[varls->data->sgfn]);
batch_rhs.push_back(cg->fgfs[varlr->data->sgfn]);
varl0 = varl0->next;
varlb = varlb->next;
varls = varls->next;
varlr = varlr->next;
continue;
}
const double var_begin = profile_enabled ? MPI_Wtime() : 0.0;
if (bssn_cuda_rk4_boundary_var(cg->shape, dT_lev,
cg->X[0], cg->X[1], cg->X[2],
patch->bbox[0], patch->bbox[1], patch->bbox[2],
patch->bbox[3], patch->bbox[4], patch->bbox[5],
cg->fgfs[varl0->data->sgfn],
cg->fgfs[phi0->sgfn],
cg->fgfs[Lap0->sgfn],
cg->fgfs[varlb->data->sgfn],
cg->fgfs[varls->data->sgfn],
cg->fgfs[varlr->data->sgfn],
varl0->data->propspeed,
varl0->data->SoA,
Symmetry, lev, rk_stage,
force_host_boundary_fix, false))
{
cerr << "GPU rk4/boundary failure: lev=" << lev
<< " rk_stage=" << rk_stage
<< " var=" << varl0->data->name
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
break;
}
if (profile_enabled)
{
stage_profile_add(lev,
force_host_boundary_fix ? STAGE_PROFILE_RUN_STAGE_HOST_FIX
: STAGE_PROFILE_RUN_STAGE_DEVICE,
MPI_Wtime() - var_begin);
}
varl0 = varl0->next;
varlb = varlb->next;
varls = varls->next;
varlr = varlr->next;
}
if (!ERROR && !batch_state0.empty())
{
const double batch_begin = profile_enabled ? MPI_Wtime() : 0.0;
if (bssn_cuda_rk4_boundary_batch(cg->shape, dT_lev,
cg->X[0], cg->X[1], cg->X[2],
patch->bbox[0], patch->bbox[1], patch->bbox[2],
patch->bbox[3], patch->bbox[4], patch->bbox[5],
Symmetry,
&batch_state0[0],
&batch_stage[0],
&batch_rhs[0],
static_cast<int>(batch_state0.size()),
rk_stage, false))
{
cerr << "GPU rk4/boundary batch failure: lev=" << lev
<< " rk_stage=" << rk_stage
<< " vars=" << batch_state0.size()
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
}
else if (profile_enabled)
{
stage_profile_add(lev, STAGE_PROFILE_RUN_STAGE_DEVICE, MPI_Wtime() - batch_begin);
}
}
};
auto stage_download_var_list =
[&](Block *cg, MyList<var> *var_list, bool skip_unmapped) {
std::vector<double *> batch_host_ptrs;
std::vector<MyList<var> *> batch_vars;
while (var_list)
{
double *host_ptr = cg->fgfs[var_list->data->sgfn];
if (skip_unmapped && !bssn_gpu_find_device_buffer(host_ptr))
{
var_list = var_list->next;
continue;
}
batch_host_ptrs.push_back(host_ptr);
batch_vars.push_back(var_list);
var_list = var_list->next;
}
if (!batch_host_ptrs.empty() &&
bssn_gpu_download_buffer_batch(cg->shape, &batch_host_ptrs[0],
static_cast<int>(batch_host_ptrs.size())))
{
for (size_t i = 0; i < batch_host_ptrs.size(); ++i)
{
if (bssn_cuda_download_buffer(cg->shape, batch_host_ptrs[i]))
{
cerr << "GPU stage download failure: lev=" << lev
<< " var=" << batch_vars[i]->data->name
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
break;
}
}
}
};
auto stage_download_patch_list =
[&](MyList<var> *var_list, bool skip_unmapped) {
MyList<Patch> *patch_it = GH->PatL[lev];
while (patch_it)
{
MyList<Block> *block_it = patch_it->data->blb;
while (block_it)
{
Block *cg = block_it->data;
if (myrank == cg->rank)
stage_download_var_list(cg, var_list, skip_unmapped);
if (block_it == patch_it->data->ble)
break;
block_it = block_it->next;
}
if (ERROR)
break;
patch_it = patch_it->next;
}
};
auto ensure_stage_device_var_list =
[&](Block *cg, MyList<var> *var_list) {
const int n = cg->shape[0] * cg->shape[1] * cg->shape[2];
while (var_list)
{
double *host_ptr = cg->fgfs[var_list->data->sgfn];
if (!bssn_gpu_find_device_buffer(host_ptr) &&
bssn_gpu_stage_upload_buffer(host_ptr, n))
{
cerr << "GPU state ensure failure: lev=" << lev
<< " var=" << var_list->data->name
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
break;
}
var_list = var_list->next;
}
};
auto refresh_synced_device_regions =
[&](Block *cg, MyList<var> *var_list, Parallel::SyncCache &cache) {
std::vector<Parallel::gridseg *> local_segments;
for (int node = 0; node < cache.cpusize; ++node)
{
MyList<Parallel::gridseg> *seg = cache.combined_dst[node];
while (seg)
{
if (seg->data && seg->data->Bg == cg)
local_segments.push_back(seg->data);
seg = seg->next;
}
}
if (local_segments.empty())
return;
const int n = cg->shape[0] * cg->shape[1] * cg->shape[2];
while (var_list)
{
double *host_ptr = cg->fgfs[var_list->data->sgfn];
if (!bssn_gpu_find_device_buffer(host_ptr))
{
if (bssn_gpu_stage_upload_buffer(host_ptr, n))
{
cerr << "GPU sync refresh upload failure: lev=" << lev
<< " var=" << var_list->data->name
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
break;
}
}
else
{
for (size_t i = 0; i < local_segments.size(); ++i)
{
Parallel::gridseg *seg = local_segments[i];
if (bssn_gpu_stage_upload_region(host_ptr,
cg->shape,
cg->bbox,
cg->bbox + dim,
seg->shape,
seg->llb))
{
cerr << "GPU sync region refresh failure: lev=" << lev
<< " var=" << var_list->data->name
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
break;
}
}
if (ERROR)
break;
}
var_list = var_list->next;
}
};
auto refresh_stage_device_after_sync =
[&](MyList<var> *var_list, Parallel::SyncCache &cache) {
MyList<Patch> *patch_it = GH->PatL[lev];
while (patch_it)
{
MyList<Block> *block_it = patch_it->data->blb;
while (block_it)
{
Block *cg = block_it->data;
if (myrank == cg->rank)
refresh_synced_device_regions(cg, var_list, cache);
if (block_it == patch_it->data->ble)
break;
block_it = block_it->next;
}
if (ERROR)
break;
patch_it = patch_it->next;
}
};
auto refresh_stage_host_before_sync =
[&](MyList<var> *var_list, Parallel::SyncCache &cache) -> bool {
if (!cache.valid || !cache.combined_src || myrank < 0 || myrank >= cache.cpusize)
return false;
MyList<Patch> *patch_it = GH->PatL[lev];
while (patch_it)
{
MyList<Block> *block_it = patch_it->data->blb;
while (block_it)
{
Block *cg = block_it->data;
if (myrank == cg->rank)
{
std::vector<Parallel::gridseg *> local_segments;
MyList<Parallel::gridseg> *seg = cache.combined_src[myrank];
while (seg)
{
if (seg->data && seg->data->Bg == cg)
local_segments.push_back(seg->data);
seg = seg->next;
}
if (!local_segments.empty())
{
MyList<var> *var_it = var_list;
while (var_it)
{
double *host_ptr = cg->fgfs[var_it->data->sgfn];
for (size_t i = 0; i < local_segments.size(); ++i)
{
Parallel::gridseg *src_seg = local_segments[i];
if (bssn_gpu_stage_download_region(host_ptr,
cg->shape,
cg->bbox,
cg->bbox + dim,
src_seg->shape,
src_seg->llb))
{
cerr << "GPU sync region download failure: lev=" << lev
<< " var=" << var_it->data->name
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
return true;
}
}
var_it = var_it->next;
}
}
}
if (block_it == patch_it->data->ble)
break;
block_it = block_it->next;
}
patch_it = patch_it->next;
}
return true;
};
auto can_pack_sync_from_device =
[&](MyList<var> *var_list, Parallel::SyncCache &cache) -> bool {
if (!cache.valid || !cache.combined_src || myrank < 0 || myrank >= cache.cpusize)
return false;
MyList<Parallel::gridseg> *seg = cache.combined_src[myrank];
while (seg)
{
MyList<var> *var_it = var_list;
while (var_it)
{
if (!bssn_gpu_find_device_buffer(seg->data->Bg->fgfs[var_it->data->sgfn]))
return false;
var_it = var_it->next;
}
seg = seg->next;
}
return true;
};
MyList<Patch> *Pp = GH->PatL[lev];
while (Pp)
{
MyList<Block> *BP = Pp->data->blb;
while (BP)
{
Block *cg = BP->data;
if (myrank == cg->rank)
{
double t0 = 0.0;
if (profile_enabled)
t0 = MPI_Wtime();
if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_FIRST_TIME))
ERROR = 1;
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_RHS, MPI_Wtime() - t0);
if (profile_enabled)
t0 = MPI_Wtime();
run_stage_on_block(cg, Pp->data, StateList, StateList, SynchList_pre, RHSList, iter_count);
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_RUN_STAGE, MPI_Wtime() - t0);
if (profile_enabled)
t0 = MPI_Wtime();
if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi->sgfn], chitiny, false))
{
cerr << "GPU lowerbound failure: lev=" << lev
<< " rk_stage=" << iter_count
<< " var=" << phi->name
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
}
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_LOWERBOUND, MPI_Wtime() - t0);
}
if (BP == Pp->data->ble)
break;
BP = BP->next;
}
Pp = Pp->next;
}
if (!ERROR)
{
if (!keep_stage_sync_on_device)
{
double t0 = 0.0;
if (profile_enabled)
t0 = MPI_Wtime();
stage_download_patch_list(SynchList_pre, false);
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_DOWNLOAD, MPI_Wtime() - t0);
if (!ERROR)
{
if (profile_enabled)
t0 = MPI_Wtime();
bssn_gpu_clear_cached_device_buffers();
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_CLEAR_CACHE, MPI_Wtime() - t0);
}
}
}
MPI_Request err_req_pre;
{
int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_pre);
}
Parallel::AsyncSyncState async_pre;
if (profile_enabled)
{
const double t0 = MPI_Wtime();
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
stage_profile_add(lev, STAGE_PROFILE_SYNC_START, MPI_Wtime() - t0);
}
else
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
if (profile_enabled)
{
const double t0 = MPI_Wtime();
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry,
!keep_stage_sync_on_device);
stage_profile_add(lev, STAGE_PROFILE_SYNC_FINISH, MPI_Wtime() - t0);
}
else
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry,
!keep_stage_sync_on_device);
if (!ERROR && !keep_stage_sync_on_device)
{
if (profile_enabled)
{
const double t0 = MPI_Wtime();
refresh_stage_device_after_sync(SynchList_pre, sync_cache_pre[lev]);
stage_profile_add(lev, STAGE_PROFILE_REFRESH, MPI_Wtime() - t0);
}
else
refresh_stage_device_after_sync(SynchList_pre, sync_cache_pre[lev]);
}
MPI_Wait(&err_req_pre, MPI_STATUS_IGNORE);
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#if (MAPBH == 0)
if (BH_num > 0 && lev == GH->levels - 1)
{
compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
for (int ithBH = 0; ithBH < BH_num; ithBH++)
{
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg[ithBH][0], Porg_rhs[ithBH][0], iter_count);
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg[ithBH][1], Porg_rhs[ithBH][1], iter_count);
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg[ithBH][2], Porg_rhs[ithBH][2], iter_count);
if (Symmetry > 0)
Porg[ithBH][2] = fabs(Porg[ithBH][2]);
if (Symmetry == 2)
{
Porg[ithBH][0] = fabs(Porg[ithBH][0]);
Porg[ithBH][1] = fabs(Porg[ithBH][1]);
}
}
}
if (lev == a_lev)
AnalysisStuff(lev, dT_lev);
#endif
for (iter_count = 1; iter_count < 4; iter_count++)
{
if (iter_count == 1 || iter_count == 3)
TRK4 += dT_lev / 2;
Pp = GH->PatL[lev];
while (Pp)
{
MyList<Block> *BP = Pp->data->blb;
while (BP)
{
Block *cg = BP->data;
if (myrank == cg->rank)
{
double t0 = 0.0;
if (profile_enabled)
t0 = MPI_Wtime();
ensure_stage_device_var_list(cg, SynchList_pre);
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_ENSURE, MPI_Wtime() - t0);
if (profile_enabled)
t0 = MPI_Wtime();
if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_THEN))
ERROR = 1;
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_RHS, MPI_Wtime() - t0);
if (profile_enabled)
t0 = MPI_Wtime();
run_stage_on_block(cg, Pp->data, StateList, SynchList_pre, SynchList_cor, RHSList, iter_count);
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_RUN_STAGE, MPI_Wtime() - t0);
if (profile_enabled)
t0 = MPI_Wtime();
if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi1->sgfn], chitiny, false))
{
cerr << "GPU lowerbound failure: lev=" << lev
<< " rk_stage=" << iter_count
<< " var=" << phi1->name
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
}
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_LOWERBOUND, MPI_Wtime() - t0);
}
if (BP == Pp->data->ble)
break;
BP = BP->next;
}
Pp = Pp->next;
}
if (!ERROR)
{
if (!keep_stage_sync_on_device)
{
double t0 = 0.0;
if (profile_enabled)
t0 = MPI_Wtime();
stage_download_patch_list(SynchList_cor, false);
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_DOWNLOAD, MPI_Wtime() - t0);
if (!ERROR)
{
if (profile_enabled)
t0 = MPI_Wtime();
bssn_gpu_clear_cached_device_buffers();
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_CLEAR_CACHE, MPI_Wtime() - t0);
}
}
}
MPI_Request err_req_cor;
{
int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
}
Parallel::AsyncSyncState async_cor;
if (profile_enabled)
{
const double t0 = MPI_Wtime();
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
stage_profile_add(lev, STAGE_PROFILE_SYNC_START, MPI_Wtime() - t0);
}
else
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
if (profile_enabled)
{
const double t0 = MPI_Wtime();
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry,
!keep_stage_sync_on_device);
stage_profile_add(lev, STAGE_PROFILE_SYNC_FINISH, MPI_Wtime() - t0);
}
else
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry,
!keep_stage_sync_on_device);
if (!ERROR && !keep_stage_sync_on_device && iter_count < 3)
{
if (profile_enabled)
{
const double t0 = MPI_Wtime();
refresh_stage_device_after_sync(SynchList_cor, sync_cache_cor[lev]);
stage_profile_add(lev, STAGE_PROFILE_REFRESH, MPI_Wtime() - t0);
}
else
refresh_stage_device_after_sync(SynchList_cor, sync_cache_cor[lev]);
}
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#if (MAPBH == 0)
if (BH_num > 0 && lev == GH->levels - 1)
{
compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev);
for (int ithBH = 0; ithBH < BH_num; ithBH++)
{
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg1[ithBH][0], Porg_rhs[ithBH][0], iter_count);
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg1[ithBH][1], Porg_rhs[ithBH][1], iter_count);
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg1[ithBH][2], Porg_rhs[ithBH][2], iter_count);
if (Symmetry > 0)
Porg1[ithBH][2] = fabs(Porg1[ithBH][2]);
if (Symmetry == 2)
{
Porg1[ithBH][0] = fabs(Porg1[ithBH][0]);
Porg1[ithBH][1] = fabs(Porg1[ithBH][1]);
}
}
}
#endif
if (iter_count < 3)
{
Pp = GH->PatL[lev];
while (Pp)
{
MyList<Block> *BP = Pp->data->blb;
while (BP)
{
BP->data->swapList(SynchList_pre, SynchList_cor, myrank);
if (BP == Pp->data->ble)
break;
BP = BP->next;
}
Pp = Pp->next;
}
#if (MAPBH == 0)
if (BH_num > 0 && lev == GH->levels - 1)
{
for (int ithBH = 0; ithBH < BH_num; ithBH++)
{
Porg[ithBH][0] = Porg1[ithBH][0];
Porg[ithBH][1] = Porg1[ithBH][1];
Porg[ithBH][2] = Porg1[ithBH][2];
}
}
#endif
}
}
#if (RPS == 0)
RestrictProlong(lev, YN, BB);
#endif
Pp = GH->PatL[lev];
while (Pp)
{
MyList<Block> *BP = Pp->data->blb;
while (BP)
{
Block *cg = BP->data;
cg->swapList(StateList, SynchList_cor, myrank);
cg->swapList(OldStateList, SynchList_cor, myrank);
if (BP == Pp->data->ble)
break;
BP = BP->next;
}
Pp = Pp->next;
}
if (!ERROR && keep_stage_sync_on_device)
{
// After the swaps above, only StateList points at arrays updated during this step.
// OldStateList/SynchList_cor remain valid on host because their backing arrays were
// read-only during the RK step, and SynchList_pre is reused only as scratch later.
const double t0 = profile_enabled ? MPI_Wtime() : 0.0;
stage_download_patch_list(StateList, true);
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_DOWNLOAD, MPI_Wtime() - t0);
}
if (profile_enabled)
{
const double t0 = MPI_Wtime();
bssn_gpu_clear_cached_device_buffers();
stage_profile_add(lev, STAGE_PROFILE_CLEAR_CACHE, MPI_Wtime() - t0);
}
else
bssn_gpu_clear_cached_device_buffers();
if (BH_num > 0 && lev == GH->levels - 1)
{
for (int ithBH = 0; ithBH < BH_num; ithBH++)
{
Porg0[ithBH][0] = Porg1[ithBH][0];
Porg0[ithBH][1] = Porg1[ithBH][1];
Porg0[ithBH][2] = Porg1[ithBH][2];
}
}
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_TOTAL, MPI_Wtime() - step_total_begin);
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -4,8 +4,10 @@
#include "bssn_macro.h" #include "bssn_macro.h"
#include "macrodef.fh" #include "macrodef.fh"
#define GRID_DIM 256 #define DEVICE_ID 0
#define BLOCK_DIM 128 // #define DEVICE_ID_BY_MPI_RANK
#define GRID_DIM 256
#define BLOCK_DIM 128
#define _FH2_(i, j, k) fh[(i) + (j) * _1D_SIZE[2] + (k) * _2D_SIZE[2]] #define _FH2_(i, j, k) fh[(i) + (j) * _1D_SIZE[2] + (k) * _2D_SIZE[2]]
#define _FH3_(i, j, k) fh[(i) + (j) * _1D_SIZE[3] + (k) * _2D_SIZE[3]] #define _FH3_(i, j, k) fh[(i) + (j) * _1D_SIZE[3] + (k) * _2D_SIZE[3]]
@@ -63,45 +65,9 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,
double *Gmx_Res, double *Gmy_Res, double *Gmz_Res, double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
int &Symmetry, int &Lev, double &eps, int &co); int &Symmetry, int &Lev, double &eps, int &co);
int gpu_rhs_ss(RHS_SS_PARA); int gpu_rhs_ss(RHS_SS_PARA);
int bssn_gpu_bind_process_device(int mpi_rank); /** Init GPU side data in GPUMeta. */
void bssn_gpu_clear_cached_device_buffers(); // void init_fluid_meta_gpu(GPUMeta *gpu_meta);
void bssn_gpu_release_pinned_host_buffers();
const double *bssn_gpu_find_device_buffer(const double *host_ptr);
void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr);
void bssn_gpu_prepare_host_buffer(const double *host_ptr, int count);
int bssn_gpu_stage_upload_buffer(const double *host_ptr, int count);
int bssn_gpu_stage_zero_buffer(const double *host_ptr, int count);
int bssn_gpu_stage_upload_region(const double *host_ptr,
const int *full_shape,
const double *full_llb,
const double *full_uub,
const int *region_shape,
const double *region_llb);
int bssn_gpu_stage_download_region(double *host_ptr,
const int *full_shape,
const double *full_llb,
const double *full_uub,
const int *region_shape,
const double *region_llb);
int bssn_gpu_stage_download_region_to_buffer(const double *host_src_ptr,
const int *full_shape,
const double *full_llb,
const double *full_uub,
const int *region_shape,
const double *region_llb,
double *host_dst_ptr);
int bssn_gpu_stage_upload_buffer_to_region(const double *host_src_ptr,
double *host_dst_ptr,
const int *full_shape,
const double *full_llb,
const double *full_uub,
const int *region_shape,
const double *region_llb);
int bssn_gpu_download_buffer_batch(const int *ex, double **host_ptrs, int num_buffers);
/** Init GPU side data in GPUMeta. */
// void init_fluid_meta_gpu(GPUMeta *gpu_meta);
#endif #endif

View File

@@ -65,10 +65,9 @@ if(TIME_COUNT_EACH_RANK == 1){\
}\ }\
} }
//3---------------------GPU--------------------- //3---------------------GPU---------------------
#define CALLED_BY_STEP 0 #define CALLED_BY_STEP 0
#define CALLED_BY_CONSTRAINT 1 #define CALLED_BY_CONSTRAINT 1
#define CALLED_BY_CONSTRAINT_CONS_ONLY 2
#define RHS_PARA_CALLED_FIRST_TIME cg->shape,TRK4,cg->X[0],cg->X[1],cg->X[2],cg->fgfs[phi0->sgfn],cg->fgfs[trK0->sgfn],cg->fgfs[gxx0->sgfn],cg->fgfs[gxy0->sgfn],cg->fgfs[gxz0->sgfn],cg->fgfs[gyy0->sgfn],cg->fgfs[gyz0->sgfn],cg->fgfs[gzz0->sgfn],cg->fgfs[Axx0->sgfn],cg->fgfs[Axy0->sgfn],cg->fgfs[Axz0->sgfn],cg->fgfs[Ayy0->sgfn],cg->fgfs[Ayz0->sgfn],cg->fgfs[Azz0->sgfn],cg->fgfs[Gmx0->sgfn],cg->fgfs[Gmy0->sgfn],cg->fgfs[Gmz0->sgfn],cg->fgfs[Lap0->sgfn],cg->fgfs[Sfx0->sgfn],cg->fgfs[Sfy0->sgfn],cg->fgfs[Sfz0->sgfn],cg->fgfs[dtSfx0->sgfn],cg->fgfs[dtSfy0->sgfn],cg->fgfs[dtSfz0->sgfn],cg->fgfs[phi_rhs->sgfn],cg->fgfs[trK_rhs->sgfn],cg->fgfs[gxx_rhs->sgfn],cg->fgfs[gxy_rhs->sgfn],cg->fgfs[gxz_rhs->sgfn],cg->fgfs[gyy_rhs->sgfn],cg->fgfs[gyz_rhs->sgfn],cg->fgfs[gzz_rhs->sgfn],cg->fgfs[Axx_rhs->sgfn],cg->fgfs[Axy_rhs->sgfn],cg->fgfs[Axz_rhs->sgfn],cg->fgfs[Ayy_rhs->sgfn],cg->fgfs[Ayz_rhs->sgfn],cg->fgfs[Azz_rhs->sgfn],cg->fgfs[Gmx_rhs->sgfn],cg->fgfs[Gmy_rhs->sgfn],cg->fgfs[Gmz_rhs->sgfn],cg->fgfs[Lap_rhs->sgfn],cg->fgfs[Sfx_rhs->sgfn],cg->fgfs[Sfy_rhs->sgfn],cg->fgfs[Sfz_rhs->sgfn],cg->fgfs[dtSfx_rhs->sgfn],cg->fgfs[dtSfy_rhs->sgfn],cg->fgfs[dtSfz_rhs->sgfn],cg->fgfs[rho->sgfn],cg->fgfs[Sx->sgfn],cg->fgfs[Sy->sgfn],cg->fgfs[Sz->sgfn],cg->fgfs[Sxx->sgfn],cg->fgfs[Sxy->sgfn],cg->fgfs[Sxz->sgfn],cg->fgfs[Syy->sgfn],cg->fgfs[Syz->sgfn],cg->fgfs[Szz->sgfn],cg->fgfs[Gamxxx->sgfn],cg->fgfs[Gamxxy->sgfn],cg->fgfs[Gamxxz->sgfn],cg->fgfs[Gamxyy->sgfn],cg->fgfs[Gamxyz->sgfn],cg->fgfs[Gamxzz->sgfn],cg->fgfs[Gamyxx->sgfn],cg->fgfs[Gamyxy->sgfn],cg->fgfs[Gamyxz->sgfn],cg->fgfs[Gamyyy->sgfn],cg->fgfs[Gamyyz->sgfn],cg->fgfs[Gamyzz->sgfn],cg->fgfs[Gamzxx->sgfn],cg->fgfs[Gamzxy->sgfn],cg->fgfs[Gamzxz->sgfn],cg->fgfs[Gamzyy->sgfn],cg->fgfs[Gamzyz->sgfn],cg->fgfs[Gamzzz->sgfn],cg->fgfs[Rxx->sgfn],cg->fgfs[Rxy->sgfn],cg->fgfs[Rxz->sgfn],cg->fgfs[Ryy->sgfn],cg->fgfs[Ryz->sgfn],cg->fgfs[Rzz->sgfn],cg->fgfs[Cons_Ham->sgfn],cg->fgfs[Cons_Px->sgfn],cg->fgfs[Cons_Py->sgfn],cg->fgfs[Cons_Pz->sgfn],cg->fgfs[Cons_Gx->sgfn],cg->fgfs[Cons_Gy->sgfn],cg->fgfs[Cons_Gz->sgfn],Symmetry,lev,ndeps,pre #define RHS_PARA_CALLED_FIRST_TIME cg->shape,TRK4,cg->X[0],cg->X[1],cg->X[2],cg->fgfs[phi0->sgfn],cg->fgfs[trK0->sgfn],cg->fgfs[gxx0->sgfn],cg->fgfs[gxy0->sgfn],cg->fgfs[gxz0->sgfn],cg->fgfs[gyy0->sgfn],cg->fgfs[gyz0->sgfn],cg->fgfs[gzz0->sgfn],cg->fgfs[Axx0->sgfn],cg->fgfs[Axy0->sgfn],cg->fgfs[Axz0->sgfn],cg->fgfs[Ayy0->sgfn],cg->fgfs[Ayz0->sgfn],cg->fgfs[Azz0->sgfn],cg->fgfs[Gmx0->sgfn],cg->fgfs[Gmy0->sgfn],cg->fgfs[Gmz0->sgfn],cg->fgfs[Lap0->sgfn],cg->fgfs[Sfx0->sgfn],cg->fgfs[Sfy0->sgfn],cg->fgfs[Sfz0->sgfn],cg->fgfs[dtSfx0->sgfn],cg->fgfs[dtSfy0->sgfn],cg->fgfs[dtSfz0->sgfn],cg->fgfs[phi_rhs->sgfn],cg->fgfs[trK_rhs->sgfn],cg->fgfs[gxx_rhs->sgfn],cg->fgfs[gxy_rhs->sgfn],cg->fgfs[gxz_rhs->sgfn],cg->fgfs[gyy_rhs->sgfn],cg->fgfs[gyz_rhs->sgfn],cg->fgfs[gzz_rhs->sgfn],cg->fgfs[Axx_rhs->sgfn],cg->fgfs[Axy_rhs->sgfn],cg->fgfs[Axz_rhs->sgfn],cg->fgfs[Ayy_rhs->sgfn],cg->fgfs[Ayz_rhs->sgfn],cg->fgfs[Azz_rhs->sgfn],cg->fgfs[Gmx_rhs->sgfn],cg->fgfs[Gmy_rhs->sgfn],cg->fgfs[Gmz_rhs->sgfn],cg->fgfs[Lap_rhs->sgfn],cg->fgfs[Sfx_rhs->sgfn],cg->fgfs[Sfy_rhs->sgfn],cg->fgfs[Sfz_rhs->sgfn],cg->fgfs[dtSfx_rhs->sgfn],cg->fgfs[dtSfy_rhs->sgfn],cg->fgfs[dtSfz_rhs->sgfn],cg->fgfs[rho->sgfn],cg->fgfs[Sx->sgfn],cg->fgfs[Sy->sgfn],cg->fgfs[Sz->sgfn],cg->fgfs[Sxx->sgfn],cg->fgfs[Sxy->sgfn],cg->fgfs[Sxz->sgfn],cg->fgfs[Syy->sgfn],cg->fgfs[Syz->sgfn],cg->fgfs[Szz->sgfn],cg->fgfs[Gamxxx->sgfn],cg->fgfs[Gamxxy->sgfn],cg->fgfs[Gamxxz->sgfn],cg->fgfs[Gamxyy->sgfn],cg->fgfs[Gamxyz->sgfn],cg->fgfs[Gamxzz->sgfn],cg->fgfs[Gamyxx->sgfn],cg->fgfs[Gamyxy->sgfn],cg->fgfs[Gamyxz->sgfn],cg->fgfs[Gamyyy->sgfn],cg->fgfs[Gamyyz->sgfn],cg->fgfs[Gamyzz->sgfn],cg->fgfs[Gamzxx->sgfn],cg->fgfs[Gamzxy->sgfn],cg->fgfs[Gamzxz->sgfn],cg->fgfs[Gamzyy->sgfn],cg->fgfs[Gamzyz->sgfn],cg->fgfs[Gamzzz->sgfn],cg->fgfs[Rxx->sgfn],cg->fgfs[Rxy->sgfn],cg->fgfs[Rxz->sgfn],cg->fgfs[Ryy->sgfn],cg->fgfs[Ryz->sgfn],cg->fgfs[Rzz->sgfn],cg->fgfs[Cons_Ham->sgfn],cg->fgfs[Cons_Px->sgfn],cg->fgfs[Cons_Py->sgfn],cg->fgfs[Cons_Pz->sgfn],cg->fgfs[Cons_Gx->sgfn],cg->fgfs[Cons_Gy->sgfn],cg->fgfs[Cons_Gz->sgfn],Symmetry,lev,ndeps,pre

View File

@@ -59,10 +59,9 @@
real*8, dimension(ex(1),ex(2),ex(3)),intent(out) :: Rxx,Rxy,Rxz,Ryy,Ryz,Rzz real*8, dimension(ex(1),ex(2),ex(3)),intent(out) :: Rxx,Rxy,Rxz,Ryy,Ryz,Rzz
real*8,intent(in) :: eps real*8,intent(in) :: eps
real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: ham_Res, movx_Res, movy_Res, movz_Res real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: ham_Res, movx_Res, movy_Res, movz_Res
real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: Gmx_Res, Gmy_Res, Gmz_Res real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: Gmx_Res, Gmy_Res, Gmz_Res
! gont = 0: success; gont = 1: something wrong ! gont = 0: success; gont = 1: something wrong
integer::gont integer::gont
integer :: i,j,k
!~~~~~~> Other variables: !~~~~~~> Other variables:
@@ -84,18 +83,11 @@
real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz
real*8, dimension(ex(1),ex(2),ex(3)) :: gupyy,gupyz,gupzz real*8, dimension(ex(1),ex(2),ex(3)) :: gupyy,gupyz,gupzz
real*8,dimension(3) ::SSS,AAS,ASA,SAA,ASS,SAS,SSA real*8,dimension(3) ::SSS,AAS,ASA,SAA,ASS,SAS,SSA
real*8 :: dX, dY, dZ, PI real*8 :: dX, dY, dZ, PI
real*8 :: divb_loc,det_loc real*8, parameter :: ZEO = 0.d0,ONE = 1.D0, TWO = 2.D0, FOUR = 4.D0
real*8 :: gupxx_loc,gupxy_loc,gupxz_loc,gupyy_loc,gupyz_loc,gupzz_loc real*8, parameter :: EIGHT = 8.D0, HALF = 0.5D0, THR = 3.d0
real*8 :: Rxx_loc,Rxy_loc,Rxz_loc,Ryy_loc,Ryz_loc,Rzz_loc real*8, parameter :: SYM = 1.D0, ANTI= - 1.D0
real*8 :: fxx_loc,fxy_loc,fxz_loc
real*8 :: Gamxa_loc,Gamya_loc,Gamza_loc
real*8 :: f_loc,chin_loc
real*8 :: l_fxx,l_fxy,l_fxz,l_fyy,l_fyz,l_fzz,S_loc
real*8, parameter :: ZEO = 0.d0,ONE = 1.D0, TWO = 2.D0, FOUR = 4.D0
real*8, parameter :: EIGHT = 8.D0, HALF = 0.5D0, THR = 3.d0
real*8, parameter :: SYM = 1.D0, ANTI= - 1.D0
double precision,parameter::FF = 0.75d0,eta=2.d0 double precision,parameter::FF = 0.75d0,eta=2.d0
real*8, parameter :: F1o3 = 1.D0/3.D0, F2o3 = 2.D0/3.D0,F3o2=1.5d0, F1o6 = 1.D0/6.D0 real*8, parameter :: F1o3 = 1.D0/3.D0, F2o3 = 2.D0/3.D0,F3o2=1.5d0, F1o6 = 1.D0/6.D0
real*8, parameter :: F16=1.6d1,F8=8.d0 real*8, parameter :: F16=1.6d1,F8=8.d0
@@ -104,11 +96,11 @@
real*8, dimension(ex(1),ex(2),ex(3)) :: reta real*8, dimension(ex(1),ex(2),ex(3)) :: reta
#endif #endif
#if (GAUGE == 6 || GAUGE == 7) #if (GAUGE == 6 || GAUGE == 7)
integer :: BHN integer :: BHN,i,j,k
real*8, dimension(9) :: Porg real*8, dimension(9) :: Porg
real*8, dimension(3) :: Mass real*8, dimension(3) :: Mass
real*8 :: r1,r2,M,A,w1,w2,C1,C2 real*8 :: r1,r2,M,A,w1,w2,C1,C2
real*8, dimension(ex(1),ex(2),ex(3)) :: reta real*8, dimension(ex(1),ex(2),ex(3)) :: reta
call getpbh(BHN,Porg,Mass) call getpbh(BHN,Porg,Mass)
@@ -153,204 +145,174 @@
dY = Y(2) - Y(1) dY = Y(2) - Y(1)
dZ = Z(2) - Z(1) dZ = Z(2) - Z(1)
do k=1,ex(3) alpn1 = Lap + ONE
do j=1,ex(2) chin1 = chi + ONE
do i=1,ex(1) gxx = dxx + ONE
alpn1(i,j,k) = Lap(i,j,k) + ONE gyy = dyy + ONE
chin1(i,j,k) = chi(i,j,k) + ONE gzz = dzz + ONE
gxx(i,j,k) = dxx(i,j,k) + ONE
gyy(i,j,k) = dyy(i,j,k) + ONE
gzz(i,j,k) = dzz(i,j,k) + ONE
enddo
enddo
enddo
call fderivs(ex,betax,betaxx,betaxy,betaxz,X,Y,Z,ANTI, SYM, SYM,Symmetry,Lev) call fderivs(ex,betax,betaxx,betaxy,betaxz,X,Y,Z,ANTI, SYM, SYM,Symmetry,Lev)
call fderivs(ex,betay,betayx,betayy,betayz,X,Y,Z, SYM,ANTI, SYM,Symmetry,Lev) call fderivs(ex,betay,betayx,betayy,betayz,X,Y,Z, SYM,ANTI, SYM,Symmetry,Lev)
call fderivs(ex,betaz,betazx,betazy,betazz,X,Y,Z, SYM, SYM,ANTI,Symmetry,Lev) call fderivs(ex,betaz,betazx,betazy,betazz,X,Y,Z, SYM, SYM,ANTI,Symmetry,Lev)
call fderivs(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev) div_beta = betaxx + betayy + betazz
call fderivs(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)
call fderivs(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev) chi_rhs = F2o3 *chin1*( alpn1 * trK - div_beta ) !rhs for chi
call fderivs(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev)
call fderivs(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev) call fderivs(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
call fderivs(ex,dyy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev) call fderivs(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev)
call fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev) call fderivs(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev)
call fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev) call fderivs(ex,dyy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
call fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev)
do k=1,ex(3) call fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
do j=1,ex(2)
do i=1,ex(1) gxx_rhs = - TWO * alpn1 * Axx - F2o3 * gxx * div_beta + &
divb_loc = betaxx(i,j,k) + betayy(i,j,k) + betazz(i,j,k) TWO *( gxx * betaxx + gxy * betayx + gxz * betazx)
div_beta(i,j,k) = divb_loc
gyy_rhs = - TWO * alpn1 * Ayy - F2o3 * gyy * div_beta + &
chi_rhs(i,j,k) = F2o3 * chin1(i,j,k) * (alpn1(i,j,k) * trK(i,j,k) - divb_loc) TWO *( gxy * betaxy + gyy * betayy + gyz * betazy)
gxx_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Axx(i,j,k) - F2o3 * gxx(i,j,k) * divb_loc + & gzz_rhs = - TWO * alpn1 * Azz - F2o3 * gzz * div_beta + &
TWO * ( gxx(i,j,k) * betaxx(i,j,k) + gxy(i,j,k) * betayx(i,j,k) + gxz(i,j,k) * betazx(i,j,k) ) TWO *( gxz * betaxz + gyz * betayz + gzz * betazz)
gyy_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Ayy(i,j,k) - F2o3 * gyy(i,j,k) * divb_loc + & gxy_rhs = - TWO * alpn1 * Axy + F1o3 * gxy * div_beta + &
TWO * ( gxy(i,j,k) * betaxy(i,j,k) + gyy(i,j,k) * betayy(i,j,k) + gyz(i,j,k) * betazy(i,j,k) ) gxx * betaxy + gxz * betazy + &
gyy * betayx + gyz * betazx &
gzz_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Azz(i,j,k) - F2o3 * gzz(i,j,k) * divb_loc + & - gxy * betazz
TWO * ( gxz(i,j,k) * betaxz(i,j,k) + gyz(i,j,k) * betayz(i,j,k) + gzz(i,j,k) * betazz(i,j,k) )
gyz_rhs = - TWO * alpn1 * Ayz + F1o3 * gyz * div_beta + &
gxy_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Axy(i,j,k) + F1o3 * gxy(i,j,k) * divb_loc + & gxy * betaxz + gyy * betayz + &
gxx(i,j,k) * betaxy(i,j,k) + gxz(i,j,k) * betazy(i,j,k) + gyy(i,j,k) * betayx(i,j,k) + & gxz * betaxy + gzz * betazy &
gyz(i,j,k) * betazx(i,j,k) - gxy(i,j,k) * betazz(i,j,k) - gyz * betaxx
gyz_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Ayz(i,j,k) + F1o3 * gyz(i,j,k) * divb_loc + & gxz_rhs = - TWO * alpn1 * Axz + F1o3 * gxz * div_beta + &
gxy(i,j,k) * betaxz(i,j,k) + gyy(i,j,k) * betayz(i,j,k) + gxz(i,j,k) * betaxy(i,j,k) + & gxx * betaxz + gxy * betayz + &
gzz(i,j,k) * betazy(i,j,k) - gyz(i,j,k) * betaxx(i,j,k) gyz * betayx + gzz * betazx &
- gxz * betayy !rhs for gij
gxz_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Axz(i,j,k) + F1o3 * gxz(i,j,k) * divb_loc + &
gxx(i,j,k) * betaxz(i,j,k) + gxy(i,j,k) * betayz(i,j,k) + gyz(i,j,k) * betayx(i,j,k) + & ! invert tilted metric
gzz(i,j,k) * betazx(i,j,k) - gxz(i,j,k) * betayy(i,j,k) gupzz = gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - &
gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz
det_loc = gxx(i,j,k) * gyy(i,j,k) * gzz(i,j,k) + gxy(i,j,k) * gyz(i,j,k) * gxz(i,j,k) + & gupxx = ( gyy * gzz - gyz * gyz ) / gupzz
gxz(i,j,k) * gxy(i,j,k) * gyz(i,j,k) - gxz(i,j,k) * gyy(i,j,k) * gxz(i,j,k) - & gupxy = - ( gxy * gzz - gyz * gxz ) / gupzz
gxy(i,j,k) * gxy(i,j,k) * gzz(i,j,k) - gxx(i,j,k) * gyz(i,j,k) * gyz(i,j,k) gupxz = ( gxy * gyz - gyy * gxz ) / gupzz
gupxx_loc = ( gyy(i,j,k) * gzz(i,j,k) - gyz(i,j,k) * gyz(i,j,k) ) / det_loc gupyy = ( gxx * gzz - gxz * gxz ) / gupzz
gupxy_loc = - ( gxy(i,j,k) * gzz(i,j,k) - gyz(i,j,k) * gxz(i,j,k) ) / det_loc gupyz = - ( gxx * gyz - gxy * gxz ) / gupzz
gupxz_loc = ( gxy(i,j,k) * gyz(i,j,k) - gyy(i,j,k) * gxz(i,j,k) ) / det_loc gupzz = ( gxx * gyy - gxy * gxy ) / gupzz
gupyy_loc = ( gxx(i,j,k) * gzz(i,j,k) - gxz(i,j,k) * gxz(i,j,k) ) / det_loc
gupyz_loc = - ( gxx(i,j,k) * gyz(i,j,k) - gxy(i,j,k) * gxz(i,j,k) ) / det_loc if(co == 0)then
gupzz_loc = ( gxx(i,j,k) * gyy(i,j,k) - gxy(i,j,k) * gxy(i,j,k) ) / det_loc ! Gam^i_Res = Gam^i + gup^ij_,j
gupxx(i,j,k) = gupxx_loc Gmx_Res = Gamx - (gupxx*(gupxx*gxxx+gupxy*gxyx+gupxz*gxzx)&
gupxy(i,j,k) = gupxy_loc +gupxy*(gupxx*gxyx+gupxy*gyyx+gupxz*gyzx)&
gupxz(i,j,k) = gupxz_loc +gupxz*(gupxx*gxzx+gupxy*gyzx+gupxz*gzzx)&
gupyy(i,j,k) = gupyy_loc +gupxx*(gupxy*gxxy+gupyy*gxyy+gupyz*gxzy)&
gupyz(i,j,k) = gupyz_loc +gupxy*(gupxy*gxyy+gupyy*gyyy+gupyz*gyzy)&
gupzz(i,j,k) = gupzz_loc +gupxz*(gupxy*gxzy+gupyy*gyzy+gupyz*gzzy)&
+gupxx*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)&
if(co == 0)then +gupxy*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)&
Gmx_Res(i,j,k) = Gamx(i,j,k) - ( & +gupxz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz))
gupxx_loc*(gupxx_loc*gxxx(i,j,k)+gupxy_loc*gxyx(i,j,k)+gupxz_loc*gxzx(i,j,k)) + & Gmy_Res = Gamy - (gupxx*(gupxy*gxxx+gupyy*gxyx+gupyz*gxzx)&
gupxy_loc*(gupxx_loc*gxyx(i,j,k)+gupxy_loc*gyyx(i,j,k)+gupxz_loc*gyzx(i,j,k)) + & +gupxy*(gupxy*gxyx+gupyy*gyyx+gupyz*gyzx)&
gupxz_loc*(gupxx_loc*gxzx(i,j,k)+gupxy_loc*gyzx(i,j,k)+gupxz_loc*gzzx(i,j,k)) + & +gupxz*(gupxy*gxzx+gupyy*gyzx+gupyz*gzzx)&
gupxx_loc*(gupxy_loc*gxxy(i,j,k)+gupyy_loc*gxyy(i,j,k)+gupyz_loc*gxzy(i,j,k)) + & +gupxy*(gupxy*gxxy+gupyy*gxyy+gupyz*gxzy)&
gupxy_loc*(gupxy_loc*gxyy(i,j,k)+gupyy_loc*gyyy(i,j,k)+gupyz_loc*gyzy(i,j,k)) + & +gupyy*(gupxy*gxyy+gupyy*gyyy+gupyz*gyzy)&
gupxz_loc*(gupxy_loc*gxzy(i,j,k)+gupyy_loc*gyzy(i,j,k)+gupyz_loc*gzzy(i,j,k)) + & +gupyz*(gupxy*gxzy+gupyy*gyzy+gupyz*gzzy)&
gupxx_loc*(gupxz_loc*gxxz(i,j,k)+gupyz_loc*gxyz(i,j,k)+gupzz_loc*gxzz(i,j,k)) + & +gupxy*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)&
gupxy_loc*(gupxz_loc*gxyz(i,j,k)+gupyz_loc*gyyz(i,j,k)+gupzz_loc*gyzz(i,j,k)) + & +gupyy*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)&
gupxz_loc*(gupxz_loc*gxzz(i,j,k)+gupyz_loc*gyzz(i,j,k)+gupzz_loc*gzzz(i,j,k))) +gupyz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz))
Gmy_Res(i,j,k) = Gamy(i,j,k) - ( & Gmz_Res = Gamz - (gupxx*(gupxz*gxxx+gupyz*gxyx+gupzz*gxzx)&
gupxx_loc*(gupxy_loc*gxxx(i,j,k)+gupyy_loc*gxyx(i,j,k)+gupyz_loc*gxzx(i,j,k)) + & +gupxy*(gupxz*gxyx+gupyz*gyyx+gupzz*gyzx)&
gupxy_loc*(gupxy_loc*gxyx(i,j,k)+gupyy_loc*gyyx(i,j,k)+gupyz_loc*gyzx(i,j,k)) + & +gupxz*(gupxz*gxzx+gupyz*gyzx+gupzz*gzzx)&
gupxz_loc*(gupxy_loc*gxzx(i,j,k)+gupyy_loc*gyzx(i,j,k)+gupyz_loc*gzzx(i,j,k)) + & +gupxy*(gupxz*gxxy+gupyz*gxyy+gupzz*gxzy)&
gupxy_loc*(gupxy_loc*gxxy(i,j,k)+gupyy_loc*gxyy(i,j,k)+gupyz_loc*gxzy(i,j,k)) + & +gupyy*(gupxz*gxyy+gupyz*gyyy+gupzz*gyzy)&
gupyy_loc*(gupxy_loc*gxyy(i,j,k)+gupyy_loc*gyyy(i,j,k)+gupyz_loc*gyzy(i,j,k)) + & +gupyz*(gupxz*gxzy+gupyz*gyzy+gupzz*gzzy)&
gupyz_loc*(gupxy_loc*gxzy(i,j,k)+gupyy_loc*gyzy(i,j,k)+gupyz_loc*gzzy(i,j,k)) + & +gupxz*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)&
gupxy_loc*(gupxz_loc*gxxz(i,j,k)+gupyz_loc*gxyz(i,j,k)+gupzz_loc*gxzz(i,j,k)) + & +gupyz*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)&
gupyy_loc*(gupxz_loc*gxyz(i,j,k)+gupyz_loc*gyyz(i,j,k)+gupzz_loc*gyzz(i,j,k)) + & +gupzz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz))
gupyz_loc*(gupxz_loc*gxzz(i,j,k)+gupyz_loc*gyzz(i,j,k)+gupzz_loc*gzzz(i,j,k))) endif
Gmz_Res(i,j,k) = Gamz(i,j,k) - ( &
gupxx_loc*(gupxz_loc*gxxx(i,j,k)+gupyz_loc*gxyx(i,j,k)+gupzz_loc*gxzx(i,j,k)) + & ! second kind of connection
gupxy_loc*(gupxz_loc*gxyx(i,j,k)+gupyz_loc*gyyx(i,j,k)+gupzz_loc*gyzx(i,j,k)) + & Gamxxx =HALF*( gupxx*gxxx + gupxy*(TWO*gxyx - gxxy ) + gupxz*(TWO*gxzx - gxxz ))
gupxz_loc*(gupxz_loc*gxzx(i,j,k)+gupyz_loc*gyzx(i,j,k)+gupzz_loc*gzzx(i,j,k)) + & Gamyxx =HALF*( gupxy*gxxx + gupyy*(TWO*gxyx - gxxy ) + gupyz*(TWO*gxzx - gxxz ))
gupxy_loc*(gupxz_loc*gxxy(i,j,k)+gupyz_loc*gxyy(i,j,k)+gupzz_loc*gxzy(i,j,k)) + & Gamzxx =HALF*( gupxz*gxxx + gupyz*(TWO*gxyx - gxxy ) + gupzz*(TWO*gxzx - gxxz ))
gupyy_loc*(gupxz_loc*gxyy(i,j,k)+gupyz_loc*gyyy(i,j,k)+gupzz_loc*gyzy(i,j,k)) + &
gupyz_loc*(gupxz_loc*gxzy(i,j,k)+gupyz_loc*gyzy(i,j,k)+gupzz_loc*gzzy(i,j,k)) + & Gamxyy =HALF*( gupxx*(TWO*gxyy - gyyx ) + gupxy*gyyy + gupxz*(TWO*gyzy - gyyz ))
gupxz_loc*(gupxz_loc*gxxz(i,j,k)+gupyz_loc*gxyz(i,j,k)+gupzz_loc*gxzz(i,j,k)) + & Gamyyy =HALF*( gupxy*(TWO*gxyy - gyyx ) + gupyy*gyyy + gupyz*(TWO*gyzy - gyyz ))
gupyz_loc*(gupxz_loc*gxyz(i,j,k)+gupyz_loc*gyyz(i,j,k)+gupzz_loc*gyzz(i,j,k)) + & Gamzyy =HALF*( gupxz*(TWO*gxyy - gyyx ) + gupyz*gyyy + gupzz*(TWO*gyzy - gyyz ))
gupzz_loc*(gupxz_loc*gxzz(i,j,k)+gupyz_loc*gyzz(i,j,k)+gupzz_loc*gzzz(i,j,k)))
endif Gamxzz =HALF*( gupxx*(TWO*gxzz - gzzx ) + gupxy*(TWO*gyzz - gzzy ) + gupxz*gzzz)
Gamyzz =HALF*( gupxy*(TWO*gxzz - gzzx ) + gupyy*(TWO*gyzz - gzzy ) + gupyz*gzzz)
Gamxxx(i,j,k)=HALF*( gupxx_loc*gxxx(i,j,k) + gupxy_loc*(TWO*gxyx(i,j,k) - gxxy(i,j,k)) + gupxz_loc*(TWO*gxzx(i,j,k) - gxxz(i,j,k))) Gamzzz =HALF*( gupxz*(TWO*gxzz - gzzx ) + gupyz*(TWO*gyzz - gzzy ) + gupzz*gzzz)
Gamyxx(i,j,k)=HALF*( gupxy_loc*gxxx(i,j,k) + gupyy_loc*(TWO*gxyx(i,j,k) - gxxy(i,j,k)) + gupyz_loc*(TWO*gxzx(i,j,k) - gxxz(i,j,k)))
Gamzxx(i,j,k)=HALF*( gupxz_loc*gxxx(i,j,k) + gupyz_loc*(TWO*gxyx(i,j,k) - gxxy(i,j,k)) + gupzz_loc*(TWO*gxzx(i,j,k) - gxxz(i,j,k))) Gamxxy =HALF*( gupxx*gxxy + gupxy*gyyx + gupxz*( gxzy + gyzx - gxyz ) )
Gamyxy =HALF*( gupxy*gxxy + gupyy*gyyx + gupyz*( gxzy + gyzx - gxyz ) )
Gamxyy(i,j,k)=HALF*( gupxx_loc*(TWO*gxyy(i,j,k) - gyyx(i,j,k)) + gupxy_loc*gyyy(i,j,k) + gupxz_loc*(TWO*gyzy(i,j,k) - gyyz(i,j,k))) Gamzxy =HALF*( gupxz*gxxy + gupyz*gyyx + gupzz*( gxzy + gyzx - gxyz ) )
Gamyyy(i,j,k)=HALF*( gupxy_loc*(TWO*gxyy(i,j,k) - gyyx(i,j,k)) + gupyy_loc*gyyy(i,j,k) + gupyz_loc*(TWO*gyzy(i,j,k) - gyyz(i,j,k)))
Gamzyy(i,j,k)=HALF*( gupxz_loc*(TWO*gxyy(i,j,k) - gyyx(i,j,k)) + gupyz_loc*gyyy(i,j,k) + gupzz_loc*(TWO*gyzy(i,j,k) - gyyz(i,j,k))) Gamxxz =HALF*( gupxx*gxxz + gupxy*( gxyz + gyzx - gxzy ) + gupxz*gzzx )
Gamyxz =HALF*( gupxy*gxxz + gupyy*( gxyz + gyzx - gxzy ) + gupyz*gzzx )
Gamxzz(i,j,k)=HALF*( gupxx_loc*(TWO*gxzz(i,j,k) - gzzx(i,j,k)) + gupxy_loc*(TWO*gyzz(i,j,k) - gzzy(i,j,k)) + gupxz_loc*gzzz(i,j,k)) Gamzxz =HALF*( gupxz*gxxz + gupyz*( gxyz + gyzx - gxzy ) + gupzz*gzzx )
Gamyzz(i,j,k)=HALF*( gupxy_loc*(TWO*gxzz(i,j,k) - gzzx(i,j,k)) + gupyy_loc*(TWO*gyzz(i,j,k) - gzzy(i,j,k)) + gupyz_loc*gzzz(i,j,k))
Gamzzz(i,j,k)=HALF*( gupxz_loc*(TWO*gxzz(i,j,k) - gzzx(i,j,k)) + gupyz_loc*(TWO*gyzz(i,j,k) - gzzy(i,j,k)) + gupzz_loc*gzzz(i,j,k)) Gamxyz =HALF*( gupxx*( gxyz + gxzy - gyzx ) + gupxy*gyyz + gupxz*gzzy )
Gamyyz =HALF*( gupxy*( gxyz + gxzy - gyzx ) + gupyy*gyyz + gupyz*gzzy )
Gamxxy(i,j,k)=HALF*( gupxx_loc*gxxy(i,j,k) + gupxy_loc*gyyx(i,j,k) + gupxz_loc*(gxzy(i,j,k) + gyzx(i,j,k) - gxyz(i,j,k)) ) Gamzyz =HALF*( gupxz*( gxyz + gxzy - gyzx ) + gupyz*gyyz + gupzz*gzzy )
Gamyxy(i,j,k)=HALF*( gupxy_loc*gxxy(i,j,k) + gupyy_loc*gyyx(i,j,k) + gupyz_loc*(gxzy(i,j,k) + gyzx(i,j,k) - gxyz(i,j,k)) ) ! Raise indices of \tilde A_{ij} and store in R_ij
Gamzxy(i,j,k)=HALF*( gupxz_loc*gxxy(i,j,k) + gupyz_loc*gyyx(i,j,k) + gupzz_loc*(gxzy(i,j,k) + gyzx(i,j,k) - gxyz(i,j,k)) )
Rxx = gupxx * gupxx * Axx + gupxy * gupxy * Ayy + gupxz * gupxz * Azz + &
Gamxxz(i,j,k)=HALF*( gupxx_loc*gxxz(i,j,k) + gupxy_loc*(gxyz(i,j,k) + gyzx(i,j,k) - gxzy(i,j,k)) + gupxz_loc*gzzx(i,j,k) ) TWO*(gupxx * gupxy * Axy + gupxx * gupxz * Axz + gupxy * gupxz * Ayz)
Gamyxz(i,j,k)=HALF*( gupxy_loc*gxxz(i,j,k) + gupyy_loc*(gxyz(i,j,k) + gyzx(i,j,k) - gxzy(i,j,k)) + gupyz_loc*gzzx(i,j,k) )
Gamzxz(i,j,k)=HALF*( gupxz_loc*gxxz(i,j,k) + gupyz_loc*(gxyz(i,j,k) + gyzx(i,j,k) - gxzy(i,j,k)) + gupzz_loc*gzzx(i,j,k) ) Ryy = gupxy * gupxy * Axx + gupyy * gupyy * Ayy + gupyz * gupyz * Azz + &
TWO*(gupxy * gupyy * Axy + gupxy * gupyz * Axz + gupyy * gupyz * Ayz)
Gamxyz(i,j,k)=HALF*( gupxx_loc*(gxyz(i,j,k) + gxzy(i,j,k) - gyzx(i,j,k)) + gupxy_loc*gyyz(i,j,k) + gupxz_loc*gzzy(i,j,k) )
Gamyyz(i,j,k)=HALF*( gupxy_loc*(gxyz(i,j,k) + gxzy(i,j,k) - gyzx(i,j,k)) + gupyy_loc*gyyz(i,j,k) + gupyz_loc*gzzy(i,j,k) ) Rzz = gupxz * gupxz * Axx + gupyz * gupyz * Ayy + gupzz * gupzz * Azz + &
Gamzyz(i,j,k)=HALF*( gupxz_loc*(gxyz(i,j,k) + gxzy(i,j,k) - gyzx(i,j,k)) + gupyz_loc*gyyz(i,j,k) + gupzz_loc*gzzy(i,j,k) ) TWO*(gupxz * gupyz * Axy + gupxz * gupzz * Axz + gupyz * gupzz * Ayz)
enddo
enddo Rxy = gupxx * gupxy * Axx + gupxy * gupyy * Ayy + gupxz * gupyz * Azz + &
enddo (gupxx * gupyy + gupxy * gupxy)* Axy + &
! Raise indices of \tilde A_{ij} and store in R_ij (gupxx * gupyz + gupxz * gupxy)* Axz + &
(gupxy * gupyz + gupxz * gupyy)* Ayz
! Right hand side for Gam^i without shift terms...
call fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev) Rxz = gupxx * gupxz * Axx + gupxy * gupyz * Ayy + gupxz * gupzz * Azz + &
call fderivs(ex,trK,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev) (gupxx * gupyz + gupxy * gupxz)* Axy + &
do k=1,ex(3) (gupxx * gupzz + gupxz * gupxz)* Axz + &
do j=1,ex(2) (gupxy * gupzz + gupxz * gupyz)* Ayz
do i=1,ex(1)
gupxx_loc = gupxx(i,j,k) Ryz = gupxy * gupxz * Axx + gupyy * gupyz * Ayy + gupyz * gupzz * Azz + &
gupxy_loc = gupxy(i,j,k) (gupxy * gupyz + gupyy * gupxz)* Axy + &
gupxz_loc = gupxz(i,j,k) (gupxy * gupzz + gupyz * gupxz)* Axz + &
gupyy_loc = gupyy(i,j,k) (gupyy * gupzz + gupyz * gupyz)* Ayz
gupyz_loc = gupyz(i,j,k)
gupzz_loc = gupzz(i,j,k) ! Right hand side for Gam^i without shift terms...
call fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
Rxx_loc = gupxx_loc * gupxx_loc * Axx(i,j,k) + gupxy_loc * gupxy_loc * Ayy(i,j,k) + gupxz_loc * gupxz_loc * Azz(i,j,k) + & call fderivs(ex,trK,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)
TWO * (gupxx_loc * gupxy_loc * Axy(i,j,k) + gupxx_loc * gupxz_loc * Axz(i,j,k) + gupxy_loc * gupxz_loc * Ayz(i,j,k))
Ryy_loc = gupxy_loc * gupxy_loc * Axx(i,j,k) + gupyy_loc * gupyy_loc * Ayy(i,j,k) + gupyz_loc * gupyz_loc * Azz(i,j,k) + & Gamx_rhs = - TWO * ( Lapx * Rxx + Lapy * Rxy + Lapz * Rxz ) + &
TWO * (gupxy_loc * gupyy_loc * Axy(i,j,k) + gupxy_loc * gupyz_loc * Axz(i,j,k) + gupyy_loc * gupyz_loc * Ayz(i,j,k)) TWO * alpn1 * ( &
Rzz_loc = gupxz_loc * gupxz_loc * Axx(i,j,k) + gupyz_loc * gupyz_loc * Ayy(i,j,k) + gupzz_loc * gupzz_loc * Azz(i,j,k) + & -F3o2/chin1 * ( chix * Rxx + chiy * Rxy + chiz * Rxz ) - &
TWO * (gupxz_loc * gupyz_loc * Axy(i,j,k) + gupxz_loc * gupzz_loc * Axz(i,j,k) + gupyz_loc * gupzz_loc * Ayz(i,j,k)) gupxx * ( F2o3 * Kx + EIGHT * PI * Sx ) - &
Rxy_loc = gupxx_loc * gupxy_loc * Axx(i,j,k) + gupxy_loc * gupyy_loc * Ayy(i,j,k) + gupxz_loc * gupyz_loc * Azz(i,j,k) + & gupxy * ( F2o3 * Ky + EIGHT * PI * Sy ) - &
(gupxx_loc * gupyy_loc + gupxy_loc * gupxy_loc) * Axy(i,j,k) + & gupxz * ( F2o3 * Kz + EIGHT * PI * Sz ) + &
(gupxx_loc * gupyz_loc + gupxz_loc * gupxy_loc) * Axz(i,j,k) + & Gamxxx * Rxx + Gamxyy * Ryy + Gamxzz * Rzz + &
(gupxy_loc * gupyz_loc + gupxz_loc * gupyy_loc) * Ayz(i,j,k) TWO * ( Gamxxy * Rxy + Gamxxz * Rxz + Gamxyz * Ryz ) )
Rxz_loc = gupxx_loc * gupxz_loc * Axx(i,j,k) + gupxy_loc * gupyz_loc * Ayy(i,j,k) + gupxz_loc * gupzz_loc * Azz(i,j,k) + &
(gupxx_loc * gupyz_loc + gupxy_loc * gupxz_loc) * Axy(i,j,k) + & Gamy_rhs = - TWO * ( Lapx * Rxy + Lapy * Ryy + Lapz * Ryz ) + &
(gupxx_loc * gupzz_loc + gupxz_loc * gupxz_loc) * Axz(i,j,k) + & TWO * alpn1 * ( &
(gupxy_loc * gupzz_loc + gupxz_loc * gupyz_loc) * Ayz(i,j,k) -F3o2/chin1 * ( chix * Rxy + chiy * Ryy + chiz * Ryz ) - &
Ryz_loc = gupxy_loc * gupxz_loc * Axx(i,j,k) + gupyy_loc * gupyz_loc * Ayy(i,j,k) + gupyz_loc * gupzz_loc * Azz(i,j,k) + & gupxy * ( F2o3 * Kx + EIGHT * PI * Sx ) - &
(gupxy_loc * gupyz_loc + gupyy_loc * gupxz_loc) * Axy(i,j,k) + & gupyy * ( F2o3 * Ky + EIGHT * PI * Sy ) - &
(gupxy_loc * gupzz_loc + gupyz_loc * gupxz_loc) * Axz(i,j,k) + & gupyz * ( F2o3 * Kz + EIGHT * PI * Sz ) + &
(gupyy_loc * gupzz_loc + gupyz_loc * gupyz_loc) * Ayz(i,j,k) Gamyxx * Rxx + Gamyyy * Ryy + Gamyzz * Rzz + &
Rxx(i,j,k) = Rxx_loc TWO * ( Gamyxy * Rxy + Gamyxz * Rxz + Gamyyz * Ryz ) )
Ryy(i,j,k) = Ryy_loc
Rzz(i,j,k) = Rzz_loc Gamz_rhs = - TWO * ( Lapx * Rxz + Lapy * Ryz + Lapz * Rzz ) + &
Rxy(i,j,k) = Rxy_loc TWO * alpn1 * ( &
Rxz(i,j,k) = Rxz_loc -F3o2/chin1 * ( chix * Rxz + chiy * Ryz + chiz * Rzz ) - &
Ryz(i,j,k) = Ryz_loc gupxz * ( F2o3 * Kx + EIGHT * PI * Sx ) - &
gupyz * ( F2o3 * Ky + EIGHT * PI * Sy ) - &
Gamx_rhs(i,j,k) = - TWO * (Lapx(i,j,k) * Rxx_loc + Lapy(i,j,k) * Rxy_loc + Lapz(i,j,k) * Rxz_loc) + & gupzz * ( F2o3 * Kz + EIGHT * PI * Sz ) + &
TWO * alpn1(i,j,k) * ( & Gamzxx * Rxx + Gamzyy * Ryy + Gamzzz * Rzz + &
-F3o2/chin1(i,j,k) * (chix(i,j,k) * Rxx_loc + chiy(i,j,k) * Rxy_loc + chiz(i,j,k) * Rxz_loc) - & TWO * ( Gamzxy * Rxy + Gamzxz * Rxz + Gamzyz * Ryz ) )
gupxx_loc * (F2o3 * Kx(i,j,k) + EIGHT * PI * Sx(i,j,k)) - &
gupxy_loc * (F2o3 * Ky(i,j,k) + EIGHT * PI * Sy(i,j,k)) - &
gupxz_loc * (F2o3 * Kz(i,j,k) + EIGHT * PI * Sz(i,j,k)) + &
Gamxxx(i,j,k) * Rxx_loc + Gamxyy(i,j,k) * Ryy_loc + Gamxzz(i,j,k) * Rzz_loc + &
TWO * (Gamxxy(i,j,k) * Rxy_loc + Gamxxz(i,j,k) * Rxz_loc + Gamxyz(i,j,k) * Ryz_loc))
Gamy_rhs(i,j,k) = - TWO * (Lapx(i,j,k) * Rxy_loc + Lapy(i,j,k) * Ryy_loc + Lapz(i,j,k) * Ryz_loc) + &
TWO * alpn1(i,j,k) * ( &
-F3o2/chin1(i,j,k) * (chix(i,j,k) * Rxy_loc + chiy(i,j,k) * Ryy_loc + chiz(i,j,k) * Ryz_loc) - &
gupxy_loc * (F2o3 * Kx(i,j,k) + EIGHT * PI * Sx(i,j,k)) - &
gupyy_loc * (F2o3 * Ky(i,j,k) + EIGHT * PI * Sy(i,j,k)) - &
gupyz_loc * (F2o3 * Kz(i,j,k) + EIGHT * PI * Sz(i,j,k)) + &
Gamyxx(i,j,k) * Rxx_loc + Gamyyy(i,j,k) * Ryy_loc + Gamyzz(i,j,k) * Rzz_loc + &
TWO * (Gamyxy(i,j,k) * Rxy_loc + Gamyxz(i,j,k) * Rxz_loc + Gamyyz(i,j,k) * Ryz_loc))
Gamz_rhs(i,j,k) = - TWO * (Lapx(i,j,k) * Rxz_loc + Lapy(i,j,k) * Ryz_loc + Lapz(i,j,k) * Rzz_loc) + &
TWO * alpn1(i,j,k) * ( &
-F3o2/chin1(i,j,k) * (chix(i,j,k) * Rxz_loc + chiy(i,j,k) * Ryz_loc + chiz(i,j,k) * Rzz_loc) - &
gupxz_loc * (F2o3 * Kx(i,j,k) + EIGHT * PI * Sx(i,j,k)) - &
gupyz_loc * (F2o3 * Ky(i,j,k) + EIGHT * PI * Sy(i,j,k)) - &
gupzz_loc * (F2o3 * Kz(i,j,k) + EIGHT * PI * Sz(i,j,k)) + &
Gamzxx(i,j,k) * Rxx_loc + Gamzyy(i,j,k) * Ryy_loc + Gamzzz(i,j,k) * Rzz_loc + &
TWO * (Gamzxy(i,j,k) * Rxy_loc + Gamzxz(i,j,k) * Rxz_loc + Gamzyz(i,j,k) * Ryz_loc))
enddo
enddo
enddo
call fdderivs(ex,betax,gxxx,gxyx,gxzx,gyyx,gyzx,gzzx,& call fdderivs(ex,betax,gxxx,gxyx,gxzx,gyyx,gyzx,gzzx,&
X,Y,Z,ANTI,SYM, SYM ,Symmetry,Lev) X,Y,Z,ANTI,SYM, SYM ,Symmetry,Lev)
@@ -359,54 +321,38 @@
call fdderivs(ex,betaz,gxxz,gxyz,gxzz,gyyz,gyzz,gzzz,& call fdderivs(ex,betaz,gxxz,gxyz,gxzz,gyyz,gyzz,gzzz,&
X,Y,Z,SYM ,SYM, ANTI,Symmetry,Lev) X,Y,Z,SYM ,SYM, ANTI,Symmetry,Lev)
call fderivs(ex,Gamx,Gamxx,Gamxy,Gamxz,X,Y,Z,ANTI,SYM ,SYM ,Symmetry,Lev) fxx = gxxx + gxyy + gxzz
call fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev) fxy = gxyx + gyyy + gyzz
call fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev) fxz = gxzx + gyzy + gzzz
do k=1,ex(3)
do j=1,ex(2) Gamxa = gupxx * Gamxxx + gupyy * Gamxyy + gupzz * Gamxzz + &
do i=1,ex(1) TWO*( gupxy * Gamxxy + gupxz * Gamxxz + gupyz * Gamxyz )
divb_loc = div_beta(i,j,k) Gamya = gupxx * Gamyxx + gupyy * Gamyyy + gupzz * Gamyzz + &
fxx_loc = gxxx(i,j,k) + gxyy(i,j,k) + gxzz(i,j,k) TWO*( gupxy * Gamyxy + gupxz * Gamyxz + gupyz * Gamyyz )
fxy_loc = gxyx(i,j,k) + gyyy(i,j,k) + gyzz(i,j,k) Gamza = gupxx * Gamzxx + gupyy * Gamzyy + gupzz * Gamzzz + &
fxz_loc = gxzx(i,j,k) + gyzy(i,j,k) + gzzz(i,j,k) TWO*( gupxy * Gamzxy + gupxz * Gamzxz + gupyz * Gamzyz )
gupxx_loc = gupxx(i,j,k) call fderivs(ex,Gamx,Gamxx,Gamxy,Gamxz,X,Y,Z,ANTI,SYM ,SYM ,Symmetry,Lev)
gupxy_loc = gupxy(i,j,k) call fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev)
gupxz_loc = gupxz(i,j,k) call fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev)
gupyy_loc = gupyy(i,j,k)
gupyz_loc = gupyz(i,j,k) Gamx_rhs = Gamx_rhs + F2o3 * Gamxa * div_beta - &
gupzz_loc = gupzz(i,j,k) Gamxa * betaxx - Gamya * betaxy - Gamza * betaxz + &
F1o3 * (gupxx * fxx + gupxy * fxy + gupxz * fxz ) + &
Gamxa_loc = gupxx_loc * Gamxxx(i,j,k) + gupyy_loc * Gamxyy(i,j,k) + gupzz_loc * Gamxzz(i,j,k) + & gupxx * gxxx + gupyy * gyyx + gupzz * gzzx + &
TWO * (gupxy_loc * Gamxxy(i,j,k) + gupxz_loc * Gamxxz(i,j,k) + gupyz_loc * Gamxyz(i,j,k)) TWO * (gupxy * gxyx + gupxz * gxzx + gupyz * gyzx )
Gamya_loc = gupxx_loc * Gamyxx(i,j,k) + gupyy_loc * Gamyyy(i,j,k) + gupzz_loc * Gamyzz(i,j,k) + &
TWO * (gupxy_loc * Gamyxy(i,j,k) + gupxz_loc * Gamyxz(i,j,k) + gupyz_loc * Gamyyz(i,j,k)) Gamy_rhs = Gamy_rhs + F2o3 * Gamya * div_beta - &
Gamza_loc = gupxx_loc * Gamzxx(i,j,k) + gupyy_loc * Gamzyy(i,j,k) + gupzz_loc * Gamzzz(i,j,k) + & Gamxa * betayx - Gamya * betayy - Gamza * betayz + &
TWO * (gupxy_loc * Gamzxy(i,j,k) + gupxz_loc * Gamzxz(i,j,k) + gupyz_loc * Gamzyz(i,j,k)) F1o3 * (gupxy * fxx + gupyy * fxy + gupyz * fxz ) + &
Gamxa(i,j,k) = Gamxa_loc gupxx * gxxy + gupyy * gyyy + gupzz * gzzy + &
Gamya(i,j,k) = Gamya_loc TWO * (gupxy * gxyy + gupxz * gxzy + gupyz * gyzy )
Gamza(i,j,k) = Gamza_loc
Gamz_rhs = Gamz_rhs + F2o3 * Gamza * div_beta - &
Gamx_rhs(i,j,k) = Gamx_rhs(i,j,k) + F2o3 * Gamxa_loc * divb_loc - & Gamxa * betazx - Gamya * betazy - Gamza * betazz + &
Gamxa_loc * betaxx(i,j,k) - Gamya_loc * betaxy(i,j,k) - Gamza_loc * betaxz(i,j,k) + & F1o3 * (gupxz * fxx + gupyz * fxy + gupzz * fxz ) + &
F1o3 * (gupxx_loc * fxx_loc + gupxy_loc * fxy_loc + gupxz_loc * fxz_loc) + & gupxx * gxxz + gupyy * gyyz + gupzz * gzzz + &
gupxx_loc * gxxx(i,j,k) + gupyy_loc * gyyx(i,j,k) + gupzz_loc * gzzx(i,j,k) + & TWO * (gupxy * gxyz + gupxz * gxzz + gupyz * gyzz ) !rhs for Gam^i
TWO * (gupxy_loc * gxyx(i,j,k) + gupxz_loc * gxzx(i,j,k) + gupyz_loc * gyzx(i,j,k))
Gamy_rhs(i,j,k) = Gamy_rhs(i,j,k) + F2o3 * Gamya_loc * divb_loc - &
Gamxa_loc * betayx(i,j,k) - Gamya_loc * betayy(i,j,k) - Gamza_loc * betayz(i,j,k) + &
F1o3 * (gupxy_loc * fxx_loc + gupyy_loc * fxy_loc + gupyz_loc * fxz_loc) + &
gupxx_loc * gxxy(i,j,k) + gupyy_loc * gyyy(i,j,k) + gupzz_loc * gzzy(i,j,k) + &
TWO * (gupxy_loc * gxyy(i,j,k) + gupxz_loc * gxzy(i,j,k) + gupyz_loc * gyzy(i,j,k))
Gamz_rhs(i,j,k) = Gamz_rhs(i,j,k) + F2o3 * Gamza_loc * divb_loc - &
Gamxa_loc * betazx(i,j,k) - Gamya_loc * betazy(i,j,k) - Gamza_loc * betazz(i,j,k) + &
F1o3 * (gupxz_loc * fxx_loc + gupyz_loc * fxy_loc + gupzz_loc * fxz_loc) + &
gupxx_loc * gxxz(i,j,k) + gupyy_loc * gyyz(i,j,k) + gupzz_loc * gzzz(i,j,k) + &
TWO * (gupxy_loc * gxyz(i,j,k) + gupxz_loc * gxzz(i,j,k) + gupyz_loc * gyzz(i,j,k))
enddo
enddo
enddo
!first kind of connection stored in gij,k !first kind of connection stored in gij,k
gxxx = gxx * Gamxxx + gxy * Gamyxx + gxz * Gamzxx gxxx = gxx * Gamxxx + gxy * Gamyxx + gxz * Gamzxx
@@ -655,190 +601,192 @@
Gamxyz * gxzz + Gamyyz * gyzz + Gamzyz * gzzz + & Gamxyz * gxzz + Gamyyz * gyzz + Gamzyz * gzzz + &
Gamxzz * gxzy + Gamyzz * gyzy + Gamzzz * gzzy + & Gamxzz * gxzy + Gamyzz * gyzy + Gamzzz * gzzy + &
Gamxyz * gzzx + Gamyyz * gzzy + Gamzyz * gzzz ) Gamxyz * gzzx + Gamyyz * gzzy + Gamzyz * gzzz )
!covariant second derivative of chi respect to tilted metric !covariant second derivative of chi respect to tilted metric
call fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev) call fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
do k=1,ex(3) fxx = fxx - Gamxxx * chix - Gamyxx * chiy - Gamzxx * chiz
do j=1,ex(2) fxy = fxy - Gamxxy * chix - Gamyxy * chiy - Gamzxy * chiz
do i=1,ex(1) fxz = fxz - Gamxxz * chix - Gamyxz * chiy - Gamzxz * chiz
fxx(i,j,k) = fxx(i,j,k) - Gamxxx(i,j,k) * chix(i,j,k) - Gamyxx(i,j,k) * chiy(i,j,k) - Gamzxx(i,j,k) * chiz(i,j,k) fyy = fyy - Gamxyy * chix - Gamyyy * chiy - Gamzyy * chiz
fxy(i,j,k) = fxy(i,j,k) - Gamxxy(i,j,k) * chix(i,j,k) - Gamyxy(i,j,k) * chiy(i,j,k) - Gamzxy(i,j,k) * chiz(i,j,k) fyz = fyz - Gamxyz * chix - Gamyyz * chiy - Gamzyz * chiz
fxz(i,j,k) = fxz(i,j,k) - Gamxxz(i,j,k) * chix(i,j,k) - Gamyxz(i,j,k) * chiy(i,j,k) - Gamzxz(i,j,k) * chiz(i,j,k) fzz = fzz - Gamxzz * chix - Gamyzz * chiy - Gamzzz * chiz
fyy(i,j,k) = fyy(i,j,k) - Gamxyy(i,j,k) * chix(i,j,k) - Gamyyy(i,j,k) * chiy(i,j,k) - Gamzyy(i,j,k) * chiz(i,j,k) ! Store D^l D_l chi - 3/(2*chi) D^l chi D_l chi in f
fyz(i,j,k) = fyz(i,j,k) - Gamxyz(i,j,k) * chix(i,j,k) - Gamyyz(i,j,k) * chiy(i,j,k) - Gamzyz(i,j,k) * chiz(i,j,k)
fzz(i,j,k) = fzz(i,j,k) - Gamxzz(i,j,k) * chix(i,j,k) - Gamyzz(i,j,k) * chiy(i,j,k) - Gamzzz(i,j,k) * chiz(i,j,k) f = gupxx * ( fxx - F3o2/chin1 * chix * chix ) + &
gupyy * ( fyy - F3o2/chin1 * chiy * chiy ) + &
chin_loc = chin1(i,j,k) gupzz * ( fzz - F3o2/chin1 * chiz * chiz ) + &
f_loc = gupxx(i,j,k) * (fxx(i,j,k) - F3o2/chin_loc * chix(i,j,k) * chix(i,j,k)) + & TWO * gupxy * ( fxy - F3o2/chin1 * chix * chiy ) + &
gupyy(i,j,k) * (fyy(i,j,k) - F3o2/chin_loc * chiy(i,j,k) * chiy(i,j,k)) + & TWO * gupxz * ( fxz - F3o2/chin1 * chix * chiz ) + &
gupzz(i,j,k) * (fzz(i,j,k) - F3o2/chin_loc * chiz(i,j,k) * chiz(i,j,k)) + & TWO * gupyz * ( fyz - F3o2/chin1 * chiy * chiz )
TWO * gupxy(i,j,k) * (fxy(i,j,k) - F3o2/chin_loc * chix(i,j,k) * chiy(i,j,k)) + & ! Add chi part to Ricci tensor:
TWO * gupxz(i,j,k) * (fxz(i,j,k) - F3o2/chin_loc * chix(i,j,k) * chiz(i,j,k)) + &
TWO * gupyz(i,j,k) * (fyz(i,j,k) - F3o2/chin_loc * chiy(i,j,k) * chiz(i,j,k)) Rxx = Rxx + (fxx - chix*chix/chin1/TWO + gxx * f)/chin1/TWO
f(i,j,k) = f_loc Ryy = Ryy + (fyy - chiy*chiy/chin1/TWO + gyy * f)/chin1/TWO
Rzz = Rzz + (fzz - chiz*chiz/chin1/TWO + gzz * f)/chin1/TWO
Rxx(i,j,k) = Rxx(i,j,k) + (fxx(i,j,k) - chix(i,j,k)*chix(i,j,k)/chin_loc/TWO + gxx(i,j,k) * f_loc)/chin_loc/TWO Rxy = Rxy + (fxy - chix*chiy/chin1/TWO + gxy * f)/chin1/TWO
Ryy(i,j,k) = Ryy(i,j,k) + (fyy(i,j,k) - chiy(i,j,k)*chiy(i,j,k)/chin_loc/TWO + gyy(i,j,k) * f_loc)/chin_loc/TWO Rxz = Rxz + (fxz - chix*chiz/chin1/TWO + gxz * f)/chin1/TWO
Rzz(i,j,k) = Rzz(i,j,k) + (fzz(i,j,k) - chiz(i,j,k)*chiz(i,j,k)/chin_loc/TWO + gzz(i,j,k) * f_loc)/chin_loc/TWO Ryz = Ryz + (fyz - chiy*chiz/chin1/TWO + gyz * f)/chin1/TWO
Rxy(i,j,k) = Rxy(i,j,k) + (fxy(i,j,k) - chix(i,j,k)*chiy(i,j,k)/chin_loc/TWO + gxy(i,j,k) * f_loc)/chin_loc/TWO
Rxz(i,j,k) = Rxz(i,j,k) + (fxz(i,j,k) - chix(i,j,k)*chiz(i,j,k)/chin_loc/TWO + gxz(i,j,k) * f_loc)/chin_loc/TWO ! covariant second derivatives of the lapse respect to physical metric
Ryz(i,j,k) = Ryz(i,j,k) + (fyz(i,j,k) - chiy(i,j,k)*chiz(i,j,k)/chin_loc/TWO + gyz(i,j,k) * f_loc)/chin_loc/TWO call fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z, &
enddo SYM,SYM,SYM,symmetry,Lev)
enddo
enddo gxxx = (gupxx * chix + gupxy * chiy + gupxz * chiz)/chin1
gxxy = (gupxy * chix + gupyy * chiy + gupyz * chiz)/chin1
! covariant second derivatives of the lapse respect to physical metric gxxz = (gupxz * chix + gupyz * chiy + gupzz * chiz)/chin1
call fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z, & ! now get physical second kind of connection
SYM,SYM,SYM,symmetry,Lev) Gamxxx = Gamxxx - ( (chix + chix)/chin1 - gxx * gxxx )*HALF
Gamyxx = Gamyxx - ( - gxx * gxxy )*HALF
do k=1,ex(3) Gamzxx = Gamzxx - ( - gxx * gxxz )*HALF
do j=1,ex(2) Gamxyy = Gamxyy - ( - gyy * gxxx )*HALF
do i=1,ex(1) Gamyyy = Gamyyy - ( (chiy + chiy)/chin1 - gyy * gxxy )*HALF
chin_loc = chin1(i,j,k) Gamzyy = Gamzyy - ( - gyy * gxxz )*HALF
gxxx(i,j,k) = (gupxx(i,j,k) * chix(i,j,k) + gupxy(i,j,k) * chiy(i,j,k) + gupxz(i,j,k) * chiz(i,j,k)) / chin_loc Gamxzz = Gamxzz - ( - gzz * gxxx )*HALF
gxxy(i,j,k) = (gupxy(i,j,k) * chix(i,j,k) + gupyy(i,j,k) * chiy(i,j,k) + gupyz(i,j,k) * chiz(i,j,k)) / chin_loc Gamyzz = Gamyzz - ( - gzz * gxxy )*HALF
gxxz(i,j,k) = (gupxz(i,j,k) * chix(i,j,k) + gupyz(i,j,k) * chiy(i,j,k) + gupzz(i,j,k) * chiz(i,j,k)) / chin_loc Gamzzz = Gamzzz - ( (chiz + chiz)/chin1 - gzz * gxxz )*HALF
Gamxxy = Gamxxy - ( chiy /chin1 - gxy * gxxx )*HALF
Gamxxx(i,j,k) = Gamxxx(i,j,k) - ( (chix(i,j,k) + chix(i,j,k))/chin_loc - gxx(i,j,k) * gxxx(i,j,k) )*HALF Gamyxy = Gamyxy - ( chix /chin1 - gxy * gxxy )*HALF
Gamyxx(i,j,k) = Gamyxx(i,j,k) - ( - gxx(i,j,k) * gxxy(i,j,k) )*HALF Gamzxy = Gamzxy - ( - gxy * gxxz )*HALF
Gamzxx(i,j,k) = Gamzxx(i,j,k) - ( - gxx(i,j,k) * gxxz(i,j,k) )*HALF Gamxxz = Gamxxz - ( chiz /chin1 - gxz * gxxx )*HALF
Gamxyy(i,j,k) = Gamxyy(i,j,k) - ( - gyy(i,j,k) * gxxx(i,j,k) )*HALF Gamyxz = Gamyxz - ( - gxz * gxxy )*HALF
Gamyyy(i,j,k) = Gamyyy(i,j,k) - ( (chiy(i,j,k) + chiy(i,j,k))/chin_loc - gyy(i,j,k) * gxxy(i,j,k) )*HALF Gamzxz = Gamzxz - ( chix /chin1 - gxz * gxxz )*HALF
Gamzyy(i,j,k) = Gamzyy(i,j,k) - ( - gyy(i,j,k) * gxxz(i,j,k) )*HALF Gamxyz = Gamxyz - ( - gyz * gxxx )*HALF
Gamxzz(i,j,k) = Gamxzz(i,j,k) - ( - gzz(i,j,k) * gxxx(i,j,k) )*HALF Gamyyz = Gamyyz - ( chiz /chin1 - gyz * gxxy )*HALF
Gamyzz(i,j,k) = Gamyzz(i,j,k) - ( - gzz(i,j,k) * gxxy(i,j,k) )*HALF Gamzyz = Gamzyz - ( chiy /chin1 - gyz * gxxz )*HALF
Gamzzz(i,j,k) = Gamzzz(i,j,k) - ( (chiz(i,j,k) + chiz(i,j,k))/chin_loc - gzz(i,j,k) * gxxz(i,j,k) )*HALF
Gamxxy(i,j,k) = Gamxxy(i,j,k) - ( chiy(i,j,k) /chin_loc - gxy(i,j,k) * gxxx(i,j,k) )*HALF fxx = fxx - Gamxxx*Lapx - Gamyxx*Lapy - Gamzxx*Lapz
Gamyxy(i,j,k) = Gamyxy(i,j,k) - ( chix(i,j,k) /chin_loc - gxy(i,j,k) * gxxy(i,j,k) )*HALF fyy = fyy - Gamxyy*Lapx - Gamyyy*Lapy - Gamzyy*Lapz
Gamzxy(i,j,k) = Gamzxy(i,j,k) - ( - gxy(i,j,k) * gxxz(i,j,k) )*HALF fzz = fzz - Gamxzz*Lapx - Gamyzz*Lapy - Gamzzz*Lapz
Gamxxz(i,j,k) = Gamxxz(i,j,k) - ( chiz(i,j,k) /chin_loc - gxz(i,j,k) * gxxx(i,j,k) )*HALF fxy = fxy - Gamxxy*Lapx - Gamyxy*Lapy - Gamzxy*Lapz
Gamyxz(i,j,k) = Gamyxz(i,j,k) - ( - gxz(i,j,k) * gxxy(i,j,k) )*HALF fxz = fxz - Gamxxz*Lapx - Gamyxz*Lapy - Gamzxz*Lapz
Gamzxz(i,j,k) = Gamzxz(i,j,k) - ( chix(i,j,k) /chin_loc - gxz(i,j,k) * gxxz(i,j,k) )*HALF fyz = fyz - Gamxyz*Lapx - Gamyyz*Lapy - Gamzyz*Lapz
Gamxyz(i,j,k) = Gamxyz(i,j,k) - ( - gyz(i,j,k) * gxxx(i,j,k) )*HALF
Gamyyz(i,j,k) = Gamyyz(i,j,k) - ( chiz(i,j,k) /chin_loc - gyz(i,j,k) * gxxy(i,j,k) )*HALF ! store D^i D_i Lap in trK_rhs upto chi
Gamzyz(i,j,k) = Gamzyz(i,j,k) - ( chiy(i,j,k) /chin_loc - gyz(i,j,k) * gxxz(i,j,k) )*HALF trK_rhs = gupxx * fxx + gupyy * fyy + gupzz * fzz + &
TWO* ( gupxy * fxy + gupxz * fxz + gupyz * fyz )
fxx(i,j,k) = fxx(i,j,k) - Gamxxx(i,j,k)*Lapx(i,j,k) - Gamyxx(i,j,k)*Lapy(i,j,k) - Gamzxx(i,j,k)*Lapz(i,j,k) #if 1
fyy(i,j,k) = fyy(i,j,k) - Gamxyy(i,j,k)*Lapx(i,j,k) - Gamyyy(i,j,k)*Lapy(i,j,k) - Gamzyy(i,j,k)*Lapz(i,j,k) !! follow bam code
fzz(i,j,k) = fzz(i,j,k) - Gamxzz(i,j,k)*Lapx(i,j,k) - Gamyzz(i,j,k)*Lapy(i,j,k) - Gamzzz(i,j,k)*Lapz(i,j,k) S = chin1 * ( gupxx * Sxx + gupyy * Syy + gupzz * Szz + &
fxy(i,j,k) = fxy(i,j,k) - Gamxxy(i,j,k)*Lapx(i,j,k) - Gamyxy(i,j,k)*Lapy(i,j,k) - Gamzxy(i,j,k)*Lapz(i,j,k) TWO * ( gupxy * Sxy + gupxz * Sxz + gupyz * Syz ) )
fxz(i,j,k) = fxz(i,j,k) - Gamxxz(i,j,k)*Lapx(i,j,k) - Gamyxz(i,j,k)*Lapy(i,j,k) - Gamzxz(i,j,k)*Lapz(i,j,k) f = F2o3 * trK * trK -(&
fyz(i,j,k) = fyz(i,j,k) - Gamxyz(i,j,k)*Lapx(i,j,k) - Gamyyz(i,j,k)*Lapy(i,j,k) - Gamzyz(i,j,k)*Lapz(i,j,k) gupxx * ( &
gupxx * Axx * Axx + gupyy * Axy * Axy + gupzz * Axz * Axz + &
trK_rhs(i,j,k) = gupxx(i,j,k) * fxx(i,j,k) + gupyy(i,j,k) * fyy(i,j,k) + gupzz(i,j,k) * fzz(i,j,k) + & TWO * (gupxy * Axx * Axy + gupxz * Axx * Axz + gupyz * Axy * Axz) ) + &
TWO * (gupxy(i,j,k) * fxy(i,j,k) + gupxz(i,j,k) * fxz(i,j,k) + gupyz(i,j,k) * fyz(i,j,k)) gupyy * ( &
enddo gupxx * Axy * Axy + gupyy * Ayy * Ayy + gupzz * Ayz * Ayz + &
enddo TWO * (gupxy * Axy * Ayy + gupxz * Axy * Ayz + gupyz * Ayy * Ayz) ) + &
enddo gupzz * ( &
do k=1,ex(3) gupxx * Axz * Axz + gupyy * Ayz * Ayz + gupzz * Azz * Azz + &
do j=1,ex(2) TWO * (gupxy * Axz * Ayz + gupxz * Axz * Azz + gupyz * Ayz * Azz) ) + &
do i=1,ex(1) TWO * ( &
divb_loc = div_beta(i,j,k) gupxy * ( &
chin_loc = chin1(i,j,k) gupxx * Axx * Axy + gupyy * Axy * Ayy + gupzz * Axz * Ayz + &
gupxy * (Axx * Ayy + Axy * Axy) + &
S_loc = chin_loc * ( gupxx(i,j,k) * Sxx(i,j,k) + gupyy(i,j,k) * Syy(i,j,k) + gupzz(i,j,k) * Szz(i,j,k) + & gupxz * (Axx * Ayz + Axz * Axy) + &
TWO * (gupxy(i,j,k) * Sxy(i,j,k) + gupxz(i,j,k) * Sxz(i,j,k) + gupyz(i,j,k) * Syz(i,j,k)) ) gupyz * (Axy * Ayz + Axz * Ayy) ) + &
S(i,j,k) = S_loc gupxz * ( &
gupxx * Axx * Axz + gupyy * Axy * Ayz + gupzz * Axz * Azz + &
f_loc = F2o3 * trK(i,j,k) * trK(i,j,k) - ( & gupxy * (Axx * Ayz + Axy * Axz) + &
gupxx(i,j,k) * ( gupxx(i,j,k) * Axx(i,j,k) * Axx(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Axy(i,j,k) + & gupxz * (Axx * Azz + Axz * Axz) + &
gupzz(i,j,k) * Axz(i,j,k) * Axz(i,j,k) + & gupyz * (Axy * Azz + Axz * Ayz) ) + &
TWO * (gupxy(i,j,k) * Axx(i,j,k) * Axy(i,j,k) + gupxz(i,j,k) * Axx(i,j,k) * Axz(i,j,k) + & gupyz * ( &
gupyz(i,j,k) * Axy(i,j,k) * Axz(i,j,k)) ) + & gupxx * Axy * Axz + gupyy * Ayy * Ayz + gupzz * Ayz * Azz + &
gupyy(i,j,k) * ( gupxx(i,j,k) * Axy(i,j,k) * Axy(i,j,k) + gupyy(i,j,k) * Ayy(i,j,k) * Ayy(i,j,k) + & gupxy * (Axy * Ayz + Ayy * Axz) + &
gupzz(i,j,k) * Ayz(i,j,k) * Ayz(i,j,k) + & gupxz * (Axy * Azz + Ayz * Axz) + &
TWO * (gupxy(i,j,k) * Axy(i,j,k) * Ayy(i,j,k) + gupxz(i,j,k) * Axy(i,j,k) * Ayz(i,j,k) + & gupyz * (Ayy * Azz + Ayz * Ayz) ) )) -1.6d1*PI*rho + EIGHT * PI * S
gupyz(i,j,k) * Ayy(i,j,k) * Ayz(i,j,k)) ) + & f = - F1o3 *( gupxx * fxx + gupyy * fyy + gupzz * fzz + &
gupzz(i,j,k) * ( gupxx(i,j,k) * Axz(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Ayz(i,j,k) * Ayz(i,j,k) + & TWO* ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) + alpn1/chin1*f)
gupzz(i,j,k) * Azz(i,j,k) * Azz(i,j,k) + &
TWO * (gupxy(i,j,k) * Axz(i,j,k) * Ayz(i,j,k) + gupxz(i,j,k) * Axz(i,j,k) * Azz(i,j,k) + & fxx = alpn1 * (Rxx - EIGHT * PI * Sxx) - fxx
gupyz(i,j,k) * Ayz(i,j,k) * Azz(i,j,k)) ) + & fxy = alpn1 * (Rxy - EIGHT * PI * Sxy) - fxy
TWO * ( gupxy(i,j,k) * ( gupxx(i,j,k) * Axx(i,j,k) * Axy(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Ayy(i,j,k) + & fxz = alpn1 * (Rxz - EIGHT * PI * Sxz) - fxz
gupzz(i,j,k) * Axz(i,j,k) * Ayz(i,j,k) + & fyy = alpn1 * (Ryy - EIGHT * PI * Syy) - fyy
gupxy(i,j,k) * (Axx(i,j,k) * Ayy(i,j,k) + Axy(i,j,k) * Axy(i,j,k)) + & fyz = alpn1 * (Ryz - EIGHT * PI * Syz) - fyz
gupxz(i,j,k) * (Axx(i,j,k) * Ayz(i,j,k) + Axz(i,j,k) * Axy(i,j,k)) + & fzz = alpn1 * (Rzz - EIGHT * PI * Szz) - fzz
gupyz(i,j,k) * (Axy(i,j,k) * Ayz(i,j,k) + Axz(i,j,k) * Ayy(i,j,k)) ) + & #else
gupxz(i,j,k) * ( gupxx(i,j,k) * Axx(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Ayz(i,j,k) + & ! Add lapse and S_ij parts to Ricci tensor:
gupzz(i,j,k) * Axz(i,j,k) * Azz(i,j,k) + &
gupxy(i,j,k) * (Axx(i,j,k) * Ayz(i,j,k) + Axy(i,j,k) * Axz(i,j,k)) + & fxx = alpn1 * (Rxx - EIGHT * PI * Sxx) - fxx
gupxz(i,j,k) * (Axx(i,j,k) * Azz(i,j,k) + Axz(i,j,k) * Axz(i,j,k)) + & fxy = alpn1 * (Rxy - EIGHT * PI * Sxy) - fxy
gupyz(i,j,k) * (Axy(i,j,k) * Azz(i,j,k) + Axz(i,j,k) * Ayz(i,j,k)) ) + & fxz = alpn1 * (Rxz - EIGHT * PI * Sxz) - fxz
gupyz(i,j,k) * ( gupxx(i,j,k) * Axy(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Ayy(i,j,k) * Ayz(i,j,k) + & fyy = alpn1 * (Ryy - EIGHT * PI * Syy) - fyy
gupzz(i,j,k) * Ayz(i,j,k) * Azz(i,j,k) + & fyz = alpn1 * (Ryz - EIGHT * PI * Syz) - fyz
gupxy(i,j,k) * (Axy(i,j,k) * Ayz(i,j,k) + Ayy(i,j,k) * Axz(i,j,k)) + & fzz = alpn1 * (Rzz - EIGHT * PI * Szz) - fzz
gupxz(i,j,k) * (Axy(i,j,k) * Azz(i,j,k) + Ayz(i,j,k) * Axz(i,j,k)) + &
gupyz(i,j,k) * (Ayy(i,j,k) * Azz(i,j,k) + Ayz(i,j,k) * Ayz(i,j,k)) ) ) ) - & ! Compute trace-free part (note: chi^-1 and chi cancel!):
F16 * PI * rho(i,j,k) + EIGHT * PI * S_loc
f = F1o3 *( gupxx * fxx + gupyy * fyy + gupzz * fzz + &
f_loc = -F1o3 * ( gupxx(i,j,k) * fxx(i,j,k) + gupyy(i,j,k) * fyy(i,j,k) + gupzz(i,j,k) * fzz(i,j,k) + & TWO* ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) )
TWO * (gupxy(i,j,k) * fxy(i,j,k) + gupxz(i,j,k) * fxz(i,j,k) + gupyz(i,j,k) * fyz(i,j,k)) + & #endif
alpn1(i,j,k)/chin_loc * f_loc )
f(i,j,k) = f_loc Axx_rhs = fxx - gxx * f
Ayy_rhs = fyy - gyy * f
l_fxx = alpn1(i,j,k) * (Rxx(i,j,k) - EIGHT * PI * Sxx(i,j,k)) - fxx(i,j,k) Azz_rhs = fzz - gzz * f
l_fxy = alpn1(i,j,k) * (Rxy(i,j,k) - EIGHT * PI * Sxy(i,j,k)) - fxy(i,j,k) Axy_rhs = fxy - gxy * f
l_fxz = alpn1(i,j,k) * (Rxz(i,j,k) - EIGHT * PI * Sxz(i,j,k)) - fxz(i,j,k) Axz_rhs = fxz - gxz * f
l_fyy = alpn1(i,j,k) * (Ryy(i,j,k) - EIGHT * PI * Syy(i,j,k)) - fyy(i,j,k) Ayz_rhs = fyz - gyz * f
l_fyz = alpn1(i,j,k) * (Ryz(i,j,k) - EIGHT * PI * Syz(i,j,k)) - fyz(i,j,k)
l_fzz = alpn1(i,j,k) * (Rzz(i,j,k) - EIGHT * PI * Szz(i,j,k)) - fzz(i,j,k) ! Now: store A_il A^l_j into fij:
Axx_rhs(i,j,k) = l_fxx - gxx(i,j,k) * f_loc fxx = gupxx * Axx * Axx + gupyy * Axy * Axy + gupzz * Axz * Axz + &
Ayy_rhs(i,j,k) = l_fyy - gyy(i,j,k) * f_loc TWO * (gupxy * Axx * Axy + gupxz * Axx * Axz + gupyz * Axy * Axz)
Azz_rhs(i,j,k) = l_fzz - gzz(i,j,k) * f_loc fyy = gupxx * Axy * Axy + gupyy * Ayy * Ayy + gupzz * Ayz * Ayz + &
Axy_rhs(i,j,k) = l_fxy - gxy(i,j,k) * f_loc TWO * (gupxy * Axy * Ayy + gupxz * Axy * Ayz + gupyz * Ayy * Ayz)
Axz_rhs(i,j,k) = l_fxz - gxz(i,j,k) * f_loc fzz = gupxx * Axz * Axz + gupyy * Ayz * Ayz + gupzz * Azz * Azz + &
Ayz_rhs(i,j,k) = l_fyz - gyz(i,j,k) * f_loc TWO * (gupxy * Axz * Ayz + gupxz * Axz * Azz + gupyz * Ayz * Azz)
fxy = gupxx * Axx * Axy + gupyy * Axy * Ayy + gupzz * Axz * Ayz + &
fxx(i,j,k) = gupxx(i,j,k) * Axx(i,j,k) * Axx(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Axy(i,j,k) + & gupxy *(Axx * Ayy + Axy * Axy) + &
gupzz(i,j,k) * Axz(i,j,k) * Axz(i,j,k) + TWO * (gupxy(i,j,k) * Axx(i,j,k) * Axy(i,j,k) + & gupxz *(Axx * Ayz + Axz * Axy) + &
gupxz(i,j,k) * Axx(i,j,k) * Axz(i,j,k) + gupyz(i,j,k) * Axy(i,j,k) * Axz(i,j,k)) gupyz *(Axy * Ayz + Axz * Ayy)
fyy(i,j,k) = gupxx(i,j,k) * Axy(i,j,k) * Axy(i,j,k) + gupyy(i,j,k) * Ayy(i,j,k) * Ayy(i,j,k) + & fxz = gupxx * Axx * Axz + gupyy * Axy * Ayz + gupzz * Axz * Azz + &
gupzz(i,j,k) * Ayz(i,j,k) * Ayz(i,j,k) + TWO * (gupxy(i,j,k) * Axy(i,j,k) * Ayy(i,j,k) + & gupxy *(Axx * Ayz + Axy * Axz) + &
gupxz(i,j,k) * Axy(i,j,k) * Ayz(i,j,k) + gupyz(i,j,k) * Ayy(i,j,k) * Ayz(i,j,k)) gupxz *(Axx * Azz + Axz * Axz) + &
fzz(i,j,k) = gupxx(i,j,k) * Axz(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Ayz(i,j,k) * Ayz(i,j,k) + & gupyz *(Axy * Azz + Axz * Ayz)
gupzz(i,j,k) * Azz(i,j,k) * Azz(i,j,k) + TWO * (gupxy(i,j,k) * Axz(i,j,k) * Ayz(i,j,k) + & fyz = gupxx * Axy * Axz + gupyy * Ayy * Ayz + gupzz * Ayz * Azz + &
gupxz(i,j,k) * Axz(i,j,k) * Azz(i,j,k) + gupyz(i,j,k) * Ayz(i,j,k) * Azz(i,j,k)) gupxy *(Axy * Ayz + Ayy * Axz) + &
fxy(i,j,k) = gupxx(i,j,k) * Axx(i,j,k) * Axy(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Ayy(i,j,k) + & gupxz *(Axy * Azz + Ayz * Axz) + &
gupzz(i,j,k) * Axz(i,j,k) * Ayz(i,j,k) + gupxy(i,j,k) * (Axx(i,j,k) * Ayy(i,j,k) + Axy(i,j,k) * Axy(i,j,k)) + & gupyz *(Ayy * Azz + Ayz * Ayz)
gupxz(i,j,k) * (Axx(i,j,k) * Ayz(i,j,k) + Axz(i,j,k) * Axy(i,j,k)) + &
gupyz(i,j,k) * (Axy(i,j,k) * Ayz(i,j,k) + Axz(i,j,k) * Ayy(i,j,k)) f = chin1
fxz(i,j,k) = gupxx(i,j,k) * Axx(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Ayz(i,j,k) + & ! store D^i D_i Lap in trK_rhs
gupzz(i,j,k) * Axz(i,j,k) * Azz(i,j,k) + gupxy(i,j,k) * (Axx(i,j,k) * Ayz(i,j,k) + Axy(i,j,k) * Axz(i,j,k)) + & trK_rhs = f*trK_rhs
gupxz(i,j,k) * (Axx(i,j,k) * Azz(i,j,k) + Axz(i,j,k) * Axz(i,j,k)) + &
gupyz(i,j,k) * (Axy(i,j,k) * Azz(i,j,k) + Axz(i,j,k) * Ayz(i,j,k)) Axx_rhs = f * Axx_rhs+ alpn1 * (trK * Axx - TWO * fxx) + &
fyz(i,j,k) = gupxx(i,j,k) * Axy(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Ayy(i,j,k) * Ayz(i,j,k) + & TWO * ( Axx * betaxx + Axy * betayx + Axz * betazx )- &
gupzz(i,j,k) * Ayz(i,j,k) * Azz(i,j,k) + gupxy(i,j,k) * (Axy(i,j,k) * Ayz(i,j,k) + Ayy(i,j,k) * Axz(i,j,k)) + & F2o3 * Axx * div_beta
gupxz(i,j,k) * (Axy(i,j,k) * Azz(i,j,k) + Ayz(i,j,k) * Axz(i,j,k)) + &
gupyz(i,j,k) * (Ayy(i,j,k) * Azz(i,j,k) + Ayz(i,j,k) * Ayz(i,j,k)) Ayy_rhs = f * Ayy_rhs+ alpn1 * (trK * Ayy - TWO * fyy) + &
TWO * ( Axy * betaxy + Ayy * betayy + Ayz * betazy )- &
trK_rhs(i,j,k) = chin_loc * trK_rhs(i,j,k) F2o3 * Ayy * div_beta
Axx_rhs(i,j,k) = chin_loc * Axx_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Axx(i,j,k) - TWO * fxx(i,j,k)) + & Azz_rhs = f * Azz_rhs+ alpn1 * (trK * Azz - TWO * fzz) + &
TWO * (Axx(i,j,k) * betaxx(i,j,k) + Axy(i,j,k) * betayx(i,j,k) + Axz(i,j,k) * betazx(i,j,k)) - & TWO * ( Axz * betaxz + Ayz * betayz + Azz * betazz )- &
F2o3 * Axx(i,j,k) * divb_loc F2o3 * Azz * div_beta
Ayy_rhs(i,j,k) = chin_loc * Ayy_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Ayy(i,j,k) - TWO * fyy(i,j,k)) + &
TWO * (Axy(i,j,k) * betaxy(i,j,k) + Ayy(i,j,k) * betayy(i,j,k) + Ayz(i,j,k) * betazy(i,j,k)) - & Axy_rhs = f * Axy_rhs+ alpn1 *( trK * Axy - TWO * fxy )+ &
F2o3 * Ayy(i,j,k) * divb_loc Axx * betaxy + Axz * betazy + &
Azz_rhs(i,j,k) = chin_loc * Azz_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Azz(i,j,k) - TWO * fzz(i,j,k)) + & Ayy * betayx + Ayz * betazx + &
TWO * (Axz(i,j,k) * betaxz(i,j,k) + Ayz(i,j,k) * betayz(i,j,k) + Azz(i,j,k) * betazz(i,j,k)) - & F1o3 * Axy * div_beta - Axy * betazz
F2o3 * Azz(i,j,k) * divb_loc
Axy_rhs(i,j,k) = chin_loc * Axy_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Axy(i,j,k) - TWO * fxy(i,j,k)) + & Ayz_rhs = f * Ayz_rhs+ alpn1 *( trK * Ayz - TWO * fyz )+ &
Axx(i,j,k) * betaxy(i,j,k) + Axz(i,j,k) * betazy(i,j,k) + Ayy(i,j,k) * betayx(i,j,k) + & Axy * betaxz + Ayy * betayz + &
Ayz(i,j,k) * betazx(i,j,k) + F1o3 * Axy(i,j,k) * divb_loc - Axy(i,j,k) * betazz(i,j,k) Axz * betaxy + Azz * betazy + &
Ayz_rhs(i,j,k) = chin_loc * Ayz_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Ayz(i,j,k) - TWO * fyz(i,j,k)) + & F1o3 * Ayz * div_beta - Ayz * betaxx
Axy(i,j,k) * betaxz(i,j,k) + Ayy(i,j,k) * betayz(i,j,k) + Axz(i,j,k) * betaxy(i,j,k) + &
Azz(i,j,k) * betazy(i,j,k) + F1o3 * Ayz(i,j,k) * divb_loc - Ayz(i,j,k) * betaxx(i,j,k) Axz_rhs = f * Axz_rhs+ alpn1 *( trK * Axz - TWO * fxz )+ &
Axz_rhs(i,j,k) = chin_loc * Axz_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Axz(i,j,k) - TWO * fxz(i,j,k)) + & Axx * betaxz + Axy * betayz + &
Axx(i,j,k) * betaxz(i,j,k) + Axy(i,j,k) * betayz(i,j,k) + Ayz(i,j,k) * betayx(i,j,k) + & Ayz * betayx + Azz * betazx + &
Azz(i,j,k) * betazx(i,j,k) + F1o3 * Axz(i,j,k) * divb_loc - Axz(i,j,k) * betayy(i,j,k) F1o3 * Axz * div_beta - Axz * betayy !rhs for Aij
trK_rhs(i,j,k) = - trK_rhs(i,j,k) + alpn1(i,j,k) * ( F1o3 * trK(i,j,k) * trK(i,j,k) + & ! Compute trace of S_ij
gupxx(i,j,k) * fxx(i,j,k) + gupyy(i,j,k) * fyy(i,j,k) + gupzz(i,j,k) * fzz(i,j,k) + &
TWO * (gupxy(i,j,k) * fxy(i,j,k) + gupxz(i,j,k) * fxz(i,j,k) + gupyz(i,j,k) * fyz(i,j,k)) + & S = f * ( gupxx * Sxx + gupyy * Syy + gupzz * Szz + &
FOUR * PI * (rho(i,j,k) + S_loc) ) TWO * ( gupxy * Sxy + gupxz * Sxz + gupyz * Syz ) )
enddo
enddo trK_rhs = - trK_rhs + alpn1 *( F1o3 * trK * trK + &
enddo gupxx * fxx + gupyy * fyy + gupzz * fzz + &
TWO * ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) + &
FOUR * PI * ( rho + S )) !rhs for trK
!!!! gauge variable part !!!! gauge variable part
@@ -1000,15 +948,15 @@
!!!!!!!!!advection term + Kreiss-Oliger dissipation (merged for cache efficiency) !!!!!!!!!advection term + Kreiss-Oliger dissipation (merged for cache efficiency)
! lopsided_kodis shares the symmetry_bd buffer between advection and ! lopsided_kodis shares the symmetry_bd buffer between advection and
! dissipation, eliminating redundant full-grid copies. For metric variables ! dissipation, eliminating redundant full-grid copies. For metric variables
! gxx/gyy/gzz (=dxx/dyy/dzz+1): stencil coefficients sum to zero, ! gxx/gyy/gzz (=dxx/dyy/dzz+1): kodis stencil coefficients sum to zero,
! so the constant offset has no effect on dissipation. ! so the constant offset has no effect on dissipation.
call lopsided_kodis(ex,X,Y,Z,dxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided_kodis(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps) call lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
call lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps) call lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps)
call lopsided_kodis(ex,X,Y,Z,dyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided_kodis(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps) call lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps)
call lopsided_kodis(ex,X,Y,Z,dzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided_kodis(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps) call lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps)

View File

@@ -716,7 +716,6 @@ int f_compute_rhs_bssn(int *ex, double &T,
// 24ms // // 24ms //
fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev); fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
// 6ms // // 6ms //
for (int i=0;i<all;i+=1) { for (int i=0;i<all;i+=1) {
@@ -1014,24 +1013,17 @@ int f_compute_rhs_bssn(int *ex, double &T,
betaz_rhs[i] = FF * dtSfz[i]; betaz_rhs[i] = FF * dtSfz[i];
reta[i] = reta[i] =
gupxx[i] * dtSfx_rhs[i] * dtSfx_rhs[i] gupxx[i] * chix[i] * chix[i]
+ gupyy[i] * dtSfy_rhs[i] * dtSfy_rhs[i] + gupyy[i] * chiy[i] * chiy[i]
+ gupzz[i] * dtSfz_rhs[i] * dtSfz_rhs[i] + gupzz[i] * chiz[i] * chiz[i]
+ TWO * ( gupxy[i] * dtSfx_rhs[i] * dtSfy_rhs[i] + TWO * ( gupxy[i] * chix[i] * chiy[i]
+ gupxz[i] * dtSfx_rhs[i] * dtSfz_rhs[i] + gupxz[i] * chix[i] * chiz[i]
+ gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] ); + gupyz[i] * chiy[i] * chiz[i] );
#if (GAUGE == 2) #if (GAUGE == 2)
{ reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - sqrt(chin1[i])), 2.0 );
const double chi_sqrt = sqrt(chin1[i]);
const double damping = ONE - chi_sqrt;
reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / (damping * damping);
}
#else #else
{ reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - chin1[i]), 2.0 );
const double damping = ONE - chin1[i];
reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / (damping * damping);
}
#endif #endif
dtSfx_rhs[i] = Gamx_rhs[i] - reta[i] * dtSfx[i]; dtSfx_rhs[i] = Gamx_rhs[i] - reta[i] * dtSfx[i];
@@ -1039,24 +1031,17 @@ int f_compute_rhs_bssn(int *ex, double &T,
dtSfz_rhs[i] = Gamz_rhs[i] - reta[i] * dtSfz[i]; dtSfz_rhs[i] = Gamz_rhs[i] - reta[i] * dtSfz[i];
#elif (GAUGE == 4 || GAUGE == 5) #elif (GAUGE == 4 || GAUGE == 5)
reta[i] = reta[i] =
gupxx[i] * dtSfx_rhs[i] * dtSfx_rhs[i] gupxx[i] * chix[i] * chix[i]
+ gupyy[i] * dtSfy_rhs[i] * dtSfy_rhs[i] + gupyy[i] * chiy[i] * chiy[i]
+ gupzz[i] * dtSfz_rhs[i] * dtSfz_rhs[i] + gupzz[i] * chiz[i] * chiz[i]
+ TWO * ( gupxy[i] * dtSfx_rhs[i] * dtSfy_rhs[i] + TWO * ( gupxy[i] * chix[i] * chiy[i]
+ gupxz[i] * dtSfx_rhs[i] * dtSfz_rhs[i] + gupxz[i] * chix[i] * chiz[i]
+ gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] ); + gupyz[i] * chiy[i] * chiz[i] );
#if (GAUGE == 4) #if (GAUGE == 4)
{ reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - sqrt(chin1[i])), 2.0 );
const double chi_sqrt = sqrt(chin1[i]);
const double damping = ONE - chi_sqrt;
reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / (damping * damping);
}
#else #else
{ reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - chin1[i]), 2.0 );
const double damping = ONE - chin1[i];
reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / (damping * damping);
}
#endif #endif
betax_rhs[i] = FF * Gamx[i] - reta[i] * betax[i]; betax_rhs[i] = FF * Gamx[i] - reta[i] * betax[i];

View File

@@ -23,14 +23,10 @@ using namespace std;
#include <mpi.h> #include <mpi.h>
#include "macrodef.h" #include "macrodef.h"
#include "misc.h" #include "misc.h"
#include "cgh.h" #include "cgh.h"
#include "Parallel.h" #include "Parallel.h"
#include "parameters.h" #include "parameters.h"
#ifdef USE_GPU
#include "bssn_gpu.h"
#include "bssn_cuda_ops.h"
#endif
//================================================================================================ //================================================================================================
@@ -885,20 +881,13 @@ void cgh::recompose_cgh(int nprocs, bool *lev_flag,
tmPat = construct_patchlist(lev, Symmetry); tmPat = construct_patchlist(lev, Symmetry);
// tmPat construction completes // tmPat construction completes
Parallel::distribute(tmPat, nprocs, ingfs, fngfs, false); Parallel::distribute(tmPat, nprocs, ingfs, fngfs, false);
// checkPatchList(tmPat,true); // checkPatchList(tmPat,true);
bool CC = (lev > trfls); bool CC = (lev > trfls);
Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC); Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
#ifdef USE_GPU Parallel::KillBlocks(PatL[lev]);
bssn_gpu_clear_cached_device_buffers(); PatL[lev]->destroyList();
bssn_gpu_release_pinned_host_buffers(); PatL[lev] = tmPat;
bssn_cuda_release_rk4_caches();
bssn_cuda_release_interp_caches();
patch_release_interp_plan_cache();
#endif
Parallel::KillBlocks(PatL[lev]);
PatL[lev]->destroyList();
PatL[lev] = tmPat;
#if (RPB == 1) #if (RPB == 1)
Parallel::destroypsuList_bam(bdsul[lev]); Parallel::destroypsuList_bam(bdsul[lev]);
Parallel::destroypsuList_bam(rsul[lev]); Parallel::destroypsuList_bam(rsul[lev]);
@@ -921,20 +910,13 @@ void cgh::recompose_cgh(int nprocs, bool *lev_flag,
tmPat = construct_patchlist(lev, Symmetry); tmPat = construct_patchlist(lev, Symmetry);
// tmPat construction completes // tmPat construction completes
Parallel::distribute(tmPat, end_rank[lev] - start_rank[lev] + 1, ingfs, fngfs, false, start_rank[lev], end_rank[lev]); Parallel::distribute(tmPat, end_rank[lev] - start_rank[lev] + 1, ingfs, fngfs, false, start_rank[lev], end_rank[lev]);
// checkPatchList(tmPat,true); // checkPatchList(tmPat,true);
bool CC = (lev > trfls); bool CC = (lev > trfls);
Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC); Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
#ifdef USE_GPU Parallel::KillBlocks(PatL[lev]);
bssn_gpu_clear_cached_device_buffers(); PatL[lev]->destroyList();
bssn_gpu_release_pinned_host_buffers(); PatL[lev] = tmPat;
bssn_cuda_release_rk4_caches();
bssn_cuda_release_interp_caches();
patch_release_interp_plan_cache();
#endif
Parallel::KillBlocks(PatL[lev]);
PatL[lev]->destroyList();
PatL[lev] = tmPat;
#if (RPB == 1) #if (RPB == 1)
#error "not support yet" #error "not support yet"
#endif #endif
@@ -1536,20 +1518,13 @@ void cgh::recompose_cgh_Onelevel(int nprocs, int lev,
tmPat = construct_patchlist(lev, Symmetry); tmPat = construct_patchlist(lev, Symmetry);
// tmPat construction completes // tmPat construction completes
Parallel::distribute(tmPat, nprocs, ingfs, fngfs, false); Parallel::distribute(tmPat, nprocs, ingfs, fngfs, false);
// checkPatchList(tmPat,true); // checkPatchList(tmPat,true);
bool CC = (lev > trfls); bool CC = (lev > trfls);
Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC); Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
#ifdef USE_GPU Parallel::KillBlocks(PatL[lev]);
bssn_gpu_clear_cached_device_buffers(); PatL[lev]->destroyList();
bssn_gpu_release_pinned_host_buffers(); PatL[lev] = tmPat;
bssn_cuda_release_rk4_caches();
bssn_cuda_release_interp_caches();
patch_release_interp_plan_cache();
#endif
Parallel::KillBlocks(PatL[lev]);
PatL[lev]->destroyList();
PatL[lev] = tmPat;
} }
#elif (PSTR == 1 || PSTR == 2 || PSTR == 3) #elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
#warning "recompose_cgh_Onelevel is not implimented yet" #warning "recompose_cgh_Onelevel is not implimented yet"
@@ -1565,21 +1540,14 @@ void cgh::recompose_cgh_Onelevel(int nprocs, int lev,
// tmPat construction completes // tmPat construction completes
Parallel::distribute(tmPat, end_rank[lev] - start_rank[lev] + 1, ingfs, fngfs, false, start_rank[lev], end_rank[lev]); Parallel::distribute(tmPat, end_rank[lev] - start_rank[lev] + 1, ingfs, fngfs, false, start_rank[lev], end_rank[lev]);
misc::tillherecheck(Commlev[lev], start_rank[lev], "after distribute"); misc::tillherecheck(Commlev[lev], start_rank[lev], "after distribute");
// checkPatchList(tmPat,true); // checkPatchList(tmPat,true);
bool CC = (lev > trfls); bool CC = (lev > trfls);
Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC); Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
misc::tillherecheck(Commlev[lev], start_rank[lev], "after fill_level_data"); misc::tillherecheck(Commlev[lev], start_rank[lev], "after fill_level_data");
#ifdef USE_GPU Parallel::KillBlocks(PatL[lev]);
bssn_gpu_clear_cached_device_buffers(); PatL[lev]->destroyList();
bssn_gpu_release_pinned_host_buffers(); PatL[lev] = tmPat;
bssn_cuda_release_rk4_caches();
bssn_cuda_release_interp_caches();
patch_release_interp_plan_cache();
#endif
Parallel::KillBlocks(PatL[lev]);
PatL[lev]->destroyList();
PatL[lev] = tmPat;
} }

View File

@@ -30,8 +30,8 @@ CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \ f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG) -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
endif endif
.SUFFIXES: .o .f90 .C .for .cu .SUFFIXES: .o .f90 .C .for .cu
.f90.o: .f90.o:
$(f90) $(f90appflags) -c $< -o $@ $(f90) $(f90appflags) -c $< -o $@
@@ -64,8 +64,8 @@ lopsided_c.o: lopsided_c.C
lopsided_kodis_c.o: lopsided_kodis_c.C lopsided_kodis_c.o: lopsided_kodis_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
#interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
# ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS ## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
@@ -105,12 +105,13 @@ C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\ Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o
C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
cgh.o bssn_class.o surface_integral.o ShellPatch.o\ cgh.o surface_integral.o ShellPatch.o\
bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\ bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\ bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\ Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
NullShellPatch2_Evo.o bssn_cuda_step.o writefile_f.o NullShellPatch2_Evo.o \
bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o
F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\ F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
prolongrestrict_cell.o prolongrestrict_vertex.o\ prolongrestrict_cell.o prolongrestrict_vertex.o\
@@ -142,7 +143,7 @@ initial_guess.o Newton.o Jacobian.o ilucg.o IntPnts0.o IntPnts.o
TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o
CUDAFILES = bssn_gpu.o bssn_cuda_ops.o CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o
# file dependences # file dependences
$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh $(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh

View File

@@ -9,7 +9,6 @@ filein = -I/usr/include/ -I${MKLROOT}/include
## Using sequential MKL (OpenMP disabled for better single-threaded performance) ## Using sequential MKL (OpenMP disabled for better single-threaded performance)
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library ## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5 LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
CUDA_LDLIBS = -L/usr/local/cuda-12.9/targets/x86_64-linux/lib -lcudart
## Memory allocator switch ## Memory allocator switch
## 1 (default) : link Intel oneTBB allocator (libtbbmalloc) ## 1 (default) : link Intel oneTBB allocator (libtbbmalloc)
@@ -25,8 +24,6 @@ ifeq ($(USE_TBBMALLOC),1)
LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS) LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
endif endif
LDLIBS := $(CUDA_LDLIBS) $(LDLIBS)
## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags) ## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
## opt : (default) maximum performance with PGO profile-guided optimization ## opt : (default) maximum performance with PGO profile-guided optimization
## instrument : PGO Phase 1 instrumentation to collect fresh profile data ## instrument : PGO Phase 1 instrumentation to collect fresh profile data

View File

@@ -180,64 +180,19 @@ surface_integral::surface_integral(int iSymmetry) : Symmetry(iSymmetry)
//|============================================================================ //|============================================================================
//| Destructor //| Destructor
//|============================================================================ //|============================================================================
surface_integral::~surface_integral() surface_integral::~surface_integral()
{ {
release_cached_buffers(); delete[] nx_g;
delete[] nx_g; delete[] ny_g;
delete[] ny_g; delete[] nz_g;
delete[] nz_g; delete[] arcostheta;
delete[] arcostheta; #ifdef GaussInt
#ifdef GaussInt delete[] wtcostheta;
delete[] wtcostheta; #endif
#endif }
} //|----------------------------------------------------------------
// spin weighted spinw component of psi4, general routine
void surface_integral::get_surface_points(double rex, double **pox) // l takes from spinw to maxl; m takes from -l to l
{
SpherePointCache &cache = sphere_point_cache[rex];
if (!cache.pox[0])
{
for (int i = 0; i < 3; ++i)
cache.pox[i] = new double[n_tot];
for (int n = 0; n < n_tot; ++n)
{
cache.pox[0][n] = rex * nx_g[n];
cache.pox[1][n] = rex * ny_g[n];
cache.pox[2][n] = rex * nz_g[n];
}
}
pox[0] = cache.pox[0];
pox[1] = cache.pox[1];
pox[2] = cache.pox[2];
}
double *surface_integral::get_shellf_buffer(int num_var)
{
double *&buffer = shellf_cache[num_var];
if (!buffer)
buffer = new double[n_tot * num_var];
return buffer;
}
void surface_integral::release_cached_buffers()
{
for (map<double, SpherePointCache>::iterator it = sphere_point_cache.begin(); it != sphere_point_cache.end(); ++it)
{
delete[] it->second.pox[0];
delete[] it->second.pox[1];
delete[] it->second.pox[2];
it->second.pox[0] = it->second.pox[1] = it->second.pox[2] = 0;
}
sphere_point_cache.clear();
for (map<int, double *>::iterator it = shellf_cache.begin(); it != shellf_cache.end(); ++it)
delete[] it->second;
shellf_cache.clear();
}
//|----------------------------------------------------------------
// spin weighted spinw component of psi4, general routine
// l takes from spinw to maxl; m takes from -l to l
//|---------------------------------------------------------------- //|----------------------------------------------------------------
void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4, void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,
int spinw, int maxl, int NN, double *RP, double *IP, int spinw, int maxl, int NN, double *RP, double *IP,
@@ -254,9 +209,16 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
MyList<var> *DG_List = new MyList<var>(Rpsi4); MyList<var> *DG_List = new MyList<var>(Rpsi4);
DG_List->insert(Ipsi4); DG_List->insert(Ipsi4);
int n; int n;
double *pox[3]; double *pox[3];
get_surface_points(rex, pox); for (int i = 0; i < 3; i++)
pox[i] = new double[n_tot];
for (n = 0; n < n_tot; n++)
{
pox[0][n] = rex * nx_g[n];
pox[1][n] = rex * ny_g[n];
pox[2][n] = rex * nz_g[n];
}
int mp, Lp, Nmin, Nmax; int mp, Lp, Nmin, Nmax;
mp = n_tot / cpusize; mp = n_tot / cpusize;
@@ -272,7 +234,8 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
Nmax = Nmin + mp - 1; Nmax = Nmin + mp - 1;
} }
double *shellf = get_shellf_buffer(InList); double *shellf;
shellf = new double[n_tot * InList];
GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax); GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);
@@ -412,10 +375,14 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
//|------= Free memory. //|------= Free memory.
delete[] RP_out; delete[] pox[0];
delete[] IP_out; delete[] pox[1];
DG_List->clearList(); delete[] pox[2];
} delete[] shellf;
delete[] RP_out;
delete[] IP_out;
DG_List->clearList();
}
void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4, void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,
int spinw, int maxl, int NN, double *RP, double *IP, int spinw, int maxl, int NN, double *RP, double *IP,
monitor *Monitor, MPI_Comm Comm_here) // NN is the length of RP and IP monitor *Monitor, MPI_Comm Comm_here) // NN is the length of RP and IP
@@ -435,11 +402,19 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
MyList<var> *DG_List = new MyList<var>(Rpsi4); MyList<var> *DG_List = new MyList<var>(Rpsi4);
DG_List->insert(Ipsi4); DG_List->insert(Ipsi4);
int n; int n;
double *pox[3]; double *pox[3];
get_surface_points(rex, pox); for (int i = 0; i < 3; i++)
pox[i] = new double[n_tot];
double *shellf = get_shellf_buffer(InList); for (n = 0; n < n_tot; n++)
{
pox[0][n] = rex * nx_g[n];
pox[1][n] = rex * ny_g[n];
pox[2][n] = rex * nz_g[n];
}
double *shellf;
shellf = new double[n_tot * InList];
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Interp_Points"); // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Interp_Points");
@@ -602,10 +577,14 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
//|------= Free memory. //|------= Free memory.
delete[] RP_out; delete[] pox[0];
delete[] IP_out; delete[] pox[1];
DG_List->clearList(); delete[] pox[2];
} delete[] shellf;
delete[] RP_out;
delete[] IP_out;
DG_List->clearList();
}
//|---------------------------------------------------------------- //|----------------------------------------------------------------
// for shell patch // for shell patch
//|---------------------------------------------------------------- //|----------------------------------------------------------------
@@ -618,11 +597,19 @@ void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH, var *Rpsi4
MyList<var> *DG_List = new MyList<var>(Rpsi4); MyList<var> *DG_List = new MyList<var>(Rpsi4);
DG_List->insert(Ipsi4); DG_List->insert(Ipsi4);
int n; int n;
double *pox[3]; double *pox[3];
get_surface_points(rex, pox); for (int i = 0; i < 3; i++)
pox[i] = new double[n_tot];
for (n = 0; n < n_tot; n++)
{
pox[0][n] = rex * nx_g[n];
pox[1][n] = rex * ny_g[n];
pox[2][n] = rex * nz_g[n];
}
double *shellf = get_shellf_buffer(InList); double *shellf;
shellf = new double[n_tot * InList];
GH->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry); GH->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry);
@@ -2583,8 +2570,12 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
Rout[5] = sy; Rout[5] = sy;
Rout[6] = sz; Rout[6] = sz;
DG_List->clearList(); delete[] pox[0];
} delete[] pox[1];
delete[] pox[2];
delete[] shellf;
DG_List->clearList();
}
void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var *trK, void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz, var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz, var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
@@ -2646,11 +2637,19 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
DG_List->insert(Ayz); DG_List->insert(Ayz);
DG_List->insert(Azz); DG_List->insert(Azz);
int n; int n;
double *pox[3]; double *pox[3];
get_surface_points(rex, pox); for (int i = 0; i < 3; i++)
pox[i] = new double[n_tot];
double *shellf = get_shellf_buffer(InList); for (n = 0; n < n_tot; n++)
{
pox[0][n] = rex * nx_g[n];
pox[1][n] = rex * ny_g[n];
pox[2][n] = rex * nz_g[n];
}
double *shellf;
shellf = new double[n_tot * InList];
// we have assumed there is only one box on this level, // we have assumed there is only one box on this level,
// so we do not need loop boxes // so we do not need loop boxes
@@ -2840,8 +2839,12 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
Rout[5] = sy; Rout[5] = sy;
Rout[6] = sz; Rout[6] = sz;
DG_List->clearList(); delete[] pox[0];
} delete[] pox[1];
delete[] pox[2];
delete[] shellf;
DG_List->clearList();
}
//|---------------------------------------------------------------- //|----------------------------------------------------------------
// for shell patch // for shell patch
//|---------------------------------------------------------------- //|----------------------------------------------------------------

View File

@@ -20,41 +20,25 @@ using namespace std;
#include "cgh.h" #include "cgh.h"
#include "ShellPatch.h" #include "ShellPatch.h"
#include "NullShellPatch.h" #include "NullShellPatch.h"
#include "NullShellPatch2.h" #include "NullShellPatch2.h"
#include "var.h" #include "var.h"
#include "monitor.h" #include "monitor.h"
#include <map>
class surface_integral class surface_integral
{ {
private: private:
struct SpherePointCache int Symmetry, factor;
{ int N_theta, N_phi; // Number of points in Theta & Phi directions
double *pox[3]; double dphi, dcostheta;
SpherePointCache() double *arcostheta, *wtcostheta;
{ int n_tot; // size of arrays
pox[0] = pox[1] = pox[2] = 0;
} double *nx_g, *ny_g, *nz_g; // global list of unit normals
}; int myrank, cpusize;
int Symmetry, factor; public:
int N_theta, N_phi; // Number of points in Theta & Phi directions surface_integral(int iSymmetry);
double dphi, dcostheta;
double *arcostheta, *wtcostheta;
int n_tot; // size of arrays
double *nx_g, *ny_g, *nz_g; // global list of unit normals
int myrank, cpusize;
map<double, SpherePointCache> sphere_point_cache;
map<int, double *> shellf_cache;
void get_surface_points(double rex, double **pox);
double *get_shellf_buffer(int num_var);
void release_cached_buffers();
public:
surface_integral(int iSymmetry);
~surface_integral(); ~surface_integral();
void surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4, void surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,

View File

@@ -9,7 +9,6 @@
import AMSS_NCKU_Input as input_data import AMSS_NCKU_Input as input_data
import os
import subprocess import subprocess
import time import time
@@ -58,48 +57,6 @@ BUILD_JOBS = 64
################################################################## ##################################################################
##################################################################
def prepare_gpu_runtime_env():
"""
Create a user-private CUDA MPS environment for GPU runs.
On shared machines another user's daemon may already occupy the default
/tmp/nvidia-mps pipe directory, which makes plain cudaSetDevice/cudaMalloc
fail with cudaErrorMpsConnectionFailed. Binding AMSS-NCKU to a private
pipe directory avoids cross-user interference.
"""
env = os.environ.copy()
pipe_dir = env.get("CUDA_MPS_PIPE_DIRECTORY", f"/tmp/amss-ncku-mps-{os.getuid()}")
log_dir = env.get("CUDA_MPS_LOG_DIRECTORY", f"/tmp/amss-ncku-mps-log-{os.getuid()}")
os.makedirs(pipe_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)
env["CUDA_MPS_PIPE_DIRECTORY"] = pipe_dir
env["CUDA_MPS_LOG_DIRECTORY"] = log_dir
control_socket = os.path.join(pipe_dir, "control")
if not os.path.exists(control_socket):
start = subprocess.run(
["nvidia-cuda-mps-control", "-d"],
env=env,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
if start.returncode != 0:
print(f" Warning: failed to start private CUDA MPS daemon in {pipe_dir}")
else:
print(f" Using private CUDA MPS pipe directory: {pipe_dir}")
else:
print(f" Using existing private CUDA MPS pipe directory: {pipe_dir}")
return env
##################################################################
################################################################## ##################################################################
@@ -189,29 +146,16 @@ def run_ABE():
## Define the command to run; cast other values to strings as needed ## Define the command to run; cast other values to strings as needed
run_env = None
if (input_data.GPU_Calculation == "no"): if (input_data.GPU_Calculation == "no"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
#mpi_command = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" #mpi_command = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
mpi_command_outfile = "ABE_out.log" mpi_command_outfile = "ABE_out.log"
elif (input_data.GPU_Calculation == "yes"): elif (input_data.GPU_Calculation == "yes"):
run_env = prepare_gpu_runtime_env() mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
if int(input_data.MPI_processes) == 1:
mpi_command = "./ABEGPU"
else:
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
mpi_command_outfile = "ABEGPU_out.log" mpi_command_outfile = "ABEGPU_out.log"
## Execute the MPI command and stream output ## Execute the MPI command and stream output
mpi_process = subprocess.Popen( mpi_process = subprocess.Popen(mpi_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
mpi_command,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
env=run_env,
)
## Write ABE run output to file while printing to stdout ## Write ABE run output to file while printing to stdout
with open(mpi_command_outfile, 'w') as file0: with open(mpi_command_outfile, 'w') as file0: