Compare commits

..

11 Commits

32 changed files with 3890 additions and 7950 deletions

View File

@@ -23,20 +23,22 @@ using namespace std;
#include <mpi.h> #include <mpi.h>
#include "misc.h" #include "misc.h"
#include "macrodef.h" #include "macrodef.h"
#ifdef USE_GPU
extern void bssn_cuda_dump_stage_profile();
#endif
#ifndef ABEtype #ifndef ABEtype
#error "not define ABEtype" #error "not define ABEtype"
#endif #endif
#if (ABEtype == 0) #if (ABEtype == 0)
#include "bssn_class.h"
#ifdef USE_GPU
#elif (ABEtype == 1) #include "bssn_gpu_class.h"
#include "bssnEScalar_class.h" #else
#include "bssn_class.h"
#endif
#elif (ABEtype == 1)
#include "bssnEScalar_class.h"
#elif (ABEtype == 2) #elif (ABEtype == 2)
#include "Z4c_class.h" #include "Z4c_class.h"
@@ -472,13 +474,10 @@ int main(int argc, char *argv[])
cout << endl; cout << endl;
} }
ADM->Evolve(Steps); ADM->Evolve(Steps);
#ifdef USE_GPU
bssn_cuda_dump_stage_profile(); if (myrank == 0)
#endif {
if (myrank == 0)
{
cout << endl; cout << endl;
cout << " Total Evolve Time: " << MPI_Wtime() - End_clock << " seconds!" << endl; cout << " Total Evolve Time: " << MPI_Wtime() - End_clock << " seconds!" << endl;
cout << " Total Running Time: " << MPI_Wtime() - Begin_clock << " seconds!" << endl; cout << " Total Running Time: " << MPI_Wtime() - Begin_clock << " seconds!" << endl;

View File

@@ -9,12 +9,8 @@
#include <new> #include <new>
using namespace std; using namespace std;
#include "Block.h" #include "Block.h"
#include "misc.h" #include "misc.h"
#ifdef USE_GPU
#include "bssn_gpu.h"
#include "bssn_cuda_ops.h"
#endif
Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fngfsi, int levi, const int cgpui) : rank(ranki), ingfs(ingfsi), fngfs(fngfsi), lev(levi), cgpu(cgpui) Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fngfsi, int levi, const int cgpui) : rank(ranki), ingfs(ingfsi), fngfs(fngfsi), lev(levi), cgpu(cgpui)
{ {
@@ -99,19 +95,14 @@ Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fng
} }
#endif #endif
} }
Block::~Block() Block::~Block()
{ {
int myrank; int myrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
if (myrank == rank) if (myrank == rank)
{ {
#ifdef USE_GPU for (int i = 0; i < dim; i++)
bssn_gpu_clear_cached_device_buffers(); delete[] X[i];
bssn_cuda_release_rk4_caches();
bssn_cuda_release_interp_caches();
#endif
for (int i = 0; i < dim; i++)
delete[] X[i];
for (int i = 0; i < ingfs; i++) for (int i = 0; i < ingfs; i++)
free(igfs[i]); free(igfs[i]);
delete[] igfs; delete[] igfs;

View File

@@ -37,51 +37,56 @@ close(77)
end program checkFFT end program checkFFT
#endif #endif
!------------- SUBROUTINE four1(dataa,nn,isign)
! Optimized FFT using Intel oneMKL DFTI implicit none
! Mathematical equivalence: Standard DFT definition INTEGER::isign,nn
! Forward (isign=1): X[k] = sum_{n=0}^{N-1} x[n] * exp(-2*pi*i*k*n/N) double precision,dimension(2*nn)::dataa
! Backward (isign=-1): X[k] = sum_{n=0}^{N-1} x[n] * exp(+2*pi*i*k*n/N) INTEGER::i,istep,j,m,mmax,n
! Input/Output: dataa is interleaved complex array [Re(0),Im(0),Re(1),Im(1),...] double precision::tempi,tempr
!------------- DOUBLE PRECISION::theta,wi,wpi,wpr,wr,wtemp
SUBROUTINE four1(dataa,nn,isign) n=2*nn
use MKL_DFTI j=1
implicit none do i=1,n,2
INTEGER, intent(in) :: isign, nn if(j.gt.i)then
DOUBLE PRECISION, dimension(2*nn), intent(inout) :: dataa tempr=dataa(j)
tempi=dataa(j+1)
type(DFTI_DESCRIPTOR), pointer :: desc dataa(j)=dataa(i)
integer :: status dataa(j+1)=dataa(i+1)
dataa(i)=tempr
! Create DFTI descriptor for 1D complex-to-complex transform dataa(i+1)=tempi
status = DftiCreateDescriptor(desc, DFTI_DOUBLE, DFTI_COMPLEX, 1, nn) endif
if (status /= 0) return m=nn
1 if ((m.ge.2).and.(j.gt.m)) then
! Set input/output storage as interleaved complex (default) j=j-m
status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_INPLACE) m=m/2
if (status /= 0) then goto 1
status = DftiFreeDescriptor(desc) endif
return j=j+m
endif enddo
mmax=2
! Commit the descriptor 2 if (n.gt.mmax) then
status = DftiCommitDescriptor(desc) istep=2*mmax
if (status /= 0) then theta=6.28318530717959d0/(isign*mmax)
status = DftiFreeDescriptor(desc) wpr=-2.d0*sin(0.5d0*theta)**2
return wpi=sin(theta)
endif wr=1.d0
wi=0.d0
! Execute FFT based on direction do m=1,mmax,2
if (isign == 1) then do i=m,n,istep
! Forward FFT: exp(-2*pi*i*k*n/N) j=i+mmax
status = DftiComputeForward(desc, dataa) tempr=sngl(wr)*dataa(j)-sngl(wi)*dataa(j+1)
else tempi=sngl(wr)*dataa(j+1)+sngl(wi)*dataa(j)
! Backward FFT: exp(+2*pi*i*k*n/N) dataa(j)=dataa(i)-tempr
status = DftiComputeBackward(desc, dataa) dataa(j+1)=dataa(i+1)-tempi
endif dataa(i)=dataa(i)+tempr
dataa(i+1)=dataa(i+1)+tempi
! Free descriptor enddo
status = DftiFreeDescriptor(desc) wtemp=wr
wr=wr*wpr-wi*wpi+wr
return wi=wi*wpr+wtemp*wpi+wi
END SUBROUTINE four1 enddo
mmax=istep
goto 2
endif
return
END SUBROUTINE four1

View File

@@ -2,100 +2,29 @@
#include <iostream> #include <iostream>
#include <iomanip> #include <iomanip>
#include <fstream> #include <fstream>
#include <cstdlib> #include <cstdlib>
#include <cstdio> #include <cstdio>
#include <string> #include <string>
#include <cmath> #include <cmath>
#include <new> #include <new>
#include <map> #include <vector>
#include <vector> using namespace std;
using namespace std;
#include "misc.h" #include "misc.h"
#include "MPatch.h" #include "MPatch.h"
#include "Parallel.h" #include "Parallel.h"
#include "fmisc.h" #include "fmisc.h"
#include "bssn_cuda_ops.h" #ifdef INTERP_LB_PROFILE
#ifdef INTERP_LB_PROFILE #include "interp_lb_profile.h"
#include "interp_lb_profile.h" #endif
#endif
namespace
#if defined(__GNUC__) || defined(__clang__) {
extern int bssn_cuda_interp_points_batch(const int *ex, struct InterpBlockView
const double *X, const double *Y, const double *Z, {
const double *const *fields, Block *bp;
const double *soa_flat, double llb[dim];
int num_var, double uub[dim];
const double *px, const double *py, const double *pz,
int num_points,
int ordn,
int symmetry,
double *out) __attribute__((weak));
#endif
namespace
{
struct InterpVarDesc
{
int sgfn;
double soa[dim];
};
struct InterpPlanKey
{
const Patch *patch;
const double *x;
const double *y;
const double *z;
int NN;
int Symmetry;
int myrank;
};
struct InterpPlanKeyLess
{
bool operator()(const InterpPlanKey &lhs, const InterpPlanKey &rhs) const
{
if (lhs.patch != rhs.patch) return lhs.patch < rhs.patch;
if (lhs.x != rhs.x) return lhs.x < rhs.x;
if (lhs.y != rhs.y) return lhs.y < rhs.y;
if (lhs.z != rhs.z) return lhs.z < rhs.z;
if (lhs.NN != rhs.NN) return lhs.NN < rhs.NN;
if (lhs.Symmetry != rhs.Symmetry) return lhs.Symmetry < rhs.Symmetry;
return lhs.myrank < rhs.myrank;
}
};
struct CachedInterpPlan
{
int nblocks;
vector<int> owner_rank;
vector<int> owner_block;
vector<vector<int> > block_points;
vector<vector<double> > block_px;
vector<vector<double> > block_py;
vector<vector<double> > block_pz;
CachedInterpPlan() : nblocks(0) {}
};
struct CachedInterpPlanEntry
{
bool valid;
InterpPlanKey key;
vector<double> xvals;
vector<double> yvals;
vector<double> zvals;
CachedInterpPlan plan;
CachedInterpPlanEntry() : valid(false) {}
};
struct InterpBlockView
{
Block *bp;
double llb[dim];
double uub[dim];
}; };
struct BlockBinIndex struct BlockBinIndex
@@ -225,10 +154,10 @@ void build_block_bin_index(Patch *patch, const double *DH, BlockBinIndex &index)
index.valid = true; index.valid = true;
} }
int find_block_index_for_point(const BlockBinIndex &index, const double *pox, const double *DH) int find_block_index_for_point(const BlockBinIndex &index, const double *pox, const double *DH)
{ {
if (!index.valid) if (!index.valid)
return -1; return -1;
const int bx = coord_to_bin(pox[0], index.lo[0], index.inv[0], index.bins[0]); const int bx = coord_to_bin(pox[0], index.lo[0], index.inv[0], index.bins[0]);
const int by = coord_to_bin(pox[1], index.lo[1], index.inv[1], index.bins[1]); const int by = coord_to_bin(pox[1], index.lo[1], index.inv[1], index.bins[1]);
@@ -246,314 +175,13 @@ int find_block_index_for_point(const BlockBinIndex &index, const double *pox, co
for (size_t bi = 0; bi < index.views.size(); bi++) for (size_t bi = 0; bi < index.views.size(); bi++)
if (point_in_block_view(index.views[bi], pox, DH)) if (point_in_block_view(index.views[bi], pox, DH))
return int(bi); return int(bi);
return -1; return -1;
} }
} // namespace
void collect_interp_vars(MyList<var> *VarList, vector<InterpVarDesc> &vars)
{ Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
vars.clear(); {
MyList<var> *varl = VarList;
while (varl)
{
InterpVarDesc desc;
desc.sgfn = varl->data->sgfn;
for (int d = 0; d < dim; ++d)
desc.soa[d] = varl->data->SoA[d];
vars.push_back(desc);
varl = varl->next;
}
}
bool should_try_cuda_interp(int ordn, int num_points, int num_var)
{
#if defined(__GNUC__) || defined(__clang__)
if (!bssn_cuda_interp_points_batch)
return false;
#else
return false;
#endif
if (ordn != 6)
return false;
if (num_points < 32)
return false;
return num_points * num_var >= 256;
}
CachedInterpPlanEntry &interp_plan_cache_entry()
{
static CachedInterpPlanEntry cache;
return cache;
}
bool same_interp_plan_key(const InterpPlanKey &lhs, const InterpPlanKey &rhs)
{
return lhs.patch == rhs.patch &&
lhs.NN == rhs.NN &&
lhs.Symmetry == rhs.Symmetry &&
lhs.myrank == rhs.myrank;
}
bool same_interp_plan_points(const CachedInterpPlanEntry &cache, int NN, double **XX)
{
if (static_cast<int>(cache.xvals.size()) != NN ||
static_cast<int>(cache.yvals.size()) != NN ||
static_cast<int>(cache.zvals.size()) != NN)
return false;
for (int j = 0; j < NN; ++j)
{
if (cache.xvals[j] != XX[0][j] ||
cache.yvals[j] != XX[1][j] ||
cache.zvals[j] != XX[2][j])
return false;
}
return true;
}
CachedInterpPlan &get_cached_interp_plan(Patch *patch,
int NN, double **XX,
int Symmetry, int myrank,
const double *DH,
const BlockBinIndex &block_index,
bool report_bounds_here,
bool allow_missing_points)
{
InterpPlanKey key;
key.patch = patch;
key.x = XX[0];
key.y = XX[1];
key.z = XX[2];
key.NN = NN;
key.Symmetry = Symmetry;
key.myrank = myrank;
CachedInterpPlanEntry &cache = interp_plan_cache_entry();
if (cache.valid &&
same_interp_plan_key(cache.key, key) &&
same_interp_plan_points(cache, NN, XX) &&
cache.plan.nblocks == static_cast<int>(block_index.views.size()))
return cache.plan;
cache.valid = true;
cache.key = key;
cache.xvals.assign(XX[0], XX[0] + NN);
cache.yvals.assign(XX[1], XX[1] + NN);
cache.zvals.assign(XX[2], XX[2] + NN);
cache.plan = CachedInterpPlan();
CachedInterpPlan &plan = cache.plan;
plan.nblocks = static_cast<int>(block_index.views.size());
plan.owner_rank.assign(NN, -1);
plan.owner_block.assign(NN, -1);
plan.block_points.resize(plan.nblocks);
plan.block_px.resize(plan.nblocks);
plan.block_py.resize(plan.nblocks);
plan.block_pz.resize(plan.nblocks);
for (int j = 0; j < NN; ++j)
{
double pox[dim];
for (int i = 0; i < dim; ++i)
{
pox[i] = XX[i][j];
if (report_bounds_here &&
(XX[i][j] < patch->bbox[i] + patch->lli[i] * DH[i] ||
XX[i][j] > patch->bbox[dim + i] - patch->uui[i] * DH[i]))
{
cout << "Patch::Interp_Points: point (";
for (int k = 0; k < dim; ++k)
{
cout << XX[k][j];
if (k < dim - 1)
cout << ",";
else
cout << ") is out of current Patch." << endl;
}
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
const int block_i = find_block_index_for_point(block_index, pox, DH);
if (block_i >= 0)
{
Block *BP = block_index.views[block_i].bp;
plan.owner_rank[j] = BP->rank;
plan.owner_block[j] = block_i;
if (BP->rank == myrank)
{
plan.block_points[block_i].push_back(j);
plan.block_px[block_i].push_back(XX[0][j]);
plan.block_py[block_i].push_back(XX[1][j]);
plan.block_pz[block_i].push_back(XX[2][j]);
}
}
}
if (!allow_missing_points && report_bounds_here)
{
for (int j = 0; j < NN; ++j)
{
if (plan.owner_rank[j] >= 0)
continue;
cout << "ERROR: Patch::Interp_Points fails to find point (";
for (int d = 0; d < dim; ++d)
{
cout << XX[d][j];
if (d < dim - 1)
cout << ",";
else
cout << ")";
}
cout << " on Patch (";
for (int d = 0; d < dim; ++d)
{
cout << patch->bbox[d] << "+" << patch->lli[d] * DH[d];
if (d < dim - 1)
cout << ",";
else
cout << ")--";
}
cout << "(";
for (int d = 0; d < dim; ++d)
{
cout << patch->bbox[dim + d] << "-" << patch->uui[d] * DH[d];
if (d < dim - 1)
cout << ",";
else
cout << ")" << endl;
}
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
return plan;
}
void release_interp_plan_cache_internal()
{
CachedInterpPlanEntry &cache = interp_plan_cache_entry();
cache.valid = false;
cache.xvals.clear();
cache.yvals.clear();
cache.zvals.clear();
cache.plan = CachedInterpPlan();
}
bool run_cuda_interp_for_block(Block *BP,
const vector<InterpVarDesc> &vars,
const vector<int> &point_ids,
const vector<double> &px,
const vector<double> &py,
const vector<double> &pz,
double *Shellf,
int num_var,
int ordn,
int Symmetry)
{
if (!should_try_cuda_interp(ordn, static_cast<int>(point_ids.size()), num_var))
return false;
vector<const double *> field_ptrs(num_var);
vector<double> soa_flat(3 * num_var);
for (int v = 0; v < num_var; ++v)
{
field_ptrs[v] = BP->fgfs[vars[v].sgfn];
for (int d = 0; d < dim; ++d)
soa_flat[3 * v + d] = vars[v].soa[d];
}
const int npts = static_cast<int>(point_ids.size());
vector<double> out(static_cast<size_t>(npts) * static_cast<size_t>(num_var));
if (bssn_cuda_interp_points_batch(BP->shape,
BP->X[0], BP->X[1], BP->X[2],
field_ptrs.data(),
soa_flat.data(),
num_var,
px.data(), py.data(), pz.data(),
npts,
ordn,
Symmetry,
out.data()) != 0)
{
return false;
}
for (int p = 0; p < npts; ++p)
{
const int j = point_ids[p];
memcpy(Shellf + j * num_var, out.data() + p * num_var, sizeof(double) * num_var);
}
return true;
}
void run_cpu_interp_for_block(Block *BP,
const vector<InterpVarDesc> &vars,
const vector<int> &point_ids,
const vector<double> &px,
const vector<double> &py,
const vector<double> &pz,
double *Shellf,
int num_var,
int ordn,
int Symmetry)
{
for (size_t p = 0; p < point_ids.size(); ++p)
{
const int j = point_ids[p];
double x = px[p];
double y = py[p];
double z = pz[p];
int ordn_local = ordn;
int symmetry_local = Symmetry;
for (int v = 0; v < num_var; ++v)
{
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2],
BP->fgfs[vars[v].sgfn], Shellf[j * num_var + v],
x, y, z, ordn_local, const_cast<double *>(vars[v].soa), symmetry_local);
}
}
}
void interpolate_owned_points(MyList<var> *VarList,
double *Shellf, int Symmetry,
int ordn,
const BlockBinIndex &block_index,
const CachedInterpPlan &plan)
{
vector<InterpVarDesc> vars;
collect_interp_vars(VarList, vars);
const int num_var = static_cast<int>(vars.size());
for (size_t bi = 0; bi < plan.block_points.size(); ++bi)
{
if (plan.block_points[bi].empty())
continue;
Block *BP = block_index.views[bi].bp;
bool done = run_cuda_interp_for_block(BP, vars,
plan.block_points[bi],
plan.block_px[bi],
plan.block_py[bi],
plan.block_pz[bi],
Shellf, num_var, ordn, Symmetry);
if (!done)
run_cpu_interp_for_block(BP, vars,
plan.block_points[bi],
plan.block_px[bi],
plan.block_py[bi],
plan.block_pz[bi],
Shellf, num_var, ordn, Symmetry);
}
}
} // namespace
void patch_release_interp_plan_cache()
{
release_interp_plan_cache_internal();
}
Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
{
int hbuffer_width = buffer_width; int hbuffer_width = buffer_width;
if (lev == 0) if (lev == 0)
@@ -895,15 +523,60 @@ void Patch::Interp_Points(MyList<var> *VarList,
memset(Shellf, 0, sizeof(double) * NN * num_var); memset(Shellf, 0, sizeof(double) * NN * num_var);
double DH[dim]; // owner_rank[j] records which MPI rank owns point j
for (int i = 0; i < dim; i++) // All ranks traverse the same block list so they all agree on ownership
DH[i] = getdX(i); int *owner_rank;
BlockBinIndex block_index; owner_rank = new int[NN];
build_block_bin_index(this, DH, block_index); for (int j = 0; j < NN; j++)
CachedInterpPlan &plan = get_cached_interp_plan(this, NN, XX, Symmetry, myrank, DH, block_index, myrank == 0, false); owner_rank[j] = -1;
const int *owner_rank = plan.owner_rank.data();
double DH[dim];
interpolate_owned_points(VarList, Shellf, Symmetry, ordn, block_index, plan); for (int i = 0; i < dim; i++)
DH[i] = getdX(i);
BlockBinIndex block_index;
build_block_bin_index(this, DH, block_index);
for (int j = 0; j < NN; j++) // run along points
{
double pox[dim];
for (int i = 0; i < dim; i++)
{
pox[i] = XX[i][j];
if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
{
cout << "Patch::Interp_Points: point (";
for (int k = 0; k < dim; k++)
{
cout << XX[k][j];
if (k < dim - 1)
cout << ",";
else
cout << ") is out of current Patch." << endl;
}
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
const int block_i = find_block_index_for_point(block_index, pox, DH);
if (block_i >= 0)
{
Block *BP = block_index.views[block_i].bp;
owner_rank[j] = BP->rank;
if (myrank == BP->rank)
{
//---> interpolation
varl = VarList;
int k = 0;
while (varl) // run along variables
{
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next;
k++;
}
}
}
}
// Replace MPI_Allreduce with per-owner MPI_Bcast: // Replace MPI_Allreduce with per-owner MPI_Bcast:
// Group consecutive points by owner rank and broadcast each group. // Group consecutive points by owner rank and broadcast each group.
@@ -958,8 +631,9 @@ void Patch::Interp_Points(MyList<var> *VarList,
MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner, MPI_COMM_WORLD); MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner, MPI_COMM_WORLD);
} }
} }
} delete[] owner_rank;
}
void Patch::Interp_Points(MyList<var> *VarList, void Patch::Interp_Points(MyList<var> *VarList,
int NN, double **XX, int NN, double **XX,
double *Shellf, int Symmetry, double *Shellf, int Symmetry,
@@ -987,22 +661,102 @@ void Patch::Interp_Points(MyList<var> *VarList,
memset(Shellf, 0, sizeof(double) * NN * num_var); memset(Shellf, 0, sizeof(double) * NN * num_var);
double DH[dim]; // owner_rank[j] records which MPI rank owns point j
for (int i = 0; i < dim; i++) int *owner_rank;
DH[i] = getdX(i); owner_rank = new int[NN];
BlockBinIndex block_index; for (int j = 0; j < NN; j++)
build_block_bin_index(this, DH, block_index); owner_rank[j] = -1;
CachedInterpPlan &plan = get_cached_interp_plan(this, NN, XX, Symmetry, myrank, DH, block_index, myrank == 0, false);
const int *owner_rank = plan.owner_rank.data(); double DH[dim];
for (int i = 0; i < dim; i++)
interpolate_owned_points(VarList, Shellf, Symmetry, ordn, block_index, plan); DH[i] = getdX(i);
BlockBinIndex block_index;
build_block_bin_index(this, DH, block_index);
// --- Interpolation phase (identical to original) ---
for (int j = 0; j < NN; j++)
{
double pox[dim];
for (int i = 0; i < dim; i++)
{
pox[i] = XX[i][j];
if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
{
cout << "Patch::Interp_Points: point (";
for (int k = 0; k < dim; k++)
{
cout << XX[k][j];
if (k < dim - 1)
cout << ",";
else
cout << ") is out of current Patch." << endl;
}
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
const int block_i = find_block_index_for_point(block_index, pox, DH);
if (block_i >= 0)
{
Block *BP = block_index.views[block_i].bp;
owner_rank[j] = BP->rank;
if (myrank == BP->rank)
{
varl = VarList;
int k = 0;
while (varl)
{
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next;
k++;
}
}
}
}
#ifdef INTERP_LB_PROFILE #ifdef INTERP_LB_PROFILE
double t_interp_end = MPI_Wtime(); double t_interp_end = MPI_Wtime();
double t_interp_local = t_interp_end - t_interp_start; double t_interp_local = t_interp_end - t_interp_start;
#endif #endif
// --- Targeted point-to-point communication phase --- // --- Error check for unfound points ---
for (int j = 0; j < NN; j++)
{
if (owner_rank[j] < 0 && myrank == 0)
{
cout << "ERROR: Patch::Interp_Points fails to find point (";
for (int d = 0; d < dim; d++)
{
cout << XX[d][j];
if (d < dim - 1)
cout << ",";
else
cout << ")";
}
cout << " on Patch (";
for (int d = 0; d < dim; d++)
{
cout << bbox[d] << "+" << lli[d] * DH[d];
if (d < dim - 1)
cout << ",";
else
cout << ")--";
}
cout << "(";
for (int d = 0; d < dim; d++)
{
cout << bbox[dim + d] << "-" << uui[d] * DH[d];
if (d < dim - 1)
cout << ",";
else
cout << ")" << endl;
}
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
// --- Targeted point-to-point communication phase ---
// Compute consumer_rank[j] using the same deterministic formula as surface_integral // Compute consumer_rank[j] using the same deterministic formula as surface_integral
int *consumer_rank = new int[NN]; int *consumer_rank = new int[NN];
{ {
@@ -1119,8 +873,9 @@ void Patch::Interp_Points(MyList<var> *VarList,
delete[] send_offset; delete[] send_offset;
delete[] recv_offset; delete[] recv_offset;
delete[] send_count; delete[] send_count;
delete[] recv_count; delete[] recv_count;
delete[] consumer_rank; delete[] consumer_rank;
delete[] owner_rank;
#ifdef INTERP_LB_PROFILE #ifdef INTERP_LB_PROFILE
{ {
@@ -1168,20 +923,64 @@ void Patch::Interp_Points(MyList<var> *VarList,
memset(Shellf, 0, sizeof(double) * NN * num_var); memset(Shellf, 0, sizeof(double) * NN * num_var);
// Build global-to-local rank translation for Comm_here // owner_rank[j] stores the global rank that owns point j
MPI_Group world_group, local_group; int *owner_rank;
MPI_Comm_group(MPI_COMM_WORLD, &world_group); owner_rank = new int[NN];
MPI_Comm_group(Comm_here, &local_group); for (int j = 0; j < NN; j++)
owner_rank[j] = -1;
double DH[dim]; // Build global-to-local rank translation for Comm_here
for (int i = 0; i < dim; i++) MPI_Group world_group, local_group;
DH[i] = getdX(i); MPI_Comm_group(MPI_COMM_WORLD, &world_group);
BlockBinIndex block_index; MPI_Comm_group(Comm_here, &local_group);
build_block_bin_index(this, DH, block_index);
CachedInterpPlan &plan = get_cached_interp_plan(this, NN, XX, Symmetry, myrank, DH, block_index, lmyrank == 0, true); double DH[dim];
const int *owner_rank = plan.owner_rank.data(); for (int i = 0; i < dim; i++)
DH[i] = getdX(i);
interpolate_owned_points(VarList, Shellf, Symmetry, ordn, block_index, plan); BlockBinIndex block_index;
build_block_bin_index(this, DH, block_index);
for (int j = 0; j < NN; j++) // run along points
{
double pox[dim];
for (int i = 0; i < dim; i++)
{
pox[i] = XX[i][j];
if (lmyrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
{
cout << "Patch::Interp_Points: point (";
for (int k = 0; k < dim; k++)
{
cout << XX[k][j];
if (k < dim - 1)
cout << ",";
else
cout << ") is out of current Patch." << endl;
}
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
const int block_i = find_block_index_for_point(block_index, pox, DH);
if (block_i >= 0)
{
Block *BP = block_index.views[block_i].bp;
owner_rank[j] = BP->rank;
if (myrank == BP->rank)
{
//---> interpolation
varl = VarList;
int k = 0;
while (varl) // run along variables
{
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next;
k++;
}
}
}
}
// Collect unique global owner ranks and translate to local ranks in Comm_here // Collect unique global owner ranks and translate to local ranks in Comm_here
// Then broadcast each owner's points via MPI_Bcast on Comm_here // Then broadcast each owner's points via MPI_Bcast on Comm_here
@@ -1209,9 +1008,10 @@ void Patch::Interp_Points(MyList<var> *VarList,
} }
} }
MPI_Group_free(&world_group); MPI_Group_free(&world_group);
MPI_Group_free(&local_group); MPI_Group_free(&local_group);
} delete[] owner_rank;
}
void Patch::checkBlock() void Patch::checkBlock()
{ {
int myrank; int myrank;

View File

@@ -8,7 +8,7 @@
#include "var.h" #include "var.h"
#include "macrodef.h" //need dim here; Vertex or Cell; ghost_width #include "macrodef.h" //need dim here; Vertex or Cell; ghost_width
class Patch class Patch
{ {
public: public:
@@ -50,8 +50,6 @@ public:
double *Shellf, int Symmetry, MPI_Comm Comm_here); double *Shellf, int Symmetry, MPI_Comm Comm_here);
void Find_Maximum(MyList<var> *VarList, double *XX, void Find_Maximum(MyList<var> *VarList, double *XX,
double *Shellf, MPI_Comm Comm_here); double *Shellf, MPI_Comm Comm_here);
}; };
void patch_release_interp_plan_cache(); #endif /* PATCH_H */
#endif /* PATCH_H */

File diff suppressed because it is too large Load Diff

View File

@@ -89,12 +89,9 @@ namespace Parallel
void transfermix(MyList<gridseg> **src, MyList<gridseg> **dst, void transfermix(MyList<gridseg> **src, MyList<gridseg> **dst,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */, MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
int Symmetry); int Symmetry);
void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry); void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry);
void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry, const char *context); void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry); void Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, const char *context);
void Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
void Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, const char *context);
struct SyncCache { struct SyncCache {
bool valid; bool valid;
@@ -108,13 +105,12 @@ namespace Parallel
int *send_buf_caps; int *send_buf_caps;
int *recv_buf_caps; int *recv_buf_caps;
MPI_Request *reqs; MPI_Request *reqs;
MPI_Status *stats; MPI_Status *stats;
int max_reqs; int max_reqs;
bool lengths_valid; bool lengths_valid;
int lengths_var_count; int *tc_req_node;
int *tc_req_node; int *tc_req_is_recv;
int *tc_req_is_recv; int *tc_completed;
int *tc_completed;
SyncCache(); SyncCache();
void invalidate(); void invalidate();
void destroy(); void destroy();
@@ -125,20 +121,19 @@ namespace Parallel
MyList<var> *VarList1, MyList<var> *VarList2, MyList<var> *VarList1, MyList<var> *VarList2,
int Symmetry, SyncCache &cache); int Symmetry, SyncCache &cache);
struct AsyncSyncState { struct AsyncSyncState {
int req_no; int req_no;
bool active; bool active;
int mpi_tag; int *req_node;
int *req_node; int *req_is_recv;
int *req_is_recv; int pending_recv;
int pending_recv; AsyncSyncState() : req_no(0), active(false), req_node(0), req_is_recv(0), pending_recv(0) {}
AsyncSyncState() : req_no(0), active(false), mpi_tag(0), req_node(0), req_is_recv(0), pending_recv(0) {} };
};
void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
SyncCache &cache, AsyncSyncState &state); SyncCache &cache, AsyncSyncState &state);
void Sync_finish(SyncCache &cache, AsyncSyncState &state, void Sync_finish(SyncCache &cache, AsyncSyncState &state,
MyList<var> *VarList, int Symmetry, bool unpack_to_host = true); MyList<var> *VarList, int Symmetry);
void OutBdLow2Hi(Patch *Patc, Patch *Patf, void OutBdLow2Hi(Patch *Patc, Patch *Patf,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */, MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
int Symmetry); int Symmetry);
@@ -184,12 +179,13 @@ namespace Parallel
MyList<Parallel::gridseg> *clone_gsl(MyList<Parallel::gridseg> *p, bool first_only); MyList<Parallel::gridseg> *clone_gsl(MyList<Parallel::gridseg> *p, bool first_only);
MyList<Parallel::gridseg> *build_bulk_gsl(Patch *Pat); // similar to build_owned_gsl0 but does not care rank issue MyList<Parallel::gridseg> *build_bulk_gsl(Patch *Pat); // similar to build_owned_gsl0 but does not care rank issue
MyList<Parallel::gridseg> *build_bulk_gsl(Block *bp, Patch *Pat); MyList<Parallel::gridseg> *build_bulk_gsl(Block *bp, Patch *Pat);
void build_PhysBD_gstl(Patch *Pat, MyList<Parallel::gridseg> *srci, MyList<Parallel::gridseg> *dsti, void build_PhysBD_gstl(Patch *Pat, MyList<Parallel::gridseg> *srci, MyList<Parallel::gridseg> *dsti,
MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst); MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst);
void PeriodicBD(Patch *Pat, MyList<var> *VarList, int Symmetry); void PeriodicBD(Patch *Pat, MyList<var> *VarList, int Symmetry);
double L2Norm(Patch *Pat, var *vf); double L2Norm(Patch *Pat, var *vf);
void checkgsl(MyList<Parallel::gridseg> *pp, bool first_only); void L2Norm7(Patch *Pat, var **vf, double *norms);
void checkvarl(MyList<var> *pp, bool first_only); void checkgsl(MyList<Parallel::gridseg> *pp, bool first_only);
void checkvarl(MyList<var> *pp, bool first_only);
MyList<Parallel::gridseg> *divide_gsl(MyList<Parallel::gridseg> *p, Patch *Pat); MyList<Parallel::gridseg> *divide_gsl(MyList<Parallel::gridseg> *p, Patch *Pat);
MyList<Parallel::gridseg> *divide_gs(MyList<Parallel::gridseg> *p, Patch *Pat); MyList<Parallel::gridseg> *divide_gs(MyList<Parallel::gridseg> *p, Patch *Pat);
void prepare_inter_time_level(Patch *Pat, void prepare_inter_time_level(Patch *Pat,
@@ -221,11 +217,12 @@ namespace Parallel
void aligncheck(double *bbox0, double *bboxl, int lev, double *DH0, int *shape); void aligncheck(double *bbox0, double *bboxl, int lev, double *DH0, int *shape);
bool point_locat_gsl(double *pox, MyList<Parallel::gridseg> *gsl); bool point_locat_gsl(double *pox, MyList<Parallel::gridseg> *gsl);
void checkpatchlist(MyList<Patch> *PatL, bool buflog); void checkpatchlist(MyList<Patch> *PatL, bool buflog);
double L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here); double L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here);
bool PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList, void L2Norm7(Patch *Pat, var **vf, double *norms, MPI_Comm Comm_here);
int NN, double **XX, bool PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
double *Shellf, int Symmetry, MPI_Comm Comm_here); int NN, double **XX,
double *Shellf, int Symmetry, MPI_Comm Comm_here);
#if (PSTR == 1 || PSTR == 2 || PSTR == 3) #if (PSTR == 1 || PSTR == 2 || PSTR == 3)
MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi, MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
bool periodic, int start_rank, int end_rank, int nodes = 0); bool periodic, int start_rank, int end_rank, int nodes = 0);

View File

@@ -3439,10 +3439,10 @@ void ShellPatch::write_Pablo_file_ss(int *ext, double xmin, double xmax, double
delete[] Z; delete[] Z;
} }
double ShellPatch::L2Norm(var *vf) double ShellPatch::L2Norm(var *vf)
{ {
double tvf, dtvf = 0; double tvf, dtvf = 0;
int BDW = overghost; int BDW = overghost;
MyList<ss_patch> *sPp = PatL; MyList<ss_patch> *sPp = PatL;
while (sPp) while (sPp)
@@ -3469,13 +3469,50 @@ double ShellPatch::L2Norm(var *vf)
MPI_Allreduce(&dtvf, &tvf, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); MPI_Allreduce(&dtvf, &tvf, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
tvf = sqrt(tvf); tvf = sqrt(tvf);
return tvf; return tvf;
} }
void ShellPatch::L2Norm7(var **vf, double *norms)
// find maximum of abstract value, XX store position for maximum, Shellf store maximum themselvs {
void ShellPatch::Find_Maximum(MyList<var> *VarList, double *XX, double tvf[7], dtvf[7];
double *Shellf) int BDW = overghost;
for (int i = 0; i < 7; i++)
dtvf[i] = 0;
MyList<ss_patch> *sPp = PatL;
while (sPp)
{
MyList<Block> *Bp = sPp->data->blb;
while (Bp)
{
Block *cg = Bp->data;
if (myrank == cg->rank)
{
f_l2normhelper7(cg->shape, cg->X[0], cg->X[1], cg->X[2],
sPp->data->bbox[0], sPp->data->bbox[1], sPp->data->bbox[2],
sPp->data->bbox[3], sPp->data->bbox[4], sPp->data->bbox[5],
cg->fgfs[vf[0]->sgfn], cg->fgfs[vf[1]->sgfn], cg->fgfs[vf[2]->sgfn],
cg->fgfs[vf[3]->sgfn], cg->fgfs[vf[4]->sgfn], cg->fgfs[vf[5]->sgfn],
cg->fgfs[vf[6]->sgfn], tvf, BDW);
for (int i = 0; i < 7; i++)
dtvf[i] += tvf[i];
}
if (Bp == sPp->data->ble)
break;
Bp = Bp->next;
}
sPp = sPp->next;
}
MPI_Allreduce(dtvf, tvf, 7, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
for (int i = 0; i < 7; i++)
norms[i] = sqrt(tvf[i]);
}
// find maximum of abstract value, XX store position for maximum, Shellf store maximum themselvs
void ShellPatch::Find_Maximum(MyList<var> *VarList, double *XX,
double *Shellf)
{ {
MyList<var> *varl; MyList<var> *varl;
int num_var = 0; int num_var = 0;

View File

@@ -195,10 +195,11 @@ public:
bool Interp_One_Point(MyList<var> *VarList, bool Interp_One_Point(MyList<var> *VarList,
double *XX, /*input global Cartesian coordinate*/ double *XX, /*input global Cartesian coordinate*/
double *Shellf, int Symmetry); double *Shellf, int Symmetry);
void write_Pablo_file_ss(int *ext, double xmin, double xmax, double ymin, double ymax, double zmin, double zmax, void write_Pablo_file_ss(int *ext, double xmin, double xmax, double ymin, double ymax, double zmin, double zmax,
char *filename, int sst); char *filename, int sst);
double L2Norm(var *vf); double L2Norm(var *vf);
void Find_Maximum(MyList<var> *VarList, double *XX, double *Shellf); void L2Norm7(var **vf, double *norms);
}; void Find_Maximum(MyList<var> *VarList, double *XX, double *Shellf);
};
#endif /* SHELLPATCH_H */ #endif /* SHELLPATCH_H */

View File

@@ -25,9 +25,23 @@ using namespace std;
#include <math.h> #include <math.h>
#include <complex.h> #include <complex.h>
#endif #endif
#include "TwoPunctures.h" #include "TwoPunctures.h"
#include <mkl_cblas.h>
extern "C" {
double cblas_ddot(const int, const double *, const int, const double *, const int);
double cblas_dnrm2(const int, const double *, const int);
void cblas_dgemm(const int, const int, const int,
const int, const int, const int,
const double, const double *, const int,
const double *, const int, const double,
double *, const int);
}
enum {
CblasRowMajor = 101,
CblasNoTrans = 111
};
TwoPunctures::TwoPunctures(double mp, double mm, double b, TwoPunctures::TwoPunctures(double mp, double mm, double b,
double P_plusx, double P_plusy, double P_plusz, double P_plusx, double P_plusy, double P_plusz,

File diff suppressed because it is too large Load Diff

View File

@@ -45,10 +45,11 @@ public:
int checkrun; int checkrun;
char checkfilename[50]; char checkfilename[50];
int Steps; int Steps;
double StartTime, TotalTime; double StartTime, TotalTime;
double AnasTime, DumpTime, d2DumpTime, CheckTime; double AnasTime, DumpTime, d2DumpTime, CheckTime;
double LastAnas, LastConsOut; double LastAnas, LastConsOut;
double Courant; int *ConstraintRefreshLevels;
double Courant;
double numepss, numepsb, numepsh; double numepss, numepsb, numepsh;
int Symmetry; int Symmetry;
int maxl, decn; int maxl, decn;
@@ -128,15 +129,14 @@ public:
Parallel::SyncCache *sync_cache_pre; // per-level cache for predictor sync Parallel::SyncCache *sync_cache_pre; // per-level cache for predictor sync
Parallel::SyncCache *sync_cache_cor; // per-level cache for corrector sync Parallel::SyncCache *sync_cache_cor; // per-level cache for corrector sync
Parallel::SyncCache *sync_cache_rp_coarse; // RestrictProlong sync on PatL[lev-1] Parallel::SyncCache *sync_cache_rp_coarse; // RestrictProlong sync on PatL[lev-1]
Parallel::SyncCache *sync_cache_rp_fine; // RestrictProlong sync on PatL[lev] Parallel::SyncCache *sync_cache_rp_fine; // RestrictProlong sync on PatL[lev]
Parallel::SyncCache *sync_cache_restrict; // cached Restrict in RestrictProlong Parallel::SyncCache *sync_cache_restrict; // cached Restrict in RestrictProlong
Parallel::SyncCache *sync_cache_outbd; // cached OutBdLow2Hi in RestrictProlong Parallel::SyncCache *sync_cache_outbd; // cached OutBdLow2Hi in RestrictProlong
Parallel::SyncCache *sync_cache_psi4; // cached Psi4 sync on PatL[lev]
monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor; monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
monitor *ConVMonitor; monitor *ConVMonitor, *TimingMonitor;
surface_integral *Waveshell; surface_integral *Waveshell;
checkpoint *CheckPoint; checkpoint *CheckPoint;
public: public:
@@ -172,20 +172,16 @@ public:
bool check_Stdin_Abort(); bool check_Stdin_Abort();
virtual void Setup_Initial_Data_Cao(); virtual void Setup_Initial_Data_Cao();
virtual void Setup_Initial_Data_Lousto(); virtual void Setup_Initial_Data_Lousto();
virtual void Initialize(); virtual void Initialize();
virtual void Read_Ansorg(); virtual void Read_Ansorg();
virtual void Read_Pablo() {}; virtual void Read_Pablo() {};
void InvalidateSyncCaches(); virtual void Compute_Psi4(int lev);
virtual void Compute_Psi4(int lev); virtual void Step(int lev, int YN);
virtual void Step(int lev, int YN); virtual void Interp_Constraint(bool infg);
#ifdef USE_GPU virtual void Constraint_Out();
void Step_MainPath_GPU(int lev, int YN); virtual void Compute_Constraint();
#endif
virtual void Interp_Constraint(bool infg);
virtual void Constraint_Out();
virtual void Compute_Constraint();
#ifdef With_AHF #ifdef With_AHF
protected: protected:

File diff suppressed because it is too large Load Diff

View File

@@ -1,68 +0,0 @@
#ifndef BSSN_CUDA_OPS_H
#define BSSN_CUDA_OPS_H
int bssn_cuda_enforce_ga(int *ex,
double *dxx, double *gxy, double *gxz,
double *dyy, double *gyz, double *dzz,
double *Axx, double *Axy, double *Axz,
double *Ayy, double *Ayz, double *Azz);
int bssn_cuda_rk4_boundary_var(int *ex, double dT,
const double *X, const double *Y, const double *Z,
double xmin, double ymin, double zmin,
double xmax, double ymax, double zmax,
const double *state0,
const double *phi_field,
const double *lap_field,
const double *boundary_src,
double *stage_data,
double *rhs_accum,
double propspeed,
const double SoA[3],
int symmetry,
int lev,
int rk_stage,
bool force_host_boundary_fix,
bool download_to_host = true);
int bssn_cuda_rk4_boundary_batch(int *ex, double dT,
const double *X, const double *Y, const double *Z,
double xmin, double ymin, double zmin,
double xmax, double ymax, double zmax,
int symmetry,
const double *const *state0_list,
double *const *stage_data_list,
double *const *rhs_accum_list,
int num_var,
int rk_stage,
bool download_to_host = false);
int bssn_cuda_lowerbound(int *ex, double *chi, double tinny, bool download_to_host = true);
int bssn_cuda_download_buffer(int *ex, double *host_ptr);
void bssn_cuda_release_rk4_caches();
void bssn_cuda_release_interp_caches();
int bssn_cuda_prolong3_pack(int wei,
const double *llbc, const double *uubc, const int *extc, const double *func,
const double *llbf, const double *uubf, const int *extf, double *funf,
const double *llbp, const double *uubp,
const double *SoA, int symmetry);
int bssn_cuda_restrict3_pack(int wei,
const double *llbc, const double *uubc, const int *extc, double *func,
const double *llbf, const double *uubf, const int *extf, const double *funf,
const double *llbr, const double *uubr,
const double *SoA, int symmetry);
int bssn_cuda_interp_points_batch(const int *ex,
const double *X, const double *Y, const double *Z,
const double *const *fields,
const double *soa_flat,
int num_var,
const double *px, const double *py, const double *pz,
int num_points,
int ordn,
int symmetry,
double *out);
#endif

View File

@@ -1,936 +0,0 @@
#include "macrodef.h"
#ifdef USE_GPU
#include <algorithm>
#include <cmath>
#include <cstring>
#include <cstdlib>
#include <iomanip>
#include <vector>
#include "bssn_class.h"
#include "bssn_cuda_ops.h"
#include "bssn_gpu.h"
#include "bssn_macro.h"
namespace
{
enum StageProfileMetric
{
STAGE_PROFILE_TOTAL = 0,
STAGE_PROFILE_RHS,
STAGE_PROFILE_RUN_STAGE,
STAGE_PROFILE_RUN_STAGE_DEVICE,
STAGE_PROFILE_RUN_STAGE_HOST_FIX,
STAGE_PROFILE_LOWERBOUND,
STAGE_PROFILE_ENSURE,
STAGE_PROFILE_DOWNLOAD,
STAGE_PROFILE_CLEAR_CACHE,
STAGE_PROFILE_SYNC_START,
STAGE_PROFILE_SYNC_FINISH,
STAGE_PROFILE_REFRESH,
STAGE_PROFILE_COUNT
};
static const int kStageProfileMaxLevels = 32;
struct StageProfileStore
{
bool env_checked;
bool enabled;
int calls[kStageProfileMaxLevels];
double metric[kStageProfileMaxLevels][STAGE_PROFILE_COUNT];
};
StageProfileStore &stage_profile_store()
{
static StageProfileStore store = {};
return store;
}
bool stage_profile_enabled()
{
StageProfileStore &store = stage_profile_store();
if (!store.env_checked)
{
const char *env = getenv("AMSS_GPU_STAGE_TIMING");
store.enabled = (env && env[0] && strcmp(env, "0") != 0);
store.env_checked = true;
}
return store.enabled;
}
void stage_profile_note_call(int lev)
{
if (lev >= 0 && lev < kStageProfileMaxLevels)
stage_profile_store().calls[lev]++;
}
void stage_profile_add(int lev, StageProfileMetric metric, double seconds)
{
if (lev >= 0 && lev < kStageProfileMaxLevels)
stage_profile_store().metric[lev][metric] += seconds;
}
const char *stage_profile_metric_name(StageProfileMetric metric)
{
switch (metric)
{
case STAGE_PROFILE_TOTAL:
return "total";
case STAGE_PROFILE_RHS:
return "rhs";
case STAGE_PROFILE_RUN_STAGE:
return "run_stage";
case STAGE_PROFILE_RUN_STAGE_DEVICE:
return "run_stage_dev";
case STAGE_PROFILE_RUN_STAGE_HOST_FIX:
return "run_stage_host";
case STAGE_PROFILE_LOWERBOUND:
return "lower";
case STAGE_PROFILE_ENSURE:
return "ensure";
case STAGE_PROFILE_DOWNLOAD:
return "download";
case STAGE_PROFILE_CLEAR_CACHE:
return "clear_cache";
case STAGE_PROFILE_SYNC_START:
return "sync_start";
case STAGE_PROFILE_SYNC_FINISH:
return "sync_finish";
case STAGE_PROFILE_REFRESH:
return "refresh";
default:
return "unknown";
}
}
} // namespace
void bssn_cuda_dump_stage_profile()
{
if (!stage_profile_enabled())
return;
int myrank = 0;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
StageProfileStore &store = stage_profile_store();
int global_calls_sum[kStageProfileMaxLevels] = {};
double global_metric_sum[kStageProfileMaxLevels][STAGE_PROFILE_COUNT] = {};
double global_metric_max[kStageProfileMaxLevels][STAGE_PROFILE_COUNT] = {};
MPI_Reduce(store.calls, global_calls_sum, kStageProfileMaxLevels, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
MPI_Reduce(store.metric[0], global_metric_sum[0],
kStageProfileMaxLevels * STAGE_PROFILE_COUNT,
MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
MPI_Reduce(store.metric[0], global_metric_max[0],
kStageProfileMaxLevels * STAGE_PROFILE_COUNT,
MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
if (myrank != 0)
return;
cout << endl;
cout << " GPU stage timing summary (sum/max over MPI ranks) " << endl;
cout << " lev calls";
for (int metric = 0; metric < STAGE_PROFILE_COUNT; ++metric)
cout << " " << setw(22) << stage_profile_metric_name(static_cast<StageProfileMetric>(metric));
cout << endl;
for (int lev = 0; lev < kStageProfileMaxLevels; ++lev)
{
if (global_calls_sum[lev] == 0)
continue;
cout << setw(4) << lev << " " << setw(5) << global_calls_sum[lev];
for (int metric = 0; metric < STAGE_PROFILE_COUNT; ++metric)
{
cout << " "
<< setw(10) << setprecision(6) << fixed << global_metric_sum[lev][metric]
<< "/"
<< setw(10) << setprecision(6) << fixed << global_metric_max[lev][metric];
}
cout << endl;
}
cout << endl;
}
void bssn_class::Step_MainPath_GPU(int lev, int YN)
{
#ifdef WithShell
#error "Step_MainPath_GPU currently supports Patch grids only."
#endif
const bool profile_enabled = stage_profile_enabled();
const double step_total_begin = profile_enabled ? MPI_Wtime() : 0.0;
if (profile_enabled)
stage_profile_note_call(lev);
if (bssn_gpu_bind_process_device(myrank))
{
cerr << "GPU device bind failure on MPI rank " << myrank << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
if (profile_enabled)
{
const double t0 = MPI_Wtime();
bssn_gpu_clear_cached_device_buffers();
stage_profile_add(lev, STAGE_PROFILE_CLEAR_CACHE, MPI_Wtime() - t0);
}
else
bssn_gpu_clear_cached_device_buffers();
setpbh(BH_num, Porg0, Mass, BH_num_input);
const double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
#if (MAPBH == 1)
if (BH_num > 0 && lev == GH->levels - 1)
{
compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
for (int ithBH = 0; ithBH < BH_num; ithBH++)
{
for (int ith = 0; ith < 3; ith++)
Porg1[ithBH][ith] = Porg0[ithBH][ith] + Porg_rhs[ithBH][ith] * dT_lev;
if (Symmetry > 0)
Porg1[ithBH][2] = fabs(Porg1[ithBH][2]);
if (Symmetry == 2)
{
Porg1[ithBH][0] = fabs(Porg1[ithBH][0]);
Porg1[ithBH][1] = fabs(Porg1[ithBH][1]);
}
}
}
if (lev == a_lev)
AnalysisStuff(lev, dT_lev);
#endif
#ifdef With_AHF
AH_Step_Find(lev, dT_lev);
#endif
const bool BB = fgt(PhysTime, StartTime, dT_lev / 2);
(void)BB;
double ndeps = (lev < GH->movls) ? numepsb : numepss;
double TRK4 = PhysTime;
int iter_count = 0;
int pre = 0, cor = 1;
int ERROR = 0;
const bool keep_stage_sync_on_device = (RPS == 1) && (MAPBH == 1) && (REGLEV == 0);
auto run_stage_on_block =
[&](Block *cg, Patch *patch, MyList<var> *state0_list,
MyList<var> *boundary_src_list, MyList<var> *stage_data_list,
MyList<var> *rhs_list, int rk_stage) {
MyList<var> *varl0 = state0_list;
MyList<var> *varlb = boundary_src_list;
MyList<var> *varls = stage_data_list;
MyList<var> *varlr = rhs_list;
std::vector<const double *> batch_state0;
std::vector<double *> batch_stage;
std::vector<double *> batch_rhs;
while (varl0)
{
const bool force_host_boundary_fix = false;
const bool can_batch_device_path = (lev > 0) && !force_host_boundary_fix;
if (can_batch_device_path)
{
batch_state0.push_back(cg->fgfs[varl0->data->sgfn]);
batch_stage.push_back(cg->fgfs[varls->data->sgfn]);
batch_rhs.push_back(cg->fgfs[varlr->data->sgfn]);
varl0 = varl0->next;
varlb = varlb->next;
varls = varls->next;
varlr = varlr->next;
continue;
}
const double var_begin = profile_enabled ? MPI_Wtime() : 0.0;
if (bssn_cuda_rk4_boundary_var(cg->shape, dT_lev,
cg->X[0], cg->X[1], cg->X[2],
patch->bbox[0], patch->bbox[1], patch->bbox[2],
patch->bbox[3], patch->bbox[4], patch->bbox[5],
cg->fgfs[varl0->data->sgfn],
cg->fgfs[phi0->sgfn],
cg->fgfs[Lap0->sgfn],
cg->fgfs[varlb->data->sgfn],
cg->fgfs[varls->data->sgfn],
cg->fgfs[varlr->data->sgfn],
varl0->data->propspeed,
varl0->data->SoA,
Symmetry, lev, rk_stage,
force_host_boundary_fix, false))
{
cerr << "GPU rk4/boundary failure: lev=" << lev
<< " rk_stage=" << rk_stage
<< " var=" << varl0->data->name
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
break;
}
if (profile_enabled)
{
stage_profile_add(lev,
force_host_boundary_fix ? STAGE_PROFILE_RUN_STAGE_HOST_FIX
: STAGE_PROFILE_RUN_STAGE_DEVICE,
MPI_Wtime() - var_begin);
}
varl0 = varl0->next;
varlb = varlb->next;
varls = varls->next;
varlr = varlr->next;
}
if (!ERROR && !batch_state0.empty())
{
const double batch_begin = profile_enabled ? MPI_Wtime() : 0.0;
if (bssn_cuda_rk4_boundary_batch(cg->shape, dT_lev,
cg->X[0], cg->X[1], cg->X[2],
patch->bbox[0], patch->bbox[1], patch->bbox[2],
patch->bbox[3], patch->bbox[4], patch->bbox[5],
Symmetry,
&batch_state0[0],
&batch_stage[0],
&batch_rhs[0],
static_cast<int>(batch_state0.size()),
rk_stage, false))
{
cerr << "GPU rk4/boundary batch failure: lev=" << lev
<< " rk_stage=" << rk_stage
<< " vars=" << batch_state0.size()
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
}
else if (profile_enabled)
{
stage_profile_add(lev, STAGE_PROFILE_RUN_STAGE_DEVICE, MPI_Wtime() - batch_begin);
}
}
};
auto stage_download_var_list =
[&](Block *cg, MyList<var> *var_list, bool skip_unmapped) {
std::vector<double *> batch_host_ptrs;
std::vector<MyList<var> *> batch_vars;
while (var_list)
{
double *host_ptr = cg->fgfs[var_list->data->sgfn];
if (skip_unmapped && !bssn_gpu_find_device_buffer(host_ptr))
{
var_list = var_list->next;
continue;
}
batch_host_ptrs.push_back(host_ptr);
batch_vars.push_back(var_list);
var_list = var_list->next;
}
if (!batch_host_ptrs.empty() &&
bssn_gpu_download_buffer_batch(cg->shape, &batch_host_ptrs[0],
static_cast<int>(batch_host_ptrs.size())))
{
for (size_t i = 0; i < batch_host_ptrs.size(); ++i)
{
if (bssn_cuda_download_buffer(cg->shape, batch_host_ptrs[i]))
{
cerr << "GPU stage download failure: lev=" << lev
<< " var=" << batch_vars[i]->data->name
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
break;
}
}
}
};
auto stage_download_patch_list =
[&](MyList<var> *var_list, bool skip_unmapped) {
MyList<Patch> *patch_it = GH->PatL[lev];
while (patch_it)
{
MyList<Block> *block_it = patch_it->data->blb;
while (block_it)
{
Block *cg = block_it->data;
if (myrank == cg->rank)
stage_download_var_list(cg, var_list, skip_unmapped);
if (block_it == patch_it->data->ble)
break;
block_it = block_it->next;
}
if (ERROR)
break;
patch_it = patch_it->next;
}
};
auto ensure_stage_device_var_list =
[&](Block *cg, MyList<var> *var_list) {
const int n = cg->shape[0] * cg->shape[1] * cg->shape[2];
while (var_list)
{
double *host_ptr = cg->fgfs[var_list->data->sgfn];
if (!bssn_gpu_find_device_buffer(host_ptr) &&
bssn_gpu_stage_upload_buffer(host_ptr, n))
{
cerr << "GPU state ensure failure: lev=" << lev
<< " var=" << var_list->data->name
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
break;
}
var_list = var_list->next;
}
};
auto refresh_synced_device_regions =
[&](Block *cg, MyList<var> *var_list, Parallel::SyncCache &cache) {
std::vector<Parallel::gridseg *> local_segments;
for (int node = 0; node < cache.cpusize; ++node)
{
MyList<Parallel::gridseg> *seg = cache.combined_dst[node];
while (seg)
{
if (seg->data && seg->data->Bg == cg)
local_segments.push_back(seg->data);
seg = seg->next;
}
}
if (local_segments.empty())
return;
const int n = cg->shape[0] * cg->shape[1] * cg->shape[2];
while (var_list)
{
double *host_ptr = cg->fgfs[var_list->data->sgfn];
if (!bssn_gpu_find_device_buffer(host_ptr))
{
if (bssn_gpu_stage_upload_buffer(host_ptr, n))
{
cerr << "GPU sync refresh upload failure: lev=" << lev
<< " var=" << var_list->data->name
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
break;
}
}
else
{
for (size_t i = 0; i < local_segments.size(); ++i)
{
Parallel::gridseg *seg = local_segments[i];
if (bssn_gpu_stage_upload_region(host_ptr,
cg->shape,
cg->bbox,
cg->bbox + dim,
seg->shape,
seg->llb))
{
cerr << "GPU sync region refresh failure: lev=" << lev
<< " var=" << var_list->data->name
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
break;
}
}
if (ERROR)
break;
}
var_list = var_list->next;
}
};
auto refresh_stage_device_after_sync =
[&](MyList<var> *var_list, Parallel::SyncCache &cache) {
MyList<Patch> *patch_it = GH->PatL[lev];
while (patch_it)
{
MyList<Block> *block_it = patch_it->data->blb;
while (block_it)
{
Block *cg = block_it->data;
if (myrank == cg->rank)
refresh_synced_device_regions(cg, var_list, cache);
if (block_it == patch_it->data->ble)
break;
block_it = block_it->next;
}
if (ERROR)
break;
patch_it = patch_it->next;
}
};
auto refresh_stage_host_before_sync =
[&](MyList<var> *var_list, Parallel::SyncCache &cache) -> bool {
if (!cache.valid || !cache.combined_src || myrank < 0 || myrank >= cache.cpusize)
return false;
MyList<Patch> *patch_it = GH->PatL[lev];
while (patch_it)
{
MyList<Block> *block_it = patch_it->data->blb;
while (block_it)
{
Block *cg = block_it->data;
if (myrank == cg->rank)
{
std::vector<Parallel::gridseg *> local_segments;
MyList<Parallel::gridseg> *seg = cache.combined_src[myrank];
while (seg)
{
if (seg->data && seg->data->Bg == cg)
local_segments.push_back(seg->data);
seg = seg->next;
}
if (!local_segments.empty())
{
MyList<var> *var_it = var_list;
while (var_it)
{
double *host_ptr = cg->fgfs[var_it->data->sgfn];
for (size_t i = 0; i < local_segments.size(); ++i)
{
Parallel::gridseg *src_seg = local_segments[i];
if (bssn_gpu_stage_download_region(host_ptr,
cg->shape,
cg->bbox,
cg->bbox + dim,
src_seg->shape,
src_seg->llb))
{
cerr << "GPU sync region download failure: lev=" << lev
<< " var=" << var_it->data->name
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
return true;
}
}
var_it = var_it->next;
}
}
}
if (block_it == patch_it->data->ble)
break;
block_it = block_it->next;
}
patch_it = patch_it->next;
}
return true;
};
auto can_pack_sync_from_device =
[&](MyList<var> *var_list, Parallel::SyncCache &cache) -> bool {
if (!cache.valid || !cache.combined_src || myrank < 0 || myrank >= cache.cpusize)
return false;
MyList<Parallel::gridseg> *seg = cache.combined_src[myrank];
while (seg)
{
MyList<var> *var_it = var_list;
while (var_it)
{
if (!bssn_gpu_find_device_buffer(seg->data->Bg->fgfs[var_it->data->sgfn]))
return false;
var_it = var_it->next;
}
seg = seg->next;
}
return true;
};
MyList<Patch> *Pp = GH->PatL[lev];
while (Pp)
{
MyList<Block> *BP = Pp->data->blb;
while (BP)
{
Block *cg = BP->data;
if (myrank == cg->rank)
{
double t0 = 0.0;
if (profile_enabled)
t0 = MPI_Wtime();
if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_FIRST_TIME))
ERROR = 1;
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_RHS, MPI_Wtime() - t0);
if (profile_enabled)
t0 = MPI_Wtime();
run_stage_on_block(cg, Pp->data, StateList, StateList, SynchList_pre, RHSList, iter_count);
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_RUN_STAGE, MPI_Wtime() - t0);
if (profile_enabled)
t0 = MPI_Wtime();
if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi->sgfn], chitiny, false))
{
cerr << "GPU lowerbound failure: lev=" << lev
<< " rk_stage=" << iter_count
<< " var=" << phi->name
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
}
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_LOWERBOUND, MPI_Wtime() - t0);
}
if (BP == Pp->data->ble)
break;
BP = BP->next;
}
Pp = Pp->next;
}
if (!ERROR)
{
if (!keep_stage_sync_on_device)
{
double t0 = 0.0;
if (profile_enabled)
t0 = MPI_Wtime();
stage_download_patch_list(SynchList_pre, false);
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_DOWNLOAD, MPI_Wtime() - t0);
if (!ERROR)
{
if (profile_enabled)
t0 = MPI_Wtime();
bssn_gpu_clear_cached_device_buffers();
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_CLEAR_CACHE, MPI_Wtime() - t0);
}
}
}
MPI_Request err_req_pre;
{
int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_pre);
}
Parallel::AsyncSyncState async_pre;
if (profile_enabled)
{
const double t0 = MPI_Wtime();
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
stage_profile_add(lev, STAGE_PROFILE_SYNC_START, MPI_Wtime() - t0);
}
else
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
if (profile_enabled)
{
const double t0 = MPI_Wtime();
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry,
!keep_stage_sync_on_device);
stage_profile_add(lev, STAGE_PROFILE_SYNC_FINISH, MPI_Wtime() - t0);
}
else
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry,
!keep_stage_sync_on_device);
if (!ERROR && !keep_stage_sync_on_device)
{
if (profile_enabled)
{
const double t0 = MPI_Wtime();
refresh_stage_device_after_sync(SynchList_pre, sync_cache_pre[lev]);
stage_profile_add(lev, STAGE_PROFILE_REFRESH, MPI_Wtime() - t0);
}
else
refresh_stage_device_after_sync(SynchList_pre, sync_cache_pre[lev]);
}
MPI_Wait(&err_req_pre, MPI_STATUS_IGNORE);
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#if (MAPBH == 0)
if (BH_num > 0 && lev == GH->levels - 1)
{
compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
for (int ithBH = 0; ithBH < BH_num; ithBH++)
{
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg[ithBH][0], Porg_rhs[ithBH][0], iter_count);
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg[ithBH][1], Porg_rhs[ithBH][1], iter_count);
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg[ithBH][2], Porg_rhs[ithBH][2], iter_count);
if (Symmetry > 0)
Porg[ithBH][2] = fabs(Porg[ithBH][2]);
if (Symmetry == 2)
{
Porg[ithBH][0] = fabs(Porg[ithBH][0]);
Porg[ithBH][1] = fabs(Porg[ithBH][1]);
}
}
}
if (lev == a_lev)
AnalysisStuff(lev, dT_lev);
#endif
for (iter_count = 1; iter_count < 4; iter_count++)
{
if (iter_count == 1 || iter_count == 3)
TRK4 += dT_lev / 2;
Pp = GH->PatL[lev];
while (Pp)
{
MyList<Block> *BP = Pp->data->blb;
while (BP)
{
Block *cg = BP->data;
if (myrank == cg->rank)
{
double t0 = 0.0;
if (profile_enabled)
t0 = MPI_Wtime();
ensure_stage_device_var_list(cg, SynchList_pre);
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_ENSURE, MPI_Wtime() - t0);
if (profile_enabled)
t0 = MPI_Wtime();
if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_THEN))
ERROR = 1;
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_RHS, MPI_Wtime() - t0);
if (profile_enabled)
t0 = MPI_Wtime();
run_stage_on_block(cg, Pp->data, StateList, SynchList_pre, SynchList_cor, RHSList, iter_count);
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_RUN_STAGE, MPI_Wtime() - t0);
if (profile_enabled)
t0 = MPI_Wtime();
if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi1->sgfn], chitiny, false))
{
cerr << "GPU lowerbound failure: lev=" << lev
<< " rk_stage=" << iter_count
<< " var=" << phi1->name
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
}
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_LOWERBOUND, MPI_Wtime() - t0);
}
if (BP == Pp->data->ble)
break;
BP = BP->next;
}
Pp = Pp->next;
}
if (!ERROR)
{
if (!keep_stage_sync_on_device)
{
double t0 = 0.0;
if (profile_enabled)
t0 = MPI_Wtime();
stage_download_patch_list(SynchList_cor, false);
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_DOWNLOAD, MPI_Wtime() - t0);
if (!ERROR)
{
if (profile_enabled)
t0 = MPI_Wtime();
bssn_gpu_clear_cached_device_buffers();
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_CLEAR_CACHE, MPI_Wtime() - t0);
}
}
}
MPI_Request err_req_cor;
{
int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
}
Parallel::AsyncSyncState async_cor;
if (profile_enabled)
{
const double t0 = MPI_Wtime();
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
stage_profile_add(lev, STAGE_PROFILE_SYNC_START, MPI_Wtime() - t0);
}
else
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
if (profile_enabled)
{
const double t0 = MPI_Wtime();
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry,
!keep_stage_sync_on_device);
stage_profile_add(lev, STAGE_PROFILE_SYNC_FINISH, MPI_Wtime() - t0);
}
else
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry,
!keep_stage_sync_on_device);
if (!ERROR && !keep_stage_sync_on_device && iter_count < 3)
{
if (profile_enabled)
{
const double t0 = MPI_Wtime();
refresh_stage_device_after_sync(SynchList_cor, sync_cache_cor[lev]);
stage_profile_add(lev, STAGE_PROFILE_REFRESH, MPI_Wtime() - t0);
}
else
refresh_stage_device_after_sync(SynchList_cor, sync_cache_cor[lev]);
}
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#if (MAPBH == 0)
if (BH_num > 0 && lev == GH->levels - 1)
{
compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev);
for (int ithBH = 0; ithBH < BH_num; ithBH++)
{
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg1[ithBH][0], Porg_rhs[ithBH][0], iter_count);
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg1[ithBH][1], Porg_rhs[ithBH][1], iter_count);
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg1[ithBH][2], Porg_rhs[ithBH][2], iter_count);
if (Symmetry > 0)
Porg1[ithBH][2] = fabs(Porg1[ithBH][2]);
if (Symmetry == 2)
{
Porg1[ithBH][0] = fabs(Porg1[ithBH][0]);
Porg1[ithBH][1] = fabs(Porg1[ithBH][1]);
}
}
}
#endif
if (iter_count < 3)
{
Pp = GH->PatL[lev];
while (Pp)
{
MyList<Block> *BP = Pp->data->blb;
while (BP)
{
BP->data->swapList(SynchList_pre, SynchList_cor, myrank);
if (BP == Pp->data->ble)
break;
BP = BP->next;
}
Pp = Pp->next;
}
#if (MAPBH == 0)
if (BH_num > 0 && lev == GH->levels - 1)
{
for (int ithBH = 0; ithBH < BH_num; ithBH++)
{
Porg[ithBH][0] = Porg1[ithBH][0];
Porg[ithBH][1] = Porg1[ithBH][1];
Porg[ithBH][2] = Porg1[ithBH][2];
}
}
#endif
}
}
#if (RPS == 0)
RestrictProlong(lev, YN, BB);
#endif
Pp = GH->PatL[lev];
while (Pp)
{
MyList<Block> *BP = Pp->data->blb;
while (BP)
{
Block *cg = BP->data;
cg->swapList(StateList, SynchList_cor, myrank);
cg->swapList(OldStateList, SynchList_cor, myrank);
if (BP == Pp->data->ble)
break;
BP = BP->next;
}
Pp = Pp->next;
}
if (!ERROR && keep_stage_sync_on_device)
{
// After the swaps above, only StateList points at arrays updated during this step.
// OldStateList/SynchList_cor remain valid on host because their backing arrays were
// read-only during the RK step, and SynchList_pre is reused only as scratch later.
const double t0 = profile_enabled ? MPI_Wtime() : 0.0;
stage_download_patch_list(StateList, true);
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_DOWNLOAD, MPI_Wtime() - t0);
}
if (profile_enabled)
{
const double t0 = MPI_Wtime();
bssn_gpu_clear_cached_device_buffers();
stage_profile_add(lev, STAGE_PROFILE_CLEAR_CACHE, MPI_Wtime() - t0);
}
else
bssn_gpu_clear_cached_device_buffers();
if (BH_num > 0 && lev == GH->levels - 1)
{
for (int ithBH = 0; ithBH < BH_num; ithBH++)
{
Porg0[ithBH][0] = Porg1[ithBH][0];
Porg0[ithBH][1] = Porg1[ithBH][1];
Porg0[ithBH][2] = Porg1[ithBH][2];
}
}
if (profile_enabled)
stage_profile_add(lev, STAGE_PROFILE_TOTAL, MPI_Wtime() - step_total_begin);
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -4,8 +4,10 @@
#include "bssn_macro.h" #include "bssn_macro.h"
#include "macrodef.fh" #include "macrodef.fh"
#define GRID_DIM 256 #define DEVICE_ID 0
#define BLOCK_DIM 128 // #define DEVICE_ID_BY_MPI_RANK
#define GRID_DIM 256
#define BLOCK_DIM 128
#define _FH2_(i, j, k) fh[(i) + (j) * _1D_SIZE[2] + (k) * _2D_SIZE[2]] #define _FH2_(i, j, k) fh[(i) + (j) * _1D_SIZE[2] + (k) * _2D_SIZE[2]]
#define _FH3_(i, j, k) fh[(i) + (j) * _1D_SIZE[3] + (k) * _2D_SIZE[3]] #define _FH3_(i, j, k) fh[(i) + (j) * _1D_SIZE[3] + (k) * _2D_SIZE[3]]
@@ -63,45 +65,9 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,
double *Gmx_Res, double *Gmy_Res, double *Gmz_Res, double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
int &Symmetry, int &Lev, double &eps, int &co); int &Symmetry, int &Lev, double &eps, int &co);
int gpu_rhs_ss(RHS_SS_PARA); int gpu_rhs_ss(RHS_SS_PARA);
int bssn_gpu_bind_process_device(int mpi_rank); /** Init GPU side data in GPUMeta. */
void bssn_gpu_clear_cached_device_buffers(); // void init_fluid_meta_gpu(GPUMeta *gpu_meta);
void bssn_gpu_release_pinned_host_buffers();
const double *bssn_gpu_find_device_buffer(const double *host_ptr);
void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr);
void bssn_gpu_prepare_host_buffer(const double *host_ptr, int count);
int bssn_gpu_stage_upload_buffer(const double *host_ptr, int count);
int bssn_gpu_stage_zero_buffer(const double *host_ptr, int count);
int bssn_gpu_stage_upload_region(const double *host_ptr,
const int *full_shape,
const double *full_llb,
const double *full_uub,
const int *region_shape,
const double *region_llb);
int bssn_gpu_stage_download_region(double *host_ptr,
const int *full_shape,
const double *full_llb,
const double *full_uub,
const int *region_shape,
const double *region_llb);
int bssn_gpu_stage_download_region_to_buffer(const double *host_src_ptr,
const int *full_shape,
const double *full_llb,
const double *full_uub,
const int *region_shape,
const double *region_llb,
double *host_dst_ptr);
int bssn_gpu_stage_upload_buffer_to_region(const double *host_src_ptr,
double *host_dst_ptr,
const int *full_shape,
const double *full_llb,
const double *full_uub,
const int *region_shape,
const double *region_llb);
int bssn_gpu_download_buffer_batch(const int *ex, double **host_ptrs, int num_buffers);
/** Init GPU side data in GPUMeta. */
// void init_fluid_meta_gpu(GPUMeta *gpu_meta);
#endif #endif

View File

@@ -65,10 +65,9 @@ if(TIME_COUNT_EACH_RANK == 1){\
}\ }\
} }
//3---------------------GPU--------------------- //3---------------------GPU---------------------
#define CALLED_BY_STEP 0 #define CALLED_BY_STEP 0
#define CALLED_BY_CONSTRAINT 1 #define CALLED_BY_CONSTRAINT 1
#define CALLED_BY_CONSTRAINT_CONS_ONLY 2
#define RHS_PARA_CALLED_FIRST_TIME cg->shape,TRK4,cg->X[0],cg->X[1],cg->X[2],cg->fgfs[phi0->sgfn],cg->fgfs[trK0->sgfn],cg->fgfs[gxx0->sgfn],cg->fgfs[gxy0->sgfn],cg->fgfs[gxz0->sgfn],cg->fgfs[gyy0->sgfn],cg->fgfs[gyz0->sgfn],cg->fgfs[gzz0->sgfn],cg->fgfs[Axx0->sgfn],cg->fgfs[Axy0->sgfn],cg->fgfs[Axz0->sgfn],cg->fgfs[Ayy0->sgfn],cg->fgfs[Ayz0->sgfn],cg->fgfs[Azz0->sgfn],cg->fgfs[Gmx0->sgfn],cg->fgfs[Gmy0->sgfn],cg->fgfs[Gmz0->sgfn],cg->fgfs[Lap0->sgfn],cg->fgfs[Sfx0->sgfn],cg->fgfs[Sfy0->sgfn],cg->fgfs[Sfz0->sgfn],cg->fgfs[dtSfx0->sgfn],cg->fgfs[dtSfy0->sgfn],cg->fgfs[dtSfz0->sgfn],cg->fgfs[phi_rhs->sgfn],cg->fgfs[trK_rhs->sgfn],cg->fgfs[gxx_rhs->sgfn],cg->fgfs[gxy_rhs->sgfn],cg->fgfs[gxz_rhs->sgfn],cg->fgfs[gyy_rhs->sgfn],cg->fgfs[gyz_rhs->sgfn],cg->fgfs[gzz_rhs->sgfn],cg->fgfs[Axx_rhs->sgfn],cg->fgfs[Axy_rhs->sgfn],cg->fgfs[Axz_rhs->sgfn],cg->fgfs[Ayy_rhs->sgfn],cg->fgfs[Ayz_rhs->sgfn],cg->fgfs[Azz_rhs->sgfn],cg->fgfs[Gmx_rhs->sgfn],cg->fgfs[Gmy_rhs->sgfn],cg->fgfs[Gmz_rhs->sgfn],cg->fgfs[Lap_rhs->sgfn],cg->fgfs[Sfx_rhs->sgfn],cg->fgfs[Sfy_rhs->sgfn],cg->fgfs[Sfz_rhs->sgfn],cg->fgfs[dtSfx_rhs->sgfn],cg->fgfs[dtSfy_rhs->sgfn],cg->fgfs[dtSfz_rhs->sgfn],cg->fgfs[rho->sgfn],cg->fgfs[Sx->sgfn],cg->fgfs[Sy->sgfn],cg->fgfs[Sz->sgfn],cg->fgfs[Sxx->sgfn],cg->fgfs[Sxy->sgfn],cg->fgfs[Sxz->sgfn],cg->fgfs[Syy->sgfn],cg->fgfs[Syz->sgfn],cg->fgfs[Szz->sgfn],cg->fgfs[Gamxxx->sgfn],cg->fgfs[Gamxxy->sgfn],cg->fgfs[Gamxxz->sgfn],cg->fgfs[Gamxyy->sgfn],cg->fgfs[Gamxyz->sgfn],cg->fgfs[Gamxzz->sgfn],cg->fgfs[Gamyxx->sgfn],cg->fgfs[Gamyxy->sgfn],cg->fgfs[Gamyxz->sgfn],cg->fgfs[Gamyyy->sgfn],cg->fgfs[Gamyyz->sgfn],cg->fgfs[Gamyzz->sgfn],cg->fgfs[Gamzxx->sgfn],cg->fgfs[Gamzxy->sgfn],cg->fgfs[Gamzxz->sgfn],cg->fgfs[Gamzyy->sgfn],cg->fgfs[Gamzyz->sgfn],cg->fgfs[Gamzzz->sgfn],cg->fgfs[Rxx->sgfn],cg->fgfs[Rxy->sgfn],cg->fgfs[Rxz->sgfn],cg->fgfs[Ryy->sgfn],cg->fgfs[Ryz->sgfn],cg->fgfs[Rzz->sgfn],cg->fgfs[Cons_Ham->sgfn],cg->fgfs[Cons_Px->sgfn],cg->fgfs[Cons_Py->sgfn],cg->fgfs[Cons_Pz->sgfn],cg->fgfs[Cons_Gx->sgfn],cg->fgfs[Cons_Gy->sgfn],cg->fgfs[Cons_Gz->sgfn],Symmetry,lev,ndeps,pre #define RHS_PARA_CALLED_FIRST_TIME cg->shape,TRK4,cg->X[0],cg->X[1],cg->X[2],cg->fgfs[phi0->sgfn],cg->fgfs[trK0->sgfn],cg->fgfs[gxx0->sgfn],cg->fgfs[gxy0->sgfn],cg->fgfs[gxz0->sgfn],cg->fgfs[gyy0->sgfn],cg->fgfs[gyz0->sgfn],cg->fgfs[gzz0->sgfn],cg->fgfs[Axx0->sgfn],cg->fgfs[Axy0->sgfn],cg->fgfs[Axz0->sgfn],cg->fgfs[Ayy0->sgfn],cg->fgfs[Ayz0->sgfn],cg->fgfs[Azz0->sgfn],cg->fgfs[Gmx0->sgfn],cg->fgfs[Gmy0->sgfn],cg->fgfs[Gmz0->sgfn],cg->fgfs[Lap0->sgfn],cg->fgfs[Sfx0->sgfn],cg->fgfs[Sfy0->sgfn],cg->fgfs[Sfz0->sgfn],cg->fgfs[dtSfx0->sgfn],cg->fgfs[dtSfy0->sgfn],cg->fgfs[dtSfz0->sgfn],cg->fgfs[phi_rhs->sgfn],cg->fgfs[trK_rhs->sgfn],cg->fgfs[gxx_rhs->sgfn],cg->fgfs[gxy_rhs->sgfn],cg->fgfs[gxz_rhs->sgfn],cg->fgfs[gyy_rhs->sgfn],cg->fgfs[gyz_rhs->sgfn],cg->fgfs[gzz_rhs->sgfn],cg->fgfs[Axx_rhs->sgfn],cg->fgfs[Axy_rhs->sgfn],cg->fgfs[Axz_rhs->sgfn],cg->fgfs[Ayy_rhs->sgfn],cg->fgfs[Ayz_rhs->sgfn],cg->fgfs[Azz_rhs->sgfn],cg->fgfs[Gmx_rhs->sgfn],cg->fgfs[Gmy_rhs->sgfn],cg->fgfs[Gmz_rhs->sgfn],cg->fgfs[Lap_rhs->sgfn],cg->fgfs[Sfx_rhs->sgfn],cg->fgfs[Sfy_rhs->sgfn],cg->fgfs[Sfz_rhs->sgfn],cg->fgfs[dtSfx_rhs->sgfn],cg->fgfs[dtSfy_rhs->sgfn],cg->fgfs[dtSfz_rhs->sgfn],cg->fgfs[rho->sgfn],cg->fgfs[Sx->sgfn],cg->fgfs[Sy->sgfn],cg->fgfs[Sz->sgfn],cg->fgfs[Sxx->sgfn],cg->fgfs[Sxy->sgfn],cg->fgfs[Sxz->sgfn],cg->fgfs[Syy->sgfn],cg->fgfs[Syz->sgfn],cg->fgfs[Szz->sgfn],cg->fgfs[Gamxxx->sgfn],cg->fgfs[Gamxxy->sgfn],cg->fgfs[Gamxxz->sgfn],cg->fgfs[Gamxyy->sgfn],cg->fgfs[Gamxyz->sgfn],cg->fgfs[Gamxzz->sgfn],cg->fgfs[Gamyxx->sgfn],cg->fgfs[Gamyxy->sgfn],cg->fgfs[Gamyxz->sgfn],cg->fgfs[Gamyyy->sgfn],cg->fgfs[Gamyyz->sgfn],cg->fgfs[Gamyzz->sgfn],cg->fgfs[Gamzxx->sgfn],cg->fgfs[Gamzxy->sgfn],cg->fgfs[Gamzxz->sgfn],cg->fgfs[Gamzyy->sgfn],cg->fgfs[Gamzyz->sgfn],cg->fgfs[Gamzzz->sgfn],cg->fgfs[Rxx->sgfn],cg->fgfs[Rxy->sgfn],cg->fgfs[Rxz->sgfn],cg->fgfs[Ryy->sgfn],cg->fgfs[Ryz->sgfn],cg->fgfs[Rzz->sgfn],cg->fgfs[Cons_Ham->sgfn],cg->fgfs[Cons_Px->sgfn],cg->fgfs[Cons_Py->sgfn],cg->fgfs[Cons_Pz->sgfn],cg->fgfs[Cons_Gx->sgfn],cg->fgfs[Cons_Gy->sgfn],cg->fgfs[Cons_Gz->sgfn],Symmetry,lev,ndeps,pre

View File

@@ -22,19 +22,32 @@
#define f_compute_rhs_Z4c_ss COMPUTE_RHS_Z4C_SS #define f_compute_rhs_Z4c_ss COMPUTE_RHS_Z4C_SS
#define f_compute_constraint_fr COMPUTE_CONSTRAINT_FR #define f_compute_constraint_fr COMPUTE_CONSTRAINT_FR
#endif #endif
#ifdef fortran3 #ifdef fortran3
#define f_compute_rhs_bssn compute_rhs_bssn_ #define f_compute_rhs_bssn compute_rhs_bssn_
#define f_compute_rhs_bssn_ss compute_rhs_bssn_ss_ #define f_compute_rhs_bssn_ss compute_rhs_bssn_ss_
#define f_compute_rhs_bssn_escalar compute_rhs_bssn_escalar_ #define f_compute_rhs_bssn_escalar compute_rhs_bssn_escalar_
#define f_compute_rhs_bssn_escalar_ss compute_rhs_bssn_escalar_ss_ #define f_compute_rhs_bssn_escalar_ss compute_rhs_bssn_escalar_ss_
#define f_compute_rhs_Z4c compute_rhs_z4c_ #define f_compute_rhs_Z4c compute_rhs_z4c_
#define f_compute_rhs_Z4cnot compute_rhs_z4cnot_ #define f_compute_rhs_Z4cnot compute_rhs_z4cnot_
#define f_compute_rhs_Z4c_ss compute_rhs_z4c_ss_ #define f_compute_rhs_Z4c_ss compute_rhs_z4c_ss_
#define f_compute_constraint_fr compute_constraint_fr_ #define f_compute_constraint_fr compute_constraint_fr_
#endif #endif
extern "C"
{ #ifdef __cplusplus
int f_compute_rhs_bssn(int *, double &, double *, double *, double *, // ex,T,X,Y,Z extern "C"
{
#endif
void f_bssn_rhs_kernel_timing_reset();
int f_bssn_rhs_kernel_timing_bucket_count();
const double *f_bssn_rhs_kernel_timing_local_seconds();
const char *f_bssn_rhs_kernel_timing_label(int);
#ifdef __cplusplus
}
#endif
extern "C"
{
int f_compute_rhs_bssn(int *, double &, double *, double *, double *, // ex,T,X,Y,Z
double *, double *, // chi, trK double *, double *, // chi, trK
double *, double *, double *, double *, double *, double *, // gij double *, double *, double *, double *, double *, double *, // gij
double *, double *, double *, double *, double *, double *, // Aij double *, double *, double *, double *, double *, double *, // Aij

View File

@@ -2,12 +2,88 @@
#include "bssn_rhs.h" #include "bssn_rhs.h"
#include "share_func.h" #include "share_func.h"
#include "tool.h" #include "tool.h"
#include <time.h>
// 0-based i,j,k // 0-based i,j,k
// #define IDX_F(i,j,k,nx,ny) ((i) + (j)*(nx) + (k)*(nx)*(ny)) // #define IDX_F(i,j,k,nx,ny) ((i) + (j)*(nx) + (k)*(nx)*(ny))
// ex(1)=nx, ex(2)=ny, ex(3)=nz // ex(1)=nx, ex(2)=ny, ex(3)=nz
// 用法a[ IDX_F(i,j,k,nx,ny) ] // 用法a[ IDX_F(i,j,k,nx,ny) ]
#ifndef BSSN_KERNEL_FINE_TIMING
#define BSSN_KERNEL_FINE_TIMING 0
#endif
#if BSSN_KERNEL_FINE_TIMING
namespace rhs_kernel_timing
{
enum Bucket
{
KB_SETUP_DERIVS = 0,
KB_GEOM_GAMMA,
KB_RICCI_METRIC,
KB_CHI_LAPSE,
KB_AIJ_TRK_GAUGE,
KB_KO_CONSTRAINT,
KB_COUNT
};
static double local_bucket_seconds[KB_COUNT];
static const char *bucket_labels[KB_COUNT] =
{
"setup_derivs",
"geom_gamma",
"ricci_metric",
"chi_lapse",
"aij_trk_gauge",
"ko_constraint"
};
static inline double now_seconds()
{
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return double(ts.tv_sec) + 1.0e-9 * double(ts.tv_nsec);
}
}
extern "C" void f_bssn_rhs_kernel_timing_reset()
{
for (int i = 0; i < rhs_kernel_timing::KB_COUNT; ++i)
rhs_kernel_timing::local_bucket_seconds[i] = 0.0;
}
extern "C" int f_bssn_rhs_kernel_timing_bucket_count()
{
return rhs_kernel_timing::KB_COUNT;
}
extern "C" const double *f_bssn_rhs_kernel_timing_local_seconds()
{
return rhs_kernel_timing::local_bucket_seconds;
}
extern "C" const char *f_bssn_rhs_kernel_timing_label(int bucket_index)
{
if (bucket_index < 0 || bucket_index >= rhs_kernel_timing::KB_COUNT)
return "unknown";
return rhs_kernel_timing::bucket_labels[bucket_index];
}
#define RHS_KERNEL_TIMER_DECL(var_name) const double var_name = rhs_kernel_timing::now_seconds()
#define RHS_KERNEL_TIMER_ADD(bucket_name, var_name) \
rhs_kernel_timing::local_bucket_seconds[int(rhs_kernel_timing::bucket_name)] += \
rhs_kernel_timing::now_seconds() - (var_name)
#else
extern "C" void f_bssn_rhs_kernel_timing_reset() {}
extern "C" int f_bssn_rhs_kernel_timing_bucket_count() { return 0; }
extern "C" const double *f_bssn_rhs_kernel_timing_local_seconds() { return 0; }
extern "C" const char *f_bssn_rhs_kernel_timing_label(int) { return "disabled"; }
#define RHS_KERNEL_TIMER_DECL(var_name)
#define RHS_KERNEL_TIMER_ADD(bucket_name, var_name)
#endif
// C function that calculates the right-hand side for BSSN equations // C function that calculates the right-hand side for BSSN equations
int f_compute_rhs_bssn(int *ex, double &T, int f_compute_rhs_bssn(int *ex, double &T,
double *X, double *Y, double *Z, double *X, double *Y, double *Z,
@@ -102,6 +178,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
dY = Y[1] - Y[0]; dY = Y[1] - Y[0];
dZ = Z[1] - Z[0]; dZ = Z[1] - Z[0];
RHS_KERNEL_TIMER_DECL(timer_setup_derivs);
// 1ms // // 1ms //
for(int i=0;i<all;i+=1){ for(int i=0;i<all;i+=1){
alpn1[i] = Lap[i] + 1.0; alpn1[i] = Lap[i] + 1.0;
@@ -141,6 +218,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
(dxx[i] + ONE) * betaxz[i] + gxy[i] * betayz[i] + gyz[i] * betayx[i] (dxx[i] + ONE) * betaxz[i] + gxy[i] * betayz[i] + gyz[i] * betayx[i]
+ (dzz[i] + ONE) * betazx[i] - gxz[i] * betayy[i]; + (dzz[i] + ONE) * betazx[i] - gxz[i] * betayy[i];
} }
RHS_KERNEL_TIMER_ADD(KB_SETUP_DERIVS, timer_setup_derivs);
RHS_KERNEL_TIMER_DECL(timer_geom_gamma);
// Fused: inverse metric + Gamma constraint + Christoffel (3 loops -> 1) // Fused: inverse metric + Gamma constraint + Christoffel (3 loops -> 1)
for(int i=0;i<all;i+=1){ for(int i=0;i<all;i+=1){
double det = (dxx[i] + ONE) * (dyy[i] + ONE) * (dzz[i] + ONE) + gxy[i] * gyz[i] * gxz[i] + gxz[i] * gxy[i] * gyz[i] - double det = (dxx[i] + ONE) * (dyy[i] + ONE) * (dzz[i] + ONE) + gxy[i] * gyz[i] * gxz[i] + gxz[i] * gxy[i] * gyz[i] -
@@ -283,9 +362,6 @@ int f_compute_rhs_bssn(int *ex, double &T,
+ ( gupxy[i]*gupyz[i] + gupyy[i]*gupxz[i] ) * Axy[i] + ( gupxy[i]*gupyz[i] + gupyy[i]*gupxz[i] ) * Axy[i]
+ ( gupxy[i]*gupzz[i] + gupyz[i]*gupxz[i] ) * Axz[i] + ( gupxy[i]*gupzz[i] + gupyz[i]*gupxz[i] ) * Axz[i]
+ ( gupyy[i]*gupzz[i] + gupyz[i]*gupyz[i] ) * Ayz[i]; + ( gupyy[i]*gupzz[i] + gupyz[i]*gupyz[i] ) * Ayz[i];
Rxx[i] = axx; Ryy[i] = ayy; Rzz[i] = azz;
Rxy[i] = axy; Rxz[i] = axz; Ryz[i] = ayz;
Gamx_rhs[i] = - TWO * ( Lapx[i]*axx + Lapy[i]*axy + Lapz[i]*axz ) + Gamx_rhs[i] = - TWO * ( Lapx[i]*axx + Lapy[i]*axy + Lapz[i]*axz ) +
TWO * alpn1[i] * ( TWO * alpn1[i] * (
-F3o2/chin1[i] * ( chix[i]*axx + chiy[i]*axy + chiz[i]*axz ) - -F3o2/chin1[i] * ( chix[i]*axx + chiy[i]*axy + chiz[i]*axz ) -
@@ -315,6 +391,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
+ TWO * ( Gamzxy[i]*axy + Gamzxz[i]*axz + Gamzyz[i]*ayz ) + TWO * ( Gamzxy[i]*axy + Gamzxz[i]*axz + Gamzyz[i]*ayz )
); );
} }
RHS_KERNEL_TIMER_ADD(KB_GEOM_GAMMA, timer_geom_gamma);
RHS_KERNEL_TIMER_DECL(timer_ricci_metric);
// 22.3ms // // 22.3ms //
fdderivs(ex,betax,gxxx,gxyx,gxzx,gyyx,gyzx,gzzx, fdderivs(ex,betax,gxxx,gxyx,gxzx,gyyx,gyzx,gzzx,
X,Y,Z,ANTI,SYM, SYM ,Symmetry,Lev); X,Y,Z,ANTI,SYM, SYM ,Symmetry,Lev);
@@ -332,7 +410,6 @@ int f_compute_rhs_bssn(int *ex, double &T,
double lfxx = gxxx[i] + gxyy[i] + gxzz[i]; double lfxx = gxxx[i] + gxyy[i] + gxzz[i];
double lfxy = gxyx[i] + gyyy[i] + gyzz[i]; double lfxy = gxyx[i] + gyyy[i] + gyzz[i];
double lfxz = gxzx[i] + gyzy[i] + gzzz[i]; double lfxz = gxzx[i] + gyzy[i] + gzzz[i];
fxx[i] = lfxx; fxy[i] = lfxy; fxz[i] = lfxz;
double gxa = gupxx[i]*Gamxxx[i] + gupyy[i]*Gamxyy[i] + gupzz[i]*Gamxzz[i] double gxa = gupxx[i]*Gamxxx[i] + gupyy[i]*Gamxyy[i] + gupzz[i]*Gamxzz[i]
+ TWO * ( gupxy[i]*Gamxxy[i] + gupxz[i]*Gamxxz[i] + gupyz[i]*Gamxyz[i] ); + TWO * ( gupxy[i]*Gamxxy[i] + gupxz[i]*Gamxxz[i] + gupyz[i]*Gamxyz[i] );
@@ -686,69 +763,74 @@ int f_compute_rhs_bssn(int *ex, double &T,
+ Gamxyz[i] * gzzx[i] + Gamyyz[i] * gzzy[i] + Gamzyz[i] * gzzz[i] + Gamxyz[i] * gzzx[i] + Gamyyz[i] * gzzy[i] + Gamzyz[i] * gzzz[i]
); );
} }
RHS_KERNEL_TIMER_ADD(KB_RICCI_METRIC, timer_ricci_metric);
RHS_KERNEL_TIMER_DECL(timer_chi_lapse);
// 22.3ms // // 22.3ms //
fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev); fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
// 7ms // // 7ms //
for (int i=0;i<all;i+=1) { for (int i=0;i<all;i+=1) {
fxx[i] = fxx[i] - Gamxxx[i] * chix[i] - Gamyxx[i] * chiy[i] - Gamzxx[i] * chiz[i]; const double inv_chin1 = ONE / chin1[i];
fxy[i] = fxy[i] - Gamxxy[i] * chix[i] - Gamyxy[i] * chiy[i] - Gamzxy[i] * chiz[i]; const double half_inv_chin1 = HALF * inv_chin1;
fxz[i] = fxz[i] - Gamxxz[i] * chix[i] - Gamyxz[i] * chiy[i] - Gamzxz[i] * chiz[i]; const double scaled_inv = F3o2 * inv_chin1;
fyy[i] = fyy[i] - Gamxyy[i] * chix[i] - Gamyyy[i] * chiy[i] - Gamzyy[i] * chiz[i]; const double cxx = fxx[i] - Gamxxx[i] * chix[i] - Gamyxx[i] * chiy[i] - Gamzxx[i] * chiz[i];
fyz[i] = fyz[i] - Gamxyz[i] * chix[i] - Gamyyz[i] * chiy[i] - Gamzyz[i] * chiz[i]; const double cxy = fxy[i] - Gamxxy[i] * chix[i] - Gamyxy[i] * chiy[i] - Gamzxy[i] * chiz[i];
fzz[i] = fzz[i] - Gamxzz[i] * chix[i] - Gamyzz[i] * chiy[i] - Gamzzz[i] * chiz[i]; const double cxz = fxz[i] - Gamxxz[i] * chix[i] - Gamyxz[i] * chiy[i] - Gamzxz[i] * chiz[i];
f[i] = const double cyy = fyy[i] - Gamxyy[i] * chix[i] - Gamyyy[i] * chiy[i] - Gamzyy[i] * chiz[i];
gupxx[i] * (fxx[i] - (F3o2 / chin1[i]) * chix[i] * chix[i]) const double cyz = fyz[i] - Gamxyz[i] * chix[i] - Gamyyz[i] * chiy[i] - Gamzyz[i] * chiz[i];
+ gupyy[i] * (fyy[i] - (F3o2 / chin1[i]) * chiy[i] * chiy[i]) const double czz = fzz[i] - Gamxzz[i] * chix[i] - Gamyzz[i] * chiy[i] - Gamzzz[i] * chiz[i];
+ gupzz[i] * (fzz[i] - (F3o2 / chin1[i]) * chiz[i] * chiz[i]) const double ricci_chi =
+ TWO * gupxy[i] * (fxy[i] - (F3o2 / chin1[i]) * chix[i] * chiy[i]) gupxx[i] * (cxx - scaled_inv * chix[i] * chix[i])
+ TWO * gupxz[i] * (fxz[i] - (F3o2 / chin1[i]) * chix[i] * chiz[i]) + gupyy[i] * (cyy - scaled_inv * chiy[i] * chiy[i])
+ TWO * gupyz[i] * (fyz[i] - (F3o2 / chin1[i]) * chiy[i] * chiz[i]); + gupzz[i] * (czz - scaled_inv * chiz[i] * chiz[i])
Rxx[i] = Rxx[i] + ( fxx[i] - (chix[i] * chix[i]) / (chin1[i] * TWO) + (dxx[i] + ONE) * f[i] ) / (chin1[i] * TWO); + TWO * gupxy[i] * (cxy - scaled_inv * chix[i] * chiy[i])
Ryy[i] = Ryy[i] + ( fyy[i] - (chiy[i] * chiy[i]) / (chin1[i] * TWO) + (dyy[i] + ONE) * f[i] ) / (chin1[i] * TWO); + TWO * gupxz[i] * (cxz - scaled_inv * chix[i] * chiz[i])
Rzz[i] = Rzz[i] + ( fzz[i] - (chiz[i] * chiz[i]) / (chin1[i] * TWO) + (dzz[i] + ONE) * f[i] ) / (chin1[i] * TWO); + TWO * gupyz[i] * (cyz - scaled_inv * chiy[i] * chiz[i]);
f[i] = ricci_chi;
Rxx[i] = Rxx[i] + ( cxx - half_inv_chin1 * chix[i] * chix[i] + (dxx[i] + ONE) * ricci_chi ) * half_inv_chin1;
Ryy[i] = Ryy[i] + ( cyy - half_inv_chin1 * chiy[i] * chiy[i] + (dyy[i] + ONE) * ricci_chi ) * half_inv_chin1;
Rzz[i] = Rzz[i] + ( czz - half_inv_chin1 * chiz[i] * chiz[i] + (dzz[i] + ONE) * ricci_chi ) * half_inv_chin1;
Rxy[i] = Rxy[i] + ( fxy[i] - (chix[i] * chiy[i]) / (chin1[i] * TWO) + gxy[i] * f[i] ) / (chin1[i] * TWO); Rxy[i] = Rxy[i] + ( cxy - half_inv_chin1 * chix[i] * chiy[i] + gxy[i] * ricci_chi ) * half_inv_chin1;
Rxz[i] = Rxz[i] + ( fxz[i] - (chix[i] * chiz[i]) / (chin1[i] * TWO) + gxz[i] * f[i] ) / (chin1[i] * TWO); Rxz[i] = Rxz[i] + ( cxz - half_inv_chin1 * chix[i] * chiz[i] + gxz[i] * ricci_chi ) * half_inv_chin1;
Ryz[i] = Ryz[i] + ( fyz[i] - (chiy[i] * chiz[i]) / (chin1[i] * TWO) + gyz[i] * f[i] ) / (chin1[i] * TWO); Ryz[i] = Ryz[i] + ( cyz - half_inv_chin1 * chiy[i] * chiz[i] + gyz[i] * ricci_chi ) * half_inv_chin1;
} }
// 24ms // // 24ms //
fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev); fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
// 6ms // // 6ms //
for (int i=0;i<all;i+=1) { for (int i=0;i<all;i+=1) {
/* gxxx,gxxy,gxxz (这里是“升指标后的chi导数/chi”那类量你沿用原变量名即可) */ const double inv_chin1 = ONE / chin1[i];
gxxx[i] = (gupxx[i] * chix[i] + gupxy[i] * chiy[i] + gupxz[i] * chiz[i]) / chin1[i]; const double gchi_x = (gupxx[i] * chix[i] + gupxy[i] * chiy[i] + gupxz[i] * chiz[i]) * inv_chin1;
gxxy[i] = (gupxy[i] * chix[i] + gupyy[i] * chiy[i] + gupyz[i] * chiz[i]) / chin1[i]; const double gchi_y = (gupxy[i] * chix[i] + gupyy[i] * chiy[i] + gupyz[i] * chiz[i]) * inv_chin1;
gxxz[i] = (gupxz[i] * chix[i] + gupyz[i] * chiy[i] + gupzz[i] * chiz[i]) / chin1[i]; const double gchi_z = (gupxz[i] * chix[i] + gupyz[i] * chiy[i] + gupzz[i] * chiz[i]) * inv_chin1;
/* Christoffel 修正项 */ /* Christoffel 修正项 */
Gamxxx[i] = Gamxxx[i] - ( ((chix[i] + chix[i]) / chin1[i]) - (dxx[i] + ONE) * gxxx[i] ) * HALF; Gamxxx[i] = Gamxxx[i] - ( ((chix[i] + chix[i]) * inv_chin1) - (dxx[i] + ONE) * gchi_x ) * HALF;
Gamyxx[i] = Gamyxx[i] - ( 0.0 - (dxx[i] + ONE) * gxxy[i] ) * HALF; /* 原式只有 -gxx*gxxy */ Gamyxx[i] = Gamyxx[i] - ( 0.0 - (dxx[i] + ONE) * gchi_y ) * HALF; /* 原式只有 -gxx*gxxy */
Gamzxx[i] = Gamzxx[i] - ( 0.0 - (dxx[i] + ONE) * gxxz[i] ) * HALF; Gamzxx[i] = Gamzxx[i] - ( 0.0 - (dxx[i] + ONE) * gchi_z ) * HALF;
Gamxyy[i] = Gamxyy[i] - ( 0.0 - (dyy[i] + ONE) * gxxx[i] ) * HALF; Gamxyy[i] = Gamxyy[i] - ( 0.0 - (dyy[i] + ONE) * gchi_x ) * HALF;
Gamyyy[i] = Gamyyy[i] - ( ((chiy[i] + chiy[i]) / chin1[i]) - (dyy[i] + ONE) * gxxy[i] ) * HALF; Gamyyy[i] = Gamyyy[i] - ( ((chiy[i] + chiy[i]) * inv_chin1) - (dyy[i] + ONE) * gchi_y ) * HALF;
Gamzyy[i] = Gamzyy[i] - ( 0.0 - (dyy[i] + ONE) * gxxz[i] ) * HALF; Gamzyy[i] = Gamzyy[i] - ( 0.0 - (dyy[i] + ONE) * gchi_z ) * HALF;
Gamxzz[i] = Gamxzz[i] - ( 0.0 - (dzz[i] + ONE) * gxxx[i] ) * HALF; Gamxzz[i] = Gamxzz[i] - ( 0.0 - (dzz[i] + ONE) * gchi_x ) * HALF;
Gamyzz[i] = Gamyzz[i] - ( 0.0 - (dzz[i] + ONE) * gxxy[i] ) * HALF; Gamyzz[i] = Gamyzz[i] - ( 0.0 - (dzz[i] + ONE) * gchi_y ) * HALF;
Gamzzz[i] = Gamzzz[i] - ( ((chiz[i] + chiz[i]) / chin1[i]) - (dzz[i] + ONE) * gxxz[i] ) * HALF; Gamzzz[i] = Gamzzz[i] - ( ((chiz[i] + chiz[i]) * inv_chin1) - (dzz[i] + ONE) * gchi_z ) * HALF;
Gamxxy[i] = Gamxxy[i] - ( ( chiy[i] / chin1[i]) - gxy[i] * gxxx[i] ) * HALF; Gamxxy[i] = Gamxxy[i] - ( ( chiy[i] * inv_chin1) - gxy[i] * gchi_x ) * HALF;
Gamyxy[i] = Gamyxy[i] - ( ( chix[i] / chin1[i]) - gxy[i] * gxxy[i] ) * HALF; Gamyxy[i] = Gamyxy[i] - ( ( chix[i] * inv_chin1) - gxy[i] * gchi_y ) * HALF;
Gamzxy[i] = Gamzxy[i] - ( 0.0 - gxy[i] * gxxz[i] ) * HALF; Gamzxy[i] = Gamzxy[i] - ( 0.0 - gxy[i] * gchi_z ) * HALF;
Gamxxz[i] = Gamxxz[i] - ( ( chiz[i] / chin1[i]) - gxz[i] * gxxx[i] ) * HALF; Gamxxz[i] = Gamxxz[i] - ( ( chiz[i] * inv_chin1) - gxz[i] * gchi_x ) * HALF;
Gamyxz[i] = Gamyxz[i] - ( 0.0 - gxz[i] * gxxy[i] ) * HALF; Gamyxz[i] = Gamyxz[i] - ( 0.0 - gxz[i] * gchi_y ) * HALF;
Gamzxz[i] = Gamzxz[i] - ( ( chix[i] / chin1[i]) - gxz[i] * gxxz[i] ) * HALF; Gamzxz[i] = Gamzxz[i] - ( ( chix[i] * inv_chin1) - gxz[i] * gchi_z ) * HALF;
Gamxyz[i] = Gamxyz[i] - ( 0.0 - gyz[i] * gxxx[i] ) * HALF; Gamxyz[i] = Gamxyz[i] - ( 0.0 - gyz[i] * gchi_x ) * HALF;
Gamyyz[i] = Gamyyz[i] - ( ( chiz[i] / chin1[i]) - gyz[i] * gxxy[i] ) * HALF; Gamyyz[i] = Gamyyz[i] - ( ( chiz[i] * inv_chin1) - gyz[i] * gchi_y ) * HALF;
Gamzyz[i] = Gamzyz[i] - ( ( chiy[i] / chin1[i]) - gyz[i] * gxxz[i] ) * HALF; Gamzyz[i] = Gamzyz[i] - ( ( chiy[i] * inv_chin1) - gyz[i] * gchi_z ) * HALF;
/* fxx..fyz 修正:减去 Γ * ∂Lap */ /* fxx..fyz 修正:减去 Γ * ∂Lap */
fxx[i] = fxx[i] - Gamxxx[i] * Lapx[i] - Gamyxx[i] * Lapy[i] - Gamzxx[i] * Lapz[i]; fxx[i] = fxx[i] - Gamxxx[i] * Lapx[i] - Gamyxx[i] * Lapy[i] - Gamzxx[i] * Lapz[i];
@@ -762,6 +844,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
trK_rhs[i] = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i] trK_rhs[i] = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i]
+ TWO * ( gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i] ); + TWO * ( gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i] );
} }
RHS_KERNEL_TIMER_ADD(KB_CHI_LAPSE, timer_chi_lapse);
RHS_KERNEL_TIMER_DECL(timer_aij_trk_gauge);
// 2.5ms // // 2.5ms //
for (int i=0;i<all;i+=1) { for (int i=0;i<all;i+=1) {
const double divb = betaxx[i] + betayy[i] + betazz[i]; const double divb = betaxx[i] + betayy[i] + betazz[i];
@@ -1022,16 +1106,9 @@ int f_compute_rhs_bssn(int *ex, double &T,
+ gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] ); + gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] );
#if (GAUGE == 2) #if (GAUGE == 2)
{ reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - sqrt(chin1[i])), 2.0 );
const double chi_sqrt = sqrt(chin1[i]);
const double damping = ONE - chi_sqrt;
reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / (damping * damping);
}
#else #else
{ reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - chin1[i]), 2.0 );
const double damping = ONE - chin1[i];
reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / (damping * damping);
}
#endif #endif
dtSfx_rhs[i] = Gamx_rhs[i] - reta[i] * dtSfx[i]; dtSfx_rhs[i] = Gamx_rhs[i] - reta[i] * dtSfx[i];
@@ -1047,16 +1124,9 @@ int f_compute_rhs_bssn(int *ex, double &T,
+ gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] ); + gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] );
#if (GAUGE == 4) #if (GAUGE == 4)
{ reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - sqrt(chin1[i])), 2.0 );
const double chi_sqrt = sqrt(chin1[i]);
const double damping = ONE - chi_sqrt;
reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / (damping * damping);
}
#else #else
{ reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - chin1[i]), 2.0 );
const double damping = ONE - chin1[i];
reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / (damping * damping);
}
#endif #endif
betax_rhs[i] = FF * Gamx[i] - reta[i] * betax[i]; betax_rhs[i] = FF * Gamx[i] - reta[i] * betax[i];
@@ -1076,6 +1146,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
dtSfz_rhs[i] = Gamz_rhs[i] - reta[i] * dtSfz[i]; dtSfz_rhs[i] = Gamz_rhs[i] - reta[i] * dtSfz[i];
#endif #endif
} }
RHS_KERNEL_TIMER_ADD(KB_AIJ_TRK_GAUGE, timer_aij_trk_gauge);
RHS_KERNEL_TIMER_DECL(timer_ko_constraint);
// advection + KO dissipation with shared symmetry buffer // advection + KO dissipation with shared symmetry buffer
lopsided_kodis(ex,X,Y,Z,dxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps); lopsided_kodis(ex,X,Y,Z,dxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps);
lopsided_kodis(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA,eps); lopsided_kodis(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA,eps);
@@ -1207,6 +1279,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
movz_Res[i] = movz_Res[i] - F2o3*Kz[i] - F8*PI*Sz[i]; movz_Res[i] = movz_Res[i] - F2o3*Kz[i] - F8*PI*Sz[i];
} }
} }
RHS_KERNEL_TIMER_ADD(KB_KO_CONSTRAINT, timer_ko_constraint);

View File

@@ -23,14 +23,10 @@ using namespace std;
#include <mpi.h> #include <mpi.h>
#include "macrodef.h" #include "macrodef.h"
#include "misc.h" #include "misc.h"
#include "cgh.h" #include "cgh.h"
#include "Parallel.h" #include "Parallel.h"
#include "parameters.h" #include "parameters.h"
#ifdef USE_GPU
#include "bssn_gpu.h"
#include "bssn_cuda_ops.h"
#endif
//================================================================================================ //================================================================================================
@@ -885,20 +881,13 @@ void cgh::recompose_cgh(int nprocs, bool *lev_flag,
tmPat = construct_patchlist(lev, Symmetry); tmPat = construct_patchlist(lev, Symmetry);
// tmPat construction completes // tmPat construction completes
Parallel::distribute(tmPat, nprocs, ingfs, fngfs, false); Parallel::distribute(tmPat, nprocs, ingfs, fngfs, false);
// checkPatchList(tmPat,true); // checkPatchList(tmPat,true);
bool CC = (lev > trfls); bool CC = (lev > trfls);
Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC); Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
#ifdef USE_GPU Parallel::KillBlocks(PatL[lev]);
bssn_gpu_clear_cached_device_buffers(); PatL[lev]->destroyList();
bssn_gpu_release_pinned_host_buffers(); PatL[lev] = tmPat;
bssn_cuda_release_rk4_caches();
bssn_cuda_release_interp_caches();
patch_release_interp_plan_cache();
#endif
Parallel::KillBlocks(PatL[lev]);
PatL[lev]->destroyList();
PatL[lev] = tmPat;
#if (RPB == 1) #if (RPB == 1)
Parallel::destroypsuList_bam(bdsul[lev]); Parallel::destroypsuList_bam(bdsul[lev]);
Parallel::destroypsuList_bam(rsul[lev]); Parallel::destroypsuList_bam(rsul[lev]);
@@ -921,20 +910,13 @@ void cgh::recompose_cgh(int nprocs, bool *lev_flag,
tmPat = construct_patchlist(lev, Symmetry); tmPat = construct_patchlist(lev, Symmetry);
// tmPat construction completes // tmPat construction completes
Parallel::distribute(tmPat, end_rank[lev] - start_rank[lev] + 1, ingfs, fngfs, false, start_rank[lev], end_rank[lev]); Parallel::distribute(tmPat, end_rank[lev] - start_rank[lev] + 1, ingfs, fngfs, false, start_rank[lev], end_rank[lev]);
// checkPatchList(tmPat,true); // checkPatchList(tmPat,true);
bool CC = (lev > trfls); bool CC = (lev > trfls);
Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC); Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
#ifdef USE_GPU Parallel::KillBlocks(PatL[lev]);
bssn_gpu_clear_cached_device_buffers(); PatL[lev]->destroyList();
bssn_gpu_release_pinned_host_buffers(); PatL[lev] = tmPat;
bssn_cuda_release_rk4_caches();
bssn_cuda_release_interp_caches();
patch_release_interp_plan_cache();
#endif
Parallel::KillBlocks(PatL[lev]);
PatL[lev]->destroyList();
PatL[lev] = tmPat;
#if (RPB == 1) #if (RPB == 1)
#error "not support yet" #error "not support yet"
#endif #endif
@@ -1536,20 +1518,13 @@ void cgh::recompose_cgh_Onelevel(int nprocs, int lev,
tmPat = construct_patchlist(lev, Symmetry); tmPat = construct_patchlist(lev, Symmetry);
// tmPat construction completes // tmPat construction completes
Parallel::distribute(tmPat, nprocs, ingfs, fngfs, false); Parallel::distribute(tmPat, nprocs, ingfs, fngfs, false);
// checkPatchList(tmPat,true); // checkPatchList(tmPat,true);
bool CC = (lev > trfls); bool CC = (lev > trfls);
Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC); Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
#ifdef USE_GPU Parallel::KillBlocks(PatL[lev]);
bssn_gpu_clear_cached_device_buffers(); PatL[lev]->destroyList();
bssn_gpu_release_pinned_host_buffers(); PatL[lev] = tmPat;
bssn_cuda_release_rk4_caches();
bssn_cuda_release_interp_caches();
patch_release_interp_plan_cache();
#endif
Parallel::KillBlocks(PatL[lev]);
PatL[lev]->destroyList();
PatL[lev] = tmPat;
} }
#elif (PSTR == 1 || PSTR == 2 || PSTR == 3) #elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
#warning "recompose_cgh_Onelevel is not implimented yet" #warning "recompose_cgh_Onelevel is not implimented yet"
@@ -1565,21 +1540,14 @@ void cgh::recompose_cgh_Onelevel(int nprocs, int lev,
// tmPat construction completes // tmPat construction completes
Parallel::distribute(tmPat, end_rank[lev] - start_rank[lev] + 1, ingfs, fngfs, false, start_rank[lev], end_rank[lev]); Parallel::distribute(tmPat, end_rank[lev] - start_rank[lev] + 1, ingfs, fngfs, false, start_rank[lev], end_rank[lev]);
misc::tillherecheck(Commlev[lev], start_rank[lev], "after distribute"); misc::tillherecheck(Commlev[lev], start_rank[lev], "after distribute");
// checkPatchList(tmPat,true); // checkPatchList(tmPat,true);
bool CC = (lev > trfls); bool CC = (lev > trfls);
Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC); Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
misc::tillherecheck(Commlev[lev], start_rank[lev], "after fill_level_data"); misc::tillherecheck(Commlev[lev], start_rank[lev], "after fill_level_data");
#ifdef USE_GPU Parallel::KillBlocks(PatL[lev]);
bssn_gpu_clear_cached_device_buffers(); PatL[lev]->destroyList();
bssn_gpu_release_pinned_host_buffers(); PatL[lev] = tmPat;
bssn_cuda_release_rk4_caches();
bssn_cuda_release_interp_caches();
patch_release_interp_plan_cache();
#endif
Parallel::KillBlocks(PatL[lev]);
PatL[lev]->destroyList();
PatL[lev] = tmPat;
} }

View File

@@ -1511,13 +1511,88 @@ deallocate(f_flat)
f_out = f_out*dX*dY*dZ f_out = f_out*dX*dY*dZ
return return
end subroutine l2normhelper end subroutine l2normhelper
!-------------------------------------------------------------------------------------- !--------------------------------------------------------------------------------------
! calculate L2norm especially for shell Blocks subroutine l2normhelper7(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
subroutine l2normhelper_sh(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,& f1,f2,f3,f4,f5,f6,f7,f_out,gw)
f,f_out,gw,ogw,Symmetry)
implicit none
!~~~~~~> Input parameters:
integer,intent(in ):: ex(1:3)
real*8, intent(in ):: X(1:ex(1)),Y(1:ex(2)),Z(1:ex(3)),xmin,ymin,zmin,xmax,ymax,zmax
integer,intent(in)::gw
real*8, dimension(ex(1),ex(2),ex(3)),intent(in) :: f1,f2,f3,f4,f5,f6,f7
real*8, intent(out) :: f_out(7)
!~~~~~~> Other variables:
real*8 :: dX, dY, dZ
integer::imin,jmin,kmin
integer::imax,jmax,kmax
integer::i,j,k
real*8 :: s1,s2,s3,s4,s5,s6,s7
dX = X(2) - X(1)
dY = Y(2) - Y(1)
dZ = Z(2) - Z(1)
! for ghost zone
imin = gw+1
jmin = gw+1
kmin = gw+1
imax = ex(1) - gw
jmax = ex(2) - gw
kmax = ex(3) - gw
!for patch boundary (i.e., not ghost boundary)
if(dabs(X(ex(1))-xmax) < dX) imax = ex(1)
if(dabs(Y(ex(2))-ymax) < dY) jmax = ex(2)
if(dabs(Z(ex(3))-zmax) < dZ) kmax = ex(3)
if(dabs(X(1)-xmin) < dX) imin = 1
if(dabs(Y(1)-ymin) < dY) jmin = 1
if(dabs(Z(1)-zmin) < dZ) kmin = 1
s1 = 0.d0
s2 = 0.d0
s3 = 0.d0
s4 = 0.d0
s5 = 0.d0
s6 = 0.d0
s7 = 0.d0
do k=kmin,kmax
do j=jmin,jmax
!DIR$ SIMD REDUCTION(+:s1,s2,s3,s4,s5,s6,s7)
do i=imin,imax
s1 = s1 + f1(i,j,k)*f1(i,j,k)
s2 = s2 + f2(i,j,k)*f2(i,j,k)
s3 = s3 + f3(i,j,k)*f3(i,j,k)
s4 = s4 + f4(i,j,k)*f4(i,j,k)
s5 = s5 + f5(i,j,k)*f5(i,j,k)
s6 = s6 + f6(i,j,k)*f6(i,j,k)
s7 = s7 + f7(i,j,k)*f7(i,j,k)
enddo
enddo
enddo
f_out(1) = s1*dX*dY*dZ
f_out(2) = s2*dX*dY*dZ
f_out(3) = s3*dX*dY*dZ
f_out(4) = s4*dX*dY*dZ
f_out(5) = s5*dX*dY*dZ
f_out(6) = s6*dX*dY*dZ
f_out(7) = s7*dX*dY*dZ
return
end subroutine l2normhelper7
!--------------------------------------------------------------------------------------
! calculate L2norm especially for shell Blocks
subroutine l2normhelper_sh(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
f,f_out,gw,ogw,Symmetry)
implicit none implicit none
!~~~~~~> Input parameters: !~~~~~~> Input parameters:

View File

@@ -12,9 +12,10 @@
#define f_global_interpind global_interpind #define f_global_interpind global_interpind
#define f_global_interpind2d global_interpind2d #define f_global_interpind2d global_interpind2d
#define f_global_interpind1d global_interpind1d #define f_global_interpind1d global_interpind1d
#define f_l2normhelper l2normhelper #define f_l2normhelper l2normhelper
#define f_l2normhelper_sh l2normhelper_sh #define f_l2normhelper7 l2normhelper7
#define f_l2normhelper_sh_rms l2normhelper_sh_rms #define f_l2normhelper_sh l2normhelper_sh
#define f_l2normhelper_sh_rms l2normhelper_sh_rms
#define f_average average #define f_average average
#define f_average3 average3 #define f_average3 average3
#define f_average2 average2 #define f_average2 average2
@@ -41,9 +42,10 @@
#define f_global_interpind GLOBAL_INTERPIND #define f_global_interpind GLOBAL_INTERPIND
#define f_global_interpind2d GLOBAL_INTERPIND2D #define f_global_interpind2d GLOBAL_INTERPIND2D
#define f_global_interpind1d GLOBAL_INTERPIND1D #define f_global_interpind1d GLOBAL_INTERPIND1D
#define f_l2normhelper L2NORMHELPER #define f_l2normhelper L2NORMHELPER
#define f_l2normhelper_sh L2NORMHELPER_SH #define f_l2normhelper7 L2NORMHELPER7
#define f_l2normhelper_sh_rms L2NORMHELPER_SH_RMS #define f_l2normhelper_sh L2NORMHELPER_SH
#define f_l2normhelper_sh_rms L2NORMHELPER_SH_RMS
#define f_average AVERAGE #define f_average AVERAGE
#define f_average3 AVERAGE3 #define f_average3 AVERAGE3
#define f_average2 AVERAGE2 #define f_average2 AVERAGE2
@@ -70,9 +72,10 @@
#define f_global_interpind global_interpind_ #define f_global_interpind global_interpind_
#define f_global_interpind2d global_interpind2d_ #define f_global_interpind2d global_interpind2d_
#define f_global_interpind1d global_interpind1d_ #define f_global_interpind1d global_interpind1d_
#define f_l2normhelper l2normhelper_ #define f_l2normhelper l2normhelper_
#define f_l2normhelper_sh l2normhelper_sh_ #define f_l2normhelper7 l2normhelper7_
#define f_l2normhelper_sh_rms l2normhelper_sh_rms_ #define f_l2normhelper_sh l2normhelper_sh_
#define f_l2normhelper_sh_rms l2normhelper_sh_rms_
#define f_average average_ #define f_average average_
#define f_average3 average3_ #define f_average3 average3_
#define f_average2 average2_ #define f_average2 average2_
@@ -156,20 +159,29 @@ extern "C"
int *, double *, int &, int &); int *, double *, int &, int &);
} }
extern "C" extern "C"
{ {
void f_l2normhelper(int *, double *, double *, double *, void f_l2normhelper(int *, double *, double *, double *,
double &, double &, double &, double &, double &, double &,
double &, double &, double &, double &, double &, double &,
double *, double &, int &); double *, double &, int &);
} }
extern "C" extern "C"
{ {
void f_l2normhelper_sh(int *, double *, double *, double *, void f_l2normhelper7(int *, double *, double *, double *,
double &, double &, double &, double &, double &, double &,
double &, double &, double &, double &, double &, double &,
double *, double &, int &, int &, int &); double *, double *, double *, double *,
double *, double *, double *, double *, int &);
}
extern "C"
{
void f_l2normhelper_sh(int *, double *, double *, double *,
double &, double &, double &,
double &, double &, double &,
double *, double &, int &, int &, int &);
} }
extern "C" extern "C"

View File

@@ -17,68 +17,106 @@ using namespace std;
#include <math.h> #include <math.h>
#endif #endif
// Intel oneMKL LAPACK interface /* Linear equation solution by Gauss-Jordan elimination.
#include <mkl_lapacke.h> a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input
/* Linear equation solution using Intel oneMKL LAPACK. containing the right-hand side vectors. On output a is
a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input replaced by its matrix inverse, and b is replaced by the
containing the right-hand side vectors. On output a is corresponding set of solution vectors. */
replaced by its matrix inverse, and b is replaced by the
corresponding set of solution vectors. int gaussj(double *a, double *b, int n)
{
Mathematical equivalence: double swap;
Solves: A * x = b => x = A^(-1) * b
Original Gauss-Jordan and LAPACK dgesv/dgetri produce identical results int *indxc, *indxr, *ipiv;
within numerical precision. */ indxc = new int[n];
indxr = new int[n];
int gaussj(double *a, double *b, int n) ipiv = new int[n];
{
// Allocate pivot array and workspace int i, icol, irow, j, k, l, ll;
lapack_int *ipiv = new lapack_int[n]; double big, dum, pivinv;
lapack_int info;
for (j = 0; j < n; j++)
// Make a copy of matrix a for solving (dgesv modifies it to LU form) ipiv[j] = 0;
double *a_copy = new double[n * n]; for (i = 0; i < n; i++)
for (int i = 0; i < n * n; i++) { {
a_copy[i] = a[i]; big = 0.0;
} for (j = 0; j < n; j++)
if (ipiv[j] != 1)
// Step 1: Solve linear system A*x = b using LU decomposition for (k = 0; k < n; k++)
// LAPACKE_dgesv uses column-major by default, but we use row-major {
info = LAPACKE_dgesv(LAPACK_ROW_MAJOR, n, 1, a_copy, n, ipiv, b, 1); if (ipiv[k] == 0)
{
if (info != 0) { if (fabs(a[j * n + k]) >= big)
cout << "gaussj: Singular Matrix (dgesv info=" << info << ")" << endl; {
delete[] ipiv; big = fabs(a[j * n + k]);
delete[] a_copy; irow = j;
return 1; icol = k;
} }
}
// Step 2: Compute matrix inverse A^(-1) using LU factorization else if (ipiv[k] > 1)
// First do LU factorization of original matrix a {
info = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, n, n, a, n, ipiv); cout << "gaussj: Singular Matrix-1" << endl;
return 1;
if (info != 0) { }
cout << "gaussj: Singular Matrix (dgetrf info=" << info << ")" << endl; }
delete[] ipiv;
delete[] a_copy; ipiv[icol] = ipiv[icol] + 1;
return 1; if (irow != icol)
} {
for (l = 0; l < n; l++)
// Then compute inverse from LU factorization {
info = LAPACKE_dgetri(LAPACK_ROW_MAJOR, n, a, n, ipiv); swap = a[irow * n + l];
a[irow * n + l] = a[icol * n + l];
if (info != 0) { a[icol * n + l] = swap;
cout << "gaussj: Singular Matrix (dgetri info=" << info << ")" << endl; }
delete[] ipiv;
delete[] a_copy; swap = b[irow];
return 1; b[irow] = b[icol];
} b[icol] = swap;
}
delete[] ipiv;
delete[] a_copy; indxr[i] = irow;
indxc[i] = icol;
return 0;
} if (a[icol * n + icol] == 0.0)
{
cout << "gaussj: Singular Matrix-2" << endl;
return 1;
}
pivinv = 1.0 / a[icol * n + icol];
a[icol * n + icol] = 1.0;
for (l = 0; l < n; l++)
a[icol * n + l] *= pivinv;
b[icol] *= pivinv;
for (ll = 0; ll < n; ll++)
if (ll != icol)
{
dum = a[ll * n + icol];
a[ll * n + icol] = 0.0;
for (l = 0; l < n; l++)
a[ll * n + l] -= a[icol * n + l] * dum;
b[ll] -= b[icol] * dum;
}
}
for (l = n - 1; l >= 0; l--)
{
if (indxr[l] != indxc[l])
for (k = 0; k < n; k++)
{
swap = a[k * n + indxr[l]];
a[k * n + indxr[l]] = a[k * n + indxc[l]];
a[k * n + indxc[l]] = swap;
}
}
delete[] indxc;
delete[] indxr;
delete[] ipiv;
return 0;
}
// for check usage // for check usage
/* /*
int main() int main()

View File

@@ -29,6 +29,16 @@
#define REGLEV 0 #define REGLEV 0
#define BSSN_FINE_TIMING 0
#define BSSN_FINE_TIMING_EVERY 1
#define BSSN_FINE_TIMING_TOPN 8
#define BSSN_KERNEL_FINE_TIMING 0
#define BSSN_ENABLE_STDIN_ABORT_POLL 0
//#define USE_GPU //#define USE_GPU
//#define CHECKDETAIL //#define CHECKDETAIL
@@ -88,6 +98,21 @@
// 0: for every level; // 0: for every level;
// 1: for all // 1: for all
// //
// define BSSN_FINE_TIMING
// enable fine-grained per-timestep timing monitor
//
// define BSSN_FINE_TIMING_EVERY
// report timing every N coarse timesteps
//
// define BSSN_FINE_TIMING_TOPN
// number of hottest timing buckets shown in stdout
//
// define BSSN_KERNEL_FINE_TIMING
// enable split timing inside compute_rhs_bssn
//
// define BSSN_ENABLE_STDIN_ABORT_POLL
// poll stdin and broadcast abort flag every coarse step
//
// define USE_GPU // define USE_GPU
// use gpu or not // use gpu or not
// //
@@ -142,4 +167,3 @@
#define TINY 1e-10 #define TINY 1e-10
#endif /* MICRODEF_H */ #endif /* MICRODEF_H */

View File

@@ -8,30 +8,19 @@ include makefile.inc
POLINT6_USE_BARY ?= 1 POLINT6_USE_BARY ?= 1
POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY) POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt) ## Legacy GNU/OpenMPI flags
## make -> opt (PGO-guided, maximum performance) CXXBASEFLAGS = -O3 -march=native -Wno-deprecated -Dfortran3 -Dnewc $(INTERP_LB_FLAGS)
## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data) F90BASEFLAGS = -O3 -march=native -cpp -fallow-argument-mismatch $(POLINT6_FLAG)
PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
ifeq ($(PGO_MODE),instrument)
ifeq ($(PGO_MODE),instrument) CXXAPPFLAGS = $(CXXBASEFLAGS)
## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability f90appflags = $(F90BASEFLAGS)
CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
else else
## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \ CXXAPPFLAGS = $(CXXBASEFLAGS)
## PGO has been turned off, now tested and found to be negative optimization f90appflags = $(F90BASEFLAGS)
## INTERP_LB_FLAGS has been turned off too, now tested and found to be negative optimization
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
endif endif
.SUFFIXES: .o .f90 .C .for .cu .SUFFIXES: .o .f90 .C .for .cu
.f90.o: .f90.o:
$(f90) $(f90appflags) -c $< -o $@ $(f90) $(f90appflags) -c $< -o $@
@@ -67,17 +56,14 @@ lopsided_kodis_c.o: lopsided_kodis_c.C
#interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h #interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
# ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ # ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS ## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata TP_OPTFLAGS = $(CXXBASEFLAGS) $(TP_OPENMP_FLAGS)
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-fprofile-instr-use=$(TP_PROFDATA) \ TwoPunctures.o: TwoPunctures.C
-Dfortran3 -Dnewc -I${MKLROOT}/include ${CXX} $(TP_OPTFLAGS) -c $< -o $@
TwoPunctures.o: TwoPunctures.C TwoPunctureABE.o: TwoPunctureABE.C
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@ ${CXX} $(TP_OPTFLAGS) -c $< -o $@
TwoPunctureABE.o: TwoPunctureABE.C
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
# Input files # Input files
@@ -105,12 +91,13 @@ C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\ Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o
C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
cgh.o bssn_class.o surface_integral.o ShellPatch.o\ cgh.o surface_integral.o ShellPatch.o\
bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\ bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\ bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\ Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
NullShellPatch2_Evo.o bssn_cuda_step.o writefile_f.o NullShellPatch2_Evo.o \
bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o
F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\ F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
prolongrestrict_cell.o prolongrestrict_vertex.o\ prolongrestrict_cell.o prolongrestrict_vertex.o\
@@ -142,7 +129,7 @@ initial_guess.o Newton.o Jacobian.o ilucg.o IntPnts0.o IntPnts.o
TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o
CUDAFILES = bssn_gpu.o bssn_cuda_ops.o CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o
# file dependences # file dependences
$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh $(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh
@@ -183,8 +170,8 @@ ABE: $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS) $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
TwoPunctureABE: $(TwoPunctureFILES) TwoPunctureABE: $(TwoPunctureFILES)
$(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS) $(CLINKER) $(TP_OPTFLAGS) -o $@ $(TwoPunctureFILES) $(LDLIBS)
clean: clean:
rm *.o ABE ABEGPU TwoPunctureABE make.log -f rm *.o ABE ABEGPU TwoPunctureABE make.log -f

57
AMSS_NCKU_source/makefile.inc Executable file → Normal file
View File

@@ -1,36 +1,27 @@
## GCC version (commented out) ## Legacy GNU/OpenMPI toolchain configuration
## filein = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
## filein = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
## LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
## Intel oneAPI version with oneMKL (Optimized for performance) ## OpenMPI wrappers are installed but may not be on PATH.
filein = -I/usr/include/ -I${MKLROOT}/include OMPI_BIN ?= /usr/lib64/openmpi/bin
## Using sequential MKL (OpenMP disabled for better single-threaded performance) ## Wrapper compilers
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library f90 = $(OMPI_BIN)/mpifort
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5 f77 = $(OMPI_BIN)/mpifort
CUDA_LDLIBS = -L/usr/local/cuda-12.9/targets/x86_64-linux/lib -lcudart CXX = $(OMPI_BIN)/mpicxx
CC = $(OMPI_BIN)/mpicc
CLINKER = $(OMPI_BIN)/mpicxx
## Memory allocator switch ## Extra include flags are not needed when using the OpenMPI wrappers.
## 1 (default) : link Intel oneTBB allocator (libtbbmalloc) filein =
## 0 : use system default allocator (ptmalloc)
USE_TBBMALLOC ?= 1
TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
ifneq ($(wildcard $(TBBMALLOC_SO)),)
TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
else
TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
endif
ifeq ($(USE_TBBMALLOC),1)
LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
endif
LDLIBS := $(CUDA_LDLIBS) $(LDLIBS) ## BLAS/LAPACK backend:
## OpenBLAS on this system provides BLAS, CBLAS and LAPACK symbols.
BLAS_LAPACK_LIB ?= /lib64/libopenblaso.so.0
LDLIBS = $(BLAS_LAPACK_LIB) -lgfortran -lpthread -lm -ldl
## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags) ## PGO build mode switch
## opt : (default) maximum performance with PGO profile-guided optimization ## off : default legacy GNU build without PGO
## instrument : PGO Phase 1 instrumentation to collect fresh profile data ## instrument : accepted for compatibility, currently same as off
PGO_MODE ?= opt PGO_MODE ?= off
## Interp_Points load balance profiling mode ## Interp_Points load balance profiling mode
## off : (default) no load balance instrumentation ## off : (default) no load balance instrumentation
@@ -52,17 +43,13 @@ endif
USE_CXX_KERNELS ?= 1 USE_CXX_KERNELS ?= 1
## RK4 kernel implementation switch ## RK4 kernel implementation switch
## 1 (default) : use C/C++ rewrite of rungekutta4_rout (for optimization experiments) ## 1 (default) : use C/C++ rewrite of rungekutta4_rout
## 0 : use original Fortran rungekutta4_rout.o ## 0 : use original Fortran rungekutta4_rout.o
USE_CXX_RK4 ?= 1 USE_CXX_RK4 ?= 1
f90 = ifx ## OpenMP is only used for TwoPunctures on the legacy toolchain.
f77 = ifx TP_OPENMP_FLAGS ?= -fopenmp
CXX = icpx
CC = icx
CLINKER = mpiicpx
Cu = nvcc Cu = nvcc
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
#CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc

File diff suppressed because it is too large Load Diff

View File

@@ -20,24 +20,14 @@ using namespace std;
#include "cgh.h" #include "cgh.h"
#include "ShellPatch.h" #include "ShellPatch.h"
#include "NullShellPatch.h" #include "NullShellPatch.h"
#include "NullShellPatch2.h" #include "NullShellPatch2.h"
#include "var.h" #include "var.h"
#include "monitor.h" #include "monitor.h"
#include <map>
class surface_integral class surface_integral
{ {
private: private:
struct SpherePointCache
{
double *pox[3];
SpherePointCache()
{
pox[0] = pox[1] = pox[2] = 0;
}
};
int Symmetry, factor; int Symmetry, factor;
int N_theta, N_phi; // Number of points in Theta & Phi directions int N_theta, N_phi; // Number of points in Theta & Phi directions
double dphi, dcostheta; double dphi, dcostheta;
@@ -46,16 +36,15 @@ private:
double *nx_g, *ny_g, *nz_g; // global list of unit normals double *nx_g, *ny_g, *nz_g; // global list of unit normals
int myrank, cpusize; int myrank, cpusize;
map<double, SpherePointCache> sphere_point_cache; int wave_cache_spinw, wave_cache_maxl, wave_cache_modes;
map<int, double *> shellf_cache; double *wave_theta_pos, *wave_theta_neg;
double *wave_phi_cos, *wave_phi_sin;
void get_surface_points(double rex, double **pox); void clear_wave_cache();
double *get_shellf_buffer(int num_var); void build_wave_cache(int spinw, int maxl);
void release_cached_buffers();
public: public:
surface_integral(int iSymmetry); surface_integral(int iSymmetry);
~surface_integral(); ~surface_integral();
void surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4, void surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,
int spinw, int maxl, int NN, double *RP, double *IP, int spinw, int maxl, int NN, double *RP, double *IP,
@@ -93,21 +82,37 @@ public:
double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &,
double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &,
double &, double &)); // NN is the length of RP and IP double &, double &)); // NN is the length of RP and IP
void surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var *trK, void surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz, var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz, var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz, var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs, var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
double *Rout, monitor *Monitor); double *Rout, monitor *Monitor, bool refresh_mass_fields = true);
void surf_MassPAng(double rex, int lev, ShellPatch *GH, var *chi, var *trK, void surf_MassPAng(double rex, int lev, ShellPatch *GH, var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz, var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz, var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz, var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs, var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
double *Rout, monitor *Monitor); double *Rout, monitor *Monitor, bool refresh_mass_fields = true);
void surf_Wave(double rex, cgh *GH, ShellPatch *SH, void surf_WaveMassPAng(double rex, int lev, cgh *GH,
var *chi, var *trK, var *Rpsi4, var *Ipsi4, int spinw, int maxl, int NN, double *RP, double *IP,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz, var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
double *Rout, monitor *Monitor, bool refresh_mass_fields = true);
void surf_WaveMassPAng(double rex, int lev, ShellPatch *GH,
var *Rpsi4, var *Ipsi4, int spinw, int maxl, int NN, double *RP, double *IP,
var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
double *Rout, monitor *Monitor, bool refresh_mass_fields = true);
void surf_Wave(double rex, cgh *GH, ShellPatch *SH,
var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz, var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *chix, var *chiy, var *chiz, var *chix, var *chiy, var *chiz,
var *trKx, var *trKy, var *trKz, var *trKx, var *trKy, var *trKz,
@@ -126,12 +131,12 @@ public:
bool SR_Interp_Points(MyList<var> *VarList, cgh *GH, ShellPatch *SH, bool SR_Interp_Points(MyList<var> *VarList, cgh *GH, ShellPatch *SH,
int NN, double **XX, double *Shellf); int NN, double **XX, double *Shellf);
void surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var *trK, void surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz, var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz, var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz, var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs, // temparay memory for mass^i var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs, // temparay memory for mass^i
double *Rout, monitor *Monitor, MPI_Comm Comm_here); double *Rout, monitor *Monitor, MPI_Comm Comm_here, bool refresh_mass_fields = true);
void surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4, void surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,
int spinw, int maxl, int NN, double *RP, double *IP, int spinw, int maxl, int NN, double *RP, double *IP,
monitor *Monitor, MPI_Comm Comm_here); monitor *Monitor, MPI_Comm Comm_here);

View File

@@ -93,11 +93,13 @@ Here, we take the Ubuntu 22.04 system as an example
#### How to use AMSS-NCKU #### How to use AMSS-NCKU
0. Setting the parameters for compilation 0. Setting the parameters for compilation
Modify the makefile.inc file in the AMSS_NCKU_source directory and change the settings according to your computer. Modify the makefile.inc file in the AMSS_NCKU_source directory and change the settings according to your computer.
The settings for the Ubuntu 22.04 system do not need to be modified. The default configuration in this branch uses GNU compilers through the OpenMPI wrappers under `/usr/lib64/openmpi/bin`.
If your OpenMPI installation is in another location, update `OMPI_BIN` in `AMSS_NCKU_source/makefile.inc` or export `AMSS_OPENMPI_BIN` before running the Python launcher.
1. Enter the AMSS-NCKU Python code folder and modify the input. 1. Enter the AMSS-NCKU Python code folder and modify the input.

View File

@@ -144,6 +144,62 @@ def generate_macrodef_h():
print( "#define REGLEV 0", file=file1 ) print( "#define REGLEV 0", file=file1 )
print( file=file1 ) print( file=file1 )
# Define fine-grained timing/debug macros.
# All of them default to OFF so production builds do not pay profiling overhead.
fine_timing = getattr(input_data, "Fine_Timing",
getattr(input_data, "Finegrained_Timing", "no"))
kernel_fine_timing = getattr(input_data, "Kernel_Fine_Timing",
getattr(input_data, "BSSN_Kernel_Fine_Timing", "no"))
stdin_abort_poll = getattr(input_data, "Enable_Stdin_Abort_Poll",
getattr(input_data, "Stdin_Abort_Poll", "no"))
timing_report_every = max(1, int(getattr(
input_data, "Timing_Every_Steps",
getattr(input_data, "Timing_Report_Every", 1))))
timing_top_hotspots = max(1, int(getattr(
input_data, "Timing_Top_Hotspots", 8)))
if ( fine_timing == "yes" ):
print( "#define BSSN_FINE_TIMING 1", file=file1 )
print( file=file1 )
elif ( fine_timing == "no" ):
print( "#define BSSN_FINE_TIMING 0", file=file1 )
print( file=file1 )
else:
print( "Fine_Timing setting error!!!" )
print()
print( "# Fine_Timing setting error!!!", file=file1 )
print( file=file1 )
print( f"#define BSSN_FINE_TIMING_EVERY {timing_report_every}", file=file1 )
print( file=file1 )
print( f"#define BSSN_FINE_TIMING_TOPN {timing_top_hotspots}", file=file1 )
print( file=file1 )
if ( kernel_fine_timing == "yes" ):
print( "#define BSSN_KERNEL_FINE_TIMING 1", file=file1 )
print( file=file1 )
elif ( kernel_fine_timing == "no" ):
print( "#define BSSN_KERNEL_FINE_TIMING 0", file=file1 )
print( file=file1 )
else:
print( "Kernel_Fine_Timing setting error!!!" )
print()
print( "# Kernel_Fine_Timing setting error!!!", file=file1 )
print( file=file1 )
if ( stdin_abort_poll == "yes" ):
print( "#define BSSN_ENABLE_STDIN_ABORT_POLL 1", file=file1 )
print( file=file1 )
elif ( stdin_abort_poll == "no" ):
print( "#define BSSN_ENABLE_STDIN_ABORT_POLL 0", file=file1 )
print( file=file1 )
else:
print( "Enable_Stdin_Abort_Poll setting error!!!" )
print()
print( "# Enable_Stdin_Abort_Poll setting error!!!", file=file1 )
print( file=file1 )
# Define macro USE_GPU # Define macro USE_GPU
# use GPU or not # use GPU or not
@@ -224,6 +280,21 @@ def generate_macrodef_h():
print( "// 0: for every level;", file=file1 ) print( "// 0: for every level;", file=file1 )
print( "// 1: for all", file=file1 ) print( "// 1: for all", file=file1 )
print( "//", file=file1 ) print( "//", file=file1 )
print( "// define BSSN_FINE_TIMING", file=file1 )
print( "// enable fine-grained per-timestep timing monitor", file=file1 )
print( "//", file=file1 )
print( "// define BSSN_FINE_TIMING_EVERY", file=file1 )
print( "// report timing every N coarse timesteps", file=file1 )
print( "//", file=file1 )
print( "// define BSSN_FINE_TIMING_TOPN", file=file1 )
print( "// number of hottest timing buckets shown in stdout", file=file1 )
print( "//", file=file1 )
print( "// define BSSN_KERNEL_FINE_TIMING", file=file1 )
print( "// enable split timing inside compute_rhs_bssn", file=file1 )
print( "//", file=file1 )
print( "// define BSSN_ENABLE_STDIN_ABORT_POLL", file=file1 )
print( "// poll stdin and broadcast abort flag every coarse step", file=file1 )
print( "//", file=file1 )
print( "// define USE_GPU", file=file1 ) print( "// define USE_GPU", file=file1 )
print( "// use gpu or not", file=file1 ) print( "// use gpu or not", file=file1 )
print( "//", file=file1 ) print( "//", file=file1 )

View File

@@ -53,53 +53,13 @@ NUMACTL_CPU_BIND = get_last_n_cores_per_socket(n=32)
## Build parallelism: match the number of bound cores ## Build parallelism: match the number of bound cores
BUILD_JOBS = 64 BUILD_JOBS = 64
OPENMPI_BIN = os.environ.get("AMSS_OPENMPI_BIN", "/usr/lib64/openmpi/bin")
MPI_RUNNER = os.path.join(OPENMPI_BIN, "mpirun")
################################################################## ##################################################################
##################################################################
def prepare_gpu_runtime_env():
"""
Create a user-private CUDA MPS environment for GPU runs.
On shared machines another user's daemon may already occupy the default
/tmp/nvidia-mps pipe directory, which makes plain cudaSetDevice/cudaMalloc
fail with cudaErrorMpsConnectionFailed. Binding AMSS-NCKU to a private
pipe directory avoids cross-user interference.
"""
env = os.environ.copy()
pipe_dir = env.get("CUDA_MPS_PIPE_DIRECTORY", f"/tmp/amss-ncku-mps-{os.getuid()}")
log_dir = env.get("CUDA_MPS_LOG_DIRECTORY", f"/tmp/amss-ncku-mps-log-{os.getuid()}")
os.makedirs(pipe_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)
env["CUDA_MPS_PIPE_DIRECTORY"] = pipe_dir
env["CUDA_MPS_LOG_DIRECTORY"] = log_dir
control_socket = os.path.join(pipe_dir, "control")
if not os.path.exists(control_socket):
start = subprocess.run(
["nvidia-cuda-mps-control", "-d"],
env=env,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
if start.returncode != 0:
print(f" Warning: failed to start private CUDA MPS daemon in {pipe_dir}")
else:
print(f" Using private CUDA MPS pipe directory: {pipe_dir}")
else:
print(f" Using existing private CUDA MPS pipe directory: {pipe_dir}")
return env
##################################################################
################################################################## ##################################################################
@@ -189,29 +149,16 @@ def run_ABE():
## Define the command to run; cast other values to strings as needed ## Define the command to run; cast other values to strings as needed
run_env = None
if (input_data.GPU_Calculation == "no"): if (input_data.GPU_Calculation == "no"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" mpi_command = NUMACTL_CPU_BIND + " " + MPI_RUNNER + " -np " + str(input_data.MPI_processes) + " ./ABE"
#mpi_command = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" #mpi_command = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
mpi_command_outfile = "ABE_out.log" mpi_command_outfile = "ABE_out.log"
elif (input_data.GPU_Calculation == "yes"): elif (input_data.GPU_Calculation == "yes"):
run_env = prepare_gpu_runtime_env() mpi_command = NUMACTL_CPU_BIND + " " + MPI_RUNNER + " -np " + str(input_data.MPI_processes) + " ./ABEGPU"
if int(input_data.MPI_processes) == 1:
mpi_command = "./ABEGPU"
else:
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
mpi_command_outfile = "ABEGPU_out.log" mpi_command_outfile = "ABEGPU_out.log"
## Execute the MPI command and stream output ## Execute the MPI command and stream output
mpi_process = subprocess.Popen( mpi_process = subprocess.Popen(mpi_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
mpi_command,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
env=run_env,
)
## Write ABE run output to file while printing to stdout ## Write ABE run output to file while printing to stdout
with open(mpi_command_outfile, 'w') as file0: with open(mpi_command_outfile, 'w') as file0: