diff --git a/AMSS_NCKU_source/Z4c_class.C b/AMSS_NCKU_source/Z4c_class.C index 79ed62b..1775761 100644 --- a/AMSS_NCKU_source/Z4c_class.C +++ b/AMSS_NCKU_source/Z4c_class.C @@ -1,10 +1,10 @@ #ifdef newc -#include -#include -#include -#include -using namespace std; +#include +#include +#include +#include +using namespace std; #else #include #include @@ -27,20 +27,23 @@ using namespace std; #include "shellfunctions.h" #include "cpbc.h" #include "kodiss.h" -#include "parameters.h" - -#ifndef USE_CUDA_Z4C -#define USE_CUDA_Z4C 0 -#endif - -#if USE_CUDA_Z4C && (ABEtype == 2) -#include "z4c_rhs_cuda.h" -#endif - -#ifdef With_AHF -#include "derivatives.h" -#include "myglobal.h" -#endif +#include "parameters.h" + +#ifndef USE_CUDA_Z4C +#define USE_CUDA_Z4C 0 +#endif + +#if USE_CUDA_Z4C && (ABEtype == 2) +#include "z4c_rhs_cuda.h" +#endif +#if USE_CUDA_BSSN +#include "bssn_rhs_cuda.h" +#endif + +#ifdef With_AHF +#include "derivatives.h" +#include "myglobal.h" +#endif //================================================================================================ @@ -136,19 +139,19 @@ void Z4c_class::Initialize() { CheckPoint->read_Black_Hole_position(BH_num_input, BH_num, Porg0, Pmom, Spin, Mass, Porgbr, Porg, Porg1, Porg_rhs); } - else - { - PhysTime = StartTime; - Setup_Black_Hole_position(); - } - - sync_cache_pre = new Parallel::SyncCache[GH->levels]; - sync_cache_cor = new Parallel::SyncCache[GH->levels]; - sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels]; - sync_cache_rp_fine = new Parallel::SyncCache[GH->levels]; - sync_cache_restrict = new Parallel::SyncCache[GH->levels]; - sync_cache_outbd = new Parallel::SyncCache[GH->levels]; -} + else + { + PhysTime = StartTime; + Setup_Black_Hole_position(); + } + + sync_cache_pre = new Parallel::SyncCache[GH->levels]; + sync_cache_cor = new Parallel::SyncCache[GH->levels]; + sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels]; + sync_cache_rp_fine = new Parallel::SyncCache[GH->levels]; + sync_cache_restrict = new Parallel::SyncCache[GH->levels]; + sync_cache_outbd = new Parallel::SyncCache[GH->levels]; +} //================================================================================================ @@ -183,572 +186,577 @@ Z4c_class::~Z4c_class() #define MRBD 0 // 0: fix BD for meshrefinement level; 1: sommerfeld_bam for them; 2: sommerfeld_yo for them -#ifndef CPBC -// for sommerfeld boundary - -#if USE_CUDA_Z4C && (ABEtype == 2) -#ifdef WithShell -#error "USE_CUDA_Z4C resident path currently supports Cartesian non-shell Z4C only" -#endif -#if (MRBD == 2) -#error "USE_CUDA_Z4C resident path does not support MRBD == 2" -#endif - -namespace { - -static const int k_z4c_cuda_bh_state_indices[3] = {18, 19, 20}; - -bool fill_z4c_cuda_views(Block *cg, MyList *vars, - double **host_views, - double *propspeeds = 0, - double *soa_flat = 0) -{ - int idx = 0; - while (vars && idx < Z4C_CUDA_STATE_COUNT) - { - host_views[idx] = cg->fgfs[vars->data->sgfn]; - if (propspeeds) - propspeeds[idx] = vars->data->propspeed; - if (soa_flat) - { - soa_flat[3 * idx + 0] = vars->data->SoA[0]; - soa_flat[3 * idx + 1] = vars->data->SoA[1]; - soa_flat[3 * idx + 2] = vars->data->SoA[2]; - } - vars = vars->next; - ++idx; - } - return idx == Z4C_CUDA_STATE_COUNT && vars == 0; -} - -void z4c_cuda_download_level_state(MyList *PatL, MyList *vars, int myrank, bool release_ctx) -{ - MyList *Pp = PatL; - while (Pp) - { - MyList *BP = Pp->data->blb; - while (BP) - { - Block *cg = BP->data; - if (myrank == cg->rank && z4c_cuda_has_resident_state(cg)) - { - double *state_out[Z4C_CUDA_STATE_COUNT]; - if (!fill_z4c_cuda_views(cg, vars, state_out)) - { - cout << "CUDA Z4C state list mismatch on resident state download" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - if (z4c_cuda_download_resident_state(cg, cg->shape, state_out)) - { - cout << "CUDA Z4C resident state download failed" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - if (release_ctx) - z4c_cuda_release_step_ctx(cg); - } - if (BP == Pp->data->ble) - break; - BP = BP->next; - } - Pp = Pp->next; - } -} - -bool z4c_cuda_patch_contains_point(Patch *patch, const double *point) -{ - if (!patch) - return false; - for (int d = 0; d < dim; d++) - { - const double h = patch->getdX(d); - const double lo = patch->bbox[d] + patch->lli[d] * h; - const double hi = patch->bbox[dim + d] - patch->uui[d] * h; - if (point[d] < lo || point[d] > hi) - return false; - } - return true; -} - -bool z4c_cuda_point_in_block(Patch *patch, Block *block, - const double *point, const double *DH) -{ - if (!patch || !block) - return false; - for (int d = 0; d < dim; d++) - { - double llb; - double uub; -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - llb = (feq(block->bbox[d], patch->bbox[d], DH[d] / 2)) - ? block->bbox[d] + patch->lli[d] * DH[d] - : block->bbox[d] + (ghost_width - 0.5) * DH[d]; - uub = (feq(block->bbox[dim + d], patch->bbox[dim + d], DH[d] / 2)) - ? block->bbox[dim + d] - patch->uui[d] * DH[d] - : block->bbox[dim + d] - (ghost_width - 0.5) * DH[d]; -#else -#ifdef Cell - llb = (feq(block->bbox[d], patch->bbox[d], DH[d] / 2)) - ? block->bbox[d] + patch->lli[d] * DH[d] - : block->bbox[d] + ghost_width * DH[d]; - uub = (feq(block->bbox[dim + d], patch->bbox[dim + d], DH[d] / 2)) - ? block->bbox[dim + d] - patch->uui[d] * DH[d] - : block->bbox[dim + d] - ghost_width * DH[d]; -#else -#error Not define Vertex nor Cell -#endif -#endif - if (point[d] - llb < -DH[d] / 2 || point[d] - uub > DH[d] / 2) - return false; - } - return true; -} - -int z4c_cuda_interp_tile_start(const double *coords, int n, double x, double dx, int ordn) -{ - if (!coords || n <= ordn) - return 0; - int cxi = int((x - coords[0]) / dx + 0.4) + 1; - int start = cxi - ordn / 2; - if (start < 0) - start = 0; - const int max_start = n - ordn; - if (start > max_start) - start = max_start; - return start; -} - -bool z4c_cuda_interp_bh_point_resident(MyList *PatL, - int myrank, - const double *point, - var *forx, var *fory, var *forz, - int Symmetry, - double *shellf) -{ - const int ordn = 2 * ghost_width; - int owner_rank = -1; - - shellf[0] = shellf[1] = shellf[2] = 0.0; - - MyList *PL = PatL; - while (PL) - { - Patch *patch = PL->data; - if (!z4c_cuda_patch_contains_point(patch, point)) - { - PL = PL->next; - continue; - } - - double DH[dim]; - for (int d = 0; d < dim; d++) - DH[d] = patch->getdX(d); - - MyList *BP = patch->blb; - while (BP) - { - Block *block = BP->data; - if (z4c_cuda_point_in_block(patch, block, point, DH)) - { - owner_rank = block->rank; - if (myrank == owner_rank) - { - int interp_ordn = ordn; - int interp_sym = Symmetry; - double x = point[0]; - double y = point[1]; - double z = point[2]; - - if (z4c_cuda_has_resident_state(block) && - block->shape[0] >= ordn && block->shape[1] >= ordn && block->shape[2] >= ordn) - { - const int sx = ordn; - const int sy = ordn; - const int sz = ordn; - const int region_all = sx * sy * sz; - const int i0 = z4c_cuda_interp_tile_start(block->X[0], block->shape[0], x, DH[0], ordn); - const int j0 = z4c_cuda_interp_tile_start(block->X[1], block->shape[1], y, DH[1], ordn); - const int k0 = z4c_cuda_interp_tile_start(block->X[2], block->shape[2], z, DH[2], ordn); - double *packed_fields = new double[3 * region_all]; - var *vars[3] = {forx, fory, forz}; - for (int f = 0; f < 3; f++) - { - if (z4c_cuda_pack_state_region_to_host_buffer(block, - k_z4c_cuda_bh_state_indices[f], - packed_fields + f * region_all, - block->shape, - i0, j0, k0, - sx, sy, sz) != 0) - { - delete[] packed_fields; - cout << "CUDA Z4C BH tile download failed" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - int tile_shape[3] = {sx, sy, sz}; - f_global_interp(tile_shape, - block->X[0] + i0, - block->X[1] + j0, - block->X[2] + k0, - packed_fields + f * region_all, - shellf[f], - x, y, z, - interp_ordn, - vars[f]->SoA, - interp_sym); - } - delete[] packed_fields; - } - else - { - f_global_interp(block->shape, block->X[0], block->X[1], block->X[2], - block->fgfs[forx->sgfn], shellf[0], - x, y, z, interp_ordn, forx->SoA, interp_sym); - f_global_interp(block->shape, block->X[0], block->X[1], block->X[2], - block->fgfs[fory->sgfn], shellf[1], - x, y, z, interp_ordn, fory->SoA, interp_sym); - f_global_interp(block->shape, block->X[0], block->X[1], block->X[2], - block->fgfs[forz->sgfn], shellf[2], - x, y, z, interp_ordn, forz->SoA, interp_sym); - } - } - break; - } - if (BP == patch->ble) - break; - BP = BP->next; - } - - if (owner_rank >= 0) - break; - PL = PL->next; - } - - if (owner_rank < 0) - return false; - - MPI_Bcast(shellf, 3, MPI_DOUBLE, owner_rank, MPI_COMM_WORLD); - return true; -} - -bool z4c_cuda_compute_porg_rhs_resident(cgh *GH, - int ilev, - int myrank, - int BH_num, - double **BH_PS, - double **BH_RHS, - var *forx, var *fory, var *forz, - int Symmetry) -{ - for (int n = 0; n < BH_num; n++) - { - double shellf[3] = {0.0, 0.0, 0.0}; - int lev = ilev; - while (lev >= 0 && - !z4c_cuda_interp_bh_point_resident(GH->PatL[lev], myrank, BH_PS[n], - forx, fory, forz, Symmetry, shellf)) - { - --lev; - } - if (lev < 0) - return false; - BH_RHS[n][0] = -shellf[0]; - BH_RHS[n][1] = -shellf[1]; - BH_RHS[n][2] = -shellf[2]; - } - return true; -} - -bool z4c_cuda_resident_step_enabled() -{ - static int enabled = -1; - if (enabled < 0) - { - const char *env = getenv("AMSS_Z4C_CUDA_RESIDENT"); - enabled = (env && atoi(env) != 0) ? 1 : 0; - } - return enabled != 0; -} - -} // namespace -#endif - -void Z4c_class::Step(int lev, int YN) -{ -#if USE_CUDA_Z4C && (ABEtype == 2) - double dT_lev = dT * pow(0.5, Mymax(lev, trfls)); -#ifdef With_AHF - AH_Step_Find(lev, dT_lev); -#endif - bool BB = fgt(PhysTime, StartTime, dT_lev / 2); - double ndeps = numepss; - if (lev < GH->movls) - ndeps = numepsb; - double TRK4 = PhysTime; - int iter_count = 0; - int pre = 0, cor = 1; - int ERROR = 0; - - MyList *Pp = GH->PatL[lev]; - while (Pp) - { - MyList *BP = Pp->data->blb; - while (BP) - { - Block *cg = BP->data; - if (myrank == cg->rank) - { - double *state_in[Z4C_CUDA_STATE_COUNT]; - double *state_out[Z4C_CUDA_STATE_COUNT]; - double propspeed[Z4C_CUDA_STATE_COUNT]; - double soa_flat[3 * Z4C_CUDA_STATE_COUNT]; - if (!fill_z4c_cuda_views(cg, StateList, state_in, propspeed, soa_flat) || - !fill_z4c_cuda_views(cg, SynchList_pre, state_out)) - { - cout << "CUDA Z4C state list mismatch on predictor step" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - int apply_bam_bc = 0; -#if (MRBD == 0) -#if (SommerType == 0) - apply_bam_bc = (lev == 0) ? 1 : 0; -#endif -#elif (MRBD == 1) - apply_bam_bc = 1; -#endif - int keep_resident_state = z4c_cuda_resident_step_enabled() ? 1 : 0; - int apply_enforce_ga = 0; -#if (AGM == 0) - apply_enforce_ga = 1; -#endif - if (z4c_cuda_rk4_substep(cg, - cg->shape, cg->X[0], cg->X[1], cg->X[2], - state_in, state_out, - propspeed, soa_flat, Pp->data->bbox, - dT_lev, TRK4, iter_count, apply_bam_bc, - Symmetry, lev, ndeps, pre, - keep_resident_state, apply_enforce_ga, chitiny)) - { - cout << "CUDA Z4C predictor substep failed in domain: (" - << cg->bbox[0] << ":" << cg->bbox[3] << "," - << cg->bbox[1] << ":" << cg->bbox[4] << "," - << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; - ERROR = 1; - } - } - if (BP == Pp->data->ble) - break; - BP = BP->next; - } - Pp = Pp->next; - } - - { - int erh = ERROR; - MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - } - if (ERROR) - { - if (myrank == 0 && ErrorMonitor->outfile) - ErrorMonitor->outfile << "CUDA Z4C failed in predictor at t = " << PhysTime - << ", lev = " << lev << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - - Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]); - - if (BH_num > 0 && lev == GH->levels - 1) - { - compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev); - for (int ithBH = 0; ithBH < BH_num; ithBH++) - { - f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg[ithBH][0], Porg_rhs[ithBH][0], iter_count); - f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg[ithBH][1], Porg_rhs[ithBH][1], iter_count); - f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg[ithBH][2], Porg_rhs[ithBH][2], iter_count); - if (Symmetry > 0) - Porg[ithBH][2] = fabs(Porg[ithBH][2]); - if (Symmetry == 2) - { - Porg[ithBH][0] = fabs(Porg[ithBH][0]); - Porg[ithBH][1] = fabs(Porg[ithBH][1]); - } - } - } - - if ((lev == a_lev) && (LastAnas + dT_lev >= AnasTime)) - z4c_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false); - if (lev == a_lev) - AnalysisStuff(lev, dT_lev); - - for (iter_count = 1; iter_count < 4; iter_count++) - { - if (iter_count == 1 || iter_count == 3) - TRK4 += dT_lev / 2; - Pp = GH->PatL[lev]; - while (Pp) - { - MyList *BP = Pp->data->blb; - while (BP) - { - Block *cg = BP->data; - if (myrank == cg->rank) - { - double *state_in[Z4C_CUDA_STATE_COUNT]; - double *state_out[Z4C_CUDA_STATE_COUNT]; - double propspeed[Z4C_CUDA_STATE_COUNT]; - double soa_flat[3 * Z4C_CUDA_STATE_COUNT]; - if (!fill_z4c_cuda_views(cg, SynchList_pre, state_in, propspeed, soa_flat) || - !fill_z4c_cuda_views(cg, SynchList_cor, state_out)) - { - cout << "CUDA Z4C state list mismatch on corrector step" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - int apply_bam_bc = 0; -#if (MRBD == 0) -#if (SommerType == 0) - apply_bam_bc = (lev == 0) ? 1 : 0; -#endif -#elif (MRBD == 1) - apply_bam_bc = 1; -#endif - int keep_resident_state = z4c_cuda_resident_step_enabled() ? 1 : 0; - int apply_enforce_ga = 0; -#if (AGM == 0) - apply_enforce_ga = 1; -#elif (AGM == 1) - apply_enforce_ga = (iter_count == 3) ? 1 : 0; -#endif - if (z4c_cuda_rk4_substep(cg, - cg->shape, cg->X[0], cg->X[1], cg->X[2], - state_in, state_out, - propspeed, soa_flat, Pp->data->bbox, - dT_lev, TRK4, iter_count, apply_bam_bc, - Symmetry, lev, ndeps, cor, - keep_resident_state, apply_enforce_ga, chitiny)) - { - cout << "CUDA Z4C corrector substep failed in domain: (" - << cg->bbox[0] << ":" << cg->bbox[3] << "," - << cg->bbox[1] << ":" << cg->bbox[4] << "," - << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; - ERROR = 1; - } - } - if (BP == Pp->data->ble) - break; - BP = BP->next; - } - Pp = Pp->next; - } - - { - int erh = ERROR; - MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - } - if (ERROR) - { - if (myrank == 0 && ErrorMonitor->outfile) - ErrorMonitor->outfile << "CUDA Z4C failed in RK4 substep#" << iter_count - << " at t = " << PhysTime - << ", lev = " << lev << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - - Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]); - - if (BH_num > 0 && lev == GH->levels - 1) - { - if (z4c_cuda_resident_step_enabled()) - { - if (!z4c_cuda_compute_porg_rhs_resident(GH, lev, myrank, BH_num, - Porg, Porg1, - Sfx, Sfy, Sfz, Symmetry)) - { - if (myrank == 0 && ErrorMonitor->outfile) - ErrorMonitor->outfile << "CUDA Z4C failed to interpolate black-hole shift at t = " - << PhysTime << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - } - else - { - compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev); - } - for (int ithBH = 0; ithBH < BH_num; ithBH++) - { - f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg1[ithBH][0], Porg_rhs[ithBH][0], iter_count); - f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg1[ithBH][1], Porg_rhs[ithBH][1], iter_count); - f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg1[ithBH][2], Porg_rhs[ithBH][2], iter_count); - if (Symmetry > 0) - Porg1[ithBH][2] = fabs(Porg1[ithBH][2]); - if (Symmetry == 2) - { - Porg1[ithBH][0] = fabs(Porg1[ithBH][0]); - Porg1[ithBH][1] = fabs(Porg1[ithBH][1]); - } - } - } - - if (iter_count < 3) - { - Pp = GH->PatL[lev]; - while (Pp) - { - MyList *BP = Pp->data->blb; - while (BP) - { - Block *cg = BP->data; - cg->swapList(SynchList_pre, SynchList_cor, myrank); - if (BP == Pp->data->ble) - break; - BP = BP->next; - } - Pp = Pp->next; - } - if (BH_num > 0 && lev == GH->levels - 1) - { - for (int ithBH = 0; ithBH < BH_num; ithBH++) - { - Porg[ithBH][0] = Porg1[ithBH][0]; - Porg[ithBH][1] = Porg1[ithBH][1]; - Porg[ithBH][2] = Porg1[ithBH][2]; - } - } - } - } - - z4c_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, false); - -#if (RPS == 0) - RestrictProlong(lev, YN, BB); -#endif - - Pp = GH->PatL[lev]; - while (Pp) - { - MyList *BP = Pp->data->blb; - while (BP) - { - Block *cg = BP->data; - cg->swapList(StateList, SynchList_cor, myrank); - cg->swapList(OldStateList, SynchList_cor, myrank); - if (BP == Pp->data->ble) - break; - BP = BP->next; - } - Pp = Pp->next; - } - if (BH_num > 0 && lev == GH->levels - 1) - { - for (int ithBH = 0; ithBH < BH_num; ithBH++) - { - Porg0[ithBH][0] = Porg1[ithBH][0]; - Porg0[ithBH][1] = Porg1[ithBH][1]; - Porg0[ithBH][2] = Porg1[ithBH][2]; - } - } -#else - double dT_lev = dT * pow(0.5, Mymax(lev, trfls)); +#ifndef CPBC +// for sommerfeld boundary + +#if USE_CUDA_Z4C && (ABEtype == 2) +#if (MRBD == 2) +#error "USE_CUDA_Z4C resident path does not support MRBD == 2" +#endif + +namespace { + +static const int k_z4c_cuda_bh_state_indices[3] = {18, 19, 20}; + +bool fill_z4c_cuda_views(Block *cg, MyList *vars, + double **host_views, + double *propspeeds = 0, + double *soa_flat = 0) +{ + int idx = 0; + while (vars && idx < Z4C_CUDA_STATE_COUNT) + { + host_views[idx] = cg->fgfs[vars->data->sgfn]; + if (propspeeds) + propspeeds[idx] = vars->data->propspeed; + if (soa_flat) + { + soa_flat[3 * idx + 0] = vars->data->SoA[0]; + soa_flat[3 * idx + 1] = vars->data->SoA[1]; + soa_flat[3 * idx + 2] = vars->data->SoA[2]; + } + vars = vars->next; + ++idx; + } + return idx == Z4C_CUDA_STATE_COUNT && vars == 0; +} + +void z4c_cuda_download_level_state(MyList *PatL, MyList *vars, int myrank, bool release_ctx) +{ + MyList *Pp = PatL; + while (Pp) + { + MyList *BP = Pp->data->blb; + while (BP) + { + Block *cg = BP->data; + if (myrank == cg->rank && z4c_cuda_has_resident_state(cg)) + { + double *state_out[Z4C_CUDA_STATE_COUNT]; + if (!fill_z4c_cuda_views(cg, vars, state_out)) + { + cout << "CUDA Z4C state list mismatch on resident state download" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + if (z4c_cuda_download_resident_state(cg, cg->shape, state_out)) + { + cout << "CUDA Z4C resident state download failed" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + if (release_ctx) + z4c_cuda_release_step_ctx(cg); + } + if (BP == Pp->data->ble) + break; + BP = BP->next; + } + Pp = Pp->next; + } +} + +bool z4c_cuda_patch_contains_point(Patch *patch, const double *point) +{ + if (!patch) + return false; + for (int d = 0; d < dim; d++) + { + const double h = patch->getdX(d); + const double lo = patch->bbox[d] + patch->lli[d] * h; + const double hi = patch->bbox[dim + d] - patch->uui[d] * h; + if (point[d] < lo || point[d] > hi) + return false; + } + return true; +} + +bool z4c_cuda_point_in_block(Patch *patch, Block *block, + const double *point, const double *DH) +{ + if (!patch || !block) + return false; + for (int d = 0; d < dim; d++) + { + double llb; + double uub; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + llb = (feq(block->bbox[d], patch->bbox[d], DH[d] / 2)) + ? block->bbox[d] + patch->lli[d] * DH[d] + : block->bbox[d] + (ghost_width - 0.5) * DH[d]; + uub = (feq(block->bbox[dim + d], patch->bbox[dim + d], DH[d] / 2)) + ? block->bbox[dim + d] - patch->uui[d] * DH[d] + : block->bbox[dim + d] - (ghost_width - 0.5) * DH[d]; +#else +#ifdef Cell + llb = (feq(block->bbox[d], patch->bbox[d], DH[d] / 2)) + ? block->bbox[d] + patch->lli[d] * DH[d] + : block->bbox[d] + ghost_width * DH[d]; + uub = (feq(block->bbox[dim + d], patch->bbox[dim + d], DH[d] / 2)) + ? block->bbox[dim + d] - patch->uui[d] * DH[d] + : block->bbox[dim + d] - ghost_width * DH[d]; +#else +#error Not define Vertex nor Cell +#endif +#endif + if (point[d] - llb < -DH[d] / 2 || point[d] - uub > DH[d] / 2) + return false; + } + return true; +} + +int z4c_cuda_interp_tile_start(const double *coords, int n, double x, double dx, int ordn) +{ + if (!coords || n <= ordn) + return 0; + int cxi = int((x - coords[0]) / dx + 0.4) + 1; + int start = cxi - ordn / 2; + if (start < 0) + start = 0; + const int max_start = n - ordn; + if (start > max_start) + start = max_start; + return start; +} + +bool z4c_cuda_interp_bh_point_resident(MyList *PatL, + int myrank, + const double *point, + var *forx, var *fory, var *forz, + int Symmetry, + double *shellf) +{ + const int ordn = 2 * ghost_width; + int owner_rank = -1; + + shellf[0] = shellf[1] = shellf[2] = 0.0; + + MyList *PL = PatL; + while (PL) + { + Patch *patch = PL->data; + if (!z4c_cuda_patch_contains_point(patch, point)) + { + PL = PL->next; + continue; + } + + double DH[dim]; + for (int d = 0; d < dim; d++) + DH[d] = patch->getdX(d); + + MyList *BP = patch->blb; + while (BP) + { + Block *block = BP->data; + if (z4c_cuda_point_in_block(patch, block, point, DH)) + { + owner_rank = block->rank; + if (myrank == owner_rank) + { + int interp_ordn = ordn; + int interp_sym = Symmetry; + double x = point[0]; + double y = point[1]; + double z = point[2]; + + if (z4c_cuda_has_resident_state(block) && + block->shape[0] >= ordn && block->shape[1] >= ordn && block->shape[2] >= ordn) + { + const int sx = ordn; + const int sy = ordn; + const int sz = ordn; + const int region_all = sx * sy * sz; + const int i0 = z4c_cuda_interp_tile_start(block->X[0], block->shape[0], x, DH[0], ordn); + const int j0 = z4c_cuda_interp_tile_start(block->X[1], block->shape[1], y, DH[1], ordn); + const int k0 = z4c_cuda_interp_tile_start(block->X[2], block->shape[2], z, DH[2], ordn); + double *packed_fields = new double[3 * region_all]; + var *vars[3] = {forx, fory, forz}; + for (int f = 0; f < 3; f++) + { + if (z4c_cuda_pack_state_region_to_host_buffer(block, + k_z4c_cuda_bh_state_indices[f], + packed_fields + f * region_all, + block->shape, + i0, j0, k0, + sx, sy, sz) != 0) + { + delete[] packed_fields; + cout << "CUDA Z4C BH tile download failed" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + int tile_shape[3] = {sx, sy, sz}; + f_global_interp(tile_shape, + block->X[0] + i0, + block->X[1] + j0, + block->X[2] + k0, + packed_fields + f * region_all, + shellf[f], + x, y, z, + interp_ordn, + vars[f]->SoA, + interp_sym); + } + delete[] packed_fields; + } + else + { + f_global_interp(block->shape, block->X[0], block->X[1], block->X[2], + block->fgfs[forx->sgfn], shellf[0], + x, y, z, interp_ordn, forx->SoA, interp_sym); + f_global_interp(block->shape, block->X[0], block->X[1], block->X[2], + block->fgfs[fory->sgfn], shellf[1], + x, y, z, interp_ordn, fory->SoA, interp_sym); + f_global_interp(block->shape, block->X[0], block->X[1], block->X[2], + block->fgfs[forz->sgfn], shellf[2], + x, y, z, interp_ordn, forz->SoA, interp_sym); + } + } + break; + } + if (BP == patch->ble) + break; + BP = BP->next; + } + + if (owner_rank >= 0) + break; + PL = PL->next; + } + + if (owner_rank < 0) + return false; + + MPI_Bcast(shellf, 3, MPI_DOUBLE, owner_rank, MPI_COMM_WORLD); + return true; +} + +bool z4c_cuda_compute_porg_rhs_resident(cgh *GH, + int ilev, + int myrank, + int BH_num, + double **BH_PS, + double **BH_RHS, + var *forx, var *fory, var *forz, + int Symmetry) +{ + for (int n = 0; n < BH_num; n++) + { + double shellf[3] = {0.0, 0.0, 0.0}; + int lev = ilev; + while (lev >= 0 && + !z4c_cuda_interp_bh_point_resident(GH->PatL[lev], myrank, BH_PS[n], + forx, fory, forz, Symmetry, shellf)) + { + --lev; + } + if (lev < 0) + return false; + BH_RHS[n][0] = -shellf[0]; + BH_RHS[n][1] = -shellf[1]; + BH_RHS[n][2] = -shellf[2]; + } + return true; +} + +bool z4c_cuda_resident_step_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_Z4C_CUDA_RESIDENT"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +} // namespace +#endif + +void Z4c_class::Step(int lev, int YN) +{ +#if USE_CUDA_Z4C && (ABEtype == 2) + double dT_lev = dT * pow(0.5, Mymax(lev, trfls)); +#ifdef With_AHF + AH_Step_Find(lev, dT_lev); +#endif + bool BB = fgt(PhysTime, StartTime, dT_lev / 2); + double ndeps = numepss; + if (lev < GH->movls) + ndeps = numepsb; + double TRK4 = PhysTime; + int iter_count = 0; + int pre = 0, cor = 1; + int ERROR = 0; + +#ifdef WithShell + if (bssn_cuda_use_resident_sync(lev)) + { + for (int dl = 0; dl < GH->levels; dl++) + bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank); + } +#endif + + MyList *Pp = GH->PatL[lev]; + while (Pp) + { + MyList *BP = Pp->data->blb; + while (BP) + { + Block *cg = BP->data; + if (myrank == cg->rank) + { + double *state_in[Z4C_CUDA_STATE_COUNT]; + double *state_out[Z4C_CUDA_STATE_COUNT]; + double propspeed[Z4C_CUDA_STATE_COUNT]; + double soa_flat[3 * Z4C_CUDA_STATE_COUNT]; + if (!fill_z4c_cuda_views(cg, StateList, state_in, propspeed, soa_flat) || + !fill_z4c_cuda_views(cg, SynchList_pre, state_out)) + { + cout << "CUDA Z4C state list mismatch on predictor step" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + int apply_bam_bc = 0; +#if (MRBD == 0) +#if (SommerType == 0) + apply_bam_bc = (lev == 0) ? 1 : 0; +#endif +#elif (MRBD == 1) + apply_bam_bc = 1; +#endif + int keep_resident_state = z4c_cuda_resident_step_enabled() ? 1 : 0; + int apply_enforce_ga = 0; +#if (AGM == 0) + apply_enforce_ga = 1; +#endif + if (z4c_cuda_rk4_substep(cg, + cg->shape, cg->X[0], cg->X[1], cg->X[2], + state_in, state_out, + propspeed, soa_flat, Pp->data->bbox, + dT_lev, TRK4, iter_count, apply_bam_bc, + Symmetry, lev, ndeps, pre, + keep_resident_state, apply_enforce_ga, chitiny)) + { + cout << "CUDA Z4C predictor substep failed in domain: (" + << cg->bbox[0] << ":" << cg->bbox[3] << "," + << cg->bbox[1] << ":" << cg->bbox[4] << "," + << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; + ERROR = 1; + } + } + if (BP == Pp->data->ble) + break; + BP = BP->next; + } + Pp = Pp->next; + } + + { + int erh = ERROR; + MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); + } + if (ERROR) + { + if (myrank == 0 && ErrorMonitor->outfile) + ErrorMonitor->outfile << "CUDA Z4C failed in predictor at t = " << PhysTime + << ", lev = " << lev << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]); + + if (BH_num > 0 && lev == GH->levels - 1) + { + compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev); + for (int ithBH = 0; ithBH < BH_num; ithBH++) + { + f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg[ithBH][0], Porg_rhs[ithBH][0], iter_count); + f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg[ithBH][1], Porg_rhs[ithBH][1], iter_count); + f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg[ithBH][2], Porg_rhs[ithBH][2], iter_count); + if (Symmetry > 0) + Porg[ithBH][2] = fabs(Porg[ithBH][2]); + if (Symmetry == 2) + { + Porg[ithBH][0] = fabs(Porg[ithBH][0]); + Porg[ithBH][1] = fabs(Porg[ithBH][1]); + } + } + } + + if ((lev == a_lev) && (LastAnas + dT_lev >= AnasTime)) + z4c_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false); + if (lev == a_lev) + AnalysisStuff(lev, dT_lev); + + for (iter_count = 1; iter_count < 4; iter_count++) + { + if (iter_count == 1 || iter_count == 3) + TRK4 += dT_lev / 2; + Pp = GH->PatL[lev]; + while (Pp) + { + MyList *BP = Pp->data->blb; + while (BP) + { + Block *cg = BP->data; + if (myrank == cg->rank) + { + double *state_in[Z4C_CUDA_STATE_COUNT]; + double *state_out[Z4C_CUDA_STATE_COUNT]; + double propspeed[Z4C_CUDA_STATE_COUNT]; + double soa_flat[3 * Z4C_CUDA_STATE_COUNT]; + if (!fill_z4c_cuda_views(cg, SynchList_pre, state_in, propspeed, soa_flat) || + !fill_z4c_cuda_views(cg, SynchList_cor, state_out)) + { + cout << "CUDA Z4C state list mismatch on corrector step" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + int apply_bam_bc = 0; +#if (MRBD == 0) +#if (SommerType == 0) + apply_bam_bc = (lev == 0) ? 1 : 0; +#endif +#elif (MRBD == 1) + apply_bam_bc = 1; +#endif + int keep_resident_state = z4c_cuda_resident_step_enabled() ? 1 : 0; + int apply_enforce_ga = 0; +#if (AGM == 0) + apply_enforce_ga = 1; +#elif (AGM == 1) + apply_enforce_ga = (iter_count == 3) ? 1 : 0; +#endif + if (z4c_cuda_rk4_substep(cg, + cg->shape, cg->X[0], cg->X[1], cg->X[2], + state_in, state_out, + propspeed, soa_flat, Pp->data->bbox, + dT_lev, TRK4, iter_count, apply_bam_bc, + Symmetry, lev, ndeps, cor, + keep_resident_state, apply_enforce_ga, chitiny)) + { + cout << "CUDA Z4C corrector substep failed in domain: (" + << cg->bbox[0] << ":" << cg->bbox[3] << "," + << cg->bbox[1] << ":" << cg->bbox[4] << "," + << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; + ERROR = 1; + } + } + if (BP == Pp->data->ble) + break; + BP = BP->next; + } + Pp = Pp->next; + } + + { + int erh = ERROR; + MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); + } + if (ERROR) + { + if (myrank == 0 && ErrorMonitor->outfile) + ErrorMonitor->outfile << "CUDA Z4C failed in RK4 substep#" << iter_count + << " at t = " << PhysTime + << ", lev = " << lev << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]); + + if (BH_num > 0 && lev == GH->levels - 1) + { + if (z4c_cuda_resident_step_enabled()) + { + if (!z4c_cuda_compute_porg_rhs_resident(GH, lev, myrank, BH_num, + Porg, Porg1, + Sfx, Sfy, Sfz, Symmetry)) + { + if (myrank == 0 && ErrorMonitor->outfile) + ErrorMonitor->outfile << "CUDA Z4C failed to interpolate black-hole shift at t = " + << PhysTime << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + else + { + compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev); + } + for (int ithBH = 0; ithBH < BH_num; ithBH++) + { + f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg1[ithBH][0], Porg_rhs[ithBH][0], iter_count); + f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg1[ithBH][1], Porg_rhs[ithBH][1], iter_count); + f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg1[ithBH][2], Porg_rhs[ithBH][2], iter_count); + if (Symmetry > 0) + Porg1[ithBH][2] = fabs(Porg1[ithBH][2]); + if (Symmetry == 2) + { + Porg1[ithBH][0] = fabs(Porg1[ithBH][0]); + Porg1[ithBH][1] = fabs(Porg1[ithBH][1]); + } + } + } + + if (iter_count < 3) + { + Pp = GH->PatL[lev]; + while (Pp) + { + MyList *BP = Pp->data->blb; + while (BP) + { + Block *cg = BP->data; + cg->swapList(SynchList_pre, SynchList_cor, myrank); + if (BP == Pp->data->ble) + break; + BP = BP->next; + } + Pp = Pp->next; + } + if (BH_num > 0 && lev == GH->levels - 1) + { + for (int ithBH = 0; ithBH < BH_num; ithBH++) + { + Porg[ithBH][0] = Porg1[ithBH][0]; + Porg[ithBH][1] = Porg1[ithBH][1]; + Porg[ithBH][2] = Porg1[ithBH][2]; + } + } + } + } + + z4c_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, false); + +#if (RPS == 0) + RestrictProlong(lev, YN, BB); +#endif + + Pp = GH->PatL[lev]; + while (Pp) + { + MyList *BP = Pp->data->blb; + while (BP) + { + Block *cg = BP->data; + cg->swapList(StateList, SynchList_cor, myrank); + cg->swapList(OldStateList, SynchList_cor, myrank); + if (BP == Pp->data->ble) + break; + BP = BP->next; + } + Pp = Pp->next; + } + if (BH_num > 0 && lev == GH->levels - 1) + { + for (int ithBH = 0; ithBH < BH_num; ithBH++) + { + Porg0[ithBH][0] = Porg1[ithBH][0]; + Porg0[ithBH][1] = Porg1[ithBH][1]; + Porg0[ithBH][2] = Porg1[ithBH][2]; + } + } +#else + double dT_lev = dT * pow(0.5, Mymax(lev, trfls)); #ifdef With_AHF AH_Step_Find(lev, dT_lev); #endif @@ -915,6 +923,13 @@ void Z4c_class::Step(int lev, int YN) } #ifdef WithShell +#if USE_CUDA_Z4C + if (bssn_cuda_use_resident_sync(lev)) + { + for (int dl = 0; dl < GH->levels; dl++) + bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank); + } +#endif // evolve Shell Patches if (lev == 0) { @@ -1615,19 +1630,17 @@ void Z4c_class::Step(int lev, int YN) { Porg0[ithBH][0] = Porg1[ithBH][0]; Porg0[ithBH][1] = Porg1[ithBH][1]; - Porg0[ithBH][2] = Porg1[ithBH][2]; - } - } -#endif -} -#else -// for constraint preserving boundary (CPBC) -#if USE_CUDA_Z4C && (ABEtype == 2) -#error "USE_CUDA_Z4C resident path does not support CPBC" -#endif -#ifndef WithShell -#error "CPBC only supports Shell" -#endif + Porg0[ithBH][2] = Porg1[ithBH][2]; + } + } +#endif +} +#else +// for constraint preserving boundary (CPBC) +// Note: CPBC path uses CPU Fortran RHS; GPU resident sync is a no-op here. +#ifndef WithShell +#error "CPBC only supports Shell" +#endif // 0: extroplate rhs, 1: extroplate variable // 2: extroplate variable but before RHS calculation @@ -1654,6 +1667,14 @@ void Z4c_class::Step(int lev, int YN) int pre = 0, cor = 1; int ERROR = 0; +#if USE_CUDA_Z4C && defined(WithShell) + if (bssn_cuda_use_resident_sync(lev)) + { + for (int dl = 0; dl < GH->levels; dl++) + bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank); + } +#endif + MyList *sPp; // Predictor MyList *Pp = GH->PatL[lev]; @@ -3258,13 +3279,13 @@ void Z4c_class::Interp_Constraint() } } - ofstream outfile; - char suffix[64]; - sprintf(suffix, "/interp_constraint_%05d.dat", int(PhysTime / dT + 0.5)); - string filename = ErrorMonitor->out_dir + suffix; - // 0.5 for round off - - outfile.open(filename.c_str()); + ofstream outfile; + char suffix[64]; + sprintf(suffix, "/interp_constraint_%05d.dat", int(PhysTime / dT + 0.5)); + string filename = ErrorMonitor->out_dir + suffix; + // 0.5 for round off + + outfile.open(filename.c_str()); outfile << "# corrdinate, H_Res, Px_Res, Py_Res, Pz_Res, Gx_Res, Gy_Res, Gz_Res, ...." << endl; for (int i = 0; i < n; i++) { diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index 241cbf1..90ddca7 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -548,6 +548,8 @@ bool fill_bssn_cuda_views_count(Block *cg, MyList *vars, return idx == state_count && vars == 0; } +} // namespace + bool bssn_cuda_use_resident_sync(int lev) { (void)lev; @@ -1032,7 +1034,6 @@ void bssn_cuda_sync_level_bh_fields(MyList *PatL, } } -} // namespace #endif #if !USE_CUDA_BSSN diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.h b/AMSS_NCKU_source/bssn_rhs_cuda.h index 9995bb5..27c7953 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.h +++ b/AMSS_NCKU_source/bssn_rhs_cuda.h @@ -404,6 +404,10 @@ void bssn_cuda_release_step_ctx(void *block_tag); #ifdef __cplusplus } +// C++-only helpers declared for derived equation classes (Z4C, etc.) +// Defined in bssn_class.C. Requires MyList, Patch, var from including TU. +bool bssn_cuda_use_resident_sync(int lev); +void bssn_cuda_download_level_state_if_present(MyList *PatL, MyList *vars, int myrank); #endif #endif