#include "macrodef.h"

#ifdef USE_GPU

// NOTE(review): the two system header names were missing from the text under
// review. <cmath> (pow, fabs) and <iostream> (cerr, endl) are inferred from
// what this file uses -- confirm against the repository copy.
#include <cmath>
#include <iostream>

#include "bssn_class.h"
#include "bssn_cuda_ops.h"
#include "bssn_gpu.h"
#include "bssn_macro.h"
#include "rungekutta4_rout.h"

/*
 * Advance refinement level `lev` by one time step of size
 * dT * 0.5^max(lev, trfls) using the GPU wrappers (gpu_rhs, bssn_cuda_*).
 *
 * Sequence, as written below:
 *   1. Stage 0 (iter_count == 0): per locally-owned block, optionally enforce
 *      the algebraic constraints (AGM == 0), evaluate the RHS, update every
 *      variable StateList -> SynchList_pre, apply the lower bound to phi.
 *      Then synchronise SynchList_pre and reduce the error flag over ranks.
 *   2. Stages 1..3: same per-block work reading SynchList_pre and writing
 *      SynchList_cor, followed by synchronisation of SynchList_cor, the error
 *      reduction and, for stages 1 and 2, a pre/cor list swap.
 *   3. RestrictProlong (only if RPS == 0), then rotate
 *      StateList / OldStateList / SynchList_cor and copy Porg1 into Porg0.
 *
 * Parameters:
 *   lev - refinement level to advance.
 *   YN  - forwarded unchanged to RestrictProlong (only used when RPS == 0).
 *
 * Error handling: any failing GPU wrapper sets the local ERROR flag. The flags
 * are summed over MPI_COMM_WORLD; if the sum is non-zero the state is dumped
 * and rank 0 calls MPI_Abort.
 */
void bssn_class::Step_MainPath_GPU(int lev, int YN)
{
#ifdef WithShell
#error "Step_MainPath_GPU currently supports Patch grids only."
#endif

  setpbh(BH_num, Porg0, Mass, BH_num_input);

  const double dT_lev = dT * pow(0.5, Mymax(lev, trfls));

#if (MAPBH == 1)
  // Single explicit update of the black-hole positions on the finest level.
  if (BH_num > 0 && lev == GH->levels - 1)
  {
    compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
    for (int ithBH = 0; ithBH < BH_num; ithBH++)
    {
      for (int ith = 0; ith < 3; ith++)
        Porg1[ithBH][ith] = Porg0[ithBH][ith] + Porg_rhs[ithBH][ith] * dT_lev;
      if (Symmetry > 0)
        Porg1[ithBH][2] = fabs(Porg1[ithBH][2]);
      if (Symmetry == 2)
      {
        Porg1[ithBH][0] = fabs(Porg1[ithBH][0]);
        Porg1[ithBH][1] = fabs(Porg1[ithBH][1]);
      }
    }
  }
  if (lev == a_lev)
    AnalysisStuff(lev, dT_lev);
#endif

#ifdef With_AHF
  AH_Step_Find(lev, dT_lev);
#endif

  // BB is only consumed by RestrictProlong when RPS == 0; the cast silences
  // the unused-variable warning in the other configurations.
  const bool BB = fgt(PhysTime, StartTime, dT_lev / 2);
  (void)BB;

  // NOTE(review): ndeps, TRK4, pre and cor are not referenced by name in the
  // statements below. They are presumably picked up by the RHS_PARA_CALLED_*
  // macro argument lists (together with cg / Pp), so their names and
  // declarations are kept exactly as they were -- confirm before touching.
  double ndeps = (lev < GH->movls) ? numepsb : numepss;
  double TRK4 = PhysTime;
  int iter_count = 0;
  int pre = 0, cor = 1;
  int ERROR = 0;

  // Pointer type shared by the variable lists. Taken from the StateList
  // member so that no element type has to be spelled out here.
  using VarListPtr = decltype(StateList);

  // Writes the " bbox=(...)" suffix common to all failure messages and
  // terminates the line.
  auto log_block_bbox = [](Block *blk) {
    cerr << " bbox=(" << blk->bbox[0] << ":" << blk->bbox[3] << ","
         << blk->bbox[1] << ":" << blk->bbox[4] << ","
         << blk->bbox[2] << ":" << blk->bbox[5] << ")" << endl;
  };

  // Runs bssn_cuda_rk4_boundary_var for every variable of one block. The four
  // lists are walked in lock-step, so they are expected to have equal length.
  // On the first failing variable the failure is logged, ERROR is raised and
  // the remaining variables of this block are skipped.
  auto run_stage_on_block = [&](Block *cg, Patch *patch,
                                VarListPtr state0_list,
                                VarListPtr boundary_src_list,
                                VarListPtr stage_data_list,
                                VarListPtr rhs_list,
                                int rk_stage) {
    VarListPtr varl0 = state0_list;
    VarListPtr varlb = boundary_src_list;
    VarListPtr varls = stage_data_list;
    VarListPtr varlr = rhs_list;
    while (varl0)
    {
      if (bssn_cuda_rk4_boundary_var(cg->shape, dT_lev,
                                     cg->X[0], cg->X[1], cg->X[2],
                                     patch->bbox[0], patch->bbox[1], patch->bbox[2],
                                     patch->bbox[3], patch->bbox[4], patch->bbox[5],
                                     cg->fgfs[varl0->data->sgfn],
                                     cg->fgfs[varlb->data->sgfn],
                                     cg->fgfs[varls->data->sgfn],
                                     cg->fgfs[varlr->data->sgfn],
                                     varl0->data->propspeed, varl0->data->SoA,
                                     Symmetry, lev, rk_stage))
      {
        cerr << "GPU rk4/boundary failure: lev=" << lev
             << " rk_stage=" << rk_stage
             << " var=" << varl0->data->name;
        log_block_bbox(cg);
        ERROR = 1;
        break;
      }
      varl0 = varl0->next;
      varlb = varlb->next;
      varls = varls->next;
      varlr = varlr->next;
    }
  };

  // ---------------------------------------------------------------- stage 0
  auto *Pp = GH->PatL[lev];
  while (Pp)
  {
    auto *BP = Pp->data->blb;
    while (BP)
    {
      Block *cg = BP->data;
      if (myrank == cg->rank)
      {
#if (AGM == 0)
        if (bssn_cuda_enforce_ga(cg->shape,
                                 cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
                                 cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
                                 cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
                                 cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]))
        {
          cerr << "GPU enforce_ga failure: lev=" << lev;
          log_block_bbox(cg);
          ERROR = 1;
        }
#endif
        if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_FIRST_TIME))
          ERROR = 1;

        run_stage_on_block(cg, Pp->data, StateList, StateList, SynchList_pre, RHSList, iter_count);

        if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi->sgfn], chitiny))
        {
          cerr << "GPU lowerbound failure: lev=" << lev
               << " rk_stage=" << iter_count
               << " var=" << phi->name;
          log_block_bbox(cg);
          ERROR = 1;
        }
      }
      if (BP == Pp->data->ble)
        break;
      BP = BP->next;
    }
    Pp = Pp->next;
  }

  // The send buffer of a non-blocking collective must stay alive and
  // unmodified until the request completes. It is therefore declared in the
  // same scope as the matching MPI_Wait instead of a nested block that ends
  // right after MPI_Iallreduce returns.
  int local_error_pre = ERROR;
  MPI_Request err_req_pre;
  MPI_Iallreduce(&local_error_pre, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_pre);

  // Ghost-zone synchronisation is issued while the error reduction is in flight.
  Parallel::AsyncSyncState async_pre;
  Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
  Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);

  MPI_Wait(&err_req_pre, MPI_STATUS_IGNORE);
  if (ERROR)
  {
    Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
                              << ", lev = " << lev << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }

#if (MAPBH == 0)
  // Stage-0 update of the black-hole positions on the finest level.
  if (BH_num > 0 && lev == GH->levels - 1)
  {
    compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
    for (int ithBH = 0; ithBH < BH_num; ithBH++)
    {
      for (int ith = 0; ith < 3; ith++)
        f_rungekutta4_scalar(dT_lev, Porg0[ithBH][ith], Porg[ithBH][ith], Porg_rhs[ithBH][ith], iter_count);
      if (Symmetry > 0)
        Porg[ithBH][2] = fabs(Porg[ithBH][2]);
      if (Symmetry == 2)
      {
        Porg[ithBH][0] = fabs(Porg[ithBH][0]);
        Porg[ithBH][1] = fabs(Porg[ithBH][1]);
      }
    }
  }
  if (lev == a_lev)
    AnalysisStuff(lev, dT_lev);
#endif

  // ----------------------------------------------------------- stages 1..3
  for (iter_count = 1; iter_count < 4; iter_count++)
  {
    // The stage time advances by half a step before stages 1 and 3.
    if (iter_count == 1 || iter_count == 3)
      TRK4 += dT_lev / 2;

    Pp = GH->PatL[lev];
    while (Pp)
    {
      auto *BP = Pp->data->blb;
      while (BP)
      {
        Block *cg = BP->data;
        if (myrank == cg->rank)
        {
#if (AGM == 0)
          if (bssn_cuda_enforce_ga(cg->shape,
                                   cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
                                   cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
                                   cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
                                   cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]))
          {
            cerr << "GPU enforce_ga failure: lev=" << lev << " rk_stage=" << iter_count;
            log_block_bbox(cg);
            ERROR = 1;
          }
#elif (AGM == 1)
          // In this configuration the constraint is enforced on the last stage only.
          if (iter_count == 3 &&
              bssn_cuda_enforce_ga(cg->shape,
                                   cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
                                   cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
                                   cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
                                   cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]))
          {
            cerr << "GPU enforce_ga failure: lev=" << lev << " rk_stage=" << iter_count;
            log_block_bbox(cg);
            ERROR = 1;
          }
#endif
          if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_THEN))
            ERROR = 1;

          run_stage_on_block(cg, Pp->data, StateList, SynchList_pre, SynchList_cor, RHSList, iter_count);

          if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi1->sgfn], chitiny))
          {
            cerr << "GPU lowerbound failure: lev=" << lev
                 << " rk_stage=" << iter_count
                 << " var=" << phi1->name;
            log_block_bbox(cg);
            ERROR = 1;
          }
        }
        if (BP == Pp->data->ble)
          break;
        BP = BP->next;
      }
      Pp = Pp->next;
    }

    // Same buffer-lifetime rule as for stage 0: the send buffer lives in the
    // loop-body scope, which also contains the MPI_Wait below.
    int local_error_cor = ERROR;
    MPI_Request err_req_cor;
    MPI_Iallreduce(&local_error_cor, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);

    Parallel::AsyncSyncState async_cor;
    Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
    Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);

    MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
    if (ERROR)
    {
      Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
                                << " variables at t = " << PhysTime
                                << ", lev = " << lev << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }

#if (MAPBH == 0)
    if (BH_num > 0 && lev == GH->levels - 1)
    {
      // compute_Porg_rhs writes into Porg1 here; f_rungekutta4_scalar then
      // receives Porg1 and Porg_rhs together with the stage index.
      compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev);
      for (int ithBH = 0; ithBH < BH_num; ithBH++)
      {
        for (int ith = 0; ith < 3; ith++)
          f_rungekutta4_scalar(dT_lev, Porg0[ithBH][ith], Porg1[ithBH][ith], Porg_rhs[ithBH][ith], iter_count);
        if (Symmetry > 0)
          Porg1[ithBH][2] = fabs(Porg1[ithBH][2]);
        if (Symmetry == 2)
        {
          Porg1[ithBH][0] = fabs(Porg1[ithBH][0]);
          Porg1[ithBH][1] = fabs(Porg1[ithBH][1]);
        }
      }
    }
#endif

    // After stages 1 and 2 the freshly written cor data becomes the input
    // (pre) of the next stage.
    if (iter_count < 3)
    {
      Pp = GH->PatL[lev];
      while (Pp)
      {
        auto *BP = Pp->data->blb;
        while (BP)
        {
          BP->data->swapList(SynchList_pre, SynchList_cor, myrank);
          if (BP == Pp->data->ble)
            break;
          BP = BP->next;
        }
        Pp = Pp->next;
      }
#if (MAPBH == 0)
      if (BH_num > 0 && lev == GH->levels - 1)
      {
        for (int ithBH = 0; ithBH < BH_num; ithBH++)
          for (int ith = 0; ith < 3; ith++)
            Porg[ithBH][ith] = Porg1[ithBH][ith];
      }
#endif
    }
  }

#if (RPS == 0)
  RestrictProlong(lev, YN, BB);
#endif

  // Rotate the storage: two successive swaps against SynchList_cor, first
  // with StateList and then with OldStateList.
  Pp = GH->PatL[lev];
  while (Pp)
  {
    auto *BP = Pp->data->blb;
    while (BP)
    {
      Block *cg = BP->data;
      cg->swapList(StateList, SynchList_cor, myrank);
      cg->swapList(OldStateList, SynchList_cor, myrank);
      if (BP == Pp->data->ble)
        break;
      BP = BP->next;
    }
    Pp = Pp->next;
  }

  // Commit the new black-hole positions on the finest level.
  if (BH_num > 0 && lev == GH->levels - 1)
  {
    for (int ithBH = 0; ithBH < BH_num; ithBH++)
      for (int ith = 0; ith < 3; ith++)
        Porg0[ithBH][ith] = Porg1[ithBH][ith];
  }
}

#endif