Stabilize GPU output path and MPI sync

This commit is contained in:
2026-04-09 10:57:49 +08:00
parent 4e3946a4f0
commit 49409645c0
8 changed files with 748 additions and 334 deletions

View File

@@ -14,7 +14,8 @@ using namespace std;
#include <string.h>
#endif
#include <time.h>
#include <time.h>
#include <unistd.h>
#include "macrodef.h"
#include "misc.h"
@@ -2036,9 +2037,10 @@ void bssn_class::Read_Ansorg()
void bssn_class::Evolve(int Steps)
{
clock_t prev_clock, curr_clock;
double LastDump = 0.0, LastCheck = 0.0, Last2dDump = 0.0;
LastAnas = 0;
clock_t prev_clock, curr_clock;
double LastDump = 0.0, LastCheck = 0.0, Last2dDump = 0.0;
LastAnas = 0;
LastConsOut = 0;
#if 0
//initial checkpoint for special usage
{
@@ -2296,18 +2298,21 @@ void bssn_class::Evolve(int Steps)
////////////////////////////////////////////////////////////
// When LastCheck >= CheckTime, perform runtime checks and output status data
if (LastCheck >= CheckTime)
{
LastCheck = 0;
CheckPoint->write_Black_Hole_position(BH_num_input, BH_num, Porg0, Porgbr, Mass);
if (LastCheck >= CheckTime)
{
LastCheck = 0;
CheckPoint->write_Black_Hole_position(BH_num_input, BH_num, Porg0, Porgbr, Mass);
CheckPoint->writecheck_cgh(PhysTime, GH);
#ifdef WithShell
CheckPoint->writecheck_sh(PhysTime, SH);
#endif
CheckPoint->write_bssn(LastDump, Last2dDump, LastAnas);
}
}
#endif
CheckPoint->write_bssn(LastDump, Last2dDump, LastAnas);
}
// Keep output/analysis phases aligned across ranks before the next coarse step.
MPI_Barrier(MPI_COMM_WORLD);
}
/*
#ifdef With_AHF
// final apparent horizon finding
@@ -6253,7 +6258,7 @@ for(int ilev = GH->levels-1;ilev>=lev;ilev--)
for(int ilev=GH->levels-1;ilev>lev;ilev--)
RestrictProlong(ilev,1,false,DG_List,DG_List,DG_List);
#else
Parallel::Sync(GH->PatL[lev], DG_List, Symmetry);
Parallel::Sync(GH->PatL[lev], DG_List, Symmetry, "bssn_class::Compute_Psi4");
#endif
#ifdef WithShell
@@ -6908,10 +6913,10 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev)
{
LastAnas += dT_lev;
if (LastAnas >= AnasTime)
{
#ifdef Point_Psi4
#error "not support parallel levels yet"
if (LastAnas >= AnasTime)
{
#ifdef Point_Psi4
#error "not support parallel levels yet"
// Gam_ijk and R_ij have been calculated in Interp_Constraint()
double SYM = 1, ANT = -1;
for (int levh = lev; levh < GH->levels; levh++)
@@ -7255,9 +7260,9 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev)
//================================================================================================
void bssn_class::Constraint_Out()
{
LastConsOut += dT * pow(0.5, Mymax(0, trfls));
void bssn_class::Constraint_Out()
{
LastConsOut += dT * pow(0.5, Mymax(0, trfls));
if (LastConsOut >= AnasTime)
// Constraint violation
@@ -7322,7 +7327,7 @@ void bssn_class::Constraint_Out()
Pp = Pp->next;
}
}
Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry);
Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry, "bssn_class::Constraint_Out[level]");
}
#ifdef WithShell
if (0) // if the constraint quantities can be reused from the step rhs calculation
@@ -7544,7 +7549,7 @@ void bssn_class::AH_Prepare_derivatives()
}
Pp = Pp->next;
}
Parallel::Sync(GH->PatL[lev], AHDList, Symmetry);
Parallel::Sync(GH->PatL[lev], AHDList, Symmetry, "bssn_class::AH_Prepare_derivatives");
}
}
@@ -7825,7 +7830,7 @@ void bssn_class::Interp_Constraint(bool infg)
Pp = Pp->next;
}
}
Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry);
Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry, "bssn_class::Interp_Constraint[level]");
}
#ifdef WithShell
if (0) // if the constraint quantities can be reused from the step rhs calculation
@@ -8083,7 +8088,7 @@ void bssn_class::Compute_Constraint()
Pp = Pp->next;
}
}
Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry);
Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry, "bssn_class::Compute_Constraint[level]");
}
// prolong restrict constraint quantities
for (lev = GH->levels - 1; lev > 0; lev--)
@@ -8396,12 +8401,18 @@ void bssn_class::Enforce_algcon(int lev, int fg)
//================================================================================================
bool bssn_class::check_Stdin_Abort()
{
fd_set readfds;
struct timeval timeout;
bool bssn_class::check_Stdin_Abort()
{
// Non-interactive launches (mpirun via Python/subprocess, batch jobs, redirected stdin)
// should not probe stdin. Some MPI runtimes treat stdin as a managed channel and can
// fail when rank 0 polls/consumes it.
if (!isatty(STDIN_FILENO)) {
return false;
}
fd_set readfds;
struct timeval timeout;
FD_ZERO(&readfds);
FD_SET(STDIN_FILENO, &readfds);
@@ -8410,14 +8421,17 @@ bool bssn_class::check_Stdin_Abort()
timeout.tv_sec = 0;
timeout.tv_usec = 0;
int activity = select(STDIN_FILENO + 1, &readfds, nullptr, nullptr, &timeout);
if (activity > 0 && FD_ISSET(STDIN_FILENO, &readfds)) {
string input_abort;
if (cin >> input_abort) {
if (input_abort == "stop") {
return true;
}
int activity = select(STDIN_FILENO + 1, &readfds, nullptr, nullptr, &timeout);
if (activity <= 0) {
return false;
}
if (FD_ISSET(STDIN_FILENO, &readfds)) {
string input_abort;
if (cin >> input_abort) {
if (input_abort == "stop") {
return true;
}
}
}