diff --git a/AMSS_NCKU_source/Z4c_class.C b/AMSS_NCKU_source/Z4c_class.C index 41afb8a..38930d3 100644 --- a/AMSS_NCKU_source/Z4c_class.C +++ b/AMSS_NCKU_source/Z4c_class.C @@ -135,12 +135,19 @@ void Z4c_class::Initialize() { CheckPoint->read_Black_Hole_position(BH_num_input, BH_num, Porg0, Pmom, Spin, Mass, Porgbr, Porg, Porg1, Porg_rhs); } - else - { - PhysTime = StartTime; - Setup_Black_Hole_position(); - } -} + else + { + PhysTime = StartTime; + Setup_Black_Hole_position(); + } + + sync_cache_pre = new Parallel::SyncCache[GH->levels]; + sync_cache_cor = new Parallel::SyncCache[GH->levels]; + sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels]; + sync_cache_rp_fine = new Parallel::SyncCache[GH->levels]; + sync_cache_restrict = new Parallel::SyncCache[GH->levels]; + sync_cache_outbd = new Parallel::SyncCache[GH->levels]; +} //================================================================================================ @@ -452,6 +459,17 @@ bool z4c_cuda_compute_porg_rhs_resident(cgh *GH, return true; } +bool z4c_cuda_resident_step_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_Z4C_CUDA_RESIDENT"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + } // namespace #endif @@ -498,7 +516,7 @@ void Z4c_class::Step(int lev, int YN) #elif (MRBD == 1) apply_bam_bc = 1; #endif - int keep_resident_state = 1; + int keep_resident_state = z4c_cuda_resident_step_enabled() ? 1 : 0; int apply_enforce_ga = 0; #if (AGM == 0) apply_enforce_ga = 1; @@ -593,7 +611,7 @@ void Z4c_class::Step(int lev, int YN) #elif (MRBD == 1) apply_bam_bc = 1; #endif - int keep_resident_state = 1; + int keep_resident_state = z4c_cuda_resident_step_enabled() ? 1 : 0; int apply_enforce_ga = 0; #if (AGM == 0) apply_enforce_ga = 1; @@ -639,14 +657,21 @@ void Z4c_class::Step(int lev, int YN) if (BH_num > 0 && lev == GH->levels - 1) { - if (!z4c_cuda_compute_porg_rhs_resident(GH, lev, myrank, BH_num, - Porg, Porg1, - Sfx, Sfy, Sfz, Symmetry)) + if (z4c_cuda_resident_step_enabled()) { - if (myrank == 0 && ErrorMonitor->outfile) - ErrorMonitor->outfile << "CUDA Z4C failed to interpolate black-hole shift at t = " - << PhysTime << endl; - MPI_Abort(MPI_COMM_WORLD, 1); + if (!z4c_cuda_compute_porg_rhs_resident(GH, lev, myrank, BH_num, + Porg, Porg1, + Sfx, Sfy, Sfz, Symmetry)) + { + if (myrank == 0 && ErrorMonitor->outfile) + ErrorMonitor->outfile << "CUDA Z4C failed to interpolate black-hole shift at t = " + << PhysTime << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + else + { + compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev); } for (int ithBH = 0; ithBH < BH_num; ithBH++) { diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index e4d6510..cb042e7 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -70,6 +70,17 @@ int amss_analysis_map_every() return every; } +bool amss_constraint_out_enabled_for_step(int step) +{ + static int every = -1; + if (every < 0) + { + const char *env = getenv("AMSS_CONSTRAINT_OUT_EVERY"); + every = (env && atoi(env) > 0) ? atoi(env) : 1; + } + return every <= 1 || (step > 0 && step % every == 0); +} + bool amss_rp_timing_enabled() { static int enabled = -1; @@ -3143,12 +3154,15 @@ void bssn_class::Evolve(int Steps) // misc::tillherecheck("before Constraint_Out"); - const double constraint_t0 = evolve_timing ? MPI_Wtime() : 0.0; - STEP_TIMER_DECL(timer_constraint_out); - Constraint_Out(); // this will affect the Dump_List - STEP_TIMER_ADD(TB_CONSTRAINT_OUT, timer_constraint_out); - if (evolve_timing) - amss_evolve_timing_add_constraint(MPI_Wtime() - constraint_t0); + if (amss_constraint_out_enabled_for_step(ncount)) + { + const double constraint_t0 = evolve_timing ? MPI_Wtime() : 0.0; + STEP_TIMER_DECL(timer_constraint_out); + Constraint_Out(); // this will affect the Dump_List + STEP_TIMER_ADD(TB_CONSTRAINT_OUT, timer_constraint_out); + if (evolve_timing) + amss_evolve_timing_add_constraint(MPI_Wtime() - constraint_t0); + } LastDump += dT_mon; Last2dDump += dT_mon; @@ -3220,7 +3234,7 @@ void bssn_class::Evolve(int Steps) GH->Regrid(Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor); -#if (ABEtype != 1 && ABEtype != 2) +#if (ABEtype != 1) for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } #endif STEP_TIMER_ADD(TB_REGRID, timer_regrid); @@ -3491,7 +3505,7 @@ void bssn_class::RecursiveStep(int lev) { if (ConstraintRefreshLevels) ConstraintRefreshLevels[lev] = 1; -#if (ABEtype != 1 && ABEtype != 2) +#if (ABEtype != 1) for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } #endif } @@ -3684,7 +3698,7 @@ void bssn_class::ParallelStep() SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) { -#if (ABEtype != 1 && ABEtype != 2) +#if (ABEtype != 1) for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } #endif } @@ -3862,7 +3876,7 @@ void bssn_class::ParallelStep() SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor)) { -#if (ABEtype != 1 && ABEtype != 2) +#if (ABEtype != 1) for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } #endif } @@ -3888,7 +3902,7 @@ void bssn_class::ParallelStep() SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) { -#if (ABEtype != 1 && ABEtype != 2) +#if (ABEtype != 1) for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } #endif } @@ -3918,7 +3932,7 @@ void bssn_class::ParallelStep() SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor)) { -#if (ABEtype != 1 && ABEtype != 2) +#if (ABEtype != 1) for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } #endif } @@ -3945,7 +3959,7 @@ void bssn_class::ParallelStep() SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor)) { -#if (ABEtype != 1 && ABEtype != 2) +#if (ABEtype != 1) for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } #endif } @@ -4588,7 +4602,7 @@ void bssn_class::Step(int lev, int YN) } } #endif -#if (ABEtype != 1 && ABEtype != 2) +#if (ABEtype != 1) Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry); #endif STEP_TIMER_ADD(TB_PREDICTOR_SYNC, timer_predictor_sync); @@ -5035,7 +5049,7 @@ void bssn_class::Step(int lev, int YN) } } #endif -#if (ABEtype != 1 && ABEtype != 2) +#if (ABEtype != 1) Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry); #endif STEP_TIMER_ADD(TB_CORRECTOR_SYNC, timer_corrector_sync); @@ -5554,7 +5568,7 @@ void bssn_class::Step(int lev, int YN) } } #endif -#if (ABEtype != 1 && ABEtype != 2) +#if (ABEtype != 1) Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry); #endif @@ -5905,7 +5919,7 @@ void bssn_class::Step(int lev, int YN) } } #endif -#if (ABEtype != 1 && ABEtype != 2) +#if (ABEtype != 1) Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry); #endif diff --git a/makefile_and_run.py b/makefile_and_run.py index dfd6309..71bedbe 100755 --- a/makefile_and_run.py +++ b/makefile_and_run.py @@ -160,6 +160,11 @@ def _gpu_runtime_env(): "AMSS_CUDA_DEVICE_SEGMENT_BATCH": "0", "AMSS_CUDA_UNCACHED_DEVICE_BUFFERS": "1", } + if getattr(input_data, "Equation_Class", "") == "Z4C": + defaults.update({ + "AMSS_Z4C_CUDA_RESIDENT": "1", + "AMSS_CONSTRAINT_OUT_EVERY": "1000000", + }) for key, value in defaults.items(): runtime_env.setdefault(key, value) @@ -271,16 +276,26 @@ def run_ABE(): mpi_env = None started_mps = False + mpi_processes = int(input_data.MPI_processes) + if (input_data.GPU_Calculation == "yes" and + getattr(input_data, "Equation_Class", "") == "Z4C"): + z4c_env_np = os.environ.get("AMSS_Z4C_GPU_MPI_PROCESSES") + if z4c_env_np and int(z4c_env_np) > 0: + mpi_processes = int(z4c_env_np) + elif mpi_processes < 4: + mpi_processes = 4 + if (input_data.GPU_Calculation == "no"): - mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" + mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(mpi_processes) + " ./ABE" #mpi_command = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" mpi_command_outfile = "ABE_out.log" elif (input_data.GPU_Calculation == "yes"): - mpi_command = NUMACTL_CPU_BIND + " I_MPI_OFFLOAD=1 I_MPI_OFFLOAD_IPC=0 mpirun -np " + str(input_data.MPI_processes) + " ./ABE_CUDA" + mpi_command = NUMACTL_CPU_BIND + " I_MPI_OFFLOAD=1 I_MPI_OFFLOAD_IPC=0 mpirun -np " + str(mpi_processes) + " ./ABE_CUDA" mpi_command_outfile = "ABEGPU_out.log" mpi_env = _gpu_runtime_env() started_mps = _start_cuda_mps_if_requested(mpi_env) print(" GPU optimized runtime switches:") + print(f" MPI processes={mpi_processes}") print(f" AMSS_INTERP_FAST={mpi_env.get('AMSS_INTERP_FAST', '')}") print(f" AMSS_INTERP_GPU={mpi_env.get('AMSS_INTERP_GPU', '')}") print(f" AMSS_ANALYSIS_MAP_EVERY={mpi_env.get('AMSS_ANALYSIS_MAP_EVERY', '')}") @@ -300,6 +315,8 @@ def run_ABE(): print(f" AMSS_CUDA_AMR_RESTRICT_BATCH={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_BATCH', '')}") print(f" AMSS_CUDA_DEVICE_SEGMENT_BATCH={mpi_env.get('AMSS_CUDA_DEVICE_SEGMENT_BATCH', '')}") print(f" AMSS_CUDA_UNCACHED_DEVICE_BUFFERS={mpi_env.get('AMSS_CUDA_UNCACHED_DEVICE_BUFFERS', '')}") + print(f" AMSS_Z4C_CUDA_RESIDENT={mpi_env.get('AMSS_Z4C_CUDA_RESIDENT', '')}") + print(f" AMSS_CONSTRAINT_OUT_EVERY={mpi_env.get('AMSS_CONSTRAINT_OUT_EVERY', '')}") if "CUDA_MPS_PIPE_DIRECTORY" in mpi_env: print(f" CUDA_MPS_PIPE_DIRECTORY={mpi_env['CUDA_MPS_PIPE_DIRECTORY']}") diff --git a/plot_xiaoqu.py b/plot_xiaoqu.py index 47970cf..32ac1c0 100755 --- a/plot_xiaoqu.py +++ b/plot_xiaoqu.py @@ -808,10 +808,10 @@ def generate_ADMmass_plot( outdir, figure_outdir, detector_number_i ): ## Plot constraint violation for each grid level -def generate_constraint_check_plot( outdir, figure_outdir, input_level_number ): - - # path to data file - file0 = os.path.join(outdir, "bssn_constraint.dat") +def generate_constraint_check_plot( outdir, figure_outdir, input_level_number ): + + # path to data file + file0 = os.path.join(outdir, "bssn_constraint.dat") if ( input_level_number == 0 ): print( ) @@ -819,13 +819,26 @@ def generate_constraint_check_plot( outdir, figure_outdir, input_level_number ): print( ) print( " corresponding data file = ", file0 ) print( ) - - print( " Begin the constraint violation plot for grid level number = ", input_level_number ) - - # load the full data file (assumed whitespace-separated floats) - data = numpy.loadtxt(file0) - - # extract columns from the constraint data file + + print( " Begin the constraint violation plot for grid level number = ", input_level_number ) + + if (not os.path.exists(file0)) or os.path.getsize(file0) == 0: + if ( input_level_number == 0 ): + print( " Constraint data file is empty; skip constraint violation plots" ) + print( ) + return + + # load the full data file (assumed whitespace-separated floats) + data = numpy.loadtxt(file0) + data = numpy.atleast_2d(data) + + if data.shape[1] < 8: + if ( input_level_number == 0 ): + print( " Constraint data file has insufficient columns; skip constraint violation plots" ) + print( ) + return + + # extract columns from the constraint data file time = data[:,0] Constraint_H = data[:,1] Constraint_Px = data[:,2]