diff --git a/AMSS_NCKU_GPUCheck.py b/AMSS_NCKU_GPUCheck.py index 69e66ad..997f656 100644 --- a/AMSS_NCKU_GPUCheck.py +++ b/AMSS_NCKU_GPUCheck.py @@ -1,4 +1,25 @@ #!/usr/bin/env python3 +# +# Current most stable GPU-branch baseline: +# GPU_Calculation="yes" +# Equation_Class="BSSN" +# Initial_Data_Method="Ansorg-TwoPuncture" +# puncture_data_set="Manually" +# basic_grid_set="Patch" +# grid_center_set="Cell" +# Symmetry="equatorial-symmetry" +# Time_Evolution_Method="runge-kutta-45" +# Finite_Diffenence_Method="4th-order" +# boundary_choice="BAM-choice" +# gauge_choice=0 +# tetrad_type=2 +# AHF_Find="no" +# devide_factor=2.0 +# static_grid_type="Linear" +# moving_grid_type="Linear" +# AMSS_Z4C_MRBD=0 +# Do not enable AMSS_CUDA_BH_INTERP_RESIDENT unless a dedicated +# CPU/GPU trajectory comparison has been run for that configuration. """ Check whether AMSS_NCKU_Input.py is suitable for the current GPU branch. @@ -12,10 +33,51 @@ from __future__ import annotations import argparse import importlib.util import os -import sys from dataclasses import dataclass, field from pathlib import Path -from typing import Any, List +from typing import Any, Iterable, List, Sequence + + +SUPPORTED_EQUATIONS = {"BSSN", "BSSN-EScalar", "BSSN-EM", "Z4C"} +SUPPORTED_INITIAL_DATA = { + "Ansorg-TwoPuncture", + "Lousto-Analytical", + "Cao-Analytical", + "KerrSchild-Analytical", +} +SUPPORTED_SYMMETRIES = { + "no-symmetry", + "equatorial-symmetry", + "octant-symmetry", +} +SUPPORTED_GRIDS = {"Patch", "Shell-Patch"} +SUPPORTED_CENTERS = {"Cell", "Vertex"} +SUPPORTED_FD = {"2nd-order", "4th-order", "6th-order", "8th-order"} +SUPPORTED_GAUGES = {0, 1, 2, 3, 4, 5, 6, 7} +SUPPORTED_TETRADS = {0, 1, 2} +SUPPORTED_AHF = {"yes", "no"} +SUPPORTED_BOUNDARIES = {"BAM-choice", "Shibata-choice"} +SUPPORTED_PUNCTURE_DATA = {"Manually", "Automatically-BBH"} + +STABLE_BASELINE = { + "GPU_Calculation": "yes", + "Equation_Class": "BSSN", + "Initial_Data_Method": "Ansorg-TwoPuncture", + "puncture_data_set": 
"Manually", + "basic_grid_set": "Patch", + "grid_center_set": "Cell", + "Symmetry": "equatorial-symmetry", + "Time_Evolution_Method": "runge-kutta-45", + "Finite_Diffenence_Method": "4th-order", + "boundary_choice": "BAM-choice", + "gauge_choice": 0, + "tetrad_type": 2, + "AHF_Find": "no", + "devide_factor": 2.0, + "static_grid_type": "Linear", + "moving_grid_type": "Linear", + "AMSS_Z4C_MRBD": 0, +} @dataclass @@ -35,6 +97,9 @@ class CheckResult: def add_note(self, msg: str) -> None: self.notes.append(msg) + def extend_notes(self, messages: Iterable[str]) -> None: + self.notes.extend(messages) + def load_input_module(path: Path): spec = importlib.util.spec_from_file_location("amss_ncku_input", str(path)) @@ -55,80 +120,376 @@ def as_text(value: Any) -> str: return str(value).strip() +def as_lower_text(value: Any) -> str: + return as_text(value).lower() + + +def as_float(value: Any, default: float | None = None) -> float | None: + try: + return float(value) + except (TypeError, ValueError, OverflowError): + return default + + +def as_int(value: Any, default: int | None = None) -> int | None: + try: + return int(value) + except (TypeError, ValueError, OverflowError): + return default + + +def sequence_len(value: Any) -> int | None: + try: + return len(value) + except TypeError: + return None + + +def sequence_values(value: Any) -> List[float] | None: + try: + return [float(v) for v in value] + except (TypeError, ValueError, OverflowError): + return None + + +def approx_equal(a: Any, b: float, tol: float = 1.0e-12) -> bool: + value = as_float(a) + return value is not None and abs(value - b) <= tol + + +def env_truthy(name: str) -> bool: + value = os.environ.get(name) + return value is not None and value.strip().lower() in { + "1", + "yes", + "y", + "true", + "on", + "enable", + "enabled", + } + + +def stable_baseline_differences(mod: Any) -> List[str]: + diffs = [] + for name, expected in STABLE_BASELINE.items(): + if not hasattr(mod, name): + continue + actual = get_attr(mod, name, None) + if 
isinstance(expected, float): + if not approx_equal(actual, expected): + diffs.append(f"{name}={actual!r} (stable baseline: {expected!r})") + elif actual != expected: + diffs.append(f"{name}={actual!r} (stable baseline: {expected!r})") + return diffs + + +def add_membership_check( + r: CheckResult, + name: str, + value: Any, + supported: Sequence[Any] | set[Any], + *, + risk_message: str | None = None, + note_message: str | None = None, +) -> None: + if value not in tuple(supported): + r.add_risk(risk_message or f"Unsupported {name}: {value!r}") + elif note_message: + r.add_note(note_message) + + +def check_positive_int(r: CheckResult, name: str, value: Any) -> None: + parsed = as_int(value) + if parsed is None or parsed <= 0: + r.add_risk(f"{name} must be a positive integer; got {value!r}") + + +def check_nonnegative_number(r: CheckResult, name: str, value: Any) -> None: + parsed = as_float(value) + if parsed is None or not parsed >= 0.0: + r.add_risk(f"{name} must be a non-negative number; got {value!r}") + + +def check_grid_geometry(r: CheckResult, mod: Any, grid: str) -> None: + grid_level = as_int(get_attr(mod, "grid_level", None)) + static_grid_level = as_int(get_attr(mod, "static_grid_level", None)) + moving_grid_level = as_int(get_attr(mod, "moving_grid_level", None)) + refinement_level = as_int(get_attr(mod, "refinement_level", None)) + analysis_level = as_int(get_attr(mod, "analysis_level", 0)) + + for name in ( + "grid_level", + "static_grid_level", + "moving_grid_level", + "static_grid_number", + "moving_grid_number", + "quarter_sphere_number", + ): + check_positive_int(r, name, get_attr(mod, name, None)) + + if grid_level is not None and static_grid_level is not None: + if static_grid_level > grid_level: + r.add_risk("static_grid_level cannot exceed grid_level.") + if moving_grid_level is not None and moving_grid_level != grid_level - static_grid_level: + r.add_risk( + "moving_grid_level should equal grid_level - static_grid_level; " + f"got {moving_grid_level}, 
expected {grid_level - static_grid_level}." + ) + if grid_level is not None: + if refinement_level is None or refinement_level < 0 or refinement_level > grid_level: + r.add_risk(f"refinement_level must be in [0, grid_level]; got {refinement_level!r}") + if analysis_level is None or analysis_level < 0 or analysis_level >= grid_level: + r.add_risk(f"analysis_level must be in [0, grid_level); got {analysis_level!r}") + + largest_max = sequence_values(get_attr(mod, "largest_box_xyz_max", None)) + largest_min = sequence_values(get_attr(mod, "largest_box_xyz_min", None)) + if largest_max is None or len(largest_max) != 3: + r.add_risk("largest_box_xyz_max must contain three numeric values.") + elif any(v <= 0.0 for v in largest_max): + r.add_risk(f"largest_box_xyz_max values must be positive; got {largest_max!r}") + if largest_min is None or len(largest_min) != 3: + r.add_risk("largest_box_xyz_min must contain three numeric values.") + elif largest_max is not None and len(largest_max) == 3: + for idx, (lo, hi) in enumerate(zip(largest_min, largest_max)): + if lo >= hi: + r.add_risk( + f"largest_box_xyz_min[{idx}] must be smaller than largest_box_xyz_max[{idx}]." 
+ ) + + if grid == "Shell-Patch" and largest_max is not None and len(largest_max) == 3: + if max(largest_max) - min(largest_max) > 1.0e-12: + r.add_risk("Shell-Patch requires a cubic largest_box_xyz_max.") + + if not approx_equal(get_attr(mod, "devide_factor", None), 2.0): + r.add_risk("devide_factor must remain 2.0; the AMR code documents only this ratio as supported.") + if as_text(get_attr(mod, "static_grid_type", "")) != "Linear": + r.add_risk("static_grid_type must remain 'Linear'.") + if as_text(get_attr(mod, "moving_grid_type", "")) != "Linear": + r.add_risk("moving_grid_type must remain 'Linear'.") + + shell_shape = sequence_values(get_attr(mod, "shell_grid_number", None)) + if grid == "Shell-Patch": + if shell_shape is None or len(shell_shape) != 3: + r.add_risk("Shell-Patch requires shell_grid_number with three numeric values.") + elif any(int(v) <= 0 for v in shell_shape): + r.add_risk(f"shell_grid_number values must be positive; got {shell_shape!r}") + + +def check_punctures(r: CheckResult, mod: Any, init: str, puncture_data: str) -> None: + puncture_number = as_int(get_attr(mod, "puncture_number", None)) + if puncture_number is None or puncture_number <= 0: + r.add_risk(f"puncture_number must be a positive integer; got {puncture_number!r}") + return + + if init == "Ansorg-TwoPuncture" and puncture_number != 2: + r.add_warning( + "Ansorg-TwoPuncture is validated on the GPU branch mainly for puncture_number=2." 
+ ) + if puncture_data == "Automatically-BBH": + r.add_risk("puncture_data_set='Automatically-BBH' is documented as still developing.") + + for name in ("position_BH", "parameter_BH", "dimensionless_spin_BH", "momentum_BH"): + value = get_attr(mod, name, None) + outer = sequence_len(value) + if outer != puncture_number: + r.add_risk(f"{name} must have puncture_number rows; got {outer!r}.") + continue + for idx in range(puncture_number): + if sequence_len(value[idx]) != 3: + r.add_risk(f"{name}[{idx}] must contain three values.") + break + + if init == "Ansorg-TwoPuncture": + for name in ("parameter_BH", "position_BH", "momentum_BH"): + if get_attr(mod, name, None) is None: + r.add_risk(f"Ansorg-TwoPuncture requires {name}.") + + +def check_output_and_time(r: CheckResult, mod: Any) -> None: + for name in ( + "Final_Evolution_Time", + "Check_Time", + "Dump_Time", + "D2_Dump_Time", + "Analysis_Time", + "Courant_Factor", + "Dissipation", + ): + check_nonnegative_number(r, name, get_attr(mod, name, None)) + check_positive_int(r, "Evolution_Step_Number", get_attr(mod, "Evolution_Step_Number", None)) + + start_time = as_float(get_attr(mod, "Start_Evolution_Time", None)) + final_time = as_float(get_attr(mod, "Final_Evolution_Time", None)) + if start_time is None: + r.add_risk("Start_Evolution_Time must be numeric.") + elif final_time is not None and final_time <= start_time: + r.add_risk("Final_Evolution_Time must be greater than Start_Evolution_Time.") + + for name in ("GW_L_max", "GW_M_max", "Detector_Number"): + check_positive_int(r, name, get_attr(mod, name, None)) + detector_min = as_float(get_attr(mod, "Detector_Rmin", None)) + detector_max = as_float(get_attr(mod, "Detector_Rmax", None)) + if detector_min is None or detector_min <= 0.0: + r.add_risk(f"Detector_Rmin must be positive; got {detector_min!r}") + if detector_max is None or detector_max <= 0.0: + r.add_risk(f"Detector_Rmax must be positive; got {detector_max!r}") + if detector_min is not None and 
detector_max is not None and detector_max <= detector_min: + r.add_risk("Detector_Rmax must be greater than Detector_Rmin.") + + +def check_equation_specific(r: CheckResult, mod: Any, eq: str, grid: str, fd: str) -> None: + if eq == "BSSN": + r.add_note("Equation_Class=BSSN is the current validated GPU baseline.") + elif eq == "BSSN-EScalar": + r.add_warning("BSSN-EScalar has a CUDA path, but it is less broadly validated than BSSN.") + fr_choice = as_int(get_attr(mod, "FR_Choice", None)) + if fr_choice not in {1, 2, 3, 4, 5}: + r.add_risk(f"FR_Choice must be one of 1..5 for BSSN-EScalar; got {fr_choice!r}") + if approx_equal(get_attr(mod, "FR_a2", None), 0.0): + r.add_risk("CUDA BSSN-EScalar requires nonzero FR_a2.") + elif not approx_equal(get_attr(mod, "FR_a2", None), 3.0): + r.add_warning("CUDA BSSN-EScalar now passes FR_a2 to the kernel, but non-3.0 values need CPU/GPU regression.") + for name in ("FR_l2", "FR_phi0", "FR_r0", "FR_sigma0"): + check_nonnegative_number(r, name, get_attr(mod, name, None)) + elif eq == "BSSN-EM": + r.add_warning( + "BSSN-EM is accepted by the build, but this checker cannot certify its physics/output " + "without a CPU/GPU regression run." + ) + if fd == "8th-order": + r.add_note("BSSN-EM with 8th-order enables extra CUDA AMR batching defaults.") + elif eq == "Z4C": + r.add_warning( + "Z4C has CUDA support, but the resident path and Shell/CPBC combinations are more constrained." + ) + if grid == "Patch": + r.add_warning("Z4C+Patch avoids Shell CPBC, but still needs a dedicated regression test.") + else: + r.add_warning("Z4C+Shell-Patch uses CPBC/Shell logic and is not the stable BSSN baseline.") + + +def check_runtime_environment(r: CheckResult, mod: Any, eq: str, grid: str, fd: str) -> None: + if env_truthy("AMSS_CUDA_BH_INTERP_RESIDENT") or ("AMSS_CUDA_BH_INTERP_RESIDENT" not in os.environ and as_lower_text(get_attr(mod, "AMSS_CUDA_BH_INTERP_RESIDENT", "")) in {"1", "yes", "y", "true", "on", "enable", "enabled"}): + r.add_risk( + "AMSS_CUDA_BH_INTERP_RESIDENT is enabled in the environment or the input file; this option previously caused " + "late-time trajectory drift and should stay off unless explicitly revalidated." 
+ ) + else: + r.add_note("AMSS_CUDA_BH_INTERP_RESIDENT is not enabled; this matches the fixed stable default.") + + if eq in {"BSSN", "BSSN-EScalar", "Z4C"}: + r.add_note("makefile_and_run.py will default AMSS_CUDA_AMR_RESTRICT_DEVICE=1 for this equation.") + if fd in {"2nd-order", "8th-order"}: + r.add_warning( + f"{fd} disables some interpolation/CUDA-aware MPI fast paths by default; validate performance and output." + ) + if grid == "Shell-Patch": + r.add_warning( + "Shell-Patch changes runtime defaults and MPI process handling; use at least the script-adjusted 4 MPI ranks." + ) + + z4c_mrbd = as_int(get_attr(mod, "AMSS_Z4C_MRBD", 0), 0) + if z4c_mrbd not in {0, 1, 2}: + r.add_risk(f"AMSS_Z4C_MRBD must be 0, 1, or 2; got {z4c_mrbd!r}") + elif eq == "Z4C" and z4c_mrbd == 2: + r.add_risk("Z4C GPU resident path does not support AMSS_Z4C_MRBD=2.") + elif eq == "Z4C" and z4c_mrbd in {0, 1}: + r.add_note(f"Z4C will build with AMSS_Z4C_MRBD={z4c_mrbd}.") + + +def check_stable_profile(r: CheckResult, mod: Any) -> None: + diffs = stable_baseline_differences(mod) + if not diffs: + r.add_note("This input matches the documented most stable GPU baseline.") + return + r.add_warning( + "This input differs from the documented most stable GPU baseline: " + "; ".join(diffs) + ) + + def check_input(mod: Any) -> CheckResult: r = CheckResult() - gpu = as_text(get_attr(mod, "GPU_Calculation", "no")).lower() == "yes" + gpu_text = as_lower_text(get_attr(mod, "GPU_Calculation", "no")) + gpu = gpu_text == "yes" eq = as_text(get_attr(mod, "Equation_Class", "")) init = as_text(get_attr(mod, "Initial_Data_Method", "")) + symmetry = as_text(get_attr(mod, "Symmetry", "")) + time_method = as_text(get_attr(mod, "Time_Evolution_Method", "")) grid = as_text(get_attr(mod, "basic_grid_set", "")) center = as_text(get_attr(mod, "grid_center_set", "")) fd = as_text(get_attr(mod, "Finite_Diffenence_Method", "")) gauge = get_attr(mod, "gauge_choice", None) + tetrad = get_attr(mod, "tetrad_type", None) 
ahf = as_text(get_attr(mod, "AHF_Find", "no")).lower() boundary = as_text(get_attr(mod, "boundary_choice", "")) + puncture_data = as_text(get_attr(mod, "puncture_data_set", "")) cpu_part = get_attr(mod, "CPU_Part", None) gpu_part = get_attr(mod, "GPU_Part", None) + if gpu_text not in {"yes", "no"}: + r.add_risk(f"GPU_Calculation must be 'yes' or 'no'; got {get_attr(mod, 'GPU_Calculation', None)!r}") if not gpu: r.add_note("GPU_Calculation=no; this check only targets the GPU branch.") return r r.add_note("GPU_Calculation=yes detected.") - if eq not in {"BSSN", "BSSN-EScalar", "BSSN-EM", "Z4C"}: - r.add_risk(f"Unsupported Equation_Class for GPU branch: {eq!r}") - elif eq == "BSSN": - r.add_note("Equation_Class=BSSN is supported.") + add_membership_check(r, "Equation_Class", eq, SUPPORTED_EQUATIONS) + add_membership_check(r, "Symmetry", symmetry, SUPPORTED_SYMMETRIES) + add_membership_check(r, "Initial_Data_Method", init, SUPPORTED_INITIAL_DATA) + add_membership_check(r, "basic_grid_set", grid, SUPPORTED_GRIDS) + add_membership_check(r, "grid_center_set", center, SUPPORTED_CENTERS) + add_membership_check(r, "Finite_Diffenence_Method", fd, SUPPORTED_FD) + add_membership_check(r, "gauge_choice", gauge, SUPPORTED_GAUGES) + add_membership_check(r, "tetrad_type", tetrad, SUPPORTED_TETRADS) + add_membership_check(r, "AHF_Find", ahf, SUPPORTED_AHF) + add_membership_check(r, "boundary_choice", boundary, SUPPORTED_BOUNDARIES) + add_membership_check(r, "puncture_data_set", puncture_data, SUPPORTED_PUNCTURE_DATA) if init != "Ansorg-TwoPuncture": - r.add_risk(f"Initial_Data_Method={init!r} is not the safe default for this GPU path.") + r.add_risk( + f"Initial_Data_Method={init!r} is not validated as safe on this GPU branch; " + "the stable path is Ansorg-TwoPuncture." 
+ ) else: r.add_note("Initial_Data_Method=Ansorg-TwoPuncture is supported.") - if grid not in {"Patch", "Shell-Patch"}: - r.add_risk(f"Unsupported basic_grid_set: {grid!r}") - elif grid == "Patch": - r.add_note("basic_grid_set=Patch is supported.") - else: - r.add_note("basic_grid_set=Shell-Patch is supported, but GPU runtime uses extra shell-specific switches.") + if time_method != "runge-kutta-45": + r.add_risk(f"Only Time_Evolution_Method='runge-kutta-45' is supported; got {time_method!r}.") + if grid == "Patch": + r.add_note("basic_grid_set=Patch is the current stable GPU grid path.") + elif grid == "Shell-Patch": + r.add_warning("basic_grid_set=Shell-Patch has GPU support but is outside the stable BSSN baseline.") + if center == "Vertex": + r.add_warning("grid_center_set=Vertex is compiled by macros, but the stable GPU baseline is Cell.") + if symmetry != "equatorial-symmetry": + r.add_warning("The stable validation case uses equatorial-symmetry; other symmetries need regression tests.") + if fd != "4th-order": + r.add_warning("The stable validation case uses 4th-order finite differences.") + if gauge not in {0, 1}: + r.add_warning("Input comments recommend gauge_choice 0 or 1; other gauges need dedicated validation.") + if tetrad != 2: + r.add_warning("Input comments recommend tetrad_type=2; other tetrads affect wave extraction conventions.") - if center not in {"Cell", "Vertex"}: - r.add_risk(f"Unsupported grid_center_set: {center!r}") - else: - r.add_note(f"grid_center_set={center} is supported.") + if ahf == "yes": + r.add_warning("AHF_Find=yes is supported by macros, but it is outside the current stable GPU baseline.") - if fd not in {"2nd-order", "4th-order", "6th-order", "8th-order"}: - r.add_risk(f"Unsupported Finite_Diffenence_Method: {fd!r}") - else: - r.add_note(f"Finite_Diffenence_Method={fd} is supported.") - - if gauge not in {0, 1, 2, 3, 4, 5, 6, 7}: - r.add_risk(f"Unsupported gauge_choice: {gauge!r}") - else: - r.add_note(f"gauge_choice={gauge} 
is supported.") - - if ahf not in {"yes", "no"}: - r.add_risk(f"Unsupported AHF_Find value: {ahf!r}") - elif ahf == "yes": - r.add_note("AHF_Find=yes is supported, but it increases sensitivity to state sync.") - - if boundary not in {"BAM-choice", "Shibata-choice"}: - r.add_risk(f"Unsupported boundary_choice: {boundary!r}") - elif boundary == "Shibata-choice": + if boundary == "Shibata-choice": r.add_risk("Shibata-choice is not faithfully distinguished in the current macro generator; it maps to the BAM branch.") - else: + elif boundary == "BAM-choice": r.add_note("boundary_choice=BAM-choice is supported.") if cpu_part is not None or gpu_part is not None: r.add_warning("CPU_Part/GPU_Part are printed and propagated, but they do not control a real mixed CPU/GPU split in this branch.") - if eq == "BSSN" and grid == "Patch" and init == "Ansorg-TwoPuncture": - if get_attr(mod, "GPU_Calculation", "no") == "yes": - r.add_note("This configuration is generally runnable on the GPU branch.") - - if eq == "BSSN" and get_attr(mod, "GPU_Calculation", "no") == "yes": - r.add_warning("Default BH resident interpolation was previously enabled and could cause trajectory drift; the current code now defaults it off unless AMSS_CUDA_BH_INTERP_RESIDENT=1 is set.") + check_output_and_time(r, mod) + check_grid_geometry(r, mod, grid) + check_punctures(r, mod, init, puncture_data) + check_equation_specific(r, mod, eq, grid, fd) + check_runtime_environment(r, mod, eq, grid, fd) + check_stable_profile(r, mod) return r @@ -160,13 +521,18 @@ def main() -> int: print(f"Input: {path}") print(f"GPU_Calculation: {get_attr(mod, 'GPU_Calculation', 'no')}") + print(f"Symmetry: {get_attr(mod, 'Symmetry', '')}") print(f"Equation_Class: {get_attr(mod, 'Equation_Class', '')}") print(f"Initial_Data_Method: {get_attr(mod, 'Initial_Data_Method', '')}") + print(f"puncture_data_set: {get_attr(mod, 'puncture_data_set', '')}") print(f"basic_grid_set: {get_attr(mod, 'basic_grid_set', '')}") print(f"grid_center_set: 
{get_attr(mod, 'grid_center_set', '')}") print(f"Finite_Diffenence_Method: {get_attr(mod, 'Finite_Diffenence_Method', '')}") print(f"gauge_choice: {get_attr(mod, 'gauge_choice', '')}") + print(f"tetrad_type: {get_attr(mod, 'tetrad_type', '')}") + print(f"boundary_choice: {get_attr(mod, 'boundary_choice', '')}") print(f"AHF_Find: {get_attr(mod, 'AHF_Find', '')}") + print(f"AMSS_Z4C_MRBD: {get_attr(mod, 'AMSS_Z4C_MRBD', 0)}") print("") for msg in result.notes: diff --git a/AMSS_NCKU_source/Z4c_class.C b/AMSS_NCKU_source/Z4c_class.C index a110f2a..22e33f5 100644 --- a/AMSS_NCKU_source/Z4c_class.C +++ b/AMSS_NCKU_source/Z4c_class.C @@ -262,7 +262,10 @@ Z4c_class::~Z4c_class() //================================================================================================ -#define MRBD 0 // 0: fix BD for meshrefinement level; 1: sommerfeld_bam for them; 2: sommerfeld_yo for them +#ifndef AMSS_Z4C_MRBD +#define AMSS_Z4C_MRBD 0 +#endif +#define MRBD AMSS_Z4C_MRBD // 0: fix BD for meshrefinement level; 1: sommerfeld_bam for them; 2: sommerfeld_yo for them #ifndef CPBC // for sommerfeld boundary diff --git a/AMSS_NCKU_source/bssnEM_class.C b/AMSS_NCKU_source/bssnEM_class.C index 1076709..2777ed6 100644 --- a/AMSS_NCKU_source/bssnEM_class.C +++ b/AMSS_NCKU_source/bssnEM_class.C @@ -318,6 +318,16 @@ void fill_bssn_em_matter_cuda_views(Block *cg, double **matter, bool bssn_em_cuda_use_resident_sync(int lev) { + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_CUDA_RESIDENT_SYNC"); + if (!env) + env = getenv("AMSS_CUDA_EM_RESIDENT_SYNC"); + enabled = env ? ((atoi(env) != 0) ? 
1 : 0) : 1; + } + if (!enabled) + return false; #ifdef WithShell (void)lev; return false; diff --git a/AMSS_NCKU_source/bssnEScalar_class.C b/AMSS_NCKU_source/bssnEScalar_class.C index 48ab025..a715cd4 100644 --- a/AMSS_NCKU_source/bssnEScalar_class.C +++ b/AMSS_NCKU_source/bssnEScalar_class.C @@ -65,6 +65,16 @@ bool fill_bssn_escalar_cuda_views(Block *cg, MyList *vars, bool bssn_escalar_cuda_use_resident_sync(int lev) { + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_CUDA_RESIDENT_SYNC"); + if (!env) + env = getenv("AMSS_CUDA_ESCALAR_RESIDENT_SYNC"); + enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 1; + } + if (!enabled) + return false; #ifdef WithShell (void)lev; return false; diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index 2b88c67..d6f3698 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -552,6 +552,16 @@ bool fill_bssn_cuda_views_count(Block *cg, MyList *vars, bool bssn_cuda_use_resident_sync(int lev) { + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_CUDA_RESIDENT_SYNC"); + if (!env) + env = getenv("AMSS_CUDA_BSSN_RESIDENT_SYNC"); + enabled = env ? ((atoi(env) != 0) ? 
1 : 0) : 1; + } + if (!enabled) + return false; (void)lev; return true; } diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index 1b0c4d5..43750e7 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -2792,12 +2792,13 @@ void kern_escalar_sources( double * __restrict__ Sxz, double * __restrict__ Syy, double * __restrict__ Syz, - double * __restrict__ Szz) + double * __restrict__ Szz, + double escalar_a2) { constexpr double PI_V = 3.141592653589793238462643383279502884; constexpr double TWO = 2.0; constexpr double HALF = 0.5; - constexpr double A2 = 3.0; + const double A2 = escalar_a2; for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < d_gp.all; @@ -2852,7 +2853,7 @@ void kern_escalar_sources( } } -static void gpu_escalar_sources(int all) +static void gpu_escalar_sources(int all, double escalar_a2) { #define D(s) g_buf.slot[s] gpu_fderivs(D(S_Sphi), D(S_Sphi_x), D(S_Sphi_y), D(S_Sphi_z), 1.0, 1.0, 1.0, all); @@ -2872,7 +2873,8 @@ static void gpu_escalar_sources(int all) D(S_Sphi_yy), D(S_Sphi_yz), D(S_Sphi_zz), D(S_Sphi_rhs), D(S_Spi_rhs), D(S_rho), D(S_Sx), D(S_Sy), D(S_Sz), - D(S_Sxx), D(S_Sxy), D(S_Sxz), D(S_Syy), D(S_Syz), D(S_Szz)); + D(S_Sxx), D(S_Sxy), D(S_Sxz), D(S_Syy), D(S_Syz), D(S_Szz), + escalar_a2); #undef D } @@ -6571,7 +6573,8 @@ static int active_or_keyed_bank(StepContext &ctx, return 0; } -static void launch_rhs_pipeline(int all, double eps, int co, bool compute_escalar = false) +static void launch_rhs_pipeline(int all, double eps, int co, bool compute_escalar = false, + double escalar_a2 = 3.0) { const double SYM = 1.0; const double ANTI = -1.0; @@ -6652,7 +6655,7 @@ static void launch_rhs_pipeline(int all, double eps, int co, bool compute_escala D(S_gupyy), D(S_gupyz), D(S_gupzz)); if (compute_escalar) { - gpu_escalar_sources(all); + gpu_escalar_sources(all, escalar_a2); gpu_fderivs(D(S_trK), D(S_trK_x), D(S_trK_y), D(S_trK_z), SYM, SYM, SYM, all); } @@ -7127,9 +7130,8 @@ 
int bssn_escalar_cuda_rk4_substep(void *block_tag, #ifdef fortran3 set_escalar_parameter_(escalar_a2, escalar_phi0, escalar_r0, escalar_sigma0, escalar_l2); #endif - if (fabs(escalar_a2 - 3.0) > 1.0e-12 && g_dispatch.my_rank == 0) { - fprintf(stderr, "CUDA BSSN-EScalar currently supports FR a2=3 for EScalar_CC=2/3; got %.17g\n", - escalar_a2); + if (fabs(escalar_a2) <= 1.0e-300 && g_dispatch.my_rank == 0) { + fprintf(stderr, "CUDA BSSN-EScalar requires nonzero FR a2; got %.17g\n", escalar_a2); return 1; } @@ -7187,7 +7189,7 @@ int bssn_escalar_cuda_rk4_substep(void *block_tag, } } - launch_rhs_pipeline((int)all, eps, co, true); + launch_rhs_pipeline((int)all, eps, co, true, escalar_a2); if (apply_bam_bc) { for (int i = 0; i < BSSN_ESCALAR_STATE_COUNT; ++i) { @@ -7250,7 +7252,7 @@ int bssn_escalar_cuda_compute_constraints(int *ex, double *X, double *Y, double const size_t bytes = all * sizeof(double); setup_grid_params(ex, X, Y, Z, Symmetry, eps, 0); upload_escalar_state_inputs(state_host_in, all); - launch_rhs_pipeline((int)all, eps, 0, true); + launch_rhs_pipeline((int)all, eps, 0, true, escalar_a2); #define D(s) g_buf.slot[s] kern_escalar_constraint_fr<<>>( diff --git a/AMSS_NCKU_source/makefile b/AMSS_NCKU_source/makefile index fee25b0..987aa8f 100644 --- a/AMSS_NCKU_source/makefile +++ b/AMSS_NCKU_source/makefile @@ -111,16 +111,19 @@ TwoPunctureABE.o: TwoPunctureABE.C # Input files -## CUDA BSSN RHS switch -## 1 : use the rewritten CUDA bssn_rhs backend -## 0 : keep the normal CPU/Fortran selection below -USE_CUDA_BSSN ?= 0 -USE_CUDA_Z4C ?= 0 - -CXXAPPFLAGS += -DUSE_CUDA_BSSN=$(USE_CUDA_BSSN) -CUDA_APP_FLAGS += -DUSE_CUDA_BSSN=$(USE_CUDA_BSSN) -CXXAPPFLAGS += -DUSE_CUDA_Z4C=$(USE_CUDA_Z4C) -CUDA_APP_FLAGS += -DUSE_CUDA_Z4C=$(USE_CUDA_Z4C) +## CUDA BSSN RHS switch +## 1 : use the rewritten CUDA bssn_rhs backend +## 0 : keep the normal CPU/Fortran selection below +USE_CUDA_BSSN ?= 0 +USE_CUDA_Z4C ?= 0 +AMSS_Z4C_MRBD ?= 0 + +CXXAPPFLAGS += 
-DUSE_CUDA_BSSN=$(USE_CUDA_BSSN) +CUDA_APP_FLAGS += -DUSE_CUDA_BSSN=$(USE_CUDA_BSSN) +CXXAPPFLAGS += -DUSE_CUDA_Z4C=$(USE_CUDA_Z4C) +CUDA_APP_FLAGS += -DUSE_CUDA_Z4C=$(USE_CUDA_Z4C) +CXXAPPFLAGS += -DAMSS_Z4C_MRBD=$(AMSS_Z4C_MRBD) +CUDA_APP_FLAGS += -DAMSS_Z4C_MRBD=$(AMSS_Z4C_MRBD) ## Kernel implementation switch (set USE_CXX_KERNELS=0 to fall back to Fortran) ifeq ($(USE_CXX_KERNELS),0) diff --git a/makefile_and_run.py b/makefile_and_run.py index 387c3dd..75cf92c 100755 --- a/makefile_and_run.py +++ b/makefile_and_run.py @@ -75,6 +75,13 @@ def _input_or_env(input_name, env_name, default=None): return getattr(input_data, input_name, default) +def _input_env_passthrough(runtime_env, env_name): + if env_name in runtime_env: + return + if hasattr(input_data, env_name): + runtime_env[env_name] = str(getattr(input_data, env_name)) + + def _start_cuda_mps_if_requested(runtime_env): if input_data.GPU_Calculation != "yes": return False @@ -193,6 +200,40 @@ def _gpu_runtime_env(): for key, value in defaults.items(): runtime_env.setdefault(key, value) + passthrough_envs = [ + "AMSS_CUDA_RESIDENT_SYNC", + "AMSS_CUDA_BSSN_RESIDENT_SYNC", + "AMSS_CUDA_EM_RESIDENT_SYNC", + "AMSS_CUDA_ESCALAR_RESIDENT_SYNC", + "AMSS_CUDA_BH_INTERP_RESIDENT", + "AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP", + "AMSS_CUDA_KEEP_ALL_LEVELS", + "AMSS_CUDA_EM_KEEP_RESIDENT_AFTER_STEP", + "AMSS_CUDA_EM_KEEP_ALL_LEVELS", + "AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP", + "AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS", + "AMSS_CUDA_AMR_HOST_STAGED", + "AMSS_CUDA_AMR_RESTRICT_DEVICE", + "AMSS_CUDA_AMR_RESTRICT_BATCH", + "AMSS_CUDA_DEVICE_SEGMENT_BATCH", + "AMSS_CUDA_UNCACHED_DEVICE_BUFFERS", + "AMSS_CUDA_EM_CACHE_SOURCES", + "AMSS_CUDA_EM_ZERO_FASTPATH", + "AMSS_CUDA_AWARE_MPI", + "AMSS_CUDA_REGRID_FLUSH_ALWAYS", + "AMSS_Z4C_CUDA_RESIDENT", + "AMSS_SHELL_FAST_INTERP", + "AMSS_SHELL_PARALLEL_INTERP", + "AMSS_SHELL_CUDA_INTERP", + "AMSS_SHELL_INTERP_THREADS", + "AMSS_EM_ZERO_ANALYSIS_FASTPATH", + 
"AMSS_EM_ZERO_RESIDENT_DOWNLOAD_FASTPATH", + "AMSS_INTERP_FAST", + "AMSS_INTERP_GPU", + ] + for env_name in passthrough_envs: + _input_env_passthrough(runtime_env, env_name) + optional_overrides = { "AMSS_INTERP_FAST_COMPARE": "AMSS_Interp_Fast_Compare", "AMSS_INTERP_FAST_COMPARE_LIMIT": "AMSS_Interp_Fast_Compare_Limit", @@ -221,11 +262,13 @@ def makefile_ABE(): print( " Compiling the AMSS-NCKU executable file ABE/ABEGPU " ) print( ) + z4c_mrbd = int(getattr(input_data, "AMSS_Z4C_MRBD", 0)) + ## Build command with CPU binding to nohz_full cores if (input_data.GPU_Calculation == "no"): - makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=off USE_CUDA_BSSN=0 USE_CUDA_Z4C=0 ABE" + makefile_command = f"{NUMACTL_CPU_BIND} env AMSS_Z4C_MRBD={z4c_mrbd} make -j{BUILD_JOBS} INTERP_LB_MODE=off USE_CUDA_BSSN=0 USE_CUDA_Z4C=0 ABE" elif (input_data.GPU_Calculation == "yes"): - makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=off USE_CUDA_BSSN=1 USE_CUDA_Z4C=1 ABE_CUDA" + makefile_command = f"{NUMACTL_CPU_BIND} env AMSS_Z4C_MRBD={z4c_mrbd} make -j{BUILD_JOBS} INTERP_LB_MODE=off USE_CUDA_BSSN=1 USE_CUDA_Z4C=1 ABE_CUDA" else: print( " CPU/GPU numerical calculation setting is wrong " ) print( )