#!/usr/bin/env bash
set -euo pipefail

# Warm up LeWM evaluation before a formal run.
#
# This script intentionally does a small eval for each task so ROCm/PyTorch can
# initialize GPU contexts, compile predictor graphs, populate kernel caches, and
# touch dataset/checkpoint paths before the timed run.
#
# Site-specific things to check before using this at the competition:
#   1. STABLEWM_HOME points to the directory containing datasets/checkpoints.
#   2. The policy names below match the checkpoint folders at STABLEWM_HOME.
#   3. The dataset names in config/eval/*.yaml match the onsite dataset files.
#   4. The GPU visibility variables match the GPUs allocated to this job.
#   5. WARMUP_NUM_EVAL is close enough to the formal shape to trigger useful
#      compilation, but small enough not to waste much time.
#
# Every setting below can be overridden from the environment; the value after
# ":-" is used only when the variable is unset or empty:
#   PYTHON_BIN            interpreter used to run eval.py
#   STABLEWM_HOME         exported for child processes
#   ROCR_VISIBLE_DEVICES  exported, default 0
#   HIP_VISIBLE_DEVICES   exported, default 0
#   CUDA_VISIBLE_DEVICES  exported, default 0
#   WARMUP_NUM_EVAL       default 10
#   INFERENCE_PRECISION   default fp16
#   OUTPUT_DIR            created if missing, default /tmp/lewm_warmup

# The repository root is the parent of the directory holding this script.
# eval.py is invoked by relative path later, so run from there.
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "${REPO_ROOT}"

PYTHON_BIN="${PYTHON_BIN:-${REPO_ROOT}/.venv/bin/python}"
STABLEWM_HOME="${STABLEWM_HOME:-/mnt/ASC1637/stablewm}"
export STABLEWM_HOME

# If Slurm allocates multiple GPUs, set these to the allocated physical GPU ids.
# Example for physical GPU 2 and 3:
#   ROCR_VISIBLE_DEVICES=2,3 HIP_VISIBLE_DEVICES=0,1 CUDA_VISIBLE_DEVICES=0,1
#
# Important ROCm detail:
#   ROCR_VISIBLE_DEVICES uses physical ids.
#   HIP_VISIBLE_DEVICES/CUDA_VISIBLE_DEVICES use ids after ROCR remapping.
export ROCR_VISIBLE_DEVICES="${ROCR_VISIBLE_DEVICES:-0}"
export HIP_VISIBLE_DEVICES="${HIP_VISIBLE_DEVICES:-0}"
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"

WARMUP_NUM_EVAL="${WARMUP_NUM_EVAL:-10}"
INFERENCE_PRECISION="${INFERENCE_PRECISION:-fp16}"
OUTPUT_DIR="${OUTPUT_DIR:-/tmp/lewm_warmup}"
mkdir -p "${OUTPUT_DIR}"

# Enable multi-GPU warmup by setting MULTI_GPU=1.
# MULTI_GPU_DEVICES are process-local ids (the ids seen after ROCR remapping),
# not physical ids.
# Example:
#   ROCR_VISIBLE_DEVICES=2,3 HIP_VISIBLE_DEVICES=0,1 MULTI_GPU=1 MULTI_GPU_DEVICES='[0,1]'
MULTI_GPU="${MULTI_GPU:-0}"
MULTI_GPU_DEVICES="${MULTI_GPU_DEVICES:-[0,1]}"
MULTI_NODE="${MULTI_NODE:-0}"

# Multi-node warmup uses the same eval.py entrypoint under torchrun.
# Example (replace <master-host> with the address of the rank-0 node):
#   torchrun --nnodes=2 --nproc_per_node=2 --node_rank=0 --master_addr=<master-host> --master_port=29500 \
#     eval.py --config-name=pusht.yaml policy=pusht/lewm multi_node.enabled=true
# This script leaves multi-node launch to the caller.

# Overrides passed to every eval.py invocation.
COMMON_ARGS=(
  "eval.num_eval=${WARMUP_NUM_EVAL}"
  "inference_precision=${INFERENCE_PRECISION}"
)

# Only the exact value "1" enables the optional modes; anything else is off.
# NOTE(review): the multi_gpu keys carry a leading "+" while multi_node.enabled
# does not. Presumably multi_node already exists in the eval configs and
# multi_gpu does not -- confirm against config/eval/*.yaml.
if [[ "${MULTI_GPU}" == "1" ]]; then
  COMMON_ARGS+=(
    "+multi_gpu.enabled=true"
    "+multi_gpu.devices=${MULTI_GPU_DEVICES}"
  )
fi

if [[ "${MULTI_NODE}" == "1" ]]; then
  COMMON_ARGS+=(
    "multi_node.enabled=true"
  )
fi

#######################################
# Run one short evaluation for a single task.
# Globals:
#   PYTHON_BIN  (read) interpreter used to launch eval.py
#   OUTPUT_DIR  (read) directory prefix for the output file
#   COMMON_ARGS (read) overrides appended to every invocation
# Arguments:
#   $1 - config name passed as --config-name (e.g. pusht.yaml)
#   $2 - policy passed as policy=<value> (e.g. pusht/lewm)
#   $3 - file name placed under OUTPUT_DIR for output.filename
# Outputs:
#   A blank line and a "== Warmup ... ==" banner on stdout, followed by
#   whatever eval.py itself prints.
# Returns:
#   The exit status of the eval.py invocation.
#######################################
run_warmup() {
  local config_name="$1"
  local policy="$2"
  local output_name="$3"

  echo
  echo "== Warmup ${config_name} policy=${policy} =="
  "${PYTHON_BIN}" eval.py \
    "--config-name=${config_name}" \
    "policy=${policy}" \
    "output.filename=${OUTPUT_DIR}/${output_name}" \
    "${COMMON_ARGS[@]}"
}

# Print the effective configuration so it is captured in the job log.
echo "LeWM warmup"
echo " repo: ${REPO_ROOT}"
echo " python: ${PYTHON_BIN}"
echo " STABLEWM_HOME: ${STABLEWM_HOME}"
echo " ROCR_VISIBLE_DEVICES: ${ROCR_VISIBLE_DEVICES}"
echo " HIP_VISIBLE_DEVICES: ${HIP_VISIBLE_DEVICES}"
echo " CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES}"
echo " WARMUP_NUM_EVAL: ${WARMUP_NUM_EVAL}"
echo " INFERENCE_PRECISION: ${INFERENCE_PRECISION}"
echo " MULTI_GPU: ${MULTI_GPU}"
if [[ "${MULTI_GPU}" == "1" ]]; then
  echo " MULTI_GPU_DEVICES: ${MULTI_GPU_DEVICES}"
fi
echo " MULTI_NODE: ${MULTI_NODE}"

# Defaults match the checkpoint names used in this repo. If onsite checkpoint
# folders differ, override by editing these calls or passing the equivalent
# eval.py command manually.
# One short warmup per task, run sequentially in this order.
# Arguments: <config name> <policy> <output file name under OUTPUT_DIR>.
# Each call must be its own command; run together on one line, everything
# after the third argument is ignored and only the first task runs.
run_warmup "pusht.yaml" "pusht/lewm" "warmup_pusht.txt"
run_warmup "reacher.yaml" "reacher/lewm" "warmup_reacher.txt"
run_warmup "cube.yaml" "cube/lewm" "warmup_cube.txt"
run_warmup "tworoom.yaml" "tworoom/lewm" "warmup_tworoom.txt"

echo
echo "Warmup complete. Logs were appended under ${OUTPUT_DIR}."