#!/usr/bin/env bash
set -euo pipefail

# Warm up LeWM evaluation before a formal run.
#
# This script intentionally does a small eval for each task so ROCm/PyTorch can
# initialize GPU contexts, compile predictor graphs, populate kernel caches, and
# touch dataset/checkpoint paths before the timed run.
#
# Site-specific things to check before using this at the competition:
#   1. STABLEWM_HOME points to the directory containing datasets/checkpoints.
#   2. The policy names below match the checkpoint folders at STABLEWM_HOME.
#   3. The dataset names in config/eval/*.yaml match the onsite dataset files.
#   4. The GPU visibility variables match the GPUs allocated to this job.
#   5. WARMUP_NUM_EVAL is close enough to the formal shape to trigger useful
#      compilation, but small enough not to waste much time.

# The repository root is the parent of the directory holding this script;
# everything below runs from there so the relative eval.py path resolves.
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "${REPO_ROOT}"

# Interpreter and data root. Both keep a caller-supplied value and only fall
# back to the defaults when unset or empty.
: "${PYTHON_BIN:=${REPO_ROOT}/.venv/bin/python}"
: "${STABLEWM_HOME:=/mnt/ASC1637/stablewm}"
export STABLEWM_HOME

# GPU visibility. When Slurm allocates multiple GPUs, set these to match the
# allocation. Example for physical GPU 2 and 3:
#   ROCR_VISIBLE_DEVICES=2,3 HIP_VISIBLE_DEVICES=0,1 CUDA_VISIBLE_DEVICES=0,1
#
# Important ROCm detail:
#   ROCR_VISIBLE_DEVICES is expressed in physical ids.
#   HIP_VISIBLE_DEVICES/CUDA_VISIBLE_DEVICES are expressed in the ids that
#   remain after ROCR remapping.
: "${ROCR_VISIBLE_DEVICES:=0}"
: "${HIP_VISIBLE_DEVICES:=0}"
: "${CUDA_VISIBLE_DEVICES:=0}"
export ROCR_VISIBLE_DEVICES HIP_VISIBLE_DEVICES CUDA_VISIBLE_DEVICES

# Warmup knobs: episode count, inference precision, and where eval output
# files are written. The output directory is created up front.
: "${WARMUP_NUM_EVAL:=10}"
: "${INFERENCE_PRECISION:=fp16}"
: "${OUTPUT_DIR:=/tmp/lewm_warmup}"
mkdir -p "${OUTPUT_DIR}"

# Enable multi-GPU warmup by setting MULTI_GPU=1.
# MULTI_GPU_DEVICES holds process-local device ids, i.e. the numbering that is
# left after ROCR/HIP remapping -- not the physical GPU ids.
# Example:
#   ROCR_VISIBLE_DEVICES=2,3 HIP_VISIBLE_DEVICES=0,1 MULTI_GPU=1 MULTI_GPU_DEVICES='[0,1]'
MULTI_GPU="${MULTI_GPU:-0}"
MULTI_GPU_DEVICES="${MULTI_GPU_DEVICES:-[0,1]}"
MULTI_NODE="${MULTI_NODE:-0}"

#######################################
# Check that every numeric id in a bracketed device list fits inside a
# comma-separated visibility list.
# Arguments:
#   $1 - requested device list, e.g. "[0,1]" (brackets/whitespace optional)
#   $2 - visibility list, e.g. the value of HIP_VISIBLE_DEVICES ("0,1")
# Returns:
#   0 when all numeric ids are smaller than the number of visible entries,
#     or when the request contains no numeric id to check;
#   1 when at least one numeric id is >= the number of visible entries.
#######################################
_lewm_devices_fit() {
  local requested="$1"
  local visible="$2"
  local -a ids=()
  local -a shown=()
  local id

  # Reduce "[0, 1]" to "0,1" so both lists split on commas the same way.
  requested="${requested//\[/}"
  requested="${requested//\]/}"
  requested="${requested//[[:space:]]/}"
  visible="${visible//[[:space:]]/}"

  IFS=',' read -r -a ids <<<"${requested}"
  IFS=',' read -r -a shown <<<"${visible}"

  # Nothing requested: nothing to validate. The explicit length guard also
  # avoids expanding an empty array under `set -u` on older bash releases.
  (( ${#ids[@]} > 0 )) || return 0

  for id in "${ids[@]}"; do
    # Entries that are not plain integers are not judged here.
    [[ "${id}" =~ ^[0-9]+$ ]] || continue
    # 10# forces base 10 so an id such as "08" is not parsed as octal.
    if (( 10#${id} >= ${#shown[@]} )); then
      return 1
    fi
  done
  return 0
}

# The default HIP_VISIBLE_DEVICES exposes a single GPU while the default
# MULTI_GPU_DEVICES names two, so MULTI_GPU=1 without matching visibility
# variables asks for a device that is hidden. Only warn: the exact handling of
# the list belongs to eval.py, so the run itself is not blocked here.
if [[ "${MULTI_GPU}" == "1" ]] \
  && ! _lewm_devices_fit "${MULTI_GPU_DEVICES}" "${HIP_VISIBLE_DEVICES:-}"; then
  printf 'warning: MULTI_GPU_DEVICES=%s names a device id outside HIP_VISIBLE_DEVICES=%s; set the visibility variables to cover every requested GPU.\n' \
    "${MULTI_GPU_DEVICES}" "${HIP_VISIBLE_DEVICES:-}" >&2
fi

# Multi-node warmup uses the same eval.py entrypoint, launched under torchrun.
# Example:
#   torchrun --nnodes=2 --nproc_per_node=2 --node_rank=0 --master_addr=<ip> --master_port=29500 \
#     eval.py --config-name=pusht.yaml policy=pusht/lewm multi_node.enabled=true
# This script never invokes torchrun itself: MULTI_NODE=1 only appends
# multi_node.enabled=true to the eval.py arguments, so the distributed launch
# is left to the caller.

# Overrides passed to every eval.py invocation. The list always carries the
# warmup episode count and the inference precision; the multi-GPU and
# multi-node overrides are appended only when the matching switch is "1".
COMMON_ARGS=()
COMMON_ARGS+=("eval.num_eval=${WARMUP_NUM_EVAL}")
COMMON_ARGS+=("inference_precision=${INFERENCE_PRECISION}")

# NOTE(review): the multi_gpu.* overrides carry a '+' prefix while
# multi_node.enabled does not -- presumably only the latter key already exists
# in the eval configs; confirm against config/eval/*.yaml.
case "${MULTI_GPU}" in
  1)
    COMMON_ARGS+=("+multi_gpu.enabled=true")
    COMMON_ARGS+=("+multi_gpu.devices=${MULTI_GPU_DEVICES}")
    ;;
esac

case "${MULTI_NODE}" in
  1)
    COMMON_ARGS+=("multi_node.enabled=true")
    ;;
esac

#######################################
# Run one warmup evaluation through eval.py.
# Globals:
#   PYTHON_BIN (read), OUTPUT_DIR (read), COMMON_ARGS (read)
# Arguments:
#   $1 - config file name passed as --config-name
#   $2 - value for the policy= override
#   $3 - output file name, placed under OUTPUT_DIR
# Outputs:
#   A blank line and a banner on stdout, followed by whatever eval.py prints.
# Returns:
#   The exit status of the eval.py invocation.
#######################################
run_warmup() {
  local config_name="$1" policy="$2" output_name="$3"

  # Assemble the full command first so the invocation is a single expansion.
  local -a eval_cmd=(
    "${PYTHON_BIN}" eval.py
    "--config-name=${config_name}"
    "policy=${policy}"
    "output.filename=${OUTPUT_DIR}/${output_name}"
    "${COMMON_ARGS[@]}"
  )

  printf '\n== Warmup %s policy=%s ==\n' "${config_name}" "${policy}"
  "${eval_cmd[@]}"
}

# Print the effective configuration before any evaluation starts, one
# "  name: value" line per setting. MULTI_GPU_DEVICES is listed only when
# multi-GPU warmup is switched on.
printf '%s\n' "LeWM warmup"
printf '  %s: %s\n' \
  "repo" "${REPO_ROOT}" \
  "python" "${PYTHON_BIN}" \
  "STABLEWM_HOME" "${STABLEWM_HOME}" \
  "ROCR_VISIBLE_DEVICES" "${ROCR_VISIBLE_DEVICES}" \
  "HIP_VISIBLE_DEVICES" "${HIP_VISIBLE_DEVICES}" \
  "CUDA_VISIBLE_DEVICES" "${CUDA_VISIBLE_DEVICES}" \
  "WARMUP_NUM_EVAL" "${WARMUP_NUM_EVAL}" \
  "INFERENCE_PRECISION" "${INFERENCE_PRECISION}" \
  "MULTI_GPU" "${MULTI_GPU}"
if [[ "${MULTI_GPU}" == "1" ]]; then
  printf '  %s: %s\n' "MULTI_GPU_DEVICES" "${MULTI_GPU_DEVICES}"
fi
printf '  %s: %s\n' "MULTI_NODE" "${MULTI_NODE}"

# Each task maps to config "<task>.yaml", policy "<task>/lewm" and output file
# "warmup_<task>.txt". These defaults match the checkpoint names used in this
# repo; if the onsite checkpoint folders differ, edit the task list below or
# run the equivalent eval.py command manually.
for task in pusht reacher cube tworoom; do
  run_warmup "${task}.yaml" "${task}/lewm" "warmup_${task}.txt"
done

printf '\nWarmup complete. Logs were appended under %s.\n' "${OUTPUT_DIR}"