增加脚本
This commit is contained in:
97
scripts/warmup_eval.sh
Executable file
97
scripts/warmup_eval.sh
Executable file
@@ -0,0 +1,97 @@
|
||||
#!/usr/bin/env bash
set -euo pipefail

# Warm up LeWM evaluation before a formal run.
#
# This script intentionally does a small eval for each task so ROCm/PyTorch can
# initialize GPU contexts, compile predictor graphs, populate kernel caches, and
# touch dataset/checkpoint paths before the timed run.
#
# Site-specific things to check before using this at the competition:
#   1. STABLEWM_HOME points to the directory containing datasets/checkpoints.
#   2. The policy names below match the checkpoint folders at STABLEWM_HOME.
#   3. The dataset names in config/eval/*.yaml match the onsite dataset files.
#   4. The GPU visibility variables match the GPUs allocated to this job.
#   5. WARMUP_NUM_EVAL is close enough to the formal shape to trigger useful
#      compilation, but small enough not to waste much time.

# Resolve the repository root as the parent of this script's directory and run
# from there, so the relative eval.py path used later does not depend on the
# caller's working directory.
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "${REPO_ROOT}"

# Interpreter used to launch eval.py. An existing PYTHON_BIN from the
# environment wins; otherwise fall back to the repo-local virtualenv.
PYTHON_BIN="${PYTHON_BIN:-${REPO_ROOT}/.venv/bin/python}"

# Directory containing datasets/checkpoints. The fallback path is site-specific;
# exported so the child eval.py processes inherit it.
STABLEWM_HOME="${STABLEWM_HOME:-/mnt/ASC1637/stablewm}"
export STABLEWM_HOME

# If Slurm allocates multiple GPUs, set these to the allocated physical GPU ids.
# Example for physical GPU 2 and 3:
#   ROCR_VISIBLE_DEVICES=2,3 HIP_VISIBLE_DEVICES=0,1 CUDA_VISIBLE_DEVICES=0,1
#
# Important ROCm detail:
#   ROCR_VISIBLE_DEVICES uses physical ids.
#   HIP_VISIBLE_DEVICES/CUDA_VISIBLE_DEVICES use ids after ROCR remapping.
#
# Each variable keeps a non-empty value already present in the environment and
# otherwise defaults to device 0.
export ROCR_VISIBLE_DEVICES="${ROCR_VISIBLE_DEVICES:-0}"
export HIP_VISIBLE_DEVICES="${HIP_VISIBLE_DEVICES:-0}"
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"

# Tunables for the warmup run; each may be overridden from the environment.
#   WARMUP_NUM_EVAL     - value passed to eval.py as eval.num_eval
#   INFERENCE_PRECISION - value passed to eval.py as inference_precision
#   OUTPUT_DIR          - directory that receives the per-task output files
WARMUP_NUM_EVAL="${WARMUP_NUM_EVAL:-10}"
INFERENCE_PRECISION="${INFERENCE_PRECISION:-fp16}"
OUTPUT_DIR="${OUTPUT_DIR:-/tmp/lewm_warmup}"

# `--` keeps an OUTPUT_DIR that starts with '-' from being parsed as an option.
mkdir -p -- "${OUTPUT_DIR}"

# Enable multi-GPU warmup by setting MULTI_GPU=1.
# MULTI_GPU_DEVICES are process-local ids, not physical ids after ROCR remapping.
# Example:
#   ROCR_VISIBLE_DEVICES=2,3 HIP_VISIBLE_DEVICES=0,1 MULTI_GPU=1 MULTI_GPU_DEVICES='[0,1]'
#
# NOTE(review): the default MULTI_GPU_DEVICES names two devices, while the
# *_VISIBLE_DEVICES defaults in this script expose only device 0. When enabling
# MULTI_GPU, set the visibility variables as in the example above.
MULTI_GPU="${MULTI_GPU:-0}"
MULTI_GPU_DEVICES="${MULTI_GPU_DEVICES:-[0,1]}"

# Overrides appended to every eval.py invocation.
COMMON_ARGS=(
  "eval.num_eval=${WARMUP_NUM_EVAL}"
  "inference_precision=${INFERENCE_PRECISION}"
)

# The multi-GPU overrides are added only when MULTI_GPU is exactly "1".
if [[ "${MULTI_GPU}" == "1" ]]; then
  COMMON_ARGS+=(
    "+multi_gpu.enabled=true"
    "+multi_gpu.devices=${MULTI_GPU_DEVICES}"
  )
fi

#######################################
# Run one short warmup evaluation through eval.py.
# Globals:
#   PYTHON_BIN  (read) interpreter used to launch eval.py
#   OUTPUT_DIR  (read) directory prefix used for output.filename
#   COMMON_ARGS (read) overrides appended to every invocation
# Arguments:
#   $1 - config name passed as --config-name (e.g. "pusht.yaml")
#   $2 - value for the policy= override (e.g. "pusht/lewm")
#   $3 - file name placed under OUTPUT_DIR for the output.filename= override
# Outputs:
#   A blank line and a "== Warmup ... ==" banner on stdout, followed by
#   whatever the launched command prints.
# Returns:
#   2 when fewer than three arguments are given; otherwise the exit status of
#   the launched command.
#######################################
run_warmup() {
  # Fail with a readable message instead of a bare "unbound variable" abort.
  if (( $# < 3 )); then
    printf 'usage: run_warmup <config_name> <policy> <output_name>\n' >&2
    return 2
  fi

  local -r config_name="$1"
  local -r policy="$2"
  local -r output_name="$3"

  printf '\n== Warmup %s policy=%s ==\n' "${config_name}" "${policy}"

  # eval.py is given as a relative path, so it resolves against the current
  # working directory at call time.
  "${PYTHON_BIN}" eval.py \
    "--config-name=${config_name}" \
    "policy=${policy}" \
    "output.filename=${OUTPUT_DIR}/${output_name}" \
    "${COMMON_ARGS[@]}"
}

#######################################
# Print the effective warmup configuration so it is captured in the job log.
# Globals (all read):
#   REPO_ROOT, PYTHON_BIN, STABLEWM_HOME, ROCR_VISIBLE_DEVICES,
#   HIP_VISIBLE_DEVICES, CUDA_VISIBLE_DEVICES, WARMUP_NUM_EVAL,
#   INFERENCE_PRECISION, MULTI_GPU, MULTI_GPU_DEVICES
# Outputs:
#   One "name: value" line per setting on stdout; MULTI_GPU_DEVICES is listed
#   only when MULTI_GPU is exactly "1".
#######################################
print_warmup_config() {
  printf '%s\n' \
    "LeWM warmup" \
    " repo: ${REPO_ROOT}" \
    " python: ${PYTHON_BIN}" \
    " STABLEWM_HOME: ${STABLEWM_HOME}" \
    " ROCR_VISIBLE_DEVICES: ${ROCR_VISIBLE_DEVICES}" \
    " HIP_VISIBLE_DEVICES: ${HIP_VISIBLE_DEVICES}" \
    " CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES}" \
    " WARMUP_NUM_EVAL: ${WARMUP_NUM_EVAL}" \
    " INFERENCE_PRECISION: ${INFERENCE_PRECISION}" \
    " MULTI_GPU: ${MULTI_GPU}"
  if [[ "${MULTI_GPU}" == "1" ]]; then
    printf '%s\n' " MULTI_GPU_DEVICES: ${MULTI_GPU_DEVICES}"
  fi
}

print_warmup_config

# Defaults match the checkpoint names used in this repo. If onsite checkpoint
# folders differ, override by editing these calls or passing the equivalent
# eval.py command manually.
#
# Each call is: run_warmup <config name> <policy> <output file name>.
# The script runs under `set -e`, so a failing task stops the remaining ones.
run_warmup "pusht.yaml" "pusht/lewm" "warmup_pusht.txt"
run_warmup "reacher.yaml" "reacher/lewm" "warmup_reacher.txt"
run_warmup "cube.yaml" "cube/lewm" "warmup_cube.txt"
run_warmup "tworoom.yaml" "tworoom/lewm" "warmup_tworoom.txt"

printf '\n%s\n' "Warmup complete. Logs were appended under ${OUTPUT_DIR}."
Reference in New Issue
Block a user