This commit is contained in:
qihuanye
2026-05-17 19:23:31 +08:00
parent 02080e2564
commit 0164e21f48
8 changed files with 527 additions and 6 deletions

View File

@@ -44,6 +44,13 @@ mkdir -p "${OUTPUT_DIR}"
# ROCR_VISIBLE_DEVICES=2,3 HIP_VISIBLE_DEVICES=0,1 MULTI_GPU=1 MULTI_GPU_DEVICES='[0,1]'
# Opt-in switches, all overridable from the caller's environment.
# MULTI_GPU: compared against the literal string "1" later in the script;
# falls back to 0 when unset or empty.
MULTI_GPU="${MULTI_GPU:-0}"
# MULTI_GPU_DEVICES: device list to go with MULTI_GPU=1 (format as in the
# example above); falls back to [0,1] when unset or empty.
MULTI_GPU_DEVICES="${MULTI_GPU_DEVICES:-[0,1]}"
# MULTI_NODE: when "1", multi_node.enabled=true is appended to COMMON_ARGS;
# falls back to 0 when unset or empty.
MULTI_NODE="${MULTI_NODE:-0}"
# Multi-node warmup uses the same eval.py entrypoint under torchrun.
# Example:
# torchrun --nnodes=2 --nproc_per_node=2 --node_rank=0 --master_addr=<ip> --master_port=29500 \
# eval.py --config-name=pusht.yaml policy=pusht/lewm multi_node.enabled=true
# This script leaves multi-node launch to the caller.
COMMON_ARGS=(
"eval.num_eval=${WARMUP_NUM_EVAL}"
@@ -57,6 +64,12 @@ if [[ "${MULTI_GPU}" == "1" ]]; then
)
fi
# Forward the multi-node switch to the eval overrides. Only the exact value
# "1" enables it; any other value leaves COMMON_ARGS untouched.
case "${MULTI_NODE}" in
  1)
    COMMON_ARGS+=("multi_node.enabled=true")
    ;;
esac
run_warmup() {
local config_name="$1"
local policy="$2"
@@ -80,10 +93,11 @@ echo " HIP_VISIBLE_DEVICES: ${HIP_VISIBLE_DEVICES}"
# Print the effective settings to stdout, one "NAME: value" line each.
echo " CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES}"
echo " WARMUP_NUM_EVAL: ${WARMUP_NUM_EVAL}"
echo " INFERENCE_PRECISION: ${INFERENCE_PRECISION}"
# NOTE(review): the MULTI_GPU report below appears twice in this view. This
# looks like the removed and added sides of a diff hunk rather than an
# intentional double print; confirm against the real script before changing.
echo " MULTI_GPU: ${MULTI_GPU}"
# The device list is only reported when multi-GPU mode is switched on.
if [[ "${MULTI_GPU}" == "1" ]]; then
echo " MULTI_GPU_DEVICES: ${MULTI_GPU_DEVICES}"
fi
echo " MULTI_GPU: ${MULTI_GPU}"
if [[ "${MULTI_GPU}" == "1" ]]; then
echo " MULTI_GPU_DEVICES: ${MULTI_GPU_DEVICES}"
fi
echo " MULTI_NODE: ${MULTI_NODE}"
# Defaults match the checkpoint names used in this repo. If onsite checkpoint
# folders differ, override by editing these calls or passing the equivalent