多机
This commit is contained in:
@@ -44,6 +44,13 @@ mkdir -p "${OUTPUT_DIR}"
|
||||
# ROCR_VISIBLE_DEVICES=2,3 HIP_VISIBLE_DEVICES=0,1 MULTI_GPU=1 MULTI_GPU_DEVICES='[0,1]'
|
||||
MULTI_GPU="${MULTI_GPU:-0}"
|
||||
MULTI_GPU_DEVICES="${MULTI_GPU_DEVICES:-[0,1]}"
|
||||
MULTI_NODE="${MULTI_NODE:-0}"
|
||||
|
||||
# Multi-node warmup uses the same eval.py entrypoint under torchrun.
|
||||
# Example:
|
||||
# torchrun --nnodes=2 --nproc_per_node=2 --node_rank=0 --master_addr=<ip> --master_port=29500 \
|
||||
# eval.py --config-name=pusht.yaml policy=pusht/lewm multi_node.enabled=true
|
||||
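# This script does not launch torchrun itself: with MULTI_NODE=1 it only appends multi_node.enabled=true to COMMON_ARGS, and the caller is responsible for the multi-node launch.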
|
||||
|
||||
COMMON_ARGS=(
|
||||
"eval.num_eval=${WARMUP_NUM_EVAL}"
|
||||
@@ -57,6 +64,12 @@ if [[ "${MULTI_GPU}" == "1" ]]; then
|
||||
)
|
||||
fi
|
||||
|
||||
if [[ "${MULTI_NODE}" == "1" ]]; then
|
||||
COMMON_ARGS+=(
|
||||
"multi_node.enabled=true"
|
||||
)
|
||||
fi
|
||||
|
||||
run_warmup() {
|
||||
local config_name="$1"
|
||||
local policy="$2"
|
||||
@@ -80,10 +93,11 @@ echo " HIP_VISIBLE_DEVICES: ${HIP_VISIBLE_DEVICES}"
|
||||
echo " CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES}"
|
||||
echo " WARMUP_NUM_EVAL: ${WARMUP_NUM_EVAL}"
|
||||
echo " INFERENCE_PRECISION: ${INFERENCE_PRECISION}"
|
||||
echo " MULTI_GPU: ${MULTI_GPU}"
|
||||
if [[ "${MULTI_GPU}" == "1" ]]; then
|
||||
echo " MULTI_GPU_DEVICES: ${MULTI_GPU_DEVICES}"
|
||||
fi
|
||||
echo " MULTI_GPU: ${MULTI_GPU}"
|
||||
if [[ "${MULTI_GPU}" == "1" ]]; then
|
||||
echo " MULTI_GPU_DEVICES: ${MULTI_GPU_DEVICES}"
|
||||
fi
|
||||
echo " MULTI_NODE: ${MULTI_NODE}"
|
||||
|
||||
# Defaults match the checkpoint names used in this repo. If onsite checkpoint
|
||||
# folders differ, override by editing these calls or passing the equivalent
|
||||
|
||||
Reference in New Issue
Block a user