#!/usr/bin/env bash
#
# Launch a 2-node LeWM evaluation from node-3.
#
# Defaults match the current cluster layout:
#   node-3: 10.16.200.9, node_rank=0
#   node-2: 10.16.200.8, node_rank=1
# Each node runs two local torchrun processes for two visible GPUs.
#
# Every setting below is read from the environment first and falls back to
# the default shown, e.g.:
#   DRY_RUN=1 POLICY=pusht/lewm <this script>
set -euo pipefail

# Checkout that is cd'ed into on both nodes before torchrun starts; the
# launch command sources .venv/bin/activate and runs eval.py from here.
REPO_ROOT="${REPO_ROOT:-/home/lewm/lewm}"
# ssh destination used to start (and, on failure, clean up) node_rank=1.
REMOTE_HOST="${REMOTE_HOST:-lewm@10.16.200.8}"
# Rendezvous endpoint handed to torchrun on both nodes.
MASTER_ADDR="${MASTER_ADDR:-10.16.200.9}"
MASTER_PORT="${MASTER_PORT:-29500}"

# torchrun topology, plus the environment exported before torchrun runs.
NNODES="${NNODES:-2}"
NPROC_PER_NODE="${NPROC_PER_NODE:-2}"
CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1}"
STABLEWM_HOME="${STABLEWM_HOME:-/home/lewm/.stable-wm}"

# eval.py arguments: --config-name=..., policy=..., output.filename=...
CONFIG_NAME="${CONFIG_NAME:-pusht.yaml}"
POLICY="${POLICY:-pusht/lewm}"
OUTPUT_FILENAME="${OUTPUT_FILENAME:-pusht_multinode_results.txt}"
# Extra eval.py arguments, separated by whitespace.
EXTRA_ARGS="${EXTRA_ARGS:-}"
# 1 = print the remote and local commands and exit without launching.
DRY_RUN="${DRY_RUN:-0}"
# 1 = stream both node logs to this terminal, each line prefixed by its node.
TAIL_LOGS="${TAIL_LOGS:-1}"
# 1 = pass preload_wait.enabled=true and preload_wait.file=... to eval.py.
PRELOAD_WAIT="${PRELOAD_WAIT:-0}"
PRELOAD_SIGNAL_FILE="${PRELOAD_SIGNAL_FILE:-/tmp/lewm_preload_start}"
# 1 = remove a stale local signal file before launching (PRELOAD_WAIT=1 only).
PRELOAD_CLEAR_SIGNAL="${PRELOAD_CLEAR_SIGNAL:-1}"

# Per-run log files. Both logs are written on this host: the remote node's
# output is captured from the ssh process's stdout/stderr.
LOG_DIR="${LOG_DIR:-${REPO_ROOT}/logs/multinode}"
mkdir -p "${LOG_DIR}"
# Timestamp (second resolution) that ties the two log files of a run together.
RUN_ID="$(date +%Y%m%d_%H%M%S)"
LOCAL_LOG="${LOG_DIR}/${RUN_ID}_node3_rank0.log"
REMOTE_LOG="${LOG_DIR}/${RUN_ID}_node2_rank1.log"

# Options shared by every ssh call in this script.
SSH_OPTS=(
  # Ignore any ssh config file so behaviour depends only on these options.
  -F /dev/null
  # Do not stop on unknown or changed host keys.
  -o StrictHostKeyChecking=no
  # Send a keep-alive every 30 s; give up after 20 unanswered ones.
  -o ServerAliveInterval=30
  -o ServerAliveCountMax=20
)

# eval.py arguments that are identical on every node.
COMMON_ARGS=(
  "--config-name=${CONFIG_NAME}"
  "policy=${POLICY}"
  "multi_node.enabled=true"
  "output.filename=${OUTPUT_FILENAME}"
)

if [[ "${PRELOAD_WAIT}" == "1" ]]; then
  COMMON_ARGS+=(
    "preload_wait.enabled=true"
    "preload_wait.file=${PRELOAD_SIGNAL_FILE}"
  )
fi

#######################################
# Append the whitespace-separated words of a string to COMMON_ARGS.
# The words are taken verbatim: unlike an unquoted expansion they are NOT
# subject to pathname expansion, so overrides such as `seed=*` or
# `ids=[1,2]` can never be replaced by matching file names.
# Globals:   COMMON_ARGS (appended to)
# Arguments: $1 - string of space/tab/newline separated arguments
#######################################
append_extra_args() {
  local -a words=()
  # -d '' reads the whole string; read then reports EOF (non-zero), which is
  # expected and must not trip `set -e`.
  IFS=$' \t\n' read -r -d '' -a words <<<"$1" || true
  if ((${#words[@]} > 0)); then
    COMMON_ARGS+=("${words[@]}")
  fi
}

if [[ -n "${EXTRA_ARGS}" ]]; then
  append_extra_args "${EXTRA_ARGS}"
fi

#######################################
# Build the shell command line that starts one node's torchrun workers.
# Every interpolated value is %q-quoted, so the result can be handed to
# `bash -lc` (locally or through ssh) and re-parsed without changing it.
# Globals:   REPO_ROOT, CUDA_VISIBLE_DEVICES, STABLEWM_HOME, NNODES,
#            NPROC_PER_NODE, MASTER_ADDR, MASTER_PORT, COMMON_ARGS (all read)
# Arguments: $1 - node rank passed to torchrun as --node_rank
# Outputs:   the command string on stdout, without a trailing newline
#######################################
make_command() {
  local node_rank="${1:?make_command: node rank required}"
  # `arg` is declared here so the loop variable does not leak into the
  # caller's scope.
  local repo_q cuda_q stablewm_q arg arg_q eval_args

  printf -v repo_q '%q' "${REPO_ROOT}"
  printf -v cuda_q '%q' "${CUDA_VISIBLE_DEVICES}"
  printf -v stablewm_q '%q' "${STABLEWM_HOME}"

  # Each argument is quoted on its own so values containing spaces stay a
  # single word; every entry is prefixed with the separating space.
  eval_args=""
  for arg in "${COMMON_ARGS[@]}"; do
    printf -v arg_q '%q' "${arg}"
    eval_args+=" ${arg_q}"
  done

  # The pre-quoted pieces go in through %s; the raw values through %q.
  printf 'cd %s && source .venv/bin/activate && export CUDA_VISIBLE_DEVICES=%s && export STABLEWM_HOME=%s && torchrun --nnodes=%q --nproc_per_node=%q --node_rank=%q --master_addr=%q --master_port=%q eval.py%s' \
    "${repo_q}" \
    "${cuda_q}" \
    "${stablewm_q}" \
    "${NNODES}" \
    "${NPROC_PER_NODE}" \
    "${node_rank}" \
    "${MASTER_ADDR}" \
    "${MASTER_PORT}" \
    "${eval_args}"
}

# Command lines for both ranks. The remote one is quoted once more because
# it is embedded in the "bash -lc ..." string that ssh hands to the remote
# shell.
REMOTE_CMD="$(make_command 1)"
LOCAL_CMD="$(make_command 0)"
printf -v REMOTE_CMD_Q '%q' "${REMOTE_CMD}"

# Process bookkeeping read by cleanup and stop_log_tails. Empty means
# "not started yet".
REMOTE_PID=""
LOCAL_PID=""
LOCAL_TAIL_PID=""
REMOTE_TAIL_PID=""
REMOTE_CLEANUP_CMD=""
REMOTE_CLEANUP_CMD_Q=""

#######################################
# Follow a log file in the background and echo every line to stdout,
# prefixed with "[<label>] ".
# The follower runs under setsid, which is what lets stop_log_tails signal
# it via "-<pid>"; the caller reads its PID from $! right after this call.
# Arguments: $1 - label shown in the prefix (must not contain a newline)
#            $2 - path of the log file to follow (it may not exist yet;
#                 tail -F keeps retrying)
#######################################
start_log_tail() {
  local label="$1"
  local log_file="$2"
  local label_sed="" ch i

  # Escape the characters that are special in a sed replacement (\ / &)
  # so that any label is reproduced literally.
  for ((i = 0; i < ${#label}; i++)); do
    ch="${label:i:1}"
    case "${ch}" in
      '\' | '/' | '&') label_sed+="\\${ch}" ;;
      *) label_sed+="${ch}" ;;
    esac
  done

  # The path and the sed script are passed as positional parameters rather
  # than spliced into the script text, so no extra quoting layer is needed.
  setsid bash -lc 'tail -n +1 -F -- "$1" 2>/dev/null | sed -u "$2"' \
    lewm-log-tail "${log_file}" "s/^/[${label_sed}] /" &
}

#######################################
# Stop the background log followers started by start_log_tail.
# Globals: LOCAL_TAIL_PID, REMOTE_TAIL_PID (read; empty means not started)
# Returns: always 0, so it is safe to call under `set -e`
#######################################
stop_log_tails() {
  local pid
  for pid in "${LOCAL_TAIL_PID}" "${REMOTE_TAIL_PID}"; do
    if [[ -n "${pid}" ]] && kill -0 "${pid}" 2>/dev/null; then
      # Signal "-<pid>" first so the tail and sed behind the wrapper go down
      # together; fall back to the single PID if that is rejected.
      kill -TERM "-${pid}" 2>/dev/null || kill -TERM "${pid}" 2>/dev/null || true
    fi
  done
}

#######################################
# Print the shell snippet that is run on the remote node to stop this
# evaluation: pkill -TERM for every pattern, a 2 s grace period, then
# pkill -KILL for whatever is left.
# Globals: MASTER_ADDR, MASTER_PORT, OUTPUT_FILENAME (read)
# Outputs: the snippet on stdout, without a trailing newline
#######################################
remote_cleanup_command() {
  local pattern pattern_q
  # pkill -f matches against full command lines, and the shell running this
  # snippet carries the snippet text in its own command line. Writing the
  # first letter as a bracket expression ("[t]orchrun", "[p]ython") still
  # matches the real processes but not the text of the pattern itself, so
  # the cleanup shell cannot be signalled by its own TERM pass before it
  # reaches the KILL pass.
  local patterns=(
    "[t]orchrun .*--master_addr=${MASTER_ADDR} .*--master_port=${MASTER_PORT} .*eval.py"
    "[t]orchrun .*--master_port=${MASTER_PORT} .*eval.py"
    "[p]ython.*eval.py .*output.filename=${OUTPUT_FILENAME}"
  )

  # Best effort: a pkill that matches nothing must not abort the snippet.
  printf 'set +e; '
  for pattern in "${patterns[@]}"; do
    printf -v pattern_q '%q' "${pattern}"
    printf 'pkill -TERM -f %s 2>/dev/null; ' "${pattern_q}"
  done
  printf 'sleep 2; '
  for pattern in "${patterns[@]}"; do
    printf -v pattern_q '%q' "${pattern}"
    printf 'pkill -KILL -f %s 2>/dev/null; ' "${pattern_q}"
  done
  printf 'true'
}

#######################################
# INT/TERM/EXIT handler: stop the log followers, the local rank, the ssh
# launcher and the remote processes, then exit with a failure status.
# Globals: LOCAL_PID, REMOTE_PID, REMOTE_CLEANUP_CMD_Q, SSH_OPTS,
#          REMOTE_HOST, LOCAL_LOG, REMOTE_LOG (all read)
# Returns: returns 0 only for a clean exit before anything was launched;
#          otherwise it does not return and exits with the failing status
#######################################
cleanup() {
  local status="$?"
  # Disarm first so the `exit` below does not re-enter this handler.
  trap - INT TERM EXIT

  if [[ "${status}" -eq 0 ]]; then
    # Clean exit with nothing launched (e.g. the DRY_RUN exit): no teardown.
    if [[ -z "${LOCAL_PID}" && -z "${REMOTE_PID}" ]]; then
      return 0
    fi
    # A signal is handled after the current command finishes, so $? can be
    # 0 here even though the run was interrupted (e.g. SIGTERM during the
    # `sleep 3` between the two launches). Once something has been launched,
    # always tear it down and report an interrupted run.
    status=130
  fi

  echo
  echo "Stopping multi-node eval..."
  stop_log_tails

  # The local rank was started under setsid, so "-<pid>" is tried first to
  # reach torchrun and its workers as well.
  if [[ -n "${LOCAL_PID}" ]] && kill -0 "${LOCAL_PID}" 2>/dev/null; then
    kill -TERM "-${LOCAL_PID}" 2>/dev/null || kill -TERM "${LOCAL_PID}" 2>/dev/null || true
  fi

  # This only stops the local ssh client; the remote side is handled next.
  if [[ -n "${REMOTE_PID}" ]] && kill -0 "${REMOTE_PID}" 2>/dev/null; then
    kill -TERM "${REMOTE_PID}" 2>/dev/null || true
  fi

  # Only touch the remote node if this run started something there and the
  # cleanup command has been built; otherwise the pkill patterns could only
  # hit processes that do not belong to this run.
  if [[ -n "${REMOTE_PID}" && -n "${REMOTE_CLEANUP_CMD_Q}" ]]; then
    ssh "${SSH_OPTS[@]}" "${REMOTE_HOST}" "bash -lc ${REMOTE_CLEANUP_CMD_Q}" >/dev/null 2>&1 || true
  fi

  # Escalate to KILL if the local rank is still alive after a short grace
  # period.
  if [[ -n "${LOCAL_PID}" ]] && kill -0 "${LOCAL_PID}" 2>/dev/null; then
    sleep 2
    kill -KILL "-${LOCAL_PID}" 2>/dev/null || kill -KILL "${LOCAL_PID}" 2>/dev/null || true
  fi

  echo "Cleanup requested. Check logs if any process was already exiting:"
  echo " local: ${LOCAL_LOG}"
  echo " remote: ${REMOTE_LOG}"
  exit "${status}"
}

# From here on every exit path goes through cleanup.
trap cleanup INT TERM EXIT

# Quoted once more because it is embedded in the "bash -lc ..." string that
# ssh hands to the remote shell.
REMOTE_CLEANUP_CMD="$(remote_cleanup_command)"
printf -v REMOTE_CLEANUP_CMD_Q '%q' "${REMOTE_CLEANUP_CMD}"

echo "Launching multi-node eval"
echo " master: ${MASTER_ADDR}:${MASTER_PORT}"
echo " remote: ${REMOTE_HOST}"
echo " repo: ${REPO_ROOT}"
echo " stablewm: ${STABLEWM_HOME}"
echo " config: ${CONFIG_NAME}"
echo " policy: ${POLICY}"
echo " output: ${OUTPUT_FILENAME}"
echo " extra: ${EXTRA_ARGS:-<none>}"
echo " tail logs: ${TAIL_LOGS}"
echo " preload wait: ${PRELOAD_WAIT}"
if [[ "${PRELOAD_WAIT}" == "1" ]]; then
  echo " preload signal: ${PRELOAD_SIGNAL_FILE}"
  echo " start command: touch ${PRELOAD_SIGNAL_FILE}"
fi
echo " local log: ${LOCAL_LOG}"
echo " remote log: ${REMOTE_LOG}"

if [[ "${DRY_RUN}" == "1" ]]; then
  echo
  echo "Remote command:"
  # The real launch passes "bash -lc <quoted command>" to ssh as ONE
  # argument. Quote that whole string once more so the printed line, pasted
  # into a shell, reproduces exactly that argument.
  printf -v REMOTE_INVOCATION_Q '%q' "bash -lc ${REMOTE_CMD_Q}"
  echo "ssh ${SSH_OPTS[*]} ${REMOTE_HOST} ${REMOTE_INVOCATION_Q}"
  echo
  echo "Local command:"
  printf -v LOCAL_CMD_Q '%q' "${LOCAL_CMD}"
  echo "bash -lc ${LOCAL_CMD_Q}"
  # Exit status 0 with nothing launched: cleanup returns without teardown.
  exit 0
fi

# Remove a stale signal file on this host so the run does not start early.
if [[ "${PRELOAD_WAIT}" == "1" && "${PRELOAD_CLEAR_SIGNAL}" == "1" ]]; then
  rm -f "${PRELOAD_SIGNAL_FILE}"
fi

# Start rank 1 first; its output is captured locally through the ssh pipe.
echo "Starting remote node_rank=1..."
ssh "${SSH_OPTS[@]}" "${REMOTE_HOST}" "bash -lc ${REMOTE_CMD_Q}" >"${REMOTE_LOG}" 2>&1 &
REMOTE_PID="$!"

if [[ "${TAIL_LOGS}" == "1" ]]; then
  start_log_tail "node2" "${REMOTE_LOG}"
  REMOTE_TAIL_PID="$!"
fi

# Head start for the remote side before the local rank is launched.
sleep 3

echo "Starting local node_rank=0..."
# From here the exit statuses are collected by hand, so a failing rank must
# not abort the script before both have been waited for.
set +e
# setsid lets cleanup signal the local rank via "-<pid>".
setsid bash -lc "${LOCAL_CMD}" >"${LOCAL_LOG}" 2>&1 &
LOCAL_PID="$!"

if [[ "${TAIL_LOGS}" == "1" ]]; then
  start_log_tail "node3" "${LOCAL_LOG}"
  LOCAL_TAIL_PID="$!"
fi

wait "${LOCAL_PID}"
LOCAL_STATUS="$?"

wait "${REMOTE_PID}"
REMOTE_STATUS="$?"
set -e

# Both ranks have finished: stop the followers and disarm the handler so a
# normal exit does not trigger the teardown.
stop_log_tails
trap - INT TERM EXIT

echo "Local status: ${LOCAL_STATUS}"
echo "Remote status: ${REMOTE_STATUS}"
echo "Local log: ${LOCAL_LOG}"
echo "Remote log: ${REMOTE_LOG}"

if [[ "${LOCAL_STATUS}" -ne 0 || "${REMOTE_STATUS}" -ne 0 ]]; then
  echo "Multi-node eval failed. Tail logs:"
  echo "===== local tail ====="
  tail -80 "${LOCAL_LOG}" || true
  echo "===== remote tail ====="
  tail -80 "${REMOTE_LOG}" || true
  exit 1
fi

echo "Multi-node eval complete."