#!/usr/bin/env bash
set -euo pipefail

# Launch 2-node LeWM evaluation from node-3.
#
# Defaults match the current cluster layout:
#   node-3: 10.16.200.9, node_rank=0
#   node-2: 10.16.200.8, node_rank=1
# Each node runs two local torchrun processes for two visible GPUs.
#
# Every setting below can be overridden through the environment variable of
# the same name. Switches read by this script:
#   DRY_RUN=1               print the remote/local commands and exit
#   TAIL_LOGS=1             stream both per-node logs to the terminal
#   PRELOAD_WAIT=1          add the preload_wait.* overrides to eval.py
#   PRELOAD_CLEAR_SIGNAL=1  remove PRELOAD_SIGNAL_FILE before launching
#   EXTRA_ARGS="a=1 b=2"    extra whitespace-separated eval.py arguments

REPO_ROOT="${REPO_ROOT:-/home/lewm/lewm}"
REMOTE_HOST="${REMOTE_HOST:-lewm@10.16.200.8}"
MASTER_ADDR="${MASTER_ADDR:-10.16.200.9}"
MASTER_PORT="${MASTER_PORT:-29500}"
NNODES="${NNODES:-2}"
NPROC_PER_NODE="${NPROC_PER_NODE:-2}"
CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1}"
STABLEWM_HOME="${STABLEWM_HOME:-/home/lewm/.stable-wm}"
CONFIG_NAME="${CONFIG_NAME:-pusht.yaml}"
POLICY="${POLICY:-pusht/lewm}"
OUTPUT_FILENAME="${OUTPUT_FILENAME:-pusht_multinode_results.txt}"
EXTRA_ARGS="${EXTRA_ARGS:-}"
DRY_RUN="${DRY_RUN:-0}"
TAIL_LOGS="${TAIL_LOGS:-1}"
PRELOAD_WAIT="${PRELOAD_WAIT:-0}"
PRELOAD_SIGNAL_FILE="${PRELOAD_SIGNAL_FILE:-/tmp/lewm_preload_start}"
PRELOAD_CLEAR_SIGNAL="${PRELOAD_CLEAR_SIGNAL:-1}"

# Both logs live on THIS machine: the remote rank's output arrives through
# the ssh connection and is redirected into REMOTE_LOG locally.
LOG_DIR="${LOG_DIR:-${REPO_ROOT}/logs/multinode}"
mkdir -p "${LOG_DIR}"
RUN_ID="$(date +%Y%m%d_%H%M%S)"
LOCAL_LOG="${LOG_DIR}/${RUN_ID}_node3_rank0.log"
REMOTE_LOG="${LOG_DIR}/${RUN_ID}_node2_rank1.log"

SSH_OPTS=(
  -F /dev/null
  -o StrictHostKeyChecking=no
  -o ServerAliveInterval=30
  -o ServerAliveCountMax=20
)

# Arguments handed to eval.py on both nodes.
COMMON_ARGS=(
  "--config-name=${CONFIG_NAME}"
  "policy=${POLICY}"
  "multi_node.enabled=true"
  "output.filename=${OUTPUT_FILENAME}"
)

if [[ "${PRELOAD_WAIT}" == "1" ]]; then
  COMMON_ARGS+=(
    "preload_wait.enabled=true"
    "preload_wait.file=${PRELOAD_SIGNAL_FILE}"
  )
fi

if [[ -n "${EXTRA_ARGS}" ]]; then
  # Split on whitespace only. An unquoted ${EXTRA_ARGS} would additionally be
  # glob-expanded, so words such as key=[1,2] or key=* could be rewritten
  # against files in the current directory.
  EXTRA_ARGS_ARRAY=()
  # With -d '' read consumes the whole value and reports EOF as a failure,
  # hence the "|| true" under set -e.
  read -r -d '' -a EXTRA_ARGS_ARRAY <<<"${EXTRA_ARGS}" || true
  if [[ "${#EXTRA_ARGS_ARRAY[@]}" -gt 0 ]]; then
    COMMON_ARGS+=("${EXTRA_ARGS_ARRAY[@]}")
  fi
fi

#######################################
# Build the shell command line that starts torchrun for one node.
# Globals:   REPO_ROOT, CUDA_VISIBLE_DEVICES, STABLEWM_HOME, COMMON_ARGS,
#            NNODES, NPROC_PER_NODE, MASTER_ADDR, MASTER_PORT (all read)
# Arguments: $1 - node rank passed to torchrun as --node_rank
# Outputs:   the command as a single shell-quoted string on stdout
#            (no trailing newline), suitable for `bash -lc`
#######################################
make_command() {
  local node_rank="$1"
  local repo_q cuda_q stablewm_q arg arg_q eval_args
  printf -v repo_q '%q' "${REPO_ROOT}"
  printf -v cuda_q '%q' "${CUDA_VISIBLE_DEVICES}"
  printf -v stablewm_q '%q' "${STABLEWM_HOME}"

  # Quote every eval.py argument individually; each one is prefixed with a
  # space so the result can be appended directly after "eval.py".
  eval_args=""
  for arg in "${COMMON_ARGS[@]}"; do
    printf -v arg_q '%q' "${arg}"
    eval_args+=" ${arg_q}"
  done

  # The first three and the last value are already quoted (%s); the torchrun
  # options are quoted by the format itself (%q).
  printf 'cd %s && source .venv/bin/activate && export CUDA_VISIBLE_DEVICES=%s && export STABLEWM_HOME=%s && torchrun --nnodes=%q --nproc_per_node=%q --node_rank=%q --master_addr=%q --master_port=%q eval.py%s' \
    "${repo_q}" \
    "${cuda_q}" \
    "${stablewm_q}" \
    "${NNODES}" \
    "${NPROC_PER_NODE}" \
    "${node_rank}" \
    "${MASTER_ADDR}" \
    "${MASTER_PORT}" \
    "${eval_args}"
}

REMOTE_CMD="$(make_command 1)"
LOCAL_CMD="$(make_command 0)"
# ssh passes its command through the remote shell, so the remote command needs
# one extra level of quoting to reach `bash -lc` as a single argument.
printf -v REMOTE_CMD_Q '%q' "${REMOTE_CMD}"

# Process bookkeeping shared with cleanup(); empty means "not started".
REMOTE_PID=""
LOCAL_PID=""
LOCAL_TAIL_PID=""
REMOTE_TAIL_PID=""
REMOTE_CLEANUP_CMD=""
REMOTE_CLEANUP_CMD_Q=""

#######################################
# Stream a log file to the terminal in the background, prefixing each line
# with "[label] ".
# Arguments: $1 - label shown in the prefix
#            $2 - path of the log file to follow
# Outputs:   none directly; the caller reads the background PID from $!
#######################################
start_log_tail() {
  local label="$1"
  local log_file="$2"
  local label_q log_q
  printf -v label_q '%q' "${label}"
  printf -v log_q '%q' "${log_file}"
  # setsid is used so stop_log_tails can signal "-PID" and reach both tail
  # and sed; -F keeps retrying if the log file does not exist yet.
  setsid bash -lc "tail -n +1 -F ${log_q} 2>/dev/null | sed -u 's/^/[${label_q}] /'" &
}

#######################################
# Terminate the log tails started by start_log_tail, if they are running.
# Globals:   LOCAL_TAIL_PID, REMOTE_TAIL_PID (read)
#######################################
stop_log_tails() {
  local pid
  for pid in "${LOCAL_TAIL_PID}" "${REMOTE_TAIL_PID}"; do
    if [[ -n "${pid}" ]] && kill -0 "${pid}" 2>/dev/null; then
      # Prefer the whole process group, fall back to the single process.
      kill -TERM "-${pid}" 2>/dev/null || kill -TERM "${pid}" 2>/dev/null || true
    fi
  done
}

#######################################
# Print the shell snippet that stops this run's processes on the remote node:
# SIGTERM for every pattern, a 2 second pause, then SIGKILL.
# Globals:   MASTER_ADDR, MASTER_PORT, OUTPUT_FILENAME (read)
# Outputs:   the snippet on stdout (no trailing newline)
#######################################
remote_cleanup_command() {
  local pattern pattern_q
  # pkill -f matches against full command lines, and the shell that runs this
  # snippet carries every pattern in ITS OWN command line. Writing the first
  # letter as a bracket expression ([t]orchrun) keeps the regex equivalent
  # while making sure the pattern text can never match itself; otherwise the
  # cleanup shell is signalled before it reaches the SIGKILL pass.
  local patterns=(
    "[t]orchrun .*--master_addr=${MASTER_ADDR} .*--master_port=${MASTER_PORT} .*eval.py"
    "[t]orchrun .*--master_port=${MASTER_PORT} .*eval.py"
    "[p]ython.*eval.py .*output.filename=${OUTPUT_FILENAME}"
  )

  printf 'set +e; '
  for pattern in "${patterns[@]}"; do
    printf -v pattern_q '%q' "${pattern}"
    printf 'pkill -TERM -f %s 2>/dev/null; ' "${pattern_q}"
  done
  printf 'sleep 2; '
  for pattern in "${patterns[@]}"; do
    printf -v pattern_q '%q' "${pattern}"
    printf 'pkill -KILL -f %s 2>/dev/null; ' "${pattern_q}"
  done
  printf 'true'
}

#######################################
# Trap handler: stop log tails, the local rank and the remote rank after a
# failure or a signal. Does nothing when the exit status is 0.
# Globals:   LOCAL_PID, REMOTE_PID, SSH_OPTS, REMOTE_HOST,
#            REMOTE_CLEANUP_CMD_Q, LOCAL_LOG, REMOTE_LOG (read)
# Arguments: $1 - optional exit status; defaults to $? at entry
# Returns:   0 when the status is 0, otherwise exits with the status
#######################################
cleanup() {
  # Signal traps pass the status explicitly. Bash defers a trap until the
  # current foreground command finishes, so $? may be 0 when a signal
  # arrives; relying on it would silently ignore the signal.
  local status="${1:-$?}"
  trap - INT TERM EXIT
  if [[ "${status}" -eq 0 ]]; then
    return 0
  fi

  echo
  echo "Stopping multi-node eval..."
  stop_log_tails

  if [[ -n "${LOCAL_PID}" ]] && kill -0 "${LOCAL_PID}" 2>/dev/null; then
    kill -TERM "-${LOCAL_PID}" 2>/dev/null || kill -TERM "${LOCAL_PID}" 2>/dev/null || true
  fi

  # Only touch the remote node when this run actually started something
  # there; the pkill patterns would otherwise hit unrelated jobs that use the
  # same master port.
  if [[ -n "${REMOTE_PID}" ]]; then
    if kill -0 "${REMOTE_PID}" 2>/dev/null; then
      kill -TERM "${REMOTE_PID}" 2>/dev/null || true
    fi
    # Best effort: the remote side may be unreachable or already clean.
    ssh "${SSH_OPTS[@]}" "${REMOTE_HOST}" "bash -lc ${REMOTE_CLEANUP_CMD_Q}" >/dev/null 2>&1 || true
  fi

  # Escalate locally if SIGTERM was not enough.
  if [[ -n "${LOCAL_PID}" ]] && kill -0 "${LOCAL_PID}" 2>/dev/null; then
    sleep 2
    kill -KILL "-${LOCAL_PID}" 2>/dev/null || kill -KILL "${LOCAL_PID}" 2>/dev/null || true
  fi

  echo "Cleanup requested. Check logs if any process was already exiting:"
  echo " local: ${LOCAL_LOG}"
  echo " remote: ${REMOTE_LOG}"
  exit "${status}"
}

# Prepare the remote cleanup command before arming the traps so cleanup()
# never runs with an empty command.
REMOTE_CLEANUP_CMD="$(remote_cleanup_command)"
printf -v REMOTE_CLEANUP_CMD_Q '%q' "${REMOTE_CLEANUP_CMD}"

trap 'cleanup 130' INT
trap 'cleanup 143' TERM
trap cleanup EXIT

echo "Launching multi-node eval"
echo " master: ${MASTER_ADDR}:${MASTER_PORT}"
echo " remote: ${REMOTE_HOST}"
echo " repo: ${REPO_ROOT}"
echo " stablewm: ${STABLEWM_HOME}"
echo " config: ${CONFIG_NAME}"
echo " policy: ${POLICY}"
echo " output: ${OUTPUT_FILENAME}"
echo " extra: ${EXTRA_ARGS:-}"
echo " tail logs: ${TAIL_LOGS}"
echo " preload wait: ${PRELOAD_WAIT}"
if [[ "${PRELOAD_WAIT}" == "1" ]]; then
  echo " preload signal: ${PRELOAD_SIGNAL_FILE}"
  echo " start command: touch ${PRELOAD_SIGNAL_FILE}"
fi
echo " local log: ${LOCAL_LOG}"
echo " remote log: ${REMOTE_LOG}"

if [[ "${DRY_RUN}" == "1" ]]; then
  echo
  echo "Remote command:"
  # The real launch hands "bash -lc <cmd>" to ssh as ONE argument. Quote that
  # argument once more so the printed line behaves the same when pasted into
  # a shell.
  printf -v REMOTE_SSH_ARG_Q '%q' "bash -lc ${REMOTE_CMD_Q}"
  echo "ssh ${SSH_OPTS[*]} ${REMOTE_HOST} ${REMOTE_SSH_ARG_Q}"
  echo
  echo "Local command:"
  printf -v LOCAL_CMD_Q '%q' "${LOCAL_CMD}"
  echo "bash -lc ${LOCAL_CMD_Q}"
  exit 0
fi

if [[ "${PRELOAD_WAIT}" == "1" && "${PRELOAD_CLEAR_SIGNAL}" == "1" ]]; then
  # Remove a leftover signal file from an earlier run.
  rm -f -- "${PRELOAD_SIGNAL_FILE}"
fi

echo "Starting remote node_rank=1..."
ssh "${SSH_OPTS[@]}" "${REMOTE_HOST}" "bash -lc ${REMOTE_CMD_Q}" >"${REMOTE_LOG}" 2>&1 &
REMOTE_PID="$!"
if [[ "${TAIL_LOGS}" == "1" ]]; then
  start_log_tail "node2" "${REMOTE_LOG}"
  # $! is the job that start_log_tail put in the background.
  REMOTE_TAIL_PID="$!"
fi

# Give the remote rank a head start before launching the local rank.
sleep 3

# If the remote launch has already failed (ssh error, missing environment,
# ...), starting the local rank would only leave it waiting for a peer that
# never joins. Report the failure right away instead.
REMOTE_STATUS=""
if ! kill -0 "${REMOTE_PID}" 2>/dev/null; then
  REMOTE_STATUS=0
  wait "${REMOTE_PID}" || REMOTE_STATUS="$?"
  if [[ "${REMOTE_STATUS}" -ne 0 ]]; then
    echo "Remote node_rank=1 exited with status ${REMOTE_STATUS} before the local launch."
    echo "===== remote tail ====="
    tail -n 80 "${REMOTE_LOG}" || true
    exit 1
  fi
fi

echo "Starting local node_rank=0..."
# From here on, exit statuses are collected by hand; a failing rank must not
# abort the script before both statuses are known.
set +e
setsid bash -lc "${LOCAL_CMD}" >"${LOCAL_LOG}" 2>&1 &
LOCAL_PID="$!"

if [[ "${TAIL_LOGS}" == "1" ]]; then
  start_log_tail "node3" "${LOCAL_LOG}"
  LOCAL_TAIL_PID="$!"
fi

wait "${LOCAL_PID}"
LOCAL_STATUS="$?"
# Skip the second wait when the remote status was already collected above.
if [[ -z "${REMOTE_STATUS}" ]]; then
  wait "${REMOTE_PID}"
  REMOTE_STATUS="$?"
fi
set -e

stop_log_tails
# Both ranks have finished; no cleanup is needed on the way out.
trap - INT TERM EXIT

echo "Local status: ${LOCAL_STATUS}"
echo "Remote status: ${REMOTE_STATUS}"
echo "Local log: ${LOCAL_LOG}"
echo "Remote log: ${REMOTE_LOG}"

if [[ "${LOCAL_STATUS}" -ne 0 || "${REMOTE_STATUS}" -ne 0 ]]; then
  echo "Multi-node eval failed. Tail logs:"
  echo "===== local tail ====="
  tail -n 80 "${LOCAL_LOG}" || true
  echo "===== remote tail ====="
  tail -n 80 "${REMOTE_LOG}" || true
  exit 1
fi

echo "Multi-node eval complete."