#!/usr/bin/env bash set -euo pipefail # Manage the dask cluster used by TN path search. # # Defaults target two servers: # scheduler: 10.20.1.103:8786 # workers: 10.20.1.103, 10.20.6.101 # # Usage: # tools/manage_tn_dask_cluster.sh start # tools/manage_tn_dask_cluster.sh status # tools/manage_tn_dask_cluster.sh stop # # Common overrides: # SCHEDULER_HOST=10.20.1.103 # WORKER_HOSTS="10.20.1.103 10.20.6.101" # NWORKERS=48 # NTHREADS=1 # ROOT_DIR=/home/yx/qibotn # PYTHON_BIN=.venv/bin/python ROOT_DIR="${ROOT_DIR:-/home/yx/qibotn}" PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}" SCHEDULER_HOST="${SCHEDULER_HOST:-10.20.1.103}" SCHEDULER_PORT="${SCHEDULER_PORT:-8786}" DASHBOARD_ADDRESS="${DASHBOARD_ADDRESS:-:8787}" WORKER_HOSTS="${WORKER_HOSTS:-10.20.1.103 10.20.6.101}" NWORKERS="${NWORKERS:-84}" NTHREADS="${NTHREADS:-1}" MEMORY_LIMIT="${MEMORY_LIMIT:-0}" LOCAL_DIRECTORY="${LOCAL_DIRECTORY:-/tmp/qibotn-dask}" LOG_DIR="${LOG_DIR:-$ROOT_DIR/logs/dask}" SSH_BIN="${SSH_BIN:-ssh}" DASK_WORKER_TTL="${DASK_WORKER_TTL:-24 hours}" DASK_TICK_LIMIT="${DASK_TICK_LIMIT:-30 minutes}" DASK_LOST_WORKER_TIMEOUT="${DASK_LOST_WORKER_TIMEOUT:-30 minutes}" SCHEDULER_ADDR="tcp://${SCHEDULER_HOST}:${SCHEDULER_PORT}" is_local_host() { local host="$1" [[ "$host" == "localhost" || "$host" == "127.0.0.1" ]] && return 0 [[ "$host" == "$(hostname)" ]] && return 0 [[ "$host" == "$(hostname -f 2>/dev/null || true)" ]] && return 0 hostname -I 2>/dev/null | tr ' ' '\n' | grep -qx "$host" } run_on_host() { local host="$1" shift local cmd="$*" if is_local_host "$host"; then bash -lc "$cmd" else "$SSH_BIN" "$host" "bash -lc $(printf '%q' "$cmd")" fi } start_scheduler() { local host="$SCHEDULER_HOST" local log="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.log" local pid_file="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.pid" run_on_host "$host" " set -euo pipefail cd '$ROOT_DIR' mkdir -p '$LOG_DIR' if [[ -s '$pid_file' ]]; then pid=\$(cat '$pid_file') if kill -0 \"\$pid\" 2>/dev/null; then echo \"scheduler already running on $host pid=\$pid\" exit 0 fi fi DASK_DISTRIBUTED__SCHEDULER__WORKER_TTL='$DASK_WORKER_TTL' \ DASK_DISTRIBUTED__ADMIN__TICK__LIMIT='$DASK_TICK_LIMIT' \ DASK_DISTRIBUTED__DEPLOY__LOST_WORKER_TIMEOUT='$DASK_LOST_WORKER_TIMEOUT' \ setsid '$PYTHON_BIN' -m distributed.cli.dask_scheduler \ --host '$SCHEDULER_HOST' \ --port '$SCHEDULER_PORT' \ --dashboard-address '$DASHBOARD_ADDRESS' \ > '$log' 2>&1 < /dev/null & pid=\$! echo \"\$pid\" > '$pid_file' echo \"scheduler host=$host pid=\$pid addr=$SCHEDULER_ADDR log=$log\" " } start_worker() { local host="$1" local log="$LOG_DIR/worker_${host}.log" local pid_file="$LOG_DIR/worker_${host}.pid" run_on_host "$host" " set -euo pipefail cd '$ROOT_DIR' mkdir -p '$LOG_DIR' '$LOCAL_DIRECTORY' if [[ -s '$pid_file' ]]; then pid=\$(cat '$pid_file') if kill -0 \"\$pid\" 2>/dev/null; then echo \"worker already running on $host pid=\$pid\" exit 0 fi fi TCM_ENABLE=1 \ DASK_DISTRIBUTED__SCHEDULER__WORKER_TTL='$DASK_WORKER_TTL' \ DASK_DISTRIBUTED__ADMIN__TICK__LIMIT='$DASK_TICK_LIMIT' \ DASK_DISTRIBUTED__DEPLOY__LOST_WORKER_TIMEOUT='$DASK_LOST_WORKER_TIMEOUT' \ setsid '$PYTHON_BIN' -m distributed.cli.dask_worker \ '$SCHEDULER_ADDR' \ --host '$host' \ --nworkers '$NWORKERS' \ --nthreads '$NTHREADS' \ --memory-limit '$MEMORY_LIMIT' \ --local-directory '$LOCAL_DIRECTORY' \ > '$log' 2>&1 < /dev/null & pid=\$! echo \"\$pid\" > '$pid_file' echo \"worker host=$host pid=\$pid scheduler=$SCHEDULER_ADDR log=$log\" " } stop_host() { local host="$1" local scheduler_pid_file="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.pid" local worker_pid_file="$LOG_DIR/worker_${host}.pid" run_on_host "$host" " set +e for pid_file in '$worker_pid_file' '$scheduler_pid_file'; do [[ -f \"\$pid_file\" ]] || continue if [[ \"\$pid_file\" == '$scheduler_pid_file' && '$host' != '$SCHEDULER_HOST' ]]; then continue fi pid=\$(cat \"\$pid_file\") kill \"\$pid\" 2>/dev/null || true rm -f \"\$pid_file\" done pkill -f '[d]istributed.cli.dask_worker.*$SCHEDULER_ADDR' pkill -f '[d]istributed.cli.dask_scheduler.*--port $SCHEDULER_PORT' true " } status_host() { local host="$1" local scheduler_pid_file="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.pid" local worker_pid_file="$LOG_DIR/worker_${host}.pid" echo "--------------------------------------------------------------------------------" echo "host=$host" run_on_host "$host" " set +e for pid_file in '$worker_pid_file' '$scheduler_pid_file'; do [[ -f \"\$pid_file\" ]] || continue if [[ \"\$pid_file\" == '$scheduler_pid_file' && '$host' != '$SCHEDULER_HOST' ]]; then continue fi pid=\$(cat \"\$pid_file\") if kill -0 \"\$pid\" 2>/dev/null; then ps -p \"\$pid\" -o pid,ppid,stat,etime,cmd --no-headers else echo \"stale pid_file=\$pid_file pid=\$pid\" fi done pgrep -af '[d]istributed.cli.dask' || true " } case "${1:-help}" in start) start_scheduler sleep 2 for host in $WORKER_HOSTS; do start_worker "$host" done echo echo "Dask scheduler: $SCHEDULER_ADDR" echo "Dashboard: http://$SCHEDULER_HOST$DASHBOARD_ADDRESS" ;; stop) for host in $WORKER_HOSTS; do stop_host "$host" done stop_host "$SCHEDULER_HOST" ;; status) status_host "$SCHEDULER_HOST" for host in $WORKER_HOSTS; do [[ "$host" == "$SCHEDULER_HOST" ]] && continue status_host "$host" done ;; restart) "$0" stop sleep 2 "$0" start ;; help|*) cat <