补充
Some checks failed
Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled
Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled
Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled
Tests / check (push) Has been cancelled
Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled
Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled
Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled
Some checks failed
Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled
Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled
Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled
Tests / check (push) Has been cancelled
Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled
Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled
Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled
This commit is contained in:
@@ -5,7 +5,7 @@ set -euo pipefail
|
||||
#
|
||||
# Defaults target two servers:
|
||||
# scheduler: 10.20.1.103:8786
|
||||
# workers: 10.20.1.103, 10.20.1.102
|
||||
# workers: 10.20.1.103, 10.20.6.101
|
||||
#
|
||||
# Usage:
|
||||
# tools/manage_tn_dask_cluster.sh start
|
||||
@@ -14,7 +14,7 @@ set -euo pipefail
|
||||
#
|
||||
# Common overrides:
|
||||
# SCHEDULER_HOST=10.20.1.103
|
||||
# WORKER_HOSTS="10.20.1.103 10.20.1.102"
|
||||
# WORKER_HOSTS="10.20.1.103 10.20.6.101"
|
||||
# NWORKERS=48
|
||||
# NTHREADS=1
|
||||
# ROOT_DIR=/home/yx/qibotn
|
||||
@@ -25,8 +25,8 @@ PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
|
||||
SCHEDULER_HOST="${SCHEDULER_HOST:-10.20.1.103}"
|
||||
SCHEDULER_PORT="${SCHEDULER_PORT:-8786}"
|
||||
DASHBOARD_ADDRESS="${DASHBOARD_ADDRESS:-:8787}"
|
||||
WORKER_HOSTS="${WORKER_HOSTS:-10.20.1.103 10.20.1.102}"
|
||||
NWORKERS="${NWORKERS:-48}"
|
||||
WORKER_HOSTS="${WORKER_HOSTS:-10.20.1.103 10.20.6.101}"
|
||||
NWORKERS="${NWORKERS:-84}"
|
||||
NTHREADS="${NTHREADS:-1}"
|
||||
MEMORY_LIMIT="${MEMORY_LIMIT:-0}"
|
||||
LOCAL_DIRECTORY="${LOCAL_DIRECTORY:-/tmp/qibotn-dask}"
|
||||
|
||||
93
tools/run_tn_dask_mpi_all.sh
Executable file
93
tools/run_tn_dask_mpi_all.sh
Executable file
@@ -0,0 +1,93 @@
|
||||
#!/usr/bin/env bash
#
# End-to-end driver for one tensor-network run: start the dask cluster, run
# the tree search against it, copy the tree directory to the other hosts, then
# run the contraction under MPI.
#
# Every setting below is read from the environment and falls back to the
# default shown, e.g.:
#   CASE=main1 NQUBITS=34 tools/run_tn_dask_mpi_all.sh
set -euo pipefail

# Work from the repository root (the parent of this script's directory) so the
# relative tools/... and .venv/... paths used below resolve.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT_DIR"

# Problem selection; forwarded to tn_contest_runner.py as --case,
# --observables, --nqubits and --nlayers.
CASE="${CASE:-main1}"
OBSERVABLES="${OBSERVABLES:-long_z_string}" # whitespace-separated list
NQUBITS="${NQUBITS:-34}"
NLAYERS="${NLAYERS:-20}"

# Forwarded as --torch-threads, --tn-search-repeats and --tn-search-time.
TORCH_THREADS="${TORCH_THREADS:-48}"
SEARCH_REPEATS="${SEARCH_REPEATS:-2048}"
SEARCH_TIME="${SEARCH_TIME:-300}"

# Forwarded as --tn-target-size (8589934592 = 2^33) and, only when non-empty,
# --tn-target-slices.
TN_TARGET_SIZE="${TN_TARGET_SIZE:-8589934592}"
TN_TARGET_SLICES="${TN_TARGET_SLICES:-}"

# Interpreter, numeric dtype and tree directory. A relative TREE_DIR is
# resolved against ROOT_DIR when trees are synced to other hosts.
PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
DTYPE="${DTYPE:-complex64}"
TREE_DIR="${TREE_DIR:-trees/contest_tn}"

# Dask scheduler address for the search step, and the launcher prefix
# (whitespace-split into argv) for the MPI contraction step.
DASK_ADDRESS="${DASK_ADDRESS:-tcp://10.20.1.103:8786}"
MPIEXEC_FULL="${MPIEXEC_FULL:-mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2}"

# Tree sync: enabled when SYNC_TREES is 1. SYNC_HOSTS defaults to WORKER_HOSTS
# from the environment; if neither is set the list is empty and the sync step
# does nothing.
SYNC_TREES="${SYNC_TREES:-1}"
SYNC_HOSTS="${SYNC_HOSTS:-${WORKER_HOSTS:-}}"
SSH_BIN="${SSH_BIN:-ssh}"

# Exported so child processes (python, mpirun ranks) inherit it.
export TCM_ENABLE="${TCM_ENABLE:-1}"

# Slicing flags shared by the search and contract invocations.
tn_slice_args=(--tn-target-size "$TN_TARGET_SIZE")
if [[ -n "$TN_TARGET_SLICES" ]]; then
  tn_slice_args+=(--tn-target-slices "$TN_TARGET_SLICES")
fi

#######################################
# Report whether a host name or address refers to this machine.
# Arguments: $1 - host name or IP address to test
# Returns:   0 if $1 is "localhost", "127.0.0.1", the output of `hostname`,
#            the output of `hostname -f`, or one of the addresses printed by
#            `hostname -I`; 1 otherwise (including for an empty argument).
#######################################
is_local_host() {
  local host="${1:-}"
  local short_name fqdn addr_list addr
  local -a local_addrs=()

  # An empty name can never be local; without this guard it would compare
  # equal to the empty output of a failed `hostname -f`.
  [[ -n "$host" ]] || return 1

  if [[ "$host" == "localhost" || "$host" == "127.0.0.1" ]]; then
    return 0
  fi

  short_name="$(hostname 2>/dev/null || true)"
  if [[ "$host" == "$short_name" ]]; then
    return 0
  fi

  # `hostname -f` may fail (e.g. no resolvable FQDN); treat that as no match.
  fqdn="$(hostname -f 2>/dev/null || true)"
  if [[ "$host" == "$fqdn" ]]; then
    return 0
  fi

  # Compare against each local address as a literal string. A regex match
  # (grep -x without -F) would let the dots in an IP match any character, and
  # a `grep -q` pipeline can report failure under pipefail via SIGPIPE.
  addr_list="$(hostname -I 2>/dev/null || true)"
  addr_list="${addr_list//$'\n'/ }"
  read -r -a local_addrs <<< "$addr_list"
  for addr in ${local_addrs[@]+"${local_addrs[@]}"}; do
    if [[ "$host" == "$addr" ]]; then
      return 0
    fi
  done

  return 1
}

#######################################
# Copy the tree directory to every non-local host in SYNC_HOSTS.
# Globals:   SYNC_TREES (read) - sync runs only when this is "1"
#            SYNC_HOSTS (read) - whitespace-separated host list
#            TREE_DIR, ROOT_DIR (read) - a relative TREE_DIR is resolved
#              against ROOT_DIR; the same absolute path is used on both ends
#            SSH_BIN (read)    - command used to create the remote directory
# Outputs:   one progress line per synced host on stdout; problems on stderr
# Returns:   0 on success or when there is nothing to do; non-zero if the
#            source directory is missing or a remote command fails.
#######################################
sync_trees_to_hosts() {
  [[ "$SYNC_TREES" == "1" ]] || return 0
  [[ -n "$SYNC_HOSTS" ]] || return 0

  local src_dir="$TREE_DIR"
  if [[ "$TREE_DIR" != /* ]]; then
    src_dir="$ROOT_DIR/$TREE_DIR"
  fi
  # Source and destination share one absolute path.
  local dst_dir="$src_dir"

  local -a hosts=() pkl_files=()
  local host pkl remote_mkdir

  # Split the host list on whitespace without glob-expanding its words.
  # read -d '' returns non-zero at end of input even though the array is
  # filled, hence the `|| true`.
  read -r -d '' -a hosts <<< "$SYNC_HOSTS" || true

  # Shell-quote the path because ssh hands the command string to a remote shell.
  printf -v remote_mkdir 'mkdir -p %q' "$dst_dir"

  for host in ${hosts[@]+"${hosts[@]}"}; do
    if is_local_host "$host"; then
      continue
    fi

    # Checked lazily so a run with only local hosts never needs the directory.
    if [[ ! -d "$src_dir" ]]; then
      echo "Tree dir not found: $src_dir" >&2
      return 1
    fi

    echo "Sync tree dir to $host:$dst_dir"
    "$SSH_BIN" "$host" "$remote_mkdir"

    if command -v rsync >/dev/null 2>&1; then
      rsync -a "$src_dir/" "$host:$dst_dir/"
    else
      # Fallback copies only *.pkl files. Collect matches first so an
      # unmatched glob is not handed to scp as a literal file name.
      pkl_files=()
      for pkl in "$src_dir"/*.pkl; do
        if [[ -e "$pkl" ]]; then
          pkl_files+=("$pkl")
        fi
      done
      if ((${#pkl_files[@]} == 0)); then
        echo "No .pkl files in $src_dir; nothing to copy to $host" >&2
        continue
      fi
      scp -q "${pkl_files[@]}" "$host:$dst_dir/"
    fi
  done
}

# Step 1: bring up the dask cluster used by the search step.
tools/manage_tn_dask_cluster.sh start

# OBSERVABLES is a whitespace-separated list. Split it into an array once so
# each name becomes its own argument without being glob-expanded against the
# current directory. read -d '' returns non-zero at end of input even though
# the array is filled, hence the `|| true`.
observables=()
read -r -d '' -a observables <<< "$OBSERVABLES" || true

# Step 2: tree search, distributed over the dask cluster.
echo "Search with dask: $DASK_ADDRESS"
"$PYTHON_BIN" -u tools/tn_contest_runner.py search \
  --case "$CASE" \
  --nqubits "$NQUBITS" \
  --nlayers "$NLAYERS" \
  --observables ${observables[@]+"${observables[@]}"} \
  --tree-dir "$TREE_DIR" \
  --dask-address "$DASK_ADDRESS" \
  --torch-threads "$TORCH_THREADS" \
  --dtype "$DTYPE" \
  --tn-search-repeats "$SEARCH_REPEATS" \
  --tn-search-time "$SEARCH_TIME" \
  "${tn_slice_args[@]}"

# Step 3: copy the tree directory to the other hosts before the MPI step
# reads the same --tree-dir.
sync_trees_to_hosts

# Step 4: contraction under MPI. MPIEXEC_FULL is a launcher command line and
# is split on whitespace into the argv prefix.
echo "Contract with MPI: $MPIEXEC_FULL"
read -r -a mpi_prefix <<< "$MPIEXEC_FULL"
"${mpi_prefix[@]}" "$PYTHON_BIN" -u tools/tn_contest_runner.py contract \
  --mpi \
  --case "$CASE" \
  --nqubits "$NQUBITS" \
  --nlayers "$NLAYERS" \
  --observables ${observables[@]+"${observables[@]}"} \
  --tree-dir "$TREE_DIR" \
  --torch-threads "$TORCH_THREADS" \
  --dtype "$DTYPE" \
  "${tn_slice_args[@]}"
|
||||
@@ -199,7 +199,7 @@ def build_parallel_opts(args, tree_file=None, search_only=False):
|
||||
"search_workers": args.tn_search_workers or args.torch_threads,
|
||||
"max_repeats": args.tn_search_repeats,
|
||||
"max_time": args.tn_search_time,
|
||||
"print_stats": not args.no_tn_stats,
|
||||
"print_stats": False,
|
||||
}
|
||||
if args.tn_search_backend is not None:
|
||||
opts["search_backend"] = args.tn_search_backend
|
||||
@@ -303,7 +303,7 @@ def run_one(args, case_name, obs_name, mode):
|
||||
f"failed_trials={search_stats.get('failed_trials', 'na')} "
|
||||
f"requested_trials={search_stats.get('requested_trials', 'na')} "
|
||||
f"best_score={search_stats.get('best_score', float('nan')):.6g} "
|
||||
f"slices={cost.get('slices')} "
|
||||
f"slices={cost.get('nslices')} "
|
||||
f"log10_flops={cost.get('log10_flops', float('nan')):.3f} "
|
||||
f"log10_write={cost.get('log10_write', float('nan')):.3f} "
|
||||
f"log2_size={cost.get('log2_size', float('nan')):.3f} "
|
||||
@@ -337,6 +337,11 @@ def apply_case_defaults(args):
|
||||
def stop_dask_cluster(args):
|
||||
if args.keep_dask or args.tn_search_backend != "dask" or not args.dask_address:
|
||||
return
|
||||
if args.mpi:
|
||||
from mpi4py import MPI
|
||||
|
||||
if MPI.COMM_WORLD.Get_rank() != 0:
|
||||
return
|
||||
script = ROOT / "tools" / "manage_tn_dask_cluster.sh"
|
||||
if not script.exists():
|
||||
print(f"dask_stop_skipped reason=missing_script path={script}", flush=True)
|
||||
|
||||
Reference in New Issue
Block a user