init commit

This commit is contained in:
yuchen-x
2025-09-12 21:53:41 +08:00
parent 275a568149
commit d7be60f9fe
105 changed files with 16119 additions and 1 deletions

32
scripts/train.sh Normal file
View File

@@ -0,0 +1,32 @@
# NCCL configuration
# export NCCL_DEBUG=INFO
# export NCCL_IB_DISABLE=0
# export NCCL_IB_GID_INDEX=3
# export NCCL_NET_GDR_LEVEL=3
# export CUDA_LAUNCH_BLOCKING=1
# export NCCL_TOPO_FILE=/tmp/topo.txt
# export MASTER_ADDR="master.ip."
# export MASTER_PORT=12366
# args
# Experiment name; also used as the sub-directory created under $save_root.
name="experiment_name"
# Config file passed to the trainer via --base.
config_file=configs/train/config.yaml
# save root dir for logs, checkpoints, tensorboard record, etc.
save_root="/path/to/savedir"
# Comma-separated GPU ids made visible to the job. The launcher process count
# and the trainer's device counts are derived from this single list so the
# values cannot drift out of sync when the GPU set is changed.
gpu_ids="0,1,2,3,4,5,6,7"
# Number of ids in the list = number of commas + 1.
num_gpus=$(( $(printf '%s' "$gpu_ids" | tr -cd ',' | wc -c) + 1 ))

# Stop here if the output directory cannot be created rather than starting a
# multi-process job whose output location is missing.
mkdir -p -- "$save_root/$name" || exit 1

## run
# Single-node launch (nnodes=1, node_rank=0): one process per visible GPU, so
# the total GPU count equals the per-node count.
CUDA_VISIBLE_DEVICES="$gpu_ids" python3 -m torch.distributed.launch \
  --nproc_per_node="$num_gpus" --nnodes=1 --master_addr=127.0.0.1 --master_port=12366 --node_rank=0 \
  ./scripts/trainer.py \
  --base "$config_file" \
  --train \
  --name "$name" \
  --logdir "$save_root" \
  --devices "$num_gpus" \
  --total_gpus="$num_gpus" \
  lightning.trainer.num_nodes=1