From c13d3f704e2a80086772430d9dbc6f069ae2090e Mon Sep 17 00:00:00 2001
From: zhttjd
Date: Thu, 21 Aug 2025 17:25:52 +0800
Subject: [PATCH] Added cli config for DiffusionDrive, fix minor readme and script issues

---
 model_examples/DiffusionDrive/README.md   |  8 ++
 .../migrate_to_ascend/train_8p.sh         | 76 +++++++++++++++----
 model_examples/PanoOcc/README.md          |  2 +-
 .../PanoOcc/migrate_to_ascend/train_8p.sh |  3 +-
 4 files changed, 74 insertions(+), 15 deletions(-)

diff --git a/model_examples/DiffusionDrive/README.md b/model_examples/DiffusionDrive/README.md
index f9ca0390..5f5cafb6 100644
--- a/model_examples/DiffusionDrive/README.md
+++ b/model_examples/DiffusionDrive/README.md
@@ -17,6 +17,7 @@
 - [快速开始](#快速开始)
   - [训练模型](#训练模型)
   - [验证性能](#验证性能)
+  - [训练脚本支持的命令行参数](#训练脚本支持的命令行参数)
 - [训练结果](#训练结果)
 - [版本说明](#版本说明)
   - [变更](#变更)
@@ -215,6 +216,13 @@ bash migrate_to_ascend/train_8p.sh
 bash migrate_to_ascend/train_8p.sh --performance
 ```
 
+## 训练脚本支持的命令行参数
+`train_8p.sh`
+* `--performance`:添加该参数,训练脚本仅验机器性能;未添加时,正常长跑训练完整epochs数
+* `--num_npu=*`: 可调整训练使用的npu卡数,取值范围[1, 8],默认为8
+* `--batch_size=*`: 可调整每张卡的batch size,取值范围为>=1的整数,上限由显存占用决定,默认为6
+
+
 ## 训练结果
 
diff --git a/model_examples/DiffusionDrive/migrate_to_ascend/train_8p.sh b/model_examples/DiffusionDrive/migrate_to_ascend/train_8p.sh
index 8dfe2cac..27fb3cf5 100644
--- a/model_examples/DiffusionDrive/migrate_to_ascend/train_8p.sh
+++ b/model_examples/DiffusionDrive/migrate_to_ascend/train_8p.sh
@@ -8,35 +8,77 @@ if [ -d ${OUTPUT_PATH} ]; then
 fi
 mkdir -p ${OUTPUT_PATH}
 
-# Default路径
-CONFIG=projects/configs/diffusiondrive_configs/diffusiondrive_small_stage2.py
+
+##################################################################
+# 默认参数
+
+CONFIG_FILE=projects/configs/diffusiondrive_configs/diffusiondrive_small_stage2.py
 GLOBAL_BATCH_SIZE=48
-# 设置NPU卡数
-RANK_SIZE=8
+RANK_SIZE=8 # NPU卡数
+BATCH_SIZE=6
 
-# 传参:
-# 如果设置了--performance,仅验性能(仅需少量epochs,建议同步设置--epoch为小一点的数字),不验精度。
-# 不设置时默认为验精度模式,将运行config文件里设置的全量epochs,较为耗时
-# 如果设置了--config,则可以指定config文件路径覆盖default路径
 PERFORMANCE_MODE=0
-TEE_TO_STDOUT=0
+##################################################################
+
+# 获取传入的命令行参数
 for para in $*
 do
+  # 如果设置了--performance,仅验性能(仅需训练1k步),不验精度。
+  # 不设置时默认为精度模式,将训练全量epochs,较为耗时
   if [[ $para == --performance ]]; then
     PERFORMANCE_MODE=1
   fi
+  # 可指定其他的config文件路径
   if [[ $para == --config=* ]]; then
    CONFIG_FILE=`echo ${para#*=}`
   fi
+
+  # 可通过入参设置单卡的Batch Size(对应config文件里的samples_per_gpu)
+  if [[ $para == --batch_size=* ]]; then
+    BATCH_SIZE=`echo ${para#*=}`
+  fi
+
+  # 可通过入参修改单机场景下使用多少块NPU,即RANK_SIZE
+  if [[ $para == --num_npu=* ]]; then
+    RANK_SIZE=`echo ${para#*=}`
+  fi
 done
+GLOBAL_BATCH_SIZE=$(expr $RANK_SIZE \* $BATCH_SIZE)
 
-##################################################################
+# ##################################################################
+# 修改config文件更新参数
+
+# 备份config文件
+cp ${CONFIG_FILE} ${CONFIG_FILE}.bak
+
+# 更新config文件里的参数
+sed -i "s|total_batch_size[[:space:]]*=[[:space:]]*[0-9]\{1,\}|total_batch_size = ${GLOBAL_BATCH_SIZE}|g" ${CONFIG_FILE}
+sed -i "s|num_gpus[[:space:]]*=[[:space:]]*[0-9]\{1,\}|num_gpus = ${RANK_SIZE}|g" ${CONFIG_FILE}
+
+# 定义复原config文件的callback
+restore_config() {
+  if [ -f ${CONFIG_FILE}.bak ]; then
+    mv -f ${CONFIG_FILE}.bak ${CONFIG_FILE}
+  fi
+}
+
+# 设置信号捕获,如果训练
+# 正常退出(EXIT)
+# 用户中断(SIGINT)
+# Kill终止请求(SIGTERM)
+# 命令执行失败(ERR)
+# 可以自动还原对config文件的修改
+trap restore_config EXIT SIGINT SIGTERM ERR
+# ##################################################################
+
+
+# ##################################################################
 # 配置环境变量
 
 echo "[INFO] Start setting ENV VAR"
@@ -90,17 +132,22 @@ MPORT=${PORT:-28651}
 
 # 训练
 #################################################
+echo "[DiffusionDrive] Training..."
+echo "Path to realtime training logs: ${OUTPUT_PATH}"
+
 start_time=$(date +%s)
 echo "start_time=$(date -d @${start_time} "+%Y-%m-%d %H:%M:%S")"
 
 PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+# $(dirname "$0")指定此脚本的上级目录,因此训练调用的train.py文件应与此脚本位于同一个目录内
+
 # 验精度
 if [[ ${PERFORMANCE_MODE} == 0 ]]; then
     nohup python -m torch.distributed.run \
         --nproc_per_node=$RANK_SIZE \
         --master_port=$MPORT \
-        migrate_to_ascend/train.py $CONFIG \
+        $(dirname "$0")/train.py $CONFIG_FILE \
         --launcher pytorch \
         --deterministic > ${OUTPUT_PATH}/train_8p_full.log 2>&1 &
     wait
@@ -110,7 +157,7 @@ else
     nohup python -m torch.distributed.run \
         --nproc_per_node=$RANK_SIZE \
         --master_port=$MPORT \
-        migrate_to_ascend/train.py $CONFIG \
+        $(dirname "$0")/train.py $CONFIG_FILE \
         --launcher pytorch \
         --deterministic --performance > ${OUTPUT_PATH}/train_8p_performance.log 2>&1 &
     wait
@@ -143,4 +190,7 @@ echo " - Final Performance images/sec : ${avg_fps}"
 if [[ ${PERFORMANCE_MODE} == 0 ]]; then
     L2=`grep "val" ${log_file} | awk -F "L2: " '{print $2}' | awk 'END {print}'`
     echo " - L2 : ${L2}"
-fi
\ No newline at end of file
+fi
+
+
+
diff --git a/model_examples/PanoOcc/README.md b/model_examples/PanoOcc/README.md
index 01916e43..5ed36e6b 100644
--- a/model_examples/PanoOcc/README.md
+++ b/model_examples/PanoOcc/README.md
@@ -250,7 +250,7 @@ bash eval_8p.sh [CHECKPOINT_FILE] [OUTPUT_PATH] [NUM_NPUS] [NUM_NPUS]
 * `--epochs=*`: 可调整训练epochs数,取值范围为正整数,默认24个epochs
 * `--num_npu=*`: 可调整训练使用的npu卡数,取值范围[1, 8],默认为8
 * `--workers_per_npu=*`:可调整每张卡的数据加载子进程的数量,取值范围为>=0的整数,上限由共享内存等多方面因素决定,默认值为6
-* `--batch_size`: (当前版本暂不支持bs大于1,仅作为预埋参数,待后续更新)可调整每张卡的batch size,取值范围为>1的整数,上限由显存占用决定,默认为1
+* `--batch_size`: (当前版本暂不支持bs大于1,仅作为预埋参数,待后续更新)可调整每张卡的batch size,取值范围为>=1的整数,上限由显存占用决定,默认为1
 
 # 训练结果
 
diff --git a/model_examples/PanoOcc/migrate_to_ascend/train_8p.sh b/model_examples/PanoOcc/migrate_to_ascend/train_8p.sh
index f7bc340d..415c4a3b 100644
--- a/model_examples/PanoOcc/migrate_to_ascend/train_8p.sh
+++ b/model_examples/PanoOcc/migrate_to_ascend/train_8p.sh
@@ -32,7 +32,7 @@ do
   fi
 
   # 可通过入参修改单机场景下使用多少块NPU,即RANK_SIZE
-  if [[ $para == --num_npus=* ]]; then
+  if [[ $para == --num_npu=* ]]; then
     RANK_SIZE=`echo ${para#*=}`
   fi
 
@@ -122,6 +122,7 @@ start_time=$(date +%s)
 
 # 开始训练
 echo "[PanoOcc] Training..."
+echo "Path to realtime training logs: ${OUTPUT_PATH}"
 
 NNODES=${NNODES:-1}
 NODE_RANK=${NODE_RANK:-0}
-- 
Gitee