From 20c43037baa2ff9aeb88a4c4a771f62fa4407a3a Mon Sep 17 00:00:00 2001 From: zhangchenrui Date: Mon, 25 Aug 2025 10:43:39 +0800 Subject: [PATCH] update flashocc shell, add num_npu and batch_size parameter --- model_examples/FlashOCC/README.md | 14 ++++- ...rain_8p_flashocc_r50_fp16_backbone_full.sh | 63 +++++++++++++++++-- ...rain_8p_flashocc_r50_fp16_backbone_perf.sh | 63 +++++++++++++++++-- .../test/train_8p_flashocc_r50_full.sh | 63 +++++++++++++++++-- .../test/train_8p_flashocc_r50_perf.sh | 63 +++++++++++++++++-- 5 files changed, 244 insertions(+), 22 deletions(-) diff --git a/model_examples/FlashOCC/README.md b/model_examples/FlashOCC/README.md index cd6b089b..9df30939 100644 --- a/model_examples/FlashOCC/README.md +++ b/model_examples/FlashOCC/README.md @@ -168,28 +168,36 @@ FlashOCC是一种高效且轻量化的占用预测框架,专为自动驾驶系 #### 开始训练 - 在模型源码根目录下,运行训练脚本。 + + 运行脚本支持命令行参数: + - '--num-npu':NPU卡数,默认为8; + - '--batch-size': 每卡batch-size大小,默认为24; - 单机8卡性能训练 ``` - bash test/train_8p_flashocc_r50_perf.sh # 8卡性能 + bash test/train_8p_flashocc_r50_perf.sh + (option) bash test/train_8p_flashocc_r50_perf.sh --num-npu 8 --batch-size 24 # 8卡性能 ``` - 单机8卡精度训练 ``` - bash test/train_8p_flashocc_r50_full.sh # 8卡精度 + bash test/train_8p_flashocc_r50_full.sh + (option) bash test/train_8p_flashocc_r50_full.sh --num-npu 8 --batch-size 24 # 8卡精度 ``` - 单机8卡backbone FP16性能训练 ``` bash test/train_8p_flashocc_r50_fp16_backbone_perf.sh + (option) bash test/train_8p_flashocc_r50_fp16_backbone_perf.sh --num-npu 8 --batch-size 24 ``` - 单机8卡backbone FP16精度训练 ``` bash test/train_8p_flashocc_r50_fp16_backbone_full.sh + (option) bash test/train_8p_flashocc_r50_fp16_backbone_full.sh --num-npu 8 --batch-size 24 ``` #### 训练结果 @@ -214,6 +222,8 @@ FlashOCC是一种高效且轻量化的占用预测框架,专为自动驾驶系 2025.8.20:增大num worker,更新fp16性能。 +2025.8.25:优化训练脚本,增加入参。 + # FAQ ## 训练时报错`ImportError: cannot import name 'gcd' from 'fraction'` diff --git a/model_examples/FlashOCC/test/train_8p_flashocc_r50_fp16_backbone_full.sh b/model_examples/FlashOCC/test/train_8p_flashocc_r50_fp16_backbone_full.sh index 7f38ebf5..dab17019 100644 --- a/model_examples/FlashOCC/test/train_8p_flashocc_r50_fp16_backbone_full.sh +++ b/model_examples/FlashOCC/test/train_8p_flashocc_r50_fp16_backbone_full.sh @@ -4,12 +4,47 @@ NETWORK="FlashOCC_R50" DEVICE_TYPE=$(uname -m) -WORLD_SIZE=8 +NUM_NPU=8 BATCH_SIZE=24 TOTAL_EPOCHS=24 +while [[ $# -gt 0 ]]; do + case $1 in + --NUM-NPU|--num-npu) + NUM_NPU="$2" + shift 2 + ;; + --BATCH-SIZE|--batch-size) + BATCH_SIZE="$2" + shift 2 + ;; + --TOTAL-EPOCHS|--total-epochs) + TOTAL_EPOCHS="$2" + shift 2 + ;; + --help) + echo "用法: $0 [选项]" + echo "选项:" + echo " --NUM-NPU 设置 NUM-NPU (默认: 8)" + echo " --BATCH-SIZE 设置 BATCH_SIZE (默认: 24)" + echo " --TOTAL-EPOCHS 设置 TOTAL_EPOCHS (默认: 24)" + echo " --help 显示帮助信息" + exit 0 + ;; + *) + echo "未知选项: $1" + echo "使用 --help 查看帮助" + exit 1 + ;; + esac +done + +echo "NUM_NPU: $NUM_NPU" +echo "BATCH_SIZE: $BATCH_SIZE" +echo "TOTAL_EPOCHS: $TOTAL_EPOCHS" + # 训练用例名称 -CASE_NAME=${NETWORK}_${WORLD_SIZE}p_bs${BATCH_SIZE}_e${TOTAL_EPOCHS}_full +CASE_NAME=${NETWORK}_${NUM_NPU}p_bs${BATCH_SIZE}_e${TOTAL_EPOCHS}_full echo "[FlashOCC] CASE_NAME = ${CASE_NAME}" # 创建输出目录 @@ -65,12 +100,30 @@ sed -i 's/^\(\s*\)is_cuda\s*=\s*True/\1is_cuda = False/' projects/mmdet3d_plugin # 每个step打印时间 sed -i 's/interval=1,/interval=50,/g' mmdetection3d/configs/_base_/default_runtime.py +cfg_file="projects/configs/flashocc/flashocc-r50.py" + +# 备份config文件 +cp ${cfg_file} ${cfg_file}.bak + +# 修改batchsize +sed -i "s/samples_per_gpu=*[0-9]\{1,\},/samples_per_gpu=$BATCH_SIZE,/g" ${cfg_file} + +#复原callback +restore_config() { + if [ -f ${cfg_file}.bak ]; then + mv -f ${cfg_file}.bak ${cfg_file} + fi +} + +#异常复原 +trap restore_config EXIT SIGINT SIGTERM ERR + # 训练开始时间 start_time=$(date +%s) # 开始训练 echo "[FlashOCC] Training..." -bash ./tools/dist_train_fp16_backbone.sh ./projects/configs/flashocc/flashocc-r50.py ${WORLD_SIZE} --work-dir ${OUTPUT_PATH}/work_dir > ${OUTPUT_PATH}/train.log 2>&1 & +bash ./tools/dist_train_fp16_backbone.sh ./projects/configs/flashocc/flashocc-r50.py ${NUM_NPU} --work-dir ${OUTPUT_PATH}/work_dir > ${OUTPUT_PATH}/train.log 2>&1 & wait # 训练结束时间 @@ -87,7 +140,7 @@ echo "[FlashOCC] E2E Training Time (sec) : ${e2e_time}" if [[ ${TOTAL_EPOCHS} == 24 ]]; then # 验证精度 echo "[FlashOCC] Evaluating ..." - bash ./tools/dist_test.sh ./projects/configs/flashocc/flashocc-r50.py ${OUTPUT_PATH}/work_dir/epoch_24_ema.pth ${WORLD_SIZE} --eval mAP > ${OUTPUT_PATH}/eval_result.log 2>&1 & + bash ./tools/dist_test.sh ./projects/configs/flashocc/flashocc-r50.py ${OUTPUT_PATH}/work_dir/epoch_24_ema.pth ${NUM_NPU} --eval mAP > ${OUTPUT_PATH}/eval_result.log 2>&1 & wait mIoU=$(grep -o "mIoU of 6019 samples: [0-9.]*" ${OUTPUT_PATH}/eval_result.log | awk 'END {print $NF}') echo "[FlashOCC] mIoU : ${mIoU}" @@ -96,7 +149,7 @@ fi # 将关键信息打印到 ${CASE_NAME}.log 中 echo "Network = ${NETWORK}" > ${OUTPUT_PATH}/${CASE_NAME}.log echo "DeviceType = ${DEVICE_TYPE}" >> ${OUTPUT_PATH}/${CASE_NAME}.log -echo "RankSize = ${WORLD_SIZE}" >> ${OUTPUT_PATH}/${CASE_NAME}.log +echo "RankSize = ${NUM_NPU}" >> ${OUTPUT_PATH}/${CASE_NAME}.log echo "BatchSize = ${BATCH_SIZE}" >> ${OUTPUT_PATH}/${CASE_NAME}.log echo "CaseName = ${CASE_NAME}" >> ${OUTPUT_PATH}/${CASE_NAME}.log echo "E2ETrainingTime = ${e2e_time}" >> ${OUTPUT_PATH}/${CASE_NAME}.log diff --git a/model_examples/FlashOCC/test/train_8p_flashocc_r50_fp16_backbone_perf.sh b/model_examples/FlashOCC/test/train_8p_flashocc_r50_fp16_backbone_perf.sh index 4d3c2524..2b1f8eb4 100644 --- a/model_examples/FlashOCC/test/train_8p_flashocc_r50_fp16_backbone_perf.sh +++ b/model_examples/FlashOCC/test/train_8p_flashocc_r50_fp16_backbone_perf.sh @@ -4,12 +4,47 @@ NETWORK="FlashOCC_R50" DEVICE_TYPE=$(uname -m) -WORLD_SIZE=8 +NUM_NPU=8 BATCH_SIZE=24 TOTAL_EPOCHS=1 +while [[ $# -gt 0 ]]; do + case $1 in + --NUM-NPU|--num-npu) + NUM_NPU="$2" + shift 2 + ;; + --BATCH-SIZE|--batch-size) + BATCH_SIZE="$2" + shift 2 + ;; + --TOTAL-EPOCHS|--total-epochs) + TOTAL_EPOCHS="$2" + shift 2 + ;; + --help) + echo "用法: $0 [选项]" + echo "选项:" + echo " --NUM-NPU 设置 NUM-NPU (默认: 8)" + echo " --BATCH-SIZE 设置 BATCH_SIZE (默认: 24)" + echo " --TOTAL-EPOCHS 设置 TOTAL_EPOCHS (默认: 1)" + echo " --help 显示帮助信息" + exit 0 + ;; + *) + echo "未知选项: $1" + echo "使用 --help 查看帮助" + exit 1 + ;; + esac +done + +echo "NUM_NPU: $NUM_NPU" +echo "BATCH_SIZE: $BATCH_SIZE" +echo "TOTAL_EPOCHS: $TOTAL_EPOCHS" + # 训练用例名称 -CASE_NAME=${NETWORK}_${WORLD_SIZE}p_bs${BATCH_SIZE}_e${TOTAL_EPOCHS}_perf +CASE_NAME=${NETWORK}_${NUM_NPU}p_bs${BATCH_SIZE}_e${TOTAL_EPOCHS}_perf echo "[FlashOCC] CASE_NAME = ${CASE_NAME}" # 创建输出目录 @@ -65,12 +100,30 @@ sed -i 's/^\(\s*\)is_cuda\s*=\s*True/\1is_cuda = False/' projects/mmdet3d_plugin # 每个step打印时间 sed -i 's/interval=50,/interval=1,/g' mmdetection3d/configs/_base_/default_runtime.py +cfg_file="projects/configs/flashocc/flashocc-r50-perf.py" + +# 备份config文件 +cp ${cfg_file} ${cfg_file}.bak + +# 修改batchsize +sed -i "s/samples_per_gpu=*[0-9]\{1,\},/samples_per_gpu=$BATCH_SIZE,/g" ${cfg_file} + +#复原callback +restore_config() { + if [ -f ${cfg_file}.bak ]; then + mv -f ${cfg_file}.bak ${cfg_file} + fi +} + +#异常复原 +trap restore_config EXIT SIGINT SIGTERM ERR + # 训练开始时间 start_time=$(date +%s) # 开始训练 echo "[FlashOCC] Training..." -bash ./tools/dist_train_fp16_backbone.sh ./projects/configs/flashocc/flashocc-r50-perf.py ${WORLD_SIZE} --work-dir ${OUTPUT_PATH}/work_dir > ${OUTPUT_PATH}/train.log 2>&1 & +bash ./tools/dist_train_fp16_backbone.sh ./projects/configs/flashocc/flashocc-r50-perf.py ${NUM_NPU} --work-dir ${OUTPUT_PATH}/work_dir > ${OUTPUT_PATH}/train.log 2>&1 & wait # 训练结束时间 @@ -84,7 +137,7 @@ e2e_time=$(($end_time - $start_time)) echo "[FlashOCC] E2E Training Time (sec) : ${e2e_time}" avg_time=`grep -a 'mmdet - INFO - Epoch ' ${OUTPUT_PATH}/train.log |awk -F "time: " '{print $2}' | awk -F ", " '{print $1}' | awk 'NR>10 {sum+=$1; count++} END {if (count != 0) printf("%.3f",sum/count)}'` -fps_value=$(awk BEGIN'{print ('$BATCH_SIZE' * '$WORLD_SIZE')/'$avg_time'}') +fps_value=$(awk BEGIN'{print ('$BATCH_SIZE' * '$NUM_NPU')/'$avg_time'}') # 吞吐量 echo "[FlashOCC] Final Performance images/sec : ${fps_value}" @@ -92,7 +145,7 @@ echo "[FlashOCC] Final Performance images/sec : ${fps_value}" # 将关键信息打印到 ${CASE_NAME}.log 中 echo "Network = ${NETWORK}" > ${OUTPUT_PATH}/${CASE_NAME}.log echo "DeviceType = ${DEVICE_TYPE}" >> ${OUTPUT_PATH}/${CASE_NAME}.log -echo "RankSize = ${WORLD_SIZE}" >> ${OUTPUT_PATH}/${CASE_NAME}.log +echo "RankSize = ${NUM_NPU}" >> ${OUTPUT_PATH}/${CASE_NAME}.log echo "BatchSize = ${BATCH_SIZE}" >> ${OUTPUT_PATH}/${CASE_NAME}.log echo "CaseName = ${CASE_NAME}" >> ${OUTPUT_PATH}/${CASE_NAME}.log echo "E2ETrainingTime = ${e2e_time}" >> ${OUTPUT_PATH}/${CASE_NAME}.log diff --git a/model_examples/FlashOCC/test/train_8p_flashocc_r50_full.sh b/model_examples/FlashOCC/test/train_8p_flashocc_r50_full.sh index 9272a8c6..072468ca 100644 --- a/model_examples/FlashOCC/test/train_8p_flashocc_r50_full.sh +++ b/model_examples/FlashOCC/test/train_8p_flashocc_r50_full.sh @@ -4,12 +4,47 @@ NETWORK="FlashOCC_R50" DEVICE_TYPE=$(uname -m) -WORLD_SIZE=8 +NUM_NPU=8 BATCH_SIZE=24 TOTAL_EPOCHS=24 +while [[ $# -gt 0 ]]; do + case $1 in + --NUM-NPU|--num-npu) + NUM_NPU="$2" + shift 2 + ;; + --BATCH-SIZE|--batch-size) + BATCH_SIZE="$2" + shift 2 + ;; + --TOTAL-EPOCHS|--total-epochs) + TOTAL_EPOCHS="$2" + shift 2 + ;; + --help) + echo "用法: $0 [选项]" + echo "选项:" + echo " --NUM-NPU 设置 NUM-NPU (默认: 8)" + echo " --BATCH-SIZE 设置 BATCH_SIZE (默认: 24)" + echo " --TOTAL-EPOCHS 设置 TOTAL_EPOCHS (默认: 24)" + echo " --help 显示帮助信息" + exit 0 + ;; + *) + echo "未知选项: $1" + echo "使用 --help 查看帮助" + exit 1 + ;; + esac +done + +echo "NUM_NPU: $NUM_NPU" +echo "BATCH_SIZE: $BATCH_SIZE" +echo "TOTAL_EPOCHS: $TOTAL_EPOCHS" + # 训练用例名称 -CASE_NAME=${NETWORK}_${WORLD_SIZE}p_bs${BATCH_SIZE}_e${TOTAL_EPOCHS}_full +CASE_NAME=${NETWORK}_${NUM_NPU}p_bs${BATCH_SIZE}_e${TOTAL_EPOCHS}_full echo "[FlashOCC] CASE_NAME = ${CASE_NAME}" # 创建输出目录 @@ -65,12 +100,30 @@ sed -i 's/^\(\s*\)is_cuda\s*=\s*True/\1is_cuda = False/' projects/mmdet3d_plugin # 每个step打印时间 sed -i 's/interval=1,/interval=50,/g' mmdetection3d/configs/_base_/default_runtime.py +cfg_file="projects/configs/flashocc/flashocc-r50.py" + +# 备份config文件 +cp ${cfg_file} ${cfg_file}.bak + +# 修改batchsize +sed -i "s/samples_per_gpu=*[0-9]\{1,\},/samples_per_gpu=$BATCH_SIZE,/g" ${cfg_file} + +#复原callback +restore_config() { + if [ -f ${cfg_file}.bak ]; then + mv -f ${cfg_file}.bak ${cfg_file} + fi +} + +#异常复原 +trap restore_config EXIT SIGINT SIGTERM ERR + # 训练开始时间 start_time=$(date +%s) # 开始训练 echo "[FlashOCC] Training..." -bash ./tools/dist_train.sh ./projects/configs/flashocc/flashocc-r50.py ${WORLD_SIZE} --work-dir ${OUTPUT_PATH}/work_dir > ${OUTPUT_PATH}/train.log 2>&1 & +bash ./tools/dist_train.sh ./projects/configs/flashocc/flashocc-r50.py ${NUM_NPU} --work-dir ${OUTPUT_PATH}/work_dir > ${OUTPUT_PATH}/train.log 2>&1 & wait # 训练结束时间 @@ -87,7 +140,7 @@ echo "[FlashOCC] E2E Training Time (sec) : ${e2e_time}" if [[ ${TOTAL_EPOCHS} == 24 ]]; then # 验证精度 echo "[FlashOCC] Evaluating ..." - bash ./tools/dist_test.sh ./projects/configs/flashocc/flashocc-r50.py ${OUTPUT_PATH}/work_dir/epoch_24.pth ${WORLD_SIZE} --eval mAP > ${OUTPUT_PATH}/eval_result.log 2>&1 & + bash ./tools/dist_test.sh ./projects/configs/flashocc/flashocc-r50.py ${OUTPUT_PATH}/work_dir/epoch_24.pth ${NUM_NPU} --eval mAP > ${OUTPUT_PATH}/eval_result.log 2>&1 & wait mIoU=$(grep -o "mIoU of 6019 samples: [0-9.]*" ${OUTPUT_PATH}/eval_result.log | awk 'END {print $NF}') echo "[FlashOCC] mIoU : ${mIoU}" @@ -96,7 +149,7 @@ fi # 将关键信息打印到 ${CASE_NAME}.log 中 echo "Network = ${NETWORK}" > ${OUTPUT_PATH}/${CASE_NAME}.log echo "DeviceType = ${DEVICE_TYPE}" >> ${OUTPUT_PATH}/${CASE_NAME}.log -echo "RankSize = ${WORLD_SIZE}" >> ${OUTPUT_PATH}/${CASE_NAME}.log +echo "RankSize = ${NUM_NPU}" >> ${OUTPUT_PATH}/${CASE_NAME}.log echo "BatchSize = ${BATCH_SIZE}" >> ${OUTPUT_PATH}/${CASE_NAME}.log echo "CaseName = ${CASE_NAME}" >> ${OUTPUT_PATH}/${CASE_NAME}.log echo "E2ETrainingTime = ${e2e_time}" >> ${OUTPUT_PATH}/${CASE_NAME}.log diff --git a/model_examples/FlashOCC/test/train_8p_flashocc_r50_perf.sh b/model_examples/FlashOCC/test/train_8p_flashocc_r50_perf.sh index 1e298a13..f8c9b852 100644 --- a/model_examples/FlashOCC/test/train_8p_flashocc_r50_perf.sh +++ b/model_examples/FlashOCC/test/train_8p_flashocc_r50_perf.sh @@ -4,12 +4,47 @@ NETWORK="FlashOCC_R50" DEVICE_TYPE=$(uname -m) -WORLD_SIZE=8 +NUM_NPU=8 BATCH_SIZE=24 TOTAL_EPOCHS=1 +while [[ $# -gt 0 ]]; do + case $1 in + --NUM-NPU|--num-npu) + NUM_NPU="$2" + shift 2 + ;; + --BATCH-SIZE|--batch-size) + BATCH_SIZE="$2" + shift 2 + ;; + --TOTAL-EPOCHS|--total-epochs) + TOTAL_EPOCHS="$2" + shift 2 + ;; + --help) + echo "用法: $0 [选项]" + echo "选项:" + echo " --NUM-NPU 设置 NUM-NPU (默认: 8)" + echo " --BATCH-SIZE 设置 BATCH_SIZE (默认: 24)" + echo " --TOTAL-EPOCHS 设置 TOTAL_EPOCHS (默认: 1)" + echo " --help 显示帮助信息" + exit 0 + ;; + *) + echo "未知选项: $1" + echo "使用 --help 查看帮助" + exit 1 + ;; + esac +done + +echo "NUM_NPU: $NUM_NPU" +echo "BATCH_SIZE: $BATCH_SIZE" +echo "TOTAL_EPOCHS: $TOTAL_EPOCHS" + # 训练用例名称 -CASE_NAME=${NETWORK}_${WORLD_SIZE}p_bs${BATCH_SIZE}_e${TOTAL_EPOCHS}_perf +CASE_NAME=${NETWORK}_${NUM_NPU}p_bs${BATCH_SIZE}_e${TOTAL_EPOCHS}_perf echo "[FlashOCC] CASE_NAME = ${CASE_NAME}" # 创建输出目录 @@ -65,12 +100,30 @@ sed -i 's/^\(\s*\)is_cuda\s*=\s*True/\1is_cuda = False/' projects/mmdet3d_plugin # 每个step打印时间 sed -i 's/interval=50,/interval=1,/g' mmdetection3d/configs/_base_/default_runtime.py +cfg_file="projects/configs/flashocc/flashocc-r50-perf.py" + +# 备份config文件 +cp ${cfg_file} ${cfg_file}.bak + +# 修改batchsize +sed -i "s/samples_per_gpu=*[0-9]\{1,\},/samples_per_gpu=$BATCH_SIZE,/g" ${cfg_file} + +#复原callback +restore_config() { + if [ -f ${cfg_file}.bak ]; then + mv -f ${cfg_file}.bak ${cfg_file} + fi +} + +#异常复原 +trap restore_config EXIT SIGINT SIGTERM ERR + # 训练开始时间 start_time=$(date +%s) # 开始训练 echo "[FlashOCC] Training..." -bash ./tools/dist_train.sh ./projects/configs/flashocc/flashocc-r50-perf.py ${WORLD_SIZE} --work-dir ${OUTPUT_PATH}/work_dir > ${OUTPUT_PATH}/train.log 2>&1 & +bash ./tools/dist_train.sh ./projects/configs/flashocc/flashocc-r50-perf.py ${NUM_NPU} --work-dir ${OUTPUT_PATH}/work_dir > ${OUTPUT_PATH}/train.log 2>&1 & wait # 训练结束时间 @@ -84,7 +137,7 @@ e2e_time=$(($end_time - $start_time)) echo "[FlashOCC] E2E Training Time (sec) : ${e2e_time}" avg_time=`grep -a 'mmdet - INFO - Epoch ' ${OUTPUT_PATH}/train.log |awk -F "time: " '{print $2}' | awk -F ", " '{print $1}' | awk 'NR>10 {sum+=$1; count++} END {if (count != 0) printf("%.3f",sum/count)}'` -fps_value=$(awk BEGIN'{print ('$BATCH_SIZE' * '$WORLD_SIZE')/'$avg_time'}') +fps_value=$(awk BEGIN'{print ('$BATCH_SIZE' * '$NUM_NPU')/'$avg_time'}') # 吞吐量 echo "[FlashOCC] Final Performance images/sec : ${fps_value}" @@ -92,7 +145,7 @@ echo "[FlashOCC] Final Performance images/sec : ${fps_value}" # 将关键信息打印到 ${CASE_NAME}.log 中 echo "Network = ${NETWORK}" > ${OUTPUT_PATH}/${CASE_NAME}.log echo "DeviceType = ${DEVICE_TYPE}" >> ${OUTPUT_PATH}/${CASE_NAME}.log -echo "RankSize = ${WORLD_SIZE}" >> ${OUTPUT_PATH}/${CASE_NAME}.log +echo "RankSize = ${NUM_NPU}" >> ${OUTPUT_PATH}/${CASE_NAME}.log echo "BatchSize = ${BATCH_SIZE}" >> ${OUTPUT_PATH}/${CASE_NAME}.log echo "CaseName = ${CASE_NAME}" >> ${OUTPUT_PATH}/${CASE_NAME}.log echo "E2ETrainingTime = ${e2e_time}" >> ${OUTPUT_PATH}/${CASE_NAME}.log -- Gitee