diff --git a/model_examples/FlashOCC/README.md b/model_examples/FlashOCC/README.md index cd6b089bde6281d25bdfd2a60591ca36f582d3b4..9df30939d57b77c44f3970e2105f602104152ebf 100644 --- a/model_examples/FlashOCC/README.md +++ b/model_examples/FlashOCC/README.md @@ -168,28 +168,36 @@ FlashOCC是一种高效且轻量化的占用预测框架,专为自动驾驶系 #### 开始训练 - 在模型源码根目录下,运行训练脚本。 + + 运行脚本支持命令行参数: + - '--num-npu':NPU卡数,默认为8; + - '--batch-size': 每卡batch-size大小,默认为24; - 单机8卡性能训练 ``` - bash test/train_8p_flashocc_r50_perf.sh # 8卡性能 + bash test/train_8p_flashocc_r50_perf.sh + (option) bash test/train_8p_flashocc_r50_perf.sh --num-npu 8 --batch-size 24 # 8卡性能 ``` - 单机8卡精度训练 ``` - bash test/train_8p_flashocc_r50_full.sh # 8卡精度 + bash test/train_8p_flashocc_r50_full.sh + (option) bash test/train_8p_flashocc_r50_full.sh --num-npu 8 --batch-size 24 # 8卡精度 ``` - 单机8卡backbone FP16性能训练 ``` bash test/train_8p_flashocc_r50_fp16_backbone_perf.sh + (option) bash test/train_8p_flashocc_r50_fp16_backbone_perf.sh --num-npu 8 --batch-size 24 ``` - 单机8卡backbone FP16精度训练 ``` bash test/train_8p_flashocc_r50_fp16_backbone_full.sh + (option) bash test/train_8p_flashocc_r50_fp16_backbone_full.sh --num-npu 8 --batch-size 24 ``` #### 训练结果 @@ -214,6 +222,8 @@ FlashOCC是一种高效且轻量化的占用预测框架,专为自动驾驶系 2025.8.20:增大num worker,更新fp16性能。 +2025.8.25:优化训练脚本,增加入参。 + # FAQ ## 训练时报错`ImportError: cannot import name 'gcd' from 'fraction'` diff --git a/model_examples/FlashOCC/test/train_8p_flashocc_r50_fp16_backbone_full.sh b/model_examples/FlashOCC/test/train_8p_flashocc_r50_fp16_backbone_full.sh index 7f38ebf56ba2850e7831ab8944492062dba1a8f5..dab1701905df2457894db4f8bd4094b3cbf4946b 100644 --- a/model_examples/FlashOCC/test/train_8p_flashocc_r50_fp16_backbone_full.sh +++ b/model_examples/FlashOCC/test/train_8p_flashocc_r50_fp16_backbone_full.sh @@ -4,12 +4,47 @@ NETWORK="FlashOCC_R50" DEVICE_TYPE=$(uname -m) -WORLD_SIZE=8 +NUM_NPU=8 BATCH_SIZE=24 TOTAL_EPOCHS=24 +while [[ $# -gt 0 ]]; do + case $1 in + --NUM-NPU|--num-npu) + NUM_NPU="$2" + shift 2 + ;; + --BATCH-SIZE|--batch-size) + BATCH_SIZE="$2" + shift 2 + ;; + --TOTAL-EPOCHS|--total-epochs) + TOTAL_EPOCHS="$2" + shift 2 + ;; + --help) + echo "用法: $0 [选项]" + echo "选项:" + echo " --NUM-NPU 设置 NUM-NPU (默认: 8)" + echo " --BATCH-SIZE 设置 BATCH_SIZE (默认: 24)" + echo " --TOTAL-EPOCHS 设置 TOTAL_EPOCHS (默认: 24)" + echo " --help 显示帮助信息" + exit 0 + ;; + *) + echo "未知选项: $1" + echo "使用 --help 查看帮助" + exit 1 + ;; + esac +done + +echo "NUM_NPU: $NUM_NPU" +echo "BATCH_SIZE: $BATCH_SIZE" +echo "TOTAL_EPOCHS: $TOTAL_EPOCHS" + # 训练用例名称 -CASE_NAME=${NETWORK}_${WORLD_SIZE}p_bs${BATCH_SIZE}_e${TOTAL_EPOCHS}_full +CASE_NAME=${NETWORK}_${NUM_NPU}p_bs${BATCH_SIZE}_e${TOTAL_EPOCHS}_full echo "[FlashOCC] CASE_NAME = ${CASE_NAME}" # 创建输出目录 @@ -65,12 +100,30 @@ sed -i 's/^\(\s*\)is_cuda\s*=\s*True/\1is_cuda = False/' projects/mmdet3d_plugin # 每个step打印时间 sed -i 's/interval=1,/interval=50,/g' mmdetection3d/configs/_base_/default_runtime.py +cfg_file="projects/configs/flashocc/flashocc-r50.py" + +# 备份config文件 +cp ${cfg_file} ${cfg_file}.bak + +# 修改batchsize +sed -i "s/samples_per_gpu=*[0-9]\{1,\},/samples_per_gpu=$BATCH_SIZE,/g" ${cfg_file} + +#复原callback +restore_config() { + if [ -f ${cfg_file}.bak ]; then + mv -f ${cfg_file}.bak ${cfg_file} + fi +} + +#异常复原 +trap restore_config EXIT SIGINT SIGTERM ERR + # 训练开始时间 start_time=$(date +%s) # 开始训练 echo "[FlashOCC] Training..." -bash ./tools/dist_train_fp16_backbone.sh ./projects/configs/flashocc/flashocc-r50.py ${WORLD_SIZE} --work-dir ${OUTPUT_PATH}/work_dir > ${OUTPUT_PATH}/train.log 2>&1 & +bash ./tools/dist_train_fp16_backbone.sh ./projects/configs/flashocc/flashocc-r50.py ${NUM_NPU} --work-dir ${OUTPUT_PATH}/work_dir > ${OUTPUT_PATH}/train.log 2>&1 & wait # 训练结束时间 @@ -87,7 +140,7 @@ echo "[FlashOCC] E2E Training Time (sec) : ${e2e_time}" if [[ ${TOTAL_EPOCHS} == 24 ]]; then # 验证精度 echo "[FlashOCC] Evaluating ..." - bash ./tools/dist_test.sh ./projects/configs/flashocc/flashocc-r50.py ${OUTPUT_PATH}/work_dir/epoch_24_ema.pth ${WORLD_SIZE} --eval mAP > ${OUTPUT_PATH}/eval_result.log 2>&1 & + bash ./tools/dist_test.sh ./projects/configs/flashocc/flashocc-r50.py ${OUTPUT_PATH}/work_dir/epoch_24_ema.pth ${NUM_NPU} --eval mAP > ${OUTPUT_PATH}/eval_result.log 2>&1 & wait mIoU=$(grep -o "mIoU of 6019 samples: [0-9.]*" ${OUTPUT_PATH}/eval_result.log | awk 'END {print $NF}') echo "[FlashOCC] mIoU : ${mIoU}" @@ -96,7 +149,7 @@ fi # 将关键信息打印到 ${CASE_NAME}.log 中 echo "Network = ${NETWORK}" > ${OUTPUT_PATH}/${CASE_NAME}.log echo "DeviceType = ${DEVICE_TYPE}" >> ${OUTPUT_PATH}/${CASE_NAME}.log -echo "RankSize = ${WORLD_SIZE}" >> ${OUTPUT_PATH}/${CASE_NAME}.log +echo "RankSize = ${NUM_NPU}" >> ${OUTPUT_PATH}/${CASE_NAME}.log echo "BatchSize = ${BATCH_SIZE}" >> ${OUTPUT_PATH}/${CASE_NAME}.log echo "CaseName = ${CASE_NAME}" >> ${OUTPUT_PATH}/${CASE_NAME}.log echo "E2ETrainingTime = ${e2e_time}" >> ${OUTPUT_PATH}/${CASE_NAME}.log diff --git a/model_examples/FlashOCC/test/train_8p_flashocc_r50_fp16_backbone_perf.sh b/model_examples/FlashOCC/test/train_8p_flashocc_r50_fp16_backbone_perf.sh index 4d3c252455128ad079f3458856d562acdbbc1f5c..2b1f8eb4fac8153e83fd282c42985b3b3cd4d5ef 100644 --- a/model_examples/FlashOCC/test/train_8p_flashocc_r50_fp16_backbone_perf.sh +++ b/model_examples/FlashOCC/test/train_8p_flashocc_r50_fp16_backbone_perf.sh @@ -4,12 +4,47 @@ NETWORK="FlashOCC_R50" DEVICE_TYPE=$(uname -m) -WORLD_SIZE=8 +NUM_NPU=8 BATCH_SIZE=24 TOTAL_EPOCHS=1 +while [[ $# -gt 0 ]]; do + case $1 in + --NUM-NPU|--num-npu) + NUM_NPU="$2" + shift 2 + ;; + --BATCH-SIZE|--batch-size) + BATCH_SIZE="$2" + shift 2 + ;; + --TOTAL-EPOCHS|--total-epochs) + TOTAL_EPOCHS="$2" + shift 2 + ;; + --help) + echo "用法: $0 [选项]" + echo "选项:" + echo " --NUM-NPU 设置 NUM-NPU (默认: 8)" + echo " --BATCH-SIZE 设置 BATCH_SIZE (默认: 24)" + echo " --TOTAL-EPOCHS 设置 TOTAL_EPOCHS (默认: 1)" + echo " --help 显示帮助信息" + exit 0 + ;; + *) + echo "未知选项: $1" + echo "使用 --help 查看帮助" + exit 1 + ;; + esac +done + +echo "NUM_NPU: $NUM_NPU" +echo "BATCH_SIZE: $BATCH_SIZE" +echo "TOTAL_EPOCHS: $TOTAL_EPOCHS" + # 训练用例名称 -CASE_NAME=${NETWORK}_${WORLD_SIZE}p_bs${BATCH_SIZE}_e${TOTAL_EPOCHS}_perf +CASE_NAME=${NETWORK}_${NUM_NPU}p_bs${BATCH_SIZE}_e${TOTAL_EPOCHS}_perf echo "[FlashOCC] CASE_NAME = ${CASE_NAME}" # 创建输出目录 @@ -65,12 +100,30 @@ sed -i 's/^\(\s*\)is_cuda\s*=\s*True/\1is_cuda = False/' projects/mmdet3d_plugin # 每个step打印时间 sed -i 's/interval=50,/interval=1,/g' mmdetection3d/configs/_base_/default_runtime.py +cfg_file="projects/configs/flashocc/flashocc-r50-perf.py" + +# 备份config文件 +cp ${cfg_file} ${cfg_file}.bak + +# 修改batchsize +sed -i "s/samples_per_gpu=*[0-9]\{1,\},/samples_per_gpu=$BATCH_SIZE,/g" ${cfg_file} + +#复原callback +restore_config() { + if [ -f ${cfg_file}.bak ]; then + mv -f ${cfg_file}.bak ${cfg_file} + fi +} + +#异常复原 +trap restore_config EXIT SIGINT SIGTERM ERR + # 训练开始时间 start_time=$(date +%s) # 开始训练 echo "[FlashOCC] Training..." -bash ./tools/dist_train_fp16_backbone.sh ./projects/configs/flashocc/flashocc-r50-perf.py ${WORLD_SIZE} --work-dir ${OUTPUT_PATH}/work_dir > ${OUTPUT_PATH}/train.log 2>&1 & +bash ./tools/dist_train_fp16_backbone.sh ./projects/configs/flashocc/flashocc-r50-perf.py ${NUM_NPU} --work-dir ${OUTPUT_PATH}/work_dir > ${OUTPUT_PATH}/train.log 2>&1 & wait # 训练结束时间 @@ -84,7 +137,7 @@ e2e_time=$(($end_time - $start_time)) echo "[FlashOCC] E2E Training Time (sec) : ${e2e_time}" avg_time=`grep -a 'mmdet - INFO - Epoch ' ${OUTPUT_PATH}/train.log |awk -F "time: " '{print $2}' | awk -F ", " '{print $1}' | awk 'NR>10 {sum+=$1; count++} END {if (count != 0) printf("%.3f",sum/count)}'` -fps_value=$(awk BEGIN'{print ('$BATCH_SIZE' * '$WORLD_SIZE')/'$avg_time'}') +fps_value=$(awk BEGIN'{print ('$BATCH_SIZE' * '$NUM_NPU')/'$avg_time'}') # 吞吐量 echo "[FlashOCC] Final Performance images/sec : ${fps_value}" @@ -92,7 +145,7 @@ echo "[FlashOCC] Final Performance images/sec : ${fps_value}" # 将关键信息打印到 ${CASE_NAME}.log 中 echo "Network = ${NETWORK}" > ${OUTPUT_PATH}/${CASE_NAME}.log echo "DeviceType = ${DEVICE_TYPE}" >> ${OUTPUT_PATH}/${CASE_NAME}.log -echo "RankSize = ${WORLD_SIZE}" >> ${OUTPUT_PATH}/${CASE_NAME}.log +echo "RankSize = ${NUM_NPU}" >> ${OUTPUT_PATH}/${CASE_NAME}.log echo "BatchSize = ${BATCH_SIZE}" >> ${OUTPUT_PATH}/${CASE_NAME}.log echo "CaseName = ${CASE_NAME}" >> ${OUTPUT_PATH}/${CASE_NAME}.log echo "E2ETrainingTime = ${e2e_time}" >> ${OUTPUT_PATH}/${CASE_NAME}.log diff --git a/model_examples/FlashOCC/test/train_8p_flashocc_r50_full.sh b/model_examples/FlashOCC/test/train_8p_flashocc_r50_full.sh index 9272a8c63701612a61af7c3b2a961f2789146f40..072468caaa5a2ee2f636341bace858527619e172 100644 --- a/model_examples/FlashOCC/test/train_8p_flashocc_r50_full.sh +++ b/model_examples/FlashOCC/test/train_8p_flashocc_r50_full.sh @@ -4,12 +4,47 @@ NETWORK="FlashOCC_R50" DEVICE_TYPE=$(uname -m) -WORLD_SIZE=8 +NUM_NPU=8 BATCH_SIZE=24 TOTAL_EPOCHS=24 +while [[ $# -gt 0 ]]; do + case $1 in + --NUM-NPU|--num-npu) + NUM_NPU="$2" + shift 2 + ;; + --BATCH-SIZE|--batch-size) + BATCH_SIZE="$2" + shift 2 + ;; + --TOTAL-EPOCHS|--total-epochs) + TOTAL_EPOCHS="$2" + shift 2 + ;; + --help) + echo "用法: $0 [选项]" + echo "选项:" + echo " --NUM-NPU 设置 NUM-NPU (默认: 8)" + echo " --BATCH-SIZE 设置 BATCH_SIZE (默认: 24)" + echo " --TOTAL-EPOCHS 设置 TOTAL_EPOCHS (默认: 24)" + echo " --help 显示帮助信息" + exit 0 + ;; + *) + echo "未知选项: $1" + echo "使用 --help 查看帮助" + exit 1 + ;; + esac +done + +echo "NUM_NPU: $NUM_NPU" +echo "BATCH_SIZE: $BATCH_SIZE" +echo "TOTAL_EPOCHS: $TOTAL_EPOCHS" + # 训练用例名称 -CASE_NAME=${NETWORK}_${WORLD_SIZE}p_bs${BATCH_SIZE}_e${TOTAL_EPOCHS}_full +CASE_NAME=${NETWORK}_${NUM_NPU}p_bs${BATCH_SIZE}_e${TOTAL_EPOCHS}_full echo "[FlashOCC] CASE_NAME = ${CASE_NAME}" # 创建输出目录 @@ -65,12 +100,30 @@ sed -i 's/^\(\s*\)is_cuda\s*=\s*True/\1is_cuda = False/' projects/mmdet3d_plugin # 每个step打印时间 sed -i 's/interval=1,/interval=50,/g' mmdetection3d/configs/_base_/default_runtime.py +cfg_file="projects/configs/flashocc/flashocc-r50.py" + +# 备份config文件 +cp ${cfg_file} ${cfg_file}.bak + +# 修改batchsize +sed -i "s/samples_per_gpu=*[0-9]\{1,\},/samples_per_gpu=$BATCH_SIZE,/g" ${cfg_file} + +#复原callback +restore_config() { + if [ -f ${cfg_file}.bak ]; then + mv -f ${cfg_file}.bak ${cfg_file} + fi +} + +#异常复原 +trap restore_config EXIT SIGINT SIGTERM ERR + # 训练开始时间 start_time=$(date +%s) # 开始训练 echo "[FlashOCC] Training..." -bash ./tools/dist_train.sh ./projects/configs/flashocc/flashocc-r50.py ${WORLD_SIZE} --work-dir ${OUTPUT_PATH}/work_dir > ${OUTPUT_PATH}/train.log 2>&1 & +bash ./tools/dist_train.sh ./projects/configs/flashocc/flashocc-r50.py ${NUM_NPU} --work-dir ${OUTPUT_PATH}/work_dir > ${OUTPUT_PATH}/train.log 2>&1 & wait # 训练结束时间 @@ -87,7 +140,7 @@ echo "[FlashOCC] E2E Training Time (sec) : ${e2e_time}" if [[ ${TOTAL_EPOCHS} == 24 ]]; then # 验证精度 echo "[FlashOCC] Evaluating ..." - bash ./tools/dist_test.sh ./projects/configs/flashocc/flashocc-r50.py ${OUTPUT_PATH}/work_dir/epoch_24.pth ${WORLD_SIZE} --eval mAP > ${OUTPUT_PATH}/eval_result.log 2>&1 & + bash ./tools/dist_test.sh ./projects/configs/flashocc/flashocc-r50.py ${OUTPUT_PATH}/work_dir/epoch_24.pth ${NUM_NPU} --eval mAP > ${OUTPUT_PATH}/eval_result.log 2>&1 & wait mIoU=$(grep -o "mIoU of 6019 samples: [0-9.]*" ${OUTPUT_PATH}/eval_result.log | awk 'END {print $NF}') echo "[FlashOCC] mIoU : ${mIoU}" @@ -96,7 +149,7 @@ fi # 将关键信息打印到 ${CASE_NAME}.log 中 echo "Network = ${NETWORK}" > ${OUTPUT_PATH}/${CASE_NAME}.log echo "DeviceType = ${DEVICE_TYPE}" >> ${OUTPUT_PATH}/${CASE_NAME}.log -echo "RankSize = ${WORLD_SIZE}" >> ${OUTPUT_PATH}/${CASE_NAME}.log +echo "RankSize = ${NUM_NPU}" >> ${OUTPUT_PATH}/${CASE_NAME}.log echo "BatchSize = ${BATCH_SIZE}" >> ${OUTPUT_PATH}/${CASE_NAME}.log echo "CaseName = ${CASE_NAME}" >> ${OUTPUT_PATH}/${CASE_NAME}.log echo "E2ETrainingTime = ${e2e_time}" >> ${OUTPUT_PATH}/${CASE_NAME}.log diff --git a/model_examples/FlashOCC/test/train_8p_flashocc_r50_perf.sh b/model_examples/FlashOCC/test/train_8p_flashocc_r50_perf.sh index 1e298a13e7b14dfaa2e24c430fd092a691afd743..f8c9b852e0a28b8948ddb5adf8c49fce0b7866d2 100644 --- a/model_examples/FlashOCC/test/train_8p_flashocc_r50_perf.sh +++ b/model_examples/FlashOCC/test/train_8p_flashocc_r50_perf.sh @@ -4,12 +4,47 @@ NETWORK="FlashOCC_R50" DEVICE_TYPE=$(uname -m) -WORLD_SIZE=8 +NUM_NPU=8 BATCH_SIZE=24 TOTAL_EPOCHS=1 +while [[ $# -gt 0 ]]; do + case $1 in + --NUM-NPU|--num-npu) + NUM_NPU="$2" + shift 2 + ;; + --BATCH-SIZE|--batch-size) + BATCH_SIZE="$2" + shift 2 + ;; + --TOTAL-EPOCHS|--total-epochs) + TOTAL_EPOCHS="$2" + shift 2 + ;; + --help) + echo "用法: $0 [选项]" + echo "选项:" + echo " --NUM-NPU 设置 NUM-NPU (默认: 8)" + echo " --BATCH-SIZE 设置 BATCH_SIZE (默认: 24)" + echo " --TOTAL-EPOCHS 设置 TOTAL_EPOCHS (默认: 1)" + echo " --help 显示帮助信息" + exit 0 + ;; + *) + echo "未知选项: $1" + echo "使用 --help 查看帮助" + exit 1 + ;; + esac +done + +echo "NUM_NPU: $NUM_NPU" +echo "BATCH_SIZE: $BATCH_SIZE" +echo "TOTAL_EPOCHS: $TOTAL_EPOCHS" + # 训练用例名称 -CASE_NAME=${NETWORK}_${WORLD_SIZE}p_bs${BATCH_SIZE}_e${TOTAL_EPOCHS}_perf +CASE_NAME=${NETWORK}_${NUM_NPU}p_bs${BATCH_SIZE}_e${TOTAL_EPOCHS}_perf echo "[FlashOCC] CASE_NAME = ${CASE_NAME}" # 创建输出目录 @@ -65,12 +100,30 @@ sed -i 's/^\(\s*\)is_cuda\s*=\s*True/\1is_cuda = False/' projects/mmdet3d_plugin # 每个step打印时间 sed -i 's/interval=50,/interval=1,/g' mmdetection3d/configs/_base_/default_runtime.py +cfg_file="projects/configs/flashocc/flashocc-r50-perf.py" + +# 备份config文件 +cp ${cfg_file} ${cfg_file}.bak + +# 修改batchsize +sed -i "s/samples_per_gpu=*[0-9]\{1,\},/samples_per_gpu=$BATCH_SIZE,/g" ${cfg_file} + +#复原callback +restore_config() { + if [ -f ${cfg_file}.bak ]; then + mv -f ${cfg_file}.bak ${cfg_file} + fi +} + +#异常复原 +trap restore_config EXIT SIGINT SIGTERM ERR + # 训练开始时间 start_time=$(date +%s) # 开始训练 echo "[FlashOCC] Training..." -bash ./tools/dist_train.sh ./projects/configs/flashocc/flashocc-r50-perf.py ${WORLD_SIZE} --work-dir ${OUTPUT_PATH}/work_dir > ${OUTPUT_PATH}/train.log 2>&1 & +bash ./tools/dist_train.sh ./projects/configs/flashocc/flashocc-r50-perf.py ${NUM_NPU} --work-dir ${OUTPUT_PATH}/work_dir > ${OUTPUT_PATH}/train.log 2>&1 & wait # 训练结束时间 @@ -84,7 +137,7 @@ e2e_time=$(($end_time - $start_time)) echo "[FlashOCC] E2E Training Time (sec) : ${e2e_time}" avg_time=`grep -a 'mmdet - INFO - Epoch ' ${OUTPUT_PATH}/train.log |awk -F "time: " '{print $2}' | awk -F ", " '{print $1}' | awk 'NR>10 {sum+=$1; count++} END {if (count != 0) printf("%.3f",sum/count)}'` -fps_value=$(awk BEGIN'{print ('$BATCH_SIZE' * '$WORLD_SIZE')/'$avg_time'}') +fps_value=$(awk BEGIN'{print ('$BATCH_SIZE' * '$NUM_NPU')/'$avg_time'}') # 吞吐量 echo "[FlashOCC] Final Performance images/sec : ${fps_value}" @@ -92,7 +145,7 @@ echo "[FlashOCC] Final Performance images/sec : ${fps_value}" # 将关键信息打印到 ${CASE_NAME}.log 中 echo "Network = ${NETWORK}" > ${OUTPUT_PATH}/${CASE_NAME}.log echo "DeviceType = ${DEVICE_TYPE}" >> ${OUTPUT_PATH}/${CASE_NAME}.log -echo "RankSize = ${WORLD_SIZE}" >> ${OUTPUT_PATH}/${CASE_NAME}.log +echo "RankSize = ${NUM_NPU}" >> ${OUTPUT_PATH}/${CASE_NAME}.log echo "BatchSize = ${BATCH_SIZE}" >> ${OUTPUT_PATH}/${CASE_NAME}.log echo "CaseName = ${CASE_NAME}" >> ${OUTPUT_PATH}/${CASE_NAME}.log echo "E2ETrainingTime = ${e2e_time}" >> ${OUTPUT_PATH}/${CASE_NAME}.log