diff --git a/model_examples/PointPillar/README.md b/model_examples/PointPillar/README.md index 0fac1e6b8ee31c8ad8eb62ca0dd90edb12103026..a4fbdc59c013b913d29241101f280db0991bcb61 100644 --- a/model_examples/PointPillar/README.md +++ b/model_examples/PointPillar/README.md @@ -195,10 +195,18 @@ code_path=model_examples/PointPillar ``` 2. 运行训练脚本。 该模型支持单机单机8卡训练 + + 运行脚本支持命令行参数: + - '--num-npu':NPU卡数,默认为8; + - '--batch-size': 每卡batch-size大小,默认为4; ``` cd tools/test + # 8卡精度脚本 bash train_pointpillar_full_8p.sh + (option) bash train_pointpillar_full_8p.sh --num-npu 8 --batch-size 4 + # 8卡性能脚本 bash train_pointpillar_performance_8p.sh + (option) bash train_pointpillar_performance_8p.sh --num-npu 8 --batch-size 4 ``` 训练完成后,权重文件保存在当前路径下,并输出模型训练精度和性能信息 @@ -214,8 +222,8 @@ code_path=model_examples/PointPillar 训练性能结果展示表 | Exp | global batch size | FPS | | - | - | - | -| 8p-竞品A | 256 | 486 | -| 8p-Atlas 800T A2| 256 | 576 | +| 8p-竞品A | 32 | 60.75 | +| 8p-Atlas 800T A2| 32 | 70.79 | ## FAQ ### ImportError:/usr/local/gcc-7.5.0/lib64/libgomp.so.1:cannot allocate memory in static TLS block, @@ -263,3 +271,4 @@ pip install protobuf [2025-06-12] **NEW:** PointPillar模型更新fps计算方式,更新性能指标 +[2025-08-25] **NEW:** PointPillar模型更新fps,更新脚本传参 diff --git a/model_examples/PointPillar/test/train_pointpillar_full_8p.sh b/model_examples/PointPillar/test/train_pointpillar_full_8p.sh index 62bb80d4022565287f26352abefb506d2e0ae3ba..8530e6391ac213a79462e5e2775c0a425cfb43d4 100644 --- a/model_examples/PointPillar/test/train_pointpillar_full_8p.sh +++ b/model_examples/PointPillar/test/train_pointpillar_full_8p.sh @@ -14,19 +14,48 @@ fi #集合通信参数,不需要修改 -export RANK_SIZE=8 RANK_ID_START=0 #基础参数,需要模型审视修改 #网络名称,同目录名称 Network="PointPillar" #训练batch_size -batch_size=32 +batch_size=4 +# Number of NPU cards to launch; override with --num-npu +num_npu=8 #训练模型配置文件 cfg_file="cfgs/kitti_models/pointpillar.yaml" #计算mAP时整除的倍率(检测类别数3,每个类别对应的指标数4) num_metric=12 +while [[ $# -gt 0 ]]; do + case $1 in + --NUM-NPU|--num-npu) + num_npu="${2:?选项 $1 缺少参数值}" + shift 2 + ;; +
--BATCH-SIZE|--batch-size) + batch_size="${2:?选项 $1 缺少参数值}" + shift 2 + ;; + --help) + echo "用法: $0 [选项]" + echo "选项:" + echo " --NUM-NPU 设置 NUM-NPU (默认: 8)" + echo " --BATCH-SIZE 设置 BATCH_SIZE (默认: 4)" + echo " --help 显示帮助信息" + exit 0 + ;; + *) + echo "未知选项: $1" + echo "使用 --help 查看帮助" + exit 1 + ;; + esac +done + +echo "num_npu: $num_npu" +echo "batch_size: $batch_size" #设置环境变量,不需要修改 ASCEND_DEVICE_ID=0 @@ -48,6 +77,22 @@ if [ x"${etp_flag}" != x"true" ];then source ${test_path_dir}/env_npu.sh fi +# Back up the config file so the original can be put back afterwards +cp "${cfg_file}" "${cfg_file}.bak" + +# Write the requested per-card batch size into the config +sed -i "s/BATCH_SIZE_PER_GPU:[[:space:]]*[0-9]\{1,\}/BATCH_SIZE_PER_GPU: $batch_size/g" "${cfg_file}" + +# Move the backup back over the config, if a backup still exists +restore_config() { + if [ -f "${cfg_file}.bak" ]; then + mv -f "${cfg_file}.bak" "${cfg_file}" + fi +} + +# Run restore_config on EXIT, SIGINT, SIGTERM and ERR +trap restore_config EXIT SIGINT SIGTERM ERR + #训练开始时间,不需要修改 start_time=$(date +%s) @@ -66,7 +111,7 @@ done echo $PORT nohup python -m torch.distributed.launch \ - --nproc_per_node=${RANK_SIZE} \ + --nproc_per_node=${num_npu} \ --rdzv_endpoint=localhost:${PORT} \ train.py \ --launcher pytorch \ @@ -77,12 +122,14 @@ wait end_time=$(date +%s) e2e_time=$(( $end_time - $start_time )) +# Restore the original config from the backup now that training has finished +restore_config #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 avg_time=`grep -a 'Batch time: ' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F 'Batch time: ' '{print $2}'|awk 'NR>10'|awk -F '(' '{print $1}'|awk '{a+=$1} END {if (NR != 0) printf("%.3f",a/NR)}'` -FPS=`awk 'BEGIN{printf "%.3f\n", '$batch_size'*'${RANK_SIZE}'/'$avg_time'}'` +FPS=`awk 'BEGIN{printf "%.3f\n", '$batch_size'*'${num_npu}'/'$avg_time'}'` #打印,不需要修改 echo "Final Performance images/sec : $FPS" @@ -92,13 +139,13 @@ echo "E2E Training Duration sec : $e2e_time" #训练用例信息,不需要修改 BatchSize=${batch_size} DeviceType=`uname -m` -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+CaseName=${Network}_bs${BatchSize}_${num_npu}'p'_'acc' #获取性能数据,不需要修改 #吞吐量 ActualFPS=${FPS} #单迭代训练时长 -TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'` +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${num_npu}'*1000/'${FPS}'}'` #打印精度数据,并打印到${CaseName}.log中 declare -A metrics @@ -114,7 +161,7 @@ done #关键信息打印到${CaseName}.log中,不需要修改 echo "Network = ${Network}" > $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${num_npu}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "BatchSize = ${BatchSize}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "DeviceType = ${DeviceType}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "CaseName = ${CaseName}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/model_examples/PointPillar/test/train_pointpillar_performance_8p.sh b/model_examples/PointPillar/test/train_pointpillar_performance_8p.sh index 12e727e30100ec284956088c40e9ffbde0308264..b6bec642b5c0d40586289f978773298a28e3afea 100644 --- a/model_examples/PointPillar/test/train_pointpillar_performance_8p.sh +++ b/model_examples/PointPillar/test/train_pointpillar_performance_8p.sh @@ -14,19 +14,48 @@ fi #集合通信参数,不需要修改 -export RANK_SIZE=8 RANK_ID_START=0 #基础参数,需要模型审视修改 #网络名称,同目录名称 Network="PointPillar" #训练batch_size -batch_size=32 +batch_size=4 +# Number of NPU cards to launch; override with --num-npu +num_npu=8 #训练模型配置文件 cfg_file="cfgs/kitti_models/pointpillar.yaml" #计算mAP时整除的倍率(检测类别数3,每个类别对应的指标数4) num_metric=12 +while [[ $# -gt 0 ]]; do + case $1 in + --NUM-NPU|--num-npu) + num_npu="${2:?选项 $1 缺少参数值}" + shift 2 + ;; + --BATCH-SIZE|--batch-size) + batch_size="${2:?选项 $1 缺少参数值}" + shift 2 + ;; + --help) + echo "用法: $0 [选项]" + echo "选项:" + echo " --NUM-NPU 设置 NUM-NPU (默认: 8)" + echo " --BATCH-SIZE 设置 BATCH_SIZE (默认: 4)" + echo " --help 显示帮助信息" + exit 0 + ;; + *) + echo "未知选项: $1" + echo "使用 --help 查看帮助" + exit 1 + ;; + esac +done
+ +echo "num_npu: $num_npu" +echo "batch_size: $batch_size" #设置环境变量,不需要修改 ASCEND_DEVICE_ID=0 @@ -48,6 +77,22 @@ if [ x"${etp_flag}" != x"true" ];then source ${test_path_dir}/env_npu.sh fi +# Back up the config file so the original can be put back afterwards +cp "${cfg_file}" "${cfg_file}.bak" + +# Write the requested per-card batch size into the config +sed -i "s/BATCH_SIZE_PER_GPU:[[:space:]]*[0-9]\{1,\}/BATCH_SIZE_PER_GPU: $batch_size/g" "${cfg_file}" + +# Move the backup back over the config, if a backup still exists +restore_config() { + if [ -f "${cfg_file}.bak" ]; then + mv -f "${cfg_file}.bak" "${cfg_file}" + fi +} + +# Run restore_config on EXIT, SIGINT, SIGTERM and ERR +trap restore_config EXIT SIGINT SIGTERM ERR + #训练开始时间,不需要修改 start_time=$(date +%s) @@ -66,7 +111,7 @@ done echo $PORT nohup python -m torch.distributed.launch \ - --nproc_per_node=${RANK_SIZE} \ + --nproc_per_node=${num_npu} \ --rdzv_endpoint=localhost:${PORT} \ train.py \ --launcher pytorch \ @@ -79,12 +124,11 @@ wait end_time=$(date +%s) e2e_time=$(( $end_time - $start_time )) - #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 avg_time=`grep -a 'Batch time: ' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F 'Batch time: ' '{print $2}'|awk 'NR>10'|awk -F '(' '{print $1}'|tail -100|awk '{a+=$1} END {if (NR != 0) printf("%.3f",a/NR)}'` -FPS=`awk 'BEGIN{printf "%.3f\n", '$batch_size'*'${RANK_SIZE}'/'$avg_time'}'` +FPS=`awk 'BEGIN{printf "%.3f\n", '$batch_size'*'${num_npu}'/'$avg_time'}'` #打印,不需要修改 echo "Final Performance images/sec : $FPS" @@ -94,17 +138,17 @@ echo "E2E Training Duration sec : $e2e_time" #训练用例信息,不需要修改 BatchSize=${batch_size} DeviceType=`uname -m` -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' +CaseName=${Network}_bs${BatchSize}_${num_npu}'p'_'acc' #获取性能数据,不需要修改 #吞吐量 ActualFPS=${FPS} #单迭代训练时长 -TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'` +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${num_npu}'*1000/'${FPS}'}'` #关键信息打印到${CaseName}.log中,不需要修改 echo "Network = ${Network}" > $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = 
${RANK_SIZE}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${num_npu}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "BatchSize = ${BatchSize}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "DeviceType = ${DeviceType}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "CaseName = ${CaseName}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log