diff --git a/.clang_format.hook b/.clang_format.hook index 1d928216867c0ba3897d71542fea44debf8d72a0..fdc3c054c0be41a1bda613fc572af0d9cf6f3c13 100644 --- a/.clang_format.hook +++ b/.clang_format.hook @@ -1,15 +1,20 @@ #!/bin/bash set -e -readonly VERSION="3.8" +readonly VERSION="13.0.0" version=$(clang-format -version) +if ! [[ $(python -V 2>&1 | awk '{print $2}' | awk -F '.' '{print $1$2}') -ge 36 ]]; then + echo "clang-format installation by pip need python version great equal 3.6, + please change the default python to higher version." + exit 1 +fi + if ! [[ $version == *"$VERSION"* ]]; then - echo "clang-format version check failed." - echo "a version contains '$VERSION' is needed, but get '$version'" - echo "you can install the right version, and make an soft-link to '\$PATH' env" - exit -1 + # low version of pip may not have the source of clang-format whl + pip install --upgrade pip + pip install clang-format==13.0.0 fi clang-format $@ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1584bc76a9dd8ddff9d05a8cb693bcbd2e09fcde..5f7fec8c333c302c22454f4663fdd3b13d1c971e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,4 +32,4 @@ description: Format files with ClangFormat entry: bash .clang_format.hook -i language: system - files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$ + files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$ \ No newline at end of file diff --git a/LICENSE b/LICENSE index 261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64..5fe86943b37a77970679f826e78c71045569f819 100644 --- a/LICENSE +++ b/LICENSE @@ -1,3 +1,5 @@ +Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved + Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ @@ -186,7 +188,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright [yyyy] [name of copyright owner] + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/PPOCRLabel/PPOCRLabel.py b/PPOCRLabel/PPOCRLabel.py index 0a3ae1cb3b8fc004aa7c48dc86b6546a80e17a0f..6c8154d1c9ac557b3033be32608d95d37f673b86 100644 --- a/PPOCRLabel/PPOCRLabel.py +++ b/PPOCRLabel/PPOCRLabel.py @@ -1617,8 +1617,9 @@ class MainWindow(QMainWindow): key_cls = 'None' if not self.kie_mode else box.get('key_cls', 'None') shapes.append((box['transcription'], box['points'], None, key_cls, box.get('difficult', False))) - self.loadLabels(shapes) - self.canvas.verified = False + if shapes != []: + self.loadLabels(shapes) + self.canvas.verified = False def validFilestate(self, filePath): if filePath not in self.fileStatedict.keys(): @@ -2203,7 +2204,7 @@ class MainWindow(QMainWindow): msg = 'Can not recognise the detection box in ' + self.filePath + '. Please change manually' QMessageBox.information(self, "Information", msg) return - result = self.ocr.ocr(img_crop, cls=True, det=False) + result = self.ocr.ocr(img_crop, cls=True, det=False)[0] if result[0][0] != '': if shape.line_color == DEFAULT_LOCK_COLOR: shape.label = result[0][0] @@ -2264,7 +2265,7 @@ class MainWindow(QMainWindow): msg = 'Can not recognise the detection box in ' + self.filePath + '. 
Please change manually' QMessageBox.information(self, "Information", msg) return - result = self.ocr.ocr(img_crop, cls=True, det=False) + result = self.ocr.ocr(img_crop, cls=True, det=False)[0] if result[0][0] != '': result.insert(0, box) print('result in reRec is ', result) @@ -2415,12 +2416,12 @@ class MainWindow(QMainWindow): # merge the text result in the cell texts = '' probs = 0. # the probability of the cell is avgerage prob of every text box in the cell - bboxes = self.ocr.ocr(img_crop, det=True, rec=False, cls=False) + bboxes = self.ocr.ocr(img_crop, det=True, rec=False, cls=False)[0] if len(bboxes) > 0: bboxes.reverse() # top row text at first for _bbox in bboxes: patch = get_rotate_crop_image(img_crop, np.array(_bbox, np.float32)) - rec_res = self.ocr.ocr(patch, det=False, rec=True, cls=False) + rec_res = self.ocr.ocr(patch, det=False, rec=True, cls=False)[0] text = rec_res[0][0] if text != '': texts += text + ('' if text[0].isalpha() else ' ') # add space between english word diff --git a/PPOCRLabel/README.md b/PPOCRLabel/README.md index 9c483e1feff5a07467f7aa90343391888004bce7..456d313e2e754cc033c48b6ea252c0d1c3ded9b5 100644 --- a/PPOCRLabel/README.md +++ b/PPOCRLabel/README.md @@ -60,7 +60,7 @@ PPOCRLabel can be started in two ways: whl package and Python script. The whl pa ```bash pip install PPOCRLabel # install -# Select label mode and run +# Select label mode and run PPOCRLabel # [Normal mode] for [detection + recognition] labeling PPOCRLabel --kie True # [KIE mode] for [detection + recognition + keyword extraction] labeling ``` @@ -76,7 +76,7 @@ PPOCRLabel --kie True # [KIE mode] for [detection + recognition + keyword extrac pip3 install PPOCRLabel pip3 install trash-cli -# Select label mode and run +# Select label mode and run PPOCRLabel # [Normal mode] for [detection + recognition] labeling PPOCRLabel --kie True # [KIE mode] for [detection + recognition + keyword extraction] labeling ``` @@ -86,7 +86,7 @@ PPOCRLabel --kie True # [KIE mode] for [detection + recognition + keyword extrac pip3 install PPOCRLabel pip3 install opencv-contrib-python-headless==4.2.0.32 -# Select label mode and run +# Select label mode and run PPOCRLabel # [Normal mode] for [detection + recognition] labeling PPOCRLabel --kie True # [KIE mode] for [detection + recognition + keyword extraction] labeling ``` @@ -97,17 +97,17 @@ If you modify the PPOCRLabel file (for example, specifying a new built-in model) ```bash cd ./PPOCRLabel # Switch to the PPOCRLabel directory -# Select label mode and run +# Select label mode and run python PPOCRLabel.py # [Normal mode] for [detection + recognition] labeling python PPOCRLabel.py --kie True # [KIE mode] for [detection + recognition + keyword extraction] labeling ``` #### 1.2.3 Build and Install the Whl Package Locally -Compile and install a new whl package, where 1.0.2 is the version number, you can specify the new version in 'setup.py'. +Compile and install a new whl package, where 0.0.0 is the version number, you can specify the new version in 'setup.py'. ```bash cd ./PPOCRLabel python3 setup.py bdist_wheel -pip3 install dist/PPOCRLabel-2.1.2-py2.py3-none-any.whl +pip3 install dist/PPOCRLabel-0.0.0-py2.py3-none-any.whl ``` @@ -140,24 +140,24 @@ pip3 install dist/PPOCRLabel-2.1.2-py2.py3-none-any.whl 10. Labeling result: the user can export the label result manually through the menu "File - Export Label", while the program will also export automatically if "File - Auto export Label Mode" is selected. 
The manually checked label will be stored in *Label.txt* under the opened picture folder. Click "File"-"Export Recognition Results" in the menu bar, the recognition training data of such pictures will be saved in the *crop_img* folder, and the recognition label will be saved in *rec_gt.txt*[4]. ### 2.2 Table Annotation -The table annotation is aimed at extracting the structure of the table in a picture and converting it to Excel format, +The table annotation is aimed at extracting the structure of the table in a picture and converting it to Excel format, so the annotation needs to be done simultaneously with external software to edit Excel. -In PPOCRLabel, complete the text information labeling (text and position), complete the table structure information +In PPOCRLabel, complete the text information labeling (text and position), complete the table structure information labeling in the Excel file, the recommended steps are: -1. Table annotation: After opening the table picture, click on the `Table Recognition` button in the upper right corner of PPOCRLabel, which will call the table recognition model in PP-Structure to automatically label +1. Table annotation: After opening the table picture, click on the `Table Recognition` button in the upper right corner of PPOCRLabel, which will call the table recognition model in PP-Structure to automatically label the table and pop up Excel at the same time. -2. Change the recognition result: **label each cell** (i.e. the text in a cell is marked as a box). Right click on the box and click on `Cell Re-recognition`. +2. Change the recognition result: **label each cell** (i.e. the text in a cell is marked as a box). Right click on the box and click on `Cell Re-recognition`. You can use the model to automatically recognise the text within a cell. 3. Mark the table structure: for each cell contains the text, **mark as any identifier (such as `1`) in Excel**, to ensure that the merged cell structure is same as the original picture. - > Note: If there are blank cells in the table, you also need to mark them with a bounding box so that the total number of cells is the same as in the image. + > Note: If there are blank cells in the table, you also need to mark them with a bounding box so that the total number of cells is the same as in the image. 4. ***Adjust cell order:*** Click on the menu `View` - `Show Box Number` to show the box ordinal numbers, and drag all the results under the 'Recognition Results' column on the right side of the software interface to make the box numbers are arranged from left to right, top to bottom -5. Export JSON format annotation: close all Excel files corresponding to table images, click `File`-`Export table JSON annotation` to obtain JSON annotation results. +5. Export JSON format annotation: close all Excel files corresponding to table images, click `File`-`Export Table Label` to obtain `gt.txt` annotation results. ### 2.3 Note @@ -209,9 +209,9 @@ labeling in the Excel file, the recommended steps are: - Default model: PPOCRLabel uses the Chinese and English ultra-lightweight OCR model in PaddleOCR by default, supports Chinese, English and number recognition, and multiple language detection. - Model language switching: Changing the built-in model language is supportable by clicking "PaddleOCR"-"Choose OCR Model" in the menu bar. Currently supported languages​include French, German, Korean, and Japanese. 
- For specific model download links, please refer to [PaddleOCR Model List](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_en/models_list_en.md#multilingual-recognition-modelupdating) + For specific model download links, please refer to [PaddleOCR Model List](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6/doc/doc_en/models_list_en.md) -- **Custom Model**: If users want to replace the built-in model with their own inference model, they can follow the [Custom Model Code Usage](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/doc/doc_en/whl_en.md#31-use-by-code) by modifying PPOCRLabel.py for [Instantiation of PaddleOCR class](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/PPOCRLabel/PPOCRLabel.py#L86) : +- **Custom Model**: If users want to replace the built-in model with their own inference model, they can follow the [Custom Model Code Usage](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6/doc/doc_en/whl_en.md#31-use-by-code) by modifying PPOCRLabel.py for [Instantiation of PaddleOCR class](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6/PPOCRLabel/PPOCRLabel.py#L97) : add parameter `det_model_dir` in `self.ocr = PaddleOCR(use_pdserving=False, use_angle_cls=True, det=True, cls=True, use_gpu=gpu, lang=lang) ` @@ -233,7 +233,7 @@ PPOCRLabel supports three ways to export Label.txt ``` cd ./PPOCRLabel # Change the directory to the PPOCRLabel folder - python gen_ocr_train_val_test.py --trainValTestRatio 6:2:2 --datasetRootPath ../train_data + python gen_ocr_train_val_test.py --trainValTestRatio 6:2:2 --datasetRootPath ../train_data ``` Parameter Description: @@ -255,7 +255,7 @@ PPOCRLabel supports three ways to export Label.txt |- word_003.jpg | ... ``` - + ### 3.5 Error message - If paddleocr is installed with whl, it has a higher priority than calling PaddleOCR class with paddleocr.py, which may cause an exception if whl package is not updated. diff --git a/PPOCRLabel/README_ch.md b/PPOCRLabel/README_ch.md index afe1a08ff9c5445b2643b9453985951fcb7a90f5..ef3e18a444c05af165ab26214403e49b4ce40277 100644 --- a/PPOCRLabel/README_ch.md +++ b/PPOCRLabel/README_ch.md @@ -101,12 +101,12 @@ python PPOCRLabel.py --lang ch #### 1.2.3 本地构建whl包并安装 -编译与安装新的whl包,其中1.0.2为版本号,可在 `setup.py` 中指定新版本。 +编译与安装新的whl包,其中0.0.0为版本号,可在 `setup.py` 中指定新版本。 ```bash cd ./PPOCRLabel -python3 setup.py bdist_wheel -pip3 install dist/PPOCRLabel-2.1.2-py2.py3-none-any.whl -i https://mirror.baidu.com/pypi/simple +python3 setup.py bdist_wheel +pip3 install dist/PPOCRLabel-0.0.0-py2.py3-none-any.whl -i https://mirror.baidu.com/pypi/simple ``` @@ -126,20 +126,20 @@ pip3 install dist/PPOCRLabel-2.1.2-py2.py3-none-any.whl -i https://mirror.baidu. 9. 删除:点击 “删除图像”,图片将会被删除至回收站。 10. 导出结果:用户可以通过菜单中“文件-导出标记结果”手动导出,同时也可以点击“文件 - 自动导出标记结果”开启自动导出。手动确认过的标记将会被存放在所打开图片文件夹下的*Label.txt*中。在菜单栏点击 “文件” - "导出识别结果"后,会将此类图片的识别训练数据保存在*crop_img*文件夹下,识别标签保存在*rec_gt.txt*中[4]。 -### 2.2 表格标注 +### 2.2 表格标注([视频演示](https://www.bilibili.com/video/BV1wR4y1v7JE/?share_source=copy_web&vd_source=cf1f9d24648d49636e3d109c9f9a377d&t=1998)) 表格标注针对表格的结构化提取,将图片中的表格转换为Excel格式,因此标注时需要配合外部软件打开Excel同时完成。在PPOCRLabel软件中完成表格中的文字信息标注(文字与位置)、在Excel文件中完成表格结构信息标注,推荐的步骤为: 1. 表格识别:打开表格图片后,点击软件右上角 `表格识别` 按钮,软件调用PP-Structure中的表格识别模型,自动为表格打标签,同时弹出Excel -2. 更改标注结果:**以表格中的单元格为单位增加标注框**(即一个单元格内的文字都标记为一个框)。标注框上鼠标右键后点击 `单元格重识别` +2. 更改标注结果:**以表格中的单元格为单位增加标注框**(即一个单元格内的文字都标记为一个框)。标注框上鼠标右键后点击 `单元格重识别` 可利用模型自动识别单元格内的文字。 - + > 注意:如果表格中存在空白单元格,同样需要使用一个标注框将其标出,使得单元格总数与图像中保持一致。 3. 
**调整单元格顺序**:点击软件`视图-显示框编号` 打开标注框序号,在软件界面右侧拖动 `识别结果` 一栏下的所有结果,使得标注框编号按照从左到右,从上到下的顺序排列,按行依次标注。 - + 4. 标注表格结构:**在外部Excel软件中,将存在文字的单元格标记为任意标识符(如 `1` )**,保证Excel中的单元格合并情况与原图相同即可(即不需要Excel中的单元格文字与图片中的文字完全相同) -5. 导出JSON格式:关闭所有表格图像对应的Excel,点击 `文件`-`导出表格JSON标注` 获得JSON标注结果。 +5. 导出JSON格式:关闭所有表格图像对应的Excel,点击 `文件`-`导出表格标注`,生成gt.txt标注文件。 ### 2.3 注意 @@ -191,9 +191,9 @@ pip3 install dist/PPOCRLabel-2.1.2-py2.py3-none-any.whl -i https://mirror.baidu. - 默认模型:PPOCRLabel默认使用PaddleOCR中的中英文超轻量OCR模型,支持中英文与数字识别,多种语言检测。 - - 模型语言切换:用户可通过菜单栏中 "PaddleOCR" - "选择模型" 切换内置模型语言,目前支持的语言包括法文、德文、韩文、日文。具体模型下载链接可参考[PaddleOCR模型列表](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_ch/models_list.md). + - 模型语言切换:用户可通过菜单栏中 "PaddleOCR" - "选择模型" 切换内置模型语言,目前支持的语言包括法文、德文、韩文、日文。具体模型下载链接可参考[PaddleOCR模型列表](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6/doc/doc_ch/models_list.md). - - **自定义模型**:如果用户想将内置模型更换为自己的推理模型,可根据[自定义模型代码使用](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_ch/whl.md#%E8%87%AA%E5%AE%9A%E4%B9%89%E6%A8%A1%E5%9E%8B),通过修改PPOCRLabel.py中针对[PaddleOCR类的实例化](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/PPOCRLabel/PPOCRLabel.py#L116) 实现,例如指定检测模型:`self.ocr = PaddleOCR(det=True, cls=True, use_gpu=gpu, lang=lang) `,在 `det_model_dir` 中传入 自己的模型即可。 + - **自定义模型**:如果用户想将内置模型更换为自己的推理模型,可根据[自定义模型代码使用](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6/doc/doc_ch/whl.md#3-%E8%87%AA%E5%AE%9A%E4%B9%89%E6%A8%A1%E5%9E%8B),通过修改PPOCRLabel.py中针对[PaddleOCR类的实例化](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6/PPOCRLabel/PPOCRLabel.py#L97) 或者[PPStructure](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6/PPOCRLabel/PPOCRLabel.py#L104)实现,例如指定检测模型:`self.ocr = PaddleOCR(det=True, cls=True, use_gpu=gpu, lang=lang) `,在 `det_model_dir` 中传入自己的模型即可。 ### 3.3 导出标记结果 @@ -213,7 +213,7 @@ PPOCRLabel支持三种导出方式: ``` cd ./PPOCRLabel # 将目录切换到PPOCRLabel文件夹下 -python gen_ocr_train_val_test.py --trainValTestRatio 6:2:2 --datasetRootPath ../train_data +python gen_ocr_train_val_test.py --trainValTestRatio 6:2:2 --datasetRootPath ../train_data ``` 参数说明: @@ -235,7 +235,7 @@ python gen_ocr_train_val_test.py --trainValTestRatio 6:2:2 --datasetRootPath ../ |- word_003.jpg | ... 
``` - + ### 3.5 错误提示 - 如果同时使用whl包安装了paddleocr,其优先级大于通过paddleocr.py调用PaddleOCR类,whl包未更新时会导致程序异常。 diff --git a/PPOCRLabel/libs/autoDialog.py b/PPOCRLabel/libs/autoDialog.py index 189a590de851228e08d71f1dd2c00c823b9c2b0c..55636eec0fb35add23224cfc6917374d837f2191 100644 --- a/PPOCRLabel/libs/autoDialog.py +++ b/PPOCRLabel/libs/autoDialog.py @@ -40,7 +40,7 @@ class Worker(QThread): if self.model == 'paddle': h, w, _ = cv2.imdecode(np.fromfile(Imgpath, dtype=np.uint8), 1).shape if h > 32 and w > 32: - self.result_dic = self.ocr.ocr(Imgpath, cls=True, det=True) + self.result_dic = self.ocr.ocr(Imgpath, cls=True, det=True)[0] else: print('The size of', Imgpath, 'is too small to be recognised') self.result_dic = None diff --git a/PPOCRLabel/requirements.txt b/PPOCRLabel/requirements.txt index d66dba23788196291544142c114a841522b339dc..a10b3453a91d1af62d63b06b7f86f3b6ea2f5962 100644 --- a/PPOCRLabel/requirements.txt +++ b/PPOCRLabel/requirements.txt @@ -1,3 +1,3 @@ pyqt5 paddleocr -xlrd==1.2.0 \ No newline at end of file +xlrd==1.2.0 diff --git a/PPOCRLabel/setup.py b/PPOCRLabel/setup.py index a112df544ee385ba3dc87ffac3e15a9dc390c4db..9770b632bd44e820fe5a03558b1836ddcebb85cb 100644 --- a/PPOCRLabel/setup.py +++ b/PPOCRLabel/setup.py @@ -33,10 +33,10 @@ setup( package_dir={'PPOCRLabel': ''}, include_package_data=True, entry_points={"console_scripts": ["PPOCRLabel= PPOCRLabel.PPOCRLabel:main"]}, - version='2.1.2', + version='2.1.3', install_requires=requirements, license='Apache License 2.0', - description='PPOCRLabel is a semi-automatic graphic annotation tool suitable for OCR field, with built-in PPOCR model to automatically detect and re-recognize data. It is written in python3 and pyqt5, supporting rectangular box annotation and four-point annotation modes. Annotations can be directly used for the training of PPOCR detection and recognition models', + description='PPOCRLabelv2 is a semi-automatic graphic annotation tool suitable for OCR field, with built-in PP-OCR model to automatically detect and re-recognize data. It is written in Python3 and PyQT5, supporting rectangular box, table, irregular text and key information annotation modes. Annotations can be directly used for the training of PP-OCR detection and recognition models.', long_description=readme(), long_description_content_type='text/markdown', url='https://github.com/PaddlePaddle/PaddleOCR', diff --git a/README.md b/README.md old mode 100644 new mode 100755 index 8e869f6de551dd18ea0e8e4768081b5129ba87ea..a57365e0a67225aea5da981865c1ee97ea1237d8 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -English | [简体中文](README_ch.md) | [हिन्दी](./doc/doc_i18n/README_हिन्द.md) | [日本語](./doc/doc_i18n/README_日本語.md) | [한국인](./doc/doc_i18n/README_한국어.md) | [Pу́сский язы́к](./doc/doc_i18n/README_Ру́сский_язы́к.md) +[English](README_en.md) | 简体中文 | [हिन्दी](./doc/doc_i18n/README_हिन्द.md) | [日本語](./doc/doc_i18n/README_日本語.md) | [한국인](./doc/doc_i18n/README_한국어.md) | [Pу́сский язы́к](./doc/doc_i18n/README_Ру́сский_язы́к.md)

@@ -13,195 +13,220 @@ English | [简体中文](README_ch.md) | [हिन्दी](./doc/doc_i18n/REA

-## Introduction +## 简介 -PaddleOCR aims to create multilingual, awesome, leading, and practical OCR tools that help users train better models and apply them into practice. +PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力开发者训练出更好的模型,并应用落地。
- +
-## 📣 Recent updates -- 💥 **Live Preview: Oct 24 - Oct 26, China Standard Time, 20:30**, Engineers@PaddleOCR will show PP-StructureV2 optimization strategy for 3 days. - - Scan the QR code below using WeChat, follow the PaddlePaddle official account and fill out the questionnaire to join the WeChat group, get the live link and 20G OCR learning materials (including PDF2Word application, 10 models in vertical scenarios, etc.) +## 📣 近期更新 +- **🔥2023.8.7 发布 PaddleOCR [release/2.7](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.7)** + - 发布[PP-OCRv4](./doc/doc_ch/PP-OCRv4_introduction.md),提供mobile和server两种模型 + - PP-OCRv4-mobile:速度可比情况下,中文场景效果相比于PP-OCRv3再提升4.5%,英文场景提升10%,80语种多语言模型平均识别准确率提升8%以上 + - PP-OCRv4-server:发布了目前精度最高的OCR模型,中英文场景上检测模型精度提升4.9%, 识别模型精度提升2% + 可参考[快速开始](./doc/doc_ch/quickstart.md) 一行命令快速使用,同时也可在飞桨AI套件(PaddleX)中的[通用OCR产业方案](https://aistudio.baidu.com/aistudio/modelsdetail?modelId=286)中低代码完成模型训练、推理、高性能部署全流程 + - 发布[PP-ChatOCR](https://aistudio.baidu.com/aistudio/modelsdetail?modelId=332) ,使用融合PP-OCR模型和文心大模型的通用场景关键信息抽取全新方案 +- 🔨**2022.11 新增实现[4种前沿算法](doc/doc_ch/algorithm_overview.md)**:文本检测 [DRRG](doc/doc_ch/algorithm_det_drrg.md), 文本识别 [RFL](doc/doc_ch/algorithm_rec_rfl.md), 文本超分[Text Telescope](doc/doc_ch/algorithm_sr_telescope.md),公式识别[CAN](doc/doc_ch/algorithm_rec_can.md) +- **2022.10 优化[JS版PP-OCRv3模型](./deploy/paddlejs/README_ch.md)**:模型大小仅4.3M,预测速度提升8倍,配套web demo开箱即用 +- **💥 直播回放:PaddleOCR研发团队详解PP-StructureV2优化策略**。微信扫描[下方二维码](#开源社区),关注公众号并填写问卷后进入官方交流群,获取直播回放链接与20G重磅OCR学习大礼包(内含PDF转Word应用程序、10种垂类模型、《动手学OCR》电子书等) +- **🔥2022.8.24 发布 PaddleOCR [release/2.6](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.6)** + - 发布[PP-StructureV2](./ppstructure/README_ch.md),系统功能性能全面升级,适配中文场景,新增支持[版面复原](./ppstructure/recovery/README_ch.md),支持**一行命令完成PDF转Word**; + - [版面分析](./ppstructure/layout/README_ch.md)模型优化:模型存储减少95%,速度提升11倍,平均CPU耗时仅需41ms; + - [表格识别](./ppstructure/table/README_ch.md)模型优化:设计3大优化策略,预测耗时不变情况下,模型精度提升6%; + - [关键信息抽取](./ppstructure/kie/README_ch.md)模型优化:设计视觉无关模型结构,语义实体识别精度提升2.8%,关系抽取精度提升9.1%。 +- 🔥**2022.8 发布 [OCR场景应用集合](./applications)**:包含数码管、液晶屏、车牌、高精度SVTR模型、手写体识别等**9个垂类模型**,覆盖通用,制造、金融、交通行业的主要OCR垂类应用。 + +> [更多](./doc/doc_ch/update.md) + +## 🌟 特性 + +支持多种OCR相关前沿算法,在此基础上打造产业级特色模型[PP-OCR](./doc/doc_ch/ppocr_introduction.md)、[PP-Structure](./ppstructure/README_ch.md)和[PP-ChatOCR](https://aistudio.baidu.com/aistudio/projectdetail/6488689),并打通数据生产、模型训练、压缩、预测部署全流程。
- +
-- **🔥2022.8.24 Release PaddleOCR [release/2.6](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.6)** - - Release [PP-StructureV2](./ppstructure/),with functions and performance fully upgraded, adapted to Chinese scenes, and new support for [Layout Recovery](./ppstructure/recovery) and **one line command to convert PDF to Word**; - - [Layout Analysis](./ppstructure/layout) optimization: model storage reduced by 95%, while speed increased by 11 times, and the average CPU time-cost is only 41ms; - - [Table Recognition](./ppstructure/table) optimization: 3 optimization strategies are designed, and the model accuracy is improved by 6% under comparable time consumption; - - [Key Information Extraction](./ppstructure/kie) optimization:a visual-independent model structure is designed, the accuracy of semantic entity recognition is increased by 2.8%, and the accuracy of relation extraction is increased by 9.1%. -- **🔥2022.8 Release [OCR scene application collection](./applications/README_en.md)** - - Release **9 vertical models** such as digital tube, LCD screen, license plate, handwriting recognition model, high-precision SVTR model, etc, covering the main OCR vertical applications in general, manufacturing, finance, and transportation industries. -- **2022.8 Add implementation of [8 cutting-edge algorithms](doc/doc_en/algorithm_overview_en.md)** - - Text Detection: [FCENet](doc/doc_en/algorithm_det_fcenet_en.md), [DB++](doc/doc_en/algorithm_det_db_en.md) - - Text Recognition: [ViTSTR](doc/doc_en/algorithm_rec_vitstr_en.md), [ABINet](doc/doc_en/algorithm_rec_abinet_en.md), [VisionLAN](doc/doc_en/algorithm_rec_visionlan_en.md), [SPIN](doc/doc_en/algorithm_rec_spin_en.md), [RobustScanner](doc/doc_en/algorithm_rec_robustscanner_en.md) - - Table Recognition: [TableMaster](doc/doc_en/algorithm_table_master_en.md) -- **2022.5.9 Release PaddleOCR [release/2.5](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.5)** - - Release [PP-OCRv3](./doc/doc_en/ppocr_introduction_en.md#pp-ocrv3): With comparable speed, the effect of Chinese scene is further improved by 5% compared with PP-OCRv2, the effect of English scene is improved by 11%, and the average recognition accuracy of 80 language multilingual models is improved by more than 5%. - - Release [PPOCRLabelv2](./PPOCRLabel): Add the annotation function for table recognition task, key information extraction task and irregular text image. - - Release interactive e-book [*"Dive into OCR"*](./doc/doc_en/ocr_book_en.md), covers the cutting-edge theory and code practice of OCR full stack technology. -- [more](./doc/doc_en/update_en.md) +> 上述内容的使用方法建议从文档教程中的快速开始体验 -## 🌟 Features +## ⚡ 快速开始 -PaddleOCR support a variety of cutting-edge algorithms related to OCR, and developed industrial featured models/solution [PP-OCR](./doc/doc_en/ppocr_introduction_en.md) and [PP-Structure](./ppstructure/README.md) on this basis, and get through the whole process of data production, model training, compression, inference and deployment. 
+- 在线网站体验: + - PP-OCRv4 在线体验地址:https://aistudio.baidu.com/aistudio/projectdetail/6611435 + - PP-ChatOCR 在线体验地址:https://aistudio.baidu.com/aistudio/projectdetail/6488689 +- 一行命令快速使用:[快速开始(中英文/多语言/文档分析)](./doc/doc_ch/quickstart.md) +- 飞桨AI套件(PaddleX)中训练、推理、高性能部署全流程体验: + - PP-OCRv4:https://aistudio.baidu.com/aistudio/modelsdetail?modelId=286 + - PP-ChatOCR:https://aistudio.baidu.com/aistudio/modelsdetail?modelId=332 +- 移动端demo体验:[安装包DEMO下载地址](https://ai.baidu.com/easyedge/app/openSource?from=paddlelite)(基于EasyEdge和Paddle-Lite, 支持iOS和Android系统) -
- -
- -> It is recommended to start with the “quick experience” in the document tutorial - - -## ⚡ Quick Experience - -- Web online experience for the ultra-lightweight OCR: [Online Experience](https://www.paddlepaddle.org.cn/hub/scene/ocr) -- Mobile DEMO experience (based on EasyEdge and Paddle-Lite, supports iOS and Android systems): [Sign in to the website to obtain the QR code for installing the App](https://ai.baidu.com/easyedge/app/openSource?from=paddlelite) -- One line of code quick use: [Quick Start](./doc/doc_en/quickstart_en.md) - - - -## 📚 E-book: *Dive Into OCR* -- [Dive Into OCR ](./doc/doc_en/ocr_book_en.md) - - -## 👫 Community + +## 📖 技术交流合作 +- 飞桨AI套件([PaddleX](http://10.136.157.23:8080/paddle/paddleX))提供了飞桨模型训压推一站式全流程高效率开发平台,其使命是助力AI技术快速落地,愿景是使人人成为AI Developer! + - PaddleX 目前覆盖图像分类、目标检测、图像分割、3D、OCR和时序预测等领域方向,已内置了36种基础单模型,例如RP-DETR、PP-YOLOE、PP-HGNet、PP-LCNet、PP-LiteSeg等;集成了12种实用的产业方案,例如PP-OCRv4、PP-ChatOCR、PP-ShiTu、PP-TS、车载路面垃圾检测、野生动物违禁制品识别等。 + - PaddleX 提供了“工具箱”和“开发者”两种AI开发模式。工具箱模式可以无代码调优关键超参,开发者模式可以低代码进行单模型训压推和多模型串联推理,同时支持云端和本地端。 + - PaddleX 还支持联创开发,利润分成!目前 PaddleX 正在快速迭代,欢迎广大的个人开发者和企业开发者参与进来,共创繁荣的 AI 技术生态! -- For international developers, we regard [PaddleOCR Discussions](https://github.com/PaddlePaddle/PaddleOCR/discussions) as our international community platform. All ideas and questions can be discussed here in English. - -- For Chinese develops, Scan the QR code below with your Wechat, you can join the official technical discussion group. For richer community content, please refer to [中文README](README_ch.md), looking forward to your participation. +微信扫描下面二维码添加运营同学,并回复【paddlex】,运营同学会邀请您加入官方交流群,获得更高效的问题答疑。
- + +

飞桨AI套件【PaddleX】技术交流群二维码

- - -## 🛠️ PP-OCR Series Model List(Update on September 8th) - -| Model introduction | Model name | Recommended scene | Detection model | Direction classifier | Recognition model | -| ------------------------------------------------------------ | ---------------------------- | ----------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | -| Chinese and English ultra-lightweight PP-OCRv3 model(16.2M) | ch_PP-OCRv3_xx | Mobile & Server | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar) | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) | -| English ultra-lightweight PP-OCRv3 model(13.4M) | en_PP-OCRv3_xx | Mobile & Server | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_distill_train.tar) | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar) | -| Chinese and English ultra-lightweight PP-OCRv2 model(11.6M) | ch_PP-OCRv2_xx |Mobile & Server|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)| [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar)| -| Chinese and English ultra-lightweight PP-OCR model (9.4M) | ch_ppocr_mobile_v2.0_xx | Mobile & server |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar)|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_train.tar) | -| Chinese and English general PP-OCR model (143.4M) | ch_ppocr_server_v2.0_xx | Server |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [trained 
model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar) |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_train.tar) | - - -- For more model downloads (including multiple languages), please refer to [PP-OCR series model downloads](./doc/doc_en/models_list_en.md). -- For a new language request, please refer to [Guideline for new language_requests](#language_requests). -- For structural document analysis models, please refer to [PP-Structure models](./ppstructure/docs/models_list_en.md). - -## 📖 Tutorials -- [Environment Preparation](./doc/doc_en/environment_en.md) -- [PP-OCR 🔥](./doc/doc_en/ppocr_introduction_en.md) - - [Quick Start](./doc/doc_en/quickstart_en.md) - - [Model Zoo](./doc/doc_en/models_en.md) - - [Model training](./doc/doc_en/training_en.md) - - [Text Detection](./doc/doc_en/detection_en.md) - - [Text Recognition](./doc/doc_en/recognition_en.md) - - [Text Direction Classification](./doc/doc_en/angle_class_en.md) - - Model Compression - - [Model Quantization](./deploy/slim/quantization/README_en.md) - - [Model Pruning](./deploy/slim/prune/README_en.md) - - [Knowledge Distillation](./doc/doc_en/knowledge_distillation_en.md) - - [Inference and Deployment](./deploy/README.md) - - [Python Inference](./doc/doc_en/inference_ppocr_en.md) - - [C++ Inference](./deploy/cpp_infer/readme.md) - - [Serving](./deploy/pdserving/README.md) - - [Mobile](./deploy/lite/readme.md) - - [Paddle2ONNX](./deploy/paddle2onnx/readme.md) - - [PaddleCloud](./deploy/paddlecloud/README.md) - - [Benchmark](./doc/doc_en/benchmark_en.md) -- [PP-Structure 🔥](./ppstructure/README.md) - - [Quick Start](./ppstructure/docs/quickstart_en.md) - - [Model Zoo](./ppstructure/docs/models_list_en.md) - - [Model training](./doc/doc_en/training_en.md) - - [Layout Analysis](./ppstructure/layout/README.md) - - [Table Recognition](./ppstructure/table/README.md) - - [Key Information Extraction](./ppstructure/kie/README.md) - - [Inference and Deployment](./deploy/README.md) - - [Python Inference](./ppstructure/docs/inference_en.md) - - [C++ Inference](./deploy/cpp_infer/readme.md) - - [Serving](./deploy/hubserving/readme_en.md) -- [Academic Algorithms](./doc/doc_en/algorithm_overview_en.md) - - [Text detection](./doc/doc_en/algorithm_overview_en.md) - - [Text recognition](./doc/doc_en/algorithm_overview_en.md) - - [End-to-end OCR](./doc/doc_en/algorithm_overview_en.md) - - [Table Recognition](./doc/doc_en/algorithm_overview_en.md) - - [Key Information Extraction](./doc/doc_en/algorithm_overview_en.md) - - [Add New Algorithms to PaddleOCR](./doc/doc_en/add_new_algorithm_en.md) -- Data Annotation and Synthesis - - [Semi-automatic Annotation Tool: PPOCRLabel](./PPOCRLabel/README.md) - - [Data Synthesis Tool: Style-Text](./StyleText/README.md) - - [Other Data Annotation Tools](./doc/doc_en/data_annotation_en.md) - - [Other Data Synthesis Tools](./doc/doc_en/data_synthesis_en.md) -- Datasets - - [General OCR Datasets(Chinese/English)](doc/doc_en/dataset/datasets_en.md) - - [HandWritten_OCR_Datasets(Chinese)](doc/doc_en/dataset/handwritten_datasets_en.md) - - [Various OCR Datasets(multilingual)](doc/doc_en/dataset/vertical_and_multilingual_datasets_en.md) - - 
[Layout Analysis](doc/doc_en/dataset/layout_datasets_en.md) - - [Table Recognition](doc/doc_en/dataset/table_datasets_en.md) - - [Key Information Extraction](doc/doc_en/dataset/kie_datasets_en.md) -- [Code Structure](./doc/doc_en/tree_en.md) -- [Visualization](#Visualization) -- [Community](#Community) -- [New language requests](#language_requests) -- [FAQ](./doc/doc_en/FAQ_en.md) -- [References](./doc/doc_en/reference_en.md) -- [License](#LICENSE) - - - -## 👀 Visualization [more](./doc/doc_en/visualization_en.md) + +## 📚《动手学OCR》电子书 +- [《动手学OCR》电子书](./doc/doc_ch/ocr_book.md) + + +## 🚀 开源共建 +- **👫 加入社区**:感谢大家长久以来对 PaddleOCR 的支持和关注,与广大开发者共同构建一个专业、和谐、相互帮助的开源社区是 PaddleOCR 的目标。我们非常欢迎各位开发者参与到飞桨社区的开源建设中,加入开源、共建飞桨。**为感谢社区开发者在 PaddleOCR release2.7 中做出的代码贡献,我们将为贡献者制作与邮寄[开源贡献证书](https://github.com/PaddlePaddle/community/blob/master/contributors/certificate-inspection.md),烦请[填写问卷](https://paddle.wjx.cn/vm/wFNr6w7.aspx)提供必要的邮寄信息。** +- **🤩 社区活动**:飞桨开源社区长期运营与发布各类丰富的活动与开发任务,在 PaddleOCR 社区,你可以关注以下社区活动,并选择自己感兴趣的内容参与开源共建: + - **🎁 飞桨套件快乐开源常规赛 | [传送门](https://github.com/PaddlePaddle/PaddleOCR/issues/10223)**:OCR 社区常规赛升级版,以建设更好用的 OCR 套件为目标,包括但不限于学术前沿模型训练与推理、打磨优化 OCR 工具与应用项目开发等,任何有利于社区意见流动和问题解决的行为都热切希望大家的参与。让我们共同成长为飞桨套件的重要 Contributor 🎉🎉🎉。 + - **💡 新需求征集 | [传送门](https://github.com/PaddlePaddle/PaddleOCR/issues/10334)**:你在日常研究和实践深度学习过程中,有哪些你期望的 feature 亟待实现?请按照格式描述你想实现的 feature 和你提出的初步实现思路,我们会定期沟通与讨论这些需求,并将其纳入未来的版本规划中。 + - **💬 PP-SIG 技术研讨会 | [传送门](https://github.com/PaddlePaddle/community/tree/master/ppsigs)**:PP-SIG 是飞桨社区开发者由于相同的兴趣汇聚在一起形成的虚拟组织,通过定期召开技术研讨会的方式,分享行业前沿动态、探讨社区需求与技术开发细节、发起社区联合贡献任务。PaddleOCR 希望可以通过 AI 的力量助力任何一位有梦想的开发者实现自己的想法,享受创造价值带来的愉悦。 +- **📑 项目合作**:如果你有企业中明确的 OCR 垂类应用需求,我们推荐你使用训压推一站式全流程高效率开发平台 PaddleX,助力 AI 技术快速落地。PaddleX 还支持联创开发,利润分成!欢迎广大的个人开发者和企业开发者参与进来,共创繁荣的 AI 技术生态! 
+ + + +## 🛠️ PP-OCR系列模型列表(更新中) + +| 模型简介 | 模型名称 | 推荐场景 | 检测模型 | 方向分类器 | 识别模型 | +| ------------------------------------- | ----------------------- | --------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | +| 中英文超轻量PP-OCRv4模型(15.8M) | ch_PP-OCRv4_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_det_distill_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_rec_train.tar) | +| 中英文超轻量PP-OCRv3模型(16.2M) | ch_PP-OCRv3_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) | +| 英文超轻量PP-OCRv3模型(13.4M) | en_PP-OCRv3_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_distill_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar) | + +- 超轻量OCR系列更多模型下载(包括多语言),可以参考[PP-OCR系列模型下载](./doc/doc_ch/models_list.md),文档分析相关模型参考[PP-Structure系列模型下载](./ppstructure/docs/models_list.md) + +### PaddleOCR场景应用模型 + +| 行业 | 类别 | 亮点 | 文档说明 | 模型下载 | +| ---- | ------------ | ---------------------------------- | ------------------------------------------------------------ | --------------------------------------------- | +| 制造 | 数码管识别 | 数码管数据合成、漏识别调优 | [光功率计数码管字符识别](./applications/光功率计数码管字符识别/光功率计数码管字符识别.md) | [下载链接](./applications/README.md#模型下载) | +| 金融 | 通用表单识别 | 多模态通用表单结构化提取 | [多模态表单识别](./applications/多模态表单识别.md) | [下载链接](./applications/README.md#模型下载) | +| 交通 | 车牌识别 | 多角度图像处理、轻量模型、端侧部署 | [轻量级车牌识别](./applications/轻量级车牌识别.md) | [下载链接](./applications/README.md#模型下载) | + +- 更多制造、金融、交通行业的主要OCR垂类应用模型(如电表、液晶屏、高精度SVTR模型等),可参考[场景应用模型下载](./applications) + + + +## 📖 文档教程 + +- [运行环境准备](./doc/doc_ch/environment.md) +- [PP-OCR文本检测识别🔥](./doc/doc_ch/ppocr_introduction.md) + - [快速开始](./doc/doc_ch/quickstart.md) + - [模型库](./doc/doc_ch/models_list.md) + - [模型训练](./doc/doc_ch/training.md) + - [文本检测](./doc/doc_ch/detection.md) + - [文本识别](./doc/doc_ch/recognition.md) + - [文本方向分类器](./doc/doc_ch/angle_class.md) + - 模型压缩 + - [模型量化](./deploy/slim/quantization/README.md) + - [模型裁剪](./deploy/slim/prune/README.md) + - [知识蒸馏](./doc/doc_ch/knowledge_distillation.md) + - [推理部署](./deploy/README_ch.md) + - [基于Python预测引擎推理](./doc/doc_ch/inference_ppocr.md) + - 
[基于C++预测引擎推理](./deploy/cpp_infer/readme_ch.md) + - [服务化部署](./deploy/pdserving/README_CN.md) + - [端侧部署](./deploy/lite/readme.md) + - [Paddle2ONNX模型转化与预测](./deploy/paddle2onnx/readme.md) + - [云上飞桨部署工具](./deploy/paddlecloud/README.md) + - [Benchmark](./doc/doc_ch/benchmark.md) +- [PP-Structure文档分析🔥](./ppstructure/README_ch.md) + - [快速开始](./ppstructure/docs/quickstart.md) + - [模型库](./ppstructure/docs/models_list.md) + - [模型训练](./doc/doc_ch/training.md) + - [版面分析](./ppstructure/layout/README_ch.md) + - [表格识别](./ppstructure/table/README_ch.md) + - [关键信息提取](./ppstructure/kie/README_ch.md) + - [推理部署](./deploy/README_ch.md) + - [基于Python预测引擎推理](./ppstructure/docs/inference.md) + - [基于C++预测引擎推理](./deploy/cpp_infer/readme_ch.md) + - [服务化部署](./deploy/hubserving/readme.md) +- [前沿算法与模型🚀](./doc/doc_ch/algorithm_overview.md) + - [文本检测算法](./doc/doc_ch/algorithm_overview.md) + - [文本识别算法](./doc/doc_ch/algorithm_overview.md) + - [端到端OCR算法](./doc/doc_ch/algorithm_overview.md) + - [表格识别算法](./doc/doc_ch/algorithm_overview.md) + - [关键信息抽取算法](./doc/doc_ch/algorithm_overview.md) + - [使用PaddleOCR架构添加新算法](./doc/doc_ch/add_new_algorithm.md) +- [场景应用](./applications) +- 数据标注与合成 + - [半自动标注工具PPOCRLabel](./PPOCRLabel/README_ch.md) + - [数据合成工具Style-Text](./StyleText/README_ch.md) + - [其它数据标注工具](./doc/doc_ch/data_annotation.md) + - [其它数据合成工具](./doc/doc_ch/data_synthesis.md) +- 数据集 + - [通用中英文OCR数据集](doc/doc_ch/dataset/datasets.md) + - [手写中文OCR数据集](doc/doc_ch/dataset/handwritten_datasets.md) + - [垂类多语言OCR数据集](doc/doc_ch/dataset/vertical_and_multilingual_datasets.md) + - [版面分析数据集](doc/doc_ch/dataset/layout_datasets.md) + - [表格识别数据集](doc/doc_ch/dataset/table_datasets.md) + - [关键信息提取数据集](doc/doc_ch/dataset/kie_datasets.md) +- [代码组织结构](./doc/doc_ch/tree.md) +- [效果展示](#效果展示) +- [《动手学OCR》电子书📚](./doc/doc_ch/ocr_book.md) +- [开源社区](#开源社区) +- FAQ + - [通用问题](./doc/doc_ch/FAQ.md) + - [PaddleOCR实战问题](./doc/doc_ch/FAQ.md) +- [参考文献](./doc/doc_ch/reference.md) +- [许可证书](#许可证书) + + + + +## 👀 效果展示 [more](./doc/doc_ch/visualization.md)
-PP-OCRv3 Chinese model +PP-OCRv3 中文模型 +
+
+
-PP-OCRv3 English model +PP-OCRv3 英文模型 +
+
+
-PP-OCRv3 Multilingual model +PP-OCRv3 多语言模型 +
+
-PP-StructureV2 +PP-Structure 文档分析 -- layout analysis + table recognition +- 版面分析+表格识别
-- SER (Semantic entity recognition) -
- -
- +- SER(语义实体识别)
@@ -210,11 +235,11 @@ PaddleOCR support a variety of cutting-edge algorithms related to OCR, and devel -- RE (Relation Extraction)
- -
+ + +- RE(关系提取)
@@ -223,21 +248,13 @@ PaddleOCR support a variety of cutting-edge algorithms related to OCR, and devel -
- - -## 🇺🇳 Guideline for New Language Requests - -If you want to request a new language support, a PR with 1 following files are needed: - -1. In folder [ppocr/utils/dict](./ppocr/utils/dict), -it is necessary to submit the dict text to this path and name it with `{language}_dict.txt` that contains a list of all characters. Please see the format example from other files in that folder. - -If your language has unique elements, please tell me in advance within any way, such as useful links, wikipedia and so on. +
+ +
-More details, please refer to [Multilingual OCR Development Plan](https://github.com/PaddlePaddle/PaddleOCR/issues/1048). + + - -## 📄 License -This project is released under Apache 2.0 license +## 许可证书 +本项目的发布受Apache 2.0 license许可认证。 diff --git a/README_ch.md b/README_ch.md deleted file mode 100755 index 5fec27bd66d35596126e84f81d6019ce31217f2e..0000000000000000000000000000000000000000 --- a/README_ch.md +++ /dev/null @@ -1,261 +0,0 @@ -[English](README.md) | 简体中文 | [हिन्दी](./doc/doc_i18n/README_हिन्द.md) | [日本語](./doc/doc_i18n/README_日本語.md) | [한국인](./doc/doc_i18n/README_한국어.md) | [Pу́сский язы́к](./doc/doc_i18n/README_Ру́сский_язы́к.md) - -

- -

-

- - - - - - - -

- -## 简介 - -PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力开发者训练出更好的模型,并应用落地。 - -
- -
- -
- -
- -## 📣 近期更新 - -- **💥 直播预告:10.24-10.26日每晚8点半**,PaddleOCR研发团队详解PP-StructureV2优化策略。微信扫描下方二维码,关注公众号并填写问卷后进入官方交流群,获取直播链接与20G重磅OCR学习大礼包(内含PDF转Word应用程序、10种垂类模型、《动手学OCR》电子书等) - -
- -
- -- **🔥2022.8.24 发布 PaddleOCR [release/2.6](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.6)** - - 发布[PP-StructureV2](./ppstructure/README_ch.md),系统功能性能全面升级,适配中文场景,新增支持[版面复原](./ppstructure/recovery/README_ch.md),支持**一行命令完成PDF转Word**; - - [版面分析](./ppstructure/layout/README_ch.md)模型优化:模型存储减少95%,速度提升11倍,平均CPU耗时仅需41ms; - - [表格识别](./ppstructure/table/README_ch.md)模型优化:设计3大优化策略,预测耗时不变情况下,模型精度提升6%; - - [关键信息抽取](./ppstructure/kie/README_ch.md)模型优化:设计视觉无关模型结构,语义实体识别精度提升2.8%,关系抽取精度提升9.1%。 - -- **🔥2022.8 发布 [OCR场景应用集合](./applications)** - - - 包含数码管、液晶屏、车牌、高精度SVTR模型、手写体识别等**9个垂类模型**,覆盖通用,制造、金融、交通行业的主要OCR垂类应用。 - - -- **2022.8 新增实现[8种前沿算法](doc/doc_ch/algorithm_overview.md)** - - 文本检测:[FCENet](doc/doc_ch/algorithm_det_fcenet.md), [DB++](doc/doc_ch/algorithm_det_db.md) - - 文本识别:[ViTSTR](doc/doc_ch/algorithm_rec_vitstr.md), [ABINet](doc/doc_ch/algorithm_rec_abinet.md), [VisionLAN](doc/doc_ch/algorithm_rec_visionlan.md), [SPIN](doc/doc_ch/algorithm_rec_spin.md), [RobustScanner](doc/doc_ch/algorithm_rec_robustscanner.md) - - 表格识别:[TableMaster](doc/doc_ch/algorithm_table_master.md) - - -- **2022.5.9 发布 PaddleOCR [release/2.5](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.5)** - - 发布[PP-OCRv3](./doc/doc_ch/ppocr_introduction.md#pp-ocrv3),速度可比情况下,中文场景效果相比于PP-OCRv2再提升5%,英文场景提升11%,80语种多语言模型平均识别准确率提升5%以上; - - 发布半自动标注工具[PPOCRLabelv2](./PPOCRLabel):新增表格文字图像、图像关键信息抽取任务和不规则文字图像的标注功能; - - 发布OCR产业落地工具集:打通22种训练部署软硬件环境与方式,覆盖企业90%的训练部署环境需求; - - 发布交互式OCR开源电子书[《动手学OCR》](./doc/doc_ch/ocr_book.md),覆盖OCR全栈技术的前沿理论与代码实践,并配套教学视频。 - -> [更多](./doc/doc_ch/update.md) - -## 🌟 特性 - -支持多种OCR相关前沿算法,在此基础上打造产业级特色模型[PP-OCR](./doc/doc_ch/ppocr_introduction.md)和[PP-Structure](./ppstructure/README_ch.md),并打通数据生产、模型训练、压缩、预测部署全流程。 - -
- -
- -> 上述内容的使用方法建议从文档教程中的快速开始体验 - - -## ⚡ 快速开始 - -- 在线网站体验:超轻量PP-OCR mobile模型体验地址:https://www.paddlepaddle.org.cn/hub/scene/ocr -- 移动端demo体验:[安装包DEMO下载地址](https://ai.baidu.com/easyedge/app/openSource?from=paddlelite)(基于EasyEdge和Paddle-Lite, 支持iOS和Android系统) -- 一行命令快速使用:[快速开始(中英文/多语言/文档分析)](./doc/doc_ch/quickstart.md) - - -## 📚《动手学OCR》电子书 -- [《动手学OCR》电子书](./doc/doc_ch/ocr_book.md) - - - -## 👫 开源社区 -- **📑项目合作:** 如果您是企业开发者且有明确的OCR垂类应用需求,填写[问卷](https://paddle.wjx.cn/vj/QwF7GKw.aspx)后可免费与官方团队展开不同层次的合作。 -- **👫加入社区:** 微信扫描二维码并填写问卷之后,加入交流群领取20G重磅OCR学习大礼包 - - **包括《动手学OCR》电子书** ,配套讲解视频和notebook项目;PaddleOCR历次发版直播课视频; - - **OCR场景应用模型集合:** 包含数码管、液晶屏、车牌、高精度SVTR模型、手写体识别等垂类模型,覆盖通用,制造、金融、交通行业的主要OCR垂类应用。 - - PDF2Word应用程序;OCR社区优秀开发者项目分享视频。 -- **🏅️社区项目**:[社区项目](./doc/doc_ch/thirdparty.md)文档中包含了社区用户**使用PaddleOCR开发的各种工具、应用**以及**为PaddleOCR贡献的功能、优化的文档与代码**等,是官方为社区开发者打造的荣誉墙,也是帮助优质项目宣传的广播站。 -- **🎁社区常规赛**:社区常规赛是面向OCR开发者的积分赛事,覆盖文档、代码、模型和应用四大类型,以季度为单位评选并发放奖励,赛题详情与报名方法可参考[链接](https://github.com/PaddlePaddle/PaddleOCR/issues/4982)。 - -
- -
- - - - -## 🛠️ PP-OCR系列模型列表(更新中) - -| 模型简介 | 模型名称 | 推荐场景 | 检测模型 | 方向分类器 | 识别模型 | -| ------------------------------------- | ----------------------- | --------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | -| 中英文超轻量PP-OCRv3模型(16.2M) | ch_PP-OCRv3_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) | -| 英文超轻量PP-OCRv3模型(13.4M) | en_PP-OCRv3_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_distill_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar) | - -- 超轻量OCR系列更多模型下载(包括多语言),可以参考[PP-OCR系列模型下载](./doc/doc_ch/models_list.md),文档分析相关模型参考[PP-Structure系列模型下载](./ppstructure/docs/models_list.md) - -### PaddleOCR场景应用模型 - -| 行业 | 类别 | 亮点 | 文档说明 | 模型下载 | -| ---- | ------------ | ---------------------------------- | ------------------------------------------------------------ | --------------------------------------------- | -| 制造 | 数码管识别 | 数码管数据合成、漏识别调优 | [光功率计数码管字符识别](./applications/光功率计数码管字符识别/光功率计数码管字符识别.md) | [下载链接](./applications/README.md#模型下载) | -| 金融 | 通用表单识别 | 多模态通用表单结构化提取 | [多模态表单识别](./applications/多模态表单识别.md) | [下载链接](./applications/README.md#模型下载) | -| 交通 | 车牌识别 | 多角度图像处理、轻量模型、端侧部署 | [轻量级车牌识别](./applications/轻量级车牌识别.md) | [下载链接](./applications/README.md#模型下载) | - -- 更多制造、金融、交通行业的主要OCR垂类应用模型(如电表、液晶屏、高精度SVTR模型等),可参考[场景应用模型下载](./applications) - - - -## 📖 文档教程 - -- [运行环境准备](./doc/doc_ch/environment.md) -- [PP-OCR文本检测识别🔥](./doc/doc_ch/ppocr_introduction.md) - - [快速开始](./doc/doc_ch/quickstart.md) - - [模型库](./doc/doc_ch/models_list.md) - - [模型训练](./doc/doc_ch/training.md) - - [文本检测](./doc/doc_ch/detection.md) - - [文本识别](./doc/doc_ch/recognition.md) - - [文本方向分类器](./doc/doc_ch/angle_class.md) - - 模型压缩 - - [模型量化](./deploy/slim/quantization/README.md) - - [模型裁剪](./deploy/slim/prune/README.md) - - [知识蒸馏](./doc/doc_ch/knowledge_distillation.md) - - [推理部署](./deploy/README_ch.md) - - [基于Python预测引擎推理](./doc/doc_ch/inference_ppocr.md) - - [基于C++预测引擎推理](./deploy/cpp_infer/readme_ch.md) - - [服务化部署](./deploy/pdserving/README_CN.md) - - [端侧部署](./deploy/lite/readme.md) - - [Paddle2ONNX模型转化与预测](./deploy/paddle2onnx/readme.md) - - [云上飞桨部署工具](./deploy/paddlecloud/README.md) - - [Benchmark](./doc/doc_ch/benchmark.md) -- [PP-Structure文档分析🔥](./ppstructure/README_ch.md) - - [快速开始](./ppstructure/docs/quickstart.md) - - [模型库](./ppstructure/docs/models_list.md) - - [模型训练](./doc/doc_ch/training.md) - - [版面分析](./ppstructure/layout/README_ch.md) - - [表格识别](./ppstructure/table/README_ch.md) - - [关键信息提取](./ppstructure/kie/README_ch.md) - - 
[推理部署](./deploy/README_ch.md) - - [基于Python预测引擎推理](./ppstructure/docs/inference.md) - - [基于C++预测引擎推理](./deploy/cpp_infer/readme_ch.md) - - [服务化部署](./deploy/hubserving/readme.md) -- [前沿算法与模型🚀](./doc/doc_ch/algorithm_overview.md) - - [文本检测算法](./doc/doc_ch/algorithm_overview.md) - - [文本识别算法](./doc/doc_ch/algorithm_overview.md) - - [端到端OCR算法](./doc/doc_ch/algorithm_overview.md) - - [表格识别算法](./doc/doc_ch/algorithm_overview.md) - - [关键信息抽取算法](./doc/doc_ch/algorithm_overview.md) - - [使用PaddleOCR架构添加新算法](./doc/doc_ch/add_new_algorithm.md) -- [场景应用](./applications) -- 数据标注与合成 - - [半自动标注工具PPOCRLabel](./PPOCRLabel/README_ch.md) - - [数据合成工具Style-Text](./StyleText/README_ch.md) - - [其它数据标注工具](./doc/doc_ch/data_annotation.md) - - [其它数据合成工具](./doc/doc_ch/data_synthesis.md) -- 数据集 - - [通用中英文OCR数据集](doc/doc_ch/dataset/datasets.md) - - [手写中文OCR数据集](doc/doc_ch/dataset/handwritten_datasets.md) - - [垂类多语言OCR数据集](doc/doc_ch/dataset/vertical_and_multilingual_datasets.md) - - [版面分析数据集](doc/doc_ch/dataset/layout_datasets.md) - - [表格识别数据集](doc/doc_ch/dataset/table_datasets.md) - - [关键信息提取数据集](doc/doc_ch/dataset/kie_datasets.md) -- [代码组织结构](./doc/doc_ch/tree.md) -- [效果展示](#效果展示) -- [《动手学OCR》电子书📚](./doc/doc_ch/ocr_book.md) -- [开源社区](#开源社区) -- FAQ - - [通用问题](./doc/doc_ch/FAQ.md) - - [PaddleOCR实战问题](./doc/doc_ch/FAQ.md) -- [参考文献](./doc/doc_ch/reference.md) -- [许可证书](#许可证书) - - - - -## 👀 效果展示 [more](./doc/doc_ch/visualization.md) - -
-PP-OCRv3 中文模型 - -
- - - -
- -
- - -
-PP-OCRv3 英文模型 - -
- - -
- -
- - -
-PP-OCRv3 多语言模型 - -
- - -
- -
- -
-PP-Structure 文档分析 - -- 版面分析+表格识别 -
- -
- -- SER(语义实体识别) -
- -
- -
- -
- -
- -
- -- RE(关系提取) -
- -
- -
- -
- -
- -
- -
- - - -## 许可证书 -本项目的发布受Apache 2.0 license许可认证。 diff --git a/README_en.md b/README_en.md new file mode 100644 index 0000000000000000000000000000000000000000..fa2789e8895a521463568d824f42d952ad5eb031 --- /dev/null +++ b/README_en.md @@ -0,0 +1,265 @@ +English | [简体中文](README_ch.md) | [हिन्दी](./doc/doc_i18n/README_हिन्द.md) | [日本語](./doc/doc_i18n/README_日本語.md) | [한국인](./doc/doc_i18n/README_한국어.md) | [Pу́сский язы́к](./doc/doc_i18n/README_Ру́сский_язы́к.md) + +

+ +

+

+ + + + + + + +

+ +## Introduction + +PaddleOCR aims to create multilingual, awesome, leading, and practical OCR tools that help users train better models and apply them into practice. + +
+ +
+ +
+ +
+ +## 📣 Recent updates +- **🔥2023.8.7 Release PaddleOCR[release/2.7](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.7)** + - Release [PP-OCRv4](./doc/doc_ch/PP-OCRv4_introduction.md), support mobile version and server version + - PP-OCRv4-mobile:When the speed is comparable, the effect of the Chinese scene is improved by 4.5% compared with PP-OCRv3, the English scene is improved by 10%, and the average recognition accuracy of the 80-language multilingual model is increased by more than 8%. + - PP-OCRv4-server:Release the OCR model with the highest accuracy at present, the detection model accuracy increased by 4.9% in the Chinese and English scenes, and the recognition model accuracy increased by 2% + refer [quickstart](./doc/doc_en/quickstart_en.md) quick use by one line command, At the same time, the whole process of model training, reasoning, and high-performance deployment can also be completed with few code in the [General OCR Industry Solution](https://aistudio.baidu.com/aistudio/modelsdetail?modelId=286) in PaddleX. + - Release[PP-ChatOCR](https://aistudio.baidu.com/aistudio/modelsdetail?modelId=332), a new scheme for extracting key information of general scenes using PP-OCR model and ERNIE LLM. +- 🔨**2022.11 Add implementation of [4 cutting-edge algorithms](doc/doc_ch/algorithm_overview_en.md)**:Text Detection [DRRG](doc/doc_en/algorithm_det_drrg_en.md), Text Recognition [RFL](./doc/doc_en/algorithm_rec_rfl_en.md), Image Super-Resolution [Text Telescope](doc/doc_en/algorithm_sr_telescope_en.md),Handwritten Mathematical Expression Recognition [CAN](doc/doc_en/algorithm_rec_can_en.md) +- **2022.10 release [optimized JS version PP-OCRv3 model](./deploy/paddlejs/README.md)** with 4.3M model size, 8x faster inference time, and a ready-to-use web demo +- 💥 **Live Playback: Introduction to PP-StructureV2 optimization strategy**. Scan [the QR code below](#Community) using WeChat, follow the PaddlePaddle official account and fill out the questionnaire to join the WeChat group, get the live link and 20G OCR learning materials (including PDF2Word application, 10 models in vertical scenarios, etc.) + + +- **🔥2022.8.24 Release PaddleOCR [release/2.6](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.6)** + - Release [PP-StructureV2](./ppstructure/),with functions and performance fully upgraded, adapted to Chinese scenes, and new support for [Layout Recovery](./ppstructure/recovery) and **one line command to convert PDF to Word**; + - [Layout Analysis](./ppstructure/layout) optimization: model storage reduced by 95%, while speed increased by 11 times, and the average CPU time-cost is only 41ms; + - [Table Recognition](./ppstructure/table) optimization: 3 optimization strategies are designed, and the model accuracy is improved by 6% under comparable time consumption; + - [Key Information Extraction](./ppstructure/kie) optimization:a visual-independent model structure is designed, the accuracy of semantic entity recognition is increased by 2.8%, and the accuracy of relation extraction is increased by 9.1%. +- **🔥2022.8 Release [OCR scene application collection](./applications/README_en.md)** + - Release **9 vertical models** such as digital tube, LCD screen, license plate, handwriting recognition model, high-precision SVTR model, etc, covering the main OCR vertical applications in general, manufacturing, finance, and transportation industries. 
+- **2022.8 Add implementation of [8 cutting-edge algorithms](doc/doc_en/algorithm_overview_en.md)** + - Text Detection: [FCENet](doc/doc_en/algorithm_det_fcenet_en.md), [DB++](doc/doc_en/algorithm_det_db_en.md) + - Text Recognition: [ViTSTR](doc/doc_en/algorithm_rec_vitstr_en.md), [ABINet](doc/doc_en/algorithm_rec_abinet_en.md), [VisionLAN](doc/doc_en/algorithm_rec_visionlan_en.md), [SPIN](doc/doc_en/algorithm_rec_spin_en.md), [RobustScanner](doc/doc_en/algorithm_rec_robustscanner_en.md) + - Table Recognition: [TableMaster](doc/doc_en/algorithm_table_master_en.md) +- **2022.5.9 Release PaddleOCR [release/2.5](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.5)** + - Release [PP-OCRv3](./doc/doc_en/ppocr_introduction_en.md#pp-ocrv3): With comparable speed, the effect of Chinese scene is further improved by 5% compared with PP-OCRv2, the effect of English scene is improved by 11%, and the average recognition accuracy of 80 language multilingual models is improved by more than 5%. + - Release [PPOCRLabelv2](./PPOCRLabel): Add the annotation function for table recognition task, key information extraction task and irregular text image. + - Release interactive e-book [*"Dive into OCR"*](./doc/doc_en/ocr_book_en.md), covers the cutting-edge theory and code practice of OCR full stack technology. +- [more](./doc/doc_en/update_en.md) + + +## 🌟 Features + +PaddleOCR support a variety of cutting-edge algorithms related to OCR, and developed industrial featured models/solution [PP-OCR](./doc/doc_en/ppocr_introduction_en.md)、 [PP-Structure](./ppstructure/README.md) and [PP-ChatOCR](https://aistudio.baidu.com/aistudio/projectdetail/6488689) on this basis, and get through the whole process of data production, model training, compression, inference and deployment. + +
+ +
+
+> It is recommended to start with the "Quick Experience" section of the documentation tutorials.
+
+
+## ⚡ Quick Experience
+
+- Web online experience
+    - PP-OCRv4 online experience: https://aistudio.baidu.com/aistudio/projectdetail/6611435
+    - PP-ChatOCR online experience: https://aistudio.baidu.com/aistudio/projectdetail/6488689
+- One line of code quick use: [Quick Start (Chinese/English/Multilingual/Document Analysis)](./doc/doc_en/quickstart_en.md); a minimal Python sketch is shown after the technical exchange section below
+- Full-process experience of training, inference, and high-performance deployment in the Paddle AI suite (PaddleX):
+    - PP-OCRv4: https://aistudio.baidu.com/aistudio/modelsdetail?modelId=286
+    - PP-ChatOCR: https://aistudio.baidu.com/aistudio/modelsdetail?modelId=332
+- Mobile demo experience: [Installation DEMO](https://ai.baidu.com/easyedge/app/openSource?from=paddlelite) (based on EasyEdge and Paddle-Lite, supporting iOS and Android)
+
+
+
+## 📖 Technical exchange and cooperation
+- [PaddleX](http://10.136.157.23:8080/paddle/paddleX) provides a one-stop, full-process, high-efficiency development platform for training, compression, and inference of models in the PaddlePaddle ecosystem. Its mission is to help AI technology be applied quickly, and its vision is to make everyone an AI developer!
+  - PaddleX currently covers areas such as image classification, object detection, image segmentation, 3D, OCR, and time-series prediction. It has 36 built-in basic single models, such as RP-DETR, PP-YOLOE, PP-HGNet, PP-LCNet, PP-LiteSeg, etc., and integrates 12 practical industrial solutions, such as PP-OCRv4, PP-ChatOCR, PP-ShiTu, PP-TS, vehicle-mounted road waste detection, and identification of prohibited wildlife products.
+  - PaddleX provides two AI development modes: "Toolbox" and "Developer". The toolbox mode tunes key hyperparameters without code, while the developer mode performs single-model training, inference, and multi-model serial inference with low code, and supports both cloud and local terminals.
+  - PaddleX also supports joint innovation, development, and profit sharing! PaddleX is iterating rapidly, and individual and enterprise developers are welcome to join in building a prosperous AI technology ecosystem!
+
+Scan the QR code below on WeChat to add the operations staff; reply [paddlex] and they will invite you to join the official communication group for more efficient Q&A.
+
+
+[PaddleX] technology exchange group QR code
+
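+
+As referenced in the Quick Experience section, the whl package can run OCR in a few lines of Python. The following is a minimal, illustrative sketch (the image path is a placeholder; see the [quickstart](./doc/doc_en/quickstart_en.md) for the authoritative options):
+
+```python
+from paddleocr import PaddleOCR
+
+# Initialize PP-OCR; models are downloaded automatically on first use.
+# use_angle_cls enables the text direction classifier.
+ocr = PaddleOCR(use_angle_cls=True, lang='en')
+
+# Run detection + classification + recognition on a local image (path is illustrative).
+result = ocr.ocr('./doc/imgs_en/img_12.jpg', cls=True)
+for box, (text, score) in result[0]:
+    print(text, score)
+```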
+
+
+## 📚 E-book: *Dive Into OCR*
+- [Dive Into OCR](./doc/doc_en/ocr_book_en.md)
+
+
+
+## 👫 Community
+
+- For international developers, we regard [PaddleOCR Discussions](https://github.com/PaddlePaddle/PaddleOCR/discussions) as our international community platform. All ideas and questions can be discussed here in English.
+
+- For Chinese developers, scan the QR code below with WeChat to join the official technical discussion group. For richer community content, please refer to the [中文README](README_ch.md); we look forward to your participation.
+
+ +
+ + + +## 🛠️ PP-OCR Series Model List(Update on September 8th) + +| Model introduction | Model name | Recommended scene | Detection model | Direction classifier | Recognition model | +| ------------------------------------------------------------ | ---------------------------- | ----------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | +| Chinese and English ultra-lightweight PP-OCRv4 model(16.2M) | ch_PP-OCRv4_xx | Mobile & Server | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_det_distill_train.tar) | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_rec_train.tar) | +| Chinese and English ultra-lightweight PP-OCRv3 model(16.2M) | ch_PP-OCRv3_xx | Mobile & Server | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar) | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) | +| English ultra-lightweight PP-OCRv3 model(13.4M) | en_PP-OCRv3_xx | Mobile & Server | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_distill_train.tar) | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar) | + +- For more model downloads (including multiple languages), please refer to [PP-OCR series model downloads](./doc/doc_en/models_list_en.md). +- For a new language request, please refer to [Guideline for new language_requests](#language_requests). +- For structural document analysis models, please refer to [PP-Structure models](./ppstructure/docs/models_list_en.md). 
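+
+The inference models in the table above can also be passed to the whl package explicitly instead of being downloaded automatically. A rough sketch (the local directory names are assumptions and should match wherever the downloaded tars were extracted; the image path is illustrative):
+
+```python
+from paddleocr import PaddleOCR
+
+# Point the whl package at locally extracted PP-OCRv4 detection/recognition models
+# and the direction classifier from the table above.
+ocr = PaddleOCR(det_model_dir='./ch_PP-OCRv4_det_infer/',
+                rec_model_dir='./ch_PP-OCRv4_rec_infer/',
+                cls_model_dir='./ch_ppocr_mobile_v2.0_cls_infer/',
+                use_angle_cls=True)
+
+result = ocr.ocr('./doc/imgs/11.jpg', cls=True)
+for box, (text, score) in result[0]:
+    print(text, score)
+```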
+ + +## 📖 Tutorials +- [Environment Preparation](./doc/doc_en/environment_en.md) +- [PP-OCR 🔥](./doc/doc_en/ppocr_introduction_en.md) + - [Quick Start](./doc/doc_en/quickstart_en.md) + - [Model Zoo](./doc/doc_en/models_en.md) + - [Model training](./doc/doc_en/training_en.md) + - [Text Detection](./doc/doc_en/detection_en.md) + - [Text Recognition](./doc/doc_en/recognition_en.md) + - [Text Direction Classification](./doc/doc_en/angle_class_en.md) + - Model Compression + - [Model Quantization](./deploy/slim/quantization/README_en.md) + - [Model Pruning](./deploy/slim/prune/README_en.md) + - [Knowledge Distillation](./doc/doc_en/knowledge_distillation_en.md) + - [Inference and Deployment](./deploy/README.md) + - [Python Inference](./doc/doc_en/inference_ppocr_en.md) + - [C++ Inference](./deploy/cpp_infer/readme.md) + - [Serving](./deploy/pdserving/README.md) + - [Mobile](./deploy/lite/readme.md) + - [Paddle2ONNX](./deploy/paddle2onnx/readme.md) + - [PaddleCloud](./deploy/paddlecloud/README.md) + - [Benchmark](./doc/doc_en/benchmark_en.md) +- [PP-Structure 🔥](./ppstructure/README.md) + - [Quick Start](./ppstructure/docs/quickstart_en.md) + - [Model Zoo](./ppstructure/docs/models_list_en.md) + - [Model training](./doc/doc_en/training_en.md) + - [Layout Analysis](./ppstructure/layout/README.md) + - [Table Recognition](./ppstructure/table/README.md) + - [Key Information Extraction](./ppstructure/kie/README.md) + - [Inference and Deployment](./deploy/README.md) + - [Python Inference](./ppstructure/docs/inference_en.md) + - [C++ Inference](./deploy/cpp_infer/readme.md) + - [Serving](./deploy/hubserving/readme_en.md) +- [Academic Algorithms](./doc/doc_en/algorithm_overview_en.md) + - [Text detection](./doc/doc_en/algorithm_overview_en.md) + - [Text recognition](./doc/doc_en/algorithm_overview_en.md) + - [End-to-end OCR](./doc/doc_en/algorithm_overview_en.md) + - [Table Recognition](./doc/doc_en/algorithm_overview_en.md) + - [Key Information Extraction](./doc/doc_en/algorithm_overview_en.md) + - [Add New Algorithms to PaddleOCR](./doc/doc_en/add_new_algorithm_en.md) +- Data Annotation and Synthesis + - [Semi-automatic Annotation Tool: PPOCRLabel](./PPOCRLabel/README.md) + - [Data Synthesis Tool: Style-Text](./StyleText/README.md) + - [Other Data Annotation Tools](./doc/doc_en/data_annotation_en.md) + - [Other Data Synthesis Tools](./doc/doc_en/data_synthesis_en.md) +- Datasets + - [General OCR Datasets(Chinese/English)](doc/doc_en/dataset/datasets_en.md) + - [HandWritten_OCR_Datasets(Chinese)](doc/doc_en/dataset/handwritten_datasets_en.md) + - [Various OCR Datasets(multilingual)](doc/doc_en/dataset/vertical_and_multilingual_datasets_en.md) + - [Layout Analysis](doc/doc_en/dataset/layout_datasets_en.md) + - [Table Recognition](doc/doc_en/dataset/table_datasets_en.md) + - [Key Information Extraction](doc/doc_en/dataset/kie_datasets_en.md) +- [Code Structure](./doc/doc_en/tree_en.md) +- [Visualization](#Visualization) +- [Community](#Community) +- [New language requests](#language_requests) +- [FAQ](./doc/doc_en/FAQ_en.md) +- [References](./doc/doc_en/reference_en.md) +- [License](#LICENSE) + + + +## 👀 Visualization [more](./doc/doc_en/visualization_en.md) + +
+- PP-OCRv3 Chinese model
+- PP-OCRv3 English model
+- PP-OCRv3 Multilingual model
+- PP-StructureV2
+  - layout analysis + table recognition
+  - SER (Semantic entity recognition)
+  - RE (Relation Extraction)
+
+ + +## 🇺🇳 Guideline for New Language Requests + +If you want to request a new language support, a PR with 1 following files are needed: + +1. In folder [ppocr/utils/dict](./ppocr/utils/dict), +it is necessary to submit the dict text to this path and name it with `{language}_dict.txt` that contains a list of all characters. Please see the format example from other files in that folder. + +If your language has unique elements, please tell me in advance within any way, such as useful links, wikipedia and so on. + +More details, please refer to [Multilingual OCR Development Plan](https://github.com/PaddlePaddle/PaddleOCR/issues/1048). + + + +## 📄 License +This project is released under Apache 2.0 license diff --git a/StyleText/engine/text_drawers.py b/StyleText/engine/text_drawers.py index 20375c13613f40c298ec83ff8fddf0e8fb73a9b0..6ccc42377a5e3dd2aa5f546182338bcd7b23dab2 100644 --- a/StyleText/engine/text_drawers.py +++ b/StyleText/engine/text_drawers.py @@ -23,7 +23,8 @@ class StdTextDrawer(object): def get_valid_height(self, font_path): font = ImageFont.truetype(font_path, self.height - 4) - _, font_height = font.getsize(self.char_list) + left, top, right, bottom = font.getbbox(self.char_list) + _, font_height = right - left, bottom - top if font_height <= self.height - 4: return self.height - 4 else: diff --git "a/applications/PCB\345\255\227\347\254\246\350\257\206\345\210\253/gen_data/gen.py" "b/applications/PCB\345\255\227\347\254\246\350\257\206\345\210\253/gen_data/gen.py" index 4c768067f998b6b4bbe0b2f5982f46a3f01fc872..0eb00cd1efc218ff72de7e1a7747ebd1154caff0 100644 --- "a/applications/PCB\345\255\227\347\254\246\350\257\206\345\210\253/gen_data/gen.py" +++ "b/applications/PCB\345\255\227\347\254\246\350\257\206\345\210\253/gen_data/gen.py" @@ -17,6 +17,7 @@ https://github.com/zcswdt/Color_OCR_image_generator """ import os import random +import PIL from PIL import Image, ImageDraw, ImageFont import json import argparse @@ -55,7 +56,11 @@ def get_horizontal_text_picture(image_file, chars, fonts_list, cf): ch_w = [] ch_h = [] for ch in chars: - wt, ht = font.getsize(ch) + if int(PIL.__version__.split('.')[0]) < 10: + wt, ht = font.getsize(ch) + else: + left, top, right, bottom = font.getbbox(ch) + wt, ht = right - left, bottom - top ch_w.append(wt) ch_h.append(ht) f_w = sum(ch_w) @@ -101,7 +106,11 @@ def get_vertical_text_picture(image_file, chars, fonts_list, cf): ch_w = [] ch_h = [] for ch in chars: - wt, ht = font.getsize(ch) + if int(PIL.__version__.split('.')[0]) < 10: + wt, ht = font.getsize(ch) + else: + left, top, right, bottom = font.getbbox(ch) + wt, ht = right - left, bottom - top ch_w.append(wt) ch_h.append(ht) f_w = max(ch_w) diff --git "a/applications/\345\277\253\351\200\237\346\236\204\345\273\272\345\215\241\350\257\201\347\261\273OCR.md" "b/applications/\345\277\253\351\200\237\346\236\204\345\273\272\345\215\241\350\257\201\347\261\273OCR.md" index a5a9460ff3f141fd433941344b81031207fbf986..79266c6c2054f5e4124e542397f1532af7445da5 100644 --- "a/applications/\345\277\253\351\200\237\346\236\204\345\273\272\345\215\241\350\257\201\347\261\273OCR.md" +++ "b/applications/\345\277\253\351\200\237\346\236\204\345\273\272\345\215\241\350\257\201\347\261\273OCR.md" @@ -223,7 +223,7 @@ AIStudio项目链接:[快速构建卡证类OCR](https://aistudio.baidu.com/ais 2)获取并解压预训练模型,如果要使用其他模型可以从模型库里自主选择合适模型。 ``` -!wget -P work/pre_trained/ https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar +!wget -P work/pre_trained/ 
https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar !tar -vxf /home/aistudio/work/pre_trained/ch_PP-OCRv3_det_distill_train.tar -C /home/aistudio/work/pre_trained ``` 3) 安装必要依赖 @@ -275,7 +275,7 @@ AIStudio项目链接:[快速构建卡证类OCR](https://aistudio.baidu.com/ais ```python class DetLabelEncode(object): - + # 修改检测标签的编码处,新增了参数分类数:num_classes,重写初始化方法,以及分类标签的读取 def __init__(self, label_list, num_classes=8, **kwargs): @@ -315,11 +315,11 @@ class DetLabelEncode(object): classes.append(int(self.label_list.index(txt))) if len(boxes) == 0: - + return None boxes = self.expand_points_num(boxes) boxes = np.array(boxes, dtype=np.float32) - txt_tags = np.array(txt_tags, dtype=np.bool) + txt_tags = np.array(txt_tags, dtype=np.bool_) classes = classes data['polys'] = boxes data['texts'] = txts @@ -410,10 +410,10 @@ class MakeShrinkMap(object): data['shrink_map'] = gt - + if self.num_classes > 1: data['class_mask'] = gt_class - + data['shrink_mask'] = mask return data ``` @@ -634,10 +634,10 @@ class DBPostProcess(object): ''' h, w = bitmap.shape[:2] box = _box.copy() - xmin = np.clip(np.floor(box[:, 0].min()).astype(np.int), 0, w - 1) - xmax = np.clip(np.ceil(box[:, 0].max()).astype(np.int), 0, w - 1) - ymin = np.clip(np.floor(box[:, 1].min()).astype(np.int), 0, h - 1) - ymax = np.clip(np.ceil(box[:, 1].max()).astype(np.int), 0, h - 1) + xmin = np.clip(np.floor(box[:, 0].min()).astype(np.int32), 0, w - 1) + xmax = np.clip(np.ceil(box[:, 0].max()).astype(np.int32), 0, w - 1) + ymin = np.clip(np.floor(box[:, 1].min()).astype(np.int32), 0, h - 1) + ymax = np.clip(np.ceil(box[:, 1].max()).astype(np.int32), 0, h - 1) mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8) box[:, 0] = box[:, 0] - xmin @@ -752,11 +752,11 @@ class DBPostProcess(object): 其他命令: ``` !python /home/aistudio/work/PaddleOCR/tools/eval.py -c /home/aistudio/work/PaddleOCR/configs/det/det_mv3_db.yml -!python /home/aistudio/work/PaddleOCR/tools/infer_det.py -c /home/aistudio/work/PaddleOCR/configs/det/det_mv3_db.yml +!python /home/aistudio/work/PaddleOCR/tools/infer_det.py -c /home/aistudio/work/PaddleOCR/configs/det/det_mv3_db.yml ``` 模型推理 ``` -!python /home/aistudio/work/PaddleOCR/tools/infer/predict_det.py --image_dir="/home/aistudio/work/test_img/" --det_model_dir="/home/aistudio/work/PaddleOCR/output/infer" +!python /home/aistudio/work/PaddleOCR/tools/infer/predict_det.py --image_dir="/home/aistudio/work/test_img/" --det_model_dir="/home/aistudio/work/PaddleOCR/output/infer" ``` ## 5 总结 diff --git a/benchmark/PaddleOCR_DBNet/.gitattributes b/benchmark/PaddleOCR_DBNet/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..8543e0a71603c693e83841b4a29a04c54a24d2a0 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/.gitattributes @@ -0,0 +1,2 @@ +*.html linguist-language=python +*.ipynb linguist-language=python \ No newline at end of file diff --git a/benchmark/PaddleOCR_DBNet/.gitignore b/benchmark/PaddleOCR_DBNet/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..cef1c73b393daf2b192bef1aafd9c612517247a4 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/.gitignore @@ -0,0 +1,16 @@ +.DS_Store +*.pth +*.pyc +*.pyo +*.log +*.tmp +*.pkl +__pycache__/ +.idea/ +output/ +test/*.jpg +datasets/ +index/ +train_log/ +log/ +profiling_log/ \ No newline at end of file diff --git a/benchmark/PaddleOCR_DBNet/LICENSE.md b/benchmark/PaddleOCR_DBNet/LICENSE.md new file mode 100644 index 0000000000000000000000000000000000000000..b09cd7856d58590578ee1a4f3ad45d1310a97f87 --- 
/dev/null +++ b/benchmark/PaddleOCR_DBNet/LICENSE.md @@ -0,0 +1,201 @@ +Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/benchmark/PaddleOCR_DBNet/README.MD b/benchmark/PaddleOCR_DBNet/README.MD new file mode 100644 index 0000000000000000000000000000000000000000..dbc07faafb1fbf519ad64bb6b6a801889fd3042e --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/README.MD @@ -0,0 +1,132 @@ +# Real-time Scene Text Detection with Differentiable Binarization + +**note**: some code is inherited from [WenmuZhou/DBNet.pytorch](https://github.com/WenmuZhou/DBNet.pytorch) + +[中文解读](https://zhuanlan.zhihu.com/p/94677957) + +![network](imgs/paper/db.jpg) + +## update +2020-06-07: 添加灰度图训练,训练灰度图时需要在配置里移除`dataset.args.transforms.Normalize` + +## Install Using Conda +``` +conda env create -f environment.yml +git clone https://github.com/WenmuZhou/DBNet.paddle.git +cd DBNet.paddle/ +``` + +or +## Install Manually +```bash +conda create -n dbnet python=3.6 +conda activate dbnet + +conda install ipython pip + +# python dependencies +pip install -r requirement.txt + +# clone repo +git clone https://github.com/WenmuZhou/DBNet.paddle.git +cd DBNet.paddle/ + +``` + +## Requirements +* paddlepaddle 2.4+ + +## Download + +TBD + +## Data Preparation + +Training data: prepare a text `train.txt` in the following format, use '\t' as a separator +``` +./datasets/train/img/001.jpg ./datasets/train/gt/001.txt +``` + +Validation data: prepare a text `test.txt` in the following format, use '\t' as a separator +``` +./datasets/test/img/001.jpg ./datasets/test/gt/001.txt +``` +- Store images in the `img` folder +- Store groundtruth in the `gt` folder + +The groundtruth can be `.txt` files, with the following format: +``` +x1, y1, x2, y2, x3, y3, x4, y4, annotation +``` + + +## Train +1. config the `dataset['train']['dataset'['data_path']'`,`dataset['validate']['dataset'['data_path']`in [config/icdar2015_resnet18_fpn_DBhead_polyLR.yaml](cconfig/icdar2015_resnet18_fpn_DBhead_polyLR.yaml) +* . single gpu train +```bash +bash singlel_gpu_train.sh +``` +* . Multi-gpu training +```bash +bash multi_gpu_train.sh +``` +## Test + +[eval.py](tools/eval.py) is used to test model on test dataset + +1. config `model_path` in [eval.sh](eval.sh) +2. use following script to test +```bash +bash eval.sh +``` + +## Predict +[predict.py](tools/predict.py) Can be used to inference on all images in a folder +1. config `model_path`,`input_folder`,`output_folder` in [predict.sh](predict.sh) +2. use following script to predict +``` +bash predict.sh +``` +You can change the `model_path` in the `predict.sh` file to your model location. 
+
+Tips: if the result is not good, you can adjust `thre` in [predict.sh](predict.sh)
+
+## Export Model
+
+[export_model.py](tools/export_model.py) can be used to export an inference model
+
+Use the following script to export the inference model:
+
+```
+python tools/export_model.py --config_file config/icdar2015_resnet50_FPN_DBhead_polyLR.yaml -o trainer.resume_checkpoint=model_best.pth trainer.output_dir=output/infer
+```
+
+## Paddle Inference infer
+
+[infer.py](tools/infer.py) can be used to run Paddle Inference on all images in a folder
+
+Use the following script to run inference:
+
+```
+python tools/infer.py --model-dir=output/infer/ --img-path imgs/paper/db.jpg
+```
+
+

+## Performance
+ +### [ICDAR 2015](http://rrc.cvc.uab.es/?ch=4) +only train on ICDAR2015 dataset + +| Method | image size (short size) |learning rate | Precision (%) | Recall (%) | F-measure (%) | FPS | +|:--------------------------:|:-------:|:--------:|:--------:|:------------:|:---------------:|:-----:| +| ImageNet-resnet50-FPN-DBHead(torch) |736 |1e-3|90.19 | 78.14 | 83.88 | 27 | +| ImageNet-resnet50-FPN-DBHead(paddle) |736 |1e-3| 89.47 | 79.03 | 83.92 | 27 | +| ImageNet-resnet50-FPN-DBHead(paddle_amp) |736 |1e-3| 88.62 | 79.95 | 84.06 | 27 | + + +### examples +TBD + + +### reference +1. https://arxiv.org/pdf/1911.08947.pdf +2. https://github.com/WenmuZhou/DBNet.pytorch + +**If this repository helps you,please star it. Thanks.** diff --git a/benchmark/PaddleOCR_DBNet/base/__init__.py b/benchmark/PaddleOCR_DBNet/base/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..223e9e02d7ab1fbc2819f9ff33e04a2bd0d0303c --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/base/__init__.py @@ -0,0 +1,2 @@ +from .base_trainer import BaseTrainer +from .base_dataset import BaseDataSet \ No newline at end of file diff --git a/benchmark/PaddleOCR_DBNet/base/base_dataset.py b/benchmark/PaddleOCR_DBNet/base/base_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..4a839a8ffbc34a2f671e905f8750269eb5af0371 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/base/base_dataset.py @@ -0,0 +1,87 @@ +# -*- coding: utf-8 -*- +# @Time : 2019/12/4 13:12 +# @Author : zhoujun +import copy +from paddle.io import Dataset +from data_loader.modules import * + + +class BaseDataSet(Dataset): + def __init__(self, + data_path: str, + img_mode, + pre_processes, + filter_keys, + ignore_tags, + transform=None, + target_transform=None): + assert img_mode in ['RGB', 'BRG', 'GRAY'] + self.ignore_tags = ignore_tags + self.data_list = self.load_data(data_path) + item_keys = [ + 'img_path', 'img_name', 'text_polys', 'texts', 'ignore_tags' + ] + for item in item_keys: + assert item in self.data_list[ + 0], 'data_list from load_data must contains {}'.format( + item_keys) + self.img_mode = img_mode + self.filter_keys = filter_keys + self.transform = transform + self.target_transform = target_transform + self._init_pre_processes(pre_processes) + + def _init_pre_processes(self, pre_processes): + self.aug = [] + if pre_processes is not None: + for aug in pre_processes: + if 'args' not in aug: + args = {} + else: + args = aug['args'] + if isinstance(args, dict): + cls = eval(aug['type'])(**args) + else: + cls = eval(aug['type'])(args) + self.aug.append(cls) + + def load_data(self, data_path: str) -> list: + """ + 把数据加载为一个list: + :params data_path: 存储数据的文件夹或者文件 + return a dict ,包含了,'img_path','img_name','text_polys','texts','ignore_tags' + """ + raise NotImplementedError + + def apply_pre_processes(self, data): + for aug in self.aug: + data = aug(data) + return data + + def __getitem__(self, index): + try: + data = copy.deepcopy(self.data_list[index]) + im = cv2.imread(data['img_path'], 1 + if self.img_mode != 'GRAY' else 0) + if self.img_mode == 'RGB': + im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) + data['img'] = im + data['shape'] = [im.shape[0], im.shape[1]] + data = self.apply_pre_processes(data) + + if self.transform: + data['img'] = self.transform(data['img']) + data['text_polys'] = data['text_polys'].tolist() + if len(self.filter_keys): + data_dict = {} + for k, v in data.items(): + if k not in self.filter_keys: + data_dict[k] = v + return data_dict + else: + return data + except: + return 
self.__getitem__(np.random.randint(self.__len__())) + + def __len__(self): + return len(self.data_list) diff --git a/benchmark/PaddleOCR_DBNet/base/base_trainer.py b/benchmark/PaddleOCR_DBNet/base/base_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..82c308d361ca0b1da274aefdef6b626a81ae3cc7 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/base/base_trainer.py @@ -0,0 +1,250 @@ +# -*- coding: utf-8 -*- +# @Time : 2019/8/23 21:50 +# @Author : zhoujun + +import os +import pathlib +import shutil +from pprint import pformat + +import anyconfig +import paddle +import numpy as np +import random +from paddle.jit import to_static +from paddle.static import InputSpec + +from utils import setup_logger + + +class BaseTrainer: + def __init__(self, + config, + model, + criterion, + train_loader, + validate_loader, + metric_cls, + post_process=None): + config['trainer']['output_dir'] = os.path.join( + str(pathlib.Path(os.path.abspath(__name__)).parent), + config['trainer']['output_dir']) + config['name'] = config['name'] + '_' + model.name + self.save_dir = config['trainer']['output_dir'] + self.checkpoint_dir = os.path.join(self.save_dir, 'checkpoint') + + os.makedirs(self.checkpoint_dir, exist_ok=True) + + self.global_step = 0 + self.start_epoch = 0 + self.config = config + self.criterion = criterion + # logger and tensorboard + self.visualdl_enable = self.config['trainer'].get('visual_dl', False) + self.epochs = self.config['trainer']['epochs'] + self.log_iter = self.config['trainer']['log_iter'] + if paddle.distributed.get_rank() == 0: + anyconfig.dump(config, os.path.join(self.save_dir, 'config.yaml')) + self.logger = setup_logger(os.path.join(self.save_dir, 'train.log')) + self.logger_info(pformat(self.config)) + + self.model = self.apply_to_static(model) + + # device + if paddle.device.cuda.device_count( + ) > 0 and paddle.device.is_compiled_with_cuda(): + self.with_cuda = True + random.seed(self.config['trainer']['seed']) + np.random.seed(self.config['trainer']['seed']) + paddle.seed(self.config['trainer']['seed']) + else: + self.with_cuda = False + self.logger_info('train with and paddle {}'.format(paddle.__version__)) + # metrics + self.metrics = { + 'recall': 0, + 'precision': 0, + 'hmean': 0, + 'train_loss': float('inf'), + 'best_model_epoch': 0 + } + + self.train_loader = train_loader + if validate_loader is not None: + assert post_process is not None and metric_cls is not None + self.validate_loader = validate_loader + self.post_process = post_process + self.metric_cls = metric_cls + self.train_loader_len = len(train_loader) + + if self.validate_loader is not None: + self.logger_info( + 'train dataset has {} samples,{} in dataloader, validate dataset has {} samples,{} in dataloader'. 
+ format( + len(self.train_loader.dataset), self.train_loader_len, + len(self.validate_loader.dataset), + len(self.validate_loader))) + else: + self.logger_info( + 'train dataset has {} samples,{} in dataloader'.format( + len(self.train_loader.dataset), self.train_loader_len)) + + self._initialize_scheduler() + + self._initialize_optimizer() + + # resume or finetune + if self.config['trainer']['resume_checkpoint'] != '': + self._load_checkpoint( + self.config['trainer']['resume_checkpoint'], resume=True) + elif self.config['trainer']['finetune_checkpoint'] != '': + self._load_checkpoint( + self.config['trainer']['finetune_checkpoint'], resume=False) + + if self.visualdl_enable and paddle.distributed.get_rank() == 0: + from visualdl import LogWriter + self.writer = LogWriter(self.save_dir) + + # 混合精度训练 + self.amp = self.config.get('amp', None) + if self.amp == 'None': + self.amp = None + if self.amp: + self.amp['scaler'] = paddle.amp.GradScaler( + init_loss_scaling=self.amp.get("scale_loss", 1024), + use_dynamic_loss_scaling=self.amp.get( + 'use_dynamic_loss_scaling', True)) + self.model, self.optimizer = paddle.amp.decorate( + models=self.model, + optimizers=self.optimizer, + level=self.amp.get('amp_level', 'O2')) + + # 分布式训练 + if paddle.device.cuda.device_count() > 1: + self.model = paddle.DataParallel(self.model) + # make inverse Normalize + self.UN_Normalize = False + for t in self.config['dataset']['train']['dataset']['args'][ + 'transforms']: + if t['type'] == 'Normalize': + self.normalize_mean = t['args']['mean'] + self.normalize_std = t['args']['std'] + self.UN_Normalize = True + + def apply_to_static(self, model): + support_to_static = self.config['trainer'].get('to_static', False) + if support_to_static: + specs = None + print('static') + specs = [InputSpec([None, 3, -1, -1])] + model = to_static(model, input_spec=specs) + self.logger_info( + "Successfully to apply @to_static with specs: {}".format(specs)) + return model + + def train(self): + """ + Full training logic + """ + for epoch in range(self.start_epoch + 1, self.epochs + 1): + self.epoch_result = self._train_epoch(epoch) + self._on_epoch_finish() + if paddle.distributed.get_rank() == 0 and self.visualdl_enable: + self.writer.close() + self._on_train_finish() + + def _train_epoch(self, epoch): + """ + Training logic for an epoch + + :param epoch: Current epoch number + """ + raise NotImplementedError + + def _eval(self, epoch): + """ + eval logic for an epoch + + :param epoch: Current epoch number + """ + raise NotImplementedError + + def _on_epoch_finish(self): + raise NotImplementedError + + def _on_train_finish(self): + raise NotImplementedError + + def _save_checkpoint(self, epoch, file_name): + """ + Saving checkpoints + + :param epoch: current epoch number + :param log: logging information of the epoch + :param save_best: if True, rename the saved checkpoint to 'model_best.pth.tar' + """ + state_dict = self.model.state_dict() + state = { + 'epoch': epoch, + 'global_step': self.global_step, + 'state_dict': state_dict, + 'optimizer': self.optimizer.state_dict(), + 'config': self.config, + 'metrics': self.metrics + } + filename = os.path.join(self.checkpoint_dir, file_name) + paddle.save(state, filename) + + def _load_checkpoint(self, checkpoint_path, resume): + """ + Resume from saved checkpoints + :param checkpoint_path: Checkpoint path to be resumed + """ + self.logger_info("Loading checkpoint: {} ...".format(checkpoint_path)) + checkpoint = paddle.load(checkpoint_path) + 
self.model.set_state_dict(checkpoint['state_dict']) + if resume: + self.global_step = checkpoint['global_step'] + self.start_epoch = checkpoint['epoch'] + self.config['lr_scheduler']['args']['last_epoch'] = self.start_epoch + # self.scheduler.load_state_dict(checkpoint['scheduler']) + self.optimizer.set_state_dict(checkpoint['optimizer']) + if 'metrics' in checkpoint: + self.metrics = checkpoint['metrics'] + self.logger_info("resume from checkpoint {} (epoch {})".format( + checkpoint_path, self.start_epoch)) + else: + self.logger_info("finetune from checkpoint {}".format( + checkpoint_path)) + + def _initialize(self, name, module, *args, **kwargs): + module_name = self.config[name]['type'] + module_args = self.config[name].get('args', {}) + assert all([k not in module_args for k in kwargs + ]), 'Overwriting kwargs given in config file is not allowed' + module_args.update(kwargs) + return getattr(module, module_name)(*args, **module_args) + + def _initialize_scheduler(self): + self.lr_scheduler = self._initialize('lr_scheduler', + paddle.optimizer.lr) + + def _initialize_optimizer(self): + self.optimizer = self._initialize( + 'optimizer', + paddle.optimizer, + parameters=self.model.parameters(), + learning_rate=self.lr_scheduler) + + def inverse_normalize(self, batch_img): + if self.UN_Normalize: + batch_img[:, 0, :, :] = batch_img[:, 0, :, :] * self.normalize_std[ + 0] + self.normalize_mean[0] + batch_img[:, 1, :, :] = batch_img[:, 1, :, :] * self.normalize_std[ + 1] + self.normalize_mean[1] + batch_img[:, 2, :, :] = batch_img[:, 2, :, :] * self.normalize_std[ + 2] + self.normalize_mean[2] + + def logger_info(self, s): + if paddle.distributed.get_rank() == 0: + self.logger.info(s) diff --git a/benchmark/PaddleOCR_DBNet/config/SynthText.yaml b/benchmark/PaddleOCR_DBNet/config/SynthText.yaml new file mode 100644 index 0000000000000000000000000000000000000000..61d5da7d3b4b8ec9da0bd79020d95c694ecef513 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/config/SynthText.yaml @@ -0,0 +1,40 @@ +name: DBNet +dataset: + train: + dataset: + type: SynthTextDataset # 数据集类型 + args: + data_path: ''# SynthTextDataset 根目录 + pre_processes: # 数据的预处理过程,包含augment和标签制作 + - type: IaaAugment # 使用imgaug进行变换 + args: + - {'type':Fliplr, 'args':{'p':0.5}} + - {'type': Affine, 'args':{'rotate':[-10,10]}} + - {'type':Resize,'args':{'size':[0.5,3]}} + - type: EastRandomCropData + args: + size: [640,640] + max_tries: 50 + keep_ratio: true + - type: MakeBorderMap + args: + shrink_ratio: 0.4 + - type: MakeShrinkMap + args: + shrink_ratio: 0.4 + min_text_size: 8 + transforms: # 对图片进行的变换方式 + - type: ToTensor + args: {} + - type: Normalize + args: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + img_mode: RGB + filter_keys: ['img_path','img_name','text_polys','texts','ignore_tags','shape'] # 返回数据之前,从数据字典里删除的key + ignore_tags: ['*', '###'] + loader: + batch_size: 1 + shuffle: true + num_workers: 0 + collate_fn: '' \ No newline at end of file diff --git a/benchmark/PaddleOCR_DBNet/config/SynthText_resnet18_FPN_DBhead_polyLR.yaml b/benchmark/PaddleOCR_DBNet/config/SynthText_resnet18_FPN_DBhead_polyLR.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a665e94a7fbaa531b67a385ceeeae3be843e97b6 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/config/SynthText_resnet18_FPN_DBhead_polyLR.yaml @@ -0,0 +1,65 @@ +name: DBNet +base: ['config/SynthText.yaml'] +arch: + type: Model + backbone: + type: resnet18 + pretrained: true + neck: + type: FPN + inner_channels: 256 + head: + type: DBHead + out_channels: 2 + k: 
50 +post_processing: + type: SegDetectorRepresenter + args: + thresh: 0.3 + box_thresh: 0.7 + max_candidates: 1000 + unclip_ratio: 1.5 # from paper +metric: + type: QuadMetric + args: + is_output_polygon: false +loss: + type: DBLoss + alpha: 1 + beta: 10 + ohem_ratio: 3 +optimizer: + type: Adam + args: + lr: 0.001 + weight_decay: 0 + amsgrad: true +lr_scheduler: + type: WarmupPolyLR + args: + warmup_epoch: 3 +trainer: + seed: 2 + epochs: 1200 + log_iter: 10 + show_images_iter: 50 + resume_checkpoint: '' + finetune_checkpoint: '' + output_dir: output + visual_dl: false +amp: + scale_loss: 1024 + amp_level: O2 + custom_white_list: [] + custom_black_list: ['exp', 'sigmoid', 'concat'] +dataset: + train: + dataset: + args: + data_path: ./datasets/SynthText + img_mode: RGB + loader: + batch_size: 2 + shuffle: true + num_workers: 6 + collate_fn: '' \ No newline at end of file diff --git a/benchmark/PaddleOCR_DBNet/config/icdar2015.yaml b/benchmark/PaddleOCR_DBNet/config/icdar2015.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4551b14b24acecf079b677699dceed01f7a68c12 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/config/icdar2015.yaml @@ -0,0 +1,69 @@ +name: DBNet +dataset: + train: + dataset: + type: ICDAR2015Dataset # 数据集类型 + args: + data_path: # 一个存放 img_path \t gt_path的文件 + - '' + pre_processes: # 数据的预处理过程,包含augment和标签制作 + - type: IaaAugment # 使用imgaug进行变换 + args: + - {'type':Fliplr, 'args':{'p':0.5}} + - {'type': Affine, 'args':{'rotate':[-10,10]}} + - {'type':Resize,'args':{'size':[0.5,3]}} + - type: EastRandomCropData + args: + size: [640,640] + max_tries: 50 + keep_ratio: true + - type: MakeBorderMap + args: + shrink_ratio: 0.4 + thresh_min: 0.3 + thresh_max: 0.7 + - type: MakeShrinkMap + args: + shrink_ratio: 0.4 + min_text_size: 8 + transforms: # 对图片进行的变换方式 + - type: ToTensor + args: {} + - type: Normalize + args: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + img_mode: RGB + filter_keys: [img_path,img_name,text_polys,texts,ignore_tags,shape] # 返回数据之前,从数据字典里删除的key + ignore_tags: ['*', '###'] + loader: + batch_size: 1 + shuffle: true + num_workers: 0 + collate_fn: '' + validate: + dataset: + type: ICDAR2015Dataset + args: + data_path: + - '' + pre_processes: + - type: ResizeShortSize + args: + short_size: 736 + resize_text_polys: false + transforms: + - type: ToTensor + args: {} + - type: Normalize + args: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + img_mode: RGB + filter_keys: [] + ignore_tags: ['*', '###'] + loader: + batch_size: 1 + shuffle: true + num_workers: 0 + collate_fn: ICDARCollectFN \ No newline at end of file diff --git a/benchmark/PaddleOCR_DBNet/config/icdar2015_dcn_resnet18_FPN_DBhead_polyLR.yaml b/benchmark/PaddleOCR_DBNet/config/icdar2015_dcn_resnet18_FPN_DBhead_polyLR.yaml new file mode 100644 index 0000000000000000000000000000000000000000..608ef42c1a2085a7450cf1662071f26cd7d472ae --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/config/icdar2015_dcn_resnet18_FPN_DBhead_polyLR.yaml @@ -0,0 +1,82 @@ +name: DBNet +base: ['config/icdar2015.yaml'] +arch: + type: Model + backbone: + type: deformable_resnet18 + pretrained: true + neck: + type: FPN + inner_channels: 256 + head: + type: DBHead + out_channels: 2 + k: 50 +post_processing: + type: SegDetectorRepresenter + args: + thresh: 0.3 + box_thresh: 0.7 + max_candidates: 1000 + unclip_ratio: 1.5 # from paper +metric: + type: QuadMetric + args: + is_output_polygon: false +loss: + type: DBLoss + alpha: 1 + beta: 10 + ohem_ratio: 3 +optimizer: + type: Adam + args: + lr: 0.001 
+ weight_decay: 0 + amsgrad: true +lr_scheduler: + type: WarmupPolyLR + args: + warmup_epoch: 3 +trainer: + seed: 2 + epochs: 1200 + log_iter: 10 + show_images_iter: 50 + resume_checkpoint: '' + finetune_checkpoint: '' + output_dir: output + visual_dl: false +amp: + scale_loss: 1024 + amp_level: O2 + custom_white_list: [] + custom_black_list: ['exp', 'sigmoid', 'concat'] +dataset: + train: + dataset: + args: + data_path: + - ./datasets/train.txt + img_mode: RGB + loader: + batch_size: 1 + shuffle: true + num_workers: 6 + collate_fn: '' + validate: + dataset: + args: + data_path: + - ./datasets/test.txt + pre_processes: + - type: ResizeShortSize + args: + short_size: 736 + resize_text_polys: false + img_mode: RGB + loader: + batch_size: 1 + shuffle: true + num_workers: 6 + collate_fn: ICDARCollectFN \ No newline at end of file diff --git a/benchmark/PaddleOCR_DBNet/config/icdar2015_resnet18_FPN_DBhead_polyLR.yaml b/benchmark/PaddleOCR_DBNet/config/icdar2015_resnet18_FPN_DBhead_polyLR.yaml new file mode 100644 index 0000000000000000000000000000000000000000..62c392b9ce67e754a212fa219fdc51e685d52f32 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/config/icdar2015_resnet18_FPN_DBhead_polyLR.yaml @@ -0,0 +1,82 @@ +name: DBNet +base: ['config/icdar2015.yaml'] +arch: + type: Model + backbone: + type: resnet18 + pretrained: true + neck: + type: FPN + inner_channels: 256 + head: + type: DBHead + out_channels: 2 + k: 50 +post_processing: + type: SegDetectorRepresenter + args: + thresh: 0.3 + box_thresh: 0.7 + max_candidates: 1000 + unclip_ratio: 1.5 # from paper +metric: + type: QuadMetric + args: + is_output_polygon: false +loss: + type: DBLoss + alpha: 1 + beta: 10 + ohem_ratio: 3 +optimizer: + type: Adam + args: + lr: 0.001 + weight_decay: 0 + amsgrad: true +lr_scheduler: + type: WarmupPolyLR + args: + warmup_epoch: 3 +trainer: + seed: 2 + epochs: 1200 + log_iter: 10 + show_images_iter: 50 + resume_checkpoint: '' + finetune_checkpoint: '' + output_dir: output + visual_dl: false +amp: + scale_loss: 1024 + amp_level: O2 + custom_white_list: [] + custom_black_list: ['exp', 'sigmoid', 'concat'] +dataset: + train: + dataset: + args: + data_path: + - ./datasets/train.txt + img_mode: RGB + loader: + batch_size: 1 + shuffle: true + num_workers: 6 + collate_fn: '' + validate: + dataset: + args: + data_path: + - ./datasets/test.txt + pre_processes: + - type: ResizeShortSize + args: + short_size: 736 + resize_text_polys: false + img_mode: RGB + loader: + batch_size: 1 + shuffle: true + num_workers: 6 + collate_fn: ICDARCollectFN diff --git a/benchmark/PaddleOCR_DBNet/config/icdar2015_resnet18_FPN_DBhead_polyLR_finetune.yaml b/benchmark/PaddleOCR_DBNet/config/icdar2015_resnet18_FPN_DBhead_polyLR_finetune.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9b018d5cdd8877055f0c0372f313ab5d1beeb881 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/config/icdar2015_resnet18_FPN_DBhead_polyLR_finetune.yaml @@ -0,0 +1,83 @@ +name: DBNet +base: ['config/icdar2015.yaml'] +arch: + type: Model + backbone: + type: resnet18 + pretrained: true + neck: + type: FPN + inner_channels: 256 + head: + type: DBHead + out_channels: 2 + k: 50 +post_processing: + type: SegDetectorRepresenter + args: + thresh: 0.3 + box_thresh: 0.7 + max_candidates: 1000 + unclip_ratio: 1.5 # from paper +metric: + type: QuadMetric + args: + is_output_polygon: false +loss: + type: DBLoss + alpha: 1 + beta: 10 + ohem_ratio: 3 +optimizer: + type: Adam + args: + lr: 0.001 + weight_decay: 0 + amsgrad: true +lr_scheduler: + type: StepLR + 
args: + step_size: 10 + gama: 0.8 +trainer: + seed: 2 + epochs: 500 + log_iter: 10 + show_images_iter: 50 + resume_checkpoint: '' + finetune_checkpoint: '' + output_dir: output + visual_dl: false +amp: + scale_loss: 1024 + amp_level: O2 + custom_white_list: [] + custom_black_list: ['exp', 'sigmoid', 'concat'] +dataset: + train: + dataset: + args: + data_path: + - ./datasets/train.txt + img_mode: RGB + loader: + batch_size: 1 + shuffle: true + num_workers: 6 + collate_fn: '' + validate: + dataset: + args: + data_path: + - ./datasets/test.txt + pre_processes: + - type: ResizeShortSize + args: + short_size: 736 + resize_text_polys: false + img_mode: RGB + loader: + batch_size: 1 + shuffle: true + num_workers: 6 + collate_fn: ICDARCollectFN diff --git a/benchmark/PaddleOCR_DBNet/config/icdar2015_resnet50_FPN_DBhead_polyLR.yaml b/benchmark/PaddleOCR_DBNet/config/icdar2015_resnet50_FPN_DBhead_polyLR.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2a870fd7c75de41c6f7cf20854029dfd7ed38e16 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/config/icdar2015_resnet50_FPN_DBhead_polyLR.yaml @@ -0,0 +1,79 @@ +name: DBNet +base: ['config/icdar2015.yaml'] +arch: + type: Model + backbone: + type: resnet50 + pretrained: true + neck: + type: FPN + inner_channels: 256 + head: + type: DBHead + out_channels: 2 + k: 50 +post_processing: + type: SegDetectorRepresenter + args: + thresh: 0.3 + box_thresh: 0.7 + max_candidates: 1000 + unclip_ratio: 1.5 # from paper +metric: + type: QuadMetric + args: + is_output_polygon: false +loss: + type: DBLoss + alpha: 1 + beta: 10 + ohem_ratio: 3 +optimizer: + type: Adam +lr_scheduler: + type: Polynomial + args: + learning_rate: 0.001 + warmup_epoch: 3 +trainer: + seed: 2 + epochs: 1200 + log_iter: 10 + show_images_iter: 50 + resume_checkpoint: '' + finetune_checkpoint: '' + output_dir: output/fp16_o2 + visual_dl: false +amp: + scale_loss: 1024 + amp_level: O2 + custom_white_list: [] + custom_black_list: ['exp', 'sigmoid', 'concat'] +dataset: + train: + dataset: + args: + data_path: + - ./datasets/train.txt + img_mode: RGB + loader: + batch_size: 16 + shuffle: true + num_workers: 6 + collate_fn: '' + validate: + dataset: + args: + data_path: + - ./datasets/test.txt + pre_processes: + - type: ResizeShortSize + args: + short_size: 736 + resize_text_polys: false + img_mode: RGB + loader: + batch_size: 1 + shuffle: true + num_workers: 6 + collate_fn: ICDARCollectFN diff --git a/benchmark/PaddleOCR_DBNet/config/open_dataset.yaml b/benchmark/PaddleOCR_DBNet/config/open_dataset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..97267586c28fbd0145a8b14ed39c5520d6716bee --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/config/open_dataset.yaml @@ -0,0 +1,73 @@ +name: DBNet +dataset: + train: + dataset: + type: DetDataset # 数据集类型 + args: + data_path: # 一个存放 img_path \t gt_path的文件 + - '' + pre_processes: # 数据的预处理过程,包含augment和标签制作 + - type: IaaAugment # 使用imgaug进行变换 + args: + - {'type':Fliplr, 'args':{'p':0.5}} + - {'type': Affine, 'args':{'rotate':[-10,10]}} + - {'type':Resize,'args':{'size':[0.5,3]}} + - type: EastRandomCropData + args: + size: [640,640] + max_tries: 50 + keep_ratio: true + - type: MakeBorderMap + args: + shrink_ratio: 0.4 + thresh_min: 0.3 + thresh_max: 0.7 + - type: MakeShrinkMap + args: + shrink_ratio: 0.4 + min_text_size: 8 + transforms: # 对图片进行的变换方式 + - type: ToTensor + args: {} + - type: Normalize + args: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + img_mode: RGB + load_char_annotation: false + 
expand_one_char: false + filter_keys: [img_path,img_name,text_polys,texts,ignore_tags,shape] # 返回数据之前,从数据字典里删除的key + ignore_tags: ['*', '###'] + loader: + batch_size: 1 + shuffle: true + num_workers: 0 + collate_fn: '' + validate: + dataset: + type: DetDataset + args: + data_path: + - '' + pre_processes: + - type: ResizeShortSize + args: + short_size: 736 + resize_text_polys: false + transforms: + - type: ToTensor + args: {} + - type: Normalize + args: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + img_mode: RGB + load_char_annotation: false # 是否加载字符级标注 + expand_one_char: false # 是否对只有一个字符的框进行宽度扩充,扩充后w = w+h + filter_keys: [] + ignore_tags: ['*', '###'] + loader: + batch_size: 1 + shuffle: true + num_workers: 0 + collate_fn: ICDARCollectFN \ No newline at end of file diff --git a/benchmark/PaddleOCR_DBNet/config/open_dataset_dcn_resnet50_FPN_DBhead_polyLR.yaml b/benchmark/PaddleOCR_DBNet/config/open_dataset_dcn_resnet50_FPN_DBhead_polyLR.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6c81738720e4de47ff2e528653eb0069b19dffdd --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/config/open_dataset_dcn_resnet50_FPN_DBhead_polyLR.yaml @@ -0,0 +1,86 @@ +name: DBNet +base: ['config/open_dataset.yaml'] +arch: + type: Model + backbone: + type: deformable_resnet18 + pretrained: true + neck: + type: FPN + inner_channels: 256 + head: + type: DBHead + out_channels: 2 + k: 50 +post_processing: + type: SegDetectorRepresenter + args: + thresh: 0.3 + box_thresh: 0.7 + max_candidates: 1000 + unclip_ratio: 1.5 # from paper +metric: + type: QuadMetric + args: + is_output_polygon: false +loss: + type: DBLoss + alpha: 1 + beta: 10 + ohem_ratio: 3 +optimizer: + type: Adam + args: + lr: 0.001 + weight_decay: 0 + amsgrad: true +lr_scheduler: + type: WarmupPolyLR + args: + warmup_epoch: 3 +trainer: + seed: 2 + epochs: 1200 + log_iter: 1 + show_images_iter: 1 + resume_checkpoint: '' + finetune_checkpoint: '' + output_dir: output + visual_dl: false +amp: + scale_loss: 1024 + amp_level: O2 + custom_white_list: [] + custom_black_list: ['exp', 'sigmoid', 'concat'] +dataset: + train: + dataset: + args: + data_path: + - ./datasets/train.json + img_mode: RGB + load_char_annotation: false + expand_one_char: false + loader: + batch_size: 2 + shuffle: true + num_workers: 6 + collate_fn: '' + validate: + dataset: + args: + data_path: + - ./datasets/test.json + pre_processes: + - type: ResizeShortSize + args: + short_size: 736 + resize_text_polys: false + img_mode: RGB + load_char_annotation: false + expand_one_char: false + loader: + batch_size: 1 + shuffle: true + num_workers: 6 + collate_fn: ICDARCollectFN diff --git a/benchmark/PaddleOCR_DBNet/config/open_dataset_resnest50_FPN_DBhead_polyLR.yaml b/benchmark/PaddleOCR_DBNet/config/open_dataset_resnest50_FPN_DBhead_polyLR.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d47ab06eddfebdd45376a7d60a771923215b932d --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/config/open_dataset_resnest50_FPN_DBhead_polyLR.yaml @@ -0,0 +1,86 @@ +name: DBNet +base: ['config/open_dataset.yaml'] +arch: + type: Model + backbone: + type: resnest50 + pretrained: true + neck: + type: FPN + inner_channels: 256 + head: + type: DBHead + out_channels: 2 + k: 50 +post_processing: + type: SegDetectorRepresenter + args: + thresh: 0.3 + box_thresh: 0.7 + max_candidates: 1000 + unclip_ratio: 1.5 # from paper +metric: + type: QuadMetric + args: + is_output_polygon: false +loss: + type: DBLoss + alpha: 1 + beta: 10 + ohem_ratio: 3 +optimizer: + type: Adam 
+ args: + lr: 0.001 + weight_decay: 0 + amsgrad: true +lr_scheduler: + type: WarmupPolyLR + args: + warmup_epoch: 3 +trainer: + seed: 2 + epochs: 1200 + log_iter: 1 + show_images_iter: 1 + resume_checkpoint: '' + finetune_checkpoint: '' + output_dir: output + visual_dl: false +amp: + scale_loss: 1024 + amp_level: O2 + custom_white_list: [] + custom_black_list: ['exp', 'sigmoid', 'concat'] +dataset: + train: + dataset: + args: + data_path: + - ./datasets/train.json + img_mode: RGB + load_char_annotation: false + expand_one_char: false + loader: + batch_size: 2 + shuffle: true + num_workers: 6 + collate_fn: '' + validate: + dataset: + args: + data_path: + - ./datasets/test.json + pre_processes: + - type: ResizeShortSize + args: + short_size: 736 + resize_text_polys: false + img_mode: RGB + load_char_annotation: false + expand_one_char: false + loader: + batch_size: 1 + shuffle: true + num_workers: 6 + collate_fn: ICDARCollectFN diff --git a/benchmark/PaddleOCR_DBNet/config/open_dataset_resnet18_FPN_DBhead_polyLR.yaml b/benchmark/PaddleOCR_DBNet/config/open_dataset_resnet18_FPN_DBhead_polyLR.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ff16ddb26dc90b62dabde59630b2450de4fdf326 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/config/open_dataset_resnet18_FPN_DBhead_polyLR.yaml @@ -0,0 +1,93 @@ +name: DBNet +base: ['config/open_dataset.yaml'] +arch: + type: Model + backbone: + type: resnet18 + pretrained: true + neck: + type: FPN + inner_channels: 256 + head: + type: DBHead + out_channels: 2 + k: 50 +post_processing: + type: SegDetectorRepresenter + args: + thresh: 0.3 + box_thresh: 0.7 + max_candidates: 1000 + unclip_ratio: 1.5 # from paper +metric: + type: QuadMetric + args: + is_output_polygon: false +loss: + type: DBLoss + alpha: 1 + beta: 10 + ohem_ratio: 3 +optimizer: + type: Adam + args: + lr: 0.001 + weight_decay: 0 + amsgrad: true +lr_scheduler: + type: WarmupPolyLR + args: + warmup_epoch: 3 +trainer: + seed: 2 + epochs: 1200 + log_iter: 1 + show_images_iter: 1 + resume_checkpoint: '' + finetune_checkpoint: '' + output_dir: output + visual_dl: false +amp: + scale_loss: 1024 + amp_level: O2 + custom_white_list: [] + custom_black_list: ['exp', 'sigmoid', 'concat'] +dataset: + train: + dataset: + args: + data_path: + - ./datasets/train.json + transforms: # 对图片进行的变换方式 + - type: ToTensor + args: {} + - type: Normalize + args: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + img_mode: RGB + load_char_annotation: false + expand_one_char: false + loader: + batch_size: 2 + shuffle: true + num_workers: 6 + collate_fn: '' + validate: + dataset: + args: + data_path: + - ./datasets/test.json + pre_processes: + - type: ResizeShortSize + args: + short_size: 736 + resize_text_polys: false + img_mode: RGB + load_char_annotation: false + expand_one_char: false + loader: + batch_size: 1 + shuffle: true + num_workers: 6 + collate_fn: ICDARCollectFN diff --git a/benchmark/PaddleOCR_DBNet/data_loader/__init__.py b/benchmark/PaddleOCR_DBNet/data_loader/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..afc6e56b51948a3ec237967b9cf360eea984d625 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/data_loader/__init__.py @@ -0,0 +1,106 @@ +# -*- coding: utf-8 -*- +# @Time : 2019/8/23 21:52 +# @Author : zhoujun +import copy + +import PIL +import numpy as np +import paddle +from paddle.io import DataLoader, DistributedBatchSampler, BatchSampler + +from paddle.vision import transforms + + +def get_dataset(data_path, module_name, transform, dataset_args): 
+ """ + 获取训练dataset + :param data_path: dataset文件列表,每个文件内以如下格式存储 ‘path/to/img\tlabel’ + :param module_name: 所使用的自定义dataset名称,目前只支持data_loaders.ImageDataset + :param transform: 该数据集使用的transforms + :param dataset_args: module_name的参数 + :return: 如果data_path列表不为空,返回对于的ConcatDataset对象,否则None + """ + from . import dataset + s_dataset = getattr(dataset, module_name)(transform=transform, + data_path=data_path, + **dataset_args) + return s_dataset + + +def get_transforms(transforms_config): + tr_list = [] + for item in transforms_config: + if 'args' not in item: + args = {} + else: + args = item['args'] + cls = getattr(transforms, item['type'])(**args) + tr_list.append(cls) + tr_list = transforms.Compose(tr_list) + return tr_list + + +class ICDARCollectFN: + def __init__(self, *args, **kwargs): + pass + + def __call__(self, batch): + data_dict = {} + to_tensor_keys = [] + for sample in batch: + for k, v in sample.items(): + if k not in data_dict: + data_dict[k] = [] + if isinstance(v, (np.ndarray, paddle.Tensor, PIL.Image.Image)): + if k not in to_tensor_keys: + to_tensor_keys.append(k) + data_dict[k].append(v) + for k in to_tensor_keys: + data_dict[k] = paddle.stack(data_dict[k], 0) + return data_dict + + +def get_dataloader(module_config, distributed=False): + if module_config is None: + return None + config = copy.deepcopy(module_config) + dataset_args = config['dataset']['args'] + if 'transforms' in dataset_args: + img_transfroms = get_transforms(dataset_args.pop('transforms')) + else: + img_transfroms = None + # 创建数据集 + dataset_name = config['dataset']['type'] + data_path = dataset_args.pop('data_path') + if data_path == None: + return None + + data_path = [x for x in data_path if x is not None] + if len(data_path) == 0: + return None + if 'collate_fn' not in config['loader'] or config['loader'][ + 'collate_fn'] is None or len(config['loader']['collate_fn']) == 0: + config['loader']['collate_fn'] = None + else: + config['loader']['collate_fn'] = eval(config['loader']['collate_fn'])() + + _dataset = get_dataset( + data_path=data_path, + module_name=dataset_name, + transform=img_transfroms, + dataset_args=dataset_args) + sampler = None + if distributed: + # 3)使用DistributedSampler + batch_sampler = DistributedBatchSampler( + dataset=_dataset, + batch_size=config['loader'].pop('batch_size'), + shuffle=config['loader'].pop('shuffle')) + else: + batch_sampler = BatchSampler( + dataset=_dataset, + batch_size=config['loader'].pop('batch_size'), + shuffle=config['loader'].pop('shuffle')) + loader = DataLoader( + dataset=_dataset, batch_sampler=batch_sampler, **config['loader']) + return loader diff --git a/benchmark/PaddleOCR_DBNet/data_loader/dataset.py b/benchmark/PaddleOCR_DBNet/data_loader/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..29d3954fe6b89c4585435d0221fa25c28cc8adef --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/data_loader/dataset.py @@ -0,0 +1,181 @@ +# -*- coding: utf-8 -*- +# @Time : 2019/8/23 21:54 +# @Author : zhoujun +import pathlib +import os +import cv2 +import numpy as np +import scipy.io as sio +from tqdm.auto import tqdm + +from base import BaseDataSet +from utils import order_points_clockwise, get_datalist, load, expand_polygon + + +class ICDAR2015Dataset(BaseDataSet): + def __init__(self, + data_path: str, + img_mode, + pre_processes, + filter_keys, + ignore_tags, + transform=None, + **kwargs): + super().__init__(data_path, img_mode, pre_processes, filter_keys, + ignore_tags, transform) + + def load_data(self, data_path: str) -> list: + 
data_list = get_datalist(data_path) + t_data_list = [] + for img_path, label_path in data_list: + data = self._get_annotation(label_path) + if len(data['text_polys']) > 0: + item = { + 'img_path': img_path, + 'img_name': pathlib.Path(img_path).stem + } + item.update(data) + t_data_list.append(item) + else: + print('there is no suit bbox in {}'.format(label_path)) + return t_data_list + + def _get_annotation(self, label_path: str) -> dict: + boxes = [] + texts = [] + ignores = [] + with open(label_path, encoding='utf-8', mode='r') as f: + for line in f.readlines(): + params = line.strip().strip('\ufeff').strip( + '\xef\xbb\xbf').split(',') + try: + box = order_points_clockwise( + np.array(list(map(float, params[:8]))).reshape(-1, 2)) + if cv2.contourArea(box) > 0: + boxes.append(box) + label = params[8] + texts.append(label) + ignores.append(label in self.ignore_tags) + except: + print('load label failed on {}'.format(label_path)) + data = { + 'text_polys': np.array(boxes), + 'texts': texts, + 'ignore_tags': ignores, + } + return data + + +class DetDataset(BaseDataSet): + def __init__(self, + data_path: str, + img_mode, + pre_processes, + filter_keys, + ignore_tags, + transform=None, + **kwargs): + self.load_char_annotation = kwargs['load_char_annotation'] + self.expand_one_char = kwargs['expand_one_char'] + super().__init__(data_path, img_mode, pre_processes, filter_keys, + ignore_tags, transform) + + def load_data(self, data_path: str) -> list: + """ + 从json文件中读取出 文本行的坐标和gt,字符的坐标和gt + :param data_path: + :return: + """ + data_list = [] + for path in data_path: + content = load(path) + for gt in tqdm( + content['data_list'], desc='read file {}'.format(path)): + img_path = os.path.join(content['data_root'], gt['img_name']) + polygons = [] + texts = [] + illegibility_list = [] + language_list = [] + for annotation in gt['annotations']: + if len(annotation['polygon']) == 0 or len(annotation[ + 'text']) == 0: + continue + if len(annotation['text']) > 1 and self.expand_one_char: + annotation['polygon'] = expand_polygon(annotation[ + 'polygon']) + polygons.append(annotation['polygon']) + texts.append(annotation['text']) + illegibility_list.append(annotation['illegibility']) + language_list.append(annotation['language']) + if self.load_char_annotation: + for char_annotation in annotation['chars']: + if len(char_annotation['polygon']) == 0 or len( + char_annotation['char']) == 0: + continue + polygons.append(char_annotation['polygon']) + texts.append(char_annotation['char']) + illegibility_list.append(char_annotation[ + 'illegibility']) + language_list.append(char_annotation['language']) + data_list.append({ + 'img_path': img_path, + 'img_name': gt['img_name'], + 'text_polys': np.array(polygons), + 'texts': texts, + 'ignore_tags': illegibility_list + }) + return data_list + + +class SynthTextDataset(BaseDataSet): + def __init__(self, + data_path: str, + img_mode, + pre_processes, + filter_keys, + transform=None, + **kwargs): + self.transform = transform + self.dataRoot = pathlib.Path(data_path) + if not self.dataRoot.exists(): + raise FileNotFoundError('Dataset folder is not exist.') + + self.targetFilePath = self.dataRoot / 'gt.mat' + if not self.targetFilePath.exists(): + raise FileExistsError('Target file is not exist.') + targets = {} + sio.loadmat( + self.targetFilePath, + targets, + squeeze_me=True, + struct_as_record=False, + variable_names=['imnames', 'wordBB', 'txt']) + + self.imageNames = targets['imnames'] + self.wordBBoxes = targets['wordBB'] + self.transcripts = targets['txt'] + 
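# gt.mat provides parallel arrays (imnames, wordBB, txt); load_data below walks them to build per-image sample dicts once the BaseDataSet constructor runs. +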
super().__init__(data_path, img_mode, pre_processes, filter_keys, + transform) + + def load_data(self, data_path: str) -> list: + t_data_list = [] + for imageName, wordBBoxes, texts in zip( + self.imageNames, self.wordBBoxes, self.transcripts): + item = {} + wordBBoxes = np.expand_dims( + wordBBoxes, axis=2) if (wordBBoxes.ndim == 2) else wordBBoxes + _, _, numOfWords = wordBBoxes.shape + text_polys = wordBBoxes.reshape( + [8, numOfWords], order='F').T # num_words * 8 + text_polys = text_polys.reshape(numOfWords, 4, + 2) # num_of_words * 4 * 2 + transcripts = [word for line in texts for word in line.split()] + if numOfWords != len(transcripts): + continue + item['img_path'] = str(self.dataRoot / imageName) + item['img_name'] = (self.dataRoot / imageName).stem + item['text_polys'] = text_polys + item['texts'] = transcripts + item['ignore_tags'] = [x in self.ignore_tags for x in transcripts] + t_data_list.append(item) + return t_data_list diff --git a/benchmark/PaddleOCR_DBNet/data_loader/modules/__init__.py b/benchmark/PaddleOCR_DBNet/data_loader/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bc055dae5d44876463c5eb1edfea18b96c319ca8 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/data_loader/modules/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- +# @Time : 2019/12/4 10:53 +# @Author : zhoujun +from .iaa_augment import IaaAugment +from .augment import * +from .random_crop_data import EastRandomCropData, PSERandomCrop +from .make_border_map import MakeBorderMap +from .make_shrink_map import MakeShrinkMap diff --git a/benchmark/PaddleOCR_DBNet/data_loader/modules/augment.py b/benchmark/PaddleOCR_DBNet/data_loader/modules/augment.py new file mode 100644 index 0000000000000000000000000000000000000000..e81bc123d914a7f2eb143d3c2504fab0e0f8d63f --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/data_loader/modules/augment.py @@ -0,0 +1,304 @@ +# -*- coding: utf-8 -*- +# @Time : 2019/8/23 21:52 +# @Author : zhoujun + +import math +import numbers +import random + +import cv2 +import numpy as np +from skimage.util import random_noise + + +class RandomNoise: + def __init__(self, random_rate): + self.random_rate = random_rate + + def __call__(self, data: dict): + """ + Add Gaussian noise to the image. + :param data: {'img':,'text_polys':,'texts':,'ignore_tags':} + :return: + """ + if random.random() > self.random_rate: + return data + # random_noise returns a float image in [0, 1]; scale back and keep the original dtype + data['img'] = (random_noise( + data['img'], mode='gaussian', clip=True) * 255).astype(data['img'].dtype) + return data + + +class RandomScale: + def __init__(self, scales, random_rate): + """ + :param scales: candidate scale factors + :param random_rate: probability of applying this transform + :return: + """ + self.random_rate = random_rate + self.scales = scales + + def __call__(self, data: dict) -> dict: + """ + Randomly pick one scale from `scales` and resize the image and the text polygons. + :param data: {'img':,'text_polys':,'texts':,'ignore_tags':} + :return: + """ + if random.random() > self.random_rate: + return data + im = data['img'] + text_polys = data['text_polys'] + + tmp_text_polys = text_polys.copy() + rd_scale = float(np.random.choice(self.scales)) + im = cv2.resize(im, dsize=None, fx=rd_scale, fy=rd_scale) + tmp_text_polys *= rd_scale + + data['img'] = im + data['text_polys'] = tmp_text_polys + return data + + +class RandomRotateImgBox: + def __init__(self, degrees, random_rate, same_size=False): + """ + :param degrees: rotation angle, a single number or a list/tuple + :param random_rate: probability of applying this transform + :param same_size: whether to keep the output the same size as the input image + :return: + """ + if isinstance(degrees, numbers.Number): + if degrees < 0: + raise ValueError( + "If degrees is a single number, it must be positive.") + degrees
= (-degrees, degrees) + elif isinstance(degrees, list) or isinstance( + degrees, tuple) or isinstance(degrees, np.ndarray): + if len(degrees) != 2: + raise ValueError( + "If degrees is a sequence, it must be of len 2.") + degrees = degrees + else: + raise Exception( + 'degrees must in Number or list or tuple or np.ndarray') + self.degrees = degrees + self.same_size = same_size + self.random_rate = random_rate + + def __call__(self, data: dict) -> dict: + """ + 从scales中随机选择一个尺度,对图片和文本框进行缩放 + :param data: {'img':,'text_polys':,'texts':,'ignore_tags':} + :return: + """ + if random.random() > self.random_rate: + return data + im = data['img'] + text_polys = data['text_polys'] + + # ---------------------- 旋转图像 ---------------------- + w = im.shape[1] + h = im.shape[0] + angle = np.random.uniform(self.degrees[0], self.degrees[1]) + + if self.same_size: + nw = w + nh = h + else: + # 角度变弧度 + rangle = np.deg2rad(angle) + # 计算旋转之后图像的w, h + nw = (abs(np.sin(rangle) * h) + abs(np.cos(rangle) * w)) + nh = (abs(np.cos(rangle) * h) + abs(np.sin(rangle) * w)) + # 构造仿射矩阵 + rot_mat = cv2.getRotationMatrix2D((nw * 0.5, nh * 0.5), angle, 1) + # 计算原图中心点到新图中心点的偏移量 + rot_move = np.dot(rot_mat, + np.array([(nw - w) * 0.5, (nh - h) * 0.5, 0])) + # 更新仿射矩阵 + rot_mat[0, 2] += rot_move[0] + rot_mat[1, 2] += rot_move[1] + # 仿射变换 + rot_img = cv2.warpAffine( + im, + rot_mat, (int(math.ceil(nw)), int(math.ceil(nh))), + flags=cv2.INTER_LANCZOS4) + + # ---------------------- 矫正bbox坐标 ---------------------- + # rot_mat是最终的旋转矩阵 + # 获取原始bbox的四个中点,然后将这四个点转换到旋转后的坐标系下 + rot_text_polys = list() + for bbox in text_polys: + point1 = np.dot(rot_mat, np.array([bbox[0, 0], bbox[0, 1], 1])) + point2 = np.dot(rot_mat, np.array([bbox[1, 0], bbox[1, 1], 1])) + point3 = np.dot(rot_mat, np.array([bbox[2, 0], bbox[2, 1], 1])) + point4 = np.dot(rot_mat, np.array([bbox[3, 0], bbox[3, 1], 1])) + rot_text_polys.append([point1, point2, point3, point4]) + data['img'] = rot_img + data['text_polys'] = np.array(rot_text_polys) + return data + + +class RandomResize: + def __init__(self, size, random_rate, keep_ratio=False): + """ + :param input_size: resize尺寸,数字或者list的形式,如果为list形式,就是[w,h] + :param ramdon_rate: 随机系数 + :param keep_ratio: 是否保持长宽比 + :return: + """ + if isinstance(size, numbers.Number): + if size < 0: + raise ValueError( + "If input_size is a single number, it must be positive.") + size = (size, size) + elif isinstance(size, list) or isinstance(size, tuple) or isinstance( + size, np.ndarray): + if len(size) != 2: + raise ValueError( + "If input_size is a sequence, it must be of len 2.") + size = (size[0], size[1]) + else: + raise Exception( + 'input_size must in Number or list or tuple or np.ndarray') + self.size = size + self.keep_ratio = keep_ratio + self.random_rate = random_rate + + def __call__(self, data: dict) -> dict: + """ + 从scales中随机选择一个尺度,对图片和文本框进行缩放 + :param data: {'img':,'text_polys':,'texts':,'ignore_tags':} + :return: + """ + if random.random() > self.random_rate: + return data + im = data['img'] + text_polys = data['text_polys'] + + if self.keep_ratio: + # 将图片短边pad到和长边一样 + h, w, c = im.shape + max_h = max(h, self.size[0]) + max_w = max(w, self.size[1]) + im_padded = np.zeros((max_h, max_w, c), dtype=np.uint8) + im_padded[:h, :w] = im.copy() + im = im_padded + text_polys = text_polys.astype(np.float32) + h, w, _ = im.shape + im = cv2.resize(im, self.size) + w_scale = self.size[0] / float(w) + h_scale = self.size[1] / float(h) + text_polys[:, :, 0] *= w_scale + text_polys[:, :, 1] *= h_scale + + data['img'] = im + 
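# the polygons were rescaled with the same w_scale / h_scale as the image above, so boxes stay aligned with the resized image +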
data['text_polys'] = text_polys + return data + + +def resize_image(img, short_size): + height, width, _ = img.shape + if height < width: + new_height = short_size + new_width = new_height / height * width + else: + new_width = short_size + new_height = new_width / width * height + new_height = int(round(new_height / 32) * 32) + new_width = int(round(new_width / 32) * 32) + resized_img = cv2.resize(img, (new_width, new_height)) + return resized_img, (new_width / width, new_height / height) + + +class ResizeShortSize: + def __init__(self, short_size, resize_text_polys=True): + """ + :param size: resize尺寸,数字或者list的形式,如果为list形式,就是[w,h] + :return: + """ + self.short_size = short_size + self.resize_text_polys = resize_text_polys + + def __call__(self, data: dict) -> dict: + """ + 对图片和文本框进行缩放 + :param data: {'img':,'text_polys':,'texts':,'ignore_tags':} + :return: + """ + im = data['img'] + text_polys = data['text_polys'] + + h, w, _ = im.shape + short_edge = min(h, w) + if short_edge < self.short_size: + # 保证短边 >= short_size + scale = self.short_size / short_edge + im = cv2.resize(im, dsize=None, fx=scale, fy=scale) + scale = (scale, scale) + # im, scale = resize_image(im, self.short_size) + if self.resize_text_polys: + # text_polys *= scale + text_polys[:, 0] *= scale[0] + text_polys[:, 1] *= scale[1] + + data['img'] = im + data['text_polys'] = text_polys + return data + + +class HorizontalFlip: + def __init__(self, random_rate): + """ + + :param random_rate: 随机系数 + """ + self.random_rate = random_rate + + def __call__(self, data: dict) -> dict: + """ + 从scales中随机选择一个尺度,对图片和文本框进行缩放 + :param data: {'img':,'text_polys':,'texts':,'ignore_tags':} + :return: + """ + if random.random() > self.random_rate: + return data + im = data['img'] + text_polys = data['text_polys'] + + flip_text_polys = text_polys.copy() + flip_im = cv2.flip(im, 1) + h, w, _ = flip_im.shape + flip_text_polys[:, :, 0] = w - flip_text_polys[:, :, 0] + + data['img'] = flip_im + data['text_polys'] = flip_text_polys + return data + + +class VerticallFlip: + def __init__(self, random_rate): + """ + + :param random_rate: 随机系数 + """ + self.random_rate = random_rate + + def __call__(self, data: dict) -> dict: + """ + 从scales中随机选择一个尺度,对图片和文本框进行缩放 + :param data: {'img':,'text_polys':,'texts':,'ignore_tags':} + :return: + """ + if random.random() > self.random_rate: + return data + im = data['img'] + text_polys = data['text_polys'] + + flip_text_polys = text_polys.copy() + flip_im = cv2.flip(im, 0) + h, w, _ = flip_im.shape + flip_text_polys[:, :, 1] = h - flip_text_polys[:, :, 1] + data['img'] = flip_im + data['text_polys'] = flip_text_polys + return data diff --git a/benchmark/PaddleOCR_DBNet/data_loader/modules/iaa_augment.py b/benchmark/PaddleOCR_DBNet/data_loader/modules/iaa_augment.py new file mode 100644 index 0000000000000000000000000000000000000000..1cf891bbd6fd5763aacbd60749d4dd9b6de89681 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/data_loader/modules/iaa_augment.py @@ -0,0 +1,71 @@ +# -*- coding: utf-8 -*- +# @Time : 2019/12/4 18:06 +# @Author : zhoujun +import numpy as np +import imgaug +import imgaug.augmenters as iaa + + +class AugmenterBuilder(object): + def __init__(self): + pass + + def build(self, args, root=True): + if args is None or len(args) == 0: + return None + elif isinstance(args, list): + if root: + sequence = [self.build(value, root=False) for value in args] + return iaa.Sequential(sequence) + else: + return getattr( + iaa, + args[0])(* [self.to_tuple_if_list(a) for a in args[1:]]) + elif isinstance(args, 
dict): + cls = getattr(iaa, args['type']) + return cls(**{ + k: self.to_tuple_if_list(v) + for k, v in args['args'].items() + }) + else: + raise RuntimeError('unknown augmenter arg: ' + str(args)) + + def to_tuple_if_list(self, obj): + if isinstance(obj, list): + return tuple(obj) + return obj + + +class IaaAugment(): + def __init__(self, augmenter_args): + self.augmenter_args = augmenter_args + self.augmenter = AugmenterBuilder().build(self.augmenter_args) + + def __call__(self, data): + image = data['img'] + shape = image.shape + + if self.augmenter: + aug = self.augmenter.to_deterministic() + data['img'] = aug.augment_image(image) + data = self.may_augment_annotation(aug, data, shape) + return data + + def may_augment_annotation(self, aug, data, shape): + if aug is None: + return data + + line_polys = [] + for poly in data['text_polys']: + new_poly = self.may_augment_poly(aug, shape, poly) + line_polys.append(new_poly) + data['text_polys'] = np.array(line_polys) + return data + + def may_augment_poly(self, aug, img_shape, poly): + keypoints = [imgaug.Keypoint(p[0], p[1]) for p in poly] + keypoints = aug.augment_keypoints( + [imgaug.KeypointsOnImage( + keypoints, shape=img_shape)])[0].keypoints + poly = [(p.x, p.y) for p in keypoints] + return poly diff --git a/benchmark/PaddleOCR_DBNet/data_loader/modules/make_border_map.py b/benchmark/PaddleOCR_DBNet/data_loader/modules/make_border_map.py new file mode 100644 index 0000000000000000000000000000000000000000..2985f3c8a01c67efb71b5279edf95dd3f9fe5680 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/data_loader/modules/make_border_map.py @@ -0,0 +1,143 @@ +import cv2 +import numpy as np +np.seterr(divide='ignore', invalid='ignore') +import pyclipper +from shapely.geometry import Polygon + + +class MakeBorderMap(): + def __init__(self, shrink_ratio=0.4, thresh_min=0.3, thresh_max=0.7): + self.shrink_ratio = shrink_ratio + self.thresh_min = thresh_min + self.thresh_max = thresh_max + + def __call__(self, data: dict) -> dict: + """ + 从scales中随机选择一个尺度,对图片和文本框进行缩放 + :param data: {'img':,'text_polys':,'texts':,'ignore_tags':} + :return: + """ + im = data['img'] + text_polys = data['text_polys'] + ignore_tags = data['ignore_tags'] + + canvas = np.zeros(im.shape[:2], dtype=np.float32) + mask = np.zeros(im.shape[:2], dtype=np.float32) + + for i in range(len(text_polys)): + if ignore_tags[i]: + continue + self.draw_border_map(text_polys[i], canvas, mask=mask) + canvas = canvas * (self.thresh_max - self.thresh_min) + self.thresh_min + + data['threshold_map'] = canvas + data['threshold_mask'] = mask + return data + + def draw_border_map(self, polygon, canvas, mask): + polygon = np.array(polygon) + assert polygon.ndim == 2 + assert polygon.shape[1] == 2 + + polygon_shape = Polygon(polygon) + if polygon_shape.area <= 0: + return + distance = polygon_shape.area * ( + 1 - np.power(self.shrink_ratio, 2)) / polygon_shape.length + subject = [tuple(l) for l in polygon] + padding = pyclipper.PyclipperOffset() + padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON) + + padded_polygon = np.array(padding.Execute(distance)[0]) + cv2.fillPoly(mask, [padded_polygon.astype(np.int32)], 1.0) + + xmin = padded_polygon[:, 0].min() + xmax = padded_polygon[:, 0].max() + ymin = padded_polygon[:, 1].min() + ymax = padded_polygon[:, 1].max() + width = xmax - xmin + 1 + height = ymax - ymin + 1 + + polygon[:, 0] = polygon[:, 0] - xmin + polygon[:, 1] = polygon[:, 1] - ymin + + xs = np.broadcast_to( + np.linspace( + 0, width - 1, num=width).reshape(1, width), 
(height, width)) + ys = np.broadcast_to( + np.linspace( + 0, height - 1, num=height).reshape(height, 1), (height, width)) + + distance_map = np.zeros( + (polygon.shape[0], height, width), dtype=np.float32) + for i in range(polygon.shape[0]): + j = (i + 1) % polygon.shape[0] + absolute_distance = self.distance(xs, ys, polygon[i], polygon[j]) + distance_map[i] = np.clip(absolute_distance / distance, 0, 1) + distance_map = distance_map.min(axis=0) + + xmin_valid = min(max(0, xmin), canvas.shape[1] - 1) + xmax_valid = min(max(0, xmax), canvas.shape[1] - 1) + ymin_valid = min(max(0, ymin), canvas.shape[0] - 1) + ymax_valid = min(max(0, ymax), canvas.shape[0] - 1) + canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1] = np.fmax( + 1 - distance_map[ymin_valid - ymin:ymax_valid - ymax + height, + xmin_valid - xmin:xmax_valid - xmax + width], + canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1]) + + def distance(self, xs, ys, point_1, point_2): + ''' + compute the distance from point to a line + ys: coordinates in the first axis + xs: coordinates in the second axis + point_1, point_2: (x, y), the end of the line + ''' + height, width = xs.shape[:2] + square_distance_1 = np.square(xs - point_1[0]) + np.square(ys - point_1[ + 1]) + square_distance_2 = np.square(xs - point_2[0]) + np.square(ys - point_2[ + 1]) + square_distance = np.square(point_1[0] - point_2[0]) + np.square( + point_1[1] - point_2[1]) + + cosin = (square_distance - square_distance_1 - square_distance_2) / ( + 2 * np.sqrt(square_distance_1 * square_distance_2)) + square_sin = 1 - np.square(cosin) + square_sin = np.nan_to_num(square_sin) + + result = np.sqrt(square_distance_1 * square_distance_2 * square_sin / + square_distance) + result[cosin < + 0] = np.sqrt(np.fmin(square_distance_1, square_distance_2))[cosin + < 0] + # self.extend_line(point_1, point_2, result) + return result + + def extend_line(self, point_1, point_2, result): + ex_point_1 = (int( + round(point_1[0] + (point_1[0] - point_2[0]) * ( + 1 + self.shrink_ratio))), int( + round(point_1[1] + (point_1[1] - point_2[1]) * ( + 1 + self.shrink_ratio)))) + cv2.line( + result, + tuple(ex_point_1), + tuple(point_1), + 4096.0, + 1, + lineType=cv2.LINE_AA, + shift=0) + ex_point_2 = (int( + round(point_2[0] + (point_2[0] - point_1[0]) * ( + 1 + self.shrink_ratio))), int( + round(point_2[1] + (point_2[1] - point_1[1]) * ( + 1 + self.shrink_ratio)))) + cv2.line( + result, + tuple(ex_point_2), + tuple(point_2), + 4096.0, + 1, + lineType=cv2.LINE_AA, + shift=0) + return ex_point_1, ex_point_2 diff --git a/benchmark/PaddleOCR_DBNet/data_loader/modules/make_shrink_map.py b/benchmark/PaddleOCR_DBNet/data_loader/modules/make_shrink_map.py new file mode 100644 index 0000000000000000000000000000000000000000..3f268b9dead349538e9f47d5b960feea27f90c51 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/data_loader/modules/make_shrink_map.py @@ -0,0 +1,133 @@ +import numpy as np +import cv2 + + +def shrink_polygon_py(polygon, shrink_ratio): + """ + 对框进行缩放,返回去的比例为1/shrink_ratio 即可 + """ + cx = polygon[:, 0].mean() + cy = polygon[:, 1].mean() + polygon[:, 0] = cx + (polygon[:, 0] - cx) * shrink_ratio + polygon[:, 1] = cy + (polygon[:, 1] - cy) * shrink_ratio + return polygon + + +def shrink_polygon_pyclipper(polygon, shrink_ratio): + from shapely.geometry import Polygon + import pyclipper + polygon_shape = Polygon(polygon) + distance = polygon_shape.area * ( + 1 - np.power(shrink_ratio, 2)) / polygon_shape.length + subject = [tuple(l) for l in polygon] + padding = 
pyclipper.PyclipperOffset() + padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON) + shrinked = padding.Execute(-distance) + if shrinked == []: + shrinked = np.array(shrinked) + else: + shrinked = np.array(shrinked[0]).reshape(-1, 2) + return shrinked + + +class MakeShrinkMap(): + r''' + Making binary mask from detection data with ICDAR format. + Typically following the process of class `MakeICDARData`. + ''' + + def __init__(self, + min_text_size=8, + shrink_ratio=0.4, + shrink_type='pyclipper'): + shrink_func_dict = { + 'py': shrink_polygon_py, + 'pyclipper': shrink_polygon_pyclipper + } + self.shrink_func = shrink_func_dict[shrink_type] + self.min_text_size = min_text_size + self.shrink_ratio = shrink_ratio + + def __call__(self, data: dict) -> dict: + """ + 从scales中随机选择一个尺度,对图片和文本框进行缩放 + :param data: {'img':,'text_polys':,'texts':,'ignore_tags':} + :return: + """ + image = data['img'] + text_polys = data['text_polys'] + ignore_tags = data['ignore_tags'] + + h, w = image.shape[:2] + text_polys, ignore_tags = self.validate_polygons(text_polys, + ignore_tags, h, w) + gt = np.zeros((h, w), dtype=np.float32) + mask = np.ones((h, w), dtype=np.float32) + for i in range(len(text_polys)): + polygon = text_polys[i] + height = max(polygon[:, 1]) - min(polygon[:, 1]) + width = max(polygon[:, 0]) - min(polygon[:, 0]) + if ignore_tags[i] or min(height, width) < self.min_text_size: + cv2.fillPoly(mask, + polygon.astype(np.int32)[np.newaxis, :, :], 0) + ignore_tags[i] = True + else: + shrinked = self.shrink_func(polygon, self.shrink_ratio) + if shrinked.size == 0: + cv2.fillPoly(mask, + polygon.astype(np.int32)[np.newaxis, :, :], 0) + ignore_tags[i] = True + continue + cv2.fillPoly(gt, [shrinked.astype(np.int32)], 1) + + data['shrink_map'] = gt + data['shrink_mask'] = mask + return data + + def validate_polygons(self, polygons, ignore_tags, h, w): + ''' + polygons (numpy.array, required): of shape (num_instances, num_points, 2) + ''' + if len(polygons) == 0: + return polygons, ignore_tags + assert len(polygons) == len(ignore_tags) + for polygon in polygons: + polygon[:, 0] = np.clip(polygon[:, 0], 0, w - 1) + polygon[:, 1] = np.clip(polygon[:, 1], 0, h - 1) + + for i in range(len(polygons)): + area = self.polygon_area(polygons[i]) + if abs(area) < 1: + ignore_tags[i] = True + if area > 0: + polygons[i] = polygons[i][::-1, :] + return polygons, ignore_tags + + def polygon_area(self, polygon): + return cv2.contourArea(polygon) + # edge = 0 + # for i in range(polygon.shape[0]): + # next_index = (i + 1) % polygon.shape[0] + # edge += (polygon[next_index, 0] - polygon[i, 0]) * (polygon[next_index, 1] - polygon[i, 1]) + # + # return edge / 2. 
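+ # Usage sketch (illustration only, not exercised by the benchmark): MakeShrinkMap consumes the sample
+ # dict produced by the datasets and adds 'shrink_map' / 'shrink_mask' with the image's spatial shape, e.g.
+ #   sample = {'img': np.zeros((640, 640, 3), dtype=np.uint8),
+ #             'text_polys': np.array([[[10, 10], [100, 10], [100, 60], [10, 60]]], dtype=np.float32),
+ #             'texts': ['demo'], 'ignore_tags': [False]}
+ #   sample = MakeShrinkMap(shrink_ratio=0.4)(sample)  # sample['shrink_map'].shape == (640, 640)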
+ + +if __name__ == '__main__': + from shapely.geometry import Polygon + import pyclipper + + polygon = np.array([[0, 0], [100, 10], [100, 100], [10, 90]]) + a = shrink_polygon_py(polygon, 0.4) + print(a) + print(shrink_polygon_py(a, 1 / 0.4)) + b = shrink_polygon_pyclipper(polygon, 0.4) + print(b) + poly = Polygon(b) + distance = poly.area * 1.5 / poly.length + offset = pyclipper.PyclipperOffset() + offset.AddPath(b, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON) + expanded = np.array(offset.Execute(distance)) + bounding_box = cv2.minAreaRect(expanded) + points = cv2.boxPoints(bounding_box) + print(points) diff --git a/benchmark/PaddleOCR_DBNet/data_loader/modules/random_crop_data.py b/benchmark/PaddleOCR_DBNet/data_loader/modules/random_crop_data.py new file mode 100644 index 0000000000000000000000000000000000000000..fac2e4c07cfca966e6c17231f2ab4bce702f191f --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/data_loader/modules/random_crop_data.py @@ -0,0 +1,206 @@ +import random + +import cv2 +import numpy as np + + +# random crop algorithm similar to https://github.com/argman/EAST +class EastRandomCropData(): + def __init__(self, + size=(640, 640), + max_tries=50, + min_crop_side_ratio=0.1, + require_original_image=False, + keep_ratio=True): + self.size = size + self.max_tries = max_tries + self.min_crop_side_ratio = min_crop_side_ratio + self.require_original_image = require_original_image + self.keep_ratio = keep_ratio + + def __call__(self, data: dict) -> dict: + """ + 从scales中随机选择一个尺度,对图片和文本框进行缩放 + :param data: {'img':,'text_polys':,'texts':,'ignore_tags':} + :return: + """ + im = data['img'] + text_polys = data['text_polys'] + ignore_tags = data['ignore_tags'] + texts = data['texts'] + all_care_polys = [ + text_polys[i] for i, tag in enumerate(ignore_tags) if not tag + ] + # 计算crop区域 + crop_x, crop_y, crop_w, crop_h = self.crop_area(im, all_care_polys) + # crop 图片 保持比例填充 + scale_w = self.size[0] / crop_w + scale_h = self.size[1] / crop_h + scale = min(scale_w, scale_h) + h = int(crop_h * scale) + w = int(crop_w * scale) + if self.keep_ratio: + if len(im.shape) == 3: + padimg = np.zeros((self.size[1], self.size[0], im.shape[2]), + im.dtype) + else: + padimg = np.zeros((self.size[1], self.size[0]), im.dtype) + padimg[:h, :w] = cv2.resize( + im[crop_y:crop_y + crop_h, crop_x:crop_x + crop_w], (w, h)) + img = padimg + else: + img = cv2.resize(im[crop_y:crop_y + crop_h, crop_x:crop_x + crop_w], + tuple(self.size)) + # crop 文本框 + text_polys_crop = [] + ignore_tags_crop = [] + texts_crop = [] + for poly, text, tag in zip(text_polys, texts, ignore_tags): + poly = ((poly - (crop_x, crop_y)) * scale).tolist() + if not self.is_poly_outside_rect(poly, 0, 0, w, h): + text_polys_crop.append(poly) + ignore_tags_crop.append(tag) + texts_crop.append(text) + data['img'] = img + data['text_polys'] = np.float32(text_polys_crop) + data['ignore_tags'] = ignore_tags_crop + data['texts'] = texts_crop + return data + + def is_poly_in_rect(self, poly, x, y, w, h): + poly = np.array(poly) + if poly[:, 0].min() < x or poly[:, 0].max() > x + w: + return False + if poly[:, 1].min() < y or poly[:, 1].max() > y + h: + return False + return True + + def is_poly_outside_rect(self, poly, x, y, w, h): + poly = np.array(poly) + if poly[:, 0].max() < x or poly[:, 0].min() > x + w: + return True + if poly[:, 1].max() < y or poly[:, 1].min() > y + h: + return True + return False + + def split_regions(self, axis): + regions = [] + min_axis = 0 + for i in range(1, axis.shape[0]): + if axis[i] != axis[i - 1] + 1: + region = 
axis[min_axis:i] + min_axis = i + regions.append(region) + return regions + + def random_select(self, axis, max_size): + xx = np.random.choice(axis, size=2) + xmin = np.min(xx) + xmax = np.max(xx) + xmin = np.clip(xmin, 0, max_size - 1) + xmax = np.clip(xmax, 0, max_size - 1) + return xmin, xmax + + def region_wise_random_select(self, regions, max_size): + selected_index = list(np.random.choice(len(regions), 2)) + selected_values = [] + for index in selected_index: + axis = regions[index] + xx = int(np.random.choice(axis, size=1)) + selected_values.append(xx) + xmin = min(selected_values) + xmax = max(selected_values) + return xmin, xmax + + def crop_area(self, im, text_polys): + h, w = im.shape[:2] + h_array = np.zeros(h, dtype=np.int32) + w_array = np.zeros(w, dtype=np.int32) + for points in text_polys: + points = np.round(points, decimals=0).astype(np.int32) + minx = np.min(points[:, 0]) + maxx = np.max(points[:, 0]) + w_array[minx:maxx] = 1 + miny = np.min(points[:, 1]) + maxy = np.max(points[:, 1]) + h_array[miny:maxy] = 1 + # ensure the cropped area not across a text + h_axis = np.where(h_array == 0)[0] + w_axis = np.where(w_array == 0)[0] + + if len(h_axis) == 0 or len(w_axis) == 0: + return 0, 0, w, h + + h_regions = self.split_regions(h_axis) + w_regions = self.split_regions(w_axis) + + for i in range(self.max_tries): + if len(w_regions) > 1: + xmin, xmax = self.region_wise_random_select(w_regions, w) + else: + xmin, xmax = self.random_select(w_axis, w) + if len(h_regions) > 1: + ymin, ymax = self.region_wise_random_select(h_regions, h) + else: + ymin, ymax = self.random_select(h_axis, h) + + if xmax - xmin < self.min_crop_side_ratio * w or ymax - ymin < self.min_crop_side_ratio * h: + # area too small + continue + num_poly_in_rect = 0 + for poly in text_polys: + if not self.is_poly_outside_rect(poly, xmin, ymin, xmax - xmin, + ymax - ymin): + num_poly_in_rect += 1 + break + + if num_poly_in_rect > 0: + return xmin, ymin, xmax - xmin, ymax - ymin + + return 0, 0, w, h + + +class PSERandomCrop(): + def __init__(self, size): + self.size = size + + def __call__(self, data): + imgs = data['imgs'] + + h, w = imgs[0].shape[0:2] + th, tw = self.size + if w == tw and h == th: + return imgs + + # label中存在文本实例,并且按照概率进行裁剪,使用threshold_label_map控制 + if np.max(imgs[2]) > 0 and random.random() > 3 / 8: + # 文本实例的左上角点 + tl = np.min(np.where(imgs[2] > 0), axis=1) - self.size + tl[tl < 0] = 0 + # 文本实例的右下角点 + br = np.max(np.where(imgs[2] > 0), axis=1) - self.size + br[br < 0] = 0 + # 保证选到右下角点时,有足够的距离进行crop + br[0] = min(br[0], h - th) + br[1] = min(br[1], w - tw) + + for _ in range(50000): + i = random.randint(tl[0], br[0]) + j = random.randint(tl[1], br[1]) + # 保证shrink_label_map有文本 + if imgs[1][i:i + th, j:j + tw].sum() <= 0: + continue + else: + break + else: + i = random.randint(0, h - th) + j = random.randint(0, w - tw) + + # return i, j, th, tw + for idx in range(len(imgs)): + if len(imgs[idx].shape) == 3: + imgs[idx] = imgs[idx][i:i + th, j:j + tw, :] + else: + imgs[idx] = imgs[idx][i:i + th, j:j + tw] + data['imgs'] = imgs + return data diff --git a/benchmark/PaddleOCR_DBNet/environment.yml b/benchmark/PaddleOCR_DBNet/environment.yml new file mode 100644 index 0000000000000000000000000000000000000000..571dbf2a0462a842e17420cbd2bdba2b56b62131 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/environment.yml @@ -0,0 +1,21 @@ +name: dbnet +channels: + - conda-forge + - defaults +dependencies: + - anyconfig==0.9.10 + - future==0.18.2 + - imgaug==0.4.0 + - matplotlib==3.1.2 + - numpy==1.17.4 + - 
opencv + - pyclipper + - PyYAML==5.2 + - scikit-image==0.16.2 + - Shapely==1.6.4 + - tensorboard=2 + - tqdm==4.40.1 + - ipython + - pip + - pip: + - polygon3 diff --git a/benchmark/PaddleOCR_DBNet/eval.sh b/benchmark/PaddleOCR_DBNet/eval.sh new file mode 100644 index 0000000000000000000000000000000000000000..b3bf46818610caedca6690a95713a020b678e2bf --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/eval.sh @@ -0,0 +1 @@ +CUDA_VISIBLE_DEVICES=0 python3 tools/eval.py --model_path '' \ No newline at end of file diff --git a/benchmark/PaddleOCR_DBNet/generate_lists.sh b/benchmark/PaddleOCR_DBNet/generate_lists.sh new file mode 100644 index 0000000000000000000000000000000000000000..84f408c64330f0aefb684ed738fa2dcdbc4af6da --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/generate_lists.sh @@ -0,0 +1,17 @@ +#Only use if your file names of the images and txts are identical +rm ./datasets/train_img.txt +rm ./datasets/train_gt.txt +rm ./datasets/test_img.txt +rm ./datasets/test_gt.txt +rm ./datasets/train.txt +rm ./datasets/test.txt +ls ./datasets/train/img/*.jpg > ./datasets/train_img.txt +ls ./datasets/train/gt/*.txt > ./datasets/train_gt.txt +ls ./datasets/test/img/*.jpg > ./datasets/test_img.txt +ls ./datasets/test/gt/*.txt > ./datasets/test_gt.txt +paste ./datasets/train_img.txt ./datasets/train_gt.txt > ./datasets/train.txt +paste ./datasets/test_img.txt ./datasets/test_gt.txt > ./datasets/test.txt +rm ./datasets/train_img.txt +rm ./datasets/train_gt.txt +rm ./datasets/test_img.txt +rm ./datasets/test_gt.txt diff --git a/benchmark/PaddleOCR_DBNet/imgs/paper/db.jpg b/benchmark/PaddleOCR_DBNet/imgs/paper/db.jpg new file mode 100644 index 0000000000000000000000000000000000000000..aa6c7e9890551abb9aaf39fe76db67cb5588507b Binary files /dev/null and b/benchmark/PaddleOCR_DBNet/imgs/paper/db.jpg differ diff --git a/benchmark/PaddleOCR_DBNet/models/__init__.py b/benchmark/PaddleOCR_DBNet/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..26ff73ff690bf56c8d5cfb64a37bbffb706da7e2 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/models/__init__.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- +# @Time : 2019/8/23 21:55 +# @Author : zhoujun +import copy +from .model import Model +from .losses import build_loss + +__all__ = ['build_loss', 'build_model'] +support_model = ['Model'] + + +def build_model(config): + """ + get architecture model class + """ + copy_config = copy.deepcopy(config) + arch_type = copy_config.pop('type') + assert arch_type in support_model, f'{arch_type} is not developed yet!, only {support_model} are support now' + arch_model = eval(arch_type)(copy_config) + return arch_model diff --git a/benchmark/PaddleOCR_DBNet/models/backbone/__init__.py b/benchmark/PaddleOCR_DBNet/models/backbone/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..740c8d5ff09311def6ca465aae40c34261518102 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/models/backbone/__init__.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- +# @Time : 2019/8/23 21:54 +# @Author : zhoujun + +from .resnet import * + +__all__ = ['build_backbone'] + +support_backbone = [ + 'resnet18', 'deformable_resnet18', 'deformable_resnet50', 'resnet50', + 'resnet34', 'resnet101', 'resnet152' +] + + +def build_backbone(backbone_name, **kwargs): + assert backbone_name in support_backbone, f'all support backbone is {support_backbone}' + backbone = eval(backbone_name)(**kwargs) + return backbone diff --git a/benchmark/PaddleOCR_DBNet/models/backbone/resnet.py 
b/benchmark/PaddleOCR_DBNet/models/backbone/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..9b30b382d98fec00d396dabc4f12f20ad8c77389 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/models/backbone/resnet.py @@ -0,0 +1,375 @@ +import math +import paddle +from paddle import nn + +BatchNorm2d = nn.BatchNorm2D + +__all__ = [ + 'ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', + 'deformable_resnet18', 'deformable_resnet50', 'resnet152' +] + +model_urls = { + 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', + 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', + 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', + 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', + 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', +} + + +def constant_init(module, constant, bias=0): + module.weight = paddle.create_parameter( + shape=module.weight.shape, + dtype='float32', + default_initializer=paddle.nn.initializer.Constant(constant)) + if hasattr(module, 'bias'): + module.bias = paddle.create_parameter( + shape=module.bias.shape, + dtype='float32', + default_initializer=paddle.nn.initializer.Constant(bias)) + + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2D( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=1, + bias_attr=False) + + +class BasicBlock(nn.Layer): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None, dcn=None): + super(BasicBlock, self).__init__() + self.with_dcn = dcn is not None + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = BatchNorm2d(planes, momentum=0.1) + self.relu = nn.ReLU() + self.with_modulated_dcn = False + if not self.with_dcn: + self.conv2 = nn.Conv2D( + planes, planes, kernel_size=3, padding=1, bias_attr=False) + else: + from paddle.vision.ops import DeformConv2D + deformable_groups = dcn.get('deformable_groups', 1) + offset_channels = 18 + self.conv2_offset = nn.Conv2D( + planes, + deformable_groups * offset_channels, + kernel_size=3, + padding=1) + self.conv2 = DeformConv2D( + planes, planes, kernel_size=3, padding=1, bias_attr=False) + self.bn2 = BatchNorm2d(planes, momentum=0.1) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + # out = self.conv2(out) + if not self.with_dcn: + out = self.conv2(out) + else: + offset = self.conv2_offset(out) + out = self.conv2(out, offset) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Layer): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None, dcn=None): + super(Bottleneck, self).__init__() + self.with_dcn = dcn is not None + self.conv1 = nn.Conv2D(inplanes, planes, kernel_size=1, bias_attr=False) + self.bn1 = BatchNorm2d(planes, momentum=0.1) + self.with_modulated_dcn = False + if not self.with_dcn: + self.conv2 = nn.Conv2D( + planes, + planes, + kernel_size=3, + stride=stride, + padding=1, + bias_attr=False) + else: + deformable_groups = dcn.get('deformable_groups', 1) + from paddle.vision.ops import DeformConv2D + offset_channels = 18 + self.conv2_offset = nn.Conv2D( + planes, + deformable_groups * offset_channels, + stride=stride, + kernel_size=3, + padding=1) + self.conv2 = DeformConv2D( + planes,
+ planes, + kernel_size=3, + padding=1, + stride=stride, + bias_attr=False) + self.bn2 = BatchNorm2d(planes, momentum=0.1) + self.conv3 = nn.Conv2D( + planes, planes * 4, kernel_size=1, bias_attr=False) + self.bn3 = BatchNorm2d(planes * 4, momentum=0.1) + self.relu = nn.ReLU() + self.downsample = downsample + self.stride = stride + self.dcn = dcn + self.with_dcn = dcn is not None + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + # out = self.conv2(out) + if not self.with_dcn: + out = self.conv2(out) + else: + offset = self.conv2_offset(out) + out = self.conv2(out, offset) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class ResNet(nn.Layer): + def __init__(self, block, layers, in_channels=3, dcn=None): + self.dcn = dcn + self.inplanes = 64 + super(ResNet, self).__init__() + self.out_channels = [] + self.conv1 = nn.Conv2D( + in_channels, + 64, + kernel_size=7, + stride=2, + padding=3, + bias_attr=False) + self.bn1 = BatchNorm2d(64, momentum=0.1) + self.relu = nn.ReLU() + self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2, dcn=dcn) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2, dcn=dcn) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2, dcn=dcn) + + if self.dcn is not None: + for m in self.modules(): + if isinstance(m, Bottleneck) or isinstance(m, BasicBlock): + if hasattr(m, 'conv2_offset'): + constant_init(m.conv2_offset, 0) + + def _make_layer(self, block, planes, blocks, stride=1, dcn=None): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2D( + self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias_attr=False), + BatchNorm2d( + planes * block.expansion, momentum=0.1), ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample, dcn=dcn)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes, dcn=dcn)) + self.out_channels.append(planes * block.expansion) + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x2 = self.layer1(x) + x3 = self.layer2(x2) + x4 = self.layer3(x3) + x5 = self.layer4(x4) + + return x2, x3, x4, x5 + + +def load_torch_params(paddle_model, torch_patams): + paddle_params = paddle_model.state_dict() + + fc_names = ['classifier'] + for key, torch_value in torch_patams.items(): + if 'num_batches_tracked' in key: + continue + key = key.replace("running_var", "_variance").replace( + "running_mean", "_mean").replace("module.", "") + torch_value = torch_value.detach().cpu().numpy() + if key in paddle_params: + flag = [i in key for i in fc_names] + if any(flag) and "weight" in key: # ignore bias + new_shape = [1, 0] + list(range(2, torch_value.ndim)) + print( + f"name: {key}, ori shape: {torch_value.shape}, new shape: {torch_value.transpose(new_shape).shape}" + ) + torch_value = torch_value.transpose(new_shape) + paddle_params[key] = torch_value + else: + print(f'{key} not in paddle') + paddle_model.set_state_dict(paddle_params) + + +def load_models(model, model_name): + import torch.utils.model_zoo as 
model_zoo + torch_patams = model_zoo.load_url(model_urls[model_name]) + load_torch_params(model, torch_patams) + + +def resnet18(pretrained=True, **kwargs): + """Constructs a ResNet-18 model. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs) + if pretrained: + assert kwargs.get( + 'in_channels', + 3) == 3, 'in_channels must be 3 whem pretrained is True' + print('load from imagenet') + load_models(model, 'resnet18') + return model + + +def deformable_resnet18(pretrained=True, **kwargs): + """Constructs a ResNet-18 model. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet( + BasicBlock, [2, 2, 2, 2], dcn=dict(deformable_groups=1), **kwargs) + if pretrained: + assert kwargs.get( + 'in_channels', + 3) == 3, 'in_channels must be 3 whem pretrained is True' + print('load from imagenet') + model.load_state_dict( + model_zoo.load_url(model_urls['resnet18']), strict=False) + return model + + +def resnet34(pretrained=True, **kwargs): + """Constructs a ResNet-34 model. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs) + if pretrained: + assert kwargs.get( + 'in_channels', + 3) == 3, 'in_channels must be 3 whem pretrained is True' + model.load_state_dict( + model_zoo.load_url(model_urls['resnet34']), strict=False) + return model + + +def resnet50(pretrained=True, **kwargs): + """Constructs a ResNet-50 model. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) + if pretrained: + assert kwargs.get( + 'in_channels', + 3) == 3, 'in_channels must be 3 whem pretrained is True' + load_models(model, 'resnet50') + return model + + +def deformable_resnet50(pretrained=True, **kwargs): + """Constructs a ResNet-50 model with deformable conv. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet( + Bottleneck, [3, 4, 6, 3], dcn=dict(deformable_groups=1), **kwargs) + if pretrained: + assert kwargs.get( + 'in_channels', + 3) == 3, 'in_channels must be 3 whem pretrained is True' + model.load_state_dict( + model_zoo.load_url(model_urls['resnet50']), strict=False) + return model + + +def resnet101(pretrained=True, **kwargs): + """Constructs a ResNet-101 model. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs) + if pretrained: + assert kwargs.get( + 'in_channels', + 3) == 3, 'in_channels must be 3 whem pretrained is True' + model.load_state_dict( + model_zoo.load_url(model_urls['resnet101']), strict=False) + return model + + +def resnet152(pretrained=True, **kwargs): + """Constructs a ResNet-152 model. 
+ Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs) + if pretrained: + assert kwargs.get( + 'in_channels', + 3) == 3, 'in_channels must be 3 whem pretrained is True' + model.load_state_dict( + model_zoo.load_url(model_urls['resnet152']), strict=False) + return model + + +if __name__ == '__main__': + + x = paddle.zeros([2, 3, 640, 640]) + net = resnet50(pretrained=True) + y = net(x) + for u in y: + print(u.shape) + + print(net.out_channels) diff --git a/benchmark/PaddleOCR_DBNet/models/basic.py b/benchmark/PaddleOCR_DBNet/models/basic.py new file mode 100644 index 0000000000000000000000000000000000000000..f661878df7e1a7a9cbadc2f58d532c3af4949589 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/models/basic.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +# @Time : 2019/12/6 11:19 +# @Author : zhoujun +from paddle import nn + + +class ConvBnRelu(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + padding_mode='zeros', + inplace=True): + super().__init__() + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias_attr=bias, + padding_mode=padding_mode) + self.bn = nn.BatchNorm2D(out_channels) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x diff --git a/benchmark/PaddleOCR_DBNet/models/head/DBHead.py b/benchmark/PaddleOCR_DBNet/models/head/DBHead.py new file mode 100644 index 0000000000000000000000000000000000000000..29277cec9d3eb60929e2bda2e78d08856008f0b4 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/models/head/DBHead.py @@ -0,0 +1,138 @@ +# -*- coding: utf-8 -*- +# @Time : 2019/12/4 14:54 +# @Author : zhoujun +import paddle +from paddle import nn, ParamAttr + + +class DBHead(nn.Layer): + def __init__(self, in_channels, out_channels, k=50): + super().__init__() + self.k = k + self.binarize = nn.Sequential( + nn.Conv2D( + in_channels, + in_channels // 4, + 3, + padding=1, + weight_attr=ParamAttr( + initializer=nn.initializer.KaimingNormal())), + nn.BatchNorm2D( + in_channels // 4, + weight_attr=ParamAttr(initializer=nn.initializer.Constant(1)), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(1e-4))), + nn.ReLU(), + nn.Conv2DTranspose( + in_channels // 4, + in_channels // 4, + 2, + 2, + weight_attr=ParamAttr( + initializer=nn.initializer.KaimingNormal())), + nn.BatchNorm2D( + in_channels // 4, + weight_attr=ParamAttr(initializer=nn.initializer.Constant(1)), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(1e-4))), + nn.ReLU(), + nn.Conv2DTranspose( + in_channels // 4, + 1, + 2, + 2, + weight_attr=nn.initializer.KaimingNormal()), + nn.Sigmoid()) + + self.thresh = self._init_thresh(in_channels) + + def forward(self, x): + shrink_maps = self.binarize(x) + threshold_maps = self.thresh(x) + if self.training: + binary_maps = self.step_function(shrink_maps, threshold_maps) + y = paddle.concat( + (shrink_maps, threshold_maps, binary_maps), axis=1) + else: + y = paddle.concat((shrink_maps, threshold_maps), axis=1) + return y + + def _init_thresh(self, + inner_channels, + serial=False, + smooth=False, + bias=False): + in_channels = inner_channels + if serial: + in_channels += 1 + self.thresh = nn.Sequential( + nn.Conv2D( + in_channels, + inner_channels // 4, + 3, + padding=1, + bias_attr=bias, + weight_attr=ParamAttr( + 
initializer=nn.initializer.KaimingNormal())), + nn.BatchNorm2D( + inner_channels // 4, + weight_attr=ParamAttr(initializer=nn.initializer.Constant(1)), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(1e-4))), + nn.ReLU(), + self._init_upsample( + inner_channels // 4, + inner_channels // 4, + smooth=smooth, + bias=bias), + nn.BatchNorm2D( + inner_channels // 4, + weight_attr=ParamAttr(initializer=nn.initializer.Constant(1)), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(1e-4))), + nn.ReLU(), + self._init_upsample( + inner_channels // 4, 1, smooth=smooth, bias=bias), + nn.Sigmoid()) + return self.thresh + + def _init_upsample(self, + in_channels, + out_channels, + smooth=False, + bias=False): + if smooth: + inter_out_channels = out_channels + if out_channels == 1: + inter_out_channels = in_channels + module_list = [ + nn.Upsample( + scale_factor=2, mode='nearest'), nn.Conv2D( + in_channels, + inter_out_channels, + 3, + 1, + 1, + bias_attr=bias, + weight_attr=ParamAttr( + initializer=nn.initializer.KaimingNormal())) + ] + if out_channels == 1: + module_list.append( + nn.Conv2D( + in_channels, + out_channels, + kernel_size=1, + stride=1, + padding=1, + bias_attr=True, + weight_attr=ParamAttr( + initializer=nn.initializer.KaimingNormal()))) + return nn.Sequential(module_list) + else: + return nn.Conv2DTranspose( + in_channels, + out_channels, + 2, + 2, + weight_attr=ParamAttr( + initializer=nn.initializer.KaimingNormal())) + + def step_function(self, x, y): + return paddle.reciprocal(1 + paddle.exp(-self.k * (x - y))) diff --git a/benchmark/PaddleOCR_DBNet/models/head/__init__.py b/benchmark/PaddleOCR_DBNet/models/head/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5610c69754ebd1d8d7aa1b69773e91672481e418 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/models/head/__init__.py @@ -0,0 +1,13 @@ +# -*- coding: utf-8 -*- +# @Time : 2020/6/5 11:35 +# @Author : zhoujun +from .DBHead import DBHead + +__all__ = ['build_head'] +support_head = ['DBHead'] + + +def build_head(head_name, **kwargs): + assert head_name in support_head, f'all support head is {support_head}' + head = eval(head_name)(**kwargs) + return head \ No newline at end of file diff --git a/benchmark/PaddleOCR_DBNet/models/losses/DB_loss.py b/benchmark/PaddleOCR_DBNet/models/losses/DB_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..74d240c17b11ae784203a3916b06925a0a2f3af6 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/models/losses/DB_loss.py @@ -0,0 +1,49 @@ +import paddle +from models.losses.basic_loss import BalanceCrossEntropyLoss, MaskL1Loss, DiceLoss + + +class DBLoss(paddle.nn.Layer): + def __init__(self, + alpha=1.0, + beta=10, + ohem_ratio=3, + reduction='mean', + eps=1e-06): + """ + Implement PSE Loss. 
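+ (Used in this repo as the DBNet loss: balanced BCE on the shrink map, masked L1 on the threshold map and, when a binary map is predicted, an extra dice term; see forward below.)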
+        :param alpha: coefficient of the binary_map (shrink map) loss
+        :param beta: coefficient of the threshold_map loss
+        :param ohem_ratio: negative:positive sampling ratio used for OHEM
+        :param reduction: 'mean' or 'sum', average or sum the loss over the batch
+        """
+        super().__init__()
+        assert reduction in ['mean', 'sum'], "reduction must be in ['mean', 'sum']"
+        self.alpha = alpha
+        self.beta = beta
+        self.bce_loss = BalanceCrossEntropyLoss(negative_ratio=ohem_ratio)
+        self.dice_loss = DiceLoss(eps=eps)
+        self.l1_loss = MaskL1Loss(eps=eps)
+        self.ohem_ratio = ohem_ratio
+        self.reduction = reduction
+
+    def forward(self, pred, batch):
+        shrink_maps = pred[:, 0, :, :]
+        threshold_maps = pred[:, 1, :, :]
+        binary_maps = pred[:, 2, :, :]
+        loss_shrink_maps = self.bce_loss(shrink_maps, batch['shrink_map'],
+                                         batch['shrink_mask'])
+        loss_threshold_maps = self.l1_loss(
+            threshold_maps, batch['threshold_map'], batch['threshold_mask'])
+        metrics = dict(
+            loss_shrink_maps=loss_shrink_maps,
+            loss_threshold_maps=loss_threshold_maps)
+        if pred.shape[1] > 2:
+            loss_binary_maps = self.dice_loss(binary_maps, batch['shrink_map'],
+                                              batch['shrink_mask'])
+            metrics['loss_binary_maps'] = loss_binary_maps
+            loss_all = (self.alpha * loss_shrink_maps + self.beta *
+                        loss_threshold_maps + loss_binary_maps)
+            metrics['loss'] = loss_all
+        else:
+            metrics['loss'] = loss_shrink_maps
+        return metrics
diff --git a/benchmark/PaddleOCR_DBNet/models/losses/__init__.py b/benchmark/PaddleOCR_DBNet/models/losses/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9dc0f1033b66999bb9cc7edb3c39d80836963c56
--- /dev/null
+++ b/benchmark/PaddleOCR_DBNet/models/losses/__init__.py
@@ -0,0 +1,16 @@
+# -*- coding: utf-8 -*-
+# @Time : 2020/6/5 11:36
+# @Author : zhoujun
+import copy
+from .DB_loss import DBLoss
+
+__all__ = ['build_loss']
+support_loss = ['DBLoss']
+
+
+def build_loss(config):
+    copy_config = copy.deepcopy(config)
+    loss_type = copy_config.pop('type')
+    assert loss_type in support_loss, f'all support loss is {support_loss}'
+    criterion = eval(loss_type)(**copy_config)
+    return criterion
diff --git a/benchmark/PaddleOCR_DBNet/models/losses/basic_loss.py b/benchmark/PaddleOCR_DBNet/models/losses/basic_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e68cb172aea81afc5bd6c5d75dedd6841c59456
--- /dev/null
+++ b/benchmark/PaddleOCR_DBNet/models/losses/basic_loss.py
@@ -0,0 +1,97 @@
+# -*- coding: utf-8 -*-
+# @Time : 2019/12/4 14:39
+# @Author : zhoujun
+import paddle
+import paddle.nn as nn
+
+
+class BalanceCrossEntropyLoss(nn.Layer):
+    '''
+    Balanced cross entropy loss.
+    Shape:
+        - Input: :math:`(N, 1, H, W)`
+        - GT: :math:`(N, 1, H, W)`, same shape as the input
+        - Mask: :math:`(N, H, W)`, same spatial shape as the input
+        - Output: scalar.
+
+    '''
+
+    def __init__(self, negative_ratio=3.0, eps=1e-6):
+        super(BalanceCrossEntropyLoss, self).__init__()
+        self.negative_ratio = negative_ratio
+        self.eps = eps
+
+    def forward(self,
+                pred: paddle.Tensor,
+                gt: paddle.Tensor,
+                mask: paddle.Tensor,
+                return_origin=False):
+        '''
+        Args:
+            pred: shape :math:`(N, 1, H, W)`, the prediction of network
+            gt: shape :math:`(N, 1, H, W)`, the target
+            mask: shape :math:`(N, H, W)`, the mask indicates positive regions
+        '''
+        positive = (gt * mask)
+        negative = ((1 - gt) * mask)
+        positive_count = int(positive.sum())
+        negative_count = min(
+            int(negative.sum()), int(positive_count * self.negative_ratio))
+        loss = nn.functional.binary_cross_entropy(pred, gt, reduction='none')
+        positive_loss = loss * positive
+        negative_loss = loss * negative
+        negative_loss, _ = negative_loss.reshape([-1]).topk(negative_count)
+
+        balance_loss = (positive_loss.sum() + negative_loss.sum()) / (
+            positive_count + negative_count + self.eps)
+
+        if return_origin:
+            return balance_loss, loss
+        return balance_loss
+
+
+class DiceLoss(nn.Layer):
+    '''
+    Loss function from https://arxiv.org/abs/1707.03237,
+    where the IoU computation is applied in a heatmap manner to measure the
+    difference between two heatmaps.
+    '''
+
+    def __init__(self, eps=1e-6):
+        super(DiceLoss, self).__init__()
+        self.eps = eps
+
+    def forward(self, pred: paddle.Tensor, gt, mask, weights=None):
+        '''
+        pred: one or two heatmaps of shape (N, 1, H, W),
+            the losses of the two heatmaps are added together.
+        gt: (N, 1, H, W)
+        mask: (N, H, W)
+        '''
+        return self._compute(pred, gt, mask, weights)
+
+    def _compute(self, pred, gt, mask, weights):
+        if len(pred.shape) == 4:
+            pred = pred[:, 0, :, :]
+            gt = gt[:, 0, :, :]
+        assert pred.shape == gt.shape
+        assert pred.shape == mask.shape
+        if weights is not None:
+            assert weights.shape == mask.shape
+            mask = weights * mask
+        intersection = (pred * gt * mask).sum()
+
+        union = (pred * mask).sum() + (gt * mask).sum() + self.eps
+        loss = 1 - 2.0 * intersection / union
+        assert loss <= 1
+        return loss
+
+
+class MaskL1Loss(nn.Layer):
+    def __init__(self, eps=1e-6):
+        super(MaskL1Loss, self).__init__()
+        self.eps = eps
+
+    def forward(self, pred: paddle.Tensor, gt, mask):
+        loss = (paddle.abs(pred - gt) * mask).sum() / (mask.sum() + self.eps)
+        return loss
diff --git a/benchmark/PaddleOCR_DBNet/models/model.py b/benchmark/PaddleOCR_DBNet/models/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee24ff5b3d6b5a6c30b64de56a0aa83e9960001e
--- /dev/null
+++ b/benchmark/PaddleOCR_DBNet/models/model.py
@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+# @Time : 2019/8/23 21:57
+# @Author : zhoujun
+from addict import Dict
+from paddle import nn
+import paddle.nn.functional as F
+
+from models.backbone import build_backbone
+from models.neck import build_neck
+from models.head import build_head
+
+
+class Model(nn.Layer):
+    def __init__(self, model_config: dict):
+        """
+        Text detection model assembled from a backbone, a neck and a head
+        :param model_config: model configuration dict
+        """
+        super().__init__()
+        model_config = Dict(model_config)
+        backbone_type = model_config.backbone.pop('type')
+        neck_type = model_config.neck.pop('type')
+        head_type = model_config.head.pop('type')
+        self.backbone = build_backbone(backbone_type, **model_config.backbone)
+        self.neck = build_neck(
+            neck_type,
+            in_channels=self.backbone.out_channels,
+            **model_config.neck)
+        self.head = build_head(
+            head_type, in_channels=self.neck.out_channels, **model_config.head)
+        self.name = f'{backbone_type}_{neck_type}_{head_type}'
+
+    def forward(self, x):
+ _, _, H, W = x.shape + backbone_out = self.backbone(x) + neck_out = self.neck(backbone_out) + y = self.head(neck_out) + y = F.interpolate(y, size=(H, W), mode='bilinear', align_corners=True) + return y diff --git a/benchmark/PaddleOCR_DBNet/models/neck/FPN.py b/benchmark/PaddleOCR_DBNet/models/neck/FPN.py new file mode 100644 index 0000000000000000000000000000000000000000..53a3fa4b80c556ba87fa559494771f72acae0788 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/models/neck/FPN.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- +# @Time : 2019/9/13 10:29 +# @Author : zhoujun +import paddle +import paddle.nn.functional as F +from paddle import nn + +from models.basic import ConvBnRelu + + +class FPN(nn.Layer): + def __init__(self, in_channels, inner_channels=256, **kwargs): + """ + :param in_channels: 基础网络输出的维度 + :param kwargs: + """ + super().__init__() + inplace = True + self.conv_out = inner_channels + inner_channels = inner_channels // 4 + # reduce layers + self.reduce_conv_c2 = ConvBnRelu( + in_channels[0], inner_channels, kernel_size=1, inplace=inplace) + self.reduce_conv_c3 = ConvBnRelu( + in_channels[1], inner_channels, kernel_size=1, inplace=inplace) + self.reduce_conv_c4 = ConvBnRelu( + in_channels[2], inner_channels, kernel_size=1, inplace=inplace) + self.reduce_conv_c5 = ConvBnRelu( + in_channels[3], inner_channels, kernel_size=1, inplace=inplace) + # Smooth layers + self.smooth_p4 = ConvBnRelu( + inner_channels, + inner_channels, + kernel_size=3, + padding=1, + inplace=inplace) + self.smooth_p3 = ConvBnRelu( + inner_channels, + inner_channels, + kernel_size=3, + padding=1, + inplace=inplace) + self.smooth_p2 = ConvBnRelu( + inner_channels, + inner_channels, + kernel_size=3, + padding=1, + inplace=inplace) + + self.conv = nn.Sequential( + nn.Conv2D( + self.conv_out, + self.conv_out, + kernel_size=3, + padding=1, + stride=1), + nn.BatchNorm2D(self.conv_out), + nn.ReLU()) + self.out_channels = self.conv_out + + def forward(self, x): + c2, c3, c4, c5 = x + # Top-down + p5 = self.reduce_conv_c5(c5) + p4 = self._upsample_add(p5, self.reduce_conv_c4(c4)) + p4 = self.smooth_p4(p4) + p3 = self._upsample_add(p4, self.reduce_conv_c3(c3)) + p3 = self.smooth_p3(p3) + p2 = self._upsample_add(p3, self.reduce_conv_c2(c2)) + p2 = self.smooth_p2(p2) + + x = self._upsample_cat(p2, p3, p4, p5) + x = self.conv(x) + return x + + def _upsample_add(self, x, y): + return F.interpolate(x, size=y.shape[2:]) + y + + def _upsample_cat(self, p2, p3, p4, p5): + h, w = p2.shape[2:] + p3 = F.interpolate(p3, size=(h, w)) + p4 = F.interpolate(p4, size=(h, w)) + p5 = F.interpolate(p5, size=(h, w)) + return paddle.concat([p2, p3, p4, p5], axis=1) diff --git a/benchmark/PaddleOCR_DBNet/models/neck/__init__.py b/benchmark/PaddleOCR_DBNet/models/neck/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..76553413784df2d5d87824e71d8de9b95ce7ce7e --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/models/neck/__init__.py @@ -0,0 +1,13 @@ +# -*- coding: utf-8 -*- +# @Time : 2020/6/5 11:34 +# @Author : zhoujun +from .FPN import FPN + +__all__ = ['build_neck'] +support_neck = ['FPN'] + + +def build_neck(neck_name, **kwargs): + assert neck_name in support_neck, f'all support neck is {support_neck}' + neck = eval(neck_name)(**kwargs) + return neck diff --git a/benchmark/PaddleOCR_DBNet/multi_gpu_train.sh b/benchmark/PaddleOCR_DBNet/multi_gpu_train.sh new file mode 100644 index 0000000000000000000000000000000000000000..b49a73f1581d8b40c20aeb01fd6ccc37fd6ab24b --- /dev/null +++ 
b/benchmark/PaddleOCR_DBNet/multi_gpu_train.sh @@ -0,0 +1,2 @@ +# export NCCL_P2P_DISABLE=1 +CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m paddle.distributed.launch tools/train.py --config_file "config/icdar2015_resnet50_FPN_DBhead_polyLR.yaml" \ No newline at end of file diff --git a/benchmark/PaddleOCR_DBNet/post_processing/__init__.py b/benchmark/PaddleOCR_DBNet/post_processing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2f8e43223dfaf509c1bec58e828dd98a4630eb90 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/post_processing/__init__.py @@ -0,0 +1,13 @@ +# -*- coding: utf-8 -*- +# @Time : 2019/12/5 15:17 +# @Author : zhoujun + +from .seg_detector_representer import SegDetectorRepresenter + + +def get_post_processing(config): + try: + cls = eval(config['type'])(**config['args']) + return cls + except: + return None \ No newline at end of file diff --git a/benchmark/PaddleOCR_DBNet/post_processing/seg_detector_representer.py b/benchmark/PaddleOCR_DBNet/post_processing/seg_detector_representer.py new file mode 100644 index 0000000000000000000000000000000000000000..f1273dcfcce0a73cc76f841354aed2623219a8d8 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/post_processing/seg_detector_representer.py @@ -0,0 +1,192 @@ +import cv2 +import numpy as np +import pyclipper +import paddle +from shapely.geometry import Polygon + + +class SegDetectorRepresenter(): + def __init__(self, + thresh=0.3, + box_thresh=0.7, + max_candidates=1000, + unclip_ratio=1.5): + self.min_size = 3 + self.thresh = thresh + self.box_thresh = box_thresh + self.max_candidates = max_candidates + self.unclip_ratio = unclip_ratio + + def __call__(self, batch, pred, is_output_polygon=False): + ''' + batch: (image, polygons, ignore_tags + batch: a dict produced by dataloaders. + image: tensor of shape (N, C, H, W). + polygons: tensor of shape (N, K, 4, 2), the polygons of objective regions. + ignore_tags: tensor of shape (N, K), indicates whether a region is ignorable or not. + shape: the original shape of images. + filename: the original filenames of images. 
+ pred: + binary: text region segmentation map, with shape (N, H, W) + thresh: [if exists] thresh hold prediction with shape (N, H, W) + thresh_binary: [if exists] binarized with threshhold, (N, H, W) + ''' + if isinstance(pred, paddle.Tensor): + pred = pred.numpy() + pred = pred[:, 0, :, :] + segmentation = self.binarize(pred) + boxes_batch = [] + scores_batch = [] + for batch_index in range(pred.shape[0]): + height, width = batch['shape'][batch_index] + if is_output_polygon: + boxes, scores = self.polygons_from_bitmap( + pred[batch_index], segmentation[batch_index], width, height) + else: + boxes, scores = self.boxes_from_bitmap( + pred[batch_index], segmentation[batch_index], width, height) + boxes_batch.append(boxes) + scores_batch.append(scores) + return boxes_batch, scores_batch + + def binarize(self, pred): + return pred > self.thresh + + def polygons_from_bitmap(self, pred, _bitmap, dest_width, dest_height): + ''' + _bitmap: single map with shape (H, W), + whose values are binarized as {0, 1} + ''' + + assert len(_bitmap.shape) == 2 + bitmap = _bitmap # The first channel + height, width = bitmap.shape + boxes = [] + scores = [] + + contours, _ = cv2.findContours((bitmap * 255).astype(np.uint8), + cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) + + for contour in contours[:self.max_candidates]: + epsilon = 0.005 * cv2.arcLength(contour, True) + approx = cv2.approxPolyDP(contour, epsilon, True) + points = approx.reshape((-1, 2)) + if points.shape[0] < 4: + continue + # _, sside = self.get_mini_boxes(contour) + # if sside < self.min_size: + # continue + score = self.box_score_fast(pred, contour.squeeze(1)) + if self.box_thresh > score: + continue + + if points.shape[0] > 2: + box = self.unclip(points, unclip_ratio=self.unclip_ratio) + if len(box) > 1: + continue + else: + continue + box = box.reshape(-1, 2) + _, sside = self.get_mini_boxes(box.reshape((-1, 1, 2))) + if sside < self.min_size + 2: + continue + + if not isinstance(dest_width, int): + dest_width = dest_width.item() + dest_height = dest_height.item() + + box[:, 0] = np.clip( + np.round(box[:, 0] / width * dest_width), 0, dest_width) + box[:, 1] = np.clip( + np.round(box[:, 1] / height * dest_height), 0, dest_height) + boxes.append(box) + scores.append(score) + return boxes, scores + + def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height): + ''' + _bitmap: single map with shape (H, W), + whose values are binarized as {0, 1} + ''' + + assert len(_bitmap.shape) == 2 + bitmap = _bitmap # The first channel + height, width = bitmap.shape + contours, _ = cv2.findContours((bitmap * 255).astype(np.uint8), + cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) + num_contours = min(len(contours), self.max_candidates) + boxes = np.zeros((num_contours, 4, 2), dtype=np.int16) + scores = np.zeros((num_contours, ), dtype=np.float32) + + for index in range(num_contours): + contour = contours[index].squeeze(1) + points, sside = self.get_mini_boxes(contour) + if sside < self.min_size: + continue + points = np.array(points) + score = self.box_score_fast(pred, contour) + if self.box_thresh > score: + continue + + box = self.unclip( + points, unclip_ratio=self.unclip_ratio).reshape(-1, 1, 2) + box, sside = self.get_mini_boxes(box) + if sside < self.min_size + 2: + continue + box = np.array(box) + if not isinstance(dest_width, int): + dest_width = dest_width.item() + dest_height = dest_height.item() + + box[:, 0] = np.clip( + np.round(box[:, 0] / width * dest_width), 0, dest_width) + box[:, 1] = np.clip( + np.round(box[:, 1] / height * dest_height), 0, 
dest_height) + boxes[index, :, :] = box.astype(np.int16) + scores[index] = score + return boxes, scores + + def unclip(self, box, unclip_ratio=1.5): + poly = Polygon(box) + distance = poly.area * unclip_ratio / poly.length + offset = pyclipper.PyclipperOffset() + offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON) + expanded = np.array(offset.Execute(distance)) + return expanded + + def get_mini_boxes(self, contour): + bounding_box = cv2.minAreaRect(contour) + points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0]) + + index_1, index_2, index_3, index_4 = 0, 1, 2, 3 + if points[1][1] > points[0][1]: + index_1 = 0 + index_4 = 1 + else: + index_1 = 1 + index_4 = 0 + if points[3][1] > points[2][1]: + index_2 = 2 + index_3 = 3 + else: + index_2 = 3 + index_3 = 2 + + box = [ + points[index_1], points[index_2], points[index_3], points[index_4] + ] + return box, min(bounding_box[1]) + + def box_score_fast(self, bitmap, _box): + h, w = bitmap.shape[:2] + box = _box.copy() + xmin = np.clip(np.floor(box[:, 0].min()).astype(np.int), 0, w - 1) + xmax = np.clip(np.ceil(box[:, 0].max()).astype(np.int), 0, w - 1) + ymin = np.clip(np.floor(box[:, 1].min()).astype(np.int), 0, h - 1) + ymax = np.clip(np.ceil(box[:, 1].max()).astype(np.int), 0, h - 1) + + mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8) + box[:, 0] = box[:, 0] - xmin + box[:, 1] = box[:, 1] - ymin + cv2.fillPoly(mask, box.reshape(1, -1, 2).astype(np.int32), 1) + return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0] diff --git a/benchmark/PaddleOCR_DBNet/predict.sh b/benchmark/PaddleOCR_DBNet/predict.sh new file mode 100644 index 0000000000000000000000000000000000000000..37ab148283024c2360b7b13df24e5cc5b5cdaa4f --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/predict.sh @@ -0,0 +1 @@ +CUDA_VISIBLE_DEVICES=0 python tools/predict.py --model_path model_best.pth --input_folder ./input --output_folder ./output --thre 0.7 --polygon --show --save_result \ No newline at end of file diff --git a/benchmark/PaddleOCR_DBNet/requirement.txt b/benchmark/PaddleOCR_DBNet/requirement.txt new file mode 100644 index 0000000000000000000000000000000000000000..191819f32c8e5e6d318201f56a84b1faa5e3bce7 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/requirement.txt @@ -0,0 +1,13 @@ +anyconfig +future +imgaug +matplotlib +numpy +opencv-python +Polygon3 +pyclipper +PyYAML +scikit-image +Shapely +tqdm +addict \ No newline at end of file diff --git a/benchmark/PaddleOCR_DBNet/singlel_gpu_train.sh b/benchmark/PaddleOCR_DBNet/singlel_gpu_train.sh new file mode 100644 index 0000000000000000000000000000000000000000..f8b9f0e89b25e4e00e129cdfb3c0a771846d8c56 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/singlel_gpu_train.sh @@ -0,0 +1 @@ +CUDA_VISIBLE_DEVICES=0 python3 tools/train.py --config_file "config/icdar2015_resnet50_FPN_DBhead_polyLR.yaml" \ No newline at end of file diff --git a/benchmark/PaddleOCR_DBNet/test/README.MD b/benchmark/PaddleOCR_DBNet/test/README.MD new file mode 100644 index 0000000000000000000000000000000000000000..b43c6e9a1990ab290f07bca1187e4cdefd46ff1e --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/test/README.MD @@ -0,0 +1,8 @@ +Place the images that you want to detect here. You better named them as such: +img_10.jpg +img_11.jpg +img_{img_id}.jpg + +For predicting single images, you can change the `img_path` in the `/tools/predict.py` to your image number. 
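The `unclip` step in `seg_detector_representer.py` above (which `tools/predict.py` relies on) is the part of the post-processing that is easiest to get wrong when porting it: each minimum-area box is grown by an offset distance derived from its area and perimeter before being rescaled and clipped to the destination image size. A minimal, self-contained sketch of that expansion, using made-up toy coordinates:

```python
import numpy as np
import pyclipper
from shapely.geometry import Polygon

# Toy 100x40 quadrilateral, in the point order produced by get_mini_boxes.
box = np.array([[10, 10], [110, 10], [110, 50], [10, 50]])

unclip_ratio = 1.5
poly = Polygon(box)
# Same rule as SegDetectorRepresenter.unclip: the offset distance grows with
# the box area and shrinks with its perimeter (poly.length).
distance = poly.area * unclip_ratio / poly.length  # 4000 * 1.5 / 280 ≈ 21.4

offset = pyclipper.PyclipperOffset()
offset.AddPath(box.tolist(), pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
expanded = np.array(offset.Execute(distance)[0])
print(expanded.shape)  # (M, 2) with M > 4, since JT_ROUND adds vertices at corners
```

Note that `polygons_from_bitmap` above drops a candidate whenever `Execute` returns more than one path (`len(box) > 1`), and the expanded coordinates are afterwards clipped into the destination width and height with `np.clip`.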
+ +The result will be saved in the output_folder(default is test/output) you give in predict.sh \ No newline at end of file diff --git a/benchmark/PaddleOCR_DBNet/test_tipc/benchmark_train.sh b/benchmark/PaddleOCR_DBNet/test_tipc/benchmark_train.sh new file mode 100644 index 0000000000000000000000000000000000000000..d94dac2f43bed428236b265e8c7336d98a03c980 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/test_tipc/benchmark_train.sh @@ -0,0 +1,287 @@ +#!/bin/bash +source test_tipc/common_func.sh + +# run benchmark sh +# Usage: +# bash run_benchmark_train.sh config.txt params +# or +# bash run_benchmark_train.sh config.txt + +function func_parser_params(){ + strs=$1 + IFS="=" + array=(${strs}) + tmp=${array[1]} + echo ${tmp} +} + +function set_dynamic_epoch(){ + string=$1 + num=$2 + _str=${string:1:6} + IFS="C" + arr=(${_str}) + M=${arr[0]} + P=${arr[1]} + ep=`expr $num \* $M \* $P` + echo $ep +} + +function func_sed_params(){ + filename=$1 + line=$2 + param_value=$3 + params=`sed -n "${line}p" $filename` + IFS=":" + array=(${params}) + key=${array[0]} + value=${array[1]} + + new_params="${key}:${param_value}" + IFS=";" + cmd="sed -i '${line}s/.*/${new_params}/' '${filename}'" + eval $cmd +} + +function set_gpu_id(){ + string=$1 + _str=${string:1:6} + IFS="C" + arr=(${_str}) + M=${arr[0]} + P=${arr[1]} + gn=`expr $P - 1` + gpu_num=`expr $gn / $M` + seq=`seq -s "," 0 $gpu_num` + echo $seq +} + +function get_repo_name(){ + IFS=";" + cur_dir=$(pwd) + IFS="/" + arr=(${cur_dir}) + echo ${arr[-1]} +} + +FILENAME=$1 +# copy FILENAME as new +new_filename="./test_tipc/benchmark_train.txt" +cmd=`yes|cp $FILENAME $new_filename` +FILENAME=$new_filename +# MODE must be one of ['benchmark_train'] +MODE=$2 +PARAMS=$3 + +to_static="" +# parse "to_static" options and modify trainer into "to_static_trainer" +if [[ $PARAMS =~ "dynamicTostatic" ]] ;then + to_static="d2sT_" + sed -i 's/trainer:norm_train/trainer:to_static_train/g' $FILENAME + # clear PARAM contents + if [ $PARAMS = "to_static" ] ;then + PARAMS="" + fi +fi +# bash test_tipc/benchmark_train.sh test_tipc/configs/det_mv3_db_v2_0/train_benchmark.txt benchmark_train dynamic_bs8_fp32_DP_N1C8 +# bash test_tipc/benchmark_train.sh test_tipc/configs/det_mv3_db_v2_0/train_benchmark.txt benchmark_train dynamicTostatic_bs8_fp32_DP_N1C8 +# bash test_tipc/benchmark_train.sh test_tipc/configs/det_mv3_db_v2_0/train_benchmark.txt benchmark_train dynamic_bs8_null_DP_N1C1 +IFS=$'\n' +# parser params from train_benchmark.txt +dataline=`cat $FILENAME` +# parser params +IFS=$'\n' +lines=(${dataline}) +model_name=$(func_parser_value "${lines[1]}") +python_name=$(func_parser_value "${lines[2]}") + +# set env +python=${python_name} +export str_tmp=$(echo `pip list|grep paddlepaddle-gpu|awk -F ' ' '{print $2}'`) +export frame_version=${str_tmp%%.post*} +export frame_commit=$(echo `${python} -c "import paddle;print(paddle.version.commit)"`) + +# 获取benchmark_params所在的行数 +line_num=`grep -n -w "train_benchmark_params" $FILENAME | cut -d ":" -f 1` +# for train log parser +batch_size=$(func_parser_value "${lines[line_num]}") +line_num=`expr $line_num + 1` +fp_items=$(func_parser_value "${lines[line_num]}") +line_num=`expr $line_num + 1` +epoch=$(func_parser_value "${lines[line_num]}") + +line_num=`expr $line_num + 1` +profile_option_key=$(func_parser_key "${lines[line_num]}") +profile_option_params=$(func_parser_value "${lines[line_num]}") +profile_option="${profile_option_key}:${profile_option_params}" + +line_num=`expr $line_num + 1` +flags_value=$(func_parser_value 
"${lines[line_num]}") +# set flags +IFS=";" +flags_list=(${flags_value}) +for _flag in ${flags_list[*]}; do + cmd="export ${_flag}" + eval $cmd +done + +# set log_name +repo_name=$(get_repo_name ) +SAVE_LOG=${BENCHMARK_LOG_DIR:-$(pwd)} # */benchmark_log +mkdir -p "${SAVE_LOG}/benchmark_log/" +status_log="${SAVE_LOG}/benchmark_log/results.log" + +# The number of lines in which train params can be replaced. +line_python=3 +line_gpuid=4 +line_precision=6 +line_epoch=7 +line_batchsize=9 +line_profile=13 +line_eval_py=24 +line_export_py=30 + +func_sed_params "$FILENAME" "${line_eval_py}" "null" +func_sed_params "$FILENAME" "${line_export_py}" "null" +func_sed_params "$FILENAME" "${line_python}" "$python" + +# if params +if [ ! -n "$PARAMS" ] ;then + # PARAMS input is not a word. + IFS="|" + batch_size_list=(${batch_size}) + fp_items_list=(${fp_items}) + device_num_list=(N1C4) + run_mode="DP" +elif [[ ${PARAMS} = "dynamicTostatic" ]];then + IFS="|" + model_type=$PARAMS + batch_size_list=(${batch_size}) + fp_items_list=(${fp_items}) + device_num_list=(N1C4) + run_mode="DP" +else + # parser params from input: modeltype_bs${bs_item}_${fp_item}_${run_mode}_${device_num} + IFS="_" + params_list=(${PARAMS}) + model_type=${params_list[0]} + batch_size=${params_list[1]} + batch_size=`echo ${batch_size} | tr -cd "[0-9]" ` + precision=${params_list[2]} + run_mode=${params_list[3]} + device_num=${params_list[4]} + IFS=";" + + if [ ${precision} = "fp16" ];then + precision="amp" + fi + + epoch=$(set_dynamic_epoch $device_num $epoch) + fp_items_list=($precision) + batch_size_list=($batch_size) + device_num_list=($device_num) +fi + +IFS="|" +for batch_size in ${batch_size_list[*]}; do + for train_precision in ${fp_items_list[*]}; do + for device_num in ${device_num_list[*]}; do + # sed batchsize and precision + if [ ${train_precision} = "amp" ];then + precision="fp16" + else + precision="fp32" + fi + + func_sed_params "$FILENAME" "${line_precision}" "$train_precision" + func_sed_params "$FILENAME" "${line_batchsize}" "$MODE=$batch_size" + func_sed_params "$FILENAME" "${line_epoch}" "$MODE=$epoch" + gpu_id=$(set_gpu_id $device_num) + + if [ ${#gpu_id} -le 1 ];then + log_path="$SAVE_LOG/profiling_log" + mkdir -p $log_path + log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}profiling" + func_sed_params "$FILENAME" "${line_gpuid}" "0" # sed used gpu_id + # set profile_option params + tmp=`sed -i "${line_profile}s/.*/${profile_option}/" "${FILENAME}"` + + # run test_train_inference_python.sh + cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 " + echo $cmd + eval $cmd + eval "cat ${log_path}/${log_name}" + + # without profile + log_path="$SAVE_LOG/train_log" + speed_log_path="$SAVE_LOG/index" + mkdir -p $log_path + mkdir -p $speed_log_path + log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}log" + speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}speed" + func_sed_params "$FILENAME" "${line_profile}" "null" # sed profile_id as null + cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 " + echo $cmd + job_bt=`date '+%Y%m%d%H%M%S'` + eval $cmd + job_et=`date '+%Y%m%d%H%M%S'` + export model_run_time=$((${job_et}-${job_bt})) + eval "cat ${log_path}/${log_name}" + + # parser log + 
_model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}" + cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \ + --speed_log_file '${speed_log_path}/${speed_log_name}' \ + --model_name ${_model_name} \ + --base_batch_size ${batch_size} \ + --run_mode ${run_mode} \ + --fp_item ${precision} \ + --keyword ips: \ + --skip_steps 2 \ + --device_num ${device_num} \ + --speed_unit samples/s \ + --convergence_key loss: " + echo $cmd + eval $cmd + last_status=${PIPESTATUS[0]} + status_check $last_status "${cmd}" "${status_log}" + else + IFS=";" + unset_env=`unset CUDA_VISIBLE_DEVICES` + log_path="$SAVE_LOG/train_log" + speed_log_path="$SAVE_LOG/index" + mkdir -p $log_path + mkdir -p $speed_log_path + log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}log" + speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}speed" + func_sed_params "$FILENAME" "${line_gpuid}" "$gpu_id" # sed used gpu_id + func_sed_params "$FILENAME" "${line_profile}" "null" # sed --profile_option as null + cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 " + echo $cmd + job_bt=`date '+%Y%m%d%H%M%S'` + eval $cmd + job_et=`date '+%Y%m%d%H%M%S'` + export model_run_time=$((${job_et}-${job_bt})) + eval "cat ${log_path}/${log_name}" + # parser log + _model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}" + + cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \ + --speed_log_file '${speed_log_path}/${speed_log_name}' \ + --model_name ${_model_name} \ + --base_batch_size ${batch_size} \ + --run_mode ${run_mode} \ + --fp_item ${precision} \ + --keyword ips: \ + --skip_steps 2 \ + --device_num ${device_num} \ + --speed_unit images/s \ + --convergence_key loss: " + echo $cmd + eval $cmd + last_status=${PIPESTATUS[0]} + status_check $last_status "${cmd}" "${status_log}" + fi + done + done +done diff --git a/benchmark/PaddleOCR_DBNet/test_tipc/common_func.sh b/benchmark/PaddleOCR_DBNet/test_tipc/common_func.sh new file mode 100644 index 0000000000000000000000000000000000000000..c123d3cf6e6487ea6b0d5ef1a108e0994a7f1eb4 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/test_tipc/common_func.sh @@ -0,0 +1,67 @@ +#!/bin/bash + +function func_parser_key(){ + strs=$1 + IFS=":" + array=(${strs}) + tmp=${array[0]} + echo ${tmp} +} + +function func_parser_value(){ + strs=$1 + IFS=":" + array=(${strs}) + tmp=${array[1]} + echo ${tmp} +} + +function func_set_params(){ + key=$1 + value=$2 + if [ ${key}x = "null"x ];then + echo " " + elif [[ ${value} = "null" ]] || [[ ${value} = " " ]] || [ ${#value} -le 0 ];then + echo " " + else + echo "${key}=${value}" + fi +} + +function func_parser_params(){ + strs=$1 + MODE=$2 + IFS=":" + array=(${strs}) + key=${array[0]} + tmp=${array[1]} + IFS="|" + res="" + for _params in ${tmp[*]}; do + IFS="=" + array=(${_params}) + mode=${array[0]} + value=${array[1]} + if [[ ${mode} = ${MODE} ]]; then + IFS="|" + #echo $(func_set_params "${mode}" "${value}") + echo $value + break + fi + IFS="|" + done + echo ${res} +} + +function status_check(){ + last_status=$1 # the exit code + run_command=$2 + run_log=$3 + model_name=$4 + log_path=$5 + if [ $last_status -eq 0 ]; then + echo -e "\033[33m Run successfully with command - ${model_name} - ${run_command} - ${log_path} \033[0m" | tee -a ${run_log} + else + echo -e "\033[33m Run failed with command - ${model_name} - 
${run_command} - ${log_path} \033[0m" | tee -a ${run_log} + fi +} \ No newline at end of file diff --git a/benchmark/PaddleOCR_DBNet/test_tipc/configs/det_res50_db/train_infer_python.txt b/benchmark/PaddleOCR_DBNet/test_tipc/configs/det_res50_db/train_infer_python.txt new file mode 100644 index 0000000000000000000000000000000000000000..7dc3da35fe6fdef0ebfaa43609d28650667d6e62 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/test_tipc/configs/det_res50_db/train_infer_python.txt @@ -0,0 +1,61 @@ +===========================train_params=========================== +model_name:det_res50_db +python:python +gpu_list:0|0,1 +trainer.use_gpu:True|True +amp:null +trainer.epochs:lite_train_lite_infer=1|whole_train_whole_infer=300 +trainer.output_dir:./output/ +dataset.train.loader.batch_size:lite_train_lite_infer=8|whole_train_lite_infer=8 +trainer.finetune_checkpoint:null +train_model_name:checkpoint/model_latest.pth +train_infer_img_dir:imgs/paper/db.jpg +null:null +## +trainer:norm_train +norm_train:tools/train.py --config_file config/icdar2015_resnet50_FPN_DBhead_polyLR.yaml -o trainer.log_iter=1 trainer.enable_eval=False dataset.train.loader.shuffle=false arch.backbone.pretrained=False +quant_export:null +fpgm_export:null +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:null +null:null +## +===========================infer_params=========================== +trainer.output_dir:./output/ +trainer.resume_checkpoint: +norm_export:tools/export_model.py --config_file config/icdar2015_resnet50_FPN_DBhead_polyLR.yaml -o +quant_export:null +fpgm_export:null +distill_export:null +export1:null +export2:null +## +train_model:./inference/det_r50_vd_db_v2.0_train/best_accuracy +infer_export:tools/export_model.py --config_file config/icdar2015_resnet50_FPN_DBhead_polyLR.yaml -o +infer_quant:False +inference:tools/infer.py +--use_gpu:True|False +--enable_mkldnn:False +--cpu_threads:6 +--batch_size:1 +--use_tensorrt:False +--precision:fp32 +--model_dir: +--img_path:imgs/paper/db.jpg +--save_log_path:null +--benchmark:True +null:null +===========================infer_benchmark_params========================== +random_infer_input:[{float32,[3,640,640]}];[{float32,[3,960,960]}] +===========================train_benchmark_params========================== +batch_size:8 +fp_items:fp32|fp16 +epoch:2 +--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile +flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096 +===========================to_static_train_benchmark_params=========================== +to_static_train:trainer.to_static=true diff --git a/benchmark/PaddleOCR_DBNet/test_tipc/prepare.sh b/benchmark/PaddleOCR_DBNet/test_tipc/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..cd8f56fd7d3bc07b2d35d1239bf723bb3de1e136 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/test_tipc/prepare.sh @@ -0,0 +1,54 @@ +#!/bin/bash +source test_tipc/common_func.sh + +FILENAME=$1 + +# MODE be one of ['lite_train_lite_infer' 'lite_train_whole_infer' 'whole_train_whole_infer', +# 'whole_infer', 'klquant_whole_infer', +# 'cpp_infer', 'serving_infer'] + +MODE=$2 + +dataline=$(cat ${FILENAME}) + +# parser params +IFS=$'\n' +lines=(${dataline}) + +# The training params +model_name=$(func_parser_value "${lines[1]}") + +trainer_list=$(func_parser_value "${lines[14]}") + +if [ ${MODE} = "lite_train_lite_infer" ];then + 
python_name_list=$(func_parser_value "${lines[2]}") + array=(${python_name_list}) + python_name=${array[0]} + ${python_name} -m pip install -r requirement.txt + if [[ ${model_name} =~ "det_res50_db" ]];then + wget -nc https://paddle-wheel.bj.bcebos.com/benchmark/resnet50-19c8e357.pth -O /root/.cache/torch/hub/checkpoints/resnet50-19c8e357.pth + + # 下载数据集并解压 + rm -rf datasets + wget -nc https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/benchmark_train/datasets.tar + tar xf datasets.tar + fi +elif [ ${MODE} = "benchmark_train" ];then + python_name_list=$(func_parser_value "${lines[2]}") + array=(${python_name_list}) + python_name=${array[0]} + ${python_name} -m pip install -r requirement.txt + if [[ ${model_name} =~ "det_res50_db" ]];then + wget -nc https://paddle-wheel.bj.bcebos.com/benchmark/resnet50-19c8e357.pth -O /root/.cache/torch/hub/checkpoints/resnet50-19c8e357.pth + + # 下载数据集并解压 + rm -rf datasets + wget -nc https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/benchmark_train/datasets.tar + tar xf datasets.tar + # expand gt.txt 2 times + # cd ./train_data/icdar2015/text_localization + # for i in `seq 2`;do cp train_icdar2015_label.txt dup$i.txt;done + # cat dup* > train_icdar2015_label.txt && rm -rf dup* + # cd ../../../ + fi +fi \ No newline at end of file diff --git a/benchmark/PaddleOCR_DBNet/test_tipc/test_train_inference_python.sh b/benchmark/PaddleOCR_DBNet/test_tipc/test_train_inference_python.sh new file mode 100644 index 0000000000000000000000000000000000000000..a54591a60cbd68c101cecb60ad6f6e4acd758872 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/test_tipc/test_train_inference_python.sh @@ -0,0 +1,343 @@ +#!/bin/bash +source test_tipc/common_func.sh + +FILENAME=$1 +# MODE be one of ['lite_train_lite_infer' 'lite_train_whole_infer' 'whole_train_whole_infer', 'whole_infer'] +MODE=$2 + +dataline=$(awk 'NR>=1{print}' $FILENAME) + +# parser params +IFS=$'\n' +lines=(${dataline}) + +# The training params +model_name=$(func_parser_value "${lines[1]}") +python=$(func_parser_value "${lines[2]}") +gpu_list=$(func_parser_value "${lines[3]}") +train_use_gpu_key=$(func_parser_key "${lines[4]}") +train_use_gpu_value=$(func_parser_value "${lines[4]}") +autocast_list=$(func_parser_value "${lines[5]}") +autocast_key=$(func_parser_key "${lines[5]}") +epoch_key=$(func_parser_key "${lines[6]}") +epoch_num=$(func_parser_params "${lines[6]}" "${MODE}") +save_model_key=$(func_parser_key "${lines[7]}") +train_batch_key=$(func_parser_key "${lines[8]}") +train_batch_value=$(func_parser_params "${lines[8]}" "${MODE}") +pretrain_model_key=$(func_parser_key "${lines[9]}") +pretrain_model_value=$(func_parser_value "${lines[9]}") +train_model_name=$(func_parser_value "${lines[10]}") +train_infer_img_dir=$(func_parser_value "${lines[11]}") +train_param_key1=$(func_parser_key "${lines[12]}") +train_param_value1=$(func_parser_value "${lines[12]}") + +trainer_list=$(func_parser_value "${lines[14]}") +trainer_norm=$(func_parser_key "${lines[15]}") +norm_trainer=$(func_parser_value "${lines[15]}") +pact_key=$(func_parser_key "${lines[16]}") +pact_trainer=$(func_parser_value "${lines[16]}") +fpgm_key=$(func_parser_key "${lines[17]}") +fpgm_trainer=$(func_parser_value "${lines[17]}") +distill_key=$(func_parser_key "${lines[18]}") +distill_trainer=$(func_parser_value "${lines[18]}") +trainer_key1=$(func_parser_key "${lines[19]}") +trainer_value1=$(func_parser_value "${lines[19]}") +trainer_key2=$(func_parser_key "${lines[20]}") +trainer_value2=$(func_parser_value "${lines[20]}") + +eval_py=$(func_parser_value 
"${lines[23]}") +eval_key1=$(func_parser_key "${lines[24]}") +eval_value1=$(func_parser_value "${lines[24]}") + +save_infer_key=$(func_parser_key "${lines[27]}") +export_weight=$(func_parser_key "${lines[28]}") +norm_export=$(func_parser_value "${lines[29]}") +pact_export=$(func_parser_value "${lines[30]}") +fpgm_export=$(func_parser_value "${lines[31]}") +distill_export=$(func_parser_value "${lines[32]}") +export_key1=$(func_parser_key "${lines[33]}") +export_value1=$(func_parser_value "${lines[33]}") +export_key2=$(func_parser_key "${lines[34]}") +export_value2=$(func_parser_value "${lines[34]}") +inference_dir=$(func_parser_value "${lines[35]}") + +# parser inference model +infer_model_dir_list=$(func_parser_value "${lines[36]}") +infer_export_list=$(func_parser_value "${lines[37]}") +infer_is_quant=$(func_parser_value "${lines[38]}") +# parser inference +inference_py=$(func_parser_value "${lines[39]}") +use_gpu_key=$(func_parser_key "${lines[40]}") +use_gpu_list=$(func_parser_value "${lines[40]}") +use_mkldnn_key=$(func_parser_key "${lines[41]}") +use_mkldnn_list=$(func_parser_value "${lines[41]}") +cpu_threads_key=$(func_parser_key "${lines[42]}") +cpu_threads_list=$(func_parser_value "${lines[42]}") +batch_size_key=$(func_parser_key "${lines[43]}") +batch_size_list=$(func_parser_value "${lines[43]}") +use_trt_key=$(func_parser_key "${lines[44]}") +use_trt_list=$(func_parser_value "${lines[44]}") +precision_key=$(func_parser_key "${lines[45]}") +precision_list=$(func_parser_value "${lines[45]}") +infer_model_key=$(func_parser_key "${lines[46]}") +image_dir_key=$(func_parser_key "${lines[47]}") +infer_img_dir=$(func_parser_value "${lines[47]}") +save_log_key=$(func_parser_key "${lines[48]}") +benchmark_key=$(func_parser_key "${lines[49]}") +benchmark_value=$(func_parser_value "${lines[49]}") +infer_key1=$(func_parser_key "${lines[50]}") +infer_value1=$(func_parser_value "${lines[50]}") + +LOG_PATH="./test_tipc/output/${model_name}/${MODE}" +mkdir -p ${LOG_PATH} +status_log="${LOG_PATH}/results_python.log" + +line_num=`grep -n -w "to_static_train_benchmark_params" $FILENAME | cut -d ":" -f 1` +to_static_key=$(func_parser_key "${lines[line_num]}") +to_static_trainer=$(func_parser_value "${lines[line_num]}") + +function func_inference(){ + IFS='|' + _python=$1 + _script=$2 + _model_dir=$3 + _log_path=$4 + _img_dir=$5 + _flag_quant=$6 + _gpu=$7 + # inference + for use_gpu in ${use_gpu_list[*]}; do + if [ ${use_gpu} = "False" ] || [ ${use_gpu} = "cpu" ]; then + for use_mkldnn in ${use_mkldnn_list[*]}; do + # if [ ${use_mkldnn} = "False" ] && [ ${_flag_quant} = "True" ]; then + # continue + # fi + for threads in ${cpu_threads_list[*]}; do + for batch_size in ${batch_size_list[*]}; do + for precision in ${precision_list[*]}; do + if [ ${use_mkldnn} = "False" ] && [ ${precision} = "fp16" ]; then + continue + fi # skip when enable fp16 but disable mkldnn + if [ ${_flag_quant} = "True" ] && [ ${precision} != "int8" ]; then + continue + fi # skip when quant model inference but precision is not int8 + set_precision=$(func_set_params "${precision_key}" "${precision}") + + _save_log_path="${_log_path}/python_infer_cpu_gpus_${_gpu}_usemkldnn_${use_mkldnn}_threads_${threads}_precision_${precision}_batchsize_${batch_size}.log" + set_infer_data=$(func_set_params "${image_dir_key}" "${_img_dir}") + set_benchmark=$(func_set_params "${benchmark_key}" "${benchmark_value}") + set_batchsize=$(func_set_params "${batch_size_key}" "${batch_size}") + set_mkldnn=$(func_set_params "${use_mkldnn_key}" 
"${use_mkldnn}") + set_cpu_threads=$(func_set_params "${cpu_threads_key}" "${threads}") + set_model_dir=$(func_set_params "${infer_model_key}" "${_model_dir}") + set_infer_params0=$(func_set_params "${save_log_key}" "${save_log_value}") + set_infer_params1=$(func_set_params "${infer_key1}" "${infer_value1}") + command="${_python} ${_script} ${use_gpu_key}=${use_gpu} ${set_mkldnn} ${set_cpu_threads} ${set_model_dir} ${set_batchsize} ${set_infer_params0} ${set_infer_data} ${set_benchmark} ${set_precision} ${set_infer_params1} > ${_save_log_path} 2>&1 " + eval $command + last_status=${PIPESTATUS[0]} + eval "cat ${_save_log_path}" + status_check $last_status "${command}" "${status_log}" "${model_name}" "${_save_log_path}" + done + done + done + done + elif [ ${use_gpu} = "True" ] || [ ${use_gpu} = "gpu" ]; then + for use_trt in ${use_trt_list[*]}; do + for precision in ${precision_list[*]}; do + if [[ ${_flag_quant} = "False" ]] && [[ ${precision} =~ "int8" ]]; then + continue + fi + if [[ ${precision} =~ "fp16" || ${precision} =~ "int8" ]] && [ ${use_trt} = "False" ]; then + continue + fi + if [[ ${use_trt} = "False" && ${precision} =~ "int8" ]] && [ ${_flag_quant} = "True" ]; then + continue + fi + for batch_size in ${batch_size_list[*]}; do + _save_log_path="${_log_path}/python_infer_gpu_gpus_${_gpu}_usetrt_${use_trt}_precision_${precision}_batchsize_${batch_size}.log" + set_infer_data=$(func_set_params "${image_dir_key}" "${_img_dir}") + set_benchmark=$(func_set_params "${benchmark_key}" "${benchmark_value}") + set_batchsize=$(func_set_params "${batch_size_key}" "${batch_size}") + set_tensorrt=$(func_set_params "${use_trt_key}" "${use_trt}") + set_precision=$(func_set_params "${precision_key}" "${precision}") + set_model_dir=$(func_set_params "${infer_model_key}" "${_model_dir}") + set_infer_params0=$(func_set_params "${save_log_key}" "${save_log_value}") + set_infer_params1=$(func_set_params "${infer_key1}" "${infer_value1}") + command="${_python} ${_script} ${use_gpu_key}=${use_gpu} ${set_tensorrt} ${set_precision} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_infer_params1} ${set_infer_params0} > ${_save_log_path} 2>&1 " + eval $command + last_status=${PIPESTATUS[0]} + eval "cat ${_save_log_path}" + status_check $last_status "${command}" "${status_log}" "${model_name}" "${_save_log_path}" + + done + done + done + else + echo "Does not support hardware other than CPU and GPU Currently!" + fi + done +} + +if [ ${MODE} = "whole_infer" ]; then + GPUID=$3 + if [ ${#GPUID} -le 0 ];then + env=" " + else + env="export CUDA_VISIBLE_DEVICES=${GPUID}" + fi + # set CUDA_VISIBLE_DEVICES + eval $env + export Count=0 + gpu=0 + IFS="|" + infer_run_exports=(${infer_export_list}) + infer_quant_flag=(${infer_is_quant}) + for infer_model in ${infer_model_dir_list[*]}; do + # run export + if [ ${infer_run_exports[Count]} != "null" ];then + save_infer_dir="${infer_model}" + set_export_weight=$(func_set_params "${export_weight}" "${infer_model}") + set_save_infer_key=$(func_set_params "${save_infer_key}" "${save_infer_dir}") + export_log_path="${LOG_PATH}_export_${Count}.log" + export_cmd="${python} ${infer_run_exports[Count]} ${set_export_weight} ${set_save_infer_key} > ${export_log_path} 2>&1 " + echo ${infer_run_exports[Count]} + echo $export_cmd + eval $export_cmd + status_export=$? 
+ status_check $status_export "${export_cmd}" "${status_log}" "${model_name}" "${export_log_path}" + else + save_infer_dir=${infer_model} + fi + #run inference + is_quant=${infer_quant_flag[Count]} + func_inference "${python}" "${inference_py}" "${save_infer_dir}" "${LOG_PATH}" "${infer_img_dir}" ${is_quant} "${gpu}" + Count=$(($Count + 1)) + done +else + IFS="|" + export Count=0 + USE_GPU_KEY=(${train_use_gpu_value}) + for gpu in ${gpu_list[*]}; do + train_use_gpu=${USE_GPU_KEY[Count]} + Count=$(($Count + 1)) + ips="" + if [ ${gpu} = "-1" ];then + env="" + elif [ ${#gpu} -le 1 ];then + env="export CUDA_VISIBLE_DEVICES=${gpu}" + elif [ ${#gpu} -le 15 ];then + IFS="," + array=(${gpu}) + env="export CUDA_VISIBLE_DEVICES=${array[0]}" + IFS="|" + else + IFS=";" + array=(${gpu}) + ips=${array[0]} + gpu=${array[1]} + IFS="|" + env=" " + fi + for autocast in ${autocast_list[*]}; do + if [ ${autocast} = "amp" ]; then + set_amp_config="amp.scale_loss=1024.0 amp.use_dynamic_loss_scaling=True amp.amp_level=O2" + else + set_amp_config="amp=None" + fi + for trainer in ${trainer_list[*]}; do + flag_quant=False + if [ ${trainer} = ${pact_key} ]; then + run_train=${pact_trainer} + run_export=${pact_export} + flag_quant=True + elif [ ${trainer} = "${fpgm_key}" ]; then + run_train=${fpgm_trainer} + run_export=${fpgm_export} + elif [ ${trainer} = "${distill_key}" ]; then + run_train=${distill_trainer} + run_export=${distill_export} + elif [ ${trainer} = "${to_static_key}" ]; then + run_train="${norm_trainer} ${to_static_trainer}" + run_export=${norm_export} + elif [[ ${trainer} = ${trainer_key2} ]]; then + run_train=${trainer_value2} + run_export=${export_value2} + else + run_train=${norm_trainer} + run_export=${norm_export} + fi + + if [ ${run_train} = "null" ]; then + continue + fi + + set_epoch=$(func_set_params "${epoch_key}" "${epoch_num}") + set_pretrain=$(func_set_params "${pretrain_model_key}" "${pretrain_model_value}") + set_batchsize=$(func_set_params "${train_batch_key}" "${train_batch_value}") + set_train_params1=$(func_set_params "${train_param_key1}" "${train_param_value1}") + set_use_gpu=$(func_set_params "${train_use_gpu_key}" "${train_use_gpu}") + # if length of ips >= 15, then it is seen as multi-machine + # 15 is the min length of ips info for multi-machine: 0.0.0.0,0.0.0.0 + if [ ${#ips} -le 15 ];then + save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}" + nodes=1 + else + IFS="," + ips_array=(${ips}) + IFS="|" + nodes=${#ips_array[@]} + save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}" + fi + + + set_save_model=$(func_set_params "${save_model_key}" "${save_log}") + if [ ${#gpu} -le 2 ];then # train with cpu or single gpu + cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_batchsize} ${set_amp_config} ${set_train_params1}" + elif [ ${#ips} -le 15 ];then # train with multi-gpu + cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_batchsize} ${set_amp_config} ${set_train_params1}" + else # train with multi-machine + cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_batchsize} ${set_amp_config} ${set_train_params1}" + fi + # run train + eval $cmd + eval "cat ${save_log}/train.log >> ${save_log}.log" + status_check $? 
"${cmd}" "${status_log}" "${model_name}" "${save_log}.log" + + set_eval_pretrain=$(func_set_params "${pretrain_model_key}" "${save_log}/${train_model_name}") + + # run eval + if [ ${eval_py} != "null" ]; then + eval ${env} + set_eval_params1=$(func_set_params "${eval_key1}" "${eval_value1}") + eval_log_path="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}_eval.log" + eval_cmd="${python} ${eval_py} ${set_eval_pretrain} ${set_use_gpu} ${set_eval_params1} > ${eval_log_path} 2>&1 " + eval $eval_cmd + status_check $? "${eval_cmd}" "${status_log}" "${model_name}" "${eval_log_path}" + fi + # run export model + if [ ${run_export} != "null" ]; then + # run export model + save_infer_path="${save_log}" + export_log_path="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}_export.log" + set_export_weight=$(func_set_params "${export_weight}" "${save_log}/${train_model_name}") + set_save_infer_key=$(func_set_params "${save_infer_key}" "${save_infer_path}") + export_cmd="${python} ${run_export} ${set_export_weight} ${set_save_infer_key} > ${export_log_path} 2>&1 " + eval $export_cmd + status_check $? "${export_cmd}" "${status_log}" "${model_name}" "${export_log_path}" + + #run inference + eval $env + save_infer_path="${save_log}" + if [[ ${inference_dir} != "null" ]] && [[ ${inference_dir} != '##' ]]; then + infer_model_dir="${save_infer_path}/${inference_dir}" + else + infer_model_dir=${save_infer_path} + fi + func_inference "${python}" "${inference_py}" "${infer_model_dir}" "${LOG_PATH}" "${train_infer_img_dir}" "${flag_quant}" "${gpu}" + + eval "unset CUDA_VISIBLE_DEVICES" + fi + done # done with: for trainer in ${trainer_list[*]}; do + done # done with: for autocast in ${autocast_list[*]}; do + done # done with: for gpu in ${gpu_list[*]}; do +fi # end if [ ${MODE} = "infer" ]; then \ No newline at end of file diff --git a/benchmark/PaddleOCR_DBNet/tools/__init__.py b/benchmark/PaddleOCR_DBNet/tools/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7cbf835d7e1654d050df9fca997a774f3d7947ad --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/tools/__init__.py @@ -0,0 +1,3 @@ +# -*- coding: utf-8 -*- +# @Time : 2019/12/8 13:14 +# @Author : zhoujun \ No newline at end of file diff --git a/benchmark/PaddleOCR_DBNet/tools/eval.py b/benchmark/PaddleOCR_DBNet/tools/eval.py new file mode 100644 index 0000000000000000000000000000000000000000..fe514ddc0d4f1e09e784f346374e0e5aa2b998bc --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/tools/eval.py @@ -0,0 +1,87 @@ +# -*- coding: utf-8 -*- +# @Time : 2018/6/11 15:54 +# @Author : zhoujun +import os +import sys +import pathlib +__dir__ = pathlib.Path(os.path.abspath(__file__)) +sys.path.append(str(__dir__)) +sys.path.append(str(__dir__.parent.parent)) + +import argparse +import time +import paddle +from tqdm.auto import tqdm + + +class EVAL(): + def __init__(self, model_path, gpu_id=0): + from models import build_model + from data_loader import get_dataloader + from post_processing import get_post_processing + from utils import get_metric + self.gpu_id = gpu_id + if self.gpu_id is not None and isinstance( + self.gpu_id, int) and paddle.device.is_compiled_with_cuda(): + paddle.device.set_device("gpu:{}".format(self.gpu_id)) + else: + paddle.device.set_device("cpu") + checkpoint = paddle.load(model_path) + config = checkpoint['config'] + config['arch']['backbone']['pretrained'] = False + + self.validate_loader = get_dataloader(config['dataset']['validate'], + config['distributed']) + + self.model = 
build_model(config['arch']) + self.model.set_state_dict(checkpoint['state_dict']) + + self.post_process = get_post_processing(config['post_processing']) + self.metric_cls = get_metric(config['metric']) + + def eval(self): + self.model.eval() + raw_metrics = [] + total_frame = 0.0 + total_time = 0.0 + for i, batch in tqdm( + enumerate(self.validate_loader), + total=len(self.validate_loader), + desc='test model'): + with paddle.no_grad(): + start = time.time() + preds = self.model(batch['img']) + boxes, scores = self.post_process( + batch, + preds, + is_output_polygon=self.metric_cls.is_output_polygon) + total_frame += batch['img'].shape[0] + total_time += time.time() - start + raw_metric = self.metric_cls.validate_measure(batch, + (boxes, scores)) + raw_metrics.append(raw_metric) + metrics = self.metric_cls.gather_measure(raw_metrics) + print('FPS:{}'.format(total_frame / total_time)) + return { + 'recall': metrics['recall'].avg, + 'precision': metrics['precision'].avg, + 'fmeasure': metrics['fmeasure'].avg + } + + +def init_args(): + parser = argparse.ArgumentParser(description='DBNet.paddle') + parser.add_argument( + '--model_path', + required=False, + default='output/DBNet_resnet18_FPN_DBHead/checkpoint/1.pth', + type=str) + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = init_args() + eval = EVAL(args.model_path) + result = eval.eval() + print(result) diff --git a/benchmark/PaddleOCR_DBNet/tools/export_model.py b/benchmark/PaddleOCR_DBNet/tools/export_model.py new file mode 100644 index 0000000000000000000000000000000000000000..59a318a196a6480f904c7ddaed7eab49cbf3f80c --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/tools/export_model.py @@ -0,0 +1,57 @@ +import os +import sys + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, ".."))) + +import argparse + +import paddle +from paddle.jit import to_static + +from models import build_model +from utils import Config, ArgsParser + + +def init_args(): + parser = ArgsParser() + args = parser.parse_args() + return args + + +def load_checkpoint(model, checkpoint_path): + """ + load checkpoints + :param checkpoint_path: Checkpoint path to be loaded + """ + checkpoint = paddle.load(checkpoint_path) + model.set_state_dict(checkpoint['state_dict']) + print('load checkpoint from {}'.format(checkpoint_path)) + + +def main(config): + model = build_model(config['arch']) + load_checkpoint(model, config['trainer']['resume_checkpoint']) + model.eval() + + save_path = config["trainer"]["output_dir"] + save_path = os.path.join(save_path, "inference") + infer_shape = [3, -1, -1] + model = to_static( + model, + input_spec=[ + paddle.static.InputSpec( + shape=[None] + infer_shape, dtype="float32") + ]) + + paddle.jit.save(model, save_path) + print("inference model is saved to {}".format(save_path)) + + +if __name__ == "__main__": + args = init_args() + assert os.path.exists(args.config_file) + config = Config(args.config_file) + config.merge_dict(args.opt) + main(config.cfg) diff --git a/benchmark/PaddleOCR_DBNet/tools/infer.py b/benchmark/PaddleOCR_DBNet/tools/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..24e919c33f2352e7e45ea2c2503fa0f94bb9cf58 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/tools/infer.py @@ -0,0 +1,298 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import pathlib +__dir__ = pathlib.Path(os.path.abspath(__file__)) +sys.path.append(str(__dir__)) +sys.path.append(str(__dir__.parent.parent)) + +import cv2 +import paddle +from paddle import inference +import numpy as np +from PIL import Image + +from paddle.vision import transforms +from tools.predict import resize_image +from post_processing import get_post_processing +from utils.util import draw_bbox, save_result + + +class InferenceEngine(object): + """InferenceEngine + + Inference engina class which contains preprocess, run, postprocess + """ + + def __init__(self, args): + """ + Args: + args: Parameters generated using argparser. + Returns: None + """ + super().__init__() + self.args = args + + # init inference engine + self.predictor, self.config, self.input_tensor, self.output_tensor = self.load_predictor( + os.path.join(args.model_dir, "inference.pdmodel"), + os.path.join(args.model_dir, "inference.pdiparams")) + + # build transforms + self.transforms = transforms.Compose([ + transforms.ToTensor(), transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + ]) + + # wamrup + if self.args.warmup > 0: + for idx in range(args.warmup): + print(idx) + x = np.random.rand(1, 3, self.args.crop_size, + self.args.crop_size).astype("float32") + self.input_tensor.copy_from_cpu(x) + self.predictor.run() + self.output_tensor.copy_to_cpu() + + self.post_process = get_post_processing({ + 'type': 'SegDetectorRepresenter', + 'args': { + 'thresh': 0.3, + 'box_thresh': 0.7, + 'max_candidates': 1000, + 'unclip_ratio': 1.5 + } + }) + + def load_predictor(self, model_file_path, params_file_path): + """load_predictor + initialize the inference engine + Args: + model_file_path: inference model path (*.pdmodel) + model_file_path: inference parmaeter path (*.pdiparams) + Return: + predictor: Predictor created using Paddle Inference. + config: Configuration of the predictor. + input_tensor: Input tensor of the predictor. + output_tensor: Output tensor of the predictor. + """ + args = self.args + config = inference.Config(model_file_path, params_file_path) + if args.use_gpu: + config.enable_use_gpu(1000, 0) + if args.use_tensorrt: + config.enable_tensorrt_engine( + workspace_size=1 << 30, + precision_mode=precision, + max_batch_size=args.max_batch_size, + min_subgraph_size=args. + min_subgraph_size, # skip the minmum trt subgraph + use_calib_mode=False) + + # collect shape + trt_shape_f = os.path.join(model_dir, "_trt_dynamic_shape.txt") + + if not os.path.exists(trt_shape_f): + config.collect_shape_range_info(trt_shape_f) + logger.info( + f"collect dynamic shape info into : {trt_shape_f}") + try: + config.enable_tuned_tensorrt_dynamic_shape(trt_shape_f, + True) + except Exception as E: + logger.info(E) + logger.info("Please keep your paddlepaddle-gpu >= 2.3.0!") + else: + config.disable_gpu() + # The thread num should not be greater than the number of cores in the CPU. 
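+            # Everything below configures the CPU-only path: oneDNN (MKLDNN) acceleration, optional
+            # bfloat16 when --precision fp16 is requested, and the math-library thread pool size.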
+ if args.enable_mkldnn: + # cache 10 different shapes for mkldnn to avoid memory leak + config.set_mkldnn_cache_capacity(10) + config.enable_mkldnn() + if args.precision == "fp16": + config.enable_mkldnn_bfloat16() + if hasattr(args, "cpu_threads"): + config.set_cpu_math_library_num_threads(args.cpu_threads) + else: + # default cpu threads as 10 + config.set_cpu_math_library_num_threads(10) + + # enable memory optim + config.enable_memory_optim() + config.disable_glog_info() + + config.switch_use_feed_fetch_ops(False) + config.switch_ir_optim(True) + + # create predictor + predictor = inference.create_predictor(config) + + # get input and output tensor property + input_names = predictor.get_input_names() + input_tensor = predictor.get_input_handle(input_names[0]) + + output_names = predictor.get_output_names() + output_tensor = predictor.get_output_handle(output_names[0]) + + return predictor, config, input_tensor, output_tensor + + def preprocess(self, img_path, short_size): + """preprocess + Preprocess to the input. + Args: + img_path: Image path. + Returns: Input data after preprocess. + """ + img = cv2.imread(img_path, 1) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + h, w = img.shape[:2] + img = resize_image(img, short_size) + img = self.transforms(img) + img = np.expand_dims(img, axis=0) + shape_info = {'shape': [(h, w)]} + return img, shape_info + + def postprocess(self, x, shape_info, is_output_polygon): + """postprocess + Postprocess to the inference engine output. + Args: + x: Inference engine output. + Returns: Output data after argmax. + """ + box_list, score_list = self.post_process( + shape_info, x, is_output_polygon=is_output_polygon) + box_list, score_list = box_list[0], score_list[0] + if len(box_list) > 0: + if is_output_polygon: + idx = [x.sum() > 0 for x in box_list] + box_list = [box_list[i] for i, v in enumerate(idx) if v] + score_list = [score_list[i] for i, v in enumerate(idx) if v] + else: + idx = box_list.reshape(box_list.shape[0], -1).sum( + axis=1) > 0 # 去掉全为0的框 + box_list, score_list = box_list[idx], score_list[idx] + else: + box_list, score_list = [], [] + return box_list, score_list + + def run(self, x): + """run + Inference process using inference engine. + Args: + x: Input data after preprocess. 
+ Returns: Inference engine output + """ + self.input_tensor.copy_from_cpu(x) + self.predictor.run() + output = self.output_tensor.copy_to_cpu() + return output + + +def get_args(add_help=True): + """ + parse args + """ + import argparse + + def str2bool(v): + return v.lower() in ("true", "t", "1") + + parser = argparse.ArgumentParser( + description="PaddlePaddle Classification Training", add_help=add_help) + + parser.add_argument("--model_dir", default=None, help="inference model dir") + parser.add_argument("--batch_size", type=int, default=1) + parser.add_argument( + "--short_size", default=1024, type=int, help="short size") + parser.add_argument("--img_path", default="./images/demo.jpg") + + parser.add_argument( + "--benchmark", default=False, type=str2bool, help="benchmark") + parser.add_argument("--warmup", default=0, type=int, help="warmup iter") + parser.add_argument( + '--polygon', action='store_true', help='output polygon or box') + + parser.add_argument("--use_gpu", type=str2bool, default=True) + parser.add_argument("--use_tensorrt", type=str2bool, default=False) + parser.add_argument("--precision", type=str, default="fp32") + parser.add_argument("--gpu_mem", type=int, default=500) + parser.add_argument("--gpu_id", type=int, default=0) + parser.add_argument("--enable_mkldnn", type=str2bool, default=False) + parser.add_argument("--cpu_threads", type=int, default=10) + + args = parser.parse_args() + return args + + +def main(args): + """ + Main inference function. + Args: + args: Parameters generated using argparser. + Returns: + class_id: Class index of the input. + prob: : Probability of the input. + """ + inference_engine = InferenceEngine(args) + + # init benchmark + if args.benchmark: + import auto_log + autolog = auto_log.AutoLogger( + model_name="db", + batch_size=args.batch_size, + inference_config=inference_engine.config, + gpu_ids="auto" if args.use_gpu else None) + + # enable benchmark + if args.benchmark: + autolog.times.start() + + # preprocess + img, shape_info = inference_engine.preprocess(args.img_path, + args.short_size) + + if args.benchmark: + autolog.times.stamp() + + output = inference_engine.run(img) + + if args.benchmark: + autolog.times.stamp() + + # postprocess + box_list, score_list = inference_engine.postprocess(output, shape_info, + args.polygon) + + if args.benchmark: + autolog.times.stamp() + autolog.times.end(stamp=True) + autolog.report() + + img = draw_bbox(cv2.imread(args.img_path)[:, :, ::-1], box_list) + # 保存结果到路径 + os.makedirs('output', exist_ok=True) + img_path = pathlib.Path(args.img_path) + output_path = os.path.join('output', img_path.stem + '_infer_result.jpg') + cv2.imwrite(output_path, img[:, :, ::-1]) + save_result( + output_path.replace('_infer_result.jpg', '.txt'), box_list, score_list, + args.polygon) + + +if __name__ == "__main__": + args = get_args() + main(args) diff --git a/benchmark/PaddleOCR_DBNet/tools/predict.py b/benchmark/PaddleOCR_DBNet/tools/predict.py new file mode 100644 index 0000000000000000000000000000000000000000..51beffd1706c0c12100a0d7fea98c7532b1272b6 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/tools/predict.py @@ -0,0 +1,178 @@ +# -*- coding: utf-8 -*- +# @Time : 2019/8/24 12:06 +# @Author : zhoujun + +import os +import sys +import pathlib +__dir__ = pathlib.Path(os.path.abspath(__file__)) +sys.path.append(str(__dir__)) +sys.path.append(str(__dir__.parent.parent)) + +import time +import cv2 +import paddle + +from data_loader import get_transforms +from models import build_model +from post_processing import 
get_post_processing + + +def resize_image(img, short_size): + height, width, _ = img.shape + if height < width: + new_height = short_size + new_width = new_height / height * width + else: + new_width = short_size + new_height = new_width / width * height + new_height = int(round(new_height / 32) * 32) + new_width = int(round(new_width / 32) * 32) + resized_img = cv2.resize(img, (new_width, new_height)) + return resized_img + + +class PaddleModel: + def __init__(self, model_path, post_p_thre=0.7, gpu_id=None): + ''' + 初始化模型 + :param model_path: 模型地址(可以是模型的参数或者参数和计算图一起保存的文件) + :param gpu_id: 在哪一块gpu上运行 + ''' + self.gpu_id = gpu_id + + if self.gpu_id is not None and isinstance( + self.gpu_id, int) and paddle.device.is_compiled_with_cuda(): + paddle.device.set_device("gpu:{}".format(self.gpu_id)) + else: + paddle.device.set_device("cpu") + checkpoint = paddle.load(model_path) + + config = checkpoint['config'] + config['arch']['backbone']['pretrained'] = False + self.model = build_model(config['arch']) + self.post_process = get_post_processing(config['post_processing']) + self.post_process.box_thresh = post_p_thre + self.img_mode = config['dataset']['train']['dataset']['args'][ + 'img_mode'] + self.model.set_state_dict(checkpoint['state_dict']) + self.model.eval() + + self.transform = [] + for t in config['dataset']['train']['dataset']['args']['transforms']: + if t['type'] in ['ToTensor', 'Normalize']: + self.transform.append(t) + self.transform = get_transforms(self.transform) + + def predict(self, + img_path: str, + is_output_polygon=False, + short_size: int=1024): + ''' + 对传入的图像进行预测,支持图像地址,opecv 读取图片,偏慢 + :param img_path: 图像地址 + :param is_numpy: + :return: + ''' + assert os.path.exists(img_path), 'file is not exists' + img = cv2.imread(img_path, 1 if self.img_mode != 'GRAY' else 0) + if self.img_mode == 'RGB': + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + h, w = img.shape[:2] + img = resize_image(img, short_size) + # 将图片由(w,h)变为(1,img_channel,h,w) + tensor = self.transform(img) + tensor = tensor.unsqueeze_(0) + + batch = {'shape': [(h, w)]} + with paddle.no_grad(): + start = time.time() + preds = self.model(tensor) + box_list, score_list = self.post_process( + batch, preds, is_output_polygon=is_output_polygon) + box_list, score_list = box_list[0], score_list[0] + if len(box_list) > 0: + if is_output_polygon: + idx = [x.sum() > 0 for x in box_list] + box_list = [box_list[i] for i, v in enumerate(idx) if v] + score_list = [score_list[i] for i, v in enumerate(idx) if v] + else: + idx = box_list.reshape(box_list.shape[0], -1).sum( + axis=1) > 0 # 去掉全为0的框 + box_list, score_list = box_list[idx], score_list[idx] + else: + box_list, score_list = [], [] + t = time.time() - start + return preds[0, 0, :, :].detach().cpu().numpy(), box_list, score_list, t + + +def save_depoly(net, input, save_path): + input_spec = [ + paddle.static.InputSpec( + shape=[None, 3, None, None], dtype="float32") + ] + net = paddle.jit.to_static(net, input_spec=input_spec) + + # save static model for inference directly + paddle.jit.save(net, save_path) + + +def init_args(): + import argparse + parser = argparse.ArgumentParser(description='DBNet.paddle') + parser.add_argument('--model_path', default=r'model_best.pth', type=str) + parser.add_argument( + '--input_folder', + default='./test/input', + type=str, + help='img path for predict') + parser.add_argument( + '--output_folder', + default='./test/output', + type=str, + help='img path for output') + parser.add_argument('--gpu', default=0, type=int, help='gpu for inference') + 
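The resize_image helper above scales the short side to short_size and then snaps both dimensions to the nearest multiple of 32, the network's total stride. A quick worked example with illustrative values:

h, w, short_size = 720, 1280, 1024
new_h = short_size                   # short side becomes 1024
new_w = new_h / h * w                # 1820.44...
new_h = int(round(new_h / 32) * 32)  # 1024, already a multiple of 32
new_w = int(round(new_w / 32) * 32)  # 1824
print(new_h, new_w)                  # (1024, 1824)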
parser.add_argument( + '--thre', default=0.3, type=float, help='the thresh of post_processing') + parser.add_argument( + '--polygon', action='store_true', help='output polygon or box') + parser.add_argument('--show', action='store_true', help='show result') + parser.add_argument( + '--save_result', + action='store_true', + help='save box and score to txt file') + args = parser.parse_args() + return args + + +if __name__ == '__main__': + import pathlib + from tqdm import tqdm + import matplotlib.pyplot as plt + from utils.util import show_img, draw_bbox, save_result, get_image_file_list + + args = init_args() + print(args) + # 初始化网络 + model = PaddleModel(args.model_path, post_p_thre=args.thre, gpu_id=args.gpu) + img_folder = pathlib.Path(args.input_folder) + for img_path in tqdm(get_image_file_list(args.input_folder)): + preds, boxes_list, score_list, t = model.predict( + img_path, is_output_polygon=args.polygon) + img = draw_bbox(cv2.imread(img_path)[:, :, ::-1], boxes_list) + if args.show: + show_img(preds) + show_img(img, title=os.path.basename(img_path)) + plt.show() + # 保存结果到路径 + os.makedirs(args.output_folder, exist_ok=True) + img_path = pathlib.Path(img_path) + output_path = os.path.join(args.output_folder, + img_path.stem + '_result.jpg') + pred_path = os.path.join(args.output_folder, + img_path.stem + '_pred.jpg') + cv2.imwrite(output_path, img[:, :, ::-1]) + cv2.imwrite(pred_path, preds * 255) + save_result( + output_path.replace('_result.jpg', '.txt'), boxes_list, score_list, + args.polygon) diff --git a/benchmark/PaddleOCR_DBNet/tools/train.py b/benchmark/PaddleOCR_DBNet/tools/train.py new file mode 100644 index 0000000000000000000000000000000000000000..403d6185fc28c5037bcb29d5f188ec0692c9499b --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/tools/train.py @@ -0,0 +1,61 @@ +import os +import sys +import pathlib +__dir__ = pathlib.Path(os.path.abspath(__file__)) +sys.path.append(str(__dir__)) +sys.path.append(str(__dir__.parent.parent)) + +import paddle +import paddle.distributed as dist +from utils import Config, ArgsParser + + +def init_args(): + parser = ArgsParser() + args = parser.parse_args() + return args + + +def main(config, profiler_options): + from models import build_model, build_loss + from data_loader import get_dataloader + from trainer import Trainer + from post_processing import get_post_processing + from utils import get_metric + if paddle.device.cuda.device_count() > 1: + dist.init_parallel_env() + config['distributed'] = True + else: + config['distributed'] = False + train_loader = get_dataloader(config['dataset']['train'], + config['distributed']) + assert train_loader is not None + if 'validate' in config['dataset']: + validate_loader = get_dataloader(config['dataset']['validate'], False) + else: + validate_loader = None + criterion = build_loss(config['loss']) + config['arch']['backbone']['in_channels'] = 3 if config['dataset']['train'][ + 'dataset']['args']['img_mode'] != 'GRAY' else 1 + model = build_model(config['arch']) + # set @to_static for benchmark, skip this by default. 
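    # Illustrative sketch only (an assumption, not part of this change): the optional
    # @to_static conversion mentioned in the comment above could be guarded by a
    # trainer flag such as config['trainer']['to_static'], mirroring export_model.py.
    if config['trainer'].get('to_static', False):
        model = paddle.jit.to_static(
            model,
            input_spec=[
                paddle.static.InputSpec(
                    shape=[None, 3, -1, -1], dtype='float32')
            ])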
+ post_p = get_post_processing(config['post_processing']) + metric = get_metric(config['metric']) + trainer = Trainer( + config=config, + model=model, + criterion=criterion, + train_loader=train_loader, + post_process=post_p, + metric_cls=metric, + validate_loader=validate_loader, + profiler_options=profiler_options) + trainer.train() + + +if __name__ == '__main__': + args = init_args() + assert os.path.exists(args.config_file) + config = Config(args.config_file) + config.merge_dict(args.opt) + main(config.cfg, args.profiler_options) diff --git a/benchmark/PaddleOCR_DBNet/trainer/__init__.py b/benchmark/PaddleOCR_DBNet/trainer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..76c7392d142c96d5ec715b528ea47e9001cbec4b --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/trainer/__init__.py @@ -0,0 +1,4 @@ +# -*- coding: utf-8 -*- +# @Time : 2019/8/23 21:58 +# @Author : zhoujun +from .trainer import Trainer \ No newline at end of file diff --git a/benchmark/PaddleOCR_DBNet/trainer/trainer.py b/benchmark/PaddleOCR_DBNet/trainer/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..34b259f3d182672d727fb50eebf14c27870cb836 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/trainer/trainer.py @@ -0,0 +1,230 @@ +# -*- coding: utf-8 -*- +# @Time : 2019/8/23 21:58 +# @Author : zhoujun +import time + +import paddle +from tqdm import tqdm + +from base import BaseTrainer +from utils import runningScore, cal_text_score, Polynomial, profiler + + +class Trainer(BaseTrainer): + def __init__(self, + config, + model, + criterion, + train_loader, + validate_loader, + metric_cls, + post_process=None, + profiler_options=None): + super(Trainer, self).__init__(config, model, criterion, train_loader, + validate_loader, metric_cls, post_process) + self.profiler_options = profiler_options + self.enable_eval = config['trainer'].get('enable_eval', True) + + def _train_epoch(self, epoch): + self.model.train() + total_samples = 0 + train_reader_cost = 0.0 + train_batch_cost = 0.0 + reader_start = time.time() + epoch_start = time.time() + train_loss = 0. 
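The runningScore(2) accumulator created just below (its implementation appears later in utils/metrics.py) keeps a 2x2 confusion matrix between the thresholded shrink map and its ground truth; the acc and iou_shrink_map values logged during training reduce to roughly this sketch, with random placeholder maps standing in for real predictions:

import numpy as np

pred = (np.random.rand(640, 640) > 0.5).astype(int)  # thresholded shrink map (placeholder)
gt = (np.random.rand(640, 640) > 0.5).astype(int)    # ground-truth shrink map (placeholder)

# 2x2 confusion matrix, same trick as _fast_hist in utils/metrics.py
hist = np.bincount(2 * gt.ravel() + pred.ravel(), minlength=4).reshape(2, 2)
mean_acc = np.nanmean(np.diag(hist) / hist.sum(axis=1))                       # per-class accuracy, averaged
iou = np.diag(hist) / (hist.sum(axis=1) + hist.sum(axis=0) - np.diag(hist))
print(mean_acc, np.nanmean(iou))                                              # 'Mean Acc' and 'Mean IoU'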
+ running_metric_text = runningScore(2) + + for i, batch in enumerate(self.train_loader): + profiler.add_profiler_step(self.profiler_options) + if i >= self.train_loader_len: + break + self.global_step += 1 + lr = self.optimizer.get_lr() + + cur_batch_size = batch['img'].shape[0] + + train_reader_cost += time.time() - reader_start + if self.amp: + with paddle.amp.auto_cast( + enable='gpu' in paddle.device.get_device(), + custom_white_list=self.amp.get('custom_white_list', []), + custom_black_list=self.amp.get('custom_black_list', []), + level=self.amp.get('level', 'O2')): + preds = self.model(batch['img']) + loss_dict = self.criterion(preds.astype(paddle.float32), batch) + scaled_loss = self.amp['scaler'].scale(loss_dict['loss']) + scaled_loss.backward() + self.amp['scaler'].minimize(self.optimizer, scaled_loss) + else: + preds = self.model(batch['img']) + loss_dict = self.criterion(preds, batch) + # backward + loss_dict['loss'].backward() + self.optimizer.step() + self.lr_scheduler.step() + self.optimizer.clear_grad() + + train_batch_time = time.time() - reader_start + train_batch_cost += train_batch_time + total_samples += cur_batch_size + + # acc iou + score_shrink_map = cal_text_score( + preds[:, 0, :, :], + batch['shrink_map'], + batch['shrink_mask'], + running_metric_text, + thred=self.config['post_processing']['args']['thresh']) + + # loss 和 acc 记录到日志 + loss_str = 'loss: {:.4f}, '.format(loss_dict['loss'].item()) + for idx, (key, value) in enumerate(loss_dict.items()): + loss_dict[key] = value.item() + if key == 'loss': + continue + loss_str += '{}: {:.4f}'.format(key, loss_dict[key]) + if idx < len(loss_dict) - 1: + loss_str += ', ' + + train_loss += loss_dict['loss'] + acc = score_shrink_map['Mean Acc'] + iou_shrink_map = score_shrink_map['Mean IoU'] + + if self.global_step % self.log_iter == 0: + self.logger_info( + '[{}/{}], [{}/{}], global_step: {}, ips: {:.1f} samples/sec, avg_reader_cost: {:.5f} s, avg_batch_cost: {:.5f} s, avg_samples: {}, acc: {:.4f}, iou_shrink_map: {:.4f}, {}lr:{:.6}, time:{:.2f}'. 
+ format(epoch, self.epochs, i + 1, self.train_loader_len, + self.global_step, total_samples / train_batch_cost, + train_reader_cost / self.log_iter, train_batch_cost / + self.log_iter, total_samples / self.log_iter, acc, + iou_shrink_map, loss_str, lr, train_batch_cost)) + total_samples = 0 + train_reader_cost = 0.0 + train_batch_cost = 0.0 + + if self.visualdl_enable and paddle.distributed.get_rank() == 0: + # write tensorboard + for key, value in loss_dict.items(): + self.writer.add_scalar('TRAIN/LOSS/{}'.format(key), value, + self.global_step) + self.writer.add_scalar('TRAIN/ACC_IOU/acc', acc, + self.global_step) + self.writer.add_scalar('TRAIN/ACC_IOU/iou_shrink_map', + iou_shrink_map, self.global_step) + self.writer.add_scalar('TRAIN/lr', lr, self.global_step) + reader_start = time.time() + return { + 'train_loss': train_loss / self.train_loader_len, + 'lr': lr, + 'time': time.time() - epoch_start, + 'epoch': epoch + } + + def _eval(self, epoch): + self.model.eval() + raw_metrics = [] + total_frame = 0.0 + total_time = 0.0 + for i, batch in tqdm( + enumerate(self.validate_loader), + total=len(self.validate_loader), + desc='test model'): + with paddle.no_grad(): + start = time.time() + if self.amp: + with paddle.amp.auto_cast( + enable='gpu' in paddle.device.get_device(), + custom_white_list=self.amp.get('custom_white_list', + []), + custom_black_list=self.amp.get('custom_black_list', + []), + level=self.amp.get('level', 'O2')): + preds = self.model(batch['img']) + preds = preds.astype(paddle.float32) + else: + preds = self.model(batch['img']) + boxes, scores = self.post_process( + batch, + preds, + is_output_polygon=self.metric_cls.is_output_polygon) + total_frame += batch['img'].shape[0] + total_time += time.time() - start + raw_metric = self.metric_cls.validate_measure(batch, + (boxes, scores)) + raw_metrics.append(raw_metric) + metrics = self.metric_cls.gather_measure(raw_metrics) + self.logger_info('FPS:{}'.format(total_frame / total_time)) + return metrics['recall'].avg, metrics['precision'].avg, metrics[ + 'fmeasure'].avg + + def _on_epoch_finish(self): + self.logger_info('[{}/{}], train_loss: {:.4f}, time: {:.4f}, lr: {}'. + format(self.epoch_result['epoch'], self.epochs, self. + epoch_result['train_loss'], self.epoch_result[ + 'time'], self.epoch_result['lr'])) + net_save_path = '{}/model_latest.pth'.format(self.checkpoint_dir) + net_save_path_best = '{}/model_best.pth'.format(self.checkpoint_dir) + + if paddle.distributed.get_rank() == 0: + self._save_checkpoint(self.epoch_result['epoch'], net_save_path) + save_best = False + if self.validate_loader is not None and self.metric_cls is not None and self.enable_eval: # 使用f1作为最优模型指标 + recall, precision, hmean = self._eval(self.epoch_result[ + 'epoch']) + + if self.visualdl_enable: + self.writer.add_scalar('EVAL/recall', recall, + self.global_step) + self.writer.add_scalar('EVAL/precision', precision, + self.global_step) + self.writer.add_scalar('EVAL/hmean', hmean, + self.global_step) + self.logger_info( + 'test: recall: {:.6f}, precision: {:.6f}, hmean: {:.6f}'. 
+ format(recall, precision, hmean)) + + if hmean >= self.metrics['hmean']: + save_best = True + self.metrics['train_loss'] = self.epoch_result['train_loss'] + self.metrics['hmean'] = hmean + self.metrics['precision'] = precision + self.metrics['recall'] = recall + self.metrics['best_model_epoch'] = self.epoch_result[ + 'epoch'] + else: + if self.epoch_result['train_loss'] <= self.metrics[ + 'train_loss']: + save_best = True + self.metrics['train_loss'] = self.epoch_result['train_loss'] + self.metrics['best_model_epoch'] = self.epoch_result[ + 'epoch'] + best_str = 'current best, ' + for k, v in self.metrics.items(): + best_str += '{}: {:.6f}, '.format(k, v) + self.logger_info(best_str) + if save_best: + import shutil + shutil.copy(net_save_path, net_save_path_best) + self.logger_info("Saving current best: {}".format( + net_save_path_best)) + else: + self.logger_info("Saving checkpoint: {}".format(net_save_path)) + + def _on_train_finish(self): + if self.enable_eval: + for k, v in self.metrics.items(): + self.logger_info('{}:{}'.format(k, v)) + self.logger_info('finish train') + + def _initialize_scheduler(self): + if self.config['lr_scheduler']['type'] == 'Polynomial': + self.config['lr_scheduler']['args']['epochs'] = self.config[ + 'trainer']['epochs'] + self.config['lr_scheduler']['args']['step_each_epoch'] = len( + self.train_loader) + self.lr_scheduler = Polynomial( + **self.config['lr_scheduler']['args'])() + else: + self.lr_scheduler = self._initialize('lr_scheduler', + paddle.optimizer.lr) diff --git a/benchmark/PaddleOCR_DBNet/utils/__init__.py b/benchmark/PaddleOCR_DBNet/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..194e0b82ff6576c3880914fad0492fde276cff33 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/utils/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- +# @Time : 2019/8/23 21:58 +# @Author : zhoujun +from .util import * +from .metrics import * +from .schedulers import * +from .cal_recall.script import cal_recall_precison_f1 +from .ocr_metric import get_metric diff --git a/benchmark/PaddleOCR_DBNet/utils/cal_recall/__init__.py b/benchmark/PaddleOCR_DBNet/utils/cal_recall/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0db38a8a37f0a3d8fbd8c12a3e54457e41cf9360 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/utils/cal_recall/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: utf-8 -*- +# @Time : 1/16/19 6:40 AM +# @Author : zhoujun +from .script import cal_recall_precison_f1 +__all__ = ['cal_recall_precison_f1'] diff --git a/benchmark/PaddleOCR_DBNet/utils/cal_recall/rrc_evaluation_funcs.py b/benchmark/PaddleOCR_DBNet/utils/cal_recall/rrc_evaluation_funcs.py new file mode 100644 index 0000000000000000000000000000000000000000..4e12ee66a07118c07559eebd655f5173e046696e --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/utils/cal_recall/rrc_evaluation_funcs.py @@ -0,0 +1,479 @@ +#!/usr/bin/env python2 +#encoding: UTF-8 +import json +import sys +sys.path.append('./') +import zipfile +import re +import sys +import os +import codecs +import traceback +import numpy as np +from utils import order_points_clockwise + + +def print_help(): + sys.stdout.write( + 'Usage: python %s.py -g= -s= [-o= -p=]' + % sys.argv[0]) + sys.exit(2) + + +def load_zip_file_keys(file, fileNameRegExp=''): + """ + Returns an array with the entries of the ZIP file that match with the regular expression. 
+ The key's are the names or the file or the capturing group definied in the fileNameRegExp + """ + try: + archive = zipfile.ZipFile(file, mode='r', allowZip64=True) + except: + raise Exception('Error loading the ZIP archive.') + + pairs = [] + + for name in archive.namelist(): + addFile = True + keyName = name + if fileNameRegExp != "": + m = re.match(fileNameRegExp, name) + if m == None: + addFile = False + else: + if len(m.groups()) > 0: + keyName = m.group(1) + + if addFile: + pairs.append(keyName) + + return pairs + + +def load_zip_file(file, fileNameRegExp='', allEntries=False): + """ + Returns an array with the contents (filtered by fileNameRegExp) of a ZIP file. + The key's are the names or the file or the capturing group definied in the fileNameRegExp + allEntries validates that all entries in the ZIP file pass the fileNameRegExp + """ + try: + archive = zipfile.ZipFile(file, mode='r', allowZip64=True) + except: + raise Exception('Error loading the ZIP archive') + + pairs = [] + for name in archive.namelist(): + addFile = True + keyName = name + if fileNameRegExp != "": + m = re.match(fileNameRegExp, name) + if m == None: + addFile = False + else: + if len(m.groups()) > 0: + keyName = m.group(1) + + if addFile: + pairs.append([keyName, archive.read(name)]) + else: + if allEntries: + raise Exception('ZIP entry not valid: %s' % name) + + return dict(pairs) + + +def load_folder_file(file, fileNameRegExp='', allEntries=False): + """ + Returns an array with the contents (filtered by fileNameRegExp) of a ZIP file. + The key's are the names or the file or the capturing group definied in the fileNameRegExp + allEntries validates that all entries in the ZIP file pass the fileNameRegExp + """ + pairs = [] + for name in os.listdir(file): + addFile = True + keyName = name + if fileNameRegExp != "": + m = re.match(fileNameRegExp, name) + if m == None: + addFile = False + else: + if len(m.groups()) > 0: + keyName = m.group(1) + + if addFile: + pairs.append([keyName, open(os.path.join(file, name)).read()]) + else: + if allEntries: + raise Exception('ZIP entry not valid: %s' % name) + + return dict(pairs) + + +def decode_utf8(raw): + """ + Returns a Unicode object on success, or None on failure + """ + try: + raw = codecs.decode(raw, 'utf-8', 'replace') + #extracts BOM if exists + raw = raw.encode('utf8') + if raw.startswith(codecs.BOM_UTF8): + raw = raw.replace(codecs.BOM_UTF8, '', 1) + return raw.decode('utf-8') + except: + return None + + +def validate_lines_in_file(fileName, + file_contents, + CRLF=True, + LTRB=True, + withTranscription=False, + withConfidence=False, + imWidth=0, + imHeight=0): + """ + This function validates that all lines of the file calling the Line validation function for each line + """ + utf8File = decode_utf8(file_contents) + if (utf8File is None): + raise Exception("The file %s is not UTF-8" % fileName) + + lines = utf8File.split("\r\n" if CRLF else "\n") + for line in lines: + line = line.replace("\r", "").replace("\n", "") + if (line != ""): + try: + validate_tl_line(line, LTRB, withTranscription, withConfidence, + imWidth, imHeight) + except Exception as e: + raise Exception( + ("Line in sample not valid. Sample: %s Line: %s Error: %s" % + (fileName, line, str(e))).encode('utf-8', 'replace')) + + +def validate_tl_line(line, + LTRB=True, + withTranscription=True, + withConfidence=True, + imWidth=0, + imHeight=0): + """ + Validate the format of the line. If the line is not valid an exception will be raised. 
+ If maxWidth and maxHeight are specified, all points must be inside the imgage bounds. + Posible values are: + LTRB=True: xmin,ymin,xmax,ymax[,confidence][,transcription] + LTRB=False: x1,y1,x2,y2,x3,y3,x4,y4[,confidence][,transcription] + """ + get_tl_line_values(line, LTRB, withTranscription, withConfidence, imWidth, + imHeight) + + +def get_tl_line_values(line, + LTRB=True, + withTranscription=False, + withConfidence=False, + imWidth=0, + imHeight=0): + """ + Validate the format of the line. If the line is not valid an exception will be raised. + If maxWidth and maxHeight are specified, all points must be inside the imgage bounds. + Posible values are: + LTRB=True: xmin,ymin,xmax,ymax[,confidence][,transcription] + LTRB=False: x1,y1,x2,y2,x3,y3,x4,y4[,confidence][,transcription] + Returns values from a textline. Points , [Confidences], [Transcriptions] + """ + confidence = 0.0 + transcription = "" + points = [] + + numPoints = 4 + + if LTRB: + + numPoints = 4 + + if withTranscription and withConfidence: + m = re.match( + r'^\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-1].?[0-9]*)\s*,(.*)$', + line) + if m == None: + m = re.match( + r'^\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-1].?[0-9]*)\s*,(.*)$', + line) + raise Exception( + "Format incorrect. Should be: xmin,ymin,xmax,ymax,confidence,transcription" + ) + elif withConfidence: + m = re.match( + r'^\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-1].?[0-9]*)\s*$', + line) + if m == None: + raise Exception( + "Format incorrect. Should be: xmin,ymin,xmax,ymax,confidence" + ) + elif withTranscription: + m = re.match( + r'^\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*,(.*)$', + line) + if m == None: + raise Exception( + "Format incorrect. Should be: xmin,ymin,xmax,ymax,transcription" + ) + else: + m = re.match( + r'^\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*,?\s*$', + line) + if m == None: + raise Exception( + "Format incorrect. Should be: xmin,ymin,xmax,ymax") + + xmin = int(m.group(1)) + ymin = int(m.group(2)) + xmax = int(m.group(3)) + ymax = int(m.group(4)) + if (xmax < xmin): + raise Exception("Xmax value (%s) not valid (Xmax < Xmin)." % (xmax)) + if (ymax < ymin): + raise Exception("Ymax value (%s) not valid (Ymax < Ymin)." % + (ymax)) + + points = [float(m.group(i)) for i in range(1, (numPoints + 1))] + + if (imWidth > 0 and imHeight > 0): + validate_point_inside_bounds(xmin, ymin, imWidth, imHeight) + validate_point_inside_bounds(xmax, ymax, imWidth, imHeight) + + else: + + numPoints = 8 + + if withTranscription and withConfidence: + m = re.match( + r'^\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*([0-1].?[0-9]*)\s*,(.*)$', + line) + if m == None: + raise Exception( + "Format incorrect. Should be: x1,y1,x2,y2,x3,y3,x4,y4,confidence,transcription" + ) + elif withConfidence: + m = re.match( + r'^\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*([0-1].?[0-9]*)\s*$', + line) + if m == None: + raise Exception( + "Format incorrect. Should be: x1,y1,x2,y2,x3,y3,x4,y4,confidence" + ) + elif withTranscription: + m = re.match( + r'^\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,(.*)$', + line) + if m == None: + raise Exception( + "Format incorrect. 
Should be: x1,y1,x2,y2,x3,y3,x4,y4,transcription" + ) + else: + m = re.match( + r'^\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*$', + line) + if m == None: + raise Exception( + "Format incorrect. Should be: x1,y1,x2,y2,x3,y3,x4,y4") + + points = [float(m.group(i)) for i in range(1, (numPoints + 1))] + + points = order_points_clockwise(np.array(points).reshape(-1, + 2)).reshape(-1) + validate_clockwise_points(points) + + if (imWidth > 0 and imHeight > 0): + validate_point_inside_bounds(points[0], points[1], imWidth, + imHeight) + validate_point_inside_bounds(points[2], points[3], imWidth, + imHeight) + validate_point_inside_bounds(points[4], points[5], imWidth, + imHeight) + validate_point_inside_bounds(points[6], points[7], imWidth, + imHeight) + + if withConfidence: + try: + confidence = float(m.group(numPoints + 1)) + except ValueError: + raise Exception("Confidence value must be a float") + + if withTranscription: + posTranscription = numPoints + (2 if withConfidence else 1) + transcription = m.group(posTranscription) + m2 = re.match(r'^\s*\"(.*)\"\s*$', transcription) + if m2 != None: #Transcription with double quotes, we extract the value and replace escaped characters + transcription = m2.group(1).replace("\\\\", "\\").replace("\\\"", + "\"") + + return points, confidence, transcription + + +def validate_point_inside_bounds(x, y, imWidth, imHeight): + if (x < 0 or x > imWidth): + raise Exception("X value (%s) not valid. Image dimensions: (%s,%s)" % + (xmin, imWidth, imHeight)) + if (y < 0 or y > imHeight): + raise Exception( + "Y value (%s) not valid. Image dimensions: (%s,%s) Sample: %s Line:%s" + % (ymin, imWidth, imHeight)) + + +def validate_clockwise_points(points): + """ + Validates that the points that the 4 points that dlimite a polygon are in clockwise order. + """ + + if len(points) != 8: + raise Exception("Points list not valid." + str(len(points))) + + point = [[int(points[0]), int(points[1])], + [int(points[2]), int(points[3])], + [int(points[4]), int(points[5])], + [int(points[6]), int(points[7])]] + edge = [(point[1][0] - point[0][0]) * (point[1][1] + point[0][1]), + (point[2][0] - point[1][0]) * (point[2][1] + point[1][1]), + (point[3][0] - point[2][0]) * (point[3][1] + point[2][1]), + (point[0][0] - point[3][0]) * (point[0][1] + point[3][1])] + + summatory = edge[0] + edge[1] + edge[2] + edge[3] + if summatory > 0: + raise Exception( + "Points are not clockwise. The coordinates of bounding quadrilaterals have to be given in clockwise order. Regarding the correct interpretation of 'clockwise' remember that the image coordinate system used is the standard one, with the image origin at the upper left, the X axis extending to the right and Y axis extending downwards." + ) + + +def get_tl_line_values_from_file_contents(content, + CRLF=True, + LTRB=True, + withTranscription=False, + withConfidence=False, + imWidth=0, + imHeight=0, + sort_by_confidences=True): + """ + Returns all points, confindences and transcriptions of a file in lists. 
Valid line formats: + xmin,ymin,xmax,ymax,[confidence],[transcription] + x1,y1,x2,y2,x3,y3,x4,y4,[confidence],[transcription] + """ + pointsList = [] + transcriptionsList = [] + confidencesList = [] + + lines = content.split("\r\n" if CRLF else "\n") + for line in lines: + line = line.replace("\r", "").replace("\n", "") + if (line != ""): + points, confidence, transcription = get_tl_line_values( + line, LTRB, withTranscription, withConfidence, imWidth, + imHeight) + pointsList.append(points) + transcriptionsList.append(transcription) + confidencesList.append(confidence) + + if withConfidence and len(confidencesList) > 0 and sort_by_confidences: + import numpy as np + sorted_ind = np.argsort(-np.array(confidencesList)) + confidencesList = [confidencesList[i] for i in sorted_ind] + pointsList = [pointsList[i] for i in sorted_ind] + transcriptionsList = [transcriptionsList[i] for i in sorted_ind] + + return pointsList, confidencesList, transcriptionsList + + +def main_evaluation(p, + default_evaluation_params_fn, + validate_data_fn, + evaluate_method_fn, + show_result=True, + per_sample=True): + """ + This process validates a method, evaluates it and if it succed generates a ZIP file with a JSON entry for each sample. + Params: + p: Dictionary of parmeters with the GT/submission locations. If None is passed, the parameters send by the system are used. + default_evaluation_params_fn: points to a function that returns a dictionary with the default parameters used for the evaluation + validate_data_fn: points to a method that validates the corrct format of the submission + evaluate_method_fn: points to a function that evaluated the submission and return a Dictionary with the results + """ + evalParams = default_evaluation_params_fn() + if 'p' in p.keys(): + evalParams.update(p['p'] if isinstance(p['p'], dict) else json.loads(p[ + 'p'][1:-1])) + + resDict = { + 'calculated': True, + 'Message': '', + 'method': '{}', + 'per_sample': '{}' + } + try: + # validate_data_fn(p['g'], p['s'], evalParams) + evalData = evaluate_method_fn(p['g'], p['s'], evalParams) + resDict.update(evalData) + + except Exception as e: + traceback.print_exc() + resDict['Message'] = str(e) + resDict['calculated'] = False + + if 'o' in p: + if not os.path.exists(p['o']): + os.makedirs(p['o']) + + resultsOutputname = p['o'] + '/results.zip' + outZip = zipfile.ZipFile(resultsOutputname, mode='w', allowZip64=True) + + del resDict['per_sample'] + if 'output_items' in resDict.keys(): + del resDict['output_items'] + + outZip.writestr('method.json', json.dumps(resDict)) + + if not resDict['calculated']: + if show_result: + sys.stderr.write('Error!\n' + resDict['Message'] + '\n\n') + if 'o' in p: + outZip.close() + return resDict + + if 'o' in p: + if per_sample == True: + for k, v in evalData['per_sample'].iteritems(): + outZip.writestr(k + '.json', json.dumps(v)) + + if 'output_items' in evalData.keys(): + for k, v in evalData['output_items'].iteritems(): + outZip.writestr(k, v) + + outZip.close() + + if show_result: + sys.stdout.write("Calculated!") + sys.stdout.write(json.dumps(resDict['method'])) + + return resDict + + +def main_validation(default_evaluation_params_fn, validate_data_fn): + """ + This process validates a method + Params: + default_evaluation_params_fn: points to a function that returns a dictionary with the default parameters used for the evaluation + validate_data_fn: points to a method that validates the corrct format of the submission + """ + try: + p = dict([s[1:].split('=') for s in sys.argv[1:]]) + evalParams 
= default_evaluation_params_fn() + if 'p' in p.keys(): + evalParams.update(p['p'] if isinstance(p['p'], dict) else + json.loads(p['p'][1:-1])) + + validate_data_fn(p['g'], p['s'], evalParams) + print('SUCCESS') + sys.exit(0) + except Exception as e: + print(str(e)) + sys.exit(101) diff --git a/benchmark/PaddleOCR_DBNet/utils/cal_recall/script.py b/benchmark/PaddleOCR_DBNet/utils/cal_recall/script.py new file mode 100644 index 0000000000000000000000000000000000000000..3b2f3916f62f191ae1b6b658edad9963243babf7 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/utils/cal_recall/script.py @@ -0,0 +1,350 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +from collections import namedtuple +from . import rrc_evaluation_funcs +import Polygon as plg +import numpy as np + + +def default_evaluation_params(): + """ + default_evaluation_params: Default parameters to use for the validation and evaluation. + """ + return { + 'IOU_CONSTRAINT': 0.5, + 'AREA_PRECISION_CONSTRAINT': 0.5, + 'GT_SAMPLE_NAME_2_ID': 'gt_img_([0-9]+).txt', + 'DET_SAMPLE_NAME_2_ID': 'res_img_([0-9]+).txt', + 'LTRB': + False, # LTRB:2points(left,top,right,bottom) or 4 points(x1,y1,x2,y2,x3,y3,x4,y4) + 'CRLF': False, # Lines are delimited by Windows CRLF format + 'CONFIDENCES': + False, # Detections must include confidence value. AP will be calculated + 'PER_SAMPLE_RESULTS': + True # Generate per sample results and produce data for visualization + } + + +def validate_data(gtFilePath, submFilePath, evaluationParams): + """ + Method validate_data: validates that all files in the results folder are correct (have the correct name contents). + Validates also that there are no missing files in the folder. + If some error detected, the method raises the error + """ + gt = rrc_evaluation_funcs.load_folder_file( + gtFilePath, evaluationParams['GT_SAMPLE_NAME_2_ID']) + + subm = rrc_evaluation_funcs.load_folder_file( + submFilePath, evaluationParams['DET_SAMPLE_NAME_2_ID'], True) + + # Validate format of GroundTruth + for k in gt: + rrc_evaluation_funcs.validate_lines_in_file( + k, gt[k], evaluationParams['CRLF'], evaluationParams['LTRB'], True) + + # Validate format of results + for k in subm: + if (k in gt) == False: + raise Exception("The sample %s not present in GT" % k) + + rrc_evaluation_funcs.validate_lines_in_file( + k, subm[k], evaluationParams['CRLF'], evaluationParams['LTRB'], + False, evaluationParams['CONFIDENCES']) + + +def evaluate_method(gtFilePath, submFilePath, evaluationParams): + """ + Method evaluate_method: evaluate method and returns the results + Results. Dictionary with the following values: + - method (required) Global method metrics. Ex: { 'Precision':0.8,'Recall':0.9 } + - samples (optional) Per sample metrics. 
Ex: {'sample1' : { 'Precision':0.8,'Recall':0.9 } , 'sample2' : { 'Precision':0.8,'Recall':0.9 } + """ + + def polygon_from_points(points): + """ + Returns a Polygon object to use with the Polygon2 class from a list of 8 points: x1,y1,x2,y2,x3,y3,x4,y4 + """ + resBoxes = np.empty([1, 8], dtype='int32') + resBoxes[0, 0] = int(points[0]) + resBoxes[0, 4] = int(points[1]) + resBoxes[0, 1] = int(points[2]) + resBoxes[0, 5] = int(points[3]) + resBoxes[0, 2] = int(points[4]) + resBoxes[0, 6] = int(points[5]) + resBoxes[0, 3] = int(points[6]) + resBoxes[0, 7] = int(points[7]) + pointMat = resBoxes[0].reshape([2, 4]).T + return plg.Polygon(pointMat) + + def rectangle_to_polygon(rect): + resBoxes = np.empty([1, 8], dtype='int32') + resBoxes[0, 0] = int(rect.xmin) + resBoxes[0, 4] = int(rect.ymax) + resBoxes[0, 1] = int(rect.xmin) + resBoxes[0, 5] = int(rect.ymin) + resBoxes[0, 2] = int(rect.xmax) + resBoxes[0, 6] = int(rect.ymin) + resBoxes[0, 3] = int(rect.xmax) + resBoxes[0, 7] = int(rect.ymax) + + pointMat = resBoxes[0].reshape([2, 4]).T + + return plg.Polygon(pointMat) + + def rectangle_to_points(rect): + points = [ + int(rect.xmin), int(rect.ymax), int(rect.xmax), int(rect.ymax), + int(rect.xmax), int(rect.ymin), int(rect.xmin), int(rect.ymin) + ] + return points + + def get_union(pD, pG): + areaA = pD.area() + areaB = pG.area() + return areaA + areaB - get_intersection(pD, pG) + + def get_intersection_over_union(pD, pG): + try: + return get_intersection(pD, pG) / get_union(pD, pG) + except: + return 0 + + def get_intersection(pD, pG): + pInt = pD & pG + if len(pInt) == 0: + return 0 + return pInt.area() + + def compute_ap(confList, matchList, numGtCare): + correct = 0 + AP = 0 + if len(confList) > 0: + confList = np.array(confList) + matchList = np.array(matchList) + sorted_ind = np.argsort(-confList) + confList = confList[sorted_ind] + matchList = matchList[sorted_ind] + for n in range(len(confList)): + match = matchList[n] + if match: + correct += 1 + AP += float(correct) / (n + 1) + + if numGtCare > 0: + AP /= numGtCare + + return AP + + perSampleMetrics = {} + + matchedSum = 0 + + Rectangle = namedtuple('Rectangle', 'xmin ymin xmax ymax') + + gt = rrc_evaluation_funcs.load_folder_file( + gtFilePath, evaluationParams['GT_SAMPLE_NAME_2_ID']) + subm = rrc_evaluation_funcs.load_folder_file( + submFilePath, evaluationParams['DET_SAMPLE_NAME_2_ID'], True) + + numGlobalCareGt = 0 + numGlobalCareDet = 0 + + arrGlobalConfidences = [] + arrGlobalMatches = [] + + for resFile in gt: + + gtFile = gt[resFile] # rrc_evaluation_funcs.decode_utf8(gt[resFile]) + recall = 0 + precision = 0 + hmean = 0 + + detMatched = 0 + + iouMat = np.empty([1, 1]) + + gtPols = [] + detPols = [] + + gtPolPoints = [] + detPolPoints = [] + + # Array of Ground Truth Polygons' keys marked as don't Care + gtDontCarePolsNum = [] + # Array of Detected Polygons' matched with a don't Care GT + detDontCarePolsNum = [] + + pairs = [] + detMatchedNums = [] + + arrSampleConfidences = [] + arrSampleMatch = [] + sampleAP = 0 + + evaluationLog = "" + + pointsList, _, transcriptionsList = rrc_evaluation_funcs.get_tl_line_values_from_file_contents( + gtFile, evaluationParams['CRLF'], evaluationParams['LTRB'], True, + False) + for n in range(len(pointsList)): + points = pointsList[n] + transcription = transcriptionsList[n] + dontCare = transcription == "###" + if evaluationParams['LTRB']: + gtRect = Rectangle(*points) + gtPol = rectangle_to_polygon(gtRect) + else: + gtPol = polygon_from_points(points) + gtPols.append(gtPol) + 
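For intuition, get_intersection_over_union above computes a plain polygon IoU, which is then compared against IOU_CONSTRAINT (0.5 by default) during matching. The same quantity can be reproduced with shapely, which the DetEval evaluator later in this diff uses; the quadrilaterals here are placeholders:

from shapely.geometry import Polygon

gt = Polygon([(0, 0), (10, 0), (10, 10), (0, 10)])   # ground-truth quad (placeholder)
det = Polygon([(5, 0), (15, 0), (15, 10), (5, 10)])  # detection quad (placeholder)
inter = gt.intersection(det).area                    # 50.0
union = gt.union(det).area                           # 150.0
print(inter / union)                                 # 0.333..., below the 0.5 IoU constraint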
gtPolPoints.append(points) + if dontCare: + gtDontCarePolsNum.append(len(gtPols) - 1) + + evaluationLog += "GT polygons: " + str(len(gtPols)) + ( + " (" + str(len(gtDontCarePolsNum)) + " don't care)\n" + if len(gtDontCarePolsNum) > 0 else "\n") + + if resFile in subm: + + detFile = subm[ + resFile] # rrc_evaluation_funcs.decode_utf8(subm[resFile]) + + pointsList, confidencesList, _ = rrc_evaluation_funcs.get_tl_line_values_from_file_contents( + detFile, evaluationParams['CRLF'], evaluationParams['LTRB'], + False, evaluationParams['CONFIDENCES']) + for n in range(len(pointsList)): + points = pointsList[n] + + if evaluationParams['LTRB']: + detRect = Rectangle(*points) + detPol = rectangle_to_polygon(detRect) + else: + detPol = polygon_from_points(points) + detPols.append(detPol) + detPolPoints.append(points) + if len(gtDontCarePolsNum) > 0: + for dontCarePol in gtDontCarePolsNum: + dontCarePol = gtPols[dontCarePol] + intersected_area = get_intersection(dontCarePol, detPol) + pdDimensions = detPol.area() + precision = 0 if pdDimensions == 0 else intersected_area / pdDimensions + if (precision > + evaluationParams['AREA_PRECISION_CONSTRAINT']): + detDontCarePolsNum.append(len(detPols) - 1) + break + + evaluationLog += "DET polygons: " + str(len(detPols)) + ( + " (" + str(len(detDontCarePolsNum)) + " don't care)\n" + if len(detDontCarePolsNum) > 0 else "\n") + + if len(gtPols) > 0 and len(detPols) > 0: + # Calculate IoU and precision matrixs + outputShape = [len(gtPols), len(detPols)] + iouMat = np.empty(outputShape) + gtRectMat = np.zeros(len(gtPols), np.int8) + detRectMat = np.zeros(len(detPols), np.int8) + for gtNum in range(len(gtPols)): + for detNum in range(len(detPols)): + pG = gtPols[gtNum] + pD = detPols[detNum] + iouMat[gtNum, detNum] = get_intersection_over_union(pD, + pG) + + for gtNum in range(len(gtPols)): + for detNum in range(len(detPols)): + if gtRectMat[gtNum] == 0 and detRectMat[ + detNum] == 0 and gtNum not in gtDontCarePolsNum and detNum not in detDontCarePolsNum: + if iouMat[gtNum, detNum] > evaluationParams[ + 'IOU_CONSTRAINT']: + gtRectMat[gtNum] = 1 + detRectMat[detNum] = 1 + detMatched += 1 + pairs.append({'gt': gtNum, 'det': detNum}) + detMatchedNums.append(detNum) + evaluationLog += "Match GT #" + str( + gtNum) + " with Det #" + str(detNum) + "\n" + + if evaluationParams['CONFIDENCES']: + for detNum in range(len(detPols)): + if detNum not in detDontCarePolsNum: + # we exclude the don't care detections + match = detNum in detMatchedNums + + arrSampleConfidences.append(confidencesList[detNum]) + arrSampleMatch.append(match) + + arrGlobalConfidences.append(confidencesList[detNum]) + arrGlobalMatches.append(match) + + numGtCare = (len(gtPols) - len(gtDontCarePolsNum)) + numDetCare = (len(detPols) - len(detDontCarePolsNum)) + if numGtCare == 0: + recall = float(1) + precision = float(0) if numDetCare > 0 else float(1) + sampleAP = precision + else: + recall = float(detMatched) / numGtCare + precision = 0 if numDetCare == 0 else float(detMatched) / numDetCare + if evaluationParams['CONFIDENCES'] and evaluationParams[ + 'PER_SAMPLE_RESULTS']: + sampleAP = compute_ap(arrSampleConfidences, arrSampleMatch, + numGtCare) + + hmean = 0 if (precision + recall) == 0 else 2.0 * precision * recall / ( + precision + recall) + + matchedSum += detMatched + numGlobalCareGt += numGtCare + numGlobalCareDet += numDetCare + + if evaluationParams['PER_SAMPLE_RESULTS']: + perSampleMetrics[resFile] = { + 'precision': precision, + 'recall': recall, + 'hmean': hmean, + 'pairs': pairs, + 'AP': 
sampleAP, + 'iouMat': [] if len(detPols) > 100 else iouMat.tolist(), + 'gtPolPoints': gtPolPoints, + 'detPolPoints': detPolPoints, + 'gtDontCare': gtDontCarePolsNum, + 'detDontCare': detDontCarePolsNum, + 'evaluationParams': evaluationParams, + 'evaluationLog': evaluationLog + } + + # Compute MAP and MAR + AP = 0 + if evaluationParams['CONFIDENCES']: + AP = compute_ap(arrGlobalConfidences, arrGlobalMatches, numGlobalCareGt) + + methodRecall = 0 if numGlobalCareGt == 0 else float( + matchedSum) / numGlobalCareGt + methodPrecision = 0 if numGlobalCareDet == 0 else float( + matchedSum) / numGlobalCareDet + methodHmean = 0 if methodRecall + methodPrecision == 0 else 2 * methodRecall * methodPrecision / ( + methodRecall + methodPrecision) + + methodMetrics = { + 'precision': methodPrecision, + 'recall': methodRecall, + 'hmean': methodHmean, + 'AP': AP + } + + resDict = { + 'calculated': True, + 'Message': '', + 'method': methodMetrics, + 'per_sample': perSampleMetrics + } + + return resDict + + +def cal_recall_precison_f1(gt_path, result_path, show_result=False): + p = {'g': gt_path, 's': result_path} + result = rrc_evaluation_funcs.main_evaluation(p, default_evaluation_params, + validate_data, + evaluate_method, show_result) + return result['method'] diff --git a/benchmark/PaddleOCR_DBNet/utils/compute_mean_std.py b/benchmark/PaddleOCR_DBNet/utils/compute_mean_std.py new file mode 100644 index 0000000000000000000000000000000000000000..5d0ab5cd23d66e4070c336a93abebbedad6028b1 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/utils/compute_mean_std.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +# @Time : 2019/12/7 14:46 +# @Author : zhoujun + +import numpy as np +import cv2 +import os +import random +from tqdm import tqdm +# calculate means and std +train_txt_path = './train_val_list.txt' + +CNum = 10000 # 挑选多少图片进行计算 + +img_h, img_w = 640, 640 +imgs = np.zeros([img_w, img_h, 3, 1]) +means, stdevs = [], [] + +with open(train_txt_path, 'r') as f: + lines = f.readlines() + random.shuffle(lines) # shuffle , 随机挑选图片 + + for i in tqdm(range(CNum)): + img_path = lines[i].split('\t')[0] + + img = cv2.imread(img_path) + img = cv2.resize(img, (img_h, img_w)) + img = img[:, :, :, np.newaxis] + + imgs = np.concatenate((imgs, img), axis=3) +# print(i) + +imgs = imgs.astype(np.float32) / 255. 
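# Illustrative alternative (not part of the original script): the per-channel
# loop below is equivalent to a single vectorized reduction over the
# (H, W, 3, N) array; the [::-1] converts cv2's BGR channel order to RGB.
ch_means = imgs.mean(axis=(0, 1, 3))
ch_stds = imgs.std(axis=(0, 1, 3))
print('vectorized check (RGB):', ch_means[::-1], ch_stds[::-1])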
+ +for i in tqdm(range(3)): + pixels = imgs[:, :, i, :].ravel() # 拉成一行 + means.append(np.mean(pixels)) + stdevs.append(np.std(pixels)) + +# cv2 读取的图像格式为BGR,PIL/Skimage读取到的都是RGB不用转 +means.reverse() # BGR --> RGB +stdevs.reverse() + +print("normMean = {}".format(means)) +print("normStd = {}".format(stdevs)) +print('transforms.Normalize(normMean = {}, normStd = {})'.format(means, stdevs)) \ No newline at end of file diff --git a/benchmark/PaddleOCR_DBNet/utils/make_trainfile.py b/benchmark/PaddleOCR_DBNet/utils/make_trainfile.py new file mode 100644 index 0000000000000000000000000000000000000000..9b7ae70ff7aeea46b7b36ab4a177baa0108d72c3 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/utils/make_trainfile.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +# @Time : 2019/8/24 12:06 +# @Author : zhoujun +import os +import glob +import pathlib + +data_path = r'test' +# data_path/img 存放图片 +# data_path/gt 存放标签文件 + +f_w = open(os.path.join(data_path, 'test.txt'), 'w', encoding='utf8') +for img_path in glob.glob(data_path + '/img/*.jpg', recursive=True): + d = pathlib.Path(img_path) + label_path = os.path.join(data_path, 'gt', ('gt_' + str(d.stem) + '.txt')) + if os.path.exists(img_path) and os.path.exists(label_path): + print(img_path, label_path) + else: + print('不存在', img_path, label_path) + f_w.write('{}\t{}\n'.format(img_path, label_path)) +f_w.close() \ No newline at end of file diff --git a/benchmark/PaddleOCR_DBNet/utils/metrics.py b/benchmark/PaddleOCR_DBNet/utils/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..e9c54b8d2e8d6acc01aef62a43c27f018f333435 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/utils/metrics.py @@ -0,0 +1,58 @@ +# Adapted from score written by wkentaro +# https://github.com/wkentaro/pytorch-fcn/blob/master/torchfcn/utils.py + +import numpy as np + + +class runningScore(object): + def __init__(self, n_classes): + self.n_classes = n_classes + self.confusion_matrix = np.zeros((n_classes, n_classes)) + + def _fast_hist(self, label_true, label_pred, n_class): + mask = (label_true >= 0) & (label_true < n_class) + + if np.sum((label_pred[mask] < 0)) > 0: + print(label_pred[label_pred < 0]) + hist = np.bincount( + n_class * label_true[mask].astype(int) + label_pred[mask], + minlength=n_class**2).reshape(n_class, n_class) + return hist + + def update(self, label_trues, label_preds): + # print label_trues.dtype, label_preds.dtype + for lt, lp in zip(label_trues, label_preds): + try: + self.confusion_matrix += self._fast_hist(lt.flatten(), + lp.flatten(), + self.n_classes) + except: + pass + + def get_scores(self): + """Returns accuracy score evaluation result. 
+ - overall accuracy + - mean accuracy + - mean IU + - fwavacc + """ + hist = self.confusion_matrix + acc = np.diag(hist).sum() / (hist.sum() + 0.0001) + acc_cls = np.diag(hist) / (hist.sum(axis=1) + 0.0001) + acc_cls = np.nanmean(acc_cls) + iu = np.diag(hist) / ( + hist.sum(axis=1) + hist.sum(axis=0) - np.diag(hist) + 0.0001) + mean_iu = np.nanmean(iu) + freq = hist.sum(axis=1) / (hist.sum() + 0.0001) + fwavacc = (freq[freq > 0] * iu[freq > 0]).sum() + cls_iu = dict(zip(range(self.n_classes), iu)) + + return { + 'Overall Acc': acc, + 'Mean Acc': acc_cls, + 'FreqW Acc': fwavacc, + 'Mean IoU': mean_iu, + }, cls_iu + + def reset(self): + self.confusion_matrix = np.zeros((self.n_classes, self.n_classes)) diff --git a/benchmark/PaddleOCR_DBNet/utils/ocr_metric/__init__.py b/benchmark/PaddleOCR_DBNet/utils/ocr_metric/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3e7c51cf0651a25b83e29016f4126461828ff887 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/utils/ocr_metric/__init__.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- +# @Time : 2019/12/5 15:36 +# @Author : zhoujun +from .icdar2015 import QuadMetric + + +def get_metric(config): + try: + if 'args' not in config: + args = {} + else: + args = config['args'] + if isinstance(args, dict): + cls = eval(config['type'])(**args) + else: + cls = eval(config['type'])(args) + return cls + except: + return None \ No newline at end of file diff --git a/benchmark/PaddleOCR_DBNet/utils/ocr_metric/icdar2015/__init__.py b/benchmark/PaddleOCR_DBNet/utils/ocr_metric/icdar2015/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..375ae557e9fdf1120fa79b412de21434c1d71896 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/utils/ocr_metric/icdar2015/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: utf-8 -*- +# @Time : 2019/12/5 15:36 +# @Author : zhoujun + +from .quad_metric import QuadMetric \ No newline at end of file diff --git a/benchmark/PaddleOCR_DBNet/utils/ocr_metric/icdar2015/detection/__init__.py b/benchmark/PaddleOCR_DBNet/utils/ocr_metric/icdar2015/detection/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/benchmark/PaddleOCR_DBNet/utils/ocr_metric/icdar2015/detection/deteval.py b/benchmark/PaddleOCR_DBNet/utils/ocr_metric/icdar2015/detection/deteval.py new file mode 100644 index 0000000000000000000000000000000000000000..c5dcfc4b96a9802200e08467cbd937483960a0fc --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/utils/ocr_metric/icdar2015/detection/deteval.py @@ -0,0 +1,389 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import math +from collections import namedtuple +import numpy as np +from shapely.geometry import Polygon + + +class DetectionDetEvalEvaluator(object): + def __init__(self, + area_recall_constraint=0.8, + area_precision_constraint=0.4, + ev_param_ind_center_diff_thr=1, + mtype_oo_o=1.0, + mtype_om_o=0.8, + mtype_om_m=1.0): + + self.area_recall_constraint = area_recall_constraint + self.area_precision_constraint = area_precision_constraint + self.ev_param_ind_center_diff_thr = ev_param_ind_center_diff_thr + self.mtype_oo_o = mtype_oo_o + self.mtype_om_o = mtype_om_o + self.mtype_om_m = mtype_om_m + + def evaluate_image(self, gt, pred): + def get_union(pD, pG): + return Polygon(pD).union(Polygon(pG)).area + + def get_intersection_over_union(pD, pG): + return get_intersection(pD, pG) / get_union(pD, pG) + + def get_intersection(pD, pG): + return Polygon(pD).intersection(Polygon(pG)).area + + def 
one_to_one_match(row, col): + cont = 0 + for j in range(len(recallMat[0])): + if recallMat[row, + j] >= self.area_recall_constraint and precisionMat[ + row, j] >= self.area_precision_constraint: + cont = cont + 1 + if (cont != 1): + return False + cont = 0 + for i in range(len(recallMat)): + if recallMat[ + i, col] >= self.area_recall_constraint and precisionMat[ + i, col] >= self.area_precision_constraint: + cont = cont + 1 + if (cont != 1): + return False + + if recallMat[row, + col] >= self.area_recall_constraint and precisionMat[ + row, col] >= self.area_precision_constraint: + return True + return False + + def num_overlaps_gt(gtNum): + cont = 0 + for detNum in range(len(detRects)): + if detNum not in detDontCareRectsNum: + if recallMat[gtNum, detNum] > 0: + cont = cont + 1 + return cont + + def num_overlaps_det(detNum): + cont = 0 + for gtNum in range(len(recallMat)): + if gtNum not in gtDontCareRectsNum: + if recallMat[gtNum, detNum] > 0: + cont = cont + 1 + return cont + + def is_single_overlap(row, col): + if num_overlaps_gt(row) == 1 and num_overlaps_det(col) == 1: + return True + else: + return False + + def one_to_many_match(gtNum): + many_sum = 0 + detRects = [] + for detNum in range(len(recallMat[0])): + if gtRectMat[gtNum] == 0 and detRectMat[ + detNum] == 0 and detNum not in detDontCareRectsNum: + if precisionMat[gtNum, + detNum] >= self.area_precision_constraint: + many_sum += recallMat[gtNum, detNum] + detRects.append(detNum) + if round(many_sum, 4) >= self.area_recall_constraint: + return True, detRects + else: + return False, [] + + def many_to_one_match(detNum): + many_sum = 0 + gtRects = [] + for gtNum in range(len(recallMat)): + if gtRectMat[gtNum] == 0 and detRectMat[ + detNum] == 0 and gtNum not in gtDontCareRectsNum: + if recallMat[gtNum, detNum] >= self.area_recall_constraint: + many_sum += precisionMat[gtNum, detNum] + gtRects.append(gtNum) + if round(many_sum, 4) >= self.area_precision_constraint: + return True, gtRects + else: + return False, [] + + def center_distance(r1, r2): + return ((np.mean(r1, axis=0) - np.mean(r2, axis=0))**2).sum()**0.5 + + def diag(r): + r = np.array(r) + return ((r[:, 0].max() - r[:, 0].min())**2 + + (r[:, 1].max() - r[:, 1].min())**2)**0.5 + + perSampleMetrics = {} + + recall = 0 + precision = 0 + hmean = 0 + recallAccum = 0. + precisionAccum = 0. 
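A one-to-one candidate from one_to_one_match above is only confirmed further down when the normalized centre distance 2*d / (diag_gt + diag_det) stays below ev_param_ind_center_diff_thr (1.0 by default). Numerically, with two placeholder quads offset by one pixel:

import numpy as np

rG = np.array([[0, 0], [10, 0], [10, 10], [0, 10]], dtype=float)  # ground-truth quad (placeholder)
rD = np.array([[1, 1], [11, 1], [11, 11], [1, 11]], dtype=float)  # detection quad (placeholder)

center_dist = np.linalg.norm(rG.mean(axis=0) - rD.mean(axis=0))   # ~1.414
diag_g = np.hypot(np.ptp(rG[:, 0]), np.ptp(rG[:, 1]))             # ~14.14
diag_d = np.hypot(np.ptp(rD[:, 0]), np.ptp(rD[:, 1]))             # ~14.14
norm_dist = 2.0 * center_dist / (diag_g + diag_d)
print(norm_dist)  # ~0.1, accepted because it is below the default threshold of 1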
+ gtRects = [] + detRects = [] + gtPolPoints = [] + detPolPoints = [] + gtDontCareRectsNum = [ + ] #Array of Ground Truth Rectangles' keys marked as don't Care + detDontCareRectsNum = [ + ] #Array of Detected Rectangles' matched with a don't Care GT + pairs = [] + evaluationLog = "" + + recallMat = np.empty([1, 1]) + precisionMat = np.empty([1, 1]) + + for n in range(len(gt)): + points = gt[n]['points'] + # transcription = gt[n]['text'] + dontCare = gt[n]['ignore'] + + if not Polygon(points).is_valid or not Polygon(points).is_simple: + continue + + gtRects.append(points) + gtPolPoints.append(points) + if dontCare: + gtDontCareRectsNum.append(len(gtRects) - 1) + + evaluationLog += "GT rectangles: " + str(len(gtRects)) + ( + " (" + str(len(gtDontCareRectsNum)) + " don't care)\n" + if len(gtDontCareRectsNum) > 0 else "\n") + + for n in range(len(pred)): + points = pred[n]['points'] + + if not Polygon(points).is_valid or not Polygon(points).is_simple: + continue + + detRect = points + detRects.append(detRect) + detPolPoints.append(points) + if len(gtDontCareRectsNum) > 0: + for dontCareRectNum in gtDontCareRectsNum: + dontCareRect = gtRects[dontCareRectNum] + intersected_area = get_intersection(dontCareRect, detRect) + rdDimensions = Polygon(detRect).area + if (rdDimensions == 0): + precision = 0 + else: + precision = intersected_area / rdDimensions + if (precision > self.area_precision_constraint): + detDontCareRectsNum.append(len(detRects) - 1) + break + + evaluationLog += "DET rectangles: " + str(len(detRects)) + ( + " (" + str(len(detDontCareRectsNum)) + " don't care)\n" + if len(detDontCareRectsNum) > 0 else "\n") + + if len(gtRects) == 0: + recall = 1 + precision = 0 if len(detRects) > 0 else 1 + + if len(detRects) > 0: + #Calculate recall and precision matrixs + outputShape = [len(gtRects), len(detRects)] + recallMat = np.empty(outputShape) + precisionMat = np.empty(outputShape) + gtRectMat = np.zeros(len(gtRects), np.int8) + detRectMat = np.zeros(len(detRects), np.int8) + for gtNum in range(len(gtRects)): + for detNum in range(len(detRects)): + rG = gtRects[gtNum] + rD = detRects[detNum] + intersected_area = get_intersection(rG, rD) + rgDimensions = Polygon(rG).area + rdDimensions = Polygon(rD).area + recallMat[ + gtNum, + detNum] = 0 if rgDimensions == 0 else intersected_area / rgDimensions + precisionMat[ + gtNum, + detNum] = 0 if rdDimensions == 0 else intersected_area / rdDimensions + + # Find one-to-one matches + evaluationLog += "Find one-to-one matches\n" + for gtNum in range(len(gtRects)): + for detNum in range(len(detRects)): + if gtRectMat[gtNum] == 0 and detRectMat[ + detNum] == 0 and gtNum not in gtDontCareRectsNum and detNum not in detDontCareRectsNum: + match = one_to_one_match(gtNum, detNum) + if match is True: + #in deteval we have to make other validation before mark as one-to-one + if is_single_overlap(gtNum, detNum) is True: + rG = gtRects[gtNum] + rD = detRects[detNum] + normDist = center_distance(rG, rD) + normDist /= diag(rG) + diag(rD) + normDist *= 2.0 + if normDist < self.ev_param_ind_center_diff_thr: + gtRectMat[gtNum] = 1 + detRectMat[detNum] = 1 + recallAccum += self.mtype_oo_o + precisionAccum += self.mtype_oo_o + pairs.append({ + 'gt': gtNum, + 'det': detNum, + 'type': 'OO' + }) + evaluationLog += "Match GT #" + str( + gtNum) + " with Det #" + str( + detNum) + "\n" + else: + evaluationLog += "Match Discarded GT #" + str( + gtNum) + " with Det #" + str( + detNum) + " normDist: " + str( + normDist) + " \n" + else: + evaluationLog += "Match Discarded GT #" + 
str( + gtNum) + " with Det #" + str( + detNum) + " not single overlap\n" + # Find one-to-many matches + evaluationLog += "Find one-to-many matches\n" + for gtNum in range(len(gtRects)): + if gtNum not in gtDontCareRectsNum: + match, matchesDet = one_to_many_match(gtNum) + if match is True: + evaluationLog += "num_overlaps_gt=" + str( + num_overlaps_gt(gtNum)) + #in deteval we have to make other validation before mark as one-to-one + if num_overlaps_gt(gtNum) >= 2: + gtRectMat[gtNum] = 1 + recallAccum += (self.mtype_oo_o + if len(matchesDet) == 1 else + self.mtype_om_o) + precisionAccum += (self.mtype_oo_o + if len(matchesDet) == 1 else + self.mtype_om_o * + len(matchesDet)) + pairs.append({ + 'gt': gtNum, + 'det': matchesDet, + 'type': 'OO' if len(matchesDet) == 1 else 'OM' + }) + for detNum in matchesDet: + detRectMat[detNum] = 1 + evaluationLog += "Match GT #" + str( + gtNum) + " with Det #" + str(matchesDet) + "\n" + else: + evaluationLog += "Match Discarded GT #" + str( + gtNum) + " with Det #" + str( + matchesDet) + " not single overlap\n" + + # Find many-to-one matches + evaluationLog += "Find many-to-one matches\n" + for detNum in range(len(detRects)): + if detNum not in detDontCareRectsNum: + match, matchesGt = many_to_one_match(detNum) + if match is True: + #in deteval we have to make other validation before mark as one-to-one + if num_overlaps_det(detNum) >= 2: + detRectMat[detNum] = 1 + recallAccum += (self.mtype_oo_o + if len(matchesGt) == 1 else + self.mtype_om_m * len(matchesGt)) + precisionAccum += (self.mtype_oo_o + if len(matchesGt) == 1 else + self.mtype_om_m) + pairs.append({ + 'gt': matchesGt, + 'det': detNum, + 'type': 'OO' if len(matchesGt) == 1 else 'MO' + }) + for gtNum in matchesGt: + gtRectMat[gtNum] = 1 + evaluationLog += "Match GT #" + str( + matchesGt) + " with Det #" + str(detNum) + "\n" + else: + evaluationLog += "Match Discarded GT #" + str( + matchesGt) + " with Det #" + str( + detNum) + " not single overlap\n" + + numGtCare = (len(gtRects) - len(gtDontCareRectsNum)) + if numGtCare == 0: + recall = float(1) + precision = float(0) if len(detRects) > 0 else float(1) + else: + recall = float(recallAccum) / numGtCare + precision = float(0) if ( + len(detRects) - len(detDontCareRectsNum) + ) == 0 else float(precisionAccum) / ( + len(detRects) - len(detDontCareRectsNum)) + hmean = 0 if (precision + recall + ) == 0 else 2.0 * precision * recall / ( + precision + recall) + + numGtCare = len(gtRects) - len(gtDontCareRectsNum) + numDetCare = len(detRects) - len(detDontCareRectsNum) + + perSampleMetrics = { + 'precision': precision, + 'recall': recall, + 'hmean': hmean, + 'pairs': pairs, + 'recallMat': [] if len(detRects) > 100 else recallMat.tolist(), + 'precisionMat': [] + if len(detRects) > 100 else precisionMat.tolist(), + 'gtPolPoints': gtPolPoints, + 'detPolPoints': detPolPoints, + 'gtCare': numGtCare, + 'detCare': numDetCare, + 'gtDontCare': gtDontCareRectsNum, + 'detDontCare': detDontCareRectsNum, + 'recallAccum': recallAccum, + 'precisionAccum': precisionAccum, + 'evaluationLog': evaluationLog + } + + return perSampleMetrics + + def combine_results(self, results): + numGt = 0 + numDet = 0 + methodRecallSum = 0 + methodPrecisionSum = 0 + + for result in results: + numGt += result['gtCare'] + numDet += result['detCare'] + methodRecallSum += result['recallAccum'] + methodPrecisionSum += result['precisionAccum'] + + methodRecall = 0 if numGt == 0 else methodRecallSum / numGt + methodPrecision = 0 if numDet == 0 else methodPrecisionSum / numDet + methodHmean = 0 
if methodRecall + methodPrecision == 0 else 2 * methodRecall * methodPrecision / ( + methodRecall + methodPrecision) + + methodMetrics = { + 'precision': methodPrecision, + 'recall': methodRecall, + 'hmean': methodHmean + } + + return methodMetrics + + +if __name__ == '__main__': + evaluator = DetectionDetEvalEvaluator() + gts = [[{ + 'points': [(0, 0), (1, 0), (1, 1), (0, 1)], + 'text': 1234, + 'ignore': False, + }, { + 'points': [(2, 2), (3, 2), (3, 3), (2, 3)], + 'text': 5678, + 'ignore': True, + }]] + preds = [[{ + 'points': [(0.1, 0.1), (1, 0), (1, 1), (0, 1)], + 'text': 123, + 'ignore': False, + }]] + results = [] + for gt, pred in zip(gts, preds): + results.append(evaluator.evaluate_image(gt, pred)) + metrics = evaluator.combine_results(results) + print(metrics) diff --git a/benchmark/PaddleOCR_DBNet/utils/ocr_metric/icdar2015/detection/icdar2013.py b/benchmark/PaddleOCR_DBNet/utils/ocr_metric/icdar2015/detection/icdar2013.py new file mode 100644 index 0000000000000000000000000000000000000000..7e8c86aae334dfdc1f35db91772e09c164e29d22 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/utils/ocr_metric/icdar2015/detection/icdar2013.py @@ -0,0 +1,346 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import math +from collections import namedtuple +import numpy as np +from shapely.geometry import Polygon + + +class DetectionICDAR2013Evaluator(object): + def __init__(self, + area_recall_constraint=0.8, + area_precision_constraint=0.4, + ev_param_ind_center_diff_thr=1, + mtype_oo_o=1.0, + mtype_om_o=0.8, + mtype_om_m=1.0): + + self.area_recall_constraint = area_recall_constraint + self.area_precision_constraint = area_precision_constraint + self.ev_param_ind_center_diff_thr = ev_param_ind_center_diff_thr + self.mtype_oo_o = mtype_oo_o + self.mtype_om_o = mtype_om_o + self.mtype_om_m = mtype_om_m + + def evaluate_image(self, gt, pred): + def get_union(pD, pG): + return Polygon(pD).union(Polygon(pG)).area + + def get_intersection_over_union(pD, pG): + return get_intersection(pD, pG) / get_union(pD, pG) + + def get_intersection(pD, pG): + return Polygon(pD).intersection(Polygon(pG)).area + + def one_to_one_match(row, col): + cont = 0 + for j in range(len(recallMat[0])): + if recallMat[row, + j] >= self.area_recall_constraint and precisionMat[ + row, j] >= self.area_precision_constraint: + cont = cont + 1 + if (cont != 1): + return False + cont = 0 + for i in range(len(recallMat)): + if recallMat[ + i, col] >= self.area_recall_constraint and precisionMat[ + i, col] >= self.area_precision_constraint: + cont = cont + 1 + if (cont != 1): + return False + + if recallMat[row, + col] >= self.area_recall_constraint and precisionMat[ + row, col] >= self.area_precision_constraint: + return True + return False + + def one_to_many_match(gtNum): + many_sum = 0 + detRects = [] + for detNum in range(len(recallMat[0])): + if gtRectMat[gtNum] == 0 and detRectMat[ + detNum] == 0 and detNum not in detDontCareRectsNum: + if precisionMat[gtNum, + detNum] >= self.area_precision_constraint: + many_sum += recallMat[gtNum, detNum] + detRects.append(detNum) + if round(many_sum, 4) >= self.area_recall_constraint: + return True, detRects + else: + return False, [] + + def many_to_one_match(detNum): + many_sum = 0 + gtRects = [] + for gtNum in range(len(recallMat)): + if gtRectMat[gtNum] == 0 and detRectMat[ + detNum] == 0 and gtNum not in gtDontCareRectsNum: + if recallMat[gtNum, detNum] >= self.area_recall_constraint: + many_sum += precisionMat[gtNum, detNum] + gtRects.append(gtNum) + if round(many_sum, 4) >= 
self.area_precision_constraint: + return True, gtRects + else: + return False, [] + + def center_distance(r1, r2): + return ((np.mean(r1, axis=0) - np.mean(r2, axis=0))**2).sum()**0.5 + + def diag(r): + r = np.array(r) + return ((r[:, 0].max() - r[:, 0].min())**2 + + (r[:, 1].max() - r[:, 1].min())**2)**0.5 + + perSampleMetrics = {} + + recall = 0 + precision = 0 + hmean = 0 + recallAccum = 0. + precisionAccum = 0. + gtRects = [] + detRects = [] + gtPolPoints = [] + detPolPoints = [] + gtDontCareRectsNum = [ + ] #Array of Ground Truth Rectangles' keys marked as don't Care + detDontCareRectsNum = [ + ] #Array of Detected Rectangles' matched with a don't Care GT + pairs = [] + evaluationLog = "" + + recallMat = np.empty([1, 1]) + precisionMat = np.empty([1, 1]) + + for n in range(len(gt)): + points = gt[n]['points'] + # transcription = gt[n]['text'] + dontCare = gt[n]['ignore'] + + if not Polygon(points).is_valid or not Polygon(points).is_simple: + continue + + gtRects.append(points) + gtPolPoints.append(points) + if dontCare: + gtDontCareRectsNum.append(len(gtRects) - 1) + + evaluationLog += "GT rectangles: " + str(len(gtRects)) + ( + " (" + str(len(gtDontCareRectsNum)) + " don't care)\n" + if len(gtDontCareRectsNum) > 0 else "\n") + + for n in range(len(pred)): + points = pred[n]['points'] + + if not Polygon(points).is_valid or not Polygon(points).is_simple: + continue + + detRect = points + detRects.append(detRect) + detPolPoints.append(points) + if len(gtDontCareRectsNum) > 0: + for dontCareRectNum in gtDontCareRectsNum: + dontCareRect = gtRects[dontCareRectNum] + intersected_area = get_intersection(dontCareRect, detRect) + rdDimensions = Polygon(detRect).area + if (rdDimensions == 0): + precision = 0 + else: + precision = intersected_area / rdDimensions + if (precision > self.area_precision_constraint): + detDontCareRectsNum.append(len(detRects) - 1) + break + + evaluationLog += "DET rectangles: " + str(len(detRects)) + ( + " (" + str(len(detDontCareRectsNum)) + " don't care)\n" + if len(detDontCareRectsNum) > 0 else "\n") + + if len(gtRects) == 0: + recall = 1 + precision = 0 if len(detRects) > 0 else 1 + + if len(detRects) > 0: + #Calculate recall and precision matrixs + outputShape = [len(gtRects), len(detRects)] + recallMat = np.empty(outputShape) + precisionMat = np.empty(outputShape) + gtRectMat = np.zeros(len(gtRects), np.int8) + detRectMat = np.zeros(len(detRects), np.int8) + for gtNum in range(len(gtRects)): + for detNum in range(len(detRects)): + rG = gtRects[gtNum] + rD = detRects[detNum] + intersected_area = get_intersection(rG, rD) + rgDimensions = Polygon(rG).area + rdDimensions = Polygon(rD).area + recallMat[ + gtNum, + detNum] = 0 if rgDimensions == 0 else intersected_area / rgDimensions + precisionMat[ + gtNum, + detNum] = 0 if rdDimensions == 0 else intersected_area / rdDimensions + + # Find one-to-one matches + evaluationLog += "Find one-to-one matches\n" + for gtNum in range(len(gtRects)): + for detNum in range(len(detRects)): + if gtRectMat[gtNum] == 0 and detRectMat[ + detNum] == 0 and gtNum not in gtDontCareRectsNum and detNum not in detDontCareRectsNum: + match = one_to_one_match(gtNum, detNum) + if match is True: + #in deteval we have to make other validation before mark as one-to-one + rG = gtRects[gtNum] + rD = detRects[detNum] + normDist = center_distance(rG, rD) + normDist /= diag(rG) + diag(rD) + normDist *= 2.0 + if normDist < self.ev_param_ind_center_diff_thr: + gtRectMat[gtNum] = 1 + detRectMat[detNum] = 1 + recallAccum += self.mtype_oo_o + 
precisionAccum += self.mtype_oo_o + pairs.append({ + 'gt': gtNum, + 'det': detNum, + 'type': 'OO' + }) + evaluationLog += "Match GT #" + str( + gtNum) + " with Det #" + str(detNum) + "\n" + else: + evaluationLog += "Match Discarded GT #" + str( + gtNum) + " with Det #" + str( + detNum) + " normDist: " + str( + normDist) + " \n" + # Find one-to-many matches + evaluationLog += "Find one-to-many matches\n" + for gtNum in range(len(gtRects)): + if gtNum not in gtDontCareRectsNum: + match, matchesDet = one_to_many_match(gtNum) + if match is True: + evaluationLog += "num_overlaps_gt=" + str( + num_overlaps_gt(gtNum)) + gtRectMat[gtNum] = 1 + recallAccum += (self.mtype_oo_o if len(matchesDet) == 1 + else self.mtype_om_o) + precisionAccum += (self.mtype_oo_o + if len(matchesDet) == 1 else + self.mtype_om_o * len(matchesDet)) + pairs.append({ + 'gt': gtNum, + 'det': matchesDet, + 'type': 'OO' if len(matchesDet) == 1 else 'OM' + }) + for detNum in matchesDet: + detRectMat[detNum] = 1 + evaluationLog += "Match GT #" + str( + gtNum) + " with Det #" + str(matchesDet) + "\n" + + # Find many-to-one matches + evaluationLog += "Find many-to-one matches\n" + for detNum in range(len(detRects)): + if detNum not in detDontCareRectsNum: + match, matchesGt = many_to_one_match(detNum) + if match is True: + detRectMat[detNum] = 1 + recallAccum += (self.mtype_oo_o if len(matchesGt) == 1 + else self.mtype_om_m * len(matchesGt)) + precisionAccum += (self.mtype_oo_o + if len(matchesGt) == 1 else + self.mtype_om_m) + pairs.append({ + 'gt': matchesGt, + 'det': detNum, + 'type': 'OO' if len(matchesGt) == 1 else 'MO' + }) + for gtNum in matchesGt: + gtRectMat[gtNum] = 1 + evaluationLog += "Match GT #" + str( + matchesGt) + " with Det #" + str(detNum) + "\n" + + numGtCare = (len(gtRects) - len(gtDontCareRectsNum)) + if numGtCare == 0: + recall = float(1) + precision = float(0) if len(detRects) > 0 else float(1) + else: + recall = float(recallAccum) / numGtCare + precision = float(0) if ( + len(detRects) - len(detDontCareRectsNum) + ) == 0 else float(precisionAccum) / ( + len(detRects) - len(detDontCareRectsNum)) + hmean = 0 if (precision + recall + ) == 0 else 2.0 * precision * recall / ( + precision + recall) + + numGtCare = len(gtRects) - len(gtDontCareRectsNum) + numDetCare = len(detRects) - len(detDontCareRectsNum) + + perSampleMetrics = { + 'precision': precision, + 'recall': recall, + 'hmean': hmean, + 'pairs': pairs, + 'recallMat': [] if len(detRects) > 100 else recallMat.tolist(), + 'precisionMat': [] + if len(detRects) > 100 else precisionMat.tolist(), + 'gtPolPoints': gtPolPoints, + 'detPolPoints': detPolPoints, + 'gtCare': numGtCare, + 'detCare': numDetCare, + 'gtDontCare': gtDontCareRectsNum, + 'detDontCare': detDontCareRectsNum, + 'recallAccum': recallAccum, + 'precisionAccum': precisionAccum, + 'evaluationLog': evaluationLog + } + + return perSampleMetrics + + def combine_results(self, results): + numGt = 0 + numDet = 0 + methodRecallSum = 0 + methodPrecisionSum = 0 + + for result in results: + numGt += result['gtCare'] + numDet += result['detCare'] + methodRecallSum += result['recallAccum'] + methodPrecisionSum += result['precisionAccum'] + + methodRecall = 0 if numGt == 0 else methodRecallSum / numGt + methodPrecision = 0 if numDet == 0 else methodPrecisionSum / numDet + methodHmean = 0 if methodRecall + methodPrecision == 0 else 2 * methodRecall * methodPrecision / ( + methodRecall + methodPrecision) + + methodMetrics = { + 'precision': methodPrecision, + 'recall': methodRecall, + 'hmean': methodHmean + } 
+ + return methodMetrics + + +if __name__ == '__main__': + evaluator = DetectionICDAR2013Evaluator() + gts = [[{ + 'points': [(0, 0), (1, 0), (1, 1), (0, 1)], + 'text': 1234, + 'ignore': False, + }, { + 'points': [(2, 2), (3, 2), (3, 3), (2, 3)], + 'text': 5678, + 'ignore': True, + }]] + preds = [[{ + 'points': [(0.1, 0.1), (1, 0), (1, 1), (0, 1)], + 'text': 123, + 'ignore': False, + }]] + results = [] + for gt, pred in zip(gts, preds): + results.append(evaluator.evaluate_image(gt, pred)) + metrics = evaluator.combine_results(results) + print(metrics) diff --git a/benchmark/PaddleOCR_DBNet/utils/ocr_metric/icdar2015/detection/iou.py b/benchmark/PaddleOCR_DBNet/utils/ocr_metric/icdar2015/detection/iou.py new file mode 100644 index 0000000000000000000000000000000000000000..5f9533b3c37e7d11ce2aaa276211baae14d1fb97 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/utils/ocr_metric/icdar2015/detection/iou.py @@ -0,0 +1,263 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +from collections import namedtuple +import numpy as np +from shapely.geometry import Polygon +import cv2 + + +def iou_rotate(box_a, box_b, method='union'): + rect_a = cv2.minAreaRect(box_a) + rect_b = cv2.minAreaRect(box_b) + r1 = cv2.rotatedRectangleIntersection(rect_a, rect_b) + if r1[0] == 0: + return 0 + else: + inter_area = cv2.contourArea(r1[1]) + area_a = cv2.contourArea(box_a) + area_b = cv2.contourArea(box_b) + union_area = area_a + area_b - inter_area + if union_area == 0 or inter_area == 0: + return 0 + if method == 'union': + iou = inter_area / union_area + elif method == 'intersection': + iou = inter_area / min(area_a, area_b) + else: + raise NotImplementedError + return iou + + +class DetectionIoUEvaluator(object): + def __init__(self, + is_output_polygon=False, + iou_constraint=0.5, + area_precision_constraint=0.5): + self.is_output_polygon = is_output_polygon + self.iou_constraint = iou_constraint + self.area_precision_constraint = area_precision_constraint + + def evaluate_image(self, gt, pred): + def get_union(pD, pG): + return Polygon(pD).union(Polygon(pG)).area + + def get_intersection_over_union(pD, pG): + return get_intersection(pD, pG) / get_union(pD, pG) + + def get_intersection(pD, pG): + return Polygon(pD).intersection(Polygon(pG)).area + + def compute_ap(confList, matchList, numGtCare): + correct = 0 + AP = 0 + if len(confList) > 0: + confList = np.array(confList) + matchList = np.array(matchList) + sorted_ind = np.argsort(-confList) + confList = confList[sorted_ind] + matchList = matchList[sorted_ind] + for n in range(len(confList)): + match = matchList[n] + if match: + correct += 1 + AP += float(correct) / (n + 1) + + if numGtCare > 0: + AP /= numGtCare + + return AP + + perSampleMetrics = {} + + matchedSum = 0 + + Rectangle = namedtuple('Rectangle', 'xmin ymin xmax ymax') + + numGlobalCareGt = 0 + numGlobalCareDet = 0 + + arrGlobalConfidences = [] + arrGlobalMatches = [] + + recall = 0 + precision = 0 + hmean = 0 + + detMatched = 0 + + iouMat = np.empty([1, 1]) + + gtPols = [] + detPols = [] + + gtPolPoints = [] + detPolPoints = [] + + # Array of Ground Truth Polygons' keys marked as don't Care + gtDontCarePolsNum = [] + # Array of Detected Polygons' matched with a don't Care GT + detDontCarePolsNum = [] + + pairs = [] + detMatchedNums = [] + + arrSampleConfidences = [] + arrSampleMatch = [] + + evaluationLog = "" + + for n in range(len(gt)): + points = gt[n]['points'] + # transcription = gt[n]['text'] + dontCare = gt[n]['ignore'] + + if not Polygon(points).is_valid or not Polygon(points).is_simple: 
+ continue + + gtPol = points + gtPols.append(gtPol) + gtPolPoints.append(points) + if dontCare: + gtDontCarePolsNum.append(len(gtPols) - 1) + + evaluationLog += "GT polygons: " + str(len(gtPols)) + ( + " (" + str(len(gtDontCarePolsNum)) + " don't care)\n" + if len(gtDontCarePolsNum) > 0 else "\n") + + for n in range(len(pred)): + points = pred[n]['points'] + if not Polygon(points).is_valid or not Polygon(points).is_simple: + continue + + detPol = points + detPols.append(detPol) + detPolPoints.append(points) + if len(gtDontCarePolsNum) > 0: + for dontCarePol in gtDontCarePolsNum: + dontCarePol = gtPols[dontCarePol] + intersected_area = get_intersection(dontCarePol, detPol) + pdDimensions = Polygon(detPol).area + precision = 0 if pdDimensions == 0 else intersected_area / pdDimensions + if (precision > self.area_precision_constraint): + detDontCarePolsNum.append(len(detPols) - 1) + break + + evaluationLog += "DET polygons: " + str(len(detPols)) + ( + " (" + str(len(detDontCarePolsNum)) + " don't care)\n" + if len(detDontCarePolsNum) > 0 else "\n") + + if len(gtPols) > 0 and len(detPols) > 0: + # Calculate IoU and precision matrixs + outputShape = [len(gtPols), len(detPols)] + iouMat = np.empty(outputShape) + gtRectMat = np.zeros(len(gtPols), np.int8) + detRectMat = np.zeros(len(detPols), np.int8) + if self.is_output_polygon: + for gtNum in range(len(gtPols)): + for detNum in range(len(detPols)): + pG = gtPols[gtNum] + pD = detPols[detNum] + iouMat[gtNum, detNum] = get_intersection_over_union(pD, + pG) + else: + # gtPols = np.float32(gtPols) + # detPols = np.float32(detPols) + for gtNum in range(len(gtPols)): + for detNum in range(len(detPols)): + pG = np.float32(gtPols[gtNum]) + pD = np.float32(detPols[detNum]) + iouMat[gtNum, detNum] = iou_rotate(pD, pG) + for gtNum in range(len(gtPols)): + for detNum in range(len(detPols)): + if gtRectMat[gtNum] == 0 and detRectMat[ + detNum] == 0 and gtNum not in gtDontCarePolsNum and detNum not in detDontCarePolsNum: + if iouMat[gtNum, detNum] > self.iou_constraint: + gtRectMat[gtNum] = 1 + detRectMat[detNum] = 1 + detMatched += 1 + pairs.append({'gt': gtNum, 'det': detNum}) + detMatchedNums.append(detNum) + evaluationLog += "Match GT #" + \ + str(gtNum) + " with Det #" + str(detNum) + "\n" + + numGtCare = (len(gtPols) - len(gtDontCarePolsNum)) + numDetCare = (len(detPols) - len(detDontCarePolsNum)) + if numGtCare == 0: + recall = float(1) + precision = float(0) if numDetCare > 0 else float(1) + else: + recall = float(detMatched) / numGtCare + precision = 0 if numDetCare == 0 else float(detMatched) / numDetCare + + hmean = 0 if (precision + recall) == 0 else 2.0 * \ + precision * recall / (precision + recall) + + matchedSum += detMatched + numGlobalCareGt += numGtCare + numGlobalCareDet += numDetCare + + perSampleMetrics = { + 'precision': precision, + 'recall': recall, + 'hmean': hmean, + 'pairs': pairs, + 'iouMat': [] if len(detPols) > 100 else iouMat.tolist(), + 'gtPolPoints': gtPolPoints, + 'detPolPoints': detPolPoints, + 'gtCare': numGtCare, + 'detCare': numDetCare, + 'gtDontCare': gtDontCarePolsNum, + 'detDontCare': detDontCarePolsNum, + 'detMatched': detMatched, + 'evaluationLog': evaluationLog + } + + return perSampleMetrics + + def combine_results(self, results): + numGlobalCareGt = 0 + numGlobalCareDet = 0 + matchedSum = 0 + for result in results: + numGlobalCareGt += result['gtCare'] + numGlobalCareDet += result['detCare'] + matchedSum += result['detMatched'] + + methodRecall = 0 if numGlobalCareGt == 0 else float( + matchedSum) / 
numGlobalCareGt + methodPrecision = 0 if numGlobalCareDet == 0 else float( + matchedSum) / numGlobalCareDet + methodHmean = 0 if methodRecall + methodPrecision == 0 else 2 * \ + methodRecall * methodPrecision / ( + methodRecall + methodPrecision) + + methodMetrics = { + 'precision': methodPrecision, + 'recall': methodRecall, + 'hmean': methodHmean + } + + return methodMetrics + + +if __name__ == '__main__': + evaluator = DetectionIoUEvaluator() + preds = [[{ + 'points': [(0.1, 0.1), (0.5, 0), (0.5, 1), (0, 1)], + 'text': 1234, + 'ignore': False, + }, { + 'points': [(0.5, 0.1), (1, 0), (1, 1), (0.5, 1)], + 'text': 5678, + 'ignore': False, + }]] + gts = [[{ + 'points': [(0.1, 0.1), (1, 0), (1, 1), (0, 1)], + 'text': 123, + 'ignore': False, + }]] + results = [] + for gt, pred in zip(gts, preds): + results.append(evaluator.evaluate_image(gt, pred)) + metrics = evaluator.combine_results(results) + print(metrics) diff --git a/benchmark/PaddleOCR_DBNet/utils/ocr_metric/icdar2015/detection/mtwi2018.py b/benchmark/PaddleOCR_DBNet/utils/ocr_metric/icdar2015/detection/mtwi2018.py new file mode 100644 index 0000000000000000000000000000000000000000..8e319aacf5a395a121e94bd2e9d123cec9279e7e --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/utils/ocr_metric/icdar2015/detection/mtwi2018.py @@ -0,0 +1,335 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import math +from collections import namedtuple +import numpy as np +from shapely.geometry import Polygon + + +class DetectionMTWI2018Evaluator(object): + def __init__( + self, + area_recall_constraint=0.7, + area_precision_constraint=0.7, + ev_param_ind_center_diff_thr=1, ): + + self.area_recall_constraint = area_recall_constraint + self.area_precision_constraint = area_precision_constraint + self.ev_param_ind_center_diff_thr = ev_param_ind_center_diff_thr + + def evaluate_image(self, gt, pred): + def get_union(pD, pG): + return Polygon(pD).union(Polygon(pG)).area + + def get_intersection_over_union(pD, pG): + return get_intersection(pD, pG) / get_union(pD, pG) + + def get_intersection(pD, pG): + return Polygon(pD).intersection(Polygon(pG)).area + + def one_to_one_match(row, col): + cont = 0 + for j in range(len(recallMat[0])): + if recallMat[row, + j] >= self.area_recall_constraint and precisionMat[ + row, j] >= self.area_precision_constraint: + cont = cont + 1 + if (cont != 1): + return False + cont = 0 + for i in range(len(recallMat)): + if recallMat[ + i, col] >= self.area_recall_constraint and precisionMat[ + i, col] >= self.area_precision_constraint: + cont = cont + 1 + if (cont != 1): + return False + + if recallMat[row, + col] >= self.area_recall_constraint and precisionMat[ + row, col] >= self.area_precision_constraint: + return True + return False + + def one_to_many_match(gtNum): + many_sum = 0 + detRects = [] + for detNum in range(len(recallMat[0])): + if gtRectMat[gtNum] == 0 and detRectMat[ + detNum] == 0 and detNum not in detDontCareRectsNum: + if precisionMat[gtNum, + detNum] >= self.area_precision_constraint: + many_sum += recallMat[gtNum, detNum] + detRects.append(detNum) + if round(many_sum, 4) >= self.area_recall_constraint: + return True, detRects + else: + return False, [] + + def many_to_one_match(detNum): + many_sum = 0 + gtRects = [] + for gtNum in range(len(recallMat)): + if gtRectMat[gtNum] == 0 and detRectMat[ + detNum] == 0 and gtNum not in gtDontCareRectsNum: + if recallMat[gtNum, detNum] >= self.area_recall_constraint: + many_sum += precisionMat[gtNum, detNum] + gtRects.append(gtNum) + if round(many_sum, 4) >= 
self.area_precision_constraint: + return True, gtRects + else: + return False, [] + + def center_distance(r1, r2): + return ((np.mean(r1, axis=0) - np.mean(r2, axis=0))**2).sum()**0.5 + + def diag(r): + r = np.array(r) + return ((r[:, 0].max() - r[:, 0].min())**2 + + (r[:, 1].max() - r[:, 1].min())**2)**0.5 + + perSampleMetrics = {} + + recall = 0 + precision = 0 + hmean = 0 + recallAccum = 0. + precisionAccum = 0. + gtRects = [] + detRects = [] + gtPolPoints = [] + detPolPoints = [] + gtDontCareRectsNum = [ + ] #Array of Ground Truth Rectangles' keys marked as don't Care + detDontCareRectsNum = [ + ] #Array of Detected Rectangles' matched with a don't Care GT + pairs = [] + evaluationLog = "" + + recallMat = np.empty([1, 1]) + precisionMat = np.empty([1, 1]) + + for n in range(len(gt)): + points = gt[n]['points'] + # transcription = gt[n]['text'] + dontCare = gt[n]['ignore'] + + if not Polygon(points).is_valid or not Polygon(points).is_simple: + continue + + gtRects.append(points) + gtPolPoints.append(points) + if dontCare: + gtDontCareRectsNum.append(len(gtRects) - 1) + + evaluationLog += "GT rectangles: " + str(len(gtRects)) + ( + " (" + str(len(gtDontCareRectsNum)) + " don't care)\n" + if len(gtDontCareRectsNum) > 0 else "\n") + + for n in range(len(pred)): + points = pred[n]['points'] + + if not Polygon(points).is_valid or not Polygon(points).is_simple: + continue + + detRect = points + detRects.append(detRect) + detPolPoints.append(points) + if len(gtDontCareRectsNum) > 0: + for dontCareRectNum in gtDontCareRectsNum: + dontCareRect = gtRects[dontCareRectNum] + intersected_area = get_intersection(dontCareRect, detRect) + rdDimensions = Polygon(detRect).area + if (rdDimensions == 0): + precision = 0 + else: + precision = intersected_area / rdDimensions + if (precision > 0.5): + detDontCareRectsNum.append(len(detRects) - 1) + break + + evaluationLog += "DET rectangles: " + str(len(detRects)) + ( + " (" + str(len(detDontCareRectsNum)) + " don't care)\n" + if len(detDontCareRectsNum) > 0 else "\n") + + if len(gtRects) == 0: + recall = 1 + precision = 0 if len(detRects) > 0 else 1 + + if len(detRects) > 0: + #Calculate recall and precision matrixs + outputShape = [len(gtRects), len(detRects)] + recallMat = np.empty(outputShape) + precisionMat = np.empty(outputShape) + gtRectMat = np.zeros(len(gtRects), np.int8) + detRectMat = np.zeros(len(detRects), np.int8) + for gtNum in range(len(gtRects)): + for detNum in range(len(detRects)): + rG = gtRects[gtNum] + rD = detRects[detNum] + intersected_area = get_intersection(rG, rD) + rgDimensions = Polygon(rG).area + rdDimensions = Polygon(rD).area + recallMat[ + gtNum, + detNum] = 0 if rgDimensions == 0 else intersected_area / rgDimensions + precisionMat[ + gtNum, + detNum] = 0 if rdDimensions == 0 else intersected_area / rdDimensions + + # Find one-to-one matches + evaluationLog += "Find one-to-one matches\n" + for gtNum in range(len(gtRects)): + for detNum in range(len(detRects)): + if gtRectMat[gtNum] == 0 and detRectMat[ + detNum] == 0 and gtNum not in gtDontCareRectsNum and detNum not in detDontCareRectsNum: + match = one_to_one_match(gtNum, detNum) + if match is True: + #in deteval we have to make other validation before mark as one-to-one + rG = gtRects[gtNum] + rD = detRects[detNum] + normDist = center_distance(rG, rD) + normDist /= diag(rG) + diag(rD) + normDist *= 2.0 + if normDist < self.ev_param_ind_center_diff_thr: + gtRectMat[gtNum] = 1 + detRectMat[detNum] = 1 + recallAccum += 1.0 + precisionAccum += 1.0 + pairs.append({ + 'gt': 
gtNum, + 'det': detNum, + 'type': 'OO' + }) + evaluationLog += "Match GT #" + str( + gtNum) + " with Det #" + str(detNum) + "\n" + else: + evaluationLog += "Match Discarded GT #" + str( + gtNum) + " with Det #" + str( + detNum) + " normDist: " + str( + normDist) + " \n" + # Find one-to-many matches + evaluationLog += "Find one-to-many matches\n" + for gtNum in range(len(gtRects)): + if gtNum not in gtDontCareRectsNum: + match, matchesDet = one_to_many_match(gtNum) + if match is True: + gtRectMat[gtNum] = 1 + recallAccum += 1.0 + precisionAccum += len(matchesDet) / ( + 1 + math.log(len(matchesDet))) + pairs.append({ + 'gt': gtNum, + 'det': matchesDet, + 'type': 'OO' if len(matchesDet) == 1 else 'OM' + }) + for detNum in matchesDet: + detRectMat[detNum] = 1 + evaluationLog += "Match GT #" + str( + gtNum) + " with Det #" + str(matchesDet) + "\n" + + # Find many-to-one matches + evaluationLog += "Find many-to-one matches\n" + for detNum in range(len(detRects)): + if detNum not in detDontCareRectsNum: + match, matchesGt = many_to_one_match(detNum) + if match is True: + detRectMat[detNum] = 1 + recallAccum += len(matchesGt) / ( + 1 + math.log(len(matchesGt))) + precisionAccum += 1.0 + pairs.append({ + 'gt': matchesGt, + 'det': detNum, + 'type': 'OO' if len(matchesGt) == 1 else 'MO' + }) + for gtNum in matchesGt: + gtRectMat[gtNum] = 1 + evaluationLog += "Match GT #" + str( + matchesGt) + " with Det #" + str(detNum) + "\n" + + numGtCare = (len(gtRects) - len(gtDontCareRectsNum)) + if numGtCare == 0: + recall = float(1) + precision = float(0) if len(detRects) > 0 else float(1) + else: + recall = float(recallAccum) / numGtCare + precision = float(0) if ( + len(detRects) - len(detDontCareRectsNum) + ) == 0 else float(precisionAccum) / ( + len(detRects) - len(detDontCareRectsNum)) + hmean = 0 if (precision + recall + ) == 0 else 2.0 * precision * recall / ( + precision + recall) + + numGtCare = len(gtRects) - len(gtDontCareRectsNum) + numDetCare = len(detRects) - len(detDontCareRectsNum) + + perSampleMetrics = { + 'precision': precision, + 'recall': recall, + 'hmean': hmean, + 'pairs': pairs, + 'recallMat': [] if len(detRects) > 100 else recallMat.tolist(), + 'precisionMat': [] + if len(detRects) > 100 else precisionMat.tolist(), + 'gtPolPoints': gtPolPoints, + 'detPolPoints': detPolPoints, + 'gtCare': numGtCare, + 'detCare': numDetCare, + 'gtDontCare': gtDontCareRectsNum, + 'detDontCare': detDontCareRectsNum, + 'recallAccum': recallAccum, + 'precisionAccum': precisionAccum, + 'evaluationLog': evaluationLog + } + + return perSampleMetrics + + def combine_results(self, results): + numGt = 0 + numDet = 0 + methodRecallSum = 0 + methodPrecisionSum = 0 + + for result in results: + numGt += result['gtCare'] + numDet += result['detCare'] + methodRecallSum += result['recallAccum'] + methodPrecisionSum += result['precisionAccum'] + + methodRecall = 0 if numGt == 0 else methodRecallSum / numGt + methodPrecision = 0 if numDet == 0 else methodPrecisionSum / numDet + methodHmean = 0 if methodRecall + methodPrecision == 0 else 2 * methodRecall * methodPrecision / ( + methodRecall + methodPrecision) + + methodMetrics = { + 'precision': methodPrecision, + 'recall': methodRecall, + 'hmean': methodHmean + } + + return methodMetrics + + +if __name__ == '__main__': + evaluator = DetectionICDAR2013Evaluator() + gts = [[{ + 'points': [(0, 0), (1, 0), (1, 1), (0, 1)], + 'text': 1234, + 'ignore': False, + }, { + 'points': [(2, 2), (3, 2), (3, 3), (2, 3)], + 'text': 5678, + 'ignore': True, + }]] + preds = [[{ + 'points': 
[(0.1, 0.1), (1, 0), (1, 1), (0, 1)], + 'text': 123, + 'ignore': False, + }]] + results = [] + for gt, pred in zip(gts, preds): + results.append(evaluator.evaluate_image(gt, pred)) + metrics = evaluator.combine_results(results) + print(metrics) diff --git a/benchmark/PaddleOCR_DBNet/utils/ocr_metric/icdar2015/quad_metric.py b/benchmark/PaddleOCR_DBNet/utils/ocr_metric/icdar2015/quad_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..e7e403a31c16af6b2e0e533139f2257cf6135c8a --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/utils/ocr_metric/icdar2015/quad_metric.py @@ -0,0 +1,98 @@ +import numpy as np + +from .detection.iou import DetectionIoUEvaluator + + +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + return self + + +class QuadMetric(): + def __init__(self, is_output_polygon=False): + self.is_output_polygon = is_output_polygon + self.evaluator = DetectionIoUEvaluator( + is_output_polygon=is_output_polygon) + + def measure(self, batch, output, box_thresh=0.6): + ''' + batch: (image, polygons, ignore_tags + batch: a dict produced by dataloaders. + image: tensor of shape (N, C, H, W). + polygons: tensor of shape (N, K, 4, 2), the polygons of objective regions. + ignore_tags: tensor of shape (N, K), indicates whether a region is ignorable or not. + shape: the original shape of images. + filename: the original filenames of images. + output: (polygons, ...) + ''' + results = [] + gt_polyons_batch = batch['text_polys'] + ignore_tags_batch = batch['ignore_tags'] + pred_polygons_batch = np.array(output[0]) + pred_scores_batch = np.array(output[1]) + for polygons, pred_polygons, pred_scores, ignore_tags in zip( + gt_polyons_batch, pred_polygons_batch, pred_scores_batch, + ignore_tags_batch): + gt = [ + dict( + points=np.int64(polygons[i]), ignore=ignore_tags[i]) + for i in range(len(polygons)) + ] + if self.is_output_polygon: + pred = [ + dict(points=pred_polygons[i]) + for i in range(len(pred_polygons)) + ] + else: + pred = [] + # print(pred_polygons.shape) + for i in range(pred_polygons.shape[0]): + if pred_scores[i] >= box_thresh: + # print(pred_polygons[i,:,:].tolist()) + pred.append( + dict(points=pred_polygons[i, :, :].astype(np.int))) + # pred = [dict(points=pred_polygons[i,:,:].tolist()) if pred_scores[i] >= box_thresh for i in range(pred_polygons.shape[0])] + results.append(self.evaluator.evaluate_image(gt, pred)) + return results + + def validate_measure(self, batch, output, box_thresh=0.6): + return self.measure(batch, output, box_thresh) + + def evaluate_measure(self, batch, output): + return self.measure(batch, output), np.linspace( + 0, batch['image'].shape[0]).tolist() + + def gather_measure(self, raw_metrics): + raw_metrics = [ + image_metrics + for batch_metrics in raw_metrics for image_metrics in batch_metrics + ] + + result = self.evaluator.combine_results(raw_metrics) + + precision = AverageMeter() + recall = AverageMeter() + fmeasure = AverageMeter() + + precision.update(result['precision'], n=len(raw_metrics)) + recall.update(result['recall'], n=len(raw_metrics)) + fmeasure_score = 2 * precision.val * recall.val / ( + precision.val + recall.val + 1e-8) + fmeasure.update(fmeasure_score) + + return {'precision': precision, 'recall': recall, 'fmeasure': fmeasure} diff --git 
a/benchmark/PaddleOCR_DBNet/utils/profiler.py b/benchmark/PaddleOCR_DBNet/utils/profiler.py new file mode 100644 index 0000000000000000000000000000000000000000..e64afd6a0d8cfd860920916acfbf168d58dfff2d --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/utils/profiler.py @@ -0,0 +1,110 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import paddle + +# A global variable to record the number of calling times for profiler +# functions. It is used to specify the tracing range of training steps. +_profiler_step_id = 0 + +# A global variable to avoid parsing from string every time. +_profiler_options = None + + +class ProfilerOptions(object): + ''' + Use a string to initialize a ProfilerOptions. + The string should be in the format: "key1=value1;key2=value;key3=value3". + For example: + "profile_path=model.profile" + "batch_range=[50, 60]; profile_path=model.profile" + "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile" + ProfilerOptions supports following key-value pair: + batch_range - a integer list, e.g. [100, 110]. + state - a string, the optional values are 'CPU', 'GPU' or 'All'. + sorted_key - a string, the optional values are 'calls', 'total', + 'max', 'min' or 'ave. + tracer_option - a string, the optional values are 'Default', 'OpDetail', + 'AllOpDetail'. + profile_path - a string, the path to save the serialized profile data, + which can be used to generate a timeline. + exit_on_finished - a boolean. + ''' + + def __init__(self, options_str): + assert isinstance(options_str, str) + + self._options = { + 'batch_range': [10, 20], + 'state': 'All', + 'sorted_key': 'total', + 'tracer_option': 'Default', + 'profile_path': '/tmp/profile', + 'exit_on_finished': True + } + self._parse_from_string(options_str) + + def _parse_from_string(self, options_str): + for kv in options_str.replace(' ', '').split(';'): + key, value = kv.split('=') + if key == 'batch_range': + value_list = value.replace('[', '').replace(']', '').split(',') + value_list = list(map(int, value_list)) + if len(value_list) >= 2 and value_list[0] >= 0 and value_list[ + 1] > value_list[0]: + self._options[key] = value_list + elif key == 'exit_on_finished': + self._options[key] = value.lower() in ("yes", "true", "t", "1") + elif key in [ + 'state', 'sorted_key', 'tracer_option', 'profile_path' + ]: + self._options[key] = value + + def __getitem__(self, name): + if self._options.get(name, None) is None: + raise ValueError( + "ProfilerOptions does not have an option named %s." % name) + return self._options[name] + + +def add_profiler_step(options_str=None): + ''' + Enable the operator-level timing using PaddlePaddle's profiler. + The profiler uses a independent variable to count the profiler steps. + One call of this function is treated as a profiler step. + + Args: + profiler_options - a string to initialize the ProfilerOptions. + Default is None, and the profiler is disabled. 
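+
+    Example (a sketch; the option string uses the same format documented in
+    ProfilerOptions above):
+        add_profiler_step("batch_range=[50, 60]; profile_path=model.profile")
+        starts the profiler at training step 50, stops it and saves the
+        serialized data to 'model.profile' at step 60, and then exits unless
+        'exit_on_finished=false' is also passed.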
+ ''' + if options_str is None: + return + + global _profiler_step_id + global _profiler_options + + if _profiler_options is None: + _profiler_options = ProfilerOptions(options_str) + + if _profiler_step_id == _profiler_options['batch_range'][0]: + paddle.utils.profiler.start_profiler(_profiler_options['state'], + _profiler_options['tracer_option']) + elif _profiler_step_id == _profiler_options['batch_range'][1]: + paddle.utils.profiler.stop_profiler(_profiler_options['sorted_key'], + _profiler_options['profile_path']) + if _profiler_options['exit_on_finished']: + sys.exit(0) + + _profiler_step_id += 1 diff --git a/benchmark/PaddleOCR_DBNet/utils/schedulers.py b/benchmark/PaddleOCR_DBNet/utils/schedulers.py new file mode 100644 index 0000000000000000000000000000000000000000..1b6fb7d285594c06ec146c301bc5deb26d4e9c26 --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/utils/schedulers.py @@ -0,0 +1,64 @@ +from paddle.optimizer import lr +import logging +__all__ = ['Polynomial'] + + +class Polynomial(object): + """ + Polynomial learning rate decay + Args: + learning_rate (float): The initial learning rate. It is a python float number. + epochs(int): The decay epoch size. It determines the decay cycle, when by_epoch is set to true, it will change to epochs=epochs*step_each_epoch. + step_each_epoch: all steps in each epoch. + end_lr(float, optional): The minimum final learning rate. Default: 0.0001. + power(float, optional): Power of polynomial. Default: 1.0. + warmup_epoch(int): The epoch numbers for LinearWarmup. Default: 0, , when by_epoch is set to true, it will change to warmup_epoch=warmup_epoch*step_each_epoch. + warmup_start_lr(float): Initial learning rate of warm up. Default: 0.0. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + by_epoch: Whether the set parameter is based on epoch or iter, when set to true,, epochs and warmup_epoch will be automatically multiplied by step_each_epoch. Default: True + """ + + def __init__(self, + learning_rate, + epochs, + step_each_epoch, + end_lr=0.0, + power=1.0, + warmup_epoch=0, + warmup_start_lr=0.0, + last_epoch=-1, + by_epoch=True, + **kwargs): + super().__init__() + if warmup_epoch >= epochs: + msg = f"When using warm up, the value of \"epochs\" must be greater than value of \"Optimizer.lr.warmup_epoch\". The value of \"Optimizer.lr.warmup_epoch\" has been set to {epochs}." + logging.warning(msg) + warmup_epoch = epochs + self.learning_rate = learning_rate + self.epochs = epochs + self.end_lr = end_lr + self.power = power + self.last_epoch = last_epoch + self.warmup_epoch = warmup_epoch + self.warmup_start_lr = warmup_start_lr + + if by_epoch: + self.epochs *= step_each_epoch + self.warmup_epoch = int(self.warmup_epoch * step_each_epoch) + + def __call__(self): + learning_rate = lr.PolynomialDecay( + learning_rate=self.learning_rate, + decay_steps=self.epochs, + end_lr=self.end_lr, + power=self.power, + last_epoch=self. 
+ last_epoch) if self.epochs > 0 else self.learning_rate + if self.warmup_epoch > 0: + learning_rate = lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=self.warmup_epoch, + start_lr=self.warmup_start_lr, + end_lr=self.learning_rate, + last_epoch=self.last_epoch) + return learning_rate diff --git a/benchmark/PaddleOCR_DBNet/utils/util.py b/benchmark/PaddleOCR_DBNet/utils/util.py new file mode 100644 index 0000000000000000000000000000000000000000..39bae764092fdca068a1792e065350209b45191a --- /dev/null +++ b/benchmark/PaddleOCR_DBNet/utils/util.py @@ -0,0 +1,367 @@ +# -*- coding: utf-8 -*- +# @Time : 2019/8/23 21:59 +# @Author : zhoujun +import json +import pathlib +import time +import os +import glob +import cv2 +import yaml +from typing import Mapping +import matplotlib.pyplot as plt +import numpy as np + +from argparse import ArgumentParser, RawDescriptionHelpFormatter + + +def _check_image_file(path): + img_end = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff', 'gif', 'pdf'} + return any([path.lower().endswith(e) for e in img_end]) + + +def get_image_file_list(img_file): + imgs_lists = [] + if img_file is None or not os.path.exists(img_file): + raise Exception("not found any img file in {}".format(img_file)) + + img_end = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff', 'gif', 'pdf'} + if os.path.isfile(img_file) and _check_image_file(img_file): + imgs_lists.append(img_file) + elif os.path.isdir(img_file): + for single_file in os.listdir(img_file): + file_path = os.path.join(img_file, single_file) + if os.path.isfile(file_path) and _check_image_file(file_path): + imgs_lists.append(file_path) + if len(imgs_lists) == 0: + raise Exception("not found any img file in {}".format(img_file)) + imgs_lists = sorted(imgs_lists) + return imgs_lists + + +def setup_logger(log_file_path: str=None): + import logging + logging._warn_preinit_stderr = 0 + logger = logging.getLogger('DBNet.paddle') + formatter = logging.Formatter( + '%(asctime)s %(name)s %(levelname)s: %(message)s') + ch = logging.StreamHandler() + ch.setFormatter(formatter) + logger.addHandler(ch) + if log_file_path is not None: + file_handle = logging.FileHandler(log_file_path) + file_handle.setFormatter(formatter) + logger.addHandler(file_handle) + logger.setLevel(logging.DEBUG) + return logger + + +# --exeTime +def exe_time(func): + def newFunc(*args, **args2): + t0 = time.time() + back = func(*args, **args2) + print("{} cost {:.3f}s".format(func.__name__, time.time() - t0)) + return back + + return newFunc + + +def load(file_path: str): + file_path = pathlib.Path(file_path) + func_dict = {'.txt': _load_txt, '.json': _load_json, '.list': _load_txt} + assert file_path.suffix in func_dict + return func_dict[file_path.suffix](file_path) + + +def _load_txt(file_path: str): + with open(file_path, 'r', encoding='utf8') as f: + content = [ + x.strip().strip('\ufeff').strip('\xef\xbb\xbf') + for x in f.readlines() + ] + return content + + +def _load_json(file_path: str): + with open(file_path, 'r', encoding='utf8') as f: + content = json.load(f) + return content + + +def save(data, file_path): + file_path = pathlib.Path(file_path) + func_dict = {'.txt': _save_txt, '.json': _save_json} + assert file_path.suffix in func_dict + return func_dict[file_path.suffix](data, file_path) + + +def _save_txt(data, file_path): + """ + 将一个list的数组写入txt文件里 + :param data: + :param file_path: + :return: + """ + if not isinstance(data, list): + data = [data] + with open(file_path, mode='w', encoding='utf8') as f: + f.write('\n'.join(data)) + + 
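+# Usage sketch for the suffix-dispatched load/save helpers (paths below are
+# illustrative only): 'load' picks a reader by extension ('.txt'/'.list' ->
+# list of stripped lines, '.json' -> parsed object) and 'save' mirrors it:
+#   labels = load('train_label.txt')        # list of stripped lines
+#   save(labels, 'train_label_copy.txt')    # newline-joined text file
+#   save({'hmean': 0.8}, 'metrics.json')    # pretty-printed JSON
+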
+def _save_json(data, file_path): + with open(file_path, 'w', encoding='utf-8') as json_file: + json.dump(data, json_file, ensure_ascii=False, indent=4) + + +def show_img(imgs: np.ndarray, title='img'): + color = (len(imgs.shape) == 3 and imgs.shape[-1] == 3) + imgs = np.expand_dims(imgs, axis=0) + for i, img in enumerate(imgs): + plt.figure() + plt.title('{}_{}'.format(title, i)) + plt.imshow(img, cmap=None if color else 'gray') + plt.show() + + +def draw_bbox(img_path, result, color=(255, 0, 0), thickness=2): + if isinstance(img_path, str): + img_path = cv2.imread(img_path) + # img_path = cv2.cvtColor(img_path, cv2.COLOR_BGR2RGB) + img_path = img_path.copy() + for point in result: + point = point.astype(int) + cv2.polylines(img_path, [point], True, color, thickness) + return img_path + + +def cal_text_score(texts, + gt_texts, + training_masks, + running_metric_text, + thred=0.5): + training_masks = training_masks.numpy() + pred_text = texts.numpy() * training_masks + pred_text[pred_text <= thred] = 0 + pred_text[pred_text > thred] = 1 + pred_text = pred_text.astype(np.int32) + gt_text = gt_texts.numpy() * training_masks + gt_text = gt_text.astype(np.int32) + running_metric_text.update(gt_text, pred_text) + score_text, _ = running_metric_text.get_scores() + return score_text + + +def order_points_clockwise(pts): + rect = np.zeros((4, 2), dtype="float32") + s = pts.sum(axis=1) + rect[0] = pts[np.argmin(s)] + rect[2] = pts[np.argmax(s)] + diff = np.diff(pts, axis=1) + rect[1] = pts[np.argmin(diff)] + rect[3] = pts[np.argmax(diff)] + return rect + + +def order_points_clockwise_list(pts): + pts = pts.tolist() + pts.sort(key=lambda x: (x[1], x[0])) + pts[:2] = sorted(pts[:2], key=lambda x: x[0]) + pts[2:] = sorted(pts[2:], key=lambda x: -x[0]) + pts = np.array(pts) + return pts + + +def get_datalist(train_data_path): + """ + 获取训练和验证的数据list + :param train_data_path: 训练的dataset文件列表,每个文件内以如下格式存储 ‘path/to/img\tlabel’ + :return: + """ + train_data = [] + for p in train_data_path: + with open(p, 'r', encoding='utf-8') as f: + for line in f.readlines(): + line = line.strip('\n').replace('.jpg ', '.jpg\t').split('\t') + if len(line) > 1: + img_path = pathlib.Path(line[0].strip(' ')) + label_path = pathlib.Path(line[1].strip(' ')) + if img_path.exists() and img_path.stat( + ).st_size > 0 and label_path.exists() and label_path.stat( + ).st_size > 0: + train_data.append((str(img_path), str(label_path))) + return train_data + + +def save_result(result_path, box_list, score_list, is_output_polygon): + if is_output_polygon: + with open(result_path, 'wt') as res: + for i, box in enumerate(box_list): + box = box.reshape(-1).tolist() + result = ",".join([str(int(x)) for x in box]) + score = score_list[i] + res.write(result + ',' + str(score) + "\n") + else: + with open(result_path, 'wt') as res: + for i, box in enumerate(box_list): + score = score_list[i] + box = box.reshape(-1).tolist() + result = ",".join([str(int(x)) for x in box]) + res.write(result + ',' + str(score) + "\n") + + +def expand_polygon(polygon): + """ + 对只有一个字符的框进行扩充 + """ + (x, y), (w, h), angle = cv2.minAreaRect(np.float32(polygon)) + if angle < -45: + w, h = h, w + angle += 90 + new_w = w + h + box = ((x, y), (new_w, h), angle) + points = cv2.boxPoints(box) + return order_points_clockwise(points) + + +def _merge_dict(config, merge_dct): + """ Recursive dict merge. Inspired by :meth:``dict.update()``, instead of + updating only top-level keys, dict_merge recurses down into dicts nested + to an arbitrary depth, updating keys. 
The ``merge_dct`` is merged into + ``dct``. + Args: + config: dict onto which the merge is executed + merge_dct: dct merged into config + Returns: dct + """ + for key, value in merge_dct.items(): + sub_keys = key.split('.') + key = sub_keys[0] + if key in config and len(sub_keys) > 1: + _merge_dict(config[key], {'.'.join(sub_keys[1:]): value}) + elif key in config and isinstance(config[key], dict) and isinstance( + value, Mapping): + _merge_dict(config[key], value) + else: + config[key] = value + return config + + +def print_dict(cfg, print_func=print, delimiter=0): + """ + Recursively visualize a dict and + indenting acrrording by the relationship of keys. + """ + for k, v in sorted(cfg.items()): + if isinstance(v, dict): + print_func("{}{} : ".format(delimiter * " ", str(k))) + print_dict(v, print_func, delimiter + 4) + elif isinstance(v, list) and len(v) >= 1 and isinstance(v[0], dict): + print_func("{}{} : ".format(delimiter * " ", str(k))) + for value in v: + print_dict(value, print_func, delimiter + 4) + else: + print_func("{}{} : {}".format(delimiter * " ", k, v)) + + +class Config(object): + def __init__(self, config_path, BASE_KEY='base'): + self.BASE_KEY = BASE_KEY + self.cfg = self._load_config_with_base(config_path) + + def _load_config_with_base(self, file_path): + """ + Load config from file. + Args: + file_path (str): Path of the config file to be loaded. + Returns: global config + """ + _, ext = os.path.splitext(file_path) + assert ext in ['.yml', '.yaml'], "only support yaml files for now" + + with open(file_path) as f: + file_cfg = yaml.load(f, Loader=yaml.Loader) + + # NOTE: cfgs outside have higher priority than cfgs in _BASE_ + if self.BASE_KEY in file_cfg: + all_base_cfg = dict() + base_ymls = list(file_cfg[self.BASE_KEY]) + for base_yml in base_ymls: + with open(base_yml) as f: + base_cfg = self._load_config_with_base(base_yml) + all_base_cfg = _merge_dict(all_base_cfg, base_cfg) + + del file_cfg[self.BASE_KEY] + file_cfg = _merge_dict(all_base_cfg, file_cfg) + file_cfg['filename'] = os.path.splitext(os.path.split(file_path)[-1])[0] + return file_cfg + + def merge_dict(self, args): + self.cfg = _merge_dict(self.cfg, args) + + def print_cfg(self, print_func=print): + """ + Recursively visualize a dict and + indenting acrrording by the relationship of keys. + """ + print_func('----------- Config -----------') + print_dict(self.cfg, print_func) + print_func('---------------------------------------------') + + def save(self, p): + with open(p, 'w') as f: + yaml.dump( + dict(self.cfg), f, default_flow_style=False, sort_keys=False) + + +class ArgsParser(ArgumentParser): + def __init__(self): + super(ArgsParser, self).__init__( + formatter_class=RawDescriptionHelpFormatter) + self.add_argument( + "-c", "--config_file", help="configuration file to use") + self.add_argument( + "-o", "--opt", nargs='*', help="set configuration options") + self.add_argument( + '-p', + '--profiler_options', + type=str, + default=None, + help='The option of profiler, which should be in format ' \ + '\"key1=value1;key2=value2;key3=value3\".' + ) + + def parse_args(self, argv=None): + args = super(ArgsParser, self).parse_args(argv) + assert args.config_file is not None, \ + "Please specify --config_file=configure_file_path." + args.opt = self._parse_opt(args.opt) + return args + + def _parse_opt(self, opts): + config = {} + if not opts: + return config + for s in opts: + s = s.strip() + k, v = s.split('=', 1) + if '.' 
not in k: + config[k] = yaml.load(v, Loader=yaml.Loader) + else: + keys = k.split('.') + if keys[0] not in config: + config[keys[0]] = {} + cur = config[keys[0]] + for idx, key in enumerate(keys[1:]): + if idx == len(keys) - 2: + cur[key] = yaml.load(v, Loader=yaml.Loader) + else: + cur[key] = {} + cur = cur[key] + return config + + +if __name__ == '__main__': + img = np.zeros((1, 3, 640, 640)) + show_img(img[0][0]) + plt.show() diff --git a/configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml b/configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml index 0c6ab2a0d1d9733d647dc40a7b182fe201866a78..d35e08ae26eec65be1a8931cf76361b768e80db6 100644 --- a/configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml +++ b/configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml @@ -16,7 +16,7 @@ Global: save_res_path: ./output/det_db/predicts_db.txt use_amp: False amp_level: O2 - amp_custom_black_list: ['exp'] + amp_dtype: bfloat16 Architecture: name: DistillationModel diff --git a/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml b/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml index 000d95e892cb8e6dcceeb7c22264c28934d1000c..252d1599776a893b882723a9f7329c99458f3dc4 100644 --- a/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml +++ b/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml @@ -17,6 +17,8 @@ Global: infer_img: doc/imgs_en/img_10.jpg save_res_path: ./checkpoints/det_db/predicts_db.txt distributed: true + d2s_train_image_shape: [3, -1, -1] + amp_dtype: bfloat16 Architecture: name: DistillationModel @@ -221,4 +223,4 @@ Eval: shuffle: False drop_last: False batch_size_per_card: 1 # must be 1 - num_workers: 2 \ No newline at end of file + num_workers: 2 diff --git a/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml b/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml index 0e8af776479ea26f834ca9ddc169f80b3982e86d..083383a00f5fcdaa43a06a6d13954ac24fdc636c 100644 --- a/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml +++ b/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml @@ -10,7 +10,7 @@ Global: - 0 - 400 cal_metric_during_train: false - pretrained_model: null + pretrained_model: https://paddleocr.bj.bcebos.com/pretrained/MobileNetV3_large_x0_5_pretrained.pdparams checkpoints: null save_inference_dir: null use_visualdl: false diff --git a/configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det_cml.yml b/configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det_cml.yml new file mode 100644 index 0000000000000000000000000000000000000000..fe582ba5b76208459a47563b9d02bf6aa2593b06 --- /dev/null +++ b/configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det_cml.yml @@ -0,0 +1,235 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 20 + save_model_dir: ./output/ch_PP-OCRv4 + save_epoch_step: 50 + eval_batch_step: + - 0 + - 1000 + cal_metric_during_train: true + checkpoints: null + pretrained_model: null + save_inference_dir: null + use_visualdl: false + infer_img: doc/imgs_en/img_10.jpg + save_res_path: ./checkpoints/det_db/predicts_db.txt + distributed: true +Architecture: + name: DistillationModel + algorithm: Distillation + model_type: det + Models: + Student: + model_type: det + algorithm: DB + Transform: null + Backbone: + name: PPLCNetNew + scale: 0.75 + pretrained: false + Neck: + name: RSEFPN + out_channels: 96 + shortcut: true + Head: + name: DBHead + k: 50 + Student2: + pretrained: null + model_type: det + algorithm: DB + Transform: null + Backbone: + name: PPLCNetNew + scale: 0.75 + pretrained: true + Neck: + name: RSEFPN + out_channels: 96 + shortcut: true + Head: + name: DBHead + k: 50 + Teacher: + pretrained: 
https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_det_cml_teacher_pretrained/teacher.pdparams + freeze_params: true + return_all_feats: false + model_type: det + algorithm: DB + Backbone: + name: ResNet_vd + in_channels: 3 + layers: 50 + Neck: + name: LKPAN + out_channels: 256 + Head: + name: DBHead + kernel_list: + - 7 + - 2 + - 2 + k: 50 +Loss: + name: CombinedLoss + loss_config_list: + - DistillationDilaDBLoss: + weight: 1.0 + model_name_pairs: + - - Student + - Teacher + - - Student2 + - Teacher + key: maps + balance_loss: true + main_loss_type: DiceLoss + alpha: 5 + beta: 10 + ohem_ratio: 3 + - DistillationDMLLoss: + model_name_pairs: + - Student + - Student2 + maps_name: thrink_maps + weight: 1.0 + key: maps + - DistillationDBLoss: + weight: 1.0 + model_name_list: + - Student + - Student2 + balance_loss: true + main_loss_type: DiceLoss + alpha: 5 + beta: 10 + ohem_ratio: 3 +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 2 + regularizer: + name: L2 + factor: 5.0e-05 +PostProcess: + name: DistillationDBPostProcess + model_name: + - Student + key: head_out + thresh: 0.3 + box_thresh: 0.6 + max_candidates: 1000 + unclip_ratio: 1.5 +Metric: + name: DistillationMetric + base_metric_name: DetMetric + main_indicator: hmean + key: Student +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt + ratio_list: [1.0] + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - DetLabelEncode: null + - IaaAugment: + augmenter_args: + - type: Fliplr + args: + p: 0.5 + - type: Affine + args: + rotate: + - -10 + - 10 + - type: Resize + args: + size: + - 0.5 + - 3 + - EastRandomCropData: + size: + - 640 + - 640 + max_tries: 50 + keep_ratio: true + - MakeBorderMap: + shrink_ratio: 0.4 + thresh_min: 0.3 + thresh_max: 0.7 + total_epoch: 500 + - MakeShrinkMap: + shrink_ratio: 0.4 + min_text_size: 8 + total_epoch: 500 + - NormalizeImage: + scale: 1./255. + mean: + - 0.485 + - 0.456 + - 0.406 + std: + - 0.229 + - 0.224 + - 0.225 + order: hwc + - ToCHWImage: null + - KeepKeys: + keep_keys: + - image + - threshold_map + - threshold_mask + - shrink_map + - shrink_mask + loader: + shuffle: true + drop_last: false + batch_size_per_card: 16 + num_workers: 8 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - DetLabelEncode: null + - DetResizeForTest: null + - NormalizeImage: + scale: 1./255. 
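For context on how configs like the ones in this diff are overridden at launch time, the sketch below mirrors, in miniature, what the `ArgsParser._parse_opt` and `_merge_dict` helpers shown earlier do with dotted `-o` options. The override keys and values here are hypothetical examples, and the code re-implements the behaviour rather than importing the utility module.

```python
import yaml

def parse_opts(opts):
    # Mirror of ArgsParser._parse_opt: "a.b.c=v" becomes {"a": {"b": {"c": v}}}.
    config = {}
    for s in opts:
        k, v = s.strip().split('=', 1)
        keys = k.split('.')
        cur = config
        for key in keys[:-1]:
            cur = cur.setdefault(key, {})
        cur[keys[-1]] = yaml.load(v, Loader=yaml.Loader)  # YAML-typed scalar
    return config

def merge_dict(config, merge_dct):
    # Mirror of _merge_dict: recursively fold merge_dct into config.
    for key, value in merge_dct.items():
        if key in config and isinstance(config[key], dict) and isinstance(value, dict):
            merge_dict(config[key], value)
        else:
            config[key] = value
    return config

cfg = {'Global': {'epoch_num': 500, 'use_gpu': True}}
overrides = parse_opts(['Global.epoch_num=10', 'Global.save_model_dir=./output/debug'])
print(merge_dict(cfg, overrides))
# {'Global': {'epoch_num': 10, 'use_gpu': True, 'save_model_dir': './output/debug'}}
```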
+ mean: + - 0.485 + - 0.456 + - 0.406 + std: + - 0.229 + - 0.224 + - 0.225 + order: hwc + - ToCHWImage: null + - KeepKeys: + keep_keys: + - image + - shape + - polys + - ignore_tags + loader: + shuffle: false + drop_last: false + batch_size_per_card: 1 + num_workers: 2 +profiler_options: null diff --git a/configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det_student.yml b/configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det_student.yml new file mode 100644 index 0000000000000000000000000000000000000000..39b260cf3ad0a0bb9664a5165f3262962bba072e --- /dev/null +++ b/configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det_student.yml @@ -0,0 +1,171 @@ +Global: + debug: false + use_gpu: true + epoch_num: &epoch_num 500 + log_smooth_window: 20 + print_batch_step: 100 + save_model_dir: ./output/ch_PP-OCRv4 + save_epoch_step: 10 + eval_batch_step: + - 0 + - 1500 + cal_metric_during_train: false + checkpoints: + pretrained_model: https://paddleocr.bj.bcebos.com/pretrained/PPLCNetV3_x0_75_ocr_det.pdparams + save_inference_dir: null + use_visualdl: false + infer_img: doc/imgs_en/img_10.jpg + save_res_path: ./checkpoints/det_db/predicts_db.txt + distributed: true + +Architecture: + model_type: det + algorithm: DB + Transform: null + Backbone: + name: PPLCNetV3 + scale: 0.75 + det: True + Neck: + name: RSEFPN + out_channels: 96 + shortcut: True + Head: + name: DBHead + k: 50 + +Loss: + name: DBLoss + balance_loss: true + main_loss_type: DiceLoss + alpha: 5 + beta: 10 + ohem_ratio: 3 + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 #(8*8c) + warmup_epoch: 2 + regularizer: + name: L2 + factor: 5.0e-05 + +PostProcess: + name: DBPostProcess + thresh: 0.3 + box_thresh: 0.6 + max_candidates: 1000 + unclip_ratio: 1.5 + +Metric: + name: DetMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt + ratio_list: [1.0] + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - DetLabelEncode: null + - CopyPaste: null + - IaaAugment: + augmenter_args: + - type: Fliplr + args: + p: 0.5 + - type: Affine + args: + rotate: + - -10 + - 10 + - type: Resize + args: + size: + - 0.5 + - 3 + - EastRandomCropData: + size: + - 640 + - 640 + max_tries: 50 + keep_ratio: true + - MakeBorderMap: + shrink_ratio: 0.4 + thresh_min: 0.3 + thresh_max: 0.7 + total_epoch: *epoch_num + - MakeShrinkMap: + shrink_ratio: 0.4 + min_text_size: 8 + total_epoch: *epoch_num + - NormalizeImage: + scale: 1./255. + mean: + - 0.485 + - 0.456 + - 0.406 + std: + - 0.229 + - 0.224 + - 0.225 + order: hwc + - ToCHWImage: null + - KeepKeys: + keep_keys: + - image + - threshold_map + - threshold_mask + - shrink_map + - shrink_mask + loader: + shuffle: true + drop_last: false + batch_size_per_card: 8 + num_workers: 8 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - DetLabelEncode: null + - DetResizeForTest: + - NormalizeImage: + scale: 1./255. 
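The `NormalizeImage` and `ToCHWImage` steps that recur in these pipelines amount to `(img * scale - mean) / std` applied in HWC order, followed by a transpose to CHW. The numpy sketch below illustrates that arithmetic only; it is not the PaddleOCR operator itself.

```python
import numpy as np

def normalize_image(img_hwc, scale=1.0 / 255.0,
                    mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
    # HWC uint8 image -> float32 in [0, 1], then channel-wise standardized.
    img = img_hwc.astype('float32') * scale
    return (img - np.array(mean, dtype='float32')) / np.array(std, dtype='float32')

def to_chw(img_hwc):
    # ToCHWImage: HWC -> CHW, the layout the backbones expect.
    return img_hwc.transpose((2, 0, 1))

dummy = np.random.randint(0, 256, size=(640, 640, 3), dtype=np.uint8)
print(to_chw(normalize_image(dummy)).shape)  # (3, 640, 640)
```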
+ mean: + - 0.485 + - 0.456 + - 0.406 + std: + - 0.229 + - 0.224 + - 0.225 + order: hwc + - ToCHWImage: null + - KeepKeys: + keep_keys: + - image + - shape + - polys + - ignore_tags + loader: + shuffle: false + drop_last: false + batch_size_per_card: 1 + num_workers: 2 +profiler_options: null diff --git a/configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det_teacher.yml b/configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det_teacher.yml new file mode 100644 index 0000000000000000000000000000000000000000..b58af1cc9be9c20372caa1fc94d233c7e14c0773 --- /dev/null +++ b/configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det_teacher.yml @@ -0,0 +1,172 @@ +Global: + debug: false + use_gpu: true + epoch_num: &epoch_num 500 + log_smooth_window: 20 + print_batch_step: 100 + save_model_dir: ./output/ch_PP-OCRv4 + save_epoch_step: 10 + eval_batch_step: + - 0 + - 1500 + cal_metric_during_train: false + checkpoints: + pretrained_model: https://paddleocr.bj.bcebos.com/pretrained/PPHGNet_small_ocr_det.pdparams + save_inference_dir: null + use_visualdl: false + infer_img: doc/imgs_en/img_10.jpg + save_res_path: ./checkpoints/det_db/predicts_db.txt + distributed: true + +Architecture: + model_type: det + algorithm: DB + Transform: null + Backbone: + name: PPHGNet_small + det: True + Neck: + name: LKPAN + out_channels: 256 + intracl: true + Head: + name: PFHeadLocal + k: 50 + mode: "large" + + +Loss: + name: DBLoss + balance_loss: true + main_loss_type: DiceLoss + alpha: 5 + beta: 10 + ohem_ratio: 3 + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 #(8*8c) + warmup_epoch: 2 + regularizer: + name: L2 + factor: 1e-6 + +PostProcess: + name: DBPostProcess + thresh: 0.3 + box_thresh: 0.6 + max_candidates: 1000 + unclip_ratio: 1.5 + +Metric: + name: DetMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt + ratio_list: [1.0] + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - DetLabelEncode: null + - CopyPaste: null + - IaaAugment: + augmenter_args: + - type: Fliplr + args: + p: 0.5 + - type: Affine + args: + rotate: + - -10 + - 10 + - type: Resize + args: + size: + - 0.5 + - 3 + - EastRandomCropData: + size: + - 640 + - 640 + max_tries: 50 + keep_ratio: true + - MakeBorderMap: + shrink_ratio: 0.4 + thresh_min: 0.3 + thresh_max: 0.7 + total_epoch: *epoch_num + - MakeShrinkMap: + shrink_ratio: 0.4 + min_text_size: 8 + total_epoch: *epoch_num + - NormalizeImage: + scale: 1./255. + mean: + - 0.485 + - 0.456 + - 0.406 + std: + - 0.229 + - 0.224 + - 0.225 + order: hwc + - ToCHWImage: null + - KeepKeys: + keep_keys: + - image + - threshold_map + - threshold_mask + - shrink_map + - shrink_mask + loader: + shuffle: true + drop_last: false + batch_size_per_card: 8 + num_workers: 8 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - DetLabelEncode: null + - DetResizeForTest: + - NormalizeImage: + scale: 1./255. 
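Several of the new files, including the student and teacher configs above, define `epoch_num: &epoch_num 500` once and reference it as `total_epoch: *epoch_num` inside the label-map transforms; loading with `yaml.Loader`, as `_load_config_with_base` does, resolves the alias to the same value. A minimal sketch:

```python
import yaml

snippet = """
Global:
  epoch_num: &epoch_num 500
Train:
  transforms:
    - MakeBorderMap:
        total_epoch: *epoch_num
"""

cfg = yaml.load(snippet, Loader=yaml.Loader)
print(cfg['Train']['transforms'][0]['MakeBorderMap']['total_epoch'])  # 500
```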
+ mean: + - 0.485 + - 0.456 + - 0.406 + std: + - 0.229 + - 0.224 + - 0.225 + order: hwc + - ToCHWImage: null + - KeepKeys: + keep_keys: + - image + - shape + - polys + - ignore_tags + loader: + shuffle: false + drop_last: false + batch_size_per_card: 1 + num_workers: 2 +profiler_options: null diff --git a/configs/kie/layoutlm_series/ser_layoutlm_xfund_zh.yml b/configs/kie/layoutlm_series/ser_layoutlm_xfund_zh.yml index 34c7d4114062e9227d48ad5684024e2776e68447..5424d7e1fb5ab472644ec5b00225921670941025 100644 --- a/configs/kie/layoutlm_series/ser_layoutlm_xfund_zh.yml +++ b/configs/kie/layoutlm_series/ser_layoutlm_xfund_zh.yml @@ -83,7 +83,7 @@ Train: shuffle: True drop_last: False batch_size_per_card: 8 - num_workers: 4 + num_workers: 16 Eval: dataset: diff --git a/configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml b/configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml index b8aa44dde8fd3fdc4ff14bbca20513b95178cdb0..50b04ba0dd139060d50aa70421221dfc2c66067f 100644 --- a/configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml +++ b/configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml @@ -12,6 +12,7 @@ Global: use_visualdl: False seed: 2022 infer_img: ppstructure/docs/kie/input/zh_val_42.jpg + d2s_train_image_shape: [3, 224, 224] # if you want to predict using the groundtruth ocr info, # you can use the following config # infer_img: train_data/XFUND/zh_val/val.json @@ -20,6 +21,7 @@ Global: save_res_path: ./output/ser/xfund_zh/res kie_rec_model_dir: kie_det_model_dir: + amp_custom_white_list: ['scale', 'concat', 'elementwise_add'] Architecture: model_type: kie diff --git a/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml index 7e98280b32558b8d3d203084e6e327bc7cd782bf..fd15873fbf87871b0484b50d7b421129a7a560cf 100644 --- a/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml +++ b/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml @@ -36,13 +36,14 @@ Optimizer: Architecture: model_type: rec - algorithm: SVTR + algorithm: SVTR_LCNet Transform: Backbone: name: MobileNetV1Enhance scale: 0.5 last_conv_stride: [1, 2] last_pool_type: avg + last_pool_kernel_size: [2, 2] Head: name: MultiHead head_list: diff --git a/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml b/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml index 427255738696d8e6a073829350c40b00ef30115f..3b82ef857f053133d68f097e625f31f57f154836 100644 --- a/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml +++ b/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml @@ -19,6 +19,7 @@ Global: use_space_char: true distributed: true save_res_path: ./output/rec/predicts_ppocrv3_distillation.txt + d2s_train_image_shape: [3, 48, -1] Optimizer: @@ -27,7 +28,7 @@ Optimizer: beta2: 0.999 lr: name: Piecewise - decay_epochs : [700, 800] + decay_epochs : [700] values : [0.0005, 0.00005] warmup_epoch: 5 regularizer: @@ -45,13 +46,14 @@ Architecture: freeze_params: false return_all_feats: true model_type: *model_type - algorithm: SVTR + algorithm: SVTR_LCNet Transform: Backbone: name: MobileNetV1Enhance scale: 0.5 last_conv_stride: [1, 2] last_pool_type: avg + last_pool_kernel_size: [2, 2] Head: name: MultiHead head_list: @@ -72,13 +74,14 @@ Architecture: freeze_params: false return_all_feats: true model_type: *model_type - algorithm: SVTR + algorithm: SVTR_LCNet Transform: Backbone: name: MobileNetV1Enhance scale: 0.5 last_conv_stride: [1, 2] last_pool_type: avg + last_pool_kernel_size: [2, 2] Head: name: MultiHead head_list: diff --git a/configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml index 
c728e0ac823b0bf835322dcbd0c385c3ac7b2489..af42001debfebff49ed2c4c4c5b5be347a78ece4 100644 --- a/configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml +++ b/configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml @@ -36,13 +36,14 @@ Optimizer: Architecture: model_type: rec - algorithm: SVTR + algorithm: SVTR_LCNet Transform: Backbone: name: MobileNetV1Enhance scale: 0.5 last_conv_stride: [1, 2] last_pool_type: avg + last_pool_kernel_size: [2, 2] Head: name: MultiHead head_list: diff --git a/configs/rec/PP-OCRv3/multi_language/arabic_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/arabic_PP-OCRv3_rec.yml index 8c650bd826d127f25c907f97d20d1a52f67f9203..f8e9260c36d39ad9939e12a69f4411e5f20748f5 100644 --- a/configs/rec/PP-OCRv3/multi_language/arabic_PP-OCRv3_rec.yml +++ b/configs/rec/PP-OCRv3/multi_language/arabic_PP-OCRv3_rec.yml @@ -36,13 +36,14 @@ Optimizer: Architecture: model_type: rec - algorithm: SVTR + algorithm: SVTR_LCNet Transform: Backbone: name: MobileNetV1Enhance scale: 0.5 last_conv_stride: [1, 2] last_pool_type: avg + last_pool_kernel_size: [2, 2] Head: name: MultiHead head_list: diff --git a/configs/rec/PP-OCRv3/multi_language/chinese_cht_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/chinese_cht_PP-OCRv3_rec.yml index 28e0c10aa0f83fdf8e621aae04bf2b7374255adc..77090846f36f9d511fa5fa39fb0ae5cf8b0c606d 100644 --- a/configs/rec/PP-OCRv3/multi_language/chinese_cht_PP-OCRv3_rec.yml +++ b/configs/rec/PP-OCRv3/multi_language/chinese_cht_PP-OCRv3_rec.yml @@ -36,13 +36,14 @@ Optimizer: Architecture: model_type: rec - algorithm: SVTR + algorithm: SVTR_LCNet Transform: Backbone: name: MobileNetV1Enhance scale: 0.5 last_conv_stride: [1, 2] last_pool_type: avg + last_pool_kernel_size: [2, 2] Head: name: MultiHead head_list: diff --git a/configs/rec/PP-OCRv3/multi_language/cyrillic_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/cyrillic_PP-OCRv3_rec.yml index fbdbe6c44c689ea267c9995f832305d800046edb..0aa1a9340f48ab5f42f27e94aadfa4654d317001 100644 --- a/configs/rec/PP-OCRv3/multi_language/cyrillic_PP-OCRv3_rec.yml +++ b/configs/rec/PP-OCRv3/multi_language/cyrillic_PP-OCRv3_rec.yml @@ -36,13 +36,14 @@ Optimizer: Architecture: model_type: rec - algorithm: SVTR + algorithm: SVTR_LCNet Transform: Backbone: name: MobileNetV1Enhance scale: 0.5 last_conv_stride: [1, 2] last_pool_type: avg + last_pool_kernel_size: [2, 2] Head: name: MultiHead head_list: diff --git a/configs/rec/PP-OCRv3/multi_language/devanagari_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/devanagari_PP-OCRv3_rec.yml index 48eb38df36f931b76b8e9fb8369daf06ad037d25..b05371e6e0b480fc8f926c99a7bb7fd8bf8fc20e 100644 --- a/configs/rec/PP-OCRv3/multi_language/devanagari_PP-OCRv3_rec.yml +++ b/configs/rec/PP-OCRv3/multi_language/devanagari_PP-OCRv3_rec.yml @@ -36,13 +36,14 @@ Optimizer: Architecture: model_type: rec - algorithm: SVTR + algorithm: SVTR_LCNet Transform: Backbone: name: MobileNetV1Enhance scale: 0.5 last_conv_stride: [1, 2] last_pool_type: avg + last_pool_kernel_size: [2, 2] Head: name: MultiHead head_list: diff --git a/configs/rec/PP-OCRv3/multi_language/japan_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/japan_PP-OCRv3_rec.yml index 6cab0d447247e28bb58b30384d4f9d032d6ce9d0..420019e46033676226389082381df17ee691c9c9 100644 --- a/configs/rec/PP-OCRv3/multi_language/japan_PP-OCRv3_rec.yml +++ b/configs/rec/PP-OCRv3/multi_language/japan_PP-OCRv3_rec.yml @@ -36,13 +36,14 @@ Optimizer: Architecture: model_type: rec - algorithm: SVTR + algorithm: SVTR_LCNet Transform: Backbone: name: MobileNetV1Enhance scale: 0.5 
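Alongside the SVTR_LCNet renames, several recognition schedules in this diff trim the Piecewise `decay_epochs` from `[700, 800]` to `[700]` while keeping two `values`; under the usual piecewise-constant convention (one more value than boundary) that is the consistent pairing. The sketch below shows only the semantics and is a simplification, not Paddle's scheduler, and it ignores the separate `warmup_epoch` phase:

```python
def piecewise_lr(epoch, decay_epochs, values):
    """Piecewise-constant LR: values[i] applies until decay_epochs[i] is reached."""
    assert len(values) == len(decay_epochs) + 1
    for boundary, value in zip(decay_epochs, values):
        if epoch < boundary:
            return value
    return values[-1]

print(piecewise_lr(10, [700], [0.001, 0.0001]))   # 0.001
print(piecewise_lr(750, [700], [0.001, 0.0001]))  # 0.0001
```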
last_conv_stride: [1, 2] last_pool_type: avg + last_pool_kernel_size: [2, 2] Head: name: MultiHead head_list: diff --git a/configs/rec/PP-OCRv3/multi_language/ka_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/ka_PP-OCRv3_rec.yml index 7a9c8241d1564e5f1295655ba64694a117064bd8..75e1666f8dc0be4e2eaef61ebeb2b0607d2cf7ce 100644 --- a/configs/rec/PP-OCRv3/multi_language/ka_PP-OCRv3_rec.yml +++ b/configs/rec/PP-OCRv3/multi_language/ka_PP-OCRv3_rec.yml @@ -36,13 +36,14 @@ Optimizer: Architecture: model_type: rec - algorithm: SVTR + algorithm: SVTR_LCNet Transform: Backbone: name: MobileNetV1Enhance scale: 0.5 last_conv_stride: [1, 2] last_pool_type: avg + last_pool_kernel_size: [2, 2] Head: name: MultiHead head_list: diff --git a/configs/rec/PP-OCRv3/multi_language/korean_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/korean_PP-OCRv3_rec.yml index 29ff570772a621ba747e0388bcc0c042db0dba43..f5d4c09e3f20a8799149e141167528567a1f5e66 100644 --- a/configs/rec/PP-OCRv3/multi_language/korean_PP-OCRv3_rec.yml +++ b/configs/rec/PP-OCRv3/multi_language/korean_PP-OCRv3_rec.yml @@ -36,13 +36,14 @@ Optimizer: Architecture: model_type: rec - algorithm: SVTR + algorithm: SVTR_LCNet Transform: Backbone: name: MobileNetV1Enhance scale: 0.5 last_conv_stride: [1, 2] last_pool_type: avg + last_pool_kernel_size: [2, 2] Head: name: MultiHead head_list: diff --git a/configs/rec/PP-OCRv3/multi_language/latin_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/latin_PP-OCRv3_rec.yml index 1784bfe611366c45230fd2abf69ab16e3a1c3ae9..d43444be99fbbd57cb9aa309c34c7fc4e242a04e 100644 --- a/configs/rec/PP-OCRv3/multi_language/latin_PP-OCRv3_rec.yml +++ b/configs/rec/PP-OCRv3/multi_language/latin_PP-OCRv3_rec.yml @@ -36,13 +36,14 @@ Optimizer: Architecture: model_type: rec - algorithm: SVTR + algorithm: SVTR_LCNet Transform: Backbone: name: MobileNetV1Enhance scale: 0.5 last_conv_stride: [1, 2] last_pool_type: avg + last_pool_kernel_size: [2, 2] Head: name: MultiHead head_list: diff --git a/configs/rec/PP-OCRv3/multi_language/ta_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/ta_PP-OCRv3_rec.yml index 70b26aa84a2178111edab9f094c369c5d22e31a9..9d152e2471b5386fe7c6eff2116a678f38c908de 100644 --- a/configs/rec/PP-OCRv3/multi_language/ta_PP-OCRv3_rec.yml +++ b/configs/rec/PP-OCRv3/multi_language/ta_PP-OCRv3_rec.yml @@ -36,13 +36,14 @@ Optimizer: Architecture: model_type: rec - algorithm: SVTR + algorithm: SVTR_LCNet Transform: Backbone: name: MobileNetV1Enhance scale: 0.5 last_conv_stride: [1, 2] last_pool_type: avg + last_pool_kernel_size: [2, 2] Head: name: MultiHead head_list: diff --git a/configs/rec/PP-OCRv3/multi_language/te_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/te_PP-OCRv3_rec.yml index 3617af79e3b9c5a55ef22d549465ba2109618e32..b1e52e457938ed01bf8b1b52b5e95c2b99bc6313 100644 --- a/configs/rec/PP-OCRv3/multi_language/te_PP-OCRv3_rec.yml +++ b/configs/rec/PP-OCRv3/multi_language/te_PP-OCRv3_rec.yml @@ -36,13 +36,14 @@ Optimizer: Architecture: model_type: rec - algorithm: SVTR + algorithm: SVTR_LCNet Transform: Backbone: name: MobileNetV1Enhance scale: 0.5 last_conv_stride: [1, 2] last_pool_type: avg + last_pool_kernel_size: [2, 2] Head: name: MultiHead head_list: diff --git a/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec.yml b/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec.yml new file mode 100644 index 0000000000000000000000000000000000000000..827f5eef3dff11bd2f5fced387fd5534551075b0 --- /dev/null +++ b/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec.yml @@ -0,0 +1,138 @@ +Global: + debug: false + use_gpu: true + 
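The PP-OCRv4 recognition configs introduced in this diff train with `MultiScaleDataSet` plus a `MultiScaleSampler` whose `scales: [[320, 32], [320, 48], [320, 64]]` vary the target size per batch. The sketch below is only a rough approximation of that idea (pick one (w, h) per batch and resize every image in the batch to it), not the actual sampler implementation:

```python
import random
import numpy as np
import cv2  # opencv-python, already a dependency used elsewhere in this repo

def make_batches(images, scales, batch_size):
    """Yield batches in which every image is resized to one randomly chosen (w, h)."""
    for i in range(0, len(images), batch_size):
        w, h = random.choice(scales)
        resized = [cv2.resize(img, (w, h)) for img in images[i:i + batch_size]]
        yield np.stack(resized)

dummy_images = [np.random.randint(0, 256, (48, 200, 3), np.uint8) for _ in range(4)]
for batch in make_batches(dummy_images, [(320, 32), (320, 48), (320, 64)], batch_size=2):
    print(batch.shape)  # e.g. (2, 32, 320, 3); the height follows the sampled scale
```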
epoch_num: 200 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_ppocr_v4 + save_epoch_step: 10 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/ppocr_keys_v1.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR_LCNet + Transform: + Backbone: + name: PPLCNetV3 + scale: 0.95 + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 120 + depth: 2 + hidden_dims: 120 + kernel_size: [1, 3] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + max_text_length: *max_text_length + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - NRTRLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: MultiScaleDataSet + ds_width: false + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + max_text_length: *max_text_length + - RecAug: + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + sampler: + name: MultiScaleSampler + scales: [[320, 32], [320, 48], [320, 64]] + first_bs: &bs 192 + fix_bs: false + divided_factor: [8, 16] # w, h + is_training: True + loader: + shuffle: true + batch_size_per_card: *bs + drop_last: true + num_workers: 8 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_ampO2_ultra.yml b/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_ampO2_ultra.yml new file mode 100644 index 0000000000000000000000000000000000000000..475c551689c3f1e7ba209a68685cb148291ff27a --- /dev/null +++ b/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_ampO2_ultra.yml @@ -0,0 +1,140 @@ +Global: + debug: false + use_gpu: true + epoch_num: 200 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_ppocr_v4 + save_epoch_step: 10 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/ppocr_keys_v1.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3.txt + use_amp: True + amp_level: O2 + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: 
SVTR_LCNet + Transform: + Backbone: + name: PPLCNetV3 + scale: 0.95 + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 120 + depth: 2 + hidden_dims: 120 + kernel_size: [1, 3] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + max_text_length: *max_text_length + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - NRTRLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: MultiScaleDataSet + ds_width: false + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + max_text_length: *max_text_length + - RecAug: + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + sampler: + name: MultiScaleSampler + scales: [[320, 32], [320, 48], [320, 64]] + first_bs: &bs 384 + fix_bs: false + divided_factor: [8, 16] # w, h + is_training: True + loader: + shuffle: true + batch_size_per_card: *bs + drop_last: true + num_workers: 16 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 16 diff --git a/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_distill.yml b/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_distill.yml new file mode 100644 index 0000000000000000000000000000000000000000..f613ee52b467f279e4bbbd33dca3c58862f4715a --- /dev/null +++ b/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_distill.yml @@ -0,0 +1,231 @@ +Global: + debug: false + use_gpu: true + epoch_num: 200 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_dkd_400w_svtr_ctc_lcnet_blank_dkd0.1/ + save_epoch_step: 40 + eval_batch_step: + - 0 + - 2000 + cal_metric_during_train: true + pretrained_model: null + checkpoints: ./output/rec_dkd_400w_svtr_ctc_lcnet_blank_dkd0.1/latest + save_inference_dir: null + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/ppocr_keys_v1.txt + max_text_length: 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3.txt +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 2 + regularizer: + name: L2 + factor: 3.0e-05 +Architecture: + model_type: rec + name: DistillationModel + algorithm: Distillation + Models: + Teacher: + pretrained: + freeze_params: true + return_all_feats: true + model_type: rec + algorithm: SVTR + Transform: null + Backbone: + name: SVTRNet + img_size: + - 48 + - 320 + out_char_num: 40 + out_channels: 192 + patch_merging: Conv + embed_dim: + - 64 + - 128 + - 256 + depth: + - 3 + - 6 + - 3 + num_heads: + - 2 + - 4 + - 8 + mixer: + - Conv + - Conv + - Conv + - Conv + - Conv + - Conv + - Global + - Global + - Global + - Global + - Global + - Global + local_mixer: + - - 5 + - 5 + - - 5 + - 5 + - - 5 + - 5 + last_stage: false + prenorm: true + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 120 + 
depth: 2 + hidden_dims: 120 + kernel_size: [1, 3] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + max_text_length: *max_text_length + Student: + pretrained: + freeze_params: false + return_all_feats: true + model_type: rec + algorithm: SVTR + Transform: null + Backbone: + name: PPLCNetV3 + scale: 0.95 + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 120 + depth: 2 + hidden_dims: 120 + kernel_size: [1, 3] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + max_text_length: *max_text_length +Loss: + name: CombinedLoss + loss_config_list: + - DistillationDKDLoss: + weight: 0.1 + model_name_pairs: + - - Student + - Teacher + key: head_out + multi_head: true + alpha: 1.0 + beta: 2.0 + dis_head: gtc + name: dkd + - DistillationCTCLoss: + weight: 1.0 + model_name_list: + - Student + key: head_out + multi_head: true + - DistillationNRTRLoss: + weight: 1.0 + smoothing: false + model_name_list: + - Student + key: head_out + multi_head: true + - DistillCTCLogits: + weight: 1.0 + reduction: mean + model_name_pairs: + - - Student + - Teacher + key: head_out +PostProcess: + name: DistillationCTCLabelDecode + model_name: + - Student + key: head_out + multi_head: true +Metric: + name: DistillationMetric + base_metric_name: RecMetric + main_indicator: acc + key: Student + ignore_space: false +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: + - ./train_data/train_list.txt + ratio_list: + - 1.0 + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecAug: + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 8 + use_shared_memory: true +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 +profiler_options: null diff --git a/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_fp32_ultra.yml b/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_fp32_ultra.yml new file mode 100644 index 0000000000000000000000000000000000000000..8c267309daefbe45929551ed186c7c5e7265a066 --- /dev/null +++ b/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_fp32_ultra.yml @@ -0,0 +1,138 @@ +Global: + debug: false + use_gpu: true + epoch_num: 200 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_ppocr_v4 + save_epoch_step: 10 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/ppocr_keys_v1.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR_LCNet + Transform: + Backbone: + name: PPLCNetV3 + scale: 0.95 + Head: + name: MultiHead + head_list: + - 
CTCHead: + Neck: + name: svtr + dims: 120 + depth: 2 + hidden_dims: 120 + kernel_size: [1, 3] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + max_text_length: *max_text_length + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - NRTRLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: MultiScaleDataSet + ds_width: false + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + max_text_length: *max_text_length + - RecAug: + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + sampler: + name: MultiScaleSampler + scales: [[320, 32], [320, 48], [320, 64]] + first_bs: &bs 192 + fix_bs: false + divided_factor: [8, 16] # w, h + is_training: True + loader: + shuffle: true + batch_size_per_card: *bs + drop_last: true + num_workers: 16 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 16 diff --git a/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_hgnet.yml b/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_hgnet.yml new file mode 100644 index 0000000000000000000000000000000000000000..dba966e7c50c848fcf261bb57fe9ef056f4fec4b --- /dev/null +++ b/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_hgnet.yml @@ -0,0 +1,137 @@ +Global: + debug: false + use_gpu: true + epoch_num: 200 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_ppocr_v4_hgnet + save_epoch_step: 10 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/ppocr_keys_v1.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR_HGNet + Transform: + Backbone: + name: PPHGNet_small + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 120 + depth: 2 + hidden_dims: 120 + kernel_size: [1, 3] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + max_text_length: *max_text_length + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - NRTRLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: MultiScaleDataSet + ds_width: false + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + max_text_length: *max_text_length + - RecAug: + - MultiLabelEncode: + gtc_encode: 
NRTRLabelEncode + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + sampler: + name: MultiScaleSampler + scales: [[320, 32], [320, 48], [320, 64]] + first_bs: &bs 128 + fix_bs: false + divided_factor: [8, 16] # w, h + is_training: True + loader: + shuffle: true + batch_size_per_card: *bs + drop_last: true + num_workers: 8 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_hgnet_ampO2_ultra.yml b/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_hgnet_ampO2_ultra.yml new file mode 100644 index 0000000000000000000000000000000000000000..43035216c3cee092855e5140454c75381f1ee7b5 --- /dev/null +++ b/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_hgnet_ampO2_ultra.yml @@ -0,0 +1,139 @@ +Global: + debug: false + use_gpu: true + epoch_num: 200 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_ppocr_v4_hgnet + save_epoch_step: 10 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/ppocr_keys_v1.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3.txt + use_amp: True + amp_level: O2 + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR_HGNet + Transform: + Backbone: + name: PPHGNet_small + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 120 + depth: 2 + hidden_dims: 120 + kernel_size: [1, 3] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + max_text_length: *max_text_length + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - NRTRLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: MultiScaleDataSet + ds_width: false + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + max_text_length: *max_text_length + - RecAug: + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + sampler: + name: MultiScaleSampler + scales: [[320, 32], [320, 48], [320, 64]] + first_bs: &bs 256 + fix_bs: false + divided_factor: [8, 16] # w, h + is_training: True + loader: + shuffle: true + batch_size_per_card: *bs + drop_last: true + num_workers: 16 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc 
+ - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 16 diff --git a/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_hgnet_fp32_ultra.yml b/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_hgnet_fp32_ultra.yml new file mode 100644 index 0000000000000000000000000000000000000000..ee9ebcafd39339c7584e5122361804995834ae8f --- /dev/null +++ b/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_hgnet_fp32_ultra.yml @@ -0,0 +1,137 @@ +Global: + debug: false + use_gpu: true + epoch_num: 200 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_ppocr_v4_hgnet + save_epoch_step: 10 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/ppocr_keys_v1.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR_HGNet + Transform: + Backbone: + name: PPHGNet_small + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 120 + depth: 2 + hidden_dims: 120 + kernel_size: [1, 3] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + max_text_length: *max_text_length + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - NRTRLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: MultiScaleDataSet + ds_width: false + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + max_text_length: *max_text_length + - RecAug: + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + sampler: + name: MultiScaleSampler + scales: [[320, 32], [320, 48], [320, 64]] + first_bs: &bs 256 + fix_bs: false + divided_factor: [8, 16] # w, h + is_training: True + loader: + shuffle: true + batch_size_per_card: *bs + drop_last: true + num_workers: 16 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 16 diff --git a/configs/rec/PP-OCRv4/en_PP-OCRv4_rec.yml b/configs/rec/PP-OCRv4/en_PP-OCRv4_rec.yml new file mode 100644 index 0000000000000000000000000000000000000000..9537f7a106b8d1f2de1b9f04ee7931a0e3d6105d --- /dev/null +++ b/configs/rec/PP-OCRv4/en_PP-OCRv4_rec.yml @@ -0,0 +1,150 @@ +Global: + debug: false + use_gpu: true + epoch_num: 50 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_ppocr_v4 + save_epoch_step: 10 + eval_batch_step: + - 0 + - 2000 + cal_metric_during_train: true + pretrained_model: refactor + checkpoints: null + save_inference_dir: null + use_visualdl: false + infer_img: 
doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/en_dict.txt + max_text_length: 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3.txt +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.0005 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 +Architecture: + model_type: rec + algorithm: SVTR_LCNet + Transform: null + Backbone: + name: PPLCNetV3 + scale: 0.95 + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 120 + depth: 2 + hidden_dims: 120 + kernel_size: + - 1 + - 3 + use_guide: true + Head: + fc_decay: 1.0e-05 + - NRTRHead: + nrtr_dim: 384 + max_text_length: 25 +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: null + - NRTRLoss: null +PostProcess: + name: CTCLabelDecode +Metric: + name: RecMetric + main_indicator: acc + ignore_space: false +Train: + dataset: + name: MultiScaleDataSet + ds_width: false + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: + - 48 + - 320 + - 3 + max_text_length: 25 + - RecAug: null + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + sampler: + name: MultiScaleSampler + scales: + - - 320 + - 32 + - - 320 + - 48 + - - 320 + - 64 + first_bs: 96 + fix_bs: false + divided_factor: + - 8 + - 16 + is_training: true + loader: + shuffle: true + batch_size_per_card: 96 + drop_last: true + num_workers: 8 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - RecResizeImg: + image_shape: + - 3 + - 48 + - 320 + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 +profiler_options: null diff --git a/configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml b/configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml index e2aa50106ff60aa61858a22ba6fdd03b8cd04d85..793bb5c91da2e3d53cda76f0c85889d6529e6c59 100644 --- a/configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml +++ b/configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml @@ -27,7 +27,7 @@ Optimizer: beta2: 0.999 lr: name: Piecewise - decay_epochs : [700, 800] + decay_epochs : [700] values : [0.001, 0.0001] warmup_epoch: 5 regularizer: diff --git a/configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml b/configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml index ab48b99791d00785d143cd933ccc31b3f69d0f8f..3855005cc391ef84f8e036fdd36a082e68b810f4 100644 --- a/configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml +++ b/configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml @@ -19,6 +19,7 @@ Global: use_space_char: true distributed: true save_res_path: ./output/rec/predicts_pp-OCRv2_distillation.txt + amp_custom_black_list: ['matmul','matmul_v2','elementwise_add'] Optimizer: @@ -27,7 +28,7 @@ Optimizer: beta2: 0.999 lr: name: Piecewise - decay_epochs : [700, 800] + decay_epochs : [700] values : [0.001, 0.0001] warmup_epoch: 5 regularizer: diff --git a/configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_enhanced_ctc_loss.yml b/configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_enhanced_ctc_loss.yml index 
5be96969fd0d4912a4ff2a09c9d181b1b17d633e..ef0e893faee5333beb4838d5dbfde96efbe5e7f8 100644 --- a/configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_enhanced_ctc_loss.yml +++ b/configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_enhanced_ctc_loss.yml @@ -27,7 +27,7 @@ Optimizer: beta2: 0.999 lr: name: Piecewise - decay_epochs : [700, 800] + decay_epochs : [700] values : [0.001, 0.0001] warmup_epoch: 5 regularizer: diff --git a/configs/rec/rec_resnet_stn_bilstm_att.yml b/configs/rec/rec_resnet_stn_bilstm_att.yml index 0bb90b35264b424c58a45685f5a2a066843298a6..20ed9533c2bde761b0d28391b631c9487ca70f91 100644 --- a/configs/rec/rec_resnet_stn_bilstm_att.yml +++ b/configs/rec/rec_resnet_stn_bilstm_att.yml @@ -1,6 +1,6 @@ Global: use_gpu: True - epoch_num: 400 + epoch_num: 6 log_smooth_window: 20 print_batch_step: 10 save_model_dir: ./output/rec/seed @@ -27,7 +27,7 @@ Optimizer: momentum: 0.9 lr: name: Piecewise - decay_epochs: [4,5,8] + decay_epochs: [4, 5] values: [1.0, 0.1, 0.01] regularizer: name: 'L2' diff --git a/configs/rec/rec_satrn.yml b/configs/rec/rec_satrn.yml new file mode 100644 index 0000000000000000000000000000000000000000..8ed688b65b75ab4fad5f3c06b58ec8e78bcf59fd --- /dev/null +++ b/configs/rec/rec_satrn.yml @@ -0,0 +1,117 @@ +Global: + use_gpu: true + epoch_num: 5 + log_smooth_window: 20 + print_batch_step: 50 + save_model_dir: ./output/rec/rec_satrn/ + save_epoch_step: 1 + # evaluation is run every 5000 iterations + eval_batch_step: [0, 5000] + cal_metric_during_train: False + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: + # for data or label process + character_dict_path: ppocr/utils/dict90.txt + max_text_length: 25 + infer_mode: False + use_space_char: False + rm_symbol: True + save_res_path: ./output/rec/predicts_satrn.txt + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Piecewise + decay_epochs: [3, 4] + values: [0.0003, 0.00003, 0.000003] + regularizer: + name: 'L2' + factor: 0 + +Architecture: + model_type: rec + algorithm: SATRN + Backbone: + name: ShallowCNN + in_channels: 3 + hidden_dim: 256 + Head: + name: SATRNHead + enc_cfg: + n_layers: 6 + n_head: 8 + d_k: 32 + d_v: 32 + d_model: 256 + n_position: 100 + d_inner: 1024 + dropout: 0.1 + dec_cfg: + n_layers: 6 + d_embedding: 256 + n_head: 8 + d_model: 256 + d_inner: 1024 + d_k: 32 + d_v: 32 + max_seq_len: 25 + start_idx: 91 + +Loss: + name: SATRNLoss + +PostProcess: + name: SATRNLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/training/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - SATRNLabelEncode: # Class handling label + - SVTRRecResizeImg: + image_shape: [3, 32, 100] + padding: False + - KeepKeys: + keep_keys: ['image', 'label', 'valid_ratio'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 128 + drop_last: True + num_workers: 8 + use_shared_memory: False + +Eval: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/evaluation/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - SATRNLabelEncode: # Class handling label + - SVTRRecResizeImg: + image_shape: [3, 32, 100] + padding: False + - KeepKeys: + keep_keys: ['image', 'label', 'valid_ratio'] # dataloader will return list in this order + + loader: + shuffle: False + drop_last: False + batch_size_per_card: 128 + num_workers: 4 + use_shared_memory: False + diff --git 
a/configs/rec/rec_svtrnet.yml b/configs/rec/rec_svtrnet.yml index e8ceefead6e42de5167984ffa0c18f7ecb03157b..4657c5c8d607b825eebaa94049cdb0c5237d9475 100644 --- a/configs/rec/rec_svtrnet.yml +++ b/configs/rec/rec_svtrnet.yml @@ -20,16 +20,17 @@ Global: infer_mode: False use_space_char: False save_res_path: ./output/rec/predicts_svtr_tiny.txt + d2s_train_image_shape: [3, 64, 256] Optimizer: name: AdamW beta1: 0.9 beta2: 0.99 - epsilon: 8.e-8 + epsilon: 1.e-8 weight_decay: 0.05 no_weight_decay_name: norm pos_embed - one_dim_param_no_weight_decay: true + one_dim_param_no_weight_decay: True lr: name: Cosine learning_rate: 0.0005 @@ -48,7 +49,7 @@ Architecture: Backbone: name: SVTRNet img_size: [32, 100] - out_char_num: 25 + out_char_num: 25 # W//4 or W//8 or W/12 out_channels: 192 patch_merging: 'Conv' embed_dim: [64, 128, 256] @@ -57,7 +58,7 @@ Architecture: mixer: ['Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global'] local_mixer: [[7, 11], [7, 11], [7, 11]] last_stage: True - prenorm: false + prenorm: False Neck: name: SequenceEncoder encoder_type: reshape @@ -82,6 +83,8 @@ Train: - DecodeImage: # load image img_mode: BGR channel_first: False + - SVTRRecAug: + aug_type: 0 # or 1 - CTCLabelEncode: # Class handling label - SVTRRecResizeImg: image_shape: [3, 64, 256] @@ -92,7 +95,7 @@ Train: shuffle: True batch_size_per_card: 512 drop_last: True - num_workers: 4 + num_workers: 8 Eval: dataset: diff --git a/configs/rec/rec_svtrnet_ch.yml b/configs/rec/rec_svtrnet_ch.yml index 0d3f63d125ea12fa097fe49454b04423710e2f68..bb82cfc5e10785d37cd81dca0b22a070da6adb3b 100644 --- a/configs/rec/rec_svtrnet_ch.yml +++ b/configs/rec/rec_svtrnet_ch.yml @@ -19,11 +19,12 @@ Global: infer_mode: false use_space_char: true save_res_path: ./output/rec/predicts_svtr_tiny_ch_all.txt + d2s_train_image_shape: [3, 32, 320] Optimizer: name: AdamW beta1: 0.9 beta2: 0.99 - epsilon: 8.0e-08 + epsilon: 1.0e-08 weight_decay: 0.05 no_weight_decay_name: norm pos_embed one_dim_param_no_weight_decay: true @@ -40,7 +41,7 @@ Architecture: img_size: - 32 - 320 - out_char_num: 40 + out_char_num: 40 # W//4 or W//8 or W/12 out_channels: 96 patch_merging: Conv embed_dim: diff --git a/configs/table/SLANet.yml b/configs/table/SLANet.yml index a896614556e36f77bd784218b6c2f29914219dbe..3f2d6b28eabf35da227a981e7783ccd59fa62333 100644 --- a/configs/table/SLANet.yml +++ b/configs/table/SLANet.yml @@ -21,6 +21,8 @@ Global: infer_mode: False use_sync_bn: True save_res_path: 'output/infer' + d2s_train_image_shape: [3, -1, -1] + amp_custom_white_list: ['concat', 'elementwise_sub', 'set_value'] Optimizer: name: Adam diff --git a/configs/table/table_master.yml b/configs/table/table_master.yml index df437f7c95523c5fe12f7166d011b4ad8473628b..125162f1889914b7bd27637044497addb580a1aa 100755 --- a/configs/table/table_master.yml +++ b/configs/table/table_master.yml @@ -17,6 +17,7 @@ Global: infer_mode: false max_text_length: &max_text_length 500 box_format: &box_format 'xywh' # 'xywh', 'xyxy', 'xyxyxyxy' + d2s_train_image_shape: [3, 480, 480] Optimizer: diff --git a/configs/table/table_mv3.yml b/configs/table/table_mv3.yml index 9d286f4153eaab44bf0d259bbad4a0b3b8ada568..50c84393af4f50b11c76da5f4c70405083a85daf 100755 --- a/configs/table/table_mv3.yml +++ b/configs/table/table_mv3.yml @@ -20,6 +20,7 @@ Global: max_text_length: &max_text_length 500 box_format: &box_format 'xyxy' # 'xywh', 'xyxy', 'xyxyxyxy' infer_mode: False + amp_custom_black_list: ['matmul_v2','elementwise_add'] Optimizer: name: Adam diff 
--git a/deploy/avh/convert_image.py b/deploy/avh/convert_image.py index 747ab29e4ad2b577890298626da756b7bac6047c..7c6dbd7fd871e1977ebe17bdfccb10421b4b5e52 100755 --- a/deploy/avh/convert_image.py +++ b/deploy/avh/convert_image.py @@ -24,6 +24,7 @@ import math from PIL import Image import numpy as np + def resize_norm_img(img, image_shape, padding=True): imgC, imgH, imgW = image_shape h = img.shape[0] @@ -61,9 +62,8 @@ def create_header_file(name, tensor_name, tensor_data, output_path): raw_path = file_path.with_suffix(".h").resolve() with open(raw_path, "w") as header_file: header_file.write( - "\n" - + f"const size_t {tensor_name}_len = {tensor_data.size};\n" - + f'__attribute__((section(".data.tvm"), aligned(16))) float {tensor_name}[] = ' + "\n" + f"const size_t {tensor_name}_len = {tensor_data.size};\n" + + f'__attribute__((section(".data.tvm"), aligned(16))) float {tensor_name}[] = ' ) header_file.write("{") @@ -80,22 +80,21 @@ def create_headers(image_name): # Resize image to 32x320 img = cv2.imread(img_path) - img = resize_norm_img(img, [3,32,320]) + img = resize_norm_img(img, [3, 32, 320]) img_data = img.astype("float32") - + # # Add the batch dimension, as we are expecting 4-dimensional input: NCHW. img_data = np.expand_dims(img_data, axis=0) # Create input header file create_header_file("inputs", "input", img_data, "./include") # Create output header file - output_data = np.zeros([7760], np.float) + output_data = np.zeros([7760], np.float32) create_header_file( "outputs", "output", output_data, - "./include", - ) + "./include", ) if __name__ == "__main__": diff --git a/deploy/avh/requirements.txt b/deploy/avh/requirements.txt index 1bf86ed1107cb18bf1ea6ac8eb2cc1214d654b60..a1a8626f19aa1a35b5b1b9ac636d03d15631326e 100644 --- a/deploy/avh/requirements.txt +++ b/deploy/avh/requirements.txt @@ -1,3 +1,4 @@ paddlepaddle numpy -opencv-python \ No newline at end of file +opencv-python +typing-extensions diff --git a/deploy/cpp_infer/docs/windows_vs2019_build.md b/deploy/cpp_infer/docs/windows_vs2019_build.md index bcaefa46f83a30a4c232add78dc2e9f521b9f84f..2f5c5818d7d977c78c15e29d7c1bbebd98ce99bf 100644 --- a/deploy/cpp_infer/docs/windows_vs2019_build.md +++ b/deploy/cpp_infer/docs/windows_vs2019_build.md @@ -121,7 +121,7 @@ CUDA_LIB、CUDNN_LIB、TENSORRT_DIR、WITH_GPU、WITH_TENSORRT ``` cd /d D:\projects\cpp\PaddleOCR\deploy\cpp_infer ``` -可执行文件`ppocr.exe`即为样例的预测程序,其主要使用方法如下,更多使用方法可以参考[说明文档](../readme.md)`运行demo`部分。 +可执行文件`ppocr.exe`即为样例的预测程序,其主要使用方法如下,更多使用方法可以参考[说明文档](../readme_ch.md)`运行demo`部分。 ```shell # 切换终端编码为utf8 diff --git a/deploy/cpp_infer/include/paddleocr.h b/deploy/cpp_infer/include/paddleocr.h index 16750a15f70d374f8aa837042ba6a13bc10a5d35..85b9d15a7edf1e90105a51897083a1dc9a544ecc 100644 --- a/deploy/cpp_infer/include/paddleocr.h +++ b/deploy/cpp_infer/include/paddleocr.h @@ -23,7 +23,7 @@ namespace PaddleOCR { class PPOCR { public: explicit PPOCR(); - ~PPOCR(); + ~PPOCR() = default; std::vector> ocr(std::vector img_list, bool det = true, @@ -47,9 +47,9 @@ protected: std::vector &ocr_results); private: - DBDetector *detector_ = nullptr; - Classifier *classifier_ = nullptr; - CRNNRecognizer *recognizer_ = nullptr; + std::unique_ptr detector_; + std::unique_ptr classifier_; + std::unique_ptr recognizer_; }; } // namespace PaddleOCR diff --git a/deploy/cpp_infer/include/paddlestructure.h b/deploy/cpp_infer/include/paddlestructure.h index 8478a85cdec23984f86a323f55a4591d52bcf08c..9ae54f48f7e246dfba0263c9410bdaeb6b6f6ca6 100644 --- a/deploy/cpp_infer/include/paddlestructure.h +++ 
b/deploy/cpp_infer/include/paddlestructure.h @@ -23,7 +23,7 @@ namespace PaddleOCR { class PaddleStructure : public PPOCR { public: explicit PaddleStructure(); - ~PaddleStructure(); + ~PaddleStructure() = default; std::vector structure(cv::Mat img, bool layout = false, @@ -37,8 +37,8 @@ private: std::vector time_info_table = {0, 0, 0}; std::vector time_info_layout = {0, 0, 0}; - StructureTableRecognizer *table_model_ = nullptr; - StructureLayoutRecognizer *layout_model_ = nullptr; + std::unique_ptr table_model_; + std::unique_ptr layout_model_; void layout(cv::Mat img, std::vector &structure_result); diff --git a/deploy/cpp_infer/src/main.cpp b/deploy/cpp_infer/src/main.cpp index 0c155dd0eca04874d23c3be7e6eff241b73f5f1b..b522a2d151adbe5d2522e33f5b09f891d18b175b 100644 --- a/deploy/cpp_infer/src/main.cpp +++ b/deploy/cpp_infer/src/main.cpp @@ -82,7 +82,7 @@ void check_params() { } void ocr(std::vector &cv_all_img_names) { - PPOCR ocr = PPOCR(); + PPOCR ocr; if (FLAGS_benchmark) { ocr.reset_timer(); @@ -120,7 +120,7 @@ void ocr(std::vector &cv_all_img_names) { } void structure(std::vector &cv_all_img_names) { - PaddleOCR::PaddleStructure engine = PaddleOCR::PaddleStructure(); + PaddleOCR::PaddleStructure engine; if (FLAGS_benchmark) { engine.reset_timer(); diff --git a/deploy/cpp_infer/src/ocr_cls.cpp b/deploy/cpp_infer/src/ocr_cls.cpp index 13a03d6ad4564e710631ce62f99c622b47d6905f..6f2b5509e63a0e8d3d81d314cb0fe3a2f3763642 100644 --- a/deploy/cpp_infer/src/ocr_cls.cpp +++ b/deploy/cpp_infer/src/ocr_cls.cpp @@ -20,12 +20,9 @@ void Classifier::Run(std::vector img_list, std::vector &cls_labels, std::vector &cls_scores, std::vector ×) { - std::chrono::duration preprocess_diff = - std::chrono::steady_clock::now() - std::chrono::steady_clock::now(); - std::chrono::duration inference_diff = - std::chrono::steady_clock::now() - std::chrono::steady_clock::now(); - std::chrono::duration postprocess_diff = - std::chrono::steady_clock::now() - std::chrono::steady_clock::now(); + std::chrono::duration preprocess_diff = std::chrono::duration::zero(); + std::chrono::duration inference_diff = std::chrono::duration::zero(); + std::chrono::duration postprocess_diff = std::chrono::duration::zero(); int img_num = img_list.size(); std::vector cls_image_shape = {3, 48, 192}; diff --git a/deploy/cpp_infer/src/ocr_rec.cpp b/deploy/cpp_infer/src/ocr_rec.cpp index 96715163681092c0075fdbf456cc38b1679d82b9..cf3e58d42afdc49a203999253929af228cdab5f7 100644 --- a/deploy/cpp_infer/src/ocr_rec.cpp +++ b/deploy/cpp_infer/src/ocr_rec.cpp @@ -20,12 +20,9 @@ void CRNNRecognizer::Run(std::vector img_list, std::vector &rec_texts, std::vector &rec_text_scores, std::vector ×) { - std::chrono::duration preprocess_diff = - std::chrono::steady_clock::now() - std::chrono::steady_clock::now(); - std::chrono::duration inference_diff = - std::chrono::steady_clock::now() - std::chrono::steady_clock::now(); - std::chrono::duration postprocess_diff = - std::chrono::steady_clock::now() - std::chrono::steady_clock::now(); + std::chrono::duration preprocess_diff = std::chrono::duration::zero(); + std::chrono::duration inference_diff = std::chrono::duration::zero(); + std::chrono::duration postprocess_diff = std::chrono::duration::zero(); int img_num = img_list.size(); std::vector width_list; diff --git a/deploy/cpp_infer/src/paddleocr.cpp b/deploy/cpp_infer/src/paddleocr.cpp index 86747c60d682c4f2df66a8bc8f5c9dae68b80170..e0956474d1577741d81e44bcad53f11aaca55778 100644 --- a/deploy/cpp_infer/src/paddleocr.cpp +++ 
b/deploy/cpp_infer/src/paddleocr.cpp @@ -21,28 +21,28 @@ namespace PaddleOCR { PPOCR::PPOCR() { if (FLAGS_det) { - this->detector_ = new DBDetector( + this->detector_.reset(new DBDetector( FLAGS_det_model_dir, FLAGS_use_gpu, FLAGS_gpu_id, FLAGS_gpu_mem, FLAGS_cpu_threads, FLAGS_enable_mkldnn, FLAGS_limit_type, FLAGS_limit_side_len, FLAGS_det_db_thresh, FLAGS_det_db_box_thresh, FLAGS_det_db_unclip_ratio, FLAGS_det_db_score_mode, FLAGS_use_dilation, - FLAGS_use_tensorrt, FLAGS_precision); + FLAGS_use_tensorrt, FLAGS_precision)); } if (FLAGS_cls && FLAGS_use_angle_cls) { - this->classifier_ = new Classifier( + this->classifier_.reset(new Classifier( FLAGS_cls_model_dir, FLAGS_use_gpu, FLAGS_gpu_id, FLAGS_gpu_mem, FLAGS_cpu_threads, FLAGS_enable_mkldnn, FLAGS_cls_thresh, - FLAGS_use_tensorrt, FLAGS_precision, FLAGS_cls_batch_num); + FLAGS_use_tensorrt, FLAGS_precision, FLAGS_cls_batch_num)); } if (FLAGS_rec) { - this->recognizer_ = new CRNNRecognizer( + this->recognizer_.reset(new CRNNRecognizer( FLAGS_rec_model_dir, FLAGS_use_gpu, FLAGS_gpu_id, FLAGS_gpu_mem, FLAGS_cpu_threads, FLAGS_enable_mkldnn, FLAGS_rec_char_dict_path, FLAGS_use_tensorrt, FLAGS_precision, FLAGS_rec_batch_num, - FLAGS_rec_img_h, FLAGS_rec_img_w); + FLAGS_rec_img_h, FLAGS_rec_img_w)); } -}; +} std::vector> PPOCR::ocr(std::vector img_list, bool det, bool rec, bool cls) { @@ -51,7 +51,7 @@ PPOCR::ocr(std::vector img_list, bool det, bool rec, bool cls) { if (!det) { std::vector ocr_result; ocr_result.resize(img_list.size()); - if (cls && this->classifier_ != nullptr) { + if (cls && this->classifier_) { this->cls(img_list, ocr_result); for (int i = 0; i < img_list.size(); i++) { if (ocr_result[i].cls_label % 2 == 1 && @@ -92,7 +92,7 @@ std::vector PPOCR::ocr(cv::Mat img, bool det, bool rec, img_list.push_back(crop_img); } // cls - if (cls && this->classifier_ != nullptr) { + if (cls && this->classifier_) { this->cls(img_list, ocr_result); for (int i = 0; i < img_list.size(); i++) { if (ocr_result[i].cls_label % 2 == 1 && @@ -190,16 +190,4 @@ void PPOCR::benchmark_log(int img_num) { } } -PPOCR::~PPOCR() { - if (this->detector_ != nullptr) { - delete this->detector_; - } - if (this->classifier_ != nullptr) { - delete this->classifier_; - } - if (this->recognizer_ != nullptr) { - delete this->recognizer_; - } -}; - } // namespace PaddleOCR diff --git a/deploy/cpp_infer/src/paddlestructure.cpp b/deploy/cpp_infer/src/paddlestructure.cpp index b2e35f8c777bde3cea0a3fefd0ce8517d8d75318..bde687e2c2611b23e3eef051cef45a495858cf17 100644 --- a/deploy/cpp_infer/src/paddlestructure.cpp +++ b/deploy/cpp_infer/src/paddlestructure.cpp @@ -21,20 +21,20 @@ namespace PaddleOCR { PaddleStructure::PaddleStructure() { if (FLAGS_layout) { - this->layout_model_ = new StructureLayoutRecognizer( + this->layout_model_.reset(new StructureLayoutRecognizer( FLAGS_layout_model_dir, FLAGS_use_gpu, FLAGS_gpu_id, FLAGS_gpu_mem, FLAGS_cpu_threads, FLAGS_enable_mkldnn, FLAGS_layout_dict_path, FLAGS_use_tensorrt, FLAGS_precision, FLAGS_layout_score_threshold, - FLAGS_layout_nms_threshold); + FLAGS_layout_nms_threshold)); } if (FLAGS_table) { - this->table_model_ = new StructureTableRecognizer( + this->table_model_.reset(new StructureTableRecognizer( FLAGS_table_model_dir, FLAGS_use_gpu, FLAGS_gpu_id, FLAGS_gpu_mem, FLAGS_cpu_threads, FLAGS_enable_mkldnn, FLAGS_table_char_dict_path, FLAGS_use_tensorrt, FLAGS_precision, FLAGS_table_batch_num, - FLAGS_table_max_len, FLAGS_merge_no_span_structure); + FLAGS_table_max_len, FLAGS_merge_no_span_structure)); } -}; +} 
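Note on the memory-management change in `paddleocr.h`/`paddleocr.cpp` and `paddlestructure.h`/`paddlestructure.cpp`: the raw owning pointers (`DBDetector *`, `Classifier *`, etc.) become `std::unique_ptr` members, the constructors switch from `new` assignments to `reset(new ...)`, the `!= nullptr` guards become plain boolean tests, and the hand-written destructors that called `delete` can be replaced with `= default`. The snippet below is only a minimal, self-contained sketch of that pattern; the `Engine`/`Detector` names are illustrative placeholders and are not classes from this repository.

```cpp
// Sketch of the raw-pointer -> std::unique_ptr refactor used in this patch.
// Illustrative names only (Engine/Detector are not the repo's real classes).
#include <iostream>
#include <memory>
#include <string>

struct Detector {
  explicit Detector(const std::string &model_dir) : model_dir_(model_dir) {}
  void Run() const { std::cout << "detect with " << model_dir_ << "\n"; }
  std::string model_dir_;
};

class Engine {
public:
  explicit Engine(bool use_det) {
    if (use_det) {
      // reset(new ...) replaces `detector_ = new Detector(...)`.
      detector_.reset(new Detector("det_model"));
    }
  }
  ~Engine() = default;  // no manual `delete detector_` needed any more

  void Run() const {
    // `if (detector_)` replaces the old `detector_ != nullptr` check.
    if (detector_) {
      detector_->Run();
    }
  }

private:
  std::unique_ptr<Detector> detector_;  // was: Detector *detector_ = nullptr;
};

int main() {
  Engine engine(true);
  engine.Run();
}  // detector_ is released automatically when engine goes out of scope
```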
std::vector PaddleStructure::structure(cv::Mat srcimg, bool layout, bool table, bool ocr) { @@ -65,7 +65,7 @@ PaddleStructure::structure(cv::Mat srcimg, bool layout, bool table, bool ocr) { } return structure_results; -}; +} void PaddleStructure::layout( cv::Mat img, std::vector &structure_result) { @@ -123,7 +123,7 @@ void PaddleStructure::table(cv::Mat img, structure_result.cell_box = structure_boxes[i]; structure_result.html_score = structure_scores[i]; } -}; +} std::string PaddleStructure::rebuild_table(std::vector structure_html_tags, @@ -286,10 +286,4 @@ void PaddleStructure::benchmark_log(int img_num) { } } -PaddleStructure::~PaddleStructure() { - if (this->table_model_ != nullptr) { - delete this->table_model_; - } -}; - -} // namespace PaddleOCR \ No newline at end of file +} // namespace PaddleOCR diff --git a/deploy/cpp_infer/src/preprocess_op.cpp b/deploy/cpp_infer/src/preprocess_op.cpp index b0261a9ed2b177ed58e07c042004e28c70b9762a..4dabb1206800cc19805bbcc330c0628f7679f4d8 100644 --- a/deploy/cpp_infer/src/preprocess_op.cpp +++ b/deploy/cpp_infer/src/preprocess_op.cpp @@ -112,7 +112,7 @@ void CrnnResizeImg::Run(const cv::Mat &img, cv::Mat &resize_img, float wh_ratio, cv::INTER_LINEAR); cv::copyMakeBorder(resize_img, resize_img, 0, 0, 0, int(imgW - resize_img.cols), cv::BORDER_CONSTANT, - {127, 127, 127}); + {0, 0, 0}); } void ClsResizeImg::Run(const cv::Mat &img, cv::Mat &resize_img, diff --git a/deploy/cpp_infer/src/utility.cpp b/deploy/cpp_infer/src/utility.cpp index 4a8b181494fca768b153e0825e8be0853f7f3aef..ea5ef1e4ac2bf622fefa2998776eab2f8d92028b 100644 --- a/deploy/cpp_infer/src/utility.cpp +++ b/deploy/cpp_infer/src/utility.cpp @@ -308,7 +308,7 @@ void Utility::sorted_boxes(std::vector &ocr_result) { std::sort(ocr_result.begin(), ocr_result.end(), Utility::comparison_box); if (ocr_result.size() > 0) { for (int i = 0; i < ocr_result.size() - 1; i++) { - for (int j = i; j > 0; j--) { + for (int j = i; j >= 0; j--) { if (abs(ocr_result[j + 1].box[0][1] - ocr_result[j].box[0][1]) < 10 && (ocr_result[j + 1].box[0][0] < ocr_result[j].box[0][0])) { std::swap(ocr_result[i], ocr_result[i + 1]); diff --git a/deploy/fastdeploy/README.md b/deploy/fastdeploy/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3c157b4cfbd4096cf52afe168202183bf7b9d420 --- /dev/null +++ b/deploy/fastdeploy/README.md @@ -0,0 +1,88 @@ +# PaddleOCR高性能全场景模型部署方案—FastDeploy + +## 目录 +- [FastDeploy介绍](#FastDeploy介绍) +- [PaddleOCR模型部署](#PaddleOCR模型部署) +- [常见问题](#常见问题) + +## 1. FastDeploy介绍 +
+ +**[⚡️FastDeploy](https://github.com/PaddlePaddle/FastDeploy)**是一款**全场景**、**易用灵活**、**极致高效**的AI推理部署工具,支持**云边端**部署.使用FastDeploy可以简单高效的在X86 CPU、NVIDIA GPU、飞腾CPU、ARM CPU、Intel GPU、昆仑、昇腾、算能、瑞芯微等10+款硬件上对PaddleOCR模型进行快速部署,并且支持Paddle Inference、Paddle Lite、TensorRT、OpenVINO、ONNXRuntime、SOPHGO、RKNPU2等多种推理后端. + +
+ + + +
+ +## 2. PaddleOCR模型部署 +
+ +### 2.1 硬件支持列表 + +|硬件类型|该硬件是否支持|使用指南|Python|C++| +|:---:|:---:|:---:|:---:|:---:| +|X86 CPU|✅|[链接](./cpu-gpu)|✅|✅| +|NVIDIA GPU|✅|[链接](./cpu-gpu)|✅|✅| +|飞腾CPU|✅|[链接](./cpu-gpu)|✅|✅| +|ARM CPU|✅|[链接](./cpu-gpu)|✅|✅| +|Intel GPU(集成显卡)|✅|[链接](./cpu-gpu)|✅|✅| +|Intel GPU(独立显卡)|✅|[链接](./cpu-gpu)|✅|✅| +|昆仑|✅|[链接](./kunlunxin)|✅|✅| +|昇腾|✅|[链接](./ascend)|✅|✅| +|算能|✅|[链接](./sophgo)|✅|✅| +|瑞芯微|✅|[链接](./rockchip)|✅|✅| + +### 2.2. 详细使用文档 +- X86 CPU + - [部署模型准备](./cpu-gpu) + - [Python部署示例](./cpu-gpu/python/) + - [C++部署示例](./cpu-gpu/cpp/) +- NVIDIA GPU + - [部署模型准备](./cpu-gpu) + - [Python部署示例](./cpu-gpu/python/) + - [C++部署示例](./cpu-gpu/cpp/) +- 飞腾CPU + - [部署模型准备](./cpu-gpu) + - [Python部署示例](./cpu-gpu/python/) + - [C++部署示例](./cpu-gpu/cpp/) +- ARM CPU + - [部署模型准备](./cpu-gpu) + - [Python部署示例](./cpu-gpu/python/) + - [C++部署示例](./cpu-gpu/cpp/) +- Intel GPU + - [部署模型准备](./cpu-gpu) + - [Python部署示例](./cpu-gpu/python/) + - [C++部署示例](./cpu-gpu/cpp/) +- 昆仑 XPU + - [部署模型准备](./kunlunxin) + - [Python部署示例](./kunlunxin/python/) + - [C++部署示例](./kunlunxin/cpp/) +- 昇腾 Ascend + - [部署模型准备](./ascend) + - [Python部署示例](./ascend/python/) + - [C++部署示例](./ascend/cpp/) +- 算能 Sophgo + - [部署模型准备](./sophgo/) + - [Python部署示例](./sophgo/python/) + - [C++部署示例](./sophgo/cpp/) +- 瑞芯微 Rockchip + - [部署模型准备](./rockchip/) + - [Python部署示例](./rockchip/rknpu2/) + - [C++部署示例](./rockchip/rknpu2/) + +### 2.3 更多部署方式 + +- [Android ARM CPU部署](./android) +- [服务化Serving部署](./serving) +- [web部署](./web) + + +## 3. 常见问题 +
+ +遇到问题可查看常见问题集合,搜索FastDeploy issue,*或给FastDeploy提交[issue](https://github.com/PaddlePaddle/FastDeploy/issues)*: + +[常见问题集合](https://github.com/PaddlePaddle/FastDeploy/tree/develop/docs/cn/faq) +[FastDeploy issues](https://github.com/PaddlePaddle/FastDeploy/issues) diff --git a/deploy/fastdeploy/android/.gitignore b/deploy/fastdeploy/android/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..f6eba672f08bc874e4b639e2a6323b2e1d6697a3 --- /dev/null +++ b/deploy/fastdeploy/android/.gitignore @@ -0,0 +1,20 @@ +.DS_Store +.idea +.gradle +.cxx +cache +build +app/cache +app/libs/fastdeploy* +app/.cxx +app/build +app/src/main/assets/models/* +app/.gradle +app/.idea +fastdeploy/cache +fastdeploy/libs/fastdeploy* +fastdeploy/.cxx +fastdeploy/build +fastdeploy/src/main/assets/models/* +fastdeploy/.gradle +fastdeploy/.idea diff --git a/deploy/fastdeploy/android/README.md b/deploy/fastdeploy/android/README.md new file mode 100644 index 0000000000000000000000000000000000000000..19ff8a01955801f2afa77aa6ebe6f5c3a6fd98fa --- /dev/null +++ b/deploy/fastdeploy/android/README.md @@ -0,0 +1,223 @@ +[English](README.md) | 简体中文 +# PaddleOCR Android Demo 使用文档 + +在 Android 上实现实时的PaddleOCR文字识别功能,此 Demo 有很好的的易用性和开放性,如在 Demo 中跑自己训练好的模型等。 + +## 环境准备 + +1. 在本地环境安装好 Android Studio 工具,详细安装方法请见[Android Stuido 官网](https://developer.android.com/studio)。 +2. 准备一部 Android 手机,并开启 USB 调试模式。开启方法: `手机设置 -> 查找开发者选项 -> 打开开发者选项和 USB 调试模式` + +## 部署步骤 + +1. 用 Android Studio 打开 PP-OCRv3/android 工程 +2. 手机连接电脑,打开 USB 调试和文件传输模式,并在 Android Studio 上连接自己的手机设备(手机需要开启允许从 USB 安装软件权限) + +


+ +> **注意:** +>> 如果您在导入项目、编译或者运行过程中遇到 NDK 配置错误的提示,请打开 ` File > Project Structure > SDK Location`,修改 `Andriod SDK location` 为您本机配置的 SDK 所在路径。 + +4. 点击 Run 按钮,自动编译 APP 并安装到手机。(该过程会自动下载预编译的 FastDeploy Android 库 以及 模型文件,需要联网) + 成功后效果如下,图一:APP 安装到手机;图二: APP 打开后的效果,会自动识别图片中的物体并标记;图三:APP设置选项,点击右上角的设置图片,可以设置不同选项进行体验。 + +| APP 图标 | APP 效果 | APP设置项 + | --- | --- | --- | +| ![app_pic](https://user-images.githubusercontent.com/14995488/203484427-83de2316-fd60-4baf-93b6-3755f9b5559d.jpg) | ![app_res](https://user-images.githubusercontent.com/14995488/203495616-af42a5b7-d3bc-4fce-8d5e-2ed88454f618.jpg) | ![app_setup](https://user-images.githubusercontent.com/14995488/203484436-57fdd041-7dcc-4e0e-b6cb-43e5ac1e729b.jpg) | + +### PP-OCRv3 Java API 说明 + +- 模型初始化 API: 模型初始化API包含两种方式,方式一是通过构造函数直接初始化;方式二是,通过调用init函数,在合适的程序节点进行初始化。 PP-OCR初始化参数说明如下: + - modelFile: String, paddle格式的模型文件路径,如 model.pdmodel + - paramFile: String, paddle格式的参数文件路径,如 model.pdiparams + - labelFile: String, 可选参数,表示label标签文件所在路径,用于可视化,如 ppocr_keys_v1.txt,每一行包含一个label + - option: RuntimeOption,可选参数,模型初始化option。如果不传入该参数则会使用默认的运行时选项。 + 与其他模型不同的是,PP-OCRv3 包含 DBDetector、Classifier和Recognizer等基础模型,以及pipeline类型。 +```java +// 构造函数: constructor w/o label file +public DBDetector(String modelFile, String paramsFile); +public DBDetector(String modelFile, String paramsFile, RuntimeOption option); +public Classifier(String modelFile, String paramsFile); +public Classifier(String modelFile, String paramsFile, RuntimeOption option); +public Recognizer(String modelFile, String paramsFile, String labelPath); +public Recognizer(String modelFile, String paramsFile, String labelPath, RuntimeOption option); +public PPOCRv3(); // 空构造函数,之后可以调用init初始化 +// Constructor w/o classifier +public PPOCRv3(DBDetector detModel, Recognizer recModel); +public PPOCRv3(DBDetector detModel, Classifier clsModel, Recognizer recModel); +``` +- 模型预测 API:模型预测API包含直接预测的API以及带可视化功能的API。直接预测是指,不保存图片以及不渲染结果到Bitmap上,仅预测推理结果。预测并且可视化是指,预测结果以及可视化,并将可视化后的图片保存到指定的途径,以及将可视化结果渲染在Bitmap(目前支持ARGB8888格式的Bitmap), 后续可将该Bitmap在camera中进行显示。 +```java +// 直接预测:不保存图片以及不渲染结果到Bitmap上 +public OCRResult predict(Bitmap ARGB8888Bitmap); +// 预测并且可视化:预测结果以及可视化,并将可视化后的图片保存到指定的途径,以及将可视化结果渲染在Bitmap上 +public OCRResult predict(Bitmap ARGB8888Bitmap, String savedImagePath); +public OCRResult predict(Bitmap ARGB8888Bitmap, boolean rendering); // 只渲染 不保存图片 +``` +- 模型资源释放 API:调用 release() API 可以释放模型资源,返回true表示释放成功,false表示失败;调用 initialized() 可以判断模型是否初始化成功,true表示初始化成功,false表示失败。 +```java +public boolean release(); // 释放native资源 +public boolean initialized(); // 检查是否初始化成功 +``` + +- RuntimeOption设置说明 + +```java +public void enableLiteFp16(); // 开启fp16精度推理 +public void disableLiteFP16(); // 关闭fp16精度推理 +public void enableLiteInt8(); // 开启int8精度推理,针对量化模型 +public void disableLiteInt8(); // 关闭int8精度推理 +public void setCpuThreadNum(int threadNum); // 设置线程数 +public void setLitePowerMode(LitePowerMode mode); // 设置能耗模式 +public void setLitePowerMode(String modeStr); // 通过字符串形式设置能耗模式 +``` + +- 模型结果OCRResult说明 +```java +public class OCRResult { + public int[][] mBoxes; // 表示单张图片检测出来的所有目标框坐标,每个框以8个int数值依次表示框的4个坐标点,顺序为左下,右下,右上,左上 + public String[] mText; // 表示多个文本框内被识别出来的文本内容 + public float[] mRecScores; // 表示文本框内识别出来的文本的置信度 + public float[] mClsScores; // 表示文本框的分类结果的置信度 + public int[] mClsLabels; // 表示文本框的方向分类类别 + public boolean mInitialized = false; // 检测结果是否有效 +} +``` +其他参考:C++/Python对应的OCRResult说明: [api/vision_results/ocr_result.md](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/api/vision_results/ocr_result.md) + + +- 
模型调用示例1:使用构造函数 +```java +import java.nio.ByteBuffer; +import android.graphics.Bitmap; +import android.opengl.GLES20; + +import com.baidu.paddle.fastdeploy.RuntimeOption; +import com.baidu.paddle.fastdeploy.LitePowerMode; +import com.baidu.paddle.fastdeploy.vision.OCRResult; +import com.baidu.paddle.fastdeploy.vision.ocr.Classifier; +import com.baidu.paddle.fastdeploy.vision.ocr.DBDetector; +import com.baidu.paddle.fastdeploy.vision.ocr.Recognizer; + +// 模型路径 +String detModelFile = "ch_PP-OCRv3_det_infer/inference.pdmodel"; +String detParamsFile = "ch_PP-OCRv3_det_infer/inference.pdiparams"; +String clsModelFile = "ch_ppocr_mobile_v2.0_cls_infer/inference.pdmodel"; +String clsParamsFile = "ch_ppocr_mobile_v2.0_cls_infer/inference.pdiparams"; +String recModelFile = "ch_PP-OCRv3_rec_infer/inference.pdmodel"; +String recParamsFile = "ch_PP-OCRv3_rec_infer/inference.pdiparams"; +String recLabelFilePath = "labels/ppocr_keys_v1.txt"; +// 设置RuntimeOption +RuntimeOption detOption = new RuntimeOption(); +RuntimeOption clsOption = new RuntimeOption(); +RuntimeOption recOption = new RuntimeOption(); +detOption.setCpuThreadNum(2); +clsOption.setCpuThreadNum(2); +recOption.setCpuThreadNum(2); +detOption.setLitePowerMode(LitePowerMode.LITE_POWER_HIGH); +clsOption.setLitePowerMode(LitePowerMode.LITE_POWER_HIGH); +recOption.setLitePowerMode(LitePowerMode.LITE_POWER_HIGH); +detOption.enableLiteFp16(); +clsOption.enableLiteFp16(); +recOption.enableLiteFp16(); +// 初始化模型 +DBDetector detModel = new DBDetector(detModelFile, detParamsFile, detOption); +Classifier clsModel = new Classifier(clsModelFile, clsParamsFile, clsOption); +Recognizer recModel = new Recognizer(recModelFile, recParamsFile, recLabelFilePath, recOption); +PPOCRv3 model = new PPOCRv3(detModel,clsModel,recModel); + +// 读取图片: 以下仅为读取Bitmap的伪代码 +ByteBuffer pixelBuffer = ByteBuffer.allocate(width * height * 4); +GLES20.glReadPixels(0, 0, width, height, GLES20.GL_RGBA, GLES20.GL_UNSIGNED_BYTE, pixelBuffer); +Bitmap ARGB8888ImageBitmap = Bitmap.createBitmap(width, height, Bitmap.Config.ARGB_8888); +ARGB8888ImageBitmap.copyPixelsFromBuffer(pixelBuffer); + +// 模型推理 +OCRResult result = model.predict(ARGB8888ImageBitmap); + +// 释放模型资源 +model.release(); +``` + +- 模型调用示例2: 在合适的程序节点,手动调用init +```java +// import 同上 ... 
+import com.baidu.paddle.fastdeploy.RuntimeOption; +import com.baidu.paddle.fastdeploy.LitePowerMode; +import com.baidu.paddle.fastdeploy.vision.OCRResult; +import com.baidu.paddle.fastdeploy.vision.ocr.Classifier; +import com.baidu.paddle.fastdeploy.vision.ocr.DBDetector; +import com.baidu.paddle.fastdeploy.vision.ocr.Recognizer; +// 新建空模型 +PPOCRv3 model = new PPOCRv3(); +// 模型路径 +String detModelFile = "ch_PP-OCRv3_det_infer/inference.pdmodel"; +String detParamsFile = "ch_PP-OCRv3_det_infer/inference.pdiparams"; +String clsModelFile = "ch_ppocr_mobile_v2.0_cls_infer/inference.pdmodel"; +String clsParamsFile = "ch_ppocr_mobile_v2.0_cls_infer/inference.pdiparams"; +String recModelFile = "ch_PP-OCRv3_rec_infer/inference.pdmodel"; +String recParamsFile = "ch_PP-OCRv3_rec_infer/inference.pdiparams"; +String recLabelFilePath = "labels/ppocr_keys_v1.txt"; +// 设置RuntimeOption +RuntimeOption detOption = new RuntimeOption(); +RuntimeOption clsOption = new RuntimeOption(); +RuntimeOption recOption = new RuntimeOption(); +detOption.setCpuThreadNum(2); +clsOption.setCpuThreadNum(2); +recOption.setCpuThreadNum(2); +detOption.setLitePowerMode(LitePowerMode.LITE_POWER_HIGH); +clsOption.setLitePowerMode(LitePowerMode.LITE_POWER_HIGH); +recOption.setLitePowerMode(LitePowerMode.LITE_POWER_HIGH); +detOption.enableLiteFp16(); +clsOption.enableLiteFp16(); +recOption.enableLiteFp16(); +// 使用init函数初始化 +DBDetector detModel = new DBDetector(detModelFile, detParamsFile, detOption); +Classifier clsModel = new Classifier(clsModelFile, clsParamsFile, clsOption); +Recognizer recModel = new Recognizer(recModelFile, recParamsFile, recLabelFilePath, recOption); +model.init(detModel, clsModel, recModel); +// Bitmap读取、模型预测、资源释放 同上 ... +``` +更详细的用法请参考 [OcrMainActivity](./app/src/main/java/com/baidu/paddle/fastdeploy/app/examples/ocr/OcrMainActivity.java)中的用法 + +## 替换 FastDeploy SDK和模型 +替换FastDeploy预测库和模型的步骤非常简单。预测库所在的位置为 `app/libs/fastdeploy-android-sdk-xxx.aar`,其中 `xxx` 表示当前您使用的预测库版本号。模型所在的位置为,`app/src/main/assets/models`。 +- 替换FastDeploy Android SDK: 下载或编译最新的FastDeploy Android SDK,解压缩后放在 `app/libs` 目录下;详细配置文档可参考: + - [在 Android 中使用 FastDeploy Java SDK](https://github.com/PaddlePaddle/FastDeploy/tree/develop/java/android) + +- 替换OCR模型的步骤: + - 将您的OCR模型放在 `app/src/main/assets/models` 目录下; + - 修改 `app/src/main/res/values/strings.xml` 中模型路径的默认值,如: +```xml + +models +labels/ppocr_keys_v1.txt +``` +## 使用量化模型 +如果您使用的是量化格式的模型,只需要使用RuntimeOption的enableLiteInt8()接口设置Int8精度推理即可。 +```java +String detModelFile = "ch_ppocrv3_plate_det_quant/inference.pdmodel"; +String detParamsFile = "ch_ppocrv3_plate_det_quant/inference.pdiparams"; +String recModelFile = "ch_ppocrv3_plate_rec_distillation_quant/inference.pdmodel"; +String recParamsFile = "ch_ppocrv3_plate_rec_distillation_quant/inference.pdiparams"; +String recLabelFilePath = "ppocr_keys_v1.txt"; // ppocr_keys_v1.txt +RuntimeOption detOption = new RuntimeOption(); +RuntimeOption recOption = new RuntimeOption(); +// 使用Int8精度进行推理 +detOption.enableLiteInt8(); +recOption.enableLiteInt8(); +// 初始化PP-OCRv3 Pipeline +PPOCRv3 predictor = new PPOCRv3(); +DBDetector detModel = new DBDetector(detModelFile, detParamsFile, detOption); +Recognizer recModel = new Recognizer(recModelFile, recParamsFile, recLabelFilePath, recOption); +predictor.init(detModel, recModel); +``` +在App中使用,可以参考 [OcrMainActivity.java](./app/src/main/java/com/baidu/paddle/fastdeploy/app/examples/ocr/OcrMainActivity.java) 中的用法。 + +## 更多参考文档 +如果您想知道更多的FastDeploy Java API文档以及如何通过JNI来接入FastDeploy C++ API感兴趣,可以参考以下内容: +- [在 Android 
中使用 FastDeploy Java SDK](https://github.com/PaddlePaddle/FastDeploy/tree/develop/java/android) +- [在 Android 中使用 FastDeploy C++ SDK](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/faq/use_cpp_sdk_on_android.md) +- 如果用户想要调整前后处理超参数、单独使用文字检测识别模型、使用其他模型等,更多详细文档与说明请参考[PP-OCR系列在CPU/GPU上的部署](../../cpu-gpu/python/README.md) diff --git a/deploy/fastdeploy/android/app/build.gradle b/deploy/fastdeploy/android/app/build.gradle new file mode 100644 index 0000000000000000000000000000000000000000..de19b87c0d56dbdaed591049313989b109c13123 --- /dev/null +++ b/deploy/fastdeploy/android/app/build.gradle @@ -0,0 +1,125 @@ +import java.security.MessageDigest + +apply plugin: 'com.android.application' + +android { + compileSdk 28 + + defaultConfig { + applicationId 'com.baidu.paddle.fastdeploy.app.examples' + minSdkVersion 15 + //noinspection ExpiredTargetSdkVersion + targetSdkVersion 28 + versionCode 1 + versionName "1.0" + testInstrumentationRunner "android.support.test.runner.AndroidJUnitRunner" + } + + buildTypes { + release { + minifyEnabled false + proguardFiles getDefaultProguardFile('proguard-android-optimize.txt'), 'proguard-rules.pro' + } + } + +} + +dependencies { + implementation fileTree(include: ['*.aar'], dir: 'libs') + implementation 'com.android.support:appcompat-v7:28.0.0' + //noinspection GradleDependency + implementation 'com.android.support.constraint:constraint-layout:1.1.3' + implementation 'com.android.support:design:28.0.0' + implementation 'org.jetbrains:annotations:15.0' + //noinspection GradleDependency + testImplementation 'junit:junit:4.12' + androidTestImplementation 'com.android.support.test:runner:1.0.2' + androidTestImplementation 'com.android.support.test.espresso:espresso-core:3.0.2' +} + +def FD_MODEL = [ + [ + 'src' : 'https://bj.bcebos.com/paddlehub/fastdeploy/ch_PP-OCRv3_det_infer.tgz', + 'dest': 'src/main/assets/models' + ], + [ + 'src' : 'https://bj.bcebos.com/paddlehub/fastdeploy/ch_ppocr_mobile_v2.0_cls_infer.tgz', + 'dest': 'src/main/assets/models' + ], + [ + 'src' : 'https://bj.bcebos.com/paddlehub/fastdeploy/ch_PP-OCRv3_rec_infer.tgz', + 'dest': 'src/main/assets/models' + ] +] + +def FD_JAVA_SDK = [ + [ + 'src' : 'https://bj.bcebos.com/fastdeploy/test/fastdeploy-android-sdk-latest-dev.aar', + 'dest': 'libs' + ] +] + +task downloadAndExtractModels(type: DefaultTask) { + doFirst { + println "Downloading and extracting fastdeploy models ..." + } + doLast { + String cachePath = "cache" + if (!file("${cachePath}").exists()) { + mkdir "${cachePath}" + } + FD_MODEL.eachWithIndex { model, index -> + MessageDigest messageDigest = MessageDigest.getInstance('MD5') + messageDigest.update(model.src.bytes) + String[] modelPaths = model.src.split("/") + String modelName = modelPaths[modelPaths.length - 1] + // Download the target model if not exists + boolean copyFiles = !file("${model.dest}").exists() + if (!file("${cachePath}/${modelName}").exists()) { + println "Downloading ${model.src} -> ${cachePath}/${modelName}" + ant.get(src: model.src, dest: file("${cachePath}/${modelName}")) + copyFiles = true + } + if (copyFiles) { + println "Coping ${cachePath}/${modelName} -> ${model.dest}" + copy { + from tarTree("${cachePath}/${modelName}") + into "${model.dest}" + } + } + } + } +} + +task downloadAndExtractSDKs(type: DefaultTask) { + doFirst { + println "Downloading and extracting fastdeploy android java sdk ..." 
+ } + doLast { + String cachePath = "cache" + if (!file("${cachePath}").exists()) { + mkdir "${cachePath}" + } + FD_JAVA_SDK.eachWithIndex { sdk, index -> + String[] sdkPaths = sdk.src.split("/") + String sdkName = sdkPaths[sdkPaths.length - 1] + // Download the target SDK if not exists + boolean copyFiles = !file("${sdk.dest}/${sdkName}").exists() + if (!file("${cachePath}/${sdkName}").exists()) { + println "Downloading ${sdk.src} -> ${cachePath}/${sdkName}" + ant.get(src: sdk.src, dest: file("${cachePath}/${sdkName}")) + copyFiles = true + } + if (copyFiles) { + println "Coping ${cachePath}/${sdkName} -> ${sdk.dest}/${sdkName}" + copy { + from "${cachePath}/${sdkName}" + into "${sdk.dest}" + } + } + } + } +} + +preBuild.dependsOn downloadAndExtractSDKs +preBuild.dependsOn downloadAndExtractModels \ No newline at end of file diff --git a/deploy/fastdeploy/android/app/proguard-rules.pro b/deploy/fastdeploy/android/app/proguard-rules.pro new file mode 100644 index 0000000000000000000000000000000000000000..481bb434814107eb79d7a30b676d344b0df2f8ce --- /dev/null +++ b/deploy/fastdeploy/android/app/proguard-rules.pro @@ -0,0 +1,21 @@ +# Add project specific ProGuard rules here. +# You can control the set of applied configuration files using the +# proguardFiles setting in build.gradle. +# +# For more details, see +# http://developer.android.com/guide/developing/tools/proguard.html + +# If your project uses WebView with JS, uncomment the following +# and specify the fully qualified class name to the JavaScript interface +# class: +#-keepclassmembers class fqcn.of.javascript.interface.for.webview { +# public *; +#} + +# Uncomment this to preserve the line number information for +# debugging stack traces. +#-keepattributes SourceFile,LineNumberTable + +# If you keep the line number information, uncomment this to +# hide the original source file name. +#-renamesourcefileattribute SourceFile \ No newline at end of file diff --git a/deploy/fastdeploy/android/app/src/androidTest/java/com/baidu/paddle/fastdeploy/ExampleInstrumentedTest.java b/deploy/fastdeploy/android/app/src/androidTest/java/com/baidu/paddle/fastdeploy/ExampleInstrumentedTest.java new file mode 100644 index 0000000000000000000000000000000000000000..0efacb79092bdca698c25c7369883ee7e77aa8cb --- /dev/null +++ b/deploy/fastdeploy/android/app/src/androidTest/java/com/baidu/paddle/fastdeploy/ExampleInstrumentedTest.java @@ -0,0 +1,26 @@ +package com.baidu.paddle.fastdeploy; + +import android.content.Context; + +import androidx.test.platform.app.InstrumentationRegistry; +import androidx.test.ext.junit.runners.AndroidJUnit4; + +import org.junit.Test; +import org.junit.runner.RunWith; + +import static org.junit.Assert.*; + +/** + * Instrumented test, which will execute on an Android device. + * + * @see Testing documentation + */ +@RunWith(AndroidJUnit4.class) +public class ExampleInstrumentedTest { + @Test + public void useAppContext() { + // Context of the app under test. 
+ Context appContext = InstrumentationRegistry.getInstrumentation().getTargetContext(); + assertEquals("com.baidu.paddle.fastdeploy", appContext.getPackageName()); + } +} \ No newline at end of file diff --git a/deploy/fastdeploy/android/app/src/main/AndroidManifest.xml b/deploy/fastdeploy/android/app/src/main/AndroidManifest.xml new file mode 100644 index 0000000000000000000000000000000000000000..8493c0379fc167dd4a21a6fe32375e17bdaffb10 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/AndroidManifest.xml @@ -0,0 +1,30 @@ + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/deploy/fastdeploy/android/app/src/main/assets/labels/ppocr_keys_v1.txt b/deploy/fastdeploy/android/app/src/main/assets/labels/ppocr_keys_v1.txt new file mode 100644 index 0000000000000000000000000000000000000000..b75af2130342e619dbb9f3f87dc8b74aa27b4a76 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/assets/labels/ppocr_keys_v1.txt @@ -0,0 +1,6623 @@ +' +疗 +绚 +诚 +娇 +溜 +题 +贿 +者 +廖 +更 +纳 +加 +奉 +公 +一 +就 +汴 +计 +与 +路 +房 +原 +妇 +2 +0 +8 +- +7 +其 +> +: +] +, +, +骑 +刈 +全 +消 +昏 +傈 +安 +久 +钟 +嗅 +不 +影 +处 +驽 +蜿 +资 +关 +椤 +地 +瘸 +专 +问 +忖 +票 +嫉 +炎 +韵 +要 +月 +田 +节 +陂 +鄙 +捌 +备 +拳 +伺 +眼 +网 +盎 +大 +傍 +心 +东 +愉 +汇 +蹿 +科 +每 +业 +里 +航 +晏 +字 +平 +录 +先 +1 +3 +彤 +鲶 +产 +稍 +督 +腴 +有 +象 +岳 +注 +绍 +在 +泺 +文 +定 +核 +名 +水 +过 +理 +让 +偷 +率 +等 +这 +发 +” +为 +含 +肥 +酉 +相 +鄱 +七 +编 +猥 +锛 +日 +镀 +蒂 +掰 +倒 +辆 +栾 +栗 +综 +涩 +州 +雌 +滑 +馀 +了 +机 +块 +司 +宰 +甙 +兴 +矽 +抚 +保 +用 +沧 +秩 +如 +收 +息 +滥 +页 +疑 +埠 +! +! +姥 +异 +橹 +钇 +向 +下 +跄 +的 +椴 +沫 +国 +绥 +獠 +报 +开 +民 +蜇 +何 +分 +凇 +长 +讥 +藏 +掏 +施 +羽 +中 +讲 +派 +嘟 +人 +提 +浼 +间 +世 +而 +古 +多 +倪 +唇 +饯 +控 +庚 +首 +赛 +蜓 +味 +断 +制 +觉 +技 +替 +艰 +溢 +潮 +夕 +钺 +外 +摘 +枋 +动 +双 +单 +啮 +户 +枇 +确 +锦 +曜 +杜 +或 +能 +效 +霜 +盒 +然 +侗 +电 +晁 +放 +步 +鹃 +新 +杖 +蜂 +吒 +濂 +瞬 +评 +总 +隍 +对 +独 +合 +也 +是 +府 +青 +天 +诲 +墙 +组 +滴 +级 +邀 +帘 +示 +已 +时 +骸 +仄 +泅 +和 +遨 +店 +雇 +疫 +持 +巍 +踮 +境 +只 +亨 +目 +鉴 +崤 +闲 +体 +泄 +杂 +作 +般 +轰 +化 +解 +迂 +诿 +蛭 +璀 +腾 +告 +版 +服 +省 +师 +小 +规 +程 +线 +海 +办 +引 +二 +桧 +牌 +砺 +洄 +裴 +修 +图 +痫 +胡 +许 +犊 +事 +郛 +基 +柴 +呼 +食 +研 +奶 +律 +蛋 +因 +葆 +察 +戏 +褒 +戒 +再 +李 +骁 +工 +貂 +油 +鹅 +章 +啄 +休 +场 +给 +睡 +纷 +豆 +器 +捎 +说 +敏 +学 +会 +浒 +设 +诊 +格 +廓 +查 +来 +霓 +室 +溆 +¢ +诡 +寥 +焕 +舜 +柒 +狐 +回 +戟 +砾 +厄 +实 +翩 +尿 +五 +入 +径 +惭 +喹 +股 +宇 +篝 +| +; +美 +期 +云 +九 +祺 +扮 +靠 +锝 +槌 +系 +企 +酰 +阊 +暂 +蚕 +忻 +豁 +本 +羹 +执 +条 +钦 +H +獒 +限 +进 +季 +楦 +于 +芘 +玖 +铋 +茯 +未 +答 +粘 +括 +样 +精 +欠 +矢 +甥 +帷 +嵩 +扣 +令 +仔 +风 +皈 +行 +支 +部 +蓉 +刮 +站 +蜡 +救 +钊 +汗 +松 +嫌 +成 +可 +. 
+鹤 +院 +从 +交 +政 +怕 +活 +调 +球 +局 +验 +髌 +第 +韫 +谗 +串 +到 +圆 +年 +米 +/ +* +友 +忿 +检 +区 +看 +自 +敢 +刃 +个 +兹 +弄 +流 +留 +同 +没 +齿 +星 +聆 +轼 +湖 +什 +三 +建 +蛔 +儿 +椋 +汕 +震 +颧 +鲤 +跟 +力 +情 +璺 +铨 +陪 +务 +指 +族 +训 +滦 +鄣 +濮 +扒 +商 +箱 +十 +召 +慷 +辗 +所 +莞 +管 +护 +臭 +横 +硒 +嗓 +接 +侦 +六 +露 +党 +馋 +驾 +剖 +高 +侬 +妪 +幂 +猗 +绺 +骐 +央 +酐 +孝 +筝 +课 +徇 +缰 +门 +男 +西 +项 +句 +谙 +瞒 +秃 +篇 +教 +碲 +罚 +声 +呐 +景 +前 +富 +嘴 +鳌 +稀 +免 +朋 +啬 +睐 +去 +赈 +鱼 +住 +肩 +愕 +速 +旁 +波 +厅 +健 +茼 +厥 +鲟 +谅 +投 +攸 +炔 +数 +方 +击 +呋 +谈 +绩 +别 +愫 +僚 +躬 +鹧 +胪 +炳 +招 +喇 +膨 +泵 +蹦 +毛 +结 +5 +4 +谱 +识 +陕 +粽 +婚 +拟 +构 +且 +搜 +任 +潘 +比 +郢 +妨 +醪 +陀 +桔 +碘 +扎 +选 +哈 +骷 +楷 +亿 +明 +缆 +脯 +监 +睫 +逻 +婵 +共 +赴 +淝 +凡 +惦 +及 +达 +揖 +谩 +澹 +减 +焰 +蛹 +番 +祁 +柏 +员 +禄 +怡 +峤 +龙 +白 +叽 +生 +闯 +起 +细 +装 +谕 +竟 +聚 +钙 +上 +导 +渊 +按 +艾 +辘 +挡 +耒 +盹 +饪 +臀 +记 +邮 +蕙 +受 +各 +医 +搂 +普 +滇 +朗 +茸 +带 +翻 +酚 +( +光 +堤 +墟 +蔷 +万 +幻 +〓 +瑙 +辈 +昧 +盏 +亘 +蛀 +吉 +铰 +请 +子 +假 +闻 +税 +井 +诩 +哨 +嫂 +好 +面 +琐 +校 +馊 +鬣 +缂 +营 +访 +炖 +占 +农 +缀 +否 +经 +钚 +棵 +趟 +张 +亟 +吏 +茶 +谨 +捻 +论 +迸 +堂 +玉 +信 +吧 +瞠 +乡 +姬 +寺 +咬 +溏 +苄 +皿 +意 +赉 +宝 +尔 +钰 +艺 +特 +唳 +踉 +都 +荣 +倚 +登 +荐 +丧 +奇 +涵 +批 +炭 +近 +符 +傩 +感 +道 +着 +菊 +虹 +仲 +众 +懈 +濯 +颞 +眺 +南 +释 +北 +缝 +标 +既 +茗 +整 +撼 +迤 +贲 +挎 +耱 +拒 +某 +妍 +卫 +哇 +英 +矶 +藩 +治 +他 +元 +领 +膜 +遮 +穗 +蛾 +飞 +荒 +棺 +劫 +么 +市 +火 +温 +拈 +棚 +洼 +转 +果 +奕 +卸 +迪 +伸 +泳 +斗 +邡 +侄 +涨 +屯 +萋 +胭 +氡 +崮 +枞 +惧 +冒 +彩 +斜 +手 +豚 +随 +旭 +淑 +妞 +形 +菌 +吲 +沱 +争 +驯 +歹 +挟 +兆 +柱 +传 +至 +包 +内 +响 +临 +红 +功 +弩 +衡 +寂 +禁 +老 +棍 +耆 +渍 +织 +害 +氵 +渑 +布 +载 +靥 +嗬 +虽 +苹 +咨 +娄 +库 +雉 +榜 +帜 +嘲 +套 +瑚 +亲 +簸 +欧 +边 +6 +腿 +旮 +抛 +吹 +瞳 +得 +镓 +梗 +厨 +继 +漾 +愣 +憨 +士 +策 +窑 +抑 +躯 +襟 +脏 +参 +贸 +言 +干 +绸 +鳄 +穷 +藜 +音 +折 +详 +) +举 +悍 +甸 +癌 +黎 +谴 +死 +罩 +迁 +寒 +驷 +袖 +媒 +蒋 +掘 +模 +纠 +恣 +观 +祖 +蛆 +碍 +位 +稿 +主 +澧 +跌 +筏 +京 +锏 +帝 +贴 +证 +糠 +才 +黄 +鲸 +略 +炯 +饱 +四 +出 +园 +犀 +牧 +容 +汉 +杆 +浈 +汰 +瑷 +造 +虫 +瘩 +怪 +驴 +济 +应 +花 +沣 +谔 +夙 +旅 +价 +矿 +以 +考 +s +u +呦 +晒 +巡 +茅 +准 +肟 +瓴 +詹 +仟 +褂 +译 +桌 +混 +宁 +怦 +郑 +抿 +些 +余 +鄂 +饴 +攒 +珑 +群 +阖 +岔 +琨 +藓 +预 +环 +洮 +岌 +宀 +杲 +瀵 +最 +常 +囡 +周 +踊 +女 +鼓 +袭 +喉 +简 +范 +薯 +遐 +疏 +粱 +黜 +禧 +法 +箔 +斤 +遥 +汝 +奥 +直 +贞 +撑 +置 +绱 +集 +她 +馅 +逗 +钧 +橱 +魉 +[ +恙 +躁 +唤 +9 +旺 +膘 +待 +脾 +惫 +购 +吗 +依 +盲 +度 +瘿 +蠖 +俾 +之 +镗 +拇 +鲵 +厝 +簧 +续 +款 +展 +啃 +表 +剔 +品 +钻 +腭 +损 +清 +锶 +统 +涌 +寸 +滨 +贪 +链 +吠 +冈 +伎 +迥 +咏 +吁 +览 +防 +迅 +失 +汾 +阔 +逵 +绀 +蔑 +列 +川 +凭 +努 +熨 +揪 +利 +俱 +绉 +抢 +鸨 +我 +即 +责 +膦 +易 +毓 +鹊 +刹 +玷 +岿 +空 +嘞 +绊 +排 +术 +估 +锷 +违 +们 +苟 +铜 +播 +肘 +件 +烫 +审 +鲂 +广 +像 +铌 +惰 +铟 +巳 +胍 +鲍 +康 +憧 +色 +恢 +想 +拷 +尤 +疳 +知 +S +Y +F +D +A +峄 +裕 +帮 +握 +搔 +氐 +氘 +难 +墒 +沮 +雨 +叁 +缥 +悴 +藐 +湫 +娟 +苑 +稠 +颛 +簇 +后 +阕 +闭 +蕤 +缚 +怎 +佞 +码 +嘤 +蔡 +痊 +舱 +螯 +帕 +赫 +昵 +升 +烬 +岫 +、 +疵 +蜻 +髁 +蕨 +隶 +烛 +械 +丑 +盂 +梁 +强 +鲛 +由 +拘 +揉 +劭 +龟 +撤 +钩 +呕 +孛 +费 +妻 +漂 +求 +阑 +崖 +秤 +甘 +通 +深 +补 +赃 +坎 +床 +啪 +承 +吼 +量 +暇 +钼 +烨 +阂 +擎 +脱 +逮 +称 +P +神 +属 +矗 +华 +届 +狍 +葑 +汹 +育 +患 +窒 +蛰 +佼 +静 +槎 +运 +鳗 +庆 +逝 +曼 +疱 +克 +代 +官 +此 +麸 +耧 +蚌 +晟 +例 +础 +榛 +副 +测 +唰 +缢 +迹 +灬 +霁 +身 +岁 +赭 +扛 +又 +菡 +乜 +雾 +板 +读 +陷 +徉 +贯 +郁 +虑 +变 +钓 +菜 +圾 +现 +琢 +式 +乐 +维 +渔 +浜 +左 +吾 +脑 +钡 +警 +T +啵 +拴 +偌 +漱 +湿 +硕 +止 +骼 +魄 +积 +燥 +联 +踢 +玛 +则 +窿 +见 +振 +畿 +送 +班 +钽 +您 +赵 +刨 +印 +讨 +踝 +籍 +谡 +舌 +崧 +汽 +蔽 +沪 +酥 +绒 +怖 +财 +帖 +肱 +私 +莎 +勋 +羔 +霸 +励 +哼 +帐 +将 +帅 +渠 +纪 +婴 +娩 +岭 +厘 +滕 +吻 +伤 +坝 +冠 +戊 +隆 +瘁 +介 +涧 +物 +黍 +并 +姗 +奢 +蹑 +掣 +垸 +锴 +命 +箍 +捉 +病 +辖 +琰 +眭 +迩 +艘 +绌 +繁 +寅 +若 +毋 +思 +诉 +类 +诈 +燮 +轲 +酮 +狂 +重 +反 +职 +筱 +县 +委 +磕 +绣 +奖 +晋 +濉 +志 +徽 +肠 +呈 +獐 +坻 +口 +片 +碰 +几 +村 +柿 +劳 +料 +获 +亩 +惕 +晕 +厌 +号 +罢 +池 +正 +鏖 +煨 +家 +棕 +复 +尝 +懋 +蜥 +锅 +岛 +扰 +队 +坠 +瘾 +钬 +@ +卧 +疣 +镇 +譬 +冰 +彷 +频 +黯 +据 +垄 +采 +八 +缪 +瘫 +型 +熹 +砰 +楠 +襁 +箐 +但 +嘶 +绳 +啤 +拍 +盥 +穆 +傲 +洗 +盯 +塘 +怔 +筛 +丿 +台 +恒 +喂 +葛 +永 +¥ +烟 +酒 +桦 +书 +砂 +蚝 +缉 +态 +瀚 +袄 +圳 +轻 +蛛 +超 +榧 +遛 +姒 +奘 +铮 +右 +荽 +望 +偻 +卡 +丶 +氰 +附 +做 +革 +索 +戚 +坨 +桷 +唁 +垅 +榻 +岐 +偎 +坛 +莨 +山 +殊 +微 +骇 +陈 +爨 +推 +嗝 +驹 +澡 +藁 +呤 +卤 +嘻 +糅 +逛 +侵 +郓 +酌 +德 +摇 +※ +鬃 +被 +慨 +殡 +羸 +昌 +泡 +戛 +鞋 +河 +宪 +沿 +玲 +鲨 +翅 +哽 +源 +铅 +语 +照 
+邯 +址 +荃 +佬 +顺 +鸳 +町 +霭 +睾 +瓢 +夸 +椁 +晓 +酿 +痈 +咔 +侏 +券 +噎 +湍 +签 +嚷 +离 +午 +尚 +社 +锤 +背 +孟 +使 +浪 +缦 +潍 +鞅 +军 +姹 +驶 +笑 +鳟 +鲁 +》 +孽 +钜 +绿 +洱 +礴 +焯 +椰 +颖 +囔 +乌 +孔 +巴 +互 +性 +椽 +哞 +聘 +昨 +早 +暮 +胶 +炀 +隧 +低 +彗 +昝 +铁 +呓 +氽 +藉 +喔 +癖 +瑗 +姨 +权 +胱 +韦 +堑 +蜜 +酋 +楝 +砝 +毁 +靓 +歙 +锲 +究 +屋 +喳 +骨 +辨 +碑 +武 +鸠 +宫 +辜 +烊 +适 +坡 +殃 +培 +佩 +供 +走 +蜈 +迟 +翼 +况 +姣 +凛 +浔 +吃 +飘 +债 +犟 +金 +促 +苛 +崇 +坂 +莳 +畔 +绂 +兵 +蠕 +斋 +根 +砍 +亢 +欢 +恬 +崔 +剁 +餐 +榫 +快 +扶 +‖ +濒 +缠 +鳜 +当 +彭 +驭 +浦 +篮 +昀 +锆 +秸 +钳 +弋 +娣 +瞑 +夷 +龛 +苫 +拱 +致 +% +嵊 +障 +隐 +弑 +初 +娓 +抉 +汩 +累 +蓖 +" +唬 +助 +苓 +昙 +押 +毙 +破 +城 +郧 +逢 +嚏 +獭 +瞻 +溱 +婿 +赊 +跨 +恼 +璧 +萃 +姻 +貉 +灵 +炉 +密 +氛 +陶 +砸 +谬 +衔 +点 +琛 +沛 +枳 +层 +岱 +诺 +脍 +榈 +埂 +征 +冷 +裁 +打 +蹴 +素 +瘘 +逞 +蛐 +聊 +激 +腱 +萘 +踵 +飒 +蓟 +吆 +取 +咙 +簋 +涓 +矩 +曝 +挺 +揣 +座 +你 +史 +舵 +焱 +尘 +苏 +笈 +脚 +溉 +榨 +诵 +樊 +邓 +焊 +义 +庶 +儋 +蟋 +蒲 +赦 +呷 +杞 +诠 +豪 +还 +试 +颓 +茉 +太 +除 +紫 +逃 +痴 +草 +充 +鳕 +珉 +祗 +墨 +渭 +烩 +蘸 +慕 +璇 +镶 +穴 +嵘 +恶 +骂 +险 +绋 +幕 +碉 +肺 +戳 +刘 +潞 +秣 +纾 +潜 +銮 +洛 +须 +罘 +销 +瘪 +汞 +兮 +屉 +r +林 +厕 +质 +探 +划 +狸 +殚 +善 +煊 +烹 +〒 +锈 +逯 +宸 +辍 +泱 +柚 +袍 +远 +蹋 +嶙 +绝 +峥 +娥 +缍 +雀 +徵 +认 +镱 +谷 += +贩 +勉 +撩 +鄯 +斐 +洋 +非 +祚 +泾 +诒 +饿 +撬 +威 +晷 +搭 +芍 +锥 +笺 +蓦 +候 +琊 +档 +礁 +沼 +卵 +荠 +忑 +朝 +凹 +瑞 +头 +仪 +弧 +孵 +畏 +铆 +突 +衲 +车 +浩 +气 +茂 +悖 +厢 +枕 +酝 +戴 +湾 +邹 +飚 +攘 +锂 +写 +宵 +翁 +岷 +无 +喜 +丈 +挑 +嗟 +绛 +殉 +议 +槽 +具 +醇 +淞 +笃 +郴 +阅 +饼 +底 +壕 +砚 +弈 +询 +缕 +庹 +翟 +零 +筷 +暨 +舟 +闺 +甯 +撞 +麂 +茌 +蔼 +很 +珲 +捕 +棠 +角 +阉 +媛 +娲 +诽 +剿 +尉 +爵 +睬 +韩 +诰 +匣 +危 +糍 +镯 +立 +浏 +阳 +少 +盆 +舔 +擘 +匪 +申 +尬 +铣 +旯 +抖 +赘 +瓯 +居 +ˇ +哮 +游 +锭 +茏 +歌 +坏 +甚 +秒 +舞 +沙 +仗 +劲 +潺 +阿 +燧 +郭 +嗖 +霏 +忠 +材 +奂 +耐 +跺 +砀 +输 +岖 +媳 +氟 +极 +摆 +灿 +今 +扔 +腻 +枝 +奎 +药 +熄 +吨 +话 +q +额 +慑 +嘌 +协 +喀 +壳 +埭 +视 +著 +於 +愧 +陲 +翌 +峁 +颅 +佛 +腹 +聋 +侯 +咎 +叟 +秀 +颇 +存 +较 +罪 +哄 +岗 +扫 +栏 +钾 +羌 +己 +璨 +枭 +霉 +煌 +涸 +衿 +键 +镝 +益 +岢 +奏 +连 +夯 +睿 +冥 +均 +糖 +狞 +蹊 +稻 +爸 +刿 +胥 +煜 +丽 +肿 +璃 +掸 +跚 +灾 +垂 +樾 +濑 +乎 +莲 +窄 +犹 +撮 +战 +馄 +软 +络 +显 +鸢 +胸 +宾 +妲 +恕 +埔 +蝌 +份 +遇 +巧 +瞟 +粒 +恰 +剥 +桡 +博 +讯 +凯 +堇 +阶 +滤 +卖 +斌 +骚 +彬 +兑 +磺 +樱 +舷 +两 +娱 +福 +仃 +差 +找 +桁 +÷ +净 +把 +阴 +污 +戬 +雷 +碓 +蕲 +楚 +罡 +焖 +抽 +妫 +咒 +仑 +闱 +尽 +邑 +菁 +爱 +贷 +沥 +鞑 +牡 +嗉 +崴 +骤 +塌 +嗦 +订 +拮 +滓 +捡 +锻 +次 +坪 +杩 +臃 +箬 +融 +珂 +鹗 +宗 +枚 +降 +鸬 +妯 +阄 +堰 +盐 +毅 +必 +杨 +崃 +俺 +甬 +状 +莘 +货 +耸 +菱 +腼 +铸 +唏 +痤 +孚 +澳 +懒 +溅 +翘 +疙 +杷 +淼 +缙 +骰 +喊 +悉 +砻 +坷 +艇 +赁 +界 +谤 +纣 +宴 +晃 +茹 +归 +饭 +梢 +铡 +街 +抄 +肼 +鬟 +苯 +颂 +撷 +戈 +炒 +咆 +茭 +瘙 +负 +仰 +客 +琉 +铢 +封 +卑 +珥 +椿 +镧 +窨 +鬲 +寿 +御 +袤 +铃 +萎 +砖 +餮 +脒 +裳 +肪 +孕 +嫣 +馗 +嵇 +恳 +氯 +江 +石 +褶 +冢 +祸 +阻 +狈 +羞 +银 +靳 +透 +咳 +叼 +敷 +芷 +啥 +它 +瓤 +兰 +痘 +懊 +逑 +肌 +往 +捺 +坊 +甩 +呻 +〃 +沦 +忘 +膻 +祟 +菅 +剧 +崆 +智 +坯 +臧 +霍 +墅 +攻 +眯 +倘 +拢 +骠 +铐 +庭 +岙 +瓠 +′ +缺 +泥 +迢 +捶 +? +? 
+郏 +喙 +掷 +沌 +纯 +秘 +种 +听 +绘 +固 +螨 +团 +香 +盗 +妒 +埚 +蓝 +拖 +旱 +荞 +铀 +血 +遏 +汲 +辰 +叩 +拽 +幅 +硬 +惶 +桀 +漠 +措 +泼 +唑 +齐 +肾 +念 +酱 +虚 +屁 +耶 +旗 +砦 +闵 +婉 +馆 +拭 +绅 +韧 +忏 +窝 +醋 +葺 +顾 +辞 +倜 +堆 +辋 +逆 +玟 +贱 +疾 +董 +惘 +倌 +锕 +淘 +嘀 +莽 +俭 +笏 +绑 +鲷 +杈 +择 +蟀 +粥 +嗯 +驰 +逾 +案 +谪 +褓 +胫 +哩 +昕 +颚 +鲢 +绠 +躺 +鹄 +崂 +儒 +俨 +丝 +尕 +泌 +啊 +萸 +彰 +幺 +吟 +骄 +苣 +弦 +脊 +瑰 +〈 +诛 +镁 +析 +闪 +剪 +侧 +哟 +框 +螃 +守 +嬗 +燕 +狭 +铈 +缮 +概 +迳 +痧 +鲲 +俯 +售 +笼 +痣 +扉 +挖 +满 +咋 +援 +邱 +扇 +歪 +便 +玑 +绦 +峡 +蛇 +叨 +〖 +泽 +胃 +斓 +喋 +怂 +坟 +猪 +该 +蚬 +炕 +弥 +赞 +棣 +晔 +娠 +挲 +狡 +创 +疖 +铕 +镭 +稷 +挫 +弭 +啾 +翔 +粉 +履 +苘 +哦 +楼 +秕 +铂 +土 +锣 +瘟 +挣 +栉 +习 +享 +桢 +袅 +磨 +桂 +谦 +延 +坚 +蔚 +噗 +署 +谟 +猬 +钎 +恐 +嬉 +雒 +倦 +衅 +亏 +璩 +睹 +刻 +殿 +王 +算 +雕 +麻 +丘 +柯 +骆 +丸 +塍 +谚 +添 +鲈 +垓 +桎 +蚯 +芥 +予 +飕 +镦 +谌 +窗 +醚 +菀 +亮 +搪 +莺 +蒿 +羁 +足 +J +真 +轶 +悬 +衷 +靛 +翊 +掩 +哒 +炅 +掐 +冼 +妮 +l +谐 +稚 +荆 +擒 +犯 +陵 +虏 +浓 +崽 +刍 +陌 +傻 +孜 +千 +靖 +演 +矜 +钕 +煽 +杰 +酗 +渗 +伞 +栋 +俗 +泫 +戍 +罕 +沾 +疽 +灏 +煦 +芬 +磴 +叱 +阱 +榉 +湃 +蜀 +叉 +醒 +彪 +租 +郡 +篷 +屎 +良 +垢 +隗 +弱 +陨 +峪 +砷 +掴 +颁 +胎 +雯 +绵 +贬 +沐 +撵 +隘 +篙 +暖 +曹 +陡 +栓 +填 +臼 +彦 +瓶 +琪 +潼 +哪 +鸡 +摩 +啦 +俟 +锋 +域 +耻 +蔫 +疯 +纹 +撇 +毒 +绶 +痛 +酯 +忍 +爪 +赳 +歆 +嘹 +辕 +烈 +册 +朴 +钱 +吮 +毯 +癜 +娃 +谀 +邵 +厮 +炽 +璞 +邃 +丐 +追 +词 +瓒 +忆 +轧 +芫 +谯 +喷 +弟 +半 +冕 +裙 +掖 +墉 +绮 +寝 +苔 +势 +顷 +褥 +切 +衮 +君 +佳 +嫒 +蚩 +霞 +佚 +洙 +逊 +镖 +暹 +唛 +& +殒 +顶 +碗 +獗 +轭 +铺 +蛊 +废 +恹 +汨 +崩 +珍 +那 +杵 +曲 +纺 +夏 +薰 +傀 +闳 +淬 +姘 +舀 +拧 +卷 +楂 +恍 +讪 +厩 +寮 +篪 +赓 +乘 +灭 +盅 +鞣 +沟 +慎 +挂 +饺 +鼾 +杳 +树 +缨 +丛 +絮 +娌 +臻 +嗳 +篡 +侩 +述 +衰 +矛 +圈 +蚜 +匕 +筹 +匿 +濞 +晨 +叶 +骋 +郝 +挚 +蚴 +滞 +增 +侍 +描 +瓣 +吖 +嫦 +蟒 +匾 +圣 +赌 +毡 +癞 +恺 +百 +曳 +需 +篓 +肮 +庖 +帏 +卿 +驿 +遗 +蹬 +鬓 +骡 +歉 +芎 +胳 +屐 +禽 +烦 +晌 +寄 +媾 +狄 +翡 +苒 +船 +廉 +终 +痞 +殇 +々 +畦 +饶 +改 +拆 +悻 +萄 +£ +瓿 +乃 +訾 +桅 +匮 +溧 +拥 +纱 +铍 +骗 +蕃 +龋 +缬 +父 +佐 +疚 +栎 +醍 +掳 +蓄 +x +惆 +颜 +鲆 +榆 +〔 +猎 +敌 +暴 +谥 +鲫 +贾 +罗 +玻 +缄 +扦 +芪 +癣 +落 +徒 +臾 +恿 +猩 +托 +邴 +肄 +牵 +春 +陛 +耀 +刊 +拓 +蓓 +邳 +堕 +寇 +枉 +淌 +啡 +湄 +兽 +酷 +萼 +碚 +濠 +萤 +夹 +旬 +戮 +梭 +琥 +椭 +昔 +勺 +蜊 +绐 +晚 +孺 +僵 +宣 +摄 +冽 +旨 +萌 +忙 +蚤 +眉 +噼 +蟑 +付 +契 +瓜 +悼 +颡 +壁 +曾 +窕 +颢 +澎 +仿 +俑 +浑 +嵌 +浣 +乍 +碌 +褪 +乱 +蔟 +隙 +玩 +剐 +葫 +箫 +纲 +围 +伐 +决 +伙 +漩 +瑟 +刑 +肓 +镳 +缓 +蹭 +氨 +皓 +典 +畲 +坍 +铑 +檐 +塑 +洞 +倬 +储 +胴 +淳 +戾 +吐 +灼 +惺 +妙 +毕 +珐 +缈 +虱 +盖 +羰 +鸿 +磅 +谓 +髅 +娴 +苴 +唷 +蚣 +霹 +抨 +贤 +唠 +犬 +誓 +逍 +庠 +逼 +麓 +籼 +釉 +呜 +碧 +秧 +氩 +摔 +霄 +穸 +纨 +辟 +妈 +映 +完 +牛 +缴 +嗷 +炊 +恩 +荔 +茆 +掉 +紊 +慌 +莓 +羟 +阙 +萁 +磐 +另 +蕹 +辱 +鳐 +湮 +吡 +吩 +唐 +睦 +垠 +舒 +圜 +冗 +瞿 +溺 +芾 +囱 +匠 +僳 +汐 +菩 +饬 +漓 +黑 +霰 +浸 +濡 +窥 +毂 +蒡 +兢 +驻 +鹉 +芮 +诙 +迫 +雳 +厂 +忐 +臆 +猴 +鸣 +蚪 +栈 +箕 +羡 +渐 +莆 +捍 +眈 +哓 +趴 +蹼 +埕 +嚣 +骛 +宏 +淄 +斑 +噜 +严 +瑛 +垃 +椎 +诱 +压 +庾 +绞 +焘 +廿 +抡 +迄 +棘 +夫 +纬 +锹 +眨 +瞌 +侠 +脐 +竞 +瀑 +孳 +骧 +遁 +姜 +颦 +荪 +滚 +萦 +伪 +逸 +粳 +爬 +锁 +矣 +役 +趣 +洒 +颔 +诏 +逐 +奸 +甭 +惠 +攀 +蹄 +泛 +尼 +拼 +阮 +鹰 +亚 +颈 +惑 +勒 +〉 +际 +肛 +爷 +刚 +钨 +丰 +养 +冶 +鲽 +辉 +蔻 +画 +覆 +皴 +妊 +麦 +返 +醉 +皂 +擀 +〗 +酶 +凑 +粹 +悟 +诀 +硖 +港 +卜 +z +杀 +涕 +± +舍 +铠 +抵 +弛 +段 +敝 +镐 +奠 +拂 +轴 +跛 +袱 +e +t +沉 +菇 +俎 +薪 +峦 +秭 +蟹 +历 +盟 +菠 +寡 +液 +肢 +喻 +染 +裱 +悱 +抱 +氙 +赤 +捅 +猛 +跑 +氮 +谣 +仁 +尺 +辊 +窍 +烙 +衍 +架 +擦 +倏 +璐 +瑁 +币 +楞 +胖 +夔 +趸 +邛 +惴 +饕 +虔 +蝎 +§ +哉 +贝 +宽 +辫 +炮 +扩 +饲 +籽 +魏 +菟 +锰 +伍 +猝 +末 +琳 +哚 +蛎 +邂 +呀 +姿 +鄞 +却 +歧 +仙 +恸 +椐 +森 +牒 +寤 +袒 +婆 +虢 +雅 +钉 +朵 +贼 +欲 +苞 +寰 +故 +龚 +坭 +嘘 +咫 +礼 +硷 +兀 +睢 +汶 +’ +铲 +烧 +绕 +诃 +浃 +钿 +哺 +柜 +讼 +颊 +璁 +腔 +洽 +咐 +脲 +簌 +筠 +镣 +玮 +鞠 +谁 +兼 +姆 +挥 +梯 +蝴 +谘 +漕 +刷 +躏 +宦 +弼 +b +垌 +劈 +麟 +莉 +揭 +笙 +渎 +仕 +嗤 +仓 +配 +怏 +抬 +错 +泯 +镊 +孰 +猿 +邪 +仍 +秋 +鼬 +壹 +歇 +吵 +炼 +< +尧 +射 +柬 +廷 +胧 +霾 +凳 +隋 +肚 +浮 +梦 +祥 +株 +堵 +退 +L +鹫 +跎 +凶 +毽 +荟 +炫 +栩 +玳 +甜 +沂 +鹿 +顽 +伯 +爹 +赔 +蛴 +徐 +匡 +欣 +狰 +缸 +雹 +蟆 +疤 +默 +沤 +啜 +痂 +衣 +禅 +w +i +h +辽 +葳 +黝 +钗 +停 +沽 +棒 +馨 +颌 +肉 +吴 +硫 +悯 +劾 +娈 +马 +啧 +吊 +悌 +镑 +峭 +帆 +瀣 +涉 +咸 +疸 +滋 +泣 +翦 +拙 +癸 +钥 +蜒 ++ +尾 +庄 +凝 +泉 +婢 +渴 +谊 +乞 +陆 +锉 +糊 +鸦 +淮 +I +B +N +晦 +弗 +乔 +庥 +葡 +尻 +席 +橡 +傣 +渣 +拿 +惩 +麋 +斛 +缃 +矮 +蛏 +岘 +鸽 +姐 +膏 +催 +奔 +镒 +喱 +蠡 +摧 +钯 +胤 +柠 +拐 +璋 +鸥 +卢 +荡 +倾 +^ +_ +珀 +逄 +萧 +塾 +掇 +贮 +笆 +聂 +圃 +冲 +嵬 +M +滔 +笕 +值 
+炙 +偶 +蜱 +搐 +梆 +汪 +蔬 +腑 +鸯 +蹇 +敞 +绯 +仨 +祯 +谆 +梧 +糗 +鑫 +啸 +豺 +囹 +猾 +巢 +柄 +瀛 +筑 +踌 +沭 +暗 +苁 +鱿 +蹉 +脂 +蘖 +牢 +热 +木 +吸 +溃 +宠 +序 +泞 +偿 +拜 +檩 +厚 +朐 +毗 +螳 +吞 +媚 +朽 +担 +蝗 +橘 +畴 +祈 +糟 +盱 +隼 +郜 +惜 +珠 +裨 +铵 +焙 +琚 +唯 +咚 +噪 +骊 +丫 +滢 +勤 +棉 +呸 +咣 +淀 +隔 +蕾 +窈 +饨 +挨 +煅 +短 +匙 +粕 +镜 +赣 +撕 +墩 +酬 +馁 +豌 +颐 +抗 +酣 +氓 +佑 +搁 +哭 +递 +耷 +涡 +桃 +贻 +碣 +截 +瘦 +昭 +镌 +蔓 +氚 +甲 +猕 +蕴 +蓬 +散 +拾 +纛 +狼 +猷 +铎 +埋 +旖 +矾 +讳 +囊 +糜 +迈 +粟 +蚂 +紧 +鲳 +瘢 +栽 +稼 +羊 +锄 +斟 +睁 +桥 +瓮 +蹙 +祉 +醺 +鼻 +昱 +剃 +跳 +篱 +跷 +蒜 +翎 +宅 +晖 +嗑 +壑 +峻 +癫 +屏 +狠 +陋 +袜 +途 +憎 +祀 +莹 +滟 +佶 +溥 +臣 +约 +盛 +峰 +磁 +慵 +婪 +拦 +莅 +朕 +鹦 +粲 +裤 +哎 +疡 +嫖 +琵 +窟 +堪 +谛 +嘉 +儡 +鳝 +斩 +郾 +驸 +酊 +妄 +胜 +贺 +徙 +傅 +噌 +钢 +栅 +庇 +恋 +匝 +巯 +邈 +尸 +锚 +粗 +佟 +蛟 +薹 +纵 +蚊 +郅 +绢 +锐 +苗 +俞 +篆 +淆 +膀 +鲜 +煎 +诶 +秽 +寻 +涮 +刺 +怀 +噶 +巨 +褰 +魅 +灶 +灌 +桉 +藕 +谜 +舸 +薄 +搀 +恽 +借 +牯 +痉 +渥 +愿 +亓 +耘 +杠 +柩 +锔 +蚶 +钣 +珈 +喘 +蹒 +幽 +赐 +稗 +晤 +莱 +泔 +扯 +肯 +菪 +裆 +腩 +豉 +疆 +骜 +腐 +倭 +珏 +唔 +粮 +亡 +润 +慰 +伽 +橄 +玄 +誉 +醐 +胆 +龊 +粼 +塬 +陇 +彼 +削 +嗣 +绾 +芽 +妗 +垭 +瘴 +爽 +薏 +寨 +龈 +泠 +弹 +赢 +漪 +猫 +嘧 +涂 +恤 +圭 +茧 +烽 +屑 +痕 +巾 +赖 +荸 +凰 +腮 +畈 +亵 +蹲 +偃 +苇 +澜 +艮 +换 +骺 +烘 +苕 +梓 +颉 +肇 +哗 +悄 +氤 +涠 +葬 +屠 +鹭 +植 +竺 +佯 +诣 +鲇 +瘀 +鲅 +邦 +移 +滁 +冯 +耕 +癔 +戌 +茬 +沁 +巩 +悠 +湘 +洪 +痹 +锟 +循 +谋 +腕 +鳃 +钠 +捞 +焉 +迎 +碱 +伫 +急 +榷 +奈 +邝 +卯 +辄 +皲 +卟 +醛 +畹 +忧 +稳 +雄 +昼 +缩 +阈 +睑 +扌 +耗 +曦 +涅 +捏 +瞧 +邕 +淖 +漉 +铝 +耦 +禹 +湛 +喽 +莼 +琅 +诸 +苎 +纂 +硅 +始 +嗨 +傥 +燃 +臂 +赅 +嘈 +呆 +贵 +屹 +壮 +肋 +亍 +蚀 +卅 +豹 +腆 +邬 +迭 +浊 +} +童 +螂 +捐 +圩 +勐 +触 +寞 +汊 +壤 +荫 +膺 +渌 +芳 +懿 +遴 +螈 +泰 +蓼 +蛤 +茜 +舅 +枫 +朔 +膝 +眙 +避 +梅 +判 +鹜 +璜 +牍 +缅 +垫 +藻 +黔 +侥 +惚 +懂 +踩 +腰 +腈 +札 +丞 +唾 +慈 +顿 +摹 +荻 +琬 +~ +斧 +沈 +滂 +胁 +胀 +幄 +莜 +Z +匀 +鄄 +掌 +绰 +茎 +焚 +赋 +萱 +谑 +汁 +铒 +瞎 +夺 +蜗 +野 +娆 +冀 +弯 +篁 +懵 +灞 +隽 +芡 +脘 +俐 +辩 +芯 +掺 +喏 +膈 +蝈 +觐 +悚 +踹 +蔗 +熠 +鼠 +呵 +抓 +橼 +峨 +畜 +缔 +禾 +崭 +弃 +熊 +摒 +凸 +拗 +穹 +蒙 +抒 +祛 +劝 +闫 +扳 +阵 +醌 +踪 +喵 +侣 +搬 +仅 +荧 +赎 +蝾 +琦 +买 +婧 +瞄 +寓 +皎 +冻 +赝 +箩 +莫 +瞰 +郊 +笫 +姝 +筒 +枪 +遣 +煸 +袋 +舆 +痱 +涛 +母 +〇 +启 +践 +耙 +绲 +盘 +遂 +昊 +搞 +槿 +诬 +纰 +泓 +惨 +檬 +亻 +越 +C +o +憩 +熵 +祷 +钒 +暧 +塔 +阗 +胰 +咄 +娶 +魔 +琶 +钞 +邻 +扬 +杉 +殴 +咽 +弓 +〆 +髻 +】 +吭 +揽 +霆 +拄 +殖 +脆 +彻 +岩 +芝 +勃 +辣 +剌 +钝 +嘎 +甄 +佘 +皖 +伦 +授 +徕 +憔 +挪 +皇 +庞 +稔 +芜 +踏 +溴 +兖 +卒 +擢 +饥 +鳞 +煲 +‰ +账 +颗 +叻 +斯 +捧 +鳍 +琮 +讹 +蛙 +纽 +谭 +酸 +兔 +莒 +睇 +伟 +觑 +羲 +嗜 +宜 +褐 +旎 +辛 +卦 +诘 +筋 +鎏 +溪 +挛 +熔 +阜 +晰 +鳅 +丢 +奚 +灸 +呱 +献 +陉 +黛 +鸪 +甾 +萨 +疮 +拯 +洲 +疹 +辑 +叙 +恻 +谒 +允 +柔 +烂 +氏 +逅 +漆 +拎 +惋 +扈 +湟 +纭 +啕 +掬 +擞 +哥 +忽 +涤 +鸵 +靡 +郗 +瓷 +扁 +廊 +怨 +雏 +钮 +敦 +E +懦 +憋 +汀 +拚 +啉 +腌 +岸 +f +痼 +瞅 +尊 +咀 +眩 +飙 +忌 +仝 +迦 +熬 +毫 +胯 +篑 +茄 +腺 +凄 +舛 +碴 +锵 +诧 +羯 +後 +漏 +汤 +宓 +仞 +蚁 +壶 +谰 +皑 +铄 +棰 +罔 +辅 +晶 +苦 +牟 +闽 +\ +烃 +饮 +聿 +丙 +蛳 +朱 +煤 +涔 +鳖 +犁 +罐 +荼 +砒 +淦 +妤 +黏 +戎 +孑 +婕 +瑾 +戢 +钵 +枣 +捋 +砥 +衩 +狙 +桠 +稣 +阎 +肃 +梏 +诫 +孪 +昶 +婊 +衫 +嗔 +侃 +塞 +蜃 +樵 +峒 +貌 +屿 +欺 +缫 +阐 +栖 +诟 +珞 +荭 +吝 +萍 +嗽 +恂 +啻 +蜴 +磬 +峋 +俸 +豫 +谎 +徊 +镍 +韬 +魇 +晴 +U +囟 +猜 +蛮 +坐 +囿 +伴 +亭 +肝 +佗 +蝠 +妃 +胞 +滩 +榴 +氖 +垩 +苋 +砣 +扪 +馏 +姓 +轩 +厉 +夥 +侈 +禀 +垒 +岑 +赏 +钛 +辐 +痔 +披 +纸 +碳 +“ +坞 +蠓 +挤 +荥 +沅 +悔 +铧 +帼 +蒌 +蝇 +a +p +y +n +g +哀 +浆 +瑶 +凿 +桶 +馈 +皮 +奴 +苜 +佤 +伶 +晗 +铱 +炬 +优 +弊 +氢 +恃 +甫 +攥 +端 +锌 +灰 +稹 +炝 +曙 +邋 +亥 +眶 +碾 +拉 +萝 +绔 +捷 +浍 +腋 +姑 +菖 +凌 +涞 +麽 +锢 +桨 +潢 +绎 +镰 +殆 +锑 +渝 +铬 +困 +绽 +觎 +匈 +糙 +暑 +裹 +鸟 +盔 +肽 +迷 +綦 +『 +亳 +佝 +俘 +钴 +觇 +骥 +仆 +疝 +跪 +婶 +郯 +瀹 +唉 +脖 +踞 +针 +晾 +忒 +扼 +瞩 +叛 +椒 +疟 +嗡 +邗 +肆 +跆 +玫 +忡 +捣 +咧 +唆 +艄 +蘑 +潦 +笛 +阚 +沸 +泻 +掊 +菽 +贫 +斥 +髂 +孢 +镂 +赂 +麝 +鸾 +屡 +衬 +苷 +恪 +叠 +希 +粤 +爻 +喝 +茫 +惬 +郸 +绻 +庸 +撅 +碟 +宄 +妹 +膛 +叮 +饵 +崛 +嗲 +椅 +冤 +搅 +咕 +敛 +尹 +垦 +闷 +蝉 +霎 +勰 +败 +蓑 +泸 +肤 +鹌 +幌 +焦 +浠 +鞍 +刁 +舰 +乙 +竿 +裔 +。 +茵 +函 +伊 +兄 +丨 +娜 +匍 +謇 +莪 +宥 +似 +蝽 +翳 +酪 +翠 +粑 +薇 +祢 +骏 +赠 +叫 +Q +噤 +噻 +竖 +芗 +莠 +潭 +俊 +羿 +耜 +O +郫 +趁 +嗪 +囚 +蹶 +芒 +洁 +笋 +鹑 +敲 +硝 +啶 +堡 +渲 +揩 +』 +携 +宿 +遒 +颍 +扭 +棱 +割 +萜 +蔸 +葵 +琴 +捂 +饰 +衙 +耿 +掠 +募 +岂 +窖 +涟 +蔺 +瘤 +柞 +瞪 +怜 +匹 +距 +楔 +炜 +哆 +秦 +缎 +幼 +茁 +绪 +痨 +恨 +楸 +娅 +瓦 +桩 +雪 +嬴 +伏 +榔 +妥 +铿 +拌 +眠 +雍 +缇 +‘ +卓 +搓 +哌 +觞 +噩 +屈 +哧 +髓 +咦 +巅 +娑 +侑 +淫 +膳 +祝 +勾 +姊 +莴 
+胄 +疃 +薛 +蜷 +胛 +巷 +芙 +芋 +熙 +闰 +勿 +窃 +狱 +剩 +钏 +幢 +陟 +铛 +慧 +靴 +耍 +k +浙 +浇 +飨 +惟 +绗 +祜 +澈 +啼 +咪 +磷 +摞 +诅 +郦 +抹 +跃 +壬 +吕 +肖 +琏 +颤 +尴 +剡 +抠 +凋 +赚 +泊 +津 +宕 +殷 +倔 +氲 +漫 +邺 +涎 +怠 +$ +垮 +荬 +遵 +俏 +叹 +噢 +饽 +蜘 +孙 +筵 +疼 +鞭 +羧 +牦 +箭 +潴 +c +眸 +祭 +髯 +啖 +坳 +愁 +芩 +驮 +倡 +巽 +穰 +沃 +胚 +怒 +凤 +槛 +剂 +趵 +嫁 +v +邢 +灯 +鄢 +桐 +睽 +檗 +锯 +槟 +婷 +嵋 +圻 +诗 +蕈 +颠 +遭 +痢 +芸 +怯 +馥 +竭 +锗 +徜 +恭 +遍 +籁 +剑 +嘱 +苡 +龄 +僧 +桑 +潸 +弘 +澶 +楹 +悲 +讫 +愤 +腥 +悸 +谍 +椹 +呢 +桓 +葭 +攫 +阀 +翰 +躲 +敖 +柑 +郎 +笨 +橇 +呃 +魁 +燎 +脓 +葩 +磋 +垛 +玺 +狮 +沓 +砜 +蕊 +锺 +罹 +蕉 +翱 +虐 +闾 +巫 +旦 +茱 +嬷 +枯 +鹏 +贡 +芹 +汛 +矫 +绁 +拣 +禺 +佃 +讣 +舫 +惯 +乳 +趋 +疲 +挽 +岚 +虾 +衾 +蠹 +蹂 +飓 +氦 +铖 +孩 +稞 +瑜 +壅 +掀 +勘 +妓 +畅 +髋 +W +庐 +牲 +蓿 +榕 +练 +垣 +唱 +邸 +菲 +昆 +婺 +穿 +绡 +麒 +蚱 +掂 +愚 +泷 +涪 +漳 +妩 +娉 +榄 +讷 +觅 +旧 +藤 +煮 +呛 +柳 +腓 +叭 +庵 +烷 +阡 +罂 +蜕 +擂 +猖 +咿 +媲 +脉 +【 +沏 +貅 +黠 +熏 +哲 +烁 +坦 +酵 +兜 +× +潇 +撒 +剽 +珩 +圹 +乾 +摸 +樟 +帽 +嗒 +襄 +魂 +轿 +憬 +锡 +〕 +喃 +皆 +咖 +隅 +脸 +残 +泮 +袂 +鹂 +珊 +囤 +捆 +咤 +误 +徨 +闹 +淙 +芊 +淋 +怆 +囗 +拨 +梳 +渤 +R +G +绨 +蚓 +婀 +幡 +狩 +麾 +谢 +唢 +裸 +旌 +伉 +纶 +裂 +驳 +砼 +咛 +澄 +樨 +蹈 +宙 +澍 +倍 +貔 +操 +勇 +蟠 +摈 +砧 +虬 +够 +缁 +悦 +藿 +撸 +艹 +摁 +淹 +豇 +虎 +榭 +ˉ +吱 +d +° +喧 +荀 +踱 +侮 +奋 +偕 +饷 +犍 +惮 +坑 +璎 +徘 +宛 +妆 +袈 +倩 +窦 +昂 +荏 +乖 +K +怅 +撰 +鳙 +牙 +袁 +酞 +X +痿 +琼 +闸 +雁 +趾 +荚 +虻 +涝 +《 +杏 +韭 +偈 +烤 +绫 +鞘 +卉 +症 +遢 +蓥 +诋 +杭 +荨 +匆 +竣 +簪 +辙 +敕 +虞 +丹 +缭 +咩 +黟 +m +淤 +瑕 +咂 +铉 +硼 +茨 +嶂 +痒 +畸 +敬 +涿 +粪 +窘 +熟 +叔 +嫔 +盾 +忱 +裘 +憾 +梵 +赡 +珙 +咯 +娘 +庙 +溯 +胺 +葱 +痪 +摊 +荷 +卞 +乒 +髦 +寐 +铭 +坩 +胗 +枷 +爆 +溟 +嚼 +羚 +砬 +轨 +惊 +挠 +罄 +竽 +菏 +氧 +浅 +楣 +盼 +枢 +炸 +阆 +杯 +谏 +噬 +淇 +渺 +俪 +秆 +墓 +泪 +跻 +砌 +痰 +垡 +渡 +耽 +釜 +讶 +鳎 +煞 +呗 +韶 +舶 +绷 +鹳 +缜 +旷 +铊 +皱 +龌 +檀 +霖 +奄 +槐 +艳 +蝶 +旋 +哝 +赶 +骞 +蚧 +腊 +盈 +丁 +` +蜚 +矸 +蝙 +睨 +嚓 +僻 +鬼 +醴 +夜 +彝 +磊 +笔 +拔 +栀 +糕 +厦 +邰 +纫 +逭 +纤 +眦 +膊 +馍 +躇 +烯 +蘼 +冬 +诤 +暄 +骶 +哑 +瘠 +」 +臊 +丕 +愈 +咱 +螺 +擅 +跋 +搏 +硪 +谄 +笠 +淡 +嘿 +骅 +谧 +鼎 +皋 +姚 +歼 +蠢 +驼 +耳 +胬 +挝 +涯 +狗 +蒽 +孓 +犷 +凉 +芦 +箴 +铤 +孤 +嘛 +坤 +V +茴 +朦 +挞 +尖 +橙 +诞 +搴 +碇 +洵 +浚 +帚 +蜍 +漯 +柘 +嚎 +讽 +芭 +荤 +咻 +祠 +秉 +跖 +埃 +吓 +糯 +眷 +馒 +惹 +娼 +鲑 +嫩 +讴 +轮 +瞥 +靶 +褚 +乏 +缤 +宋 +帧 +删 +驱 +碎 +扑 +俩 +俄 +偏 +涣 +竹 +噱 +皙 +佰 +渚 +唧 +斡 +# +镉 +刀 +崎 +筐 +佣 +夭 +贰 +肴 +峙 +哔 +艿 +匐 +牺 +镛 +缘 +仡 +嫡 +劣 +枸 +堀 +梨 +簿 +鸭 +蒸 +亦 +稽 +浴 +{ +衢 +束 +槲 +j +阁 +揍 +疥 +棋 +潋 +聪 +窜 +乓 +睛 +插 +冉 +阪 +苍 +搽 +「 +蟾 +螟 +幸 +仇 +樽 +撂 +慢 +跤 +幔 +俚 +淅 +覃 +觊 +溶 +妖 +帛 +侨 +曰 +妾 +泗 +· +: +瀘 +風 +Ë +( +) +∶ +紅 +紗 +瑭 +雲 +頭 +鶏 +財 +許 +• +¥ +樂 +焗 +麗 +— +; +滙 +東 +榮 +繪 +興 +… +門 +業 +π +楊 +國 +顧 +é +盤 +寳 +Λ +龍 +鳳 +島 +誌 +緣 +結 +銭 +萬 +勝 +祎 +璟 +優 +歡 +臨 +時 +購 += +★ +藍 +昇 +鐵 +觀 +勅 +農 +聲 +畫 +兿 +術 +發 +劉 +記 +專 +耑 +園 +書 +壴 +種 +Ο +● +褀 +號 +銀 +匯 +敟 +锘 +葉 +橪 +廣 +進 +蒄 +鑽 +阝 +祙 +貢 +鍋 +豊 +夬 +喆 +團 +閣 +開 +燁 +賓 +館 +酡 +沔 +順 ++ +硚 +劵 +饸 +陽 +車 +湓 +復 +萊 +氣 +軒 +華 +堃 +迮 +纟 +戶 +馬 +學 +裡 +電 +嶽 +獨 +マ +シ +サ +ジ +燘 +袪 +環 +❤ +臺 +灣 +専 +賣 +孖 +聖 +攝 +線 +▪ +α +傢 +俬 +夢 +達 +莊 +喬 +貝 +薩 +劍 +羅 +壓 +棛 +饦 +尃 +璈 +囍 +醫 +G +I +A +# +N +鷄 +髙 +嬰 +啓 +約 +隹 +潔 +賴 +藝 +~ +寶 +籣 +麺 +  +嶺 +√ +義 +網 +峩 +長 +∧ +魚 +機 +構 +② +鳯 +偉 +L +B +㙟 +畵 +鴿 +' +詩 +溝 +嚞 +屌 +藔 +佧 +玥 +蘭 +織 +1 +3 +9 +0 +7 +點 +砭 +鴨 +鋪 +銘 +廳 +弍 +‧ +創 +湯 +坶 +℃ +卩 +骝 +& +烜 +荘 +當 +潤 +扞 +係 +懷 +碶 +钅 +蚨 +讠 +☆ +叢 +爲 +埗 +涫 +塗 +→ +楽 +現 +鯨 +愛 +瑪 +鈺 +忄 +悶 +藥 +飾 +樓 +視 +孬 +ㆍ +燚 +苪 +師 +① +丼 +锽 +│ +韓 +標 +è +兒 +閏 +匋 +張 +漢 +Ü +髪 +會 +閑 +檔 +習 +裝 +の +峯 +菘 +輝 +И +雞 +釣 +億 +浐 +K +O +R +8 +H +E +P +T +W +D +S +C +M +F +姌 +饹 +» +晞 +廰 +ä +嵯 +鷹 +負 +飲 +絲 +冚 +楗 +澤 +綫 +區 +❋ +← +質 +靑 +揚 +③ +滬 +統 +産 +協 +﹑ +乸 +畐 +經 +運 +際 +洺 +岽 +為 +粵 +諾 +崋 +豐 +碁 +ɔ +V +2 +6 +齋 +誠 +訂 +´ +勑 +雙 +陳 +無 +í +泩 +媄 +夌 +刂 +i +c +t +o +r +a +嘢 +耄 +燴 +暃 +壽 +媽 +靈 +抻 +體 +唻 +É +冮 +甹 +鎮 +錦 +ʌ +蜛 +蠄 +尓 +駕 +戀 +飬 +逹 +倫 +貴 +極 +Я +Й +寬 +磚 +嶪 +郎 +職 +| +間 +n +d +剎 +伈 +課 +飛 +橋 +瘊 +№ +譜 +骓 +圗 +滘 +縣 +粿 +咅 +養 +濤 +彳 +® +% +Ⅱ +啰 +㴪 +見 +矞 +薬 +糁 +邨 +鲮 +顔 +罱 +З +選 +話 +贏 +氪 +俵 +競 +瑩 +繡 +枱 +β +綉 +á +獅 +爾 +™ +麵 +戋 +淩 +徳 +個 +劇 +場 +務 +簡 +寵 +h +實 +膠 +轱 +圖 +築 +嘣 +樹 +㸃 +營 +耵 +孫 +饃 +鄺 +飯 +麯 +遠 +輸 +坫 +孃 +乚 
+閃 +鏢 +㎡ +題 +廠 +關 +↑ +爺 +將 +軍 +連 +篦 +覌 +參 +箸 +- +窠 +棽 +寕 +夀 +爰 +歐 +呙 +閥 +頡 +熱 +雎 +垟 +裟 +凬 +勁 +帑 +馕 +夆 +疌 +枼 +馮 +貨 +蒤 +樸 +彧 +旸 +靜 +龢 +暢 +㐱 +鳥 +珺 +鏡 +灡 +爭 +堷 +廚 +Ó +騰 +診 +┅ +蘇 +褔 +凱 +頂 +豕 +亞 +帥 +嘬 +⊥ +仺 +桖 +複 +饣 +絡 +穂 +顏 +棟 +納 +▏ +濟 +親 +設 +計 +攵 +埌 +烺 +ò +頤 +燦 +蓮 +撻 +節 +講 +濱 +濃 +娽 +洳 +朿 +燈 +鈴 +護 +膚 +铔 +過 +補 +Z +U +5 +4 +坋 +闿 +䖝 +餘 +缐 +铞 +貿 +铪 +桼 +趙 +鍊 +[ +㐂 +垚 +菓 +揸 +捲 +鐘 +滏 +𣇉 +爍 +輪 +燜 +鴻 +鮮 +動 +鹞 +鷗 +丄 +慶 +鉌 +翥 +飮 +腸 +⇋ +漁 +覺 +來 +熘 +昴 +翏 +鲱 +圧 +鄉 +萭 +頔 +爐 +嫚 +г +貭 +類 +聯 +幛 +輕 +訓 +鑒 +夋 +锨 +芃 +珣 +䝉 +扙 +嵐 +銷 +處 +ㄱ +語 +誘 +苝 +歸 +儀 +燒 +楿 +內 +粢 +葒 +奧 +麥 +礻 +滿 +蠔 +穵 +瞭 +態 +鱬 +榞 +硂 +鄭 +黃 +煙 +祐 +奓 +逺 +* +瑄 +獲 +聞 +薦 +讀 +這 +樣 +決 +問 +啟 +們 +執 +説 +轉 +單 +隨 +唘 +帶 +倉 +庫 +還 +贈 +尙 +皺 +■ +餅 +產 +○ +∈ +報 +狀 +楓 +賠 +琯 +嗮 +禮 +` +傳 +> +≤ +嗞 +Φ +≥ +換 +咭 +∣ +↓ +曬 +ε +応 +寫 +″ +終 +様 +純 +費 +療 +聨 +凍 +壐 +郵 +ü +黒 +∫ +製 +塊 +調 +軽 +確 +撃 +級 +馴 +Ⅲ +涇 +繹 +數 +碼 +證 +狒 +処 +劑 +< +晧 +賀 +衆 +] +櫥 +兩 +陰 +絶 +對 +鯉 +憶 +◎ +p +e +Y +蕒 +煖 +頓 +測 +試 +鼽 +僑 +碩 +妝 +帯 +≈ +鐡 +舖 +權 +喫 +倆 +ˋ +該 +悅 +ā +俫 +. +f +s +b +m +k +g +u +j +貼 +淨 +濕 +針 +適 +備 +l +/ +給 +謢 +強 +觸 +衛 +與 +⊙ +$ +緯 +變 +⑴ +⑵ +⑶ +㎏ +殺 +∩ +幚 +─ +價 +▲ +離 +ú +ó +飄 +烏 +関 +閟 +﹝ +﹞ +邏 +輯 +鍵 +驗 +訣 +導 +歷 +屆 +層 +▼ +儱 +錄 +熳 +ē +艦 +吋 +錶 +辧 +飼 +顯 +④ +禦 +販 +気 +対 +枰 +閩 +紀 +幹 +瞓 +貊 +淚 +△ +眞 +墊 +Ω +獻 +褲 +縫 +緑 +亜 +鉅 +餠 +{ +} +◆ +蘆 +薈 +█ +◇ +溫 +彈 +晳 +粧 +犸 +穩 +訊 +崬 +凖 +熥 +П +舊 +條 +紋 +圍 +Ⅳ +筆 +尷 +難 +雜 +錯 +綁 +識 +頰 +鎖 +艶 +□ +殁 +殼 +⑧ +├ +▕ +鵬 +ǐ +ō +ǒ +糝 +綱 +▎ +μ +盜 +饅 +醬 +籤 +蓋 +釀 +鹽 +據 +à +ɡ +辦 +◥ +彐 +┌ +婦 +獸 +鲩 +伱 +ī +蒟 +蒻 +齊 +袆 +腦 +寧 +凈 +妳 +煥 +詢 +偽 +謹 +啫 +鯽 +騷 +鱸 +損 +傷 +鎻 +髮 +買 +冏 +儥 +両 +﹢ +∞ +載 +喰 +z +羙 +悵 +燙 +曉 +員 +組 +徹 +艷 +痠 +鋼 +鼙 +縮 +細 +嚒 +爯 +≠ +維 +" +鱻 +壇 +厍 +帰 +浥 +犇 +薡 +軎 +² +應 +醜 +刪 +緻 +鶴 +賜 +噁 +軌 +尨 +镔 +鷺 +槗 +彌 +葚 +濛 +請 +溇 +緹 +賢 +訪 +獴 +瑅 +資 +縤 +陣 +蕟 +栢 +韻 +祼 +恁 +伢 +謝 +劃 +涑 +總 +衖 +踺 +砋 +凉 +籃 +駿 +苼 +瘋 +昽 +紡 +驊 +腎 +﹗ +響 +杋 +剛 +嚴 +禪 +歓 +槍 +傘 +檸 +檫 +炣 +勢 +鏜 +鎢 +銑 +尐 +減 +奪 +惡 +θ +僮 +婭 +臘 +ū +ì +殻 +鉄 +∑ +蛲 +焼 +緖 +續 +紹 +懮 diff --git a/deploy/fastdeploy/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/examples/ocr/OcrMainActivity.java b/deploy/fastdeploy/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/examples/ocr/OcrMainActivity.java new file mode 100644 index 0000000000000000000000000000000000000000..9ce3b38765ca4b44c375723b45b8f995838820d5 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/examples/ocr/OcrMainActivity.java @@ -0,0 +1,500 @@ +package com.baidu.paddle.fastdeploy.app.examples.ocr; + +import static com.baidu.paddle.fastdeploy.app.ui.Utils.decodeBitmap; +import static com.baidu.paddle.fastdeploy.app.ui.Utils.getRealPathFromURI; + +import android.Manifest; +import android.annotation.SuppressLint; +import android.app.Activity; +import android.app.AlertDialog; +import android.content.DialogInterface; +import android.content.Intent; +import android.content.SharedPreferences; +import android.content.pm.PackageManager; +import android.graphics.Bitmap; +import android.net.Uri; +import android.os.Bundle; +import android.os.SystemClock; +import android.preference.PreferenceManager; +import android.support.annotation.NonNull; +import android.support.v4.app.ActivityCompat; +import android.support.v4.content.ContextCompat; +import android.view.View; +import android.view.ViewGroup; +import android.view.Window; +import android.view.WindowManager; +import android.widget.ImageButton; +import android.widget.ImageView; +import android.widget.SeekBar; +import android.widget.TextView; + +import com.baidu.paddle.fastdeploy.RuntimeOption; +import com.baidu.paddle.fastdeploy.app.examples.R; +import com.baidu.paddle.fastdeploy.app.ui.view.CameraSurfaceView; 
+import com.baidu.paddle.fastdeploy.app.ui.view.ResultListView; +import com.baidu.paddle.fastdeploy.app.ui.Utils; +import com.baidu.paddle.fastdeploy.app.ui.view.adapter.BaseResultAdapter; +import com.baidu.paddle.fastdeploy.app.ui.view.model.BaseResultModel; +import com.baidu.paddle.fastdeploy.pipeline.PPOCRv3; +import com.baidu.paddle.fastdeploy.vision.OCRResult; +import com.baidu.paddle.fastdeploy.vision.Visualize; +import com.baidu.paddle.fastdeploy.vision.ocr.Classifier; +import com.baidu.paddle.fastdeploy.vision.ocr.DBDetector; +import com.baidu.paddle.fastdeploy.vision.ocr.Recognizer; + +import java.math.BigDecimal; +import java.util.ArrayList; +import java.util.List; + +public class OcrMainActivity extends Activity implements View.OnClickListener, CameraSurfaceView.OnTextureChangedListener { + private static final String TAG = OcrMainActivity.class.getSimpleName(); + + CameraSurfaceView svPreview; + TextView tvStatus; + ImageButton btnSwitch; + ImageButton btnShutter; + ImageButton btnSettings; + ImageView realtimeToggleButton; + boolean isRealtimeStatusRunning = false; + ImageView backInPreview; + private ImageView albumSelectButton; + private View cameraPageView; + private ViewGroup resultPageView; + private ImageView resultImage; + private ImageView backInResult; + private SeekBar confidenceSeekbar; + private TextView seekbarText; + private float resultNum = 1.0f; + private ResultListView resultView; + private Bitmap picBitmap; + private Bitmap shutterBitmap; + private Bitmap originPicBitmap; + private Bitmap originShutterBitmap; + private boolean isShutterBitmapCopied = false; + + public static final int TYPE_UNKNOWN = -1; + public static final int BTN_SHUTTER = 0; + public static final int ALBUM_SELECT = 1; + public static final int REALTIME_DETECT = 2; + private static int TYPE = REALTIME_DETECT; + + private static final int REQUEST_PERMISSION_CODE_STORAGE = 101; + private static final int INTENT_CODE_PICK_IMAGE = 100; + private static final int TIME_SLEEP_INTERVAL = 50; // ms + + long timeElapsed = 0; + long frameCounter = 0; + + // Call 'init' and 'release' manually later + PPOCRv3 predictor = new PPOCRv3(); + + private String[] texts; + private float[] recScores; + private boolean initialized; + private List results = new ArrayList<>(); + + @Override + protected void onCreate(Bundle savedInstanceState) { + super.onCreate(savedInstanceState); + + // Fullscreen + requestWindowFeature(Window.FEATURE_NO_TITLE); + getWindow().setFlags(WindowManager.LayoutParams.FLAG_FULLSCREEN, WindowManager.LayoutParams.FLAG_FULLSCREEN); + + setContentView(R.layout.ocr_activity_main); + + // Clear all setting items to avoid app crashing due to the incorrect settings + initSettings(); + + // Check and request CAMERA and WRITE_EXTERNAL_STORAGE permissions + if (!checkAllPermissions()) { + requestAllPermissions(); + } + + // Init the camera preview and UI components + initView(); + } + + @SuppressLint("NonConstantResourceId") + @Override + public void onClick(View v) { + switch (v.getId()) { + case R.id.btn_switch: + svPreview.switchCamera(); + break; + case R.id.btn_shutter: + TYPE = BTN_SHUTTER; + shutterAndPauseCamera(); + resultView.setAdapter(null); + break; + case R.id.btn_settings: + startActivity(new Intent(OcrMainActivity.this, OcrSettingsActivity.class)); + break; + case R.id.realtime_toggle_btn: + toggleRealtimeStyle(); + break; + case R.id.back_in_preview: + finish(); + break; + case R.id.iv_select: + TYPE = ALBUM_SELECT; + // Judge whether authority has been granted. 
+ if (ContextCompat.checkSelfPermission(this, Manifest.permission.WRITE_EXTERNAL_STORAGE) != PackageManager.PERMISSION_GRANTED) { + // If this permission was requested before the application but the user refused the request, this method will return true. + ActivityCompat.requestPermissions(this, new String[]{Manifest.permission.WRITE_EXTERNAL_STORAGE}, REQUEST_PERMISSION_CODE_STORAGE); + } else { + Intent intent = new Intent(Intent.ACTION_PICK); + intent.setType("image/*"); + startActivityForResult(intent, INTENT_CODE_PICK_IMAGE); + } + resultView.setAdapter(null); + break; + case R.id.back_in_result: + back(); + break; + } + } + + @Override + public void onBackPressed() { + super.onBackPressed(); + back(); + } + + private void back() { + resultPageView.setVisibility(View.GONE); + cameraPageView.setVisibility(View.VISIBLE); + TYPE = REALTIME_DETECT; + isShutterBitmapCopied = false; + svPreview.onResume(); + results.clear(); + if (texts != null) { + texts = null; + } + if (recScores != null) { + recScores = null; + } + } + + private void shutterAndPauseCamera() { + new Thread(new Runnable() { + @Override + public void run() { + try { + // Sleep some times to ensure picture has been correctly shut. + Thread.sleep(TIME_SLEEP_INTERVAL * 10); // 500ms + } catch (InterruptedException e) { + e.printStackTrace(); + } + runOnUiThread(new Runnable() { + @SuppressLint("SetTextI18n") + public void run() { + // These code will run in main thread. + svPreview.onPause(); + cameraPageView.setVisibility(View.GONE); + resultPageView.setVisibility(View.VISIBLE); + seekbarText.setText(resultNum + ""); + confidenceSeekbar.setProgress((int) (resultNum * 100)); + if (shutterBitmap != null && !shutterBitmap.isRecycled()) { + resultImage.setImageBitmap(shutterBitmap); + } else { + new AlertDialog.Builder(OcrMainActivity.this) + .setTitle("Empty Result!") + .setMessage("Current picture is empty, please shutting it again!") + .setCancelable(true) + .show(); + } + } + }); + + } + }).start(); + } + + private void copyBitmapFromCamera(Bitmap ARGB8888ImageBitmap) { + if (isShutterBitmapCopied || ARGB8888ImageBitmap == null) { + return; + } + if (!ARGB8888ImageBitmap.isRecycled()) { + synchronized (this) { + shutterBitmap = ARGB8888ImageBitmap.copy(Bitmap.Config.ARGB_8888, true); + originShutterBitmap = ARGB8888ImageBitmap.copy(Bitmap.Config.ARGB_8888, true); + } + SystemClock.sleep(TIME_SLEEP_INTERVAL); + isShutterBitmapCopied = true; + } + } + + + @Override + protected void onActivityResult(int requestCode, int resultCode, Intent data) { + super.onActivityResult(requestCode, resultCode, data); + if (requestCode == INTENT_CODE_PICK_IMAGE) { + if (resultCode == Activity.RESULT_OK) { + cameraPageView.setVisibility(View.GONE); + resultPageView.setVisibility(View.VISIBLE); + seekbarText.setText(resultNum + ""); + confidenceSeekbar.setProgress((int) (resultNum * 100)); + Uri uri = data.getData(); + String path = getRealPathFromURI(this, uri); + picBitmap = decodeBitmap(path, 720, 1280); + originPicBitmap = picBitmap.copy(Bitmap.Config.ARGB_8888, true); + resultImage.setImageBitmap(picBitmap); + } + } + } + + private void toggleRealtimeStyle() { + if (isRealtimeStatusRunning) { + isRealtimeStatusRunning = false; + realtimeToggleButton.setImageResource(R.drawable.realtime_stop_btn); + svPreview.setOnTextureChangedListener(this); + tvStatus.setVisibility(View.VISIBLE); + } else { + isRealtimeStatusRunning = true; + realtimeToggleButton.setImageResource(R.drawable.realtime_start_btn); + tvStatus.setVisibility(View.GONE); + 
isShutterBitmapCopied = false; + svPreview.setOnTextureChangedListener(new CameraSurfaceView.OnTextureChangedListener() { + @Override + public boolean onTextureChanged(Bitmap ARGB8888ImageBitmap) { + if (TYPE == BTN_SHUTTER) { + copyBitmapFromCamera(ARGB8888ImageBitmap); + } + return false; + } + }); + } + } + + @Override + public boolean onTextureChanged(Bitmap ARGB8888ImageBitmap) { + if (TYPE == BTN_SHUTTER) { + copyBitmapFromCamera(ARGB8888ImageBitmap); + return false; + } + + boolean modified = false; + + long tc = System.currentTimeMillis(); + OCRResult result = predictor.predict(ARGB8888ImageBitmap); + timeElapsed += (System.currentTimeMillis() - tc); + + Visualize.visOcr(ARGB8888ImageBitmap, result); + modified = result.initialized(); + + frameCounter++; + if (frameCounter >= 30) { + final int fps = (int) (1000 / (timeElapsed / 30)); + runOnUiThread(new Runnable() { + @SuppressLint("SetTextI18n") + public void run() { + tvStatus.setText(Integer.toString(fps) + "fps"); + } + }); + frameCounter = 0; + timeElapsed = 0; + } + return modified; + } + + @Override + protected void onResume() { + super.onResume(); + // Reload settings and re-initialize the predictor + checkAndUpdateSettings(); + // Open camera until the permissions have been granted + if (!checkAllPermissions()) { + svPreview.disableCamera(); + } else { + svPreview.enableCamera(); + } + svPreview.onResume(); + } + + @Override + protected void onPause() { + super.onPause(); + svPreview.onPause(); + } + + @Override + protected void onDestroy() { + if (predictor != null) { + predictor.release(); + } + super.onDestroy(); + } + + public void initView() { + TYPE = REALTIME_DETECT; + svPreview = (CameraSurfaceView) findViewById(R.id.sv_preview); + svPreview.setOnTextureChangedListener(this); + tvStatus = (TextView) findViewById(R.id.tv_status); + btnSwitch = (ImageButton) findViewById(R.id.btn_switch); + btnSwitch.setOnClickListener(this); + btnShutter = (ImageButton) findViewById(R.id.btn_shutter); + btnShutter.setOnClickListener(this); + btnSettings = (ImageButton) findViewById(R.id.btn_settings); + btnSettings.setOnClickListener(this); + realtimeToggleButton = findViewById(R.id.realtime_toggle_btn); + realtimeToggleButton.setOnClickListener(this); + backInPreview = findViewById(R.id.back_in_preview); + backInPreview.setOnClickListener(this); + albumSelectButton = findViewById(R.id.iv_select); + albumSelectButton.setOnClickListener(this); + cameraPageView = findViewById(R.id.camera_page); + resultPageView = findViewById(R.id.result_page); + resultImage = findViewById(R.id.result_image); + backInResult = findViewById(R.id.back_in_result); + backInResult.setOnClickListener(this); + confidenceSeekbar = findViewById(R.id.confidence_seekbar); + seekbarText = findViewById(R.id.seekbar_text); + resultView = findViewById(R.id.result_list_view); + + confidenceSeekbar.setMax(100); + confidenceSeekbar.setOnSeekBarChangeListener(new SeekBar.OnSeekBarChangeListener() { + @Override + public void onProgressChanged(SeekBar seekBar, int progress, boolean fromUser) { + float resultConfidence = seekBar.getProgress() / 100f; + BigDecimal bd = new BigDecimal(resultConfidence); + resultNum = bd.setScale(1, BigDecimal.ROUND_HALF_UP).floatValue(); + seekbarText.setText(resultNum + ""); + confidenceSeekbar.setProgress((int) (resultNum * 100)); + results.clear(); + } + + @Override + public void onStartTrackingTouch(SeekBar seekBar) { + + } + + @Override + public void onStopTrackingTouch(SeekBar seekBar) { + runOnUiThread(new Runnable() { + @Override + 
public void run() { + if (TYPE == ALBUM_SELECT) { + SystemClock.sleep(TIME_SLEEP_INTERVAL * 10); + detail(picBitmap); + picBitmap = originPicBitmap.copy(Bitmap.Config.ARGB_8888, true); + } else { + SystemClock.sleep(TIME_SLEEP_INTERVAL * 10); + detail(shutterBitmap); + shutterBitmap = originShutterBitmap.copy(Bitmap.Config.ARGB_8888, true); + } + } + }); + } + }); + } + + private void detail(Bitmap bitmap) { + OCRResult result = predictor.predict(bitmap, true); + + texts = result.mText; + recScores = result.mRecScores; + + initialized = result.initialized(); + if (initialized) { + for (int i = 0; i < texts.length; i++) { + if (recScores[i] > resultNum) { + results.add(new BaseResultModel(i + 1, texts[i], recScores[i])); + } + } + } + BaseResultAdapter adapter = new BaseResultAdapter(getBaseContext(), R.layout.ocr_result_page_item, results); + resultView.setAdapter(adapter); + resultView.invalidate(); + + resultImage.setImageBitmap(bitmap); + resultNum = 1.0f; + } + + @SuppressLint("ApplySharedPref") + public void initSettings() { + SharedPreferences sharedPreferences = PreferenceManager.getDefaultSharedPreferences(this); + SharedPreferences.Editor editor = sharedPreferences.edit(); + editor.clear(); + editor.commit(); + OcrSettingsActivity.resetSettings(); + } + + public void checkAndUpdateSettings() { + if (OcrSettingsActivity.checkAndUpdateSettings(this)) { + String realModelDir = getCacheDir() + "/" + OcrSettingsActivity.modelDir; + String detModelName = "ch_PP-OCRv3_det_infer"; + // String detModelName = "ch_ppocr_mobile_v2.0_det_infer"; + String clsModelName = "ch_ppocr_mobile_v2.0_cls_infer"; + // String recModelName = "ch_ppocr_mobile_v2.0_rec_infer"; + String recModelName = "ch_PP-OCRv3_rec_infer"; + String realDetModelDir = realModelDir + "/" + detModelName; + String realClsModelDir = realModelDir + "/" + clsModelName; + String realRecModelDir = realModelDir + "/" + recModelName; + String srcDetModelDir = OcrSettingsActivity.modelDir + "/" + detModelName; + String srcClsModelDir = OcrSettingsActivity.modelDir + "/" + clsModelName; + String srcRecModelDir = OcrSettingsActivity.modelDir + "/" + recModelName; + Utils.copyDirectoryFromAssets(this, srcDetModelDir, realDetModelDir); + Utils.copyDirectoryFromAssets(this, srcClsModelDir, realClsModelDir); + Utils.copyDirectoryFromAssets(this, srcRecModelDir, realRecModelDir); + String realLabelPath = getCacheDir() + "/" + OcrSettingsActivity.labelPath; + Utils.copyFileFromAssets(this, OcrSettingsActivity.labelPath, realLabelPath); + + String detModelFile = realDetModelDir + "/" + "inference.pdmodel"; + String detParamsFile = realDetModelDir + "/" + "inference.pdiparams"; + String clsModelFile = realClsModelDir + "/" + "inference.pdmodel"; + String clsParamsFile = realClsModelDir + "/" + "inference.pdiparams"; + String recModelFile = realRecModelDir + "/" + "inference.pdmodel"; + String recParamsFile = realRecModelDir + "/" + "inference.pdiparams"; + String recLabelFilePath = realLabelPath; // ppocr_keys_v1.txt + RuntimeOption detOption = new RuntimeOption(); + RuntimeOption clsOption = new RuntimeOption(); + RuntimeOption recOption = new RuntimeOption(); + detOption.setCpuThreadNum(OcrSettingsActivity.cpuThreadNum); + clsOption.setCpuThreadNum(OcrSettingsActivity.cpuThreadNum); + recOption.setCpuThreadNum(OcrSettingsActivity.cpuThreadNum); + detOption.setLitePowerMode(OcrSettingsActivity.cpuPowerMode); + clsOption.setLitePowerMode(OcrSettingsActivity.cpuPowerMode); + recOption.setLitePowerMode(OcrSettingsActivity.cpuPowerMode); + if 
(Boolean.parseBoolean(OcrSettingsActivity.enableLiteFp16)) { + detOption.enableLiteFp16(); + clsOption.enableLiteFp16(); + recOption.enableLiteFp16(); + } + DBDetector detModel = new DBDetector(detModelFile, detParamsFile, detOption); + Classifier clsModel = new Classifier(clsModelFile, clsParamsFile, clsOption); + Recognizer recModel = new Recognizer(recModelFile, recParamsFile, recLabelFilePath, recOption); + predictor.init(detModel, clsModel, recModel); + + } + } + + @Override + public void onRequestPermissionsResult(int requestCode, @NonNull String[] permissions, + @NonNull int[] grantResults) { + super.onRequestPermissionsResult(requestCode, permissions, grantResults); + if (grantResults[0] != PackageManager.PERMISSION_GRANTED || grantResults[1] != PackageManager.PERMISSION_GRANTED) { + new AlertDialog.Builder(OcrMainActivity.this) + .setTitle("Permission denied") + .setMessage("Click to force quit the app, then open Settings->Apps & notifications->Target " + + "App->Permissions to grant all of the permissions.") + .setCancelable(false) + .setPositiveButton("Exit", new DialogInterface.OnClickListener() { + @Override + public void onClick(DialogInterface dialog, int which) { + OcrMainActivity.this.finish(); + } + }).show(); + } + } + + private void requestAllPermissions() { + ActivityCompat.requestPermissions(this, new String[]{Manifest.permission.WRITE_EXTERNAL_STORAGE, + Manifest.permission.CAMERA}, 0); + } + + private boolean checkAllPermissions() { + return ContextCompat.checkSelfPermission(this, Manifest.permission.WRITE_EXTERNAL_STORAGE) == PackageManager.PERMISSION_GRANTED + && ContextCompat.checkSelfPermission(this, Manifest.permission.CAMERA) == PackageManager.PERMISSION_GRANTED; + } +} diff --git a/deploy/fastdeploy/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/examples/ocr/OcrSettingsActivity.java b/deploy/fastdeploy/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/examples/ocr/OcrSettingsActivity.java new file mode 100644 index 0000000000000000000000000000000000000000..6f8c45ff4f4cd7e6f1407f6a78ab89d262bb43bf --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/examples/ocr/OcrSettingsActivity.java @@ -0,0 +1,198 @@ +package com.baidu.paddle.fastdeploy.app.examples.ocr; + +import android.annotation.SuppressLint; +import android.content.Context; +import android.content.SharedPreferences; +import android.os.Bundle; +import android.preference.EditTextPreference; +import android.preference.ListPreference; +import android.preference.PreferenceManager; +import android.support.v7.app.ActionBar; + +import com.baidu.paddle.fastdeploy.app.examples.R; +import com.baidu.paddle.fastdeploy.app.ui.Utils; +import com.baidu.paddle.fastdeploy.app.ui.view.AppCompatPreferenceActivity; + +import java.util.ArrayList; +import java.util.List; + +public class OcrSettingsActivity extends AppCompatPreferenceActivity implements + SharedPreferences.OnSharedPreferenceChangeListener { + private static final String TAG = OcrSettingsActivity.class.getSimpleName(); + + static public int selectedModelIdx = -1; + static public String modelDir = ""; + static public String labelPath = ""; + static public int cpuThreadNum = 2; + static public String cpuPowerMode = ""; + static public float scoreThreshold = 0.4f; + static public String enableLiteFp16 = "true"; + + ListPreference lpChoosePreInstalledModel = null; + EditTextPreference etModelDir = null; + EditTextPreference etLabelPath = null; + ListPreference lpCPUThreadNum = null; + ListPreference 
lpCPUPowerMode = null; + EditTextPreference etScoreThreshold = null; + ListPreference lpEnableLiteFp16 = null; + + List preInstalledModelDirs = null; + List preInstalledLabelPaths = null; + List preInstalledCPUThreadNums = null; + List preInstalledCPUPowerModes = null; + List preInstalledScoreThresholds = null; + List preInstalledEnableLiteFp16s = null; + + @Override + public void onCreate(Bundle savedInstanceState) { + super.onCreate(savedInstanceState); + addPreferencesFromResource(R.xml.ocr_settings); + ActionBar supportActionBar = getSupportActionBar(); + if (supportActionBar != null) { + supportActionBar.setDisplayHomeAsUpEnabled(true); + } + + // Initialize pre-installed models + preInstalledModelDirs = new ArrayList(); + preInstalledLabelPaths = new ArrayList(); + preInstalledCPUThreadNums = new ArrayList(); + preInstalledCPUPowerModes = new ArrayList(); + preInstalledScoreThresholds = new ArrayList(); + preInstalledEnableLiteFp16s = new ArrayList(); + preInstalledModelDirs.add(getString(R.string.OCR_MODEL_DIR_DEFAULT)); + preInstalledLabelPaths.add(getString(R.string.OCR_REC_LABEL_DEFAULT)); + preInstalledCPUThreadNums.add(getString(R.string.CPU_THREAD_NUM_DEFAULT)); + preInstalledCPUPowerModes.add(getString(R.string.CPU_POWER_MODE_DEFAULT)); + preInstalledScoreThresholds.add(getString(R.string.SCORE_THRESHOLD_DEFAULT)); + preInstalledEnableLiteFp16s.add(getString(R.string.ENABLE_LITE_FP16_MODE_DEFAULT)); + + // Setup UI components + lpChoosePreInstalledModel = + (ListPreference) findPreference(getString(R.string.CHOOSE_PRE_INSTALLED_MODEL_KEY)); + String[] preInstalledModelNames = new String[preInstalledModelDirs.size()]; + for (int i = 0; i < preInstalledModelDirs.size(); i++) { + preInstalledModelNames[i] = preInstalledModelDirs.get(i).substring(preInstalledModelDirs.get(i).lastIndexOf("/") + 1); + } + lpChoosePreInstalledModel.setEntries(preInstalledModelNames); + lpChoosePreInstalledModel.setEntryValues(preInstalledModelDirs.toArray(new String[preInstalledModelDirs.size()])); + lpCPUThreadNum = (ListPreference) findPreference(getString(R.string.CPU_THREAD_NUM_KEY)); + lpCPUPowerMode = (ListPreference) findPreference(getString(R.string.CPU_POWER_MODE_KEY)); + etModelDir = (EditTextPreference) findPreference(getString(R.string.MODEL_DIR_KEY)); + etModelDir.setTitle("Model dir (SDCard: " + Utils.getSDCardDirectory() + ")"); + etLabelPath = (EditTextPreference) findPreference(getString(R.string.LABEL_PATH_KEY)); + etLabelPath.setTitle("Label path (SDCard: " + Utils.getSDCardDirectory() + ")"); + etScoreThreshold = (EditTextPreference) findPreference(getString(R.string.SCORE_THRESHOLD_KEY)); + lpEnableLiteFp16 = (ListPreference) findPreference(getString(R.string.ENABLE_LITE_FP16_MODE_KEY)); + } + + @SuppressLint("ApplySharedPref") + private void reloadSettingsAndUpdateUI() { + SharedPreferences sharedPreferences = getPreferenceScreen().getSharedPreferences(); + + String selected_model_dir = sharedPreferences.getString(getString(R.string.CHOOSE_PRE_INSTALLED_MODEL_KEY), + getString(R.string.OCR_MODEL_DIR_DEFAULT)); + int selected_model_idx = lpChoosePreInstalledModel.findIndexOfValue(selected_model_dir); + if (selected_model_idx >= 0 && selected_model_idx < preInstalledModelDirs.size() && selected_model_idx != selectedModelIdx) { + SharedPreferences.Editor editor = sharedPreferences.edit(); + editor.putString(getString(R.string.MODEL_DIR_KEY), preInstalledModelDirs.get(selected_model_idx)); + editor.putString(getString(R.string.LABEL_PATH_KEY), 
preInstalledLabelPaths.get(selected_model_idx)); + editor.putString(getString(R.string.CPU_THREAD_NUM_KEY), preInstalledCPUThreadNums.get(selected_model_idx)); + editor.putString(getString(R.string.CPU_POWER_MODE_KEY), preInstalledCPUPowerModes.get(selected_model_idx)); + editor.putString(getString(R.string.SCORE_THRESHOLD_KEY), preInstalledScoreThresholds.get(selected_model_idx)); + editor.putString(getString(R.string.ENABLE_LITE_FP16_MODE_DEFAULT), preInstalledEnableLiteFp16s.get(selected_model_idx)); + editor.commit(); + lpChoosePreInstalledModel.setSummary(selected_model_dir); + selectedModelIdx = selected_model_idx; + } + + String model_dir = sharedPreferences.getString(getString(R.string.MODEL_DIR_KEY), + getString(R.string.OCR_MODEL_DIR_DEFAULT)); + String label_path = sharedPreferences.getString(getString(R.string.LABEL_PATH_KEY), + getString(R.string.OCR_REC_LABEL_DEFAULT)); + String cpu_thread_num = sharedPreferences.getString(getString(R.string.CPU_THREAD_NUM_KEY), + getString(R.string.CPU_THREAD_NUM_DEFAULT)); + String cpu_power_mode = sharedPreferences.getString(getString(R.string.CPU_POWER_MODE_KEY), + getString(R.string.CPU_POWER_MODE_DEFAULT)); + String score_threshold = sharedPreferences.getString(getString(R.string.SCORE_THRESHOLD_KEY), + getString(R.string.SCORE_THRESHOLD_DEFAULT)); + String enable_lite_fp16 = sharedPreferences.getString(getString(R.string.ENABLE_LITE_FP16_MODE_KEY), + getString(R.string.ENABLE_LITE_FP16_MODE_DEFAULT)); + + etModelDir.setSummary(model_dir); + etLabelPath.setSummary(label_path); + lpCPUThreadNum.setValue(cpu_thread_num); + lpCPUThreadNum.setSummary(cpu_thread_num); + lpCPUPowerMode.setValue(cpu_power_mode); + lpCPUPowerMode.setSummary(cpu_power_mode); + etScoreThreshold.setSummary(score_threshold); + etScoreThreshold.setText(score_threshold); + lpEnableLiteFp16.setValue(enable_lite_fp16); + lpEnableLiteFp16.setSummary(enable_lite_fp16); + + } + + static boolean checkAndUpdateSettings(Context ctx) { + boolean settingsChanged = false; + SharedPreferences sharedPreferences = PreferenceManager.getDefaultSharedPreferences(ctx); + + String model_dir = sharedPreferences.getString(ctx.getString(R.string.MODEL_DIR_KEY), + ctx.getString(R.string.OCR_MODEL_DIR_DEFAULT)); + settingsChanged |= !modelDir.equalsIgnoreCase(model_dir); + modelDir = model_dir; + + String label_path = sharedPreferences.getString(ctx.getString(R.string.LABEL_PATH_KEY), + ctx.getString(R.string.OCR_REC_LABEL_DEFAULT)); + settingsChanged |= !labelPath.equalsIgnoreCase(label_path); + labelPath = label_path; + + String cpu_thread_num = sharedPreferences.getString(ctx.getString(R.string.CPU_THREAD_NUM_KEY), + ctx.getString(R.string.CPU_THREAD_NUM_DEFAULT)); + settingsChanged |= cpuThreadNum != Integer.parseInt(cpu_thread_num); + cpuThreadNum = Integer.parseInt(cpu_thread_num); + + String cpu_power_mode = sharedPreferences.getString(ctx.getString(R.string.CPU_POWER_MODE_KEY), + ctx.getString(R.string.CPU_POWER_MODE_DEFAULT)); + settingsChanged |= !cpuPowerMode.equalsIgnoreCase(cpu_power_mode); + cpuPowerMode = cpu_power_mode; + + String score_threshold = sharedPreferences.getString(ctx.getString(R.string.SCORE_THRESHOLD_KEY), + ctx.getString(R.string.SCORE_THRESHOLD_DEFAULT)); + settingsChanged |= scoreThreshold != Float.parseFloat(score_threshold); + scoreThreshold = Float.parseFloat(score_threshold); + + String enable_lite_fp16 = sharedPreferences.getString(ctx.getString(R.string.ENABLE_LITE_FP16_MODE_KEY), + ctx.getString(R.string.ENABLE_LITE_FP16_MODE_DEFAULT)); + 
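+        // Each comparison flips settingsChanged when a stored preference differs from the cached value;
+        // OcrMainActivity uses the returned flag to decide whether the predictor must be rebuilt.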
settingsChanged |= !enableLiteFp16.equalsIgnoreCase(enable_lite_fp16); + enableLiteFp16 = enable_lite_fp16; + + return settingsChanged; + } + + static void resetSettings() { + selectedModelIdx = -1; + modelDir = ""; + labelPath = ""; + cpuThreadNum = 2; + cpuPowerMode = ""; + scoreThreshold = 0.4f; + enableLiteFp16 = "true"; + } + + @Override + protected void onResume() { + super.onResume(); + getPreferenceScreen().getSharedPreferences().registerOnSharedPreferenceChangeListener(this); + reloadSettingsAndUpdateUI(); + } + + @Override + protected void onPause() { + super.onPause(); + getPreferenceScreen().getSharedPreferences().unregisterOnSharedPreferenceChangeListener(this); + } + + @Override + public void onSharedPreferenceChanged(SharedPreferences sharedPreferences, String key) { + reloadSettingsAndUpdateUI(); + } +} diff --git a/deploy/fastdeploy/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/ui/Utils.java b/deploy/fastdeploy/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/ui/Utils.java new file mode 100644 index 0000000000000000000000000000000000000000..eabeb74f463ac37e4cee7c81035b5e4f1da50ebf --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/ui/Utils.java @@ -0,0 +1,313 @@ +package com.baidu.paddle.fastdeploy.app.ui; + +import android.content.Context; +import android.content.res.Resources; +import android.database.Cursor; +import android.graphics.Bitmap; +import android.graphics.BitmapFactory; +import android.hardware.Camera; +import android.net.Uri; +import android.opengl.GLES20; +import android.os.Environment; +import android.provider.MediaStore; +import android.util.Log; +import android.view.Surface; +import android.view.WindowManager; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.OutputStream; +import java.util.ArrayList; +import java.util.List; + +public class Utils { + private static final String TAG = Utils.class.getSimpleName(); + + public static void RecursiveCreateDirectories(String fileDir) { + String[] fileDirs = fileDir.split("\\/"); + String topPath = ""; + for (int i = 0; i < fileDirs.length; i++) { + topPath += "/" + fileDirs[i]; + File file = new File(topPath); + if (file.exists()) { + continue; + } else { + file.mkdir(); + } + } + } + + public static void copyFileFromAssets(Context appCtx, String srcPath, String dstPath) { + if (srcPath.isEmpty() || dstPath.isEmpty()) { + return; + } + String dstDir = dstPath.substring(0, dstPath.lastIndexOf('/')); + if (dstDir.length() > 0) { + RecursiveCreateDirectories(dstDir); + } + InputStream is = null; + OutputStream os = null; + try { + is = new BufferedInputStream(appCtx.getAssets().open(srcPath)); + os = new BufferedOutputStream(new FileOutputStream(new File(dstPath))); + byte[] buffer = new byte[1024]; + int length = 0; + while ((length = is.read(buffer)) != -1) { + os.write(buffer, 0, length); + } + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } finally { + try { + os.close(); + is.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + + public static void copyDirectoryFromAssets(Context appCtx, String srcDir, String dstDir) { + if (srcDir.isEmpty() || dstDir.isEmpty()) { + 
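+            // Nothing to copy when either the source or the destination path is empty.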
return; + } + try { + if (!new File(dstDir).exists()) { + new File(dstDir).mkdirs(); + } + for (String fileName : appCtx.getAssets().list(srcDir)) { + String srcSubPath = srcDir + File.separator + fileName; + String dstSubPath = dstDir + File.separator + fileName; + if (new File(srcSubPath).isDirectory()) { + copyDirectoryFromAssets(appCtx, srcSubPath, dstSubPath); + } else { + copyFileFromAssets(appCtx, srcSubPath, dstSubPath); + } + } + } catch (Exception e) { + e.printStackTrace(); + } + } + + public static float[] parseFloatsFromString(String string, String delimiter) { + String[] pieces = string.trim().toLowerCase().split(delimiter); + float[] floats = new float[pieces.length]; + for (int i = 0; i < pieces.length; i++) { + floats[i] = Float.parseFloat(pieces[i].trim()); + } + return floats; + } + + public static long[] parseLongsFromString(String string, String delimiter) { + String[] pieces = string.trim().toLowerCase().split(delimiter); + long[] longs = new long[pieces.length]; + for (int i = 0; i < pieces.length; i++) { + longs[i] = Long.parseLong(pieces[i].trim()); + } + return longs; + } + + public static String getSDCardDirectory() { + return Environment.getExternalStorageDirectory().getAbsolutePath(); + } + + public static String getDCIMDirectory() { + return Environment.getExternalStoragePublicDirectory(Environment.DIRECTORY_DCIM).getAbsolutePath(); + } + + public static Camera.Size getOptimalPreviewSize(List sizes, int w, int h) { + final double ASPECT_TOLERANCE = 0.3; + double targetRatio = (double) w / h; + if (sizes == null) return null; + + Camera.Size optimalSize = null; + double minDiff = Double.MAX_VALUE; + + int targetHeight = h; + + // Try to find an size match aspect ratio and size + for (Camera.Size size : sizes) { + double ratio = (double) size.width / size.height; + if (Math.abs(ratio - targetRatio) > ASPECT_TOLERANCE) continue; + if (Math.abs(size.height - targetHeight) < minDiff) { + optimalSize = size; + minDiff = Math.abs(size.height - targetHeight); + } + } + + // Cannot find the one match the aspect ratio, ignore the requirement + if (optimalSize == null) { + minDiff = Double.MAX_VALUE; + for (Camera.Size size : sizes) { + if (Math.abs(size.height - targetHeight) < minDiff) { + optimalSize = size; + minDiff = Math.abs(size.height - targetHeight); + } + } + } + return optimalSize; + } + + public static int getScreenWidth() { + return Resources.getSystem().getDisplayMetrics().widthPixels; + } + + public static int getScreenHeight() { + return Resources.getSystem().getDisplayMetrics().heightPixels; + } + + public static int getCameraDisplayOrientation(Context context, int cameraId) { + Camera.CameraInfo info = new Camera.CameraInfo(); + Camera.getCameraInfo(cameraId, info); + WindowManager wm = (WindowManager) context.getSystemService(Context.WINDOW_SERVICE); + int rotation = wm.getDefaultDisplay().getRotation(); + int degrees = 0; + switch (rotation) { + case Surface.ROTATION_0: + degrees = 0; + break; + case Surface.ROTATION_90: + degrees = 90; + break; + case Surface.ROTATION_180: + degrees = 180; + break; + case Surface.ROTATION_270: + degrees = 270; + break; + } + int result; + if (info.facing == Camera.CameraInfo.CAMERA_FACING_FRONT) { + result = (info.orientation + degrees) % 360; + result = (360 - result) % 360; // compensate the mirror + } else { + // back-facing + result = (info.orientation - degrees + 360) % 360; + } + return result; + } + + public static int createShaderProgram(String vss, String fss) { + int vshader = 
GLES20.glCreateShader(GLES20.GL_VERTEX_SHADER); + GLES20.glShaderSource(vshader, vss); + GLES20.glCompileShader(vshader); + int[] status = new int[1]; + GLES20.glGetShaderiv(vshader, GLES20.GL_COMPILE_STATUS, status, 0); + if (status[0] == 0) { + Log.e(TAG, GLES20.glGetShaderInfoLog(vshader)); + GLES20.glDeleteShader(vshader); + vshader = 0; + return 0; + } + + int fshader = GLES20.glCreateShader(GLES20.GL_FRAGMENT_SHADER); + GLES20.glShaderSource(fshader, fss); + GLES20.glCompileShader(fshader); + GLES20.glGetShaderiv(fshader, GLES20.GL_COMPILE_STATUS, status, 0); + if (status[0] == 0) { + Log.e(TAG, GLES20.glGetShaderInfoLog(fshader)); + GLES20.glDeleteShader(vshader); + GLES20.glDeleteShader(fshader); + fshader = 0; + return 0; + } + + int program = GLES20.glCreateProgram(); + GLES20.glAttachShader(program, vshader); + GLES20.glAttachShader(program, fshader); + GLES20.glLinkProgram(program); + GLES20.glDeleteShader(vshader); + GLES20.glDeleteShader(fshader); + GLES20.glGetProgramiv(program, GLES20.GL_LINK_STATUS, status, 0); + if (status[0] == 0) { + Log.e(TAG, GLES20.glGetProgramInfoLog(program)); + program = 0; + return 0; + } + GLES20.glValidateProgram(program); + GLES20.glGetProgramiv(program, GLES20.GL_VALIDATE_STATUS, status, 0); + if (status[0] == 0) { + Log.e(TAG, GLES20.glGetProgramInfoLog(program)); + GLES20.glDeleteProgram(program); + program = 0; + return 0; + } + + return program; + } + + public static boolean isSupportedNPU() { + String hardware = android.os.Build.HARDWARE; + return hardware.equalsIgnoreCase("kirin810") || hardware.equalsIgnoreCase("kirin990"); + } + + public static Bitmap decodeBitmap(String path, int displayWidth, int displayHeight) { + BitmapFactory.Options op = new BitmapFactory.Options(); + op.inJustDecodeBounds = true;// Only the width and height information of Bitmap is read, not the pixels. + Bitmap bmp = BitmapFactory.decodeFile(path, op); // Get size information. + int wRatio = (int) Math.ceil(op.outWidth / (float) displayWidth);// Get Scale Size. + int hRatio = (int) Math.ceil(op.outHeight / (float) displayHeight); + // If the specified size is exceeded, reduce the corresponding scale. + if (wRatio > 1 && hRatio > 1) { + if (wRatio > hRatio) { + // If it is too wide, we will reduce the width to the required size. Note that the height will become smaller. + op.inSampleSize = wRatio; + } else { + op.inSampleSize = hRatio; + } + } + op.inJustDecodeBounds = false; + bmp = BitmapFactory.decodeFile(path, op); + // Create a Bitmap with a given width and height from the original Bitmap. 
+ return Bitmap.createScaledBitmap(bmp, displayWidth, displayHeight, true); + } + + public static String getRealPathFromURI(Context context, Uri contentURI) { + String result; + Cursor cursor = null; + try { + cursor = context.getContentResolver().query(contentURI, null, null, null, null); + } catch (Throwable e) { + e.printStackTrace(); + } + if (cursor == null) { + result = contentURI.getPath(); + } else { + cursor.moveToFirst(); + int idx = cursor.getColumnIndex(MediaStore.Images.ImageColumns.DATA); + result = cursor.getString(idx); + cursor.close(); + } + return result; + } + + public static List readTxt(String txtPath) { + File file = new File(txtPath); + if (file.isFile() && file.exists()) { + try { + FileInputStream fileInputStream = new FileInputStream(file); + InputStreamReader inputStreamReader = new InputStreamReader(fileInputStream); + BufferedReader bufferedReader = new BufferedReader(inputStreamReader); + String text; + List labels = new ArrayList<>(); + while ((text = bufferedReader.readLine()) != null) { + labels.add(text); + } + return labels; + } catch (Exception e) { + e.printStackTrace(); + } + } + return null; + } +} diff --git a/deploy/fastdeploy/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/ui/layout/ActionBarLayout.java b/deploy/fastdeploy/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/ui/layout/ActionBarLayout.java new file mode 100644 index 0000000000000000000000000000000000000000..099219fa9f677134ae58d3e695d9389b54ce9597 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/ui/layout/ActionBarLayout.java @@ -0,0 +1,33 @@ +package com.baidu.paddle.fastdeploy.app.ui.layout; + +import android.content.Context; +import android.graphics.Color; +import android.support.annotation.Nullable; +import android.util.AttributeSet; +import android.widget.RelativeLayout; + + +public class ActionBarLayout extends RelativeLayout { + private int layoutHeight = 150; + + public ActionBarLayout(Context context) { + super(context); + } + + public ActionBarLayout(Context context, @Nullable AttributeSet attrs) { + super(context, attrs); + } + + public ActionBarLayout(Context context, @Nullable AttributeSet attrs, int defStyleAttr) { + super(context, attrs, defStyleAttr); + } + + @Override + protected void onMeasure(int widthMeasureSpec, int heightMeasureSpec) { + super.onMeasure(widthMeasureSpec, heightMeasureSpec); + int width = MeasureSpec.getSize(widthMeasureSpec); + setMeasuredDimension(width, layoutHeight); + setBackgroundColor(Color.BLACK); + setAlpha(0.9f); + } +} \ No newline at end of file diff --git a/deploy/fastdeploy/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/ui/view/AppCompatPreferenceActivity.java b/deploy/fastdeploy/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/ui/view/AppCompatPreferenceActivity.java new file mode 100644 index 0000000000000000000000000000000000000000..c1a952dcff6873593c0d5e75dc909d9b3177b3d0 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/ui/view/AppCompatPreferenceActivity.java @@ -0,0 +1,111 @@ +package com.baidu.paddle.fastdeploy.app.ui.view; + +import android.content.res.Configuration; +import android.os.Bundle; +import android.preference.PreferenceActivity; +import android.support.annotation.LayoutRes; +import android.support.annotation.Nullable; +import android.support.v7.app.ActionBar; +import android.support.v7.app.AppCompatDelegate; +import android.support.v7.widget.Toolbar; +import android.view.MenuInflater; 
+import android.view.View; +import android.view.ViewGroup; + +/** + * A {@link PreferenceActivity} which implements and proxies the necessary calls + * to be used with AppCompat. + *

+ * This technique can be used with an {@link android.app.Activity} class, not just + * {@link PreferenceActivity}. + */ +public abstract class AppCompatPreferenceActivity extends PreferenceActivity { + private AppCompatDelegate mDelegate; + + @Override + protected void onCreate(Bundle savedInstanceState) { + getDelegate().installViewFactory(); + getDelegate().onCreate(savedInstanceState); + super.onCreate(savedInstanceState); + } + + @Override + protected void onPostCreate(Bundle savedInstanceState) { + super.onPostCreate(savedInstanceState); + getDelegate().onPostCreate(savedInstanceState); + } + + public ActionBar getSupportActionBar() { + return getDelegate().getSupportActionBar(); + } + + public void setSupportActionBar(@Nullable Toolbar toolbar) { + getDelegate().setSupportActionBar(toolbar); + } + + @Override + public MenuInflater getMenuInflater() { + return getDelegate().getMenuInflater(); + } + + @Override + public void setContentView(@LayoutRes int layoutResID) { + getDelegate().setContentView(layoutResID); + } + + @Override + public void setContentView(View view) { + getDelegate().setContentView(view); + } + + @Override + public void setContentView(View view, ViewGroup.LayoutParams params) { + getDelegate().setContentView(view, params); + } + + @Override + public void addContentView(View view, ViewGroup.LayoutParams params) { + getDelegate().addContentView(view, params); + } + + @Override + protected void onPostResume() { + super.onPostResume(); + getDelegate().onPostResume(); + } + + @Override + protected void onTitleChanged(CharSequence title, int color) { + super.onTitleChanged(title, color); + getDelegate().setTitle(title); + } + + @Override + public void onConfigurationChanged(Configuration newConfig) { + super.onConfigurationChanged(newConfig); + getDelegate().onConfigurationChanged(newConfig); + } + + @Override + protected void onStop() { + super.onStop(); + getDelegate().onStop(); + } + + @Override + protected void onDestroy() { + super.onDestroy(); + getDelegate().onDestroy(); + } + + public void invalidateOptionsMenu() { + getDelegate().invalidateOptionsMenu(); + } + + private AppCompatDelegate getDelegate() { + if (mDelegate == null) { + mDelegate = AppCompatDelegate.create(this, null); + } + return mDelegate; + } +} diff --git a/deploy/fastdeploy/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/ui/view/CameraSurfaceView.java b/deploy/fastdeploy/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/ui/view/CameraSurfaceView.java new file mode 100644 index 0000000000000000000000000000000000000000..e90874c627f671de2b7341334b92d872c7078bb6 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/ui/view/CameraSurfaceView.java @@ -0,0 +1,353 @@ +package com.baidu.paddle.fastdeploy.app.ui.view; + +import android.content.Context; +import android.graphics.Bitmap; +import android.graphics.SurfaceTexture; +import android.hardware.Camera; +import android.hardware.Camera.CameraInfo; +import android.hardware.Camera.Size; +import android.opengl.GLES11Ext; +import android.opengl.GLES20; +import android.opengl.GLSurfaceView; +import android.opengl.GLSurfaceView.Renderer; +import android.opengl.GLUtils; +import android.opengl.Matrix; +import android.os.SystemClock; +import android.util.AttributeSet; +import android.util.Log; + +import com.baidu.paddle.fastdeploy.app.ui.Utils; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.FloatBuffer; +import java.util.List; + +import 
javax.microedition.khronos.egl.EGLConfig; +import javax.microedition.khronos.opengles.GL10; + +public class CameraSurfaceView extends GLSurfaceView implements Renderer, + SurfaceTexture.OnFrameAvailableListener { + private static final String TAG = CameraSurfaceView.class.getSimpleName(); + + public static int EXPECTED_PREVIEW_WIDTH = 1280; // 1920 + public static int EXPECTED_PREVIEW_HEIGHT = 720; // 960 + + protected int numberOfCameras; + protected int selectedCameraId; + protected boolean disableCamera = false; + protected Camera camera; + + protected Context context; + protected SurfaceTexture surfaceTexture; + protected int surfaceWidth = 0; + protected int surfaceHeight = 0; + protected int textureWidth = 0; + protected int textureHeight = 0; + + protected Bitmap ARGB8888ImageBitmap; + protected boolean bitmapReleaseMode = true; + + // In order to manipulate the camera preview data and render the modified one + // to the screen, three textures are created and the data flow is shown as following: + // previewdata->camTextureId->fboTexureId->drawTexureId->framebuffer + protected int[] fbo = {0}; + protected int[] camTextureId = {0}; + protected int[] fboTexureId = {0}; + protected int[] drawTexureId = {0}; + + private final String vss = "" + + "attribute vec2 vPosition;\n" + + "attribute vec2 vTexCoord;\n" + "varying vec2 texCoord;\n" + + "void main() {\n" + " texCoord = vTexCoord;\n" + + " gl_Position = vec4 (vPosition.x, vPosition.y, 0.0, 1.0);\n" + + "}"; + + private final String fssCam2FBO = "" + + "#extension GL_OES_EGL_image_external : require\n" + + "precision mediump float;\n" + + "uniform samplerExternalOES sTexture;\n" + + "varying vec2 texCoord;\n" + + "void main() {\n" + + " gl_FragColor = texture2D(sTexture,texCoord);\n" + "}"; + + private final String fssTex2Screen = "" + + "precision mediump float;\n" + + "uniform sampler2D sTexture;\n" + + "varying vec2 texCoord;\n" + + "void main() {\n" + + " gl_FragColor = texture2D(sTexture,texCoord);\n" + "}"; + + private final float[] vertexCoords = { + -1, -1, + -1, 1, + 1, -1, + 1, 1}; + private float[] textureCoords = { + 0, 1, + 0, 0, + 1, 1, + 1, 0}; + + private FloatBuffer vertexCoordsBuffer; + private FloatBuffer textureCoordsBuffer; + + private int progCam2FBO = -1; + private int progTex2Screen = -1; + private int vcCam2FBO; + private int tcCam2FBO; + private int vcTex2Screen; + private int tcTex2Screen; + + public void setBitmapReleaseMode(boolean mode) { + synchronized (this) { + bitmapReleaseMode = mode; + } + } + + public Bitmap getBitmap() { + return ARGB8888ImageBitmap; // may null or recycled. 
+ } + + public interface OnTextureChangedListener { + boolean onTextureChanged(Bitmap ARGB8888ImageBitmap); + } + + private OnTextureChangedListener onTextureChangedListener = null; + + public void setOnTextureChangedListener(OnTextureChangedListener listener) { + onTextureChangedListener = listener; + } + + public CameraSurfaceView(Context ctx, AttributeSet attrs) { + super(ctx, attrs); + context = ctx; + setEGLContextClientVersion(2); + setRenderer(this); + setRenderMode(RENDERMODE_WHEN_DIRTY); + + // Find the total number of available cameras and the ID of the default camera + numberOfCameras = Camera.getNumberOfCameras(); + CameraInfo cameraInfo = new CameraInfo(); + for (int i = 0; i < numberOfCameras; i++) { + Camera.getCameraInfo(i, cameraInfo); + if (cameraInfo.facing == CameraInfo.CAMERA_FACING_BACK) { + selectedCameraId = i; + } + } + } + + @Override + public void onSurfaceCreated(GL10 gl, EGLConfig config) { + // Create OES texture for storing camera preview data(YUV format) + GLES20.glGenTextures(1, camTextureId, 0); + GLES20.glBindTexture(GLES11Ext.GL_TEXTURE_EXTERNAL_OES, camTextureId[0]); + GLES20.glTexParameteri(GLES11Ext.GL_TEXTURE_EXTERNAL_OES, GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE); + GLES20.glTexParameteri(GLES11Ext.GL_TEXTURE_EXTERNAL_OES, GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE); + GLES20.glTexParameteri(GLES11Ext.GL_TEXTURE_EXTERNAL_OES, GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_NEAREST); + GLES20.glTexParameteri(GLES11Ext.GL_TEXTURE_EXTERNAL_OES, GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_NEAREST); + surfaceTexture = new SurfaceTexture(camTextureId[0]); + surfaceTexture.setOnFrameAvailableListener(this); + + // Prepare vertex and texture coordinates + int bytes = vertexCoords.length * Float.SIZE / Byte.SIZE; + vertexCoordsBuffer = ByteBuffer.allocateDirect(bytes).order(ByteOrder.nativeOrder()).asFloatBuffer(); + textureCoordsBuffer = ByteBuffer.allocateDirect(bytes).order(ByteOrder.nativeOrder()).asFloatBuffer(); + vertexCoordsBuffer.put(vertexCoords).position(0); + textureCoordsBuffer.put(textureCoords).position(0); + + // Create vertex and fragment shaders + // camTextureId->fboTexureId + progCam2FBO = Utils.createShaderProgram(vss, fssCam2FBO); + vcCam2FBO = GLES20.glGetAttribLocation(progCam2FBO, "vPosition"); + tcCam2FBO = GLES20.glGetAttribLocation(progCam2FBO, "vTexCoord"); + GLES20.glEnableVertexAttribArray(vcCam2FBO); + GLES20.glEnableVertexAttribArray(tcCam2FBO); + // fboTexureId/drawTexureId -> screen + progTex2Screen = Utils.createShaderProgram(vss, fssTex2Screen); + vcTex2Screen = GLES20.glGetAttribLocation(progTex2Screen, "vPosition"); + tcTex2Screen = GLES20.glGetAttribLocation(progTex2Screen, "vTexCoord"); + GLES20.glEnableVertexAttribArray(vcTex2Screen); + GLES20.glEnableVertexAttribArray(tcTex2Screen); + } + + @Override + public void onSurfaceChanged(GL10 gl, int width, int height) { + surfaceWidth = width; + surfaceHeight = height; + openCamera(); + } + + @Override + public void onDrawFrame(GL10 gl) { + if (surfaceTexture == null) return; + + GLES20.glClearColor(0.0f, 0.0f, 0.0f, 1.0f); + GLES20.glClear(GLES20.GL_COLOR_BUFFER_BIT | GLES20.GL_DEPTH_BUFFER_BIT); + surfaceTexture.updateTexImage(); + float[] matrix = new float[16]; + surfaceTexture.getTransformMatrix(matrix); + + // camTextureId->fboTexureId + GLES20.glBindFramebuffer(GLES20.GL_FRAMEBUFFER, fbo[0]); + GLES20.glViewport(0, 0, textureWidth, textureHeight); + GLES20.glClear(GLES20.GL_COLOR_BUFFER_BIT); + GLES20.glUseProgram(progCam2FBO); + 
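+        // Upload the quad vertices and the SurfaceTexture-transformed texture coordinates,
+        // then draw the camera (OES) texture into the FBO.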
GLES20.glVertexAttribPointer(vcCam2FBO, 2, GLES20.GL_FLOAT, false, 4 * 2, vertexCoordsBuffer); + textureCoordsBuffer.clear(); + textureCoordsBuffer.put(transformTextureCoordinates(textureCoords, matrix)); + textureCoordsBuffer.position(0); + GLES20.glVertexAttribPointer(tcCam2FBO, 2, GLES20.GL_FLOAT, false, 4 * 2, textureCoordsBuffer); + GLES20.glActiveTexture(GLES20.GL_TEXTURE0); + GLES20.glBindTexture(GLES11Ext.GL_TEXTURE_EXTERNAL_OES, camTextureId[0]); + GLES20.glUniform1i(GLES20.glGetUniformLocation(progCam2FBO, "sTexture"), 0); + GLES20.glDrawArrays(GLES20.GL_TRIANGLE_STRIP, 0, 4); + GLES20.glFlush(); + + // Check if the draw texture is set + int targetTexureId = fboTexureId[0]; + if (onTextureChangedListener != null) { + // Read pixels of FBO to a bitmap + ByteBuffer pixelBuffer = ByteBuffer.allocate(textureWidth * textureHeight * 4); + GLES20.glReadPixels(0, 0, textureWidth, textureHeight, GLES20.GL_RGBA, GLES20.GL_UNSIGNED_BYTE, pixelBuffer); + + ARGB8888ImageBitmap = Bitmap.createBitmap(textureWidth, textureHeight, Bitmap.Config.ARGB_8888); + ARGB8888ImageBitmap.copyPixelsFromBuffer(pixelBuffer); + + boolean modified = onTextureChangedListener.onTextureChanged(ARGB8888ImageBitmap); + + if (modified) { + targetTexureId = drawTexureId[0]; + // Update a bitmap to the GL texture if modified + GLES20.glActiveTexture(targetTexureId); + // GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, targetTexureId); + GLES20.glBindTexture(GLES11Ext.GL_TEXTURE_EXTERNAL_OES, targetTexureId); + GLUtils.texImage2D(GL10.GL_TEXTURE_2D, 0, ARGB8888ImageBitmap, 0); + } + if (bitmapReleaseMode) { + ARGB8888ImageBitmap.recycle(); + } + } + + // fboTexureId/drawTexureId->Screen + GLES20.glBindFramebuffer(GLES20.GL_FRAMEBUFFER, 0); + GLES20.glViewport(0, 0, surfaceWidth, surfaceHeight); + GLES20.glClear(GLES20.GL_COLOR_BUFFER_BIT); + GLES20.glUseProgram(progTex2Screen); + GLES20.glVertexAttribPointer(vcTex2Screen, 2, GLES20.GL_FLOAT, false, 4 * 2, vertexCoordsBuffer); + textureCoordsBuffer.clear(); + textureCoordsBuffer.put(textureCoords); + textureCoordsBuffer.position(0); + GLES20.glVertexAttribPointer(tcTex2Screen, 2, GLES20.GL_FLOAT, false, 4 * 2, textureCoordsBuffer); + GLES20.glActiveTexture(GLES20.GL_TEXTURE0); + GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, targetTexureId); + GLES20.glUniform1i(GLES20.glGetUniformLocation(progTex2Screen, "sTexture"), 0); + GLES20.glDrawArrays(GLES20.GL_TRIANGLE_STRIP, 0, 4); + GLES20.glFlush(); + } + + private float[] transformTextureCoordinates(float[] coords, float[] matrix) { + float[] result = new float[coords.length]; + float[] vt = new float[4]; + for (int i = 0; i < coords.length; i += 2) { + float[] v = {coords[i], coords[i + 1], 0, 1}; + Matrix.multiplyMV(vt, 0, matrix, 0, v, 0); + result[i] = vt[0]; + result[i + 1] = vt[1]; + } + return result; + } + + @Override + public void onResume() { + super.onResume(); + } + + @Override + public void onPause() { + super.onPause(); + releaseCamera(); + } + + @Override + public void onFrameAvailable(SurfaceTexture surfaceTexture) { + requestRender(); + } + + public void disableCamera() { + disableCamera = true; + } + + public void enableCamera() { + disableCamera = false; + } + + public void switchCamera() { + releaseCamera(); + selectedCameraId = (selectedCameraId + 1) % numberOfCameras; + openCamera(); + } + + public void openCamera() { + if (disableCamera) return; + camera = Camera.open(selectedCameraId); + List supportedPreviewSizes = camera.getParameters().getSupportedPreviewSizes(); + Size previewSize = 
Utils.getOptimalPreviewSize(supportedPreviewSizes, EXPECTED_PREVIEW_WIDTH, + EXPECTED_PREVIEW_HEIGHT); + Camera.Parameters parameters = camera.getParameters(); + parameters.setPreviewSize(previewSize.width, previewSize.height); + if (parameters.getSupportedFocusModes().contains(Camera.Parameters.FOCUS_MODE_CONTINUOUS_VIDEO)) { + parameters.setFocusMode(Camera.Parameters.FOCUS_MODE_CONTINUOUS_VIDEO); + } + camera.setParameters(parameters); + int degree = Utils.getCameraDisplayOrientation(context, selectedCameraId); + camera.setDisplayOrientation(degree); + boolean rotate = degree == 90 || degree == 270; + textureWidth = rotate ? previewSize.height : previewSize.width; + textureHeight = rotate ? previewSize.width : previewSize.height; + // Destroy FBO and draw textures + GLES20.glBindFramebuffer(GLES20.GL_FRAMEBUFFER, 0); + GLES20.glDeleteFramebuffers(1, fbo, 0); + GLES20.glDeleteTextures(1, drawTexureId, 0); + GLES20.glDeleteTextures(1, fboTexureId, 0); + // Normal texture for storing modified camera preview data(RGBA format) + GLES20.glGenTextures(1, drawTexureId, 0); + GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, drawTexureId[0]); + GLES20.glTexImage2D(GLES20.GL_TEXTURE_2D, 0, GLES20.GL_RGBA, textureWidth, textureHeight, 0, + GLES20.GL_RGBA, GLES20.GL_UNSIGNED_BYTE, null); + GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE); + GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE); + GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_NEAREST); + GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_NEAREST); + // FBO texture for storing camera preview data(RGBA format) + GLES20.glGenTextures(1, fboTexureId, 0); + GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, fboTexureId[0]); + GLES20.glTexImage2D(GLES20.GL_TEXTURE_2D, 0, GLES20.GL_RGBA, textureWidth, textureHeight, 0, + GLES20.GL_RGBA, GLES20.GL_UNSIGNED_BYTE, null); + GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE); + GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE); + GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_NEAREST); + GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_NEAREST); + // Generate FBO and bind to FBO texture + GLES20.glGenFramebuffers(1, fbo, 0); + GLES20.glBindFramebuffer(GLES20.GL_FRAMEBUFFER, fbo[0]); + GLES20.glFramebufferTexture2D(GLES20.GL_FRAMEBUFFER, GLES20.GL_COLOR_ATTACHMENT0, GLES20.GL_TEXTURE_2D, + fboTexureId[0], 0); + try { + camera.setPreviewTexture(surfaceTexture); + } catch (IOException exception) { + Log.e(TAG, "IOException caused by setPreviewDisplay()", exception); + } + camera.startPreview(); + } + + public void releaseCamera() { + if (camera != null) { + camera.setPreviewCallback(null); + camera.stopPreview(); + camera.release(); + camera = null; + } + } +} diff --git a/deploy/fastdeploy/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/ui/view/ResultListView.java b/deploy/fastdeploy/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/ui/view/ResultListView.java new file mode 100644 index 0000000000000000000000000000000000000000..62b48a0547dca5c1dd80440918bb813811f35844 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/ui/view/ResultListView.java @@ -0,0 +1,43 @@ +package com.baidu.paddle.fastdeploy.app.ui.view; + +import 
android.content.Context; +import android.os.Handler; +import android.util.AttributeSet; +import android.widget.ListView; + +public class ResultListView extends ListView { + public ResultListView(Context context) { + super(context); + } + + public ResultListView(Context context, AttributeSet attrs) { + super(context, attrs); + } + + public ResultListView(Context context, AttributeSet attrs, int defStyleAttr) { + super(context, attrs, defStyleAttr); + } + + private Handler handler; + + public void setHandler(Handler mHandler) { + handler = mHandler; + } + + public void clear() { + handler.post(new Runnable() { + @Override + public void run() { + removeAllViewsInLayout(); + invalidate(); + } + }); + } + + @Override + protected void onMeasure(int widthMeasureSpec, int heightMeasureSpec) { + int expandSpec = MeasureSpec.makeMeasureSpec(Integer.MAX_VALUE >> 2, + MeasureSpec.AT_MOST); + super.onMeasure(widthMeasureSpec, expandSpec); + } +} diff --git a/deploy/fastdeploy/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/ui/view/adapter/BaseResultAdapter.java b/deploy/fastdeploy/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/ui/view/adapter/BaseResultAdapter.java new file mode 100644 index 0000000000000000000000000000000000000000..62747965adc25714bd35fa254c6fce1e6009fa0e --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/ui/view/adapter/BaseResultAdapter.java @@ -0,0 +1,48 @@ +package com.baidu.paddle.fastdeploy.app.ui.view.adapter; + +import android.content.Context; +import android.support.annotation.NonNull; +import android.support.annotation.Nullable; +import android.view.LayoutInflater; +import android.view.View; +import android.view.ViewGroup; +import android.widget.ArrayAdapter; +import android.widget.TextView; + +import com.baidu.paddle.fastdeploy.app.examples.R; +import com.baidu.paddle.fastdeploy.app.ui.view.model.BaseResultModel; + +import java.text.DecimalFormat; +import java.util.List; + +public class BaseResultAdapter extends ArrayAdapter { + private int resourceId; + + public BaseResultAdapter(@NonNull Context context, int resource) { + super(context, resource); + } + + public BaseResultAdapter(@NonNull Context context, int resource, @NonNull List objects) { + super(context, resource, objects); + resourceId = resource; + } + + @NonNull + @Override + public View getView(int position, @Nullable View convertView, @NonNull ViewGroup parent) { + BaseResultModel model = getItem(position); + View view = LayoutInflater.from(getContext()).inflate(resourceId, null); + TextView indexText = (TextView) view.findViewById(R.id.index); + TextView nameText = (TextView) view.findViewById(R.id.name); + TextView confidenceText = (TextView) view.findViewById(R.id.confidence); + indexText.setText(String.valueOf(model.getIndex())); + nameText.setText(String.valueOf(model.getName())); + confidenceText.setText(formatFloatString(model.getConfidence())); + return view; + } + + public static String formatFloatString(float number) { + DecimalFormat df = new DecimalFormat("0.00"); + return df.format(number); + } +} diff --git a/deploy/fastdeploy/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/ui/view/model/BaseResultModel.java b/deploy/fastdeploy/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/ui/view/model/BaseResultModel.java new file mode 100644 index 0000000000000000000000000000000000000000..cae71b6909db125894a2ce0da8ac3485dd48619f --- /dev/null +++ 
b/deploy/fastdeploy/android/app/src/main/java/com/baidu/paddle/fastdeploy/app/ui/view/model/BaseResultModel.java @@ -0,0 +1,41 @@ +package com.baidu.paddle.fastdeploy.app.ui.view.model; + +public class BaseResultModel { + private int index; + private String name; + private float confidence; + + public BaseResultModel() { + + } + + public BaseResultModel(int index, String name, float confidence) { + this.index = index; + this.name = name; + this.confidence = confidence; + } + + public float getConfidence() { + return confidence; + } + + public void setConfidence(float confidence) { + this.confidence = confidence; + } + + public int getIndex() { + return index; + } + + public void setIndex(int index) { + this.index = index; + } + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } +} diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable-v24/action_button_layer.xml b/deploy/fastdeploy/android/app/src/main/res/drawable-v24/action_button_layer.xml new file mode 100644 index 0000000000000000000000000000000000000000..a0d2e76bfa39dc7faa6cca58132ea6c0691c3f15 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/drawable-v24/action_button_layer.xml @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable-v24/album_btn.xml b/deploy/fastdeploy/android/app/src/main/res/drawable-v24/album_btn.xml new file mode 100644 index 0000000000000000000000000000000000000000..26d01c584185231af27b424b26de8b957a8f5c28 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/drawable-v24/album_btn.xml @@ -0,0 +1,7 @@ + + + + + + + \ No newline at end of file diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable-v24/ic_launcher_foreground.xml b/deploy/fastdeploy/android/app/src/main/res/drawable-v24/ic_launcher_foreground.xml new file mode 100644 index 0000000000000000000000000000000000000000..1f6bb290603d7caa16c5fb6f61bbfdc750622f5c --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/drawable-v24/ic_launcher_foreground.xml @@ -0,0 +1,34 @@ + + + + + + + + + + + diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable-v24/realtime_start_btn.xml b/deploy/fastdeploy/android/app/src/main/res/drawable-v24/realtime_start_btn.xml new file mode 100644 index 0000000000000000000000000000000000000000..664134453069f0353eb0e34893bb7d9b6efa8a78 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/drawable-v24/realtime_start_btn.xml @@ -0,0 +1,7 @@ + + + + + + + \ No newline at end of file diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable-v24/realtime_stop_btn.xml b/deploy/fastdeploy/android/app/src/main/res/drawable-v24/realtime_stop_btn.xml new file mode 100644 index 0000000000000000000000000000000000000000..8869a1b2bf0a73abee8438ee12ddda8ec1e8524f --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/drawable-v24/realtime_stop_btn.xml @@ -0,0 +1,7 @@ + + + + + + + \ No newline at end of file diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable-v24/result_page_border_section_bk.xml b/deploy/fastdeploy/android/app/src/main/res/drawable-v24/result_page_border_section_bk.xml new file mode 100644 index 0000000000000000000000000000000000000000..bd068f169f551e5f88942ed65c5dca83fc8a6033 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/drawable-v24/result_page_border_section_bk.xml @@ -0,0 +1,12 @@ + + + + + + + + + + \ No newline at end of file diff --git 
a/deploy/fastdeploy/android/app/src/main/res/drawable-v24/round_corner_btn.xml b/deploy/fastdeploy/android/app/src/main/res/drawable-v24/round_corner_btn.xml new file mode 100644 index 0000000000000000000000000000000000000000..c5dcc45d56375ae8bfad057aea837a1d34c6aac2 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/drawable-v24/round_corner_btn.xml @@ -0,0 +1,10 @@ + + + + + \ No newline at end of file diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable-v24/seekbar_progress_realtime.xml b/deploy/fastdeploy/android/app/src/main/res/drawable-v24/seekbar_progress_realtime.xml new file mode 100644 index 0000000000000000000000000000000000000000..b349d15a6aa37105a7ce2a1d09db4490ff715341 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/drawable-v24/seekbar_progress_realtime.xml @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + + diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable-v24/seekbar_progress_result.xml b/deploy/fastdeploy/android/app/src/main/res/drawable-v24/seekbar_progress_result.xml new file mode 100644 index 0000000000000000000000000000000000000000..17cb68ed80ccb203d76c20bf6be25cf3408f7a22 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/drawable-v24/seekbar_progress_result.xml @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + + + + diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable-v24/seekbar_thumb.xml b/deploy/fastdeploy/android/app/src/main/res/drawable-v24/seekbar_thumb.xml new file mode 100644 index 0000000000000000000000000000000000000000..96bd95e0a1736f5eb1bf574c041fd631a888f2b4 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/drawable-v24/seekbar_thumb.xml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable-v24/seekbar_thumb_shape.xml b/deploy/fastdeploy/android/app/src/main/res/drawable-v24/seekbar_thumb_shape.xml new file mode 100644 index 0000000000000000000000000000000000000000..26d033b6df27d3bdec275cb938914d5087d753ce --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/drawable-v24/seekbar_thumb_shape.xml @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable-v24/switch_side_btn.xml b/deploy/fastdeploy/android/app/src/main/res/drawable-v24/switch_side_btn.xml new file mode 100644 index 0000000000000000000000000000000000000000..b9b2edfb6a55a246302cbf7b67e6a8110ceebe54 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/drawable-v24/switch_side_btn.xml @@ -0,0 +1,7 @@ + + + + + + + \ No newline at end of file diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable-v24/take_picture_btn.xml b/deploy/fastdeploy/android/app/src/main/res/drawable-v24/take_picture_btn.xml new file mode 100644 index 0000000000000000000000000000000000000000..4966675c35cfae5b1514b6600ada79f855550a92 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/drawable-v24/take_picture_btn.xml @@ -0,0 +1,7 @@ + + + + + + + \ No newline at end of file diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/album.png b/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/album.png new file mode 100644 index 0000000000000000000000000000000000000000..3a6fdedaee3cce52cf376ecb9977ea750a6014df Binary files /dev/null and b/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/album.png differ diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/album_pressed.png 
b/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/album_pressed.png new file mode 100644 index 0000000000000000000000000000000000000000..aa873424ebb9921081bbb9618875fc410bf9c84d Binary files /dev/null and b/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/album_pressed.png differ diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/back_btn.png b/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/back_btn.png new file mode 100644 index 0000000000000000000000000000000000000000..ff121e85f5614dfd022f39627028af825a46d683 Binary files /dev/null and b/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/back_btn.png differ diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/more_menu.png b/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/more_menu.png new file mode 100644 index 0000000000000000000000000000000000000000..edf9f3ccced5afeb71d9516d93ea19f26c7d9984 Binary files /dev/null and b/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/more_menu.png differ diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/realtime_start.png b/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/realtime_start.png new file mode 100644 index 0000000000000000000000000000000000000000..94ab0817247bfa462d539237441cdc5795f1fdb0 Binary files /dev/null and b/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/realtime_start.png differ diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/realtime_start_pressed.png b/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/realtime_start_pressed.png new file mode 100644 index 0000000000000000000000000000000000000000..feef0fea62a15ab72af6556cae2811f9e5f1e3c5 Binary files /dev/null and b/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/realtime_start_pressed.png differ diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/realtime_stop.png b/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/realtime_stop.png new file mode 100644 index 0000000000000000000000000000000000000000..8c926367db6d1b66e1a2ef0cfe79c2eee2dbc789 Binary files /dev/null and b/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/realtime_stop.png differ diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/realtime_stop_pressed.png b/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/realtime_stop_pressed.png new file mode 100644 index 0000000000000000000000000000000000000000..309082788b0ca3b7686ded57f123e9e501110182 Binary files /dev/null and b/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/realtime_stop_pressed.png differ diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/scan_icon.png b/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/scan_icon.png new file mode 100644 index 0000000000000000000000000000000000000000..7517d99d09403cad513c22da492c43c8cde6c9e3 Binary files /dev/null and b/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/scan_icon.png differ diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/seekbar_handle.png b/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/seekbar_handle.png new file mode 100644 index 0000000000000000000000000000000000000000..55f5f73991da608090a5586e95158dfd31760609 Binary files /dev/null and b/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/seekbar_handle.png differ diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/seekbar_progress_dotted.png 
b/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/seekbar_progress_dotted.png new file mode 100644 index 0000000000000000000000000000000000000000..e6241d12e6e67c53f45d8955bdae0707e8c68683 Binary files /dev/null and b/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/seekbar_progress_dotted.png differ diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/seekbar_thumb_invisible.png b/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/seekbar_thumb_invisible.png new file mode 100644 index 0000000000000000000000000000000000000000..acfe8d374a41fdd2db428f9e5242c790fd0b3926 Binary files /dev/null and b/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/seekbar_thumb_invisible.png differ diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/switch_side.png b/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/switch_side.png new file mode 100644 index 0000000000000000000000000000000000000000..3e6ae9a9472b10d72aac63c4755d67ff33704f31 Binary files /dev/null and b/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/switch_side.png differ diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/switch_side_pressed.png b/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/switch_side_pressed.png new file mode 100644 index 0000000000000000000000000000000000000000..25e1522768f55c7ff7f8f4f6b12073b084dcb2ae Binary files /dev/null and b/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/switch_side_pressed.png differ diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/take_picture.png b/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/take_picture.png new file mode 100644 index 0000000000000000000000000000000000000000..d6ced986e82ce3eefe6e1f81fb662dc3797cb764 Binary files /dev/null and b/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/take_picture.png differ diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/take_picture_pressed.png b/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/take_picture_pressed.png new file mode 100644 index 0000000000000000000000000000000000000000..5f9c8ee3b51b5849d375136ee6fef178103d9738 Binary files /dev/null and b/deploy/fastdeploy/android/app/src/main/res/drawable-xhdpi/take_picture_pressed.png differ diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable-xxhdpi-v4/btn_switch_default.png b/deploy/fastdeploy/android/app/src/main/res/drawable-xxhdpi-v4/btn_switch_default.png new file mode 100644 index 0000000000000000000000000000000000000000..b9e66c7f605dd5a02d13f04284a046810b292add Binary files /dev/null and b/deploy/fastdeploy/android/app/src/main/res/drawable-xxhdpi-v4/btn_switch_default.png differ diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable-xxhdpi-v4/btn_switch_pressed.png b/deploy/fastdeploy/android/app/src/main/res/drawable-xxhdpi-v4/btn_switch_pressed.png new file mode 100644 index 0000000000000000000000000000000000000000..9544133bdade8f57552f9ab22976be3172c95b86 Binary files /dev/null and b/deploy/fastdeploy/android/app/src/main/res/drawable-xxhdpi-v4/btn_switch_pressed.png differ diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable/btn_settings.xml b/deploy/fastdeploy/android/app/src/main/res/drawable/btn_settings.xml new file mode 100644 index 0000000000000000000000000000000000000000..917897b99981d18082d18a87a4ad5176ad8e8f8d --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/drawable/btn_settings.xml @@ -0,0 +1,6 @@ + + + + + + diff --git 
a/deploy/fastdeploy/android/app/src/main/res/drawable/btn_settings_default.xml b/deploy/fastdeploy/android/app/src/main/res/drawable/btn_settings_default.xml new file mode 100644 index 0000000000000000000000000000000000000000..e19589a97e419249eaacd05f3d75deeeada3e128 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/drawable/btn_settings_default.xml @@ -0,0 +1,13 @@ + + + + diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable/btn_settings_pressed.xml b/deploy/fastdeploy/android/app/src/main/res/drawable/btn_settings_pressed.xml new file mode 100644 index 0000000000000000000000000000000000000000..c4af2a042de3a8ae00ab253f889a20dedffa4874 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/drawable/btn_settings_pressed.xml @@ -0,0 +1,13 @@ + + + + diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable/btn_shutter.xml b/deploy/fastdeploy/android/app/src/main/res/drawable/btn_shutter.xml new file mode 100644 index 0000000000000000000000000000000000000000..4f9826d3ae340b54046a48e4250a9d7e0b9d9139 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/drawable/btn_shutter.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable/btn_shutter_default.xml b/deploy/fastdeploy/android/app/src/main/res/drawable/btn_shutter_default.xml new file mode 100644 index 0000000000000000000000000000000000000000..234ca014a76b9647959814fa28e0c02324a8d814 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/drawable/btn_shutter_default.xml @@ -0,0 +1,17 @@ + + + + + diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable/btn_shutter_pressed.xml b/deploy/fastdeploy/android/app/src/main/res/drawable/btn_shutter_pressed.xml new file mode 100644 index 0000000000000000000000000000000000000000..accc7acedb91cc4fb8171d78eeba24eaa6b0c2db --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/drawable/btn_shutter_pressed.xml @@ -0,0 +1,17 @@ + + + + + diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable/btn_switch.xml b/deploy/fastdeploy/android/app/src/main/res/drawable/btn_switch.xml new file mode 100644 index 0000000000000000000000000000000000000000..691e8c2e97d7a65d580e4d12d6b77608083b5617 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/drawable/btn_switch.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/deploy/fastdeploy/android/app/src/main/res/drawable/ic_launcher_background.xml b/deploy/fastdeploy/android/app/src/main/res/drawable/ic_launcher_background.xml new file mode 100644 index 0000000000000000000000000000000000000000..0d025f9bf6b67c63044a36a9ff44fbc69e5c5822 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/drawable/ic_launcher_background.xml @@ -0,0 +1,170 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/fastdeploy/android/app/src/main/res/layout-land/ocr_activity_main.xml b/deploy/fastdeploy/android/app/src/main/res/layout-land/ocr_activity_main.xml new file mode 100644 index 0000000000000000000000000000000000000000..b30f35edf73786cd8d8b97db03f90567922647d9 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/layout-land/ocr_activity_main.xml @@ -0,0 +1,14 @@ + + + + + + + diff --git a/deploy/fastdeploy/android/app/src/main/res/layout/ocr_activity_main.xml b/deploy/fastdeploy/android/app/src/main/res/layout/ocr_activity_main.xml new file mode 100644 index 0000000000000000000000000000000000000000..b30f35edf73786cd8d8b97db03f90567922647d9 --- /dev/null +++ 
b/deploy/fastdeploy/android/app/src/main/res/layout/ocr_activity_main.xml @@ -0,0 +1,14 @@ + + + + + + + diff --git a/deploy/fastdeploy/android/app/src/main/res/layout/ocr_camera_page.xml b/deploy/fastdeploy/android/app/src/main/res/layout/ocr_camera_page.xml new file mode 100644 index 0000000000000000000000000000000000000000..6f31c2c7e4423867f4f96ede92ca1594f432ac58 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/layout/ocr_camera_page.xml @@ -0,0 +1,160 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/deploy/fastdeploy/android/app/src/main/res/layout/ocr_result_page.xml b/deploy/fastdeploy/android/app/src/main/res/layout/ocr_result_page.xml new file mode 100644 index 0000000000000000000000000000000000000000..958a85940147f5726208f6504bc3c94212939b95 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/layout/ocr_result_page.xml @@ -0,0 +1,160 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/deploy/fastdeploy/android/app/src/main/res/layout/ocr_result_page_item.xml b/deploy/fastdeploy/android/app/src/main/res/layout/ocr_result_page_item.xml new file mode 100644 index 0000000000000000000000000000000000000000..6a2b09ebff16c3398c0fe64dff2772c00ba6be53 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/layout/ocr_result_page_item.xml @@ -0,0 +1,26 @@ + + + + + + + + + \ No newline at end of file diff --git a/deploy/fastdeploy/android/app/src/main/res/mipmap-anydpi-v26/ic_launcher.xml b/deploy/fastdeploy/android/app/src/main/res/mipmap-anydpi-v26/ic_launcher.xml new file mode 100644 index 0000000000000000000000000000000000000000..eca70cfe52eac1ba66ba280a68ca7be8fcf88a16 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/mipmap-anydpi-v26/ic_launcher.xml @@ -0,0 +1,5 @@ + + + + + \ No newline at end of file diff --git a/deploy/fastdeploy/android/app/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml b/deploy/fastdeploy/android/app/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml new file mode 100644 index 0000000000000000000000000000000000000000..eca70cfe52eac1ba66ba280a68ca7be8fcf88a16 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml @@ -0,0 +1,5 @@ + + + + + \ No newline at end of file diff --git a/deploy/fastdeploy/android/app/src/main/res/mipmap-hdpi/ic_launcher.png b/deploy/fastdeploy/android/app/src/main/res/mipmap-hdpi/ic_launcher.png new file mode 100644 index 0000000000000000000000000000000000000000..898f3ed59ac9f3248734a00e5902736c9367d455 Binary files /dev/null and b/deploy/fastdeploy/android/app/src/main/res/mipmap-hdpi/ic_launcher.png differ diff --git a/deploy/fastdeploy/android/app/src/main/res/mipmap-hdpi/ic_launcher_round.png b/deploy/fastdeploy/android/app/src/main/res/mipmap-hdpi/ic_launcher_round.png new file mode 100644 index 0000000000000000000000000000000000000000..dffca3601eba7bf5f409bdd520820e2eb5122c75 Binary files /dev/null and b/deploy/fastdeploy/android/app/src/main/res/mipmap-hdpi/ic_launcher_round.png differ diff --git a/deploy/fastdeploy/android/app/src/main/res/mipmap-mdpi/ic_launcher.png b/deploy/fastdeploy/android/app/src/main/res/mipmap-mdpi/ic_launcher.png new file mode 100644 index 0000000000000000000000000000000000000000..64ba76f75e9ce021aa3d95c213491f73bcacb597 Binary files /dev/null and b/deploy/fastdeploy/android/app/src/main/res/mipmap-mdpi/ic_launcher.png differ diff 
--git a/deploy/fastdeploy/android/app/src/main/res/mipmap-mdpi/ic_launcher_round.png b/deploy/fastdeploy/android/app/src/main/res/mipmap-mdpi/ic_launcher_round.png new file mode 100644 index 0000000000000000000000000000000000000000..dae5e082342fcdeee5db8a6e0b27028e2d2808f5 Binary files /dev/null and b/deploy/fastdeploy/android/app/src/main/res/mipmap-mdpi/ic_launcher_round.png differ diff --git a/deploy/fastdeploy/android/app/src/main/res/mipmap-xhdpi/ic_launcher.png b/deploy/fastdeploy/android/app/src/main/res/mipmap-xhdpi/ic_launcher.png new file mode 100644 index 0000000000000000000000000000000000000000..e5ed46597ea8447d91ab1786a34e30f1c26b18bd Binary files /dev/null and b/deploy/fastdeploy/android/app/src/main/res/mipmap-xhdpi/ic_launcher.png differ diff --git a/deploy/fastdeploy/android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.png b/deploy/fastdeploy/android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.png new file mode 100644 index 0000000000000000000000000000000000000000..14ed0af35023e4f1901cf03487b6c524257b8483 Binary files /dev/null and b/deploy/fastdeploy/android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.png differ diff --git a/deploy/fastdeploy/android/app/src/main/res/mipmap-xxhdpi/ic_launcher.png b/deploy/fastdeploy/android/app/src/main/res/mipmap-xxhdpi/ic_launcher.png new file mode 100644 index 0000000000000000000000000000000000000000..b0907cac3bfd8fbfdc46e1108247f0a1055387ec Binary files /dev/null and b/deploy/fastdeploy/android/app/src/main/res/mipmap-xxhdpi/ic_launcher.png differ diff --git a/deploy/fastdeploy/android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.png b/deploy/fastdeploy/android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.png new file mode 100644 index 0000000000000000000000000000000000000000..d8ae03154975f397f8ed1b84f2d4bf9783ecfa26 Binary files /dev/null and b/deploy/fastdeploy/android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.png differ diff --git a/deploy/fastdeploy/android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.png b/deploy/fastdeploy/android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.png new file mode 100644 index 0000000000000000000000000000000000000000..2c18de9e66108411737e910f5c1972476f03ddbf Binary files /dev/null and b/deploy/fastdeploy/android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.png differ diff --git a/deploy/fastdeploy/android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.png b/deploy/fastdeploy/android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.png new file mode 100644 index 0000000000000000000000000000000000000000..beed3cdd2c32af5114a7dc70b9ef5b698eb8797e Binary files /dev/null and b/deploy/fastdeploy/android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.png differ diff --git a/deploy/fastdeploy/android/app/src/main/res/values/arrays.xml b/deploy/fastdeploy/android/app/src/main/res/values/arrays.xml new file mode 100644 index 0000000000000000000000000000000000000000..c7cf123788b49665435742d26fdb4dcc576c8a9a --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/values/arrays.xml @@ -0,0 +1,39 @@ + + + + 1 threads + 2 threads + 4 threads + 8 threads + + + 1 + 2 + 4 + 8 + + + HIGH(only big cores) + LOW(only LITTLE cores) + FULL(all cores) + NO_BIND(depends on system) + RAND_HIGH + RAND_LOW + + + LITE_POWER_HIGH + LITE_POWER_LOW + LITE_POWER_FULL + LITE_POWER_NO_BIND + LITE_POWER_RAND_HIGH + LITE_POWER_RAND_LOW + + + true + false + + + true + false + + \ No newline at end of file diff --git a/deploy/fastdeploy/android/app/src/main/res/values/colors.xml 
b/deploy/fastdeploy/android/app/src/main/res/values/colors.xml new file mode 100644 index 0000000000000000000000000000000000000000..f8ec1f0c3bca8b1b8cf4a82334fdd6ab18f35862 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/values/colors.xml @@ -0,0 +1,22 @@ + + + #008577 + #00574B + #D81B60 + #FF000000 + #00000000 + #00000000 + #FFFFFFFF + + #000000 + #3B85F5 + #F5A623 + #FFFFFF + + #EEEEEE + + #3B85F5 + #333333 + #E5E5E5 + #3b85f5 + diff --git a/deploy/fastdeploy/android/app/src/main/res/values/dimens.xml b/deploy/fastdeploy/android/app/src/main/res/values/dimens.xml new file mode 100644 index 0000000000000000000000000000000000000000..2df89499da7090787effe0b811af18a2612b0f4c --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/values/dimens.xml @@ -0,0 +1,17 @@ + + + 26dp + 36dp + 34dp + 60dp + 16dp + 67dp + 67dp + 56dp + 56dp + 46dp + 46dp + 32dp + 24dp + 16dp + diff --git a/deploy/fastdeploy/android/app/src/main/res/values/strings.xml b/deploy/fastdeploy/android/app/src/main/res/values/strings.xml new file mode 100644 index 0000000000000000000000000000000000000000..b5c396f5f781f3eee74272953c95bf7fd78ae369 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/values/strings.xml @@ -0,0 +1,51 @@ + + + EasyEdge + + EasyEdge + EasyEdge + EasyEdge + EasyEdge + EasyEdge + + CHOOSE_INSTALLED_MODEL_KEY + MODEL_DIR_KEY + LABEL_PATH_KEY + CPU_THREAD_NUM_KEY + CPU_POWER_MODE_KEY + SCORE_THRESHOLD_KEY + ENABLE_LITE_FP16_MODE_KEY + + 2 + LITE_POWER_HIGH + 0.4 + 0.1 + 0.25 + true + + + models/picodet_s_320_coco_lcnet + labels/coco_label_list.txt + + models + labels/ppocr_keys_v1.txt + + models/MobileNetV1_x0_25_infer + labels/imagenet1k_label_list.txt + + models/scrfd_500m_bnkps_shape320x320_pd + + models/human_pp_humansegv1_lite_192x192_inference_model + + 拍照识别 + 实时识别 + < + 模型名称 + 识别结果 + 序号 + 名称 + 置信度 + 阈值控制 + 重新识别 + 保存结果 + diff --git a/deploy/fastdeploy/android/app/src/main/res/values/styles.xml b/deploy/fastdeploy/android/app/src/main/res/values/styles.xml new file mode 100644 index 0000000000000000000000000000000000000000..67c147594487ee33165cb1c13d0cc8bc332671a9 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/values/styles.xml @@ -0,0 +1,70 @@ + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/fastdeploy/android/app/src/main/res/values/values.xml b/deploy/fastdeploy/android/app/src/main/res/values/values.xml new file mode 100644 index 0000000000000000000000000000000000000000..156146d9ad86481e7aaa245be39936fbaa1f765f --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/values/values.xml @@ -0,0 +1,17 @@ + + + 120dp + 46px + + 126px + 136px + + 46px + + 36px + + 15dp + + 15dp + + \ No newline at end of file diff --git a/deploy/fastdeploy/android/app/src/main/res/xml/ocr_settings.xml b/deploy/fastdeploy/android/app/src/main/res/xml/ocr_settings.xml new file mode 100644 index 0000000000000000000000000000000000000000..692b74b4cd21fe040ca6dd825040c07e5ecb2f67 --- /dev/null +++ b/deploy/fastdeploy/android/app/src/main/res/xml/ocr_settings.xml @@ -0,0 +1,45 @@ + + + + + + + + + + \ No newline at end of file diff --git a/deploy/fastdeploy/android/build.gradle b/deploy/fastdeploy/android/build.gradle new file mode 100644 index 0000000000000000000000000000000000000000..d8d678b3ffd56e367294f6c5fb7c4be25df22a7c --- /dev/null +++ b/deploy/fastdeploy/android/build.gradle @@ -0,0 +1,37 @@ +// Top-level build file where you can add configuration options common to all sub-projects/modules. 
+//plugins { +// id 'com.android.application' version '7.2.2' apply false +// id 'com.android.library' version '7.2.2' apply false +//} +// +//task clean(type: Delete) { +// delete rootProject.buildDir +//} + +buildscript { + repositories { + google() + jcenter() + // mavenCentral() + + } + dependencies { + classpath 'com.android.tools.build:gradle:7.2.2' + + // NOTE: Do not place your application dependencies here; they belong + // in the individual module build.gradle files + } +} + +allprojects { + repositories { + google() + jcenter() + // mavenCentral() + + } +} + +task clean(type: Delete) { + delete rootProject.buildDir +} diff --git a/deploy/fastdeploy/android/gradle.properties b/deploy/fastdeploy/android/gradle.properties new file mode 100644 index 0000000000000000000000000000000000000000..ae995d47ccd9199fa367c2566d87f18caf10b8e5 --- /dev/null +++ b/deploy/fastdeploy/android/gradle.properties @@ -0,0 +1,13 @@ +# Project-wide Gradle settings. +# IDE (e.g. Android Studio) users: +# Gradle settings configured through the IDE *will override* +# any settings specified in this file. +# For more details on how to configure your build environment visit +# http://www.gradle.org/docs/current/userguide/build_environment.html +# Specifies the JVM arguments used for the daemon process. +# The setting is particularly useful for tweaking memory settings. +org.gradle.jvmargs=-Xmx3096m +# When configured, Gradle will run in incubating parallel mode. +# This option should only be used with decoupled projects. More details, visit +# http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects +# org.gradle.parallel=true diff --git a/deploy/fastdeploy/android/gradle/wrapper/gradle-wrapper.jar b/deploy/fastdeploy/android/gradle/wrapper/gradle-wrapper.jar new file mode 100644 index 0000000000000000000000000000000000000000..e708b1c023ec8b20f512888fe07c5bd3ff77bb8f Binary files /dev/null and b/deploy/fastdeploy/android/gradle/wrapper/gradle-wrapper.jar differ diff --git a/deploy/fastdeploy/android/gradle/wrapper/gradle-wrapper.properties b/deploy/fastdeploy/android/gradle/wrapper/gradle-wrapper.properties new file mode 100644 index 0000000000000000000000000000000000000000..7855fafe4997690cd9fdc4db93d3b7491f7fb747 --- /dev/null +++ b/deploy/fastdeploy/android/gradle/wrapper/gradle-wrapper.properties @@ -0,0 +1,6 @@ +#Sat Oct 08 17:24:34 CST 2022 +distributionBase=GRADLE_USER_HOME +distributionUrl=https\://services.gradle.org/distributions/gradle-7.3.3-bin.zip +distributionPath=wrapper/dists +zipStorePath=wrapper/dists +zipStoreBase=GRADLE_USER_HOME diff --git a/deploy/fastdeploy/android/gradlew b/deploy/fastdeploy/android/gradlew new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/deploy/fastdeploy/android/gradlew.bat b/deploy/fastdeploy/android/gradlew.bat new file mode 100644 index 0000000000000000000000000000000000000000..107acd32c4e687021ef32db511e8a206129b88ec --- /dev/null +++ b/deploy/fastdeploy/android/gradlew.bat @@ -0,0 +1,89 @@ +@rem +@rem Copyright 2015 the original author or authors. +@rem +@rem Licensed under the Apache License, Version 2.0 (the "License"); +@rem you may not use this file except in compliance with the License. 
+@rem You may obtain a copy of the License at +@rem +@rem https://www.apache.org/licenses/LICENSE-2.0 +@rem +@rem Unless required by applicable law or agreed to in writing, software +@rem distributed under the License is distributed on an "AS IS" BASIS, +@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@rem See the License for the specific language governing permissions and +@rem limitations under the License. +@rem + +@if "%DEBUG%" == "" @echo off +@rem ########################################################################## +@rem +@rem Gradle startup script for Windows +@rem +@rem ########################################################################## + +@rem Set local scope for the variables with windows NT shell +if "%OS%"=="Windows_NT" setlocal + +set DIRNAME=%~dp0 +if "%DIRNAME%" == "" set DIRNAME=. +set APP_BASE_NAME=%~n0 +set APP_HOME=%DIRNAME% + +@rem Resolve any "." and ".." in APP_HOME to make it shorter. +for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi + +@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" + +@rem Find java.exe +if defined JAVA_HOME goto findJavaFromJavaHome + +set JAVA_EXE=java.exe +%JAVA_EXE% -version >NUL 2>&1 +if "%ERRORLEVEL%" == "0" goto execute + +echo. +echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:findJavaFromJavaHome +set JAVA_HOME=%JAVA_HOME:"=% +set JAVA_EXE=%JAVA_HOME%/bin/java.exe + +if exist "%JAVA_EXE%" goto execute + +echo. +echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:execute +@rem Setup the command line + +set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar + + +@rem Execute Gradle +"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* + +:end +@rem End local scope for the variables with windows NT shell +if "%ERRORLEVEL%"=="0" goto mainEnd + +:fail +rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of +rem the _cmd.exe /c_ return code! +if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 +exit /b 1 + +:mainEnd +if "%OS%"=="Windows_NT" endlocal + +:omega diff --git a/deploy/fastdeploy/android/local.properties b/deploy/fastdeploy/android/local.properties new file mode 100644 index 0000000000000000000000000000000000000000..aaa0de9aa3c1c41e9997edd9bc95a5aeba2fed62 --- /dev/null +++ b/deploy/fastdeploy/android/local.properties @@ -0,0 +1,8 @@ +## This file must *NOT* be checked into Version Control Systems, +# as it contains information specific to your local configuration. +# +# Location of the SDK. This is only used by Gradle. +# For customization when using a Version Control System, please read the +# header note. 
+#Tue Nov 29 18:47:20 CST 2022 +sdk.dir=D\:\\androidsdk diff --git a/deploy/fastdeploy/android/settings.gradle b/deploy/fastdeploy/android/settings.gradle new file mode 100644 index 0000000000000000000000000000000000000000..e7b4def49cb53d9aa04228dd3edb14c9e635e003 --- /dev/null +++ b/deploy/fastdeploy/android/settings.gradle @@ -0,0 +1 @@ +include ':app' diff --git a/deploy/fastdeploy/ascend/README.md b/deploy/fastdeploy/ascend/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3e13de3ef8bdbc98c530f6acce2d882a56210b6d --- /dev/null +++ b/deploy/fastdeploy/ascend/README.md @@ -0,0 +1,23 @@ +[English](README.md) | 简体中文 + +# PaddleOCR 模型在华为昇腾上部署方案-FastDeploy + +## 1. 说明 +PaddleOCR支持通过FastDeploy在华为昇腾上部署相关模型 + +## 2. 支持模型列表 + +下表中的模型下载链接由PaddleOCR模型库提供, 详见[PP-OCR系列模型列表](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6/doc/doc_ch/models_list.md) + +| PaddleOCR版本 | 文本框检测 | 方向分类模型 | 文字识别 |字典文件| 说明 | +|:----|:----|:----|:----|:----|:--------| +| ch_PP-OCRv3[推荐] |[ch_PP-OCRv3_det](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) | [ch_ppocr_mobile_v2.0_cls](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) | [ch_PP-OCRv3_rec](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) | [ppocr_keys_v1.txt](https://bj.bcebos.com/paddlehub/fastdeploy/ppocr_keys_v1.txt) | OCRv3系列原始超轻量模型,支持中英文、多语种文本检测 | +| en_PP-OCRv3[推荐] |[en_PP-OCRv3_det](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar) | [ch_ppocr_mobile_v2.0_cls](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) | [en_PP-OCRv3_rec](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) | [en_dict.txt](https://bj.bcebos.com/paddlehub/fastdeploy/en_dict.txt) | OCRv3系列原始超轻量模型,支持英文与数字识别,除检测模型和识别模型的训练数据与中文模型不同以外,无其他区别 | +| ch_PP-OCRv2 |[ch_PP-OCRv2_det](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) | [ch_ppocr_mobile_v2.0_cls](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) | [ch_PP-OCRv2_rec](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) | [ppocr_keys_v1.txt](https://bj.bcebos.com/paddlehub/fastdeploy/ppocr_keys_v1.txt) | OCRv2系列原始超轻量模型,支持中英文、多语种文本检测 | +| ch_PP-OCRv2_mobile |[ch_ppocr_mobile_v2.0_det](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) | [ch_ppocr_mobile_v2.0_cls](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) | [ch_ppocr_mobile_v2.0_rec](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) | [ppocr_keys_v1.txt](https://bj.bcebos.com/paddlehub/fastdeploy/ppocr_keys_v1.txt) | OCRv2系列原始超轻量模型,支持中英文、多语种文本检测,比PPOCRv2更加轻量 | +| ch_PP-OCRv2_server |[ch_ppocr_server_v2.0_det](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) | [ch_ppocr_mobile_v2.0_cls](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) | [ch_ppocr_server_v2.0_rec](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) |[ppocr_keys_v1.txt](https://bj.bcebos.com/paddlehub/fastdeploy/ppocr_keys_v1.txt) | OCRv2服务器系列模型, 支持中英文、多语种文本检测,比超轻量模型更大,但效果更好| + + +## 3. 
详细部署的部署示例 +- [Python部署](python) +- [C++部署](cpp) diff --git a/deploy/fastdeploy/ascend/cpp/CMakeLists.txt b/deploy/fastdeploy/ascend/cpp/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..93540a7e83e05228bcb38042a91166c858c95137 --- /dev/null +++ b/deploy/fastdeploy/ascend/cpp/CMakeLists.txt @@ -0,0 +1,14 @@ +PROJECT(infer_demo C CXX) +CMAKE_MINIMUM_REQUIRED (VERSION 3.10) + +# 指定下载解压后的fastdeploy库路径 +option(FASTDEPLOY_INSTALL_DIR "Path of downloaded fastdeploy sdk.") + +include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake) + +# 添加FastDeploy依赖头文件 +include_directories(${FASTDEPLOY_INCS}) + +add_executable(infer_demo ${PROJECT_SOURCE_DIR}/infer.cc) +# 添加FastDeploy库依赖 +target_link_libraries(infer_demo ${FASTDEPLOY_LIBS}) diff --git a/deploy/fastdeploy/ascend/cpp/README.md b/deploy/fastdeploy/ascend/cpp/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ed8d63a309074a4d1b6626eb8b0d7c7e5d443fbd --- /dev/null +++ b/deploy/fastdeploy/ascend/cpp/README.md @@ -0,0 +1,63 @@ +[English](README.md) | 简体中文 +# PP-OCRv3 Ascend C++部署示例 + +本目录下提供`infer.cc`, 供用户完成PP-OCRv3在华为昇腾AI处理器上的部署. + +## 1. 部署环境准备 +在部署前,需确认以下两个步骤 +- 1. 在部署前,需自行编译基于华为昇腾AI处理器的预测库,参考文档[华为昇腾AI处理器部署环境编译](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install#自行编译安装) +- 2. 部署时需要环境初始化, 请参考[如何使用C++在华为昇腾AI处理器部署](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/faq/use_sdk_on_ascend.md) + + +## 2.部署模型准备 +在部署前, 请准备好您所需要运行的推理模型, 您可以在[FastDeploy支持的PaddleOCR模型列表](../README.md)中下载所需模型. + +## 3.运行部署示例 +``` +# 下载部署示例代码 +git clone https://github.com/PaddlePaddle/FastDeploy.git +cd FastDeploy/examples/vision/ocr/PP-OCR/ascend/cpp + +# 如果您希望从PaddleOCR下载示例代码,请运行 +git clone https://github.com/PaddlePaddle/PaddleOCR.git +# 注意:如果当前分支找不到下面的fastdeploy测试代码,请切换到dygraph分支 +git checkout dygraph +cd PaddleOCR/deploy/fastdeploy/ascend/cpp + +mkdir build +cd build +# 使用编译完成的FastDeploy库编译infer_demo +cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-ascend +make -j + +# 下载PP-OCRv3文字检测模型 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar +tar -xvf ch_PP-OCRv3_det_infer.tar +# 下载文字方向分类器模型 +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar +tar -xvf ch_ppocr_mobile_v2.0_cls_infer.tar +# 下载PP-OCRv3文字识别模型 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar +tar -xvf ch_PP-OCRv3_rec_infer.tar + +# 下载预测图片与字典文件 +wget https://gitee.com/paddlepaddle/PaddleOCR/raw/release/2.6/doc/imgs/12.jpg +wget https://gitee.com/paddlepaddle/PaddleOCR/raw/release/2.6/ppocr/utils/ppocr_keys_v1.txt + +# 按照上文提供的文档完成环境初始化, 并执行以下命令 +./infer_demo ./ch_PP-OCRv3_det_infer ./ch_ppocr_mobile_v2.0_cls_infer ./ch_PP-OCRv3_rec_infer ./ppocr_keys_v1.txt ./12.jpg + +# NOTE:若用户需要连续地预测图片, 输入图片尺寸需要准备为统一尺寸, 例如 N 张, 尺寸为 A * B 的图片. +``` + +运行完成可视化结果如下图所示 + +

+ +## 4. 更多指南 +- [PP-OCR系列 C++ API查阅](https://www.paddlepaddle.org.cn/fastdeploy-api-doc/cpp/html/namespacefastdeploy_1_1vision_1_1ocr.html) +- [FastDeploy部署PaddleOCR模型概览](../../) +- [PP-OCRv3 Python部署](../python) +- 如果用户想要调整前后处理超参数、单独使用文字检测识别模型、使用其他模型等,更多详细文档与说明请参考[PP-OCR系列在CPU/GPU上的部署](../../cpu-gpu/python/README.md) diff --git a/deploy/fastdeploy/ascend/cpp/infer.cc b/deploy/fastdeploy/ascend/cpp/infer.cc new file mode 100644 index 0000000000000000000000000000000000000000..dc0a986707e7be784c075a35886fc6b2f4bab340 --- /dev/null +++ b/deploy/fastdeploy/ascend/cpp/infer.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision.h" +#ifdef WIN32 +const char sep = '\\'; +#else +const char sep = '/'; +#endif + +void AscendInfer(const std::string &det_model_dir, + const std::string &cls_model_dir, + const std::string &rec_model_dir, + const std::string &rec_label_file, + const std::string &image_file, + const fastdeploy::RuntimeOption &option) { + auto det_model_file = det_model_dir + sep + "inference.pdmodel"; + auto det_params_file = det_model_dir + sep + "inference.pdiparams"; + + auto cls_model_file = cls_model_dir + sep + "inference.pdmodel"; + auto cls_params_file = cls_model_dir + sep + "inference.pdiparams"; + + auto rec_model_file = rec_model_dir + sep + "inference.pdmodel"; + auto rec_params_file = rec_model_dir + sep + "inference.pdiparams"; + + fastdeploy::RuntimeOption option; + option.UseAscend(); + + auto det_option = option; + auto cls_option = option; + auto rec_option = option; + + auto det_model = fastdeploy::vision::ocr::DBDetector( + det_model_file, det_params_file, det_option); + auto cls_model = fastdeploy::vision::ocr::Classifier( + cls_model_file, cls_params_file, cls_option); + auto rec_model = fastdeploy::vision::ocr::Recognizer( + rec_model_file, rec_params_file, rec_label_file, rec_option); + + // When deploy on Ascend, rec model must enable static shape infer as below. + rec_model.GetPreprocessor().SetStaticShapeInfer(true); + + assert(det_model.Initialized()); + assert(cls_model.Initialized()); + assert(rec_model.Initialized()); + + // The classification model is optional, so the PP-OCR can also be connected + // in series as follows + // auto ppocr_v3 = fastdeploy::pipeline::PPOCRv3(&det_model, &rec_model); + auto ppocr_v3 = + fastdeploy::pipeline::PPOCRv3(&det_model, &cls_model, &rec_model); + + // When users enable static shape infer for rec model, the batch size of cls + // and rec model must to be set to 1. + ppocr_v3.SetClsBatchSize(1); + ppocr_v3.SetRecBatchSize(1); + + if (!ppocr_v3.Initialized()) { + std::cerr << "Failed to initialize PP-OCR." << std::endl; + return; + } + + auto im = cv::imread(image_file); + + fastdeploy::vision::OCRResult result; + if (!ppocr_v3.Predict(im, &result)) { + std::cerr << "Failed to predict." 
<< std::endl; + return; + } + + std::cout << result.Str() << std::endl; + + auto vis_im = fastdeploy::vision::VisOcr(im, result); + cv::imwrite("vis_result.jpg", vis_im); + std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl; +} + +int main(int argc, char *argv[]) { + if (argc < 6) { + std::cout << "Usage: infer_demo path/to/det_model path/to/cls_model " + "path/to/rec_model path/to/rec_label_file path/to/image " + "e.g ./infer_demo ./ch_PP-OCRv3_det_infer " + "./ch_ppocr_mobile_v2.0_cls_infer ./ch_PP-OCRv3_rec_infer " + "./ppocr_keys_v1.txt ./12.jpg" + << std::endl; + return -1; + } + + std::string det_model_dir = argv[1]; + std::string cls_model_dir = argv[2]; + std::string rec_model_dir = argv[3]; + std::string rec_label_file = argv[4]; + std::string test_image = argv[5]; + AscendInfer(det_model_dir, cls_model_dir, rec_model_dir, rec_label_file, + test_image); + return 0; +} diff --git a/deploy/fastdeploy/ascend/python/README.md b/deploy/fastdeploy/ascend/python/README.md new file mode 100644 index 0000000000000000000000000000000000000000..13a0fb64459c4d1bef77acad52049ff0a8d8665f --- /dev/null +++ b/deploy/fastdeploy/ascend/python/README.md @@ -0,0 +1,55 @@ +[English](README.md) | 简体中文 +# PP-OCRv3 Ascend Python部署示例 + +本目录下提供`infer.py`, 供用户完成PP-OCRv3在华为昇腾AI处理器上的部署. + +## 1. 部署环境准备 +在部署前,需自行编译基于华为昇腾AI处理器的FastDeploy python wheel包并安装,参考文档,参考文档[华为昇腾AI处理器部署环境编译](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install#自行编译安装) + +## 2.部署模型准备 +在部署前, 请准备好您所需要运行的推理模型, 您可以在[FastDeploy支持的PaddleOCR模型列表](../README.md)中下载所需模型. + +## 3.运行部署示例 +``` +# 下载部署示例代码 +git clone https://github.com/PaddlePaddle/FastDeploy.git +cd FastDeploy/examples/vision/ocr/PP-OCR/ascend/python + +# 如果您希望从PaddleOCR下载示例代码,请运行 +git clone https://github.com/PaddlePaddle/PaddleOCR.git +# 注意:如果当前分支找不到下面的fastdeploy测试代码,请切换到dygraph分支 +git checkout dygraph +cd PaddleOCR/deploy/fastdeploy/ascend/python + +# 下载PP-OCRv3文字检测模型 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar +tar -xvf ch_PP-OCRv3_det_infer.tar +# 下载文字方向分类器模型 +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar +tar -xvf ch_ppocr_mobile_v2.0_cls_infer.tar +# 下载PP-OCRv3文字识别模型 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar +tar -xvf ch_PP-OCRv3_rec_infer.tar + +# 下载预测图片与字典文件 +wget https://gitee.com/paddlepaddle/PaddleOCR/raw/release/2.6/doc/imgs/12.jpg +wget https://gitee.com/paddlepaddle/PaddleOCR/raw/release/2.6/ppocr/utils/ppocr_keys_v1.txt + +python infer.py --det_model ch_PP-OCRv3_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv3_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg +# NOTE:若用户需要连续地预测图片, 输入图片尺寸需要准备为统一尺寸, 例如 N 张, 尺寸为 A * B 的图片. +``` + +运行完成可视化结果如下图所示 + +
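Before the full script, here is a condensed sketch of the Ascend-specific settings that `infer.py` below applies: every sub-model gets its own `RuntimeOption` with `use_ascend()`, the recognizer preprocessor switches to static shape inference, and the classifier/recognizer batch sizes are pinned to 1, hence the NOTE above about preparing input images of one uniform size. Model paths follow the download commands above.

```python
import cv2
import fastdeploy as fd

# One RuntimeOption per sub-model, all targeting Ascend (mirrors build_option in infer.py).
det_opt, cls_opt, rec_opt = fd.RuntimeOption(), fd.RuntimeOption(), fd.RuntimeOption()
for opt in (det_opt, cls_opt, rec_opt):
    opt.use_ascend()

det = fd.vision.ocr.DBDetector(
    "ch_PP-OCRv3_det_infer/inference.pdmodel",
    "ch_PP-OCRv3_det_infer/inference.pdiparams",
    runtime_option=det_opt)
cls = fd.vision.ocr.Classifier(
    "ch_ppocr_mobile_v2.0_cls_infer/inference.pdmodel",
    "ch_ppocr_mobile_v2.0_cls_infer/inference.pdiparams",
    runtime_option=cls_opt)
rec = fd.vision.ocr.Recognizer(
    "ch_PP-OCRv3_rec_infer/inference.pdmodel",
    "ch_PP-OCRv3_rec_infer/inference.pdiparams",
    "ppocr_keys_v1.txt",
    runtime_option=rec_opt)

# Static shape inference is mandatory for the recognizer on Ascend ...
rec.preprocessor.static_shape_infer = True

ppocr_v3 = fd.vision.ocr.PPOCRv3(det_model=det, cls_model=cls, rec_model=rec)
# ... and with it the cls/rec batch sizes must be 1.
ppocr_v3.cls_batch_size = 1
ppocr_v3.rec_batch_size = 1

result = ppocr_v3.predict(cv2.imread("12.jpg"))
print(result)
```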
+ +## 4. 更多指南 +- [PP-OCR系列 Python API查阅](https://www.paddlepaddle.org.cn/fastdeploy-api-doc/python/html/ocr.html) +- [FastDeploy部署PaddleOCR模型概览](../../) +- [PP-OCRv3 C++部署](../cpp) +- 如果用户想要调整前后处理超参数、单独使用文字检测识别模型、使用其他模型等,更多详细文档与说明请参考[PP-OCR系列在CPU/GPU上的部署](../../cpu-gpu/python/README.md) + +## 5. 常见问题 +- [如何将视觉模型预测结果转为numpy格式](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/faq/vision_result_related_problems.md) diff --git a/deploy/fastdeploy/ascend/python/infer.py b/deploy/fastdeploy/ascend/python/infer.py new file mode 100755 index 0000000000000000000000000000000000000000..ceb28e0f7f5855b871a8619d0d920f8adb77b8bb --- /dev/null +++ b/deploy/fastdeploy/ascend/python/infer.py @@ -0,0 +1,103 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import fastdeploy as fd +import cv2 +import os + + +def parse_arguments(): + import argparse + import ast + parser = argparse.ArgumentParser() + parser.add_argument( + "--det_model", required=True, help="Path of Detection model of PPOCR.") + parser.add_argument( + "--cls_model", + required=True, + help="Path of Classification model of PPOCR.") + parser.add_argument( + "--rec_model", + required=True, + help="Path of Recognization model of PPOCR.") + parser.add_argument( + "--rec_label_file", + required=True, + help="Path of Recognization model of PPOCR.") + parser.add_argument( + "--image", type=str, required=True, help="Path of test image file.") + return parser.parse_args() + + +def build_option(args): + + det_option = fd.RuntimeOption() + cls_option = fd.RuntimeOption() + rec_option = fd.RuntimeOption() + + det_option.use_ascend() + cls_option.use_ascend() + rec_option.use_ascend() + + return det_option, cls_option, rec_option + + +args = parse_arguments() + +det_model_file = os.path.join(args.det_model, "inference.pdmodel") +det_params_file = os.path.join(args.det_model, "inference.pdiparams") + +cls_model_file = os.path.join(args.cls_model, "inference.pdmodel") +cls_params_file = os.path.join(args.cls_model, "inference.pdiparams") + +rec_model_file = os.path.join(args.rec_model, "inference.pdmodel") +rec_params_file = os.path.join(args.rec_model, "inference.pdiparams") +rec_label_file = args.rec_label_file + +det_option, cls_option, rec_option = build_option(args) + +det_model = fd.vision.ocr.DBDetector( + det_model_file, det_params_file, runtime_option=det_option) + +cls_model = fd.vision.ocr.Classifier( + cls_model_file, cls_params_file, runtime_option=cls_option) + +rec_model = fd.vision.ocr.Recognizer( + rec_model_file, rec_params_file, rec_label_file, runtime_option=rec_option) + +# Rec model enable static shape infer. +# When deploy on Ascend, it must be true. +rec_model.preprocessor.static_shape_infer = True + +# Create PP-OCRv3, if cls_model is not needed, +# just set cls_model=None . +ppocr_v3 = fd.vision.ocr.PPOCRv3( + det_model=det_model, cls_model=cls_model, rec_model=rec_model) + +# The batch size must be set to 1, when enable static shape infer. 
+ppocr_v3.cls_batch_size = 1 +ppocr_v3.rec_batch_size = 1 + +# Prepare image. +im = cv2.imread(args.image) + +# Print the results. +result = ppocr_v3.predict(im) + +print(result) + +# Visuliaze the output. +vis_im = fd.vision.vis_ppocr(im, result) +cv2.imwrite("visualized_result.jpg", vis_im) +print("Visualized result save in ./visualized_result.jpg") diff --git a/deploy/fastdeploy/cpu-gpu/README.md b/deploy/fastdeploy/cpu-gpu/README.md new file mode 100644 index 0000000000000000000000000000000000000000..69a8e3e7e9f0aef831e4cfa803b33ddafb0d0ed2 --- /dev/null +++ b/deploy/fastdeploy/cpu-gpu/README.md @@ -0,0 +1,26 @@ +[English](README.md) | 简体中文 + +# PaddleOCR 模型在CPU与GPU上的部署方案-FastDeploy + +## 1. 说明 +PaddleOCR支持通过FastDeploy在NVIDIA GPU、X86 CPU、飞腾CPU、ARM CPU、Intel GPU(独立显卡/集成显卡)硬件上快速部署PaddleOCR系列模型 + +## 2. 支持的PaddleOCR推理模型 + +下表中的推理模型为FastDeploy测试过的模型, 下载链接由PaddleOCR模型库提供, +更多的模型, 详见[PP-OCR系列模型列表](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6/doc/doc_ch/models_list.md), 欢迎用户尝试. + +| PaddleOCR版本 | 文本框检测 | 方向分类模型 | 文字识别 |字典文件| 说明 | +|:----|:----|:----|:----|:----|:--------| +| ch_PP-OCRv3[推荐] |[ch_PP-OCRv3_det](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) | [ch_ppocr_mobile_v2.0_cls](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) | [ch_PP-OCRv3_rec](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) | [ppocr_keys_v1.txt](https://bj.bcebos.com/paddlehub/fastdeploy/ppocr_keys_v1.txt) | OCRv3系列原始超轻量模型,支持中英文、多语种文本检测 | +| en_PP-OCRv3[推荐] |[en_PP-OCRv3_det](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar) | [ch_ppocr_mobile_v2.0_cls](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) | [en_PP-OCRv3_rec](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) | [en_dict.txt](https://bj.bcebos.com/paddlehub/fastdeploy/en_dict.txt) | OCRv3系列原始超轻量模型,支持英文与数字识别,除检测模型和识别模型的训练数据与中文模型不同以外,无其他区别 | +| ch_PP-OCRv2 |[ch_PP-OCRv2_det](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) | [ch_ppocr_mobile_v2.0_cls](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) | [ch_PP-OCRv2_rec](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) | [ppocr_keys_v1.txt](https://bj.bcebos.com/paddlehub/fastdeploy/ppocr_keys_v1.txt) | OCRv2系列原始超轻量模型,支持中英文、多语种文本检测 | +| ch_PP-OCRv2_mobile |[ch_ppocr_mobile_v2.0_det](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) | [ch_ppocr_mobile_v2.0_cls](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) | [ch_ppocr_mobile_v2.0_rec](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) | [ppocr_keys_v1.txt](https://bj.bcebos.com/paddlehub/fastdeploy/ppocr_keys_v1.txt) | OCRv2系列原始超轻量模型,支持中英文、多语种文本检测,比PPOCRv2更加轻量 | +| ch_PP-OCRv2_server |[ch_ppocr_server_v2.0_det](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) | [ch_ppocr_mobile_v2.0_cls](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) | [ch_ppocr_server_v2.0_rec](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) |[ppocr_keys_v1.txt](https://bj.bcebos.com/paddlehub/fastdeploy/ppocr_keys_v1.txt) | OCRv2服务器系列模型, 支持中英文、多语种文本检测,比超轻量模型更大,但效果更好| + + +## 3. 
详细部署的部署示例 +- [Python部署](python) +- [C++部署](cpp) +- [C部署](c) +- [C#部署](csharp) diff --git a/deploy/fastdeploy/cpu-gpu/c/CMakeLists.txt b/deploy/fastdeploy/cpu-gpu/c/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..b228346da862604d54a0e11ac98512395ffde2da --- /dev/null +++ b/deploy/fastdeploy/cpu-gpu/c/CMakeLists.txt @@ -0,0 +1,13 @@ +PROJECT(infer_demo C) +CMAKE_MINIMUM_REQUIRED (VERSION 3.10) + +# 指定下载解压后的fastdeploy库路径 +option(FASTDEPLOY_INSTALL_DIR "Path of downloaded fastdeploy sdk.") + +include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake) + +# 添加FastDeploy依赖头文件 +include_directories(${FASTDEPLOY_INCS}) + +add_executable(infer_demo ${PROJECT_SOURCE_DIR}/infer.c) +target_link_libraries(infer_demo ${FASTDEPLOY_LIBS}) diff --git a/deploy/fastdeploy/cpu-gpu/c/README.md b/deploy/fastdeploy/cpu-gpu/c/README.md new file mode 100755 index 0000000000000000000000000000000000000000..7c5863773005a9119fda427c1edc2b798265b0ca --- /dev/null +++ b/deploy/fastdeploy/cpu-gpu/c/README.md @@ -0,0 +1,263 @@ +[English](README.md) | 简体中文 +# PaddleOCR CPU-GPU C部署示例 + +本目录下提供`infer.c`来调用C API快速完成PP-OCRv3模型在CPU/GPU上部署的示例。 + +## 1. 说明 +PaddleOCR支持利用FastDeploy在NVIDIA GPU、X86 CPU、飞腾CPU、ARM CPU、Intel GPU(独立显卡/集成显卡)硬件上快速部署OCR模型. + +## 2. 部署环境准备 +在部署前,需确认软硬件环境,同时下载预编译部署库,参考[FastDeploy安装文档](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install#FastDeploy预编译库安装)安装FastDeploy预编译库. +以Linux上推理为例,在本目录执行如下命令即可完成编译测试,支持此模型需保证FastDeploy版本1.0.4以上(x.x.x>=1.0.4) + +## 3. 部署模型准备 +在部署前, 请准备好您所需要运行的推理模型, 您可以在[FastDeploy支持的PaddleOCR模型列表](../README.md)中下载所需模型. + +## 4.运行部署示例 +```bash +# 下载部署示例代码 +git clone https://github.com/PaddlePaddle/FastDeploy.git +cd FastDeploy/examples/vision/ocr/PP-OCR/cpu-gpu/c + +# 如果您希望从PaddleOCR下载示例代码,请运行 +git clone https://github.com/PaddlePaddle/PaddleOCR.git +# 注意:如果当前分支找不到下面的fastdeploy测试代码,请切换到dygraph分支 +git checkout dygraph +cd PaddleOCR/deploy/fastdeploy/cpu-gpu/c + +mkdir build +cd build + +# 下载FastDeploy预编译库,用户可在上文提到的`FastDeploy预编译库`中自行选择合适的版本使用 +wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz + +# 编译Demo +tar xvf fastdeploy-linux-x64-x.x.x.tgz +cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x +make -j + +# 下载PP-OCRv3文字检测模型 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar +tar -xvf ch_PP-OCRv3_det_infer.tar +# 下载文字方向分类器模型 +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar +tar -xvf ch_ppocr_mobile_v2.0_cls_infer.tar +# 下载PP-OCRv3文字识别模型 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar +tar -xvf ch_PP-OCRv3_rec_infer.tar + +# 下载预测图片与字典文件 +wget https://gitee.com/paddlepaddle/PaddleOCR/raw/release/2.6/doc/imgs/12.jpg +wget https://gitee.com/paddlepaddle/PaddleOCR/raw/release/2.6/ppocr/utils/ppocr_keys_v1.txt + +# 在CPU上使用Paddle Inference推理 +./infer_demo ./ch_PP-OCRv3_det_infer ./ch_ppocr_mobile_v2.0_cls_infer ./ch_PP-OCRv3_rec_infer ./ppocr_keys_v1.txt ./12.jpg 0 +# 在GPU上使用Paddle Inference推理 +./infer_demo ./ch_PP-OCRv3_det_infer ./ch_ppocr_mobile_v2.0_cls_infer ./ch_PP-OCRv3_rec_infer ./ppocr_keys_v1.txt ./12.jpg 1 +``` +以上命令只适用于Linux或MacOS, Windows下SDK的使用方式请参考: +- [如何在Windows中使用FastDeploy C++ SDK](../../../../../docs/cn/faq/use_sdk_on_windows.md) + + +运行完成可视化结果如下图所示 + + + +## 5. PP-OCRv3 C API接口简介 +下面提供了PP-OCRv3的C API简介 + +- 如果用户想要更换部署后端或进行其他定制化操作, 请查看[C Runtime API](https://baidu-paddle.github.io/fastdeploy-api/c/html/runtime__option_8h.html). 
+- 更多 PP-OCR C API 请查看 [C PP-OCR API](https://github.com/PaddlePaddle/FastDeploy/blob/develop/c_api/fastdeploy_capi/vision/ocr/ppocr/model.h) + +### 配置 + +```c +FD_C_RuntimeOptionWrapper* FD_C_CreateRuntimeOptionWrapper() +``` + +> 创建一个RuntimeOption的配置对象,并且返回操作它的指针。 +> +> **返回** +> +> * **fd_c_runtime_option_wrapper**(FD_C_RuntimeOptionWrapper*): 指向RuntimeOption对象的指针 + + +```c +void FD_C_RuntimeOptionWrapperUseCpu( + FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper) +``` + +> 开启CPU推理 +> +> **参数** +> +> * **fd_c_runtime_option_wrapper**(FD_C_RuntimeOptionWrapper*): 指向RuntimeOption对象的指针 + +```c +void FD_C_RuntimeOptionWrapperUseGpu( + FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + int gpu_id) +``` +> 开启GPU推理 +> +> **参数** +> +> * **fd_c_runtime_option_wrapper**(FD_C_RuntimeOptionWrapper*): 指向RuntimeOption对象的指针 +> * **gpu_id**(int): 显卡号 + + +### 模型 + +```c +FD_C_DBDetectorWrapper* FD_C_CreateDBDetectorWrapper( + const char* model_file, const char* params_file, + FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + const FD_C_ModelFormat model_format +) +``` + +> 创建一个DBDetector的模型,并且返回操作它的指针。 +> +> **参数** +> +> * **model_file**(const char*): 模型文件路径 +> * **params_file**(const char*): 参数文件路径 +> * **fd_c_runtime_option_wrapper**(FD_C_RuntimeOptionWrapper*): 指向RuntimeOption的指针,表示后端推理配置 +> * **model_format**(FD_C_ModelFormat): 模型格式 +> +> **返回** +> * **fd_c_dbdetector_wrapper**(FD_C_DBDetectorWrapper*): 指向DBDetector模型对象的指针 + +```c +FD_C_ClassifierWrapper* FD_C_CreateClassifierWrapper( + const char* model_file, const char* params_file, + FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + const FD_C_ModelFormat model_format +) +``` +> 创建一个Classifier的模型,并且返回操作它的指针。 +> +> **参数** +> +> * **model_file**(const char*): 模型文件路径 +> * **params_file**(const char*): 参数文件路径 +> * **fd_c_runtime_option_wrapper**(FD_C_RuntimeOptionWrapper*): 指向RuntimeOption的指针,表示后端推理配置 +> * **model_format**(FD_C_ModelFormat): 模型格式 +> +> **返回** +> +> * **fd_c_classifier_wrapper**(FD_C_ClassifierWrapper*): 指向Classifier模型对象的指针 + +```c +FD_C_RecognizerWrapper* FD_C_CreateRecognizerWrapper( + const char* model_file, const char* params_file, const char* label_path, + FD_C_RuntimeOptionWrapper* fd_c_runtime_option_wrapper, + const FD_C_ModelFormat model_format +) +``` +> 创建一个Recognizer的模型,并且返回操作它的指针。 +> +> **参数** +> +> * **model_file**(const char*): 模型文件路径 +> * **params_file**(const char*): 参数文件路径 +> * **label_path**(const char*): 标签文件路径 +> * **fd_c_runtime_option_wrapper**(FD_C_RuntimeOptionWrapper*): 指向RuntimeOption的指针,表示后端推理配置 +> * **model_format**(FD_C_ModelFormat): 模型格式 +> +> **返回** +> * **fd_c_recognizer_wrapper**(FD_C_RecognizerWrapper*): 指向Recognizer模型对象的指针 + +```c +FD_C_PPOCRv3Wrapper* FD_C_CreatePPOCRv3Wrapper( + FD_C_DBDetectorWrapper* det_model, + FD_C_ClassifierWrapper* cls_model, + FD_C_RecognizerWrapper* rec_model +) +``` +> 创建一个PP-OCRv3的模型,并且返回操作它的指针。 +> +> **参数** +> +> * **det_model**(FD_C_DBDetectorWrapper*): DBDetector模型 +> * **cls_model**(FD_C_ClassifierWrapper*): Classifier模型 +> * **rec_model**(FD_C_RecognizerWrapper*): Recognizer模型 +> +> **返回** +> +> * **fd_c_ppocrv3_wrapper**(FD_C_PPOCRv3Wrapper*): 指向PP-OCRv3模型对象的指针 + + + +### 读写图像 + +```c +FD_C_Mat FD_C_Imread(const char* imgpath) +``` + +> 读取一个图像,并且返回cv::Mat的指针。 +> +> **参数** +> +> * **imgpath**(const char*): 图像文件路径 +> +> **返回** +> +> * **imgmat**(FD_C_Mat): 指向图像数据cv::Mat的指针。 + + +```c +FD_C_Bool FD_C_Imwrite(const char* savepath, FD_C_Mat img); +``` + +> 将图像写入文件中。 +> +> **参数** +> +> * **savepath**(const char*): 保存图像的路径 +> * 
**img**(FD_C_Mat): 指向图像数据的指针 +> +> **返回** +> +> * **result**(FD_C_Bool): 表示操作是否成功 + + +### Predict函数 + +```c +FD_C_Bool FD_C_PPOCRv3WrapperPredict( + FD_C_PPOCRv3Wrapper* fd_c_ppocrv3_wrapper, + FD_C_Mat img, + FD_C_OCRResult* result) +``` +> +> 模型预测接口,输入图像直接并生成结果。 +> +> **参数** +> * **fd_c_ppocrv3_wrapper**(FD_C_PPOCRv3Wrapper*): 指向PP-OCRv3模型的指针 +> * **img**(FD_C_Mat): 输入图像的指针,指向cv::Mat对象,可以调用FD_C_Imread读取图像获取 +> * **result**(FD_C_OCRResult*): OCR预测结果,包括由检测模型输出的检测框位置,分类模型输出的方向分类,以及识别模型输出的识别结果, OCRResult说明参考[视觉模型预测结果](../../../../../docs/api/vision_results/) + + +### Predict结果 + +```c +FD_C_Mat FD_C_VisOcr(FD_C_Mat im, FD_C_OCRResult* ocr_result) +``` +> +> 对结果进行可视化,返回可视化的图像。 +> +> **参数** +> * **im**(FD_C_Mat): 指向输入图像的指针 +> * **ocr_result**(FD_C_OCRResult*): 指向 FD_C_OCRResult结构的指针 +> +> **返回** +> * **vis_im**(FD_C_Mat): 指向可视化图像的指针 + + +## 6. 其它文档 + +- [FastDeploy部署PaddleOCR模型概览](../../) +- [PP-OCRv3 Python部署](../python) +- [PP-OCRv3 C++ 部署](../cpp) +- [PP-OCRv3 C# 部署](../csharp) diff --git a/deploy/fastdeploy/cpu-gpu/c/infer.c b/deploy/fastdeploy/cpu-gpu/c/infer.c new file mode 100644 index 0000000000000000000000000000000000000000..62bbc2d00246d0ed8ab5acbae15ebc7917c0270a --- /dev/null +++ b/deploy/fastdeploy/cpu-gpu/c/infer.c @@ -0,0 +1,249 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
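// infer.c -- PP-OCRv3 demo built on the FastDeploy C API.
// CpuInfer() and GpuInfer() differ only in the FD_C_RuntimeOptionWrapperUseCpu /
// FD_C_RuntimeOptionWrapperUseGpu call; both follow the same sequence:
//   1. build the inference.pdmodel / inference.pdiparams paths for det, cls and rec;
//   2. create one FD_C_RuntimeOptionWrapper per sub-model and the
//      DBDetector / Classifier / Recognizer wrappers (Paddle model format);
//   3. compose them into an FD_C_PPOCRv3Wrapper, read the image, call
//      FD_C_PPOCRv3WrapperPredict, print the result and save the visualization;
//   4. destroy every wrapper, the FD_C_OCRResult and the FD_C_Mat objects on all paths.
// Usage (see main): infer_demo det_dir cls_dir rec_dir label_file image run_option,
// where run_option 0 = CPU and 1 = GPU.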
+ +#include +#include + +#include "fastdeploy_capi/vision.h" + +#ifdef WIN32 +const char sep = '\\'; +#else +const char sep = '/'; +#endif + +void CpuInfer(const char *det_model_dir, const char *cls_model_dir, + const char *rec_model_dir, const char *rec_label_file, + const char *image_file) { + char det_model_file[100]; + char det_params_file[100]; + + char cls_model_file[100]; + char cls_params_file[100]; + + char rec_model_file[100]; + char rec_params_file[100]; + + int max_size = 99; + snprintf(det_model_file, max_size, "%s%c%s", det_model_dir, sep, + "inference.pdmodel"); + snprintf(det_params_file, max_size, "%s%c%s", det_model_dir, sep, + "inference.pdiparams"); + + snprintf(cls_model_file, max_size, "%s%c%s", cls_model_dir, sep, + "inference.pdmodel"); + snprintf(cls_params_file, max_size, "%s%c%s", cls_model_dir, sep, + "inference.pdiparams"); + + snprintf(rec_model_file, max_size, "%s%c%s", rec_model_dir, sep, + "inference.pdmodel"); + snprintf(rec_params_file, max_size, "%s%c%s", rec_model_dir, sep, + "inference.pdiparams"); + + FD_C_RuntimeOptionWrapper *det_option = FD_C_CreateRuntimeOptionWrapper(); + FD_C_RuntimeOptionWrapper *cls_option = FD_C_CreateRuntimeOptionWrapper(); + FD_C_RuntimeOptionWrapper *rec_option = FD_C_CreateRuntimeOptionWrapper(); + FD_C_RuntimeOptionWrapperUseCpu(det_option); + FD_C_RuntimeOptionWrapperUseCpu(cls_option); + FD_C_RuntimeOptionWrapperUseCpu(rec_option); + + FD_C_DBDetectorWrapper *det_model = FD_C_CreateDBDetectorWrapper( + det_model_file, det_params_file, det_option, FD_C_ModelFormat_PADDLE); + FD_C_ClassifierWrapper *cls_model = FD_C_CreateClassifierWrapper( + cls_model_file, cls_params_file, cls_option, FD_C_ModelFormat_PADDLE); + FD_C_RecognizerWrapper *rec_model = FD_C_CreateRecognizerWrapper( + rec_model_file, rec_params_file, rec_label_file, rec_option, + FD_C_ModelFormat_PADDLE); + + FD_C_PPOCRv3Wrapper *ppocr_v3 = + FD_C_CreatePPOCRv3Wrapper(det_model, cls_model, rec_model); + if (!FD_C_PPOCRv3WrapperInitialized(ppocr_v3)) { + printf("Failed to initialize.\n"); + FD_C_DestroyRuntimeOptionWrapper(det_option); + FD_C_DestroyRuntimeOptionWrapper(cls_option); + FD_C_DestroyRuntimeOptionWrapper(rec_option); + FD_C_DestroyClassifierWrapper(cls_model); + FD_C_DestroyDBDetectorWrapper(det_model); + FD_C_DestroyRecognizerWrapper(rec_model); + FD_C_DestroyPPOCRv3Wrapper(ppocr_v3); + return; + } + + FD_C_Mat im = FD_C_Imread(image_file); + + FD_C_OCRResult *result = (FD_C_OCRResult *)malloc(sizeof(FD_C_OCRResult)); + + if (!FD_C_PPOCRv3WrapperPredict(ppocr_v3, im, result)) { + printf("Failed to predict.\n"); + FD_C_DestroyRuntimeOptionWrapper(det_option); + FD_C_DestroyRuntimeOptionWrapper(cls_option); + FD_C_DestroyRuntimeOptionWrapper(rec_option); + FD_C_DestroyClassifierWrapper(cls_model); + FD_C_DestroyDBDetectorWrapper(det_model); + FD_C_DestroyRecognizerWrapper(rec_model); + FD_C_DestroyPPOCRv3Wrapper(ppocr_v3); + FD_C_DestroyMat(im); + free(result); + return; + } + + // print res + char res[2000]; + FD_C_OCRResultStr(result, res); + printf("%s", res); + FD_C_Mat vis_im = FD_C_VisOcr(im, result); + FD_C_Imwrite("vis_result.jpg", vis_im); + printf("Visualized result saved in ./vis_result.jpg\n"); + + FD_C_DestroyRuntimeOptionWrapper(det_option); + FD_C_DestroyRuntimeOptionWrapper(cls_option); + FD_C_DestroyRuntimeOptionWrapper(rec_option); + FD_C_DestroyClassifierWrapper(cls_model); + FD_C_DestroyDBDetectorWrapper(det_model); + FD_C_DestroyRecognizerWrapper(rec_model); + FD_C_DestroyPPOCRv3Wrapper(ppocr_v3); + 
FD_C_DestroyOCRResult(result); + FD_C_DestroyMat(im); + FD_C_DestroyMat(vis_im); +} + +void GpuInfer(const char *det_model_dir, const char *cls_model_dir, + const char *rec_model_dir, const char *rec_label_file, + const char *image_file) { + char det_model_file[100]; + char det_params_file[100]; + + char cls_model_file[100]; + char cls_params_file[100]; + + char rec_model_file[100]; + char rec_params_file[100]; + + int max_size = 99; + snprintf(det_model_file, max_size, "%s%c%s", det_model_dir, sep, + "inference.pdmodel"); + snprintf(det_params_file, max_size, "%s%c%s", det_model_dir, sep, + "inference.pdiparams"); + + snprintf(cls_model_file, max_size, "%s%c%s", cls_model_dir, sep, + "inference.pdmodel"); + snprintf(cls_params_file, max_size, "%s%c%s", cls_model_dir, sep, + "inference.pdiparams"); + + snprintf(rec_model_file, max_size, "%s%c%s", rec_model_dir, sep, + "inference.pdmodel"); + snprintf(rec_params_file, max_size, "%s%c%s", rec_model_dir, sep, + "inference.pdiparams"); + + FD_C_RuntimeOptionWrapper *det_option = FD_C_CreateRuntimeOptionWrapper(); + FD_C_RuntimeOptionWrapper *cls_option = FD_C_CreateRuntimeOptionWrapper(); + FD_C_RuntimeOptionWrapper *rec_option = FD_C_CreateRuntimeOptionWrapper(); + FD_C_RuntimeOptionWrapperUseGpu(det_option, 0); + FD_C_RuntimeOptionWrapperUseGpu(cls_option, 0); + FD_C_RuntimeOptionWrapperUseGpu(rec_option, 0); + + FD_C_DBDetectorWrapper *det_model = FD_C_CreateDBDetectorWrapper( + det_model_file, det_params_file, det_option, FD_C_ModelFormat_PADDLE); + FD_C_ClassifierWrapper *cls_model = FD_C_CreateClassifierWrapper( + cls_model_file, cls_params_file, cls_option, FD_C_ModelFormat_PADDLE); + FD_C_RecognizerWrapper *rec_model = FD_C_CreateRecognizerWrapper( + rec_model_file, rec_params_file, rec_label_file, rec_option, + FD_C_ModelFormat_PADDLE); + + FD_C_PPOCRv3Wrapper *ppocr_v3 = + FD_C_CreatePPOCRv3Wrapper(det_model, cls_model, rec_model); + if (!FD_C_PPOCRv3WrapperInitialized(ppocr_v3)) { + printf("Failed to initialize.\n"); + FD_C_DestroyRuntimeOptionWrapper(det_option); + FD_C_DestroyRuntimeOptionWrapper(cls_option); + FD_C_DestroyRuntimeOptionWrapper(rec_option); + FD_C_DestroyClassifierWrapper(cls_model); + FD_C_DestroyDBDetectorWrapper(det_model); + FD_C_DestroyRecognizerWrapper(rec_model); + FD_C_DestroyPPOCRv3Wrapper(ppocr_v3); + return; + } + + FD_C_Mat im = FD_C_Imread(image_file); + + FD_C_OCRResult *result = (FD_C_OCRResult *)malloc(sizeof(FD_C_OCRResult)); + + if (!FD_C_PPOCRv3WrapperPredict(ppocr_v3, im, result)) { + printf("Failed to predict.\n"); + FD_C_DestroyRuntimeOptionWrapper(det_option); + FD_C_DestroyRuntimeOptionWrapper(cls_option); + FD_C_DestroyRuntimeOptionWrapper(rec_option); + FD_C_DestroyClassifierWrapper(cls_model); + FD_C_DestroyDBDetectorWrapper(det_model); + FD_C_DestroyRecognizerWrapper(rec_model); + FD_C_DestroyPPOCRv3Wrapper(ppocr_v3); + FD_C_DestroyMat(im); + free(result); + return; + } + + // print res + char res[2000]; + FD_C_OCRResultStr(result, res); + printf("%s", res); + FD_C_Mat vis_im = FD_C_VisOcr(im, result); + FD_C_Imwrite("vis_result.jpg", vis_im); + printf("Visualized result saved in ./vis_result.jpg\n"); + + FD_C_DestroyRuntimeOptionWrapper(det_option); + FD_C_DestroyRuntimeOptionWrapper(cls_option); + FD_C_DestroyRuntimeOptionWrapper(rec_option); + FD_C_DestroyClassifierWrapper(cls_model); + FD_C_DestroyDBDetectorWrapper(det_model); + FD_C_DestroyRecognizerWrapper(rec_model); + FD_C_DestroyPPOCRv3Wrapper(ppocr_v3); + FD_C_DestroyOCRResult(result); + FD_C_DestroyMat(im); + 
FD_C_DestroyMat(vis_im); +} +int main(int argc, char *argv[]) { + if (argc < 7) { + printf("Usage: infer_demo path/to/det_model path/to/cls_model " + "path/to/rec_model path/to/rec_label_file path/to/image " + "run_option, " + "e.g ./infer_demo ./ch_PP-OCRv3_det_infer " + "./ch_ppocr_mobile_v3.0_cls_infer ./ch_PP-OCRv3_rec_infer " + "./ppocr_keys_v1.txt ./12.jpg 0\n"); + printf( + "The data type of run_option is int, 0: run with cpu; 1: run with gpu" + "\n"); + return -1; + } + + if (atoi(argv[6]) == 0) { + CpuInfer(argv[1], argv[2], argv[3], argv[4], argv[5]); + } else if (atoi(argv[6]) == 1) { + GpuInfer(argv[1], argv[2], argv[3], argv[4], argv[5]); + } + return 0; +} diff --git a/deploy/fastdeploy/cpu-gpu/cpp/CMakeLists.txt b/deploy/fastdeploy/cpu-gpu/cpp/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..fe4e03f266318e26c29731ea819a674dc6492959 --- /dev/null +++ b/deploy/fastdeploy/cpu-gpu/cpp/CMakeLists.txt @@ -0,0 +1,30 @@ +PROJECT(infer_demo C CXX) +CMAKE_MINIMUM_REQUIRED (VERSION 3.10) + +# 指定下载解压后的fastdeploy库路径 +option(FASTDEPLOY_INSTALL_DIR "Path of downloaded fastdeploy sdk.") + +include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake) + +# 添加FastDeploy依赖头文件 +include_directories(${FASTDEPLOY_INCS}) + +# PP-OCR +add_executable(infer_demo ${PROJECT_SOURCE_DIR}/infer.cc) +# 添加FastDeploy库依赖 +target_link_libraries(infer_demo ${FASTDEPLOY_LIBS}) + +# Only Det +add_executable(infer_det ${PROJECT_SOURCE_DIR}/infer_det.cc) +# 添加FastDeploy库依赖 +target_link_libraries(infer_det ${FASTDEPLOY_LIBS}) + +# Only Cls +add_executable(infer_cls ${PROJECT_SOURCE_DIR}/infer_cls.cc) +# 添加FastDeploy库依赖 +target_link_libraries(infer_cls ${FASTDEPLOY_LIBS}) + +# Only Rec +add_executable(infer_rec ${PROJECT_SOURCE_DIR}/infer_rec.cc) +# 添加FastDeploy库依赖 +target_link_libraries(infer_rec ${FASTDEPLOY_LIBS}) diff --git a/deploy/fastdeploy/cpu-gpu/cpp/README.md b/deploy/fastdeploy/cpu-gpu/cpp/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4481f49be547a4c93c6fac672ed00f3ff6c1d4b3 --- /dev/null +++ b/deploy/fastdeploy/cpu-gpu/cpp/README.md @@ -0,0 +1,163 @@ +[English](README.md) | 简体中文 +# PaddleOCR CPU-GPU C++部署示例 + +本目录下提供`infer.cc`快速完成PP-OCRv3在CPU/GPU,以及GPU上通过Paddle-TensorRT加速部署的示例. +## 1. 说明 +PaddleOCR支持利用FastDeploy在NVIDIA GPU、X86 CPU、飞腾CPU、ARM CPU、Intel GPU(独立显卡/集成显卡)硬件上快速部署OCR模型. + +## 2. 部署环境准备 +在部署前,需确认软硬件环境,同时下载预编译部署库,参考[FastDeploy安装文档](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install#FastDeploy预编译库安装)安装FastDeploy预编译库. + +## 3. 部署模型准备 +在部署前, 请准备好您所需要运行的推理模型, 您可以在[FastDeploy支持的PaddleOCR模型列表](../README.md)中下载所需模型. + +## 4. 运行部署示例 +以Linux上推理为例,在本目录执行如下命令即可完成编译测试,支持此模型需保证FastDeploy版本1.0.0以上(x.x.x>=1.0.0) + +```bash +# 下载部署示例代码 +git clone https://github.com/PaddlePaddle/FastDeploy.git +cd FastDeploy/examples/vision/ocr/PP-OCR/cpu-gpu/cpp + +# 如果您希望从PaddleOCR下载示例代码,请运行 +git clone https://github.com/PaddlePaddle/PaddleOCR.git +# 注意:如果当前分支找不到下面的fastdeploy测试代码,请切换到dygraph分支 +git checkout dygraph +cd PaddleOCR/deploy/fastdeploy/cpu-gpu/cpp + +# 下载FastDeploy预编译库,用户可在上文提到的`FastDeploy预编译库`中自行选择合适的版本使用 +wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz +tar xvf fastdeploy-linux-x64-x.x.x.tgz + +# 编译部署示例 +mkdir build && cd build +cmake .. 
-DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x +make -j + +# 下载PP-OCRv3文字检测模型 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar +tar -xvf ch_PP-OCRv3_det_infer.tar +# 下载文字方向分类器模型 +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar +tar -xvf ch_ppocr_mobile_v2.0_cls_infer.tar +# 下载PP-OCRv3文字识别模型 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar +tar -xvf ch_PP-OCRv3_rec_infer.tar + +# 下载预测图片与字典文件 +wget https://gitee.com/paddlepaddle/PaddleOCR/raw/release/2.6/doc/imgs/12.jpg +wget https://gitee.com/paddlepaddle/PaddleOCR/raw/release/2.6/ppocr/utils/ppocr_keys_v1.txt + +# 运行部署示例 +# 在CPU上使用Paddle Inference推理 +./infer_demo ./ch_PP-OCRv3_det_infer ./ch_ppocr_mobile_v2.0_cls_infer ./ch_PP-OCRv3_rec_infer ./ppocr_keys_v1.txt ./12.jpg 0 +# 在CPU上使用OenVINO推理 +./infer_demo ./ch_PP-OCRv3_det_infer ./ch_ppocr_mobile_v2.0_cls_infer ./ch_PP-OCRv3_rec_infer ./ppocr_keys_v1.txt ./12.jpg 1 +# 在CPU上使用ONNX Runtime推理 +./infer_demo ./ch_PP-OCRv3_det_infer ./ch_ppocr_mobile_v2.0_cls_infer ./ch_PP-OCRv3_rec_infer ./ppocr_keys_v1.txt ./12.jpg 2 +# 在CPU上使用Paddle Lite推理 +./infer_demo ./ch_PP-OCRv3_det_infer ./ch_ppocr_mobile_v2.0_cls_infer ./ch_PP-OCRv3_rec_infer ./ppocr_keys_v1.txt ./12.jpg 3 +# 在GPU上使用Paddle Inference推理 +./infer_demo ./ch_PP-OCRv3_det_infer ./ch_ppocr_mobile_v2.0_cls_infer ./ch_PP-OCRv3_rec_infer ./ppocr_keys_v1.txt ./12.jpg 4 +# 在GPU上使用Paddle TensorRT推理 +./infer_demo ./ch_PP-OCRv3_det_infer ./ch_ppocr_mobile_v2.0_cls_infer ./ch_PP-OCRv3_rec_infer ./ppocr_keys_v1.txt ./12.jpg 5 +# 在GPU上使用ONNX Runtime推理 +./infer_demo ./ch_PP-OCRv3_det_infer ./ch_ppocr_mobile_v2.0_cls_infer ./ch_PP-OCRv3_rec_infer ./ppocr_keys_v1.txt ./12.jpg 6 +# 在GPU上使用Nvidia TensorRT推理 +./infer_demo ./ch_PP-OCRv3_det_infer ./ch_ppocr_mobile_v2.0_cls_infer ./ch_PP-OCRv3_rec_infer ./ppocr_keys_v1.txt ./12.jpg 7 + +# 同时, FastDeploy提供文字检测,文字分类,文字识别三个模型的单独推理, +# 有需要的用户, 请准备合适的图片, 同时根据自己的需求, 参考infer.cc来配置自定义硬件与推理后端. + +# 在CPU上,单独使用文字检测模型部署 +./infer_det ./ch_PP-OCRv3_det_infer ./12.jpg 0 + +# 在CPU上,单独使用文字方向分类模型部署 +./infer_cls ./ch_ppocr_mobile_v2.0_cls_infer ./12.jpg 0 + +# 在CPU上,单独使用文字识别模型部署 +./infer_rec ./ch_PP-OCRv3_rec_infer ./ppocr_keys_v1.txt ./12.jpg 0 +``` + +运行完成可视化结果如下图所示 +
+ +
+
+- 注意,以上命令只适用于Linux或MacOS, Windows下SDK的使用方式请参考文档: [如何在Windows中使用FastDeploy C++ SDK](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/faq/use_sdk_on_windows.md)
+- 关于如何通过FastDeploy使用更多不同的推理后端,以及如何使用不同的硬件,请参考文档:[如何切换模型推理后端引擎](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/faq/how_to_change_backend.md)
+
+## 5. 部署示例选项说明
+在我们使用`infer_demo`时, 输入了6个参数, 分别为文字检测模型, 文字分类模型, 文字识别模型, 字典文件, 预测图片与最后一位的数字选项.
+现在下表将解释最后一位数字选项的含义.
+|数字选项|含义|
+|:---:|:---:|
+|0| 在CPU上使用Paddle Inference推理 |
+|1| 在CPU上使用OpenVINO推理 |
+|2| 在CPU上使用ONNX Runtime推理 |
+|3| 在CPU上使用Paddle Lite推理 |
+|4| 在GPU上使用Paddle Inference推理 |
+|5| 在GPU上使用Paddle TensorRT推理 |
+|6| 在GPU上使用ONNX Runtime推理 |
+|7| 在GPU上使用Nvidia TensorRT推理 |
+
+关于如何通过FastDeploy使用更多不同的推理后端,以及如何使用不同的硬件,请参考文档:[如何切换模型推理后端引擎](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/faq/how_to_change_backend.md)
+
+## 6. 更多指南
+
+### 6.1 如何使用C++部署PP-OCRv2系列模型.
+本目录下的`infer.cc`代码是以PP-OCRv3模型为例, 如果用户有使用PP-OCRv2的需求, 只需要按照下面所示的方式, 来创建PP-OCRv2并使用.
+
+```cpp
+// 此行为创建PP-OCRv3模型的代码
+auto ppocr_v3 = fastdeploy::pipeline::PPOCRv3(&det_model, &cls_model, &rec_model);
+// 只需要将PPOCRv3改为PPOCRv2,即可创建PPOCRv2模型, 同时, 后续的接口均使用ppocr_v2来调用
+auto ppocr_v2 = fastdeploy::pipeline::PPOCRv2(&det_model, &cls_model, &rec_model);
+
+// 如果用户在部署PP-OCRv2时, 需要使用TensorRT推理, 还需要改动Rec模型的TensorRT的输入shape.
+// 建议如下修改, 需要把 H 维度改为32, W 维度按需修改.
+rec_option.SetTrtInputShape("x", {1, 3, 32, 10}, {rec_batch_size, 3, 32, 320},
+ {rec_batch_size, 3, 32, 2304});
+```
+### 6.2 如何在PP-OCRv2/v3系列模型中, 关闭文字方向分类器的使用.
+
+在PP-OCRv3/v2中, 文字方向分类器是可选的, 用户可以按照以下方式, 来决定自己是否使用方向分类器.
+```cpp
+// 使用 Cls 模型
+auto ppocr_v3 = fastdeploy::pipeline::PPOCRv3(&det_model, &cls_model, &rec_model);
+
+// 不使用 Cls 模型
+auto ppocr_v3 = fastdeploy::pipeline::PPOCRv3(&det_model, &rec_model);
+
+// 当不使用Cls模型时, 请删掉或者注释掉相关代码
+```
+
+### 6.3 如何修改前后处理超参数.
+在示例代码中, 我们展示出了修改前后处理超参数的接口,并设置为默认值,其中, FastDeploy提供的超参数的含义与文档[PaddleOCR推理模型参数解释](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_ch/inference_args.md)是相同的. 如果用户想要进行更多定制化的开发, 请阅读[PP-OCR系列 C++ API查阅](https://www.paddlepaddle.org.cn/fastdeploy-api-doc/cpp/html/namespacefastdeploy_1_1vision_1_1ocr.html)
+
+```cpp
+// 设置检测模型的max_side_len
+det_model.GetPreprocessor().SetMaxSideLen(960);
+// 其他...
+```
+
+### 6.4 其他指南
+- [FastDeploy部署PaddleOCR模型概览](../../)
+- [PP-OCRv3 Python部署](../python)
+- [PP-OCRv3 C 部署](../c)
+- [PP-OCRv3 C# 部署](../csharp)
+
+## 7. 常见问题
+- PaddleOCR能在FastDeploy支持的多种后端上推理,支持情况如下表所示, 如何切换后端, 详见文档[如何切换模型推理后端引擎](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/faq/how_to_change_backend.md)
+
+|硬件类型|支持的后端|
+|:---:|:---:|
+|X86 CPU| Paddle Inference, ONNX Runtime, OpenVINO |
+|ARM CPU| Paddle Lite |
+|飞腾 CPU| ONNX Runtime |
+|NVIDIA GPU| Paddle Inference, ONNX Runtime, TensorRT |
+
+- [Intel GPU(独立显卡/集成显卡)的使用](https://github.com/PaddlePaddle/FastDeploy/blob/develop/tutorials/intel_gpu/README.md)
+- [编译CPU部署库](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install/cpu.md)
+- [编译GPU部署库](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install/gpu.md)
+- [编译Jetson部署库](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install/jetson.md) diff --git a/deploy/fastdeploy/cpu-gpu/cpp/infer.cc b/deploy/fastdeploy/cpu-gpu/cpp/infer.cc new file mode 100644 index 0000000000000000000000000000000000000000..1464edacdf6a74c21b9025a73ece8470595f9538 --- /dev/null +++ b/deploy/fastdeploy/cpu-gpu/cpp/infer.cc @@ -0,0 +1,174 @@ +// Copyright (c) 2022 PaddlePaddle Authors.
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision.h" +#ifdef WIN32 +const char sep = '\\'; +#else +const char sep = '/'; +#endif + +void InitAndInfer(const std::string &det_model_dir, + const std::string &cls_model_dir, + const std::string &rec_model_dir, + const std::string &rec_label_file, + const std::string &image_file, + const fastdeploy::RuntimeOption &option) { + auto det_model_file = det_model_dir + sep + "inference.pdmodel"; + auto det_params_file = det_model_dir + sep + "inference.pdiparams"; + + auto cls_model_file = cls_model_dir + sep + "inference.pdmodel"; + auto cls_params_file = cls_model_dir + sep + "inference.pdiparams"; + + auto rec_model_file = rec_model_dir + sep + "inference.pdmodel"; + auto rec_params_file = rec_model_dir + sep + "inference.pdiparams"; + + auto det_option = option; + auto cls_option = option; + auto rec_option = option; + + // The cls and rec model can inference a batch of images now. + // User could initialize the inference batch size and set them after create + // PP-OCR model. + int cls_batch_size = 1; + int rec_batch_size = 6; + + // If use TRT backend, the dynamic shape will be set as follow. + // We recommend that users set the length and height of the detection model to + // a multiple of 32. + // We also recommend that users set the Trt input shape as follow. + det_option.SetTrtInputShape("x", {1, 3, 64, 64}, {1, 3, 640, 640}, + {1, 3, 960, 960}); + cls_option.SetTrtInputShape("x", {1, 3, 48, 10}, {cls_batch_size, 3, 48, 320}, + {cls_batch_size, 3, 48, 1024}); + rec_option.SetTrtInputShape("x", {1, 3, 48, 10}, {rec_batch_size, 3, 48, 320}, + {rec_batch_size, 3, 48, 2304}); + + // Users could save TRT cache file to disk as follow. + // det_option.SetTrtCacheFile(det_model_dir + sep + "det_trt_cache.trt"); + // cls_option.SetTrtCacheFile(cls_model_dir + sep + "cls_trt_cache.trt"); + // rec_option.SetTrtCacheFile(rec_model_dir + sep + "rec_trt_cache.trt"); + + auto det_model = fastdeploy::vision::ocr::DBDetector( + det_model_file, det_params_file, det_option); + auto cls_model = fastdeploy::vision::ocr::Classifier( + cls_model_file, cls_params_file, cls_option); + auto rec_model = fastdeploy::vision::ocr::Recognizer( + rec_model_file, rec_params_file, rec_label_file, rec_option); + + assert(det_model.Initialized()); + assert(cls_model.Initialized()); + assert(rec_model.Initialized()); + + // Parameters settings for pre and post processing of Det/Cls/Rec Models. + // All parameters are set to default values. 
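+ // Note: roughly, these settings mean the following (see the PaddleOCR
+ // inference_args document linked in the README for authoritative definitions):
+ //   max_side_len        - longest image side the detector resizes down to;
+ //   det_db_thresh       - binarization threshold on the DB probability map;
+ //   det_db_box_thresh   - boxes scoring below this are discarded;
+ //   det_db_unclip_ratio - how far each detected box is expanded when unclipping;
+ //   det_db_score_mode   - "slow" scores the full polygon, "fast" the bounding rect;
+ //   use_dilation        - optional dilation of the segmentation map (0 = off);
+ //   cls_thresh          - crops are rotated 180 degrees only above this score.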
+ det_model.GetPreprocessor().SetMaxSideLen(960); + det_model.GetPostprocessor().SetDetDBThresh(0.3); + det_model.GetPostprocessor().SetDetDBBoxThresh(0.6); + det_model.GetPostprocessor().SetDetDBUnclipRatio(1.5); + det_model.GetPostprocessor().SetDetDBScoreMode("slow"); + det_model.GetPostprocessor().SetUseDilation(0); + cls_model.GetPostprocessor().SetClsThresh(0.9); + + // The classification model is optional, so the PP-OCR can also be connected + // in series as follows + // auto ppocr_v3 = fastdeploy::pipeline::PPOCRv3(&det_model, &rec_model); + auto ppocr_v3 = + fastdeploy::pipeline::PPOCRv3(&det_model, &cls_model, &rec_model); + + // Set inference batch size for cls model and rec model, the value could be -1 + // and 1 to positive infinity. + // When inference batch size is set to -1, it means that the inference batch + // size + // of the cls and rec models will be the same as the number of boxes detected + // by the det model. + ppocr_v3.SetClsBatchSize(cls_batch_size); + ppocr_v3.SetRecBatchSize(rec_batch_size); + + if (!ppocr_v3.Initialized()) { + std::cerr << "Failed to initialize PP-OCR." << std::endl; + return; + } + + auto im = cv::imread(image_file); + auto im_bak = im.clone(); + + fastdeploy::vision::OCRResult result; + if (!ppocr_v3.Predict(&im, &result)) { + std::cerr << "Failed to predict." << std::endl; + return; + } + + std::cout << result.Str() << std::endl; + + auto vis_im = fastdeploy::vision::VisOcr(im_bak, result); + cv::imwrite("vis_result.jpg", vis_im); + std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl; +} + +int main(int argc, char *argv[]) { + if (argc < 7) { + std::cout << "Usage: infer_demo path/to/det_model path/to/cls_model " + "path/to/rec_model path/to/rec_label_file path/to/image " + "run_option, " + "e.g ./infer_demo ./ch_PP-OCRv3_det_infer " + "./ch_ppocr_mobile_v2.0_cls_infer ./ch_PP-OCRv3_rec_infer " + "./ppocr_keys_v1.txt ./12.jpg 0" + << std::endl; + std::cout << "The data type of run_option is int, e.g. 
0: run with paddle " + "inference on cpu;" + << std::endl; + return -1; + } + + fastdeploy::RuntimeOption option; + int flag = std::atoi(argv[6]); + + if (flag == 0) { + option.UseCpu(); + option.UsePaddleBackend(); // Paddle Inference + } else if (flag == 1) { + option.UseCpu(); + option.UseOpenVINOBackend(); // OpenVINO + } else if (flag == 2) { + option.UseCpu(); + option.UseOrtBackend(); // ONNX Runtime + } else if (flag == 3) { + option.UseCpu(); + option.UseLiteBackend(); // Paddle Lite + } else if (flag == 4) { + option.UseGpu(); + option.UsePaddleBackend(); // Paddle Inference + } else if (flag == 5) { + option.UseGpu(); + option.UsePaddleInferBackend(); + option.paddle_infer_option.collect_trt_shape = true; + option.paddle_infer_option.enable_trt = true; // Paddle-TensorRT + } else if (flag == 6) { + option.UseGpu(); + option.UseOrtBackend(); // ONNX Runtime + } else if (flag == 7) { + option.UseGpu(); + option.UseTrtBackend(); // TensorRT + } + + std::string det_model_dir = argv[1]; + std::string cls_model_dir = argv[2]; + std::string rec_model_dir = argv[3]; + std::string rec_label_file = argv[4]; + std::string test_image = argv[5]; + InitAndInfer(det_model_dir, cls_model_dir, rec_model_dir, rec_label_file, + test_image, option); + return 0; +} diff --git a/deploy/fastdeploy/cpu-gpu/cpp/infer_cls.cc b/deploy/fastdeploy/cpu-gpu/cpp/infer_cls.cc new file mode 100644 index 0000000000000000000000000000000000000000..789c2a9f365ae37f270cd27d3d83f7e66d4241c0 --- /dev/null +++ b/deploy/fastdeploy/cpu-gpu/cpp/infer_cls.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision.h" +#ifdef WIN32 +const char sep = '\\'; +#else +const char sep = '/'; +#endif + +void InitAndInfer(const std::string &cls_model_dir, + const std::string &image_file, + const fastdeploy::RuntimeOption &option) { + auto cls_model_file = cls_model_dir + sep + "inference.pdmodel"; + auto cls_params_file = cls_model_dir + sep + "inference.pdiparams"; + auto cls_option = option; + + auto cls_model = fastdeploy::vision::ocr::Classifier( + cls_model_file, cls_params_file, cls_option); + assert(cls_model.Initialized()); + + // Parameters settings for pre and post processing of Cls Model. + cls_model.GetPostprocessor().SetClsThresh(0.9); + + auto im = cv::imread(image_file); + auto im_bak = im.clone(); + + fastdeploy::vision::OCRResult result; + if (!cls_model.Predict(im, &result)) { + std::cerr << "Failed to predict." << std::endl; + return; + } + + // User can infer a batch of images by following code. + // if (!cls_model.BatchPredict({im}, &result)) { + // std::cerr << "Failed to predict." 
<< std::endl; + // return; + // } + + std::cout << result.Str() << std::endl; +} + +int main(int argc, char *argv[]) { + if (argc < 4) { + std::cout << "Usage: infer_demo path/to/cls_model path/to/image " + "run_option, " + "e.g ./infer_demo ./ch_ppocr_mobile_v2.0_cls_infer ./12.jpg 0" + << std::endl; + std::cout << "The data type of run_option is int, 0: run with cpu; 1: run " + "with gpu;." + << std::endl; + return -1; + } + + fastdeploy::RuntimeOption option; + int flag = std::atoi(argv[3]); + + if (flag == 0) { + option.UseCpu(); + } else if (flag == 1) { + option.UseGpu(); + } + + std::string cls_model_dir = argv[1]; + std::string test_image = argv[2]; + InitAndInfer(cls_model_dir, test_image, option); + return 0; +} \ No newline at end of file diff --git a/deploy/fastdeploy/cpu-gpu/cpp/infer_det.cc b/deploy/fastdeploy/cpu-gpu/cpp/infer_det.cc new file mode 100644 index 0000000000000000000000000000000000000000..8b1cea4b9ef9e5788624aac0e7edcb024dbf2605 --- /dev/null +++ b/deploy/fastdeploy/cpu-gpu/cpp/infer_det.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision.h" +#ifdef WIN32 +const char sep = '\\'; +#else +const char sep = '/'; +#endif + +void InitAndInfer(const std::string &det_model_dir, + const std::string &image_file, + const fastdeploy::RuntimeOption &option) { + auto det_model_file = det_model_dir + sep + "inference.pdmodel"; + auto det_params_file = det_model_dir + sep + "inference.pdiparams"; + auto det_option = option; + + auto det_model = fastdeploy::vision::ocr::DBDetector( + det_model_file, det_params_file, det_option); + assert(det_model.Initialized()); + + // Parameters settings for pre and post processing of Det Model. + det_model.GetPreprocessor().SetMaxSideLen(960); + det_model.GetPostprocessor().SetDetDBThresh(0.3); + det_model.GetPostprocessor().SetDetDBBoxThresh(0.6); + det_model.GetPostprocessor().SetDetDBUnclipRatio(1.5); + det_model.GetPostprocessor().SetDetDBScoreMode("slow"); + det_model.GetPostprocessor().SetUseDilation(0); + + auto im = cv::imread(image_file); + auto im_bak = im.clone(); + + fastdeploy::vision::OCRResult result; + if (!det_model.Predict(im, &result)) { + std::cerr << "Failed to predict." << std::endl; + return; + } + + std::cout << result.Str() << std::endl; + + auto vis_im = fastdeploy::vision::VisOcr(im_bak, result); + cv::imwrite("vis_result.jpg", vis_im); + std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl; +} + +int main(int argc, char *argv[]) { + if (argc < 4) { + std::cout << "Usage: infer_demo path/to/det_model path/to/image " + "run_option, " + "e.g ./infer_demo ./ch_PP-OCRv3_det_infer ./12.jpg 0" + << std::endl; + std::cout << "The data type of run_option is int, 0: run with cpu; 1: run " + "with gpu;." 
+ << std::endl; + return -1; + } + + fastdeploy::RuntimeOption option; + int flag = std::atoi(argv[3]); + + if (flag == 0) { + option.UseCpu(); + } else if (flag == 1) { + option.UseGpu(); + } + + std::string det_model_dir = argv[1]; + std::string test_image = argv[2]; + InitAndInfer(det_model_dir, test_image, option); + return 0; +} diff --git a/deploy/fastdeploy/cpu-gpu/cpp/infer_rec.cc b/deploy/fastdeploy/cpu-gpu/cpp/infer_rec.cc new file mode 100644 index 0000000000000000000000000000000000000000..e07e2a0cd8152fece1687e7e8c4302f6ee079bc8 --- /dev/null +++ b/deploy/fastdeploy/cpu-gpu/cpp/infer_rec.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision.h" +#ifdef WIN32 +const char sep = '\\'; +#else +const char sep = '/'; +#endif + +void InitAndInfer(const std::string &rec_model_dir, + const std::string &rec_label_file, + const std::string &image_file, + const fastdeploy::RuntimeOption &option) { + auto rec_model_file = rec_model_dir + sep + "inference.pdmodel"; + auto rec_params_file = rec_model_dir + sep + "inference.pdiparams"; + auto rec_option = option; + + auto rec_model = fastdeploy::vision::ocr::Recognizer( + rec_model_file, rec_params_file, rec_label_file, rec_option); + + assert(rec_model.Initialized()); + + auto im = cv::imread(image_file); + auto im_bak = im.clone(); + + fastdeploy::vision::OCRResult result; + + if (!rec_model.Predict(im, &result)) { + std::cerr << "Failed to predict." << std::endl; + return; + } + + // User can infer a batch of images by following code. + // if (!rec_model.BatchPredict({im}, &result)) { + // std::cerr << "Failed to predict." << std::endl; + // return; + // } + + std::cout << result.Str() << std::endl; +} + +int main(int argc, char *argv[]) { + if (argc < 5) { + std::cout << "Usage: infer_demo" + "path/to/rec_model path/to/rec_label_file path/to/image " + "run_option, " + "e.g ./infer_demo " + "./ch_PP-OCRv3_rec_infer " + "./ppocr_keys_v1.txt ./12.jpg 0" + << std::endl; + std::cout << "The data type of run_option is int, 0: run with cpu; 1: run " + "with gpu;" + << std::endl; + return -1; + } + + fastdeploy::RuntimeOption option; + int flag = std::atoi(argv[4]); + + if (flag == 0) { + option.UseCpu(); + } else if (flag == 1) { + option.UseGpu(); + } + + std::string rec_model_dir = argv[1]; + std::string rec_label_file = argv[2]; + std::string test_image = argv[3]; + InitAndInfer(rec_model_dir, rec_label_file, test_image, option); + return 0; +} diff --git a/deploy/fastdeploy/cpu-gpu/csharp/CMakeLists.txt b/deploy/fastdeploy/cpu-gpu/csharp/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..7ae8e2aba35b71c4f92cc908f1baa983bce0757b --- /dev/null +++ b/deploy/fastdeploy/cpu-gpu/csharp/CMakeLists.txt @@ -0,0 +1,22 @@ +PROJECT(infer_demo CSharp) +CMAKE_MINIMUM_REQUIRED (VERSION 3.10) + +# Set the C# language version (defaults to 3.0 if not set). 
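+# Note: the net6.0 target and Microsoft.NET.Sdk chosen below are assumed to match
+# the FastDeploy C# bindings; adjust them if your local .NET SDK differs.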
+set(CMAKE_CSharp_FLAGS "/langversion:10") +set(CMAKE_DOTNET_TARGET_FRAMEWORK "net6.0") +set(CMAKE_DOTNET_SDK "Microsoft.NET.Sdk") + +# 指定下载解压后的fastdeploy库路径 +option(FASTDEPLOY_INSTALL_DIR "Path of downloaded fastdeploy sdk.") + +include(${FASTDEPLOY_INSTALL_DIR}/FastDeployCSharp.cmake) + + +add_executable(infer_demo ${PROJECT_SOURCE_DIR}/infer.cs) + +set_property(TARGET infer_demo PROPERTY VS_DOTNET_REFERENCES + ${FASTDEPLOY_DOTNET_REFERENCES} +) + +set_property(TARGET infer_demo + PROPERTY VS_PACKAGE_REFERENCES ${FASTDEPLOY_PACKAGE_REFERENCES}) diff --git a/deploy/fastdeploy/cpu-gpu/csharp/README.md b/deploy/fastdeploy/cpu-gpu/csharp/README.md new file mode 100755 index 0000000000000000000000000000000000000000..3a87730e19ac9d5f13b7c8219686118f4cc669e2 --- /dev/null +++ b/deploy/fastdeploy/cpu-gpu/csharp/README.md @@ -0,0 +1,173 @@ +[English](README.md) | 简体中文 +# PaddleOCR CPU-GPU C#部署示例 + +本目录下提供`infer.cs`来调用C# API快速完成PPOCRv3模型在CPU/GPU上部署的示例。 + +## 1. 说明 +PaddleOCR支持利用FastDeploy在NVIDIA GPU、X86 CPU、飞腾CPU、ARM CPU、Intel GPU(独立显卡/集成显卡)硬件上快速部署OCR模型. + +## 2. 部署环境准备 +在部署前,需确认软硬件环境,同时下载预编译部署库,参考[FastDeploy安装文档](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install#FastDeploy预编译库安装)安装FastDeploy预编译库. 在本目录执行如下命令即可在Windows完成编译测试,支持此模型需保证FastDeploy版本1.0.4以上(x.x.x>=1.0.4) + +## 3. 部署模型准备 +在部署前, 请准备好您所需要运行的推理模型, 您可以在[FastDeploy支持的PaddleOCR模型列表](../README.md)中下载所需模型. + +## 4. 部署示例 + +### 4.1 下载C#包管理程序nuget客户端 +> https://dist.nuget.org/win-x86-commandline/v6.4.0/nuget.exe +下载完成后将该程序添加到环境变量**PATH**中 + +### 4.2. 下载模型文件和测试图片 +> https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar # (下载后解压缩) +> https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar +> https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar +> https://gitee.com/paddlepaddle/PaddleOCR/raw/release/2.6/doc/imgs/12.jpg +> https://gitee.com/paddlepaddle/PaddleOCR/raw/release/2.6/ppocr/utils/ppocr_keys_v1.txt + +### 4.3 编译示例代码 + +本文档编译的示例代码的编译工具依赖VS 2019,**Windows打开x64 Native Tools Command Prompt for VS 2019命令工具**,通过如下命令开始编译 + + +```shell +## 下载FastDeploy预编译库,用户可在上文提到的`FastDeploy预编译库`中自行选择合适的版本使用 +https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz + +# 下载部署示例代码 +git clone https://github.com/PaddlePaddle/FastDeploy.git +cd D:\FastDeploy\examples\vision\ocr\PP-OCR\cpu-gpu\csharp + +# 如果您希望从PaddleOCR下载示例代码,请运行 +git clone https://github.com/PaddlePaddle/PaddleOCR.git +# 注意:如果当前分支找不到下面的fastdeploy测试代码,请切换到dygraph分支 +git checkout dygraph +cd D:\PaddleOCR\deploy\fastdeploy\cpu-gpu\csharp + +mkdir build && cd build +cmake .. 
-G "Visual Studio 16 2019" -A x64 -DFASTDEPLOY_INSTALL_DIR=D:\fastdeploy-win-x64-gpu-x.x.x -DCUDA_DIRECTORY="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.2" + +nuget restore +msbuild infer_demo.sln /m:4 /p:Configuration=Release /p:Platform=x64 +``` + +关于使用Visual Studio 2019创建sln工程,或者CMake工程等方式编译的更详细信息,可参考如下文档 +- [在 Windows 使用 FastDeploy C++ SDK](https://github.com/PaddlePaddle/FastDeploy/tree/develop/docs/cn/faq/use_sdk_on_windows.md) +- [FastDeploy C++库在Windows上的多种使用方式](https://github.com/PaddlePaddle/FastDeploy/tree/develop/docs/cn/faq/use_sdk_on_windows_build.md) + +### 4.4 运行可执行程序 + +注意Windows上运行时,需要将FastDeploy依赖的库拷贝至可执行程序所在目录, 或者配置环境变量。FastDeploy提供了工具帮助我们快速将所有依赖库拷贝至可执行程序所在目录,通过如下命令将所有依赖的dll文件拷贝至可执行程序所在的目录(可能生成的可执行文件在Release下还有一层目录,这里假设生成的可执行文件在Release处) +```shell +cd D:\fastdeploy-win-x64-gpu-x.x.x + +fastdeploy_init.bat install %cd% D:\PaddleOCR\deploy\fastdeploy\cpu-gpu\csharp\build\Release +``` + +将dll拷贝到当前路径后,准备好模型和图片,使用如下命令运行可执行程序即可 +```shell +cd Release +# CPU推理 +infer_demo ./ch_PP-OCRv3_det_infer ./ch_ppocr_mobile_v3.0_cls_infer ./ch_PP-OCRv3_rec_infer ./ppocr_keys_v1.txt ./12.jpg 0 +# GPU推理 +infer_demo ./ch_PP-OCRv3_det_infer ./ch_ppocr_mobile_v3.0_cls_infer ./ch_PP-OCRv3_rec_infer ./ppocr_keys_v1.txt ./12.jpg 1 +``` + +## 5. PP-OCRv3 C# API接口简介 +下面提供了PP-OCRv3的C# API简介 + +- 如果用户想要更换部署后端或进行其他定制化操作, 请查看[C# Runtime API](https://baidu-paddle.github.io/fastdeploy-api/csharp/html/classfastdeploy_1_1RuntimeOption.html). +- 更多 PP-OCR C# API 请查看 [C# PP-OCR API](https://github.com/PaddlePaddle/FastDeploy/blob/develop/csharp/fastdeploy/vision/ocr/model.cs) + +### 模型 + +```c# +fastdeploy.vision.ocr.DBDetector( + string model_file, + string params_file, + fastdeploy.RuntimeOption runtime_option = null, + fastdeploy.ModelFormat model_format = ModelFormat.PADDLE) +``` + +> DBDetector模型加载和初始化。 + +> **参数** + +>> * **model_file**(str): 模型文件路径 +>> * **params_file**(str): 参数文件路径 +>> * **runtime_option**(RuntimeOption): 后端推理配置,默认为null,即采用默认配置 +>> * **model_format**(ModelFormat): 模型格式,默认为PADDLE格式 + +```c# +fastdeploy.vision.ocr.Classifier( + string model_file, + string params_file, + fastdeploy.RuntimeOption runtime_option = null, + fastdeploy.ModelFormat model_format = ModelFormat.PADDLE) +``` + +> Classifier模型加载和初始化。 + +> **参数** + +>> * **model_file**(str): 模型文件路径 +>> * **params_file**(str): 参数文件路径 +>> * **runtime_option**(RuntimeOption): 后端推理配置,默认为null,即采用默认配置 +>> * **model_format**(ModelFormat): 模型格式,默认为PADDLE格式 + +```c# +fastdeploy.vision.ocr.Recognizer( + string model_file, + string params_file, + string label_path, + fastdeploy.RuntimeOption runtime_option = null, + fastdeploy.ModelFormat model_format = ModelFormat.PADDLE) +``` + +> Recognizer模型加载和初始化。 + +> **参数** + +>> * **model_file**(str): 模型文件路径 +>> * **params_file**(str): 参数文件路径 +>> * **label_path**(str): 标签文件路径 +>> * **runtime_option**(RuntimeOption): 后端推理配置,默认为null,即采用默认配置 +>> * **model_format**(ModelFormat): 模型格式,默认为PADDLE格式 + +```c# +fastdeploy.pipeline.PPOCRv3Model( + DBDetector dbdetector, + Classifier classifier, + Recognizer recognizer) +``` + +> PP-OCRv3Model模型加载和初始化。 + +> **参数** + +>> * **det_model**(FD_C_DBDetectorWrapper*): DBDetector模型 +>> * **cls_model**(FD_C_ClassifierWrapper*): Classifier模型 +>> * **rec_model**(FD_C_RecognizerWrapper*): Recognizer模型文件 + +#### Predict函数 + +```c# +fastdeploy.OCRResult Predict(OpenCvSharp.Mat im) +``` + +> 模型预测接口,输入图像直接输出结果。 +> +> **参数** +> +>> * **im**(Mat): 输入图像,注意需为HWC,BGR格式 +>> +> **返回值** +> +>> * **result**: OCR预测结果,包括由检测模型输出的检测框位置,分类模型输出的方向分类,以及识别模型输出的识别结果, 
OCRResult说明参考[视觉模型预测结果](../../../../../docs/api/vision_results/) + + +## 6. 其它文档 +- [FastDeploy部署PaddleOCR模型概览](../../) +- [PP-OCRv3 Python部署](../python) +- [PP-OCRv3 C++ 部署](../cpp) +- [PP-OCRv3 C 部署](../c) diff --git a/deploy/fastdeploy/cpu-gpu/csharp/infer.cs b/deploy/fastdeploy/cpu-gpu/csharp/infer.cs new file mode 100644 index 0000000000000000000000000000000000000000..962500e08a1b1e8b8cd19350d0137c1243a88d79 --- /dev/null +++ b/deploy/fastdeploy/cpu-gpu/csharp/infer.cs @@ -0,0 +1,79 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.IO; +using System.Runtime.InteropServices; +using OpenCvSharp; +using fastdeploy; + +namespace Test +{ + public class TestPPOCRv3 + { + public static void Main(string[] args) + { + if (args.Length < 6) { + Console.WriteLine( + "Usage: infer_demo path/to/det_model path/to/cls_model " + + "path/to/rec_model path/to/rec_label_file path/to/image " + + "run_option, " + + "e.g ./infer_demo ./ch_PP-OCRv2_det_infer " + + "./ch_ppocr_mobile_v2.0_cls_infer ./ch_PP-OCRv2_rec_infer " + + "./ppocr_keys_v1.txt ./12.jpg 0" + ); + Console.WriteLine( "The data type of run_option is int, 0: run with cpu; 1: run with gpu"); + return; + } + string det_model_dir = args[0]; + string cls_model_dir = args[1]; + string rec_model_dir = args[2]; + string rec_label_file = args[3]; + string image_path = args[4]; + RuntimeOption runtimeoption = new RuntimeOption(); + int device_option = Int32.Parse(args[5]); + if(device_option==0){ + runtimeoption.UseCpu(); + }else{ + runtimeoption.UseGpu(); + } + string sep = "\\"; + string det_model_file = det_model_dir + sep + "inference.pdmodel"; + string det_params_file = det_model_dir + sep + "inference.pdiparams"; + + string cls_model_file = cls_model_dir + sep + "inference.pdmodel"; + string cls_params_file = cls_model_dir + sep + "inference.pdiparams"; + + string rec_model_file = rec_model_dir + sep + "inference.pdmodel"; + string rec_params_file = rec_model_dir + sep + "inference.pdiparams"; + + fastdeploy.vision.ocr.DBDetector dbdetector = new fastdeploy.vision.ocr.DBDetector(det_model_file, det_params_file, runtimeoption, ModelFormat.PADDLE); + fastdeploy.vision.ocr.Classifier classifier = new fastdeploy.vision.ocr.Classifier(cls_model_file, cls_params_file, runtimeoption, ModelFormat.PADDLE); + fastdeploy.vision.ocr.Recognizer recognizer = new fastdeploy.vision.ocr.Recognizer(rec_model_file, rec_params_file, rec_label_file, runtimeoption, ModelFormat.PADDLE); + fastdeploy.pipeline.PPOCRv3 model = new fastdeploy.pipeline.PPOCRv3(dbdetector, classifier, recognizer); + if(!model.Initialized()){ + Console.WriteLine("Failed to initialize.\n"); + } + Mat image = Cv2.ImRead(image_path); + fastdeploy.vision.OCRResult res = model.Predict(image); + Console.WriteLine(res.ToString()); + Mat res_img = fastdeploy.vision.Visualize.VisOcr(image, res); + Cv2.ImShow("result.png", res_img); + Cv2.ImWrite("result.png", res_img); + Cv2.WaitKey(0); + + } + + 
} +} \ No newline at end of file diff --git a/deploy/fastdeploy/cpu-gpu/python/README.md b/deploy/fastdeploy/cpu-gpu/python/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d8143e028922a51f38d2ee478b9a99a39fba0d79 --- /dev/null +++ b/deploy/fastdeploy/cpu-gpu/python/README.md @@ -0,0 +1,153 @@ +[English](README.md) | 简体中文 +# PaddleOCR CPU-GPU Python部署示例 +本目录下提供`infer.py`快速完成PP-OCRv3在CPU/GPU,以及GPU上通过Paddle-TensorRT加速部署的示例. + +## 1. 说明 +PaddleOCR支持利用FastDeploy在NVIDIA GPU、X86 CPU、飞腾CPU、ARM CPU、Intel GPU(独立显卡/集成显卡)硬件上快速部署OCR模型 + +## 2. 部署环境准备 +在部署前,需确认软硬件环境,同时下载预编译部署库,参考[FastDeploy安装文档](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install#FastDeploy预编译库安装)安装FastDeploy预编译库. + +## 3. 部署模型准备 +在部署前, 请准备好您所需要运行的推理模型, 您可以在[FastDeploy支持的PaddleOCR模型列表](../README.md)中下载所需模型. + +## 4. 运行部署示例 +```bash +# 安装FastDpeloy python包(详细文档请参考`部署环境准备`) +pip install fastdeploy-gpu-python -f https://www.paddlepaddle.org.cn/whl/fastdeploy.html +conda config --add channels conda-forge && conda install cudatoolkit=11.2 cudnn=8.2 + +# 下载部署示例代码 +git clone https://github.com/PaddlePaddle/FastDeploy.git +cd FastDeploy/examples/vision/ocr/PP-OCR/cpu-gpu/python + +# 如果您希望从PaddleOCR下载示例代码,请运行 +git clone https://github.com/PaddlePaddle/PaddleOCR.git +# 注意:如果当前分支找不到下面的fastdeploy测试代码,请切换到dygraph分支 +git checkout dygraph +cd PaddleOCR/deploy/fastdeploy/cpu-gpu/python + +# 下载PP-OCRv3文字检测模型 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar +tar -xvf ch_PP-OCRv3_det_infer.tar +# 下载文字方向分类器模型 +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar +tar -xvf ch_ppocr_mobile_v2.0_cls_infer.tar +# 下载PP-OCRv3文字识别模型 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar +tar -xvf ch_PP-OCRv3_rec_infer.tar + +# 下载预测图片与字典文件 +wget https://gitee.com/paddlepaddle/PaddleOCR/raw/release/2.6/doc/imgs/12.jpg +wget https://gitee.com/paddlepaddle/PaddleOCR/raw/release/2.6/ppocr/utils/ppocr_keys_v1.txt + +# 运行部署示例 +# 在CPU上使用Paddle Inference推理 +python infer.py --det_model ch_PP-OCRv3_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv3_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device cpu --backend paddle +# 在CPU上使用OenVINO推理 +python infer.py --det_model ch_PP-OCRv3_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv3_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device cpu --backend openvino +# 在CPU上使用ONNX Runtime推理 +python infer.py --det_model ch_PP-OCRv3_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv3_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device cpu --backend ort +# 在CPU上使用Paddle Lite推理 +python infer.py --det_model ch_PP-OCRv3_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv3_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device cpu --backend pplite +# 在GPU上使用Paddle Inference推理 +python infer.py --det_model ch_PP-OCRv3_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv3_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device gpu --backend paddle +# 在GPU上使用Paddle TensorRT推理 +python infer.py --det_model ch_PP-OCRv3_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv3_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device gpu --backend pptrt +# 在GPU上使用ONNX Runtime推理 +python infer.py --det_model ch_PP-OCRv3_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer 
--rec_model ch_PP-OCRv3_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device gpu --backend ort +# 在GPU上使用Nvidia TensorRT推理 +python infer.py --det_model ch_PP-OCRv3_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv3_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device gpu --backend trt + +# 同时, FastDeploy提供文字检测,文字分类,文字识别三个模型的单独推理, +# 有需要的用户, 请准备合适的图片, 同时根据自己的需求, 参考infer.py来配置自定义硬件与推理后端. + +# 在CPU上,单独使用文字检测模型部署 +python infer_det.py --det_model ch_PP-OCRv3_det_infer --image 12.jpg --device cpu + +# 在CPU上,单独使用文字方向分类模型部署 +python infer_cls.py --cls_model ch_ppocr_mobile_v2.0_cls_infer --image 12.jpg --device cpu + +# 在CPU上,单独使用文字识别模型部署 +python infer_rec.py --rec_model ch_PP-OCRv3_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device cpu + +``` + +运行完成可视化结果如下图所示 +
+ +
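+
+如需在自己的脚本中进一步处理预测结果, 可参考下面的示意片段(该片段不属于本目录的示例文件, 其中`boxes`、`text`、`rec_scores`等结果字段名仅作示意, 具体请以下文链接的预测结果相关文档为准):
+
+```python
+# 示意代码: 在CPU上用默认后端构建PP-OCRv3, 并逐条读取预测结果字段
+import os
+import cv2
+import fastdeploy as fd
+
+det = fd.vision.ocr.DBDetector(
+    os.path.join("ch_PP-OCRv3_det_infer", "inference.pdmodel"),
+    os.path.join("ch_PP-OCRv3_det_infer", "inference.pdiparams"))
+cls = fd.vision.ocr.Classifier(
+    os.path.join("ch_ppocr_mobile_v2.0_cls_infer", "inference.pdmodel"),
+    os.path.join("ch_ppocr_mobile_v2.0_cls_infer", "inference.pdiparams"))
+rec = fd.vision.ocr.Recognizer(
+    os.path.join("ch_PP-OCRv3_rec_infer", "inference.pdmodel"),
+    os.path.join("ch_PP-OCRv3_rec_infer", "inference.pdiparams"),
+    "ppocr_keys_v1.txt")
+ppocr = fd.vision.ocr.PPOCRv3(det_model=det, cls_model=cls, rec_model=rec)
+
+result = ppocr.predict(cv2.imread("12.jpg"))
+# 字段名为示意: boxes为检测框, text为识别文本, rec_scores为识别置信度
+for box, text, score in zip(result.boxes, result.text, result.rec_scores):
+    print(box, text, score)
+```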
+ +## 5. 部署示例选项说明 + +|参数|含义|默认值 +|---|---|---| +|--det_model|指定检测模型文件夹所在的路径|None| +|--cls_model|指定分类模型文件夹所在的路径|None| +|--rec_model|指定识别模型文件夹所在的路径|None| +|--rec_label_file|识别模型所需label所在的路径|None| +|--image|指定测试图片所在的路径|None| +|--device|指定即将运行的硬件类型,支持的值为`[cpu, gpu]`,当设置为cpu时,可运行在x86 cpu/arm cpu等cpu上|cpu| +|--device_id|使用gpu时, 指定设备号|0| +|--backend|部署模型时使用的后端, 支持的值为`[paddle,pptrt,pplite,ort,openvino,trt]` |paddle| + +关于如何通过FastDeploy使用更多不同的推理后端,以及如何使用不同的硬件,请参考文档:[如何切换模型推理后端引擎](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/faq/how_to_change_backend.md) + +## 6. 更多指南 + +### 6.1 如何使用Python部署PP-OCRv2系列模型. +本目录下的`infer.py`代码是以PP-OCRv3模型为例, 如果用户有使用PP-OCRv2的需求, 只需要按照下面所示的方式, 来创建PP-OCRv2并使用. + +```python +# 此行为创建PP-OCRv3模型的代码 +ppocr_v3 = fd.vision.ocr.PPOCRv3(det_model=det_model, cls_model=cls_model, rec_model=rec_model) +# 只需要将PPOCRv3改为PPOCRv2,即可创造PPOCRv2模型, 同时, 后续的接口均使用ppocr_v2来调用 +ppocr_v2 = fd.vision.ocr.PPOCRv2(det_model=det_model, cls_model=cls_model, rec_model=rec_model) + +# 如果用户在部署PP-OCRv2时, 需要使用TensorRT推理, 还需要改动Rec模型的TensorRT的输入shape. +# 建议如下修改, 需要把 H 维度改为32, W 纬度按需修改. +rec_option.set_trt_input_shape("x", [1, 3, 32, 10], + [args.rec_bs, 3, 32, 320], + [args.rec_bs, 3, 32, 2304]) +``` + +### 6.2 如何在PP-OCRv2/v3系列模型中, 关闭文字方向分类器的使用. + +在PP-OCRv3/v2中, 文字方向分类器是可选的, 用户可以按照以下方式, 来决定自己是否使用方向分类器. +```python +# 使用 Cls 模型 +ppocr_v3 = fd.vision.ocr.PPOCRv3(det_model=det_model, cls_model=cls_model, rec_model=rec_model) + +# 不使用 Cls 模型 +ppocr_v3 = fd.vision.ocr.PPOCRv3(det_model=det_model, cls_model=None, rec_model=rec_model) +``` +### 6.3 如何修改前后处理超参数. +在示例代码中, 我们展示出了修改前后处理超参数的接口,并设置为默认值,其中, FastDeploy提供的超参数的含义与文档[PaddleOCR推理模型参数解释](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_ch/inference_args.md)是相同的. 如果用户想要进行更多定制化的开发, 请阅读[PP-OCR系列 Python API查阅](https://www.paddlepaddle.org.cn/fastdeploy-api-doc/python/html/ocr.html) + +```python +# 设置检测模型的max_side_len +det_model.preprocessor.max_side_len = 960 +# 其他... +``` + +### 6.4 其他指南 +- [FastDeploy部署PaddleOCR模型概览](../../) +- [PP-OCRv3 C++部署](../cpp) +- [PP-OCRv3 C 部署](../c) +- [PP-OCRv3 C# 部署](../csharp) + +## 7. 常见问题 +- PaddleOCR能在FastDeploy支持的多种后端上推理,支持情况如下表所示, 如何切换后端, 详见文档[如何切换模型推理后端引擎](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/faq/how_to_change_backend.md) + +|硬件类型|支持的后端| +|:---:|:---:| +|X86 CPU| Paddle Inference, ONNX Runtime, OpenVINO | +|ARM CPU| Paddle Lite | +|飞腾 CPU| ONNX Runtime | +|NVIDIA GPU| Paddle Inference, ONNX Runtime, TensorRT | + +- [如何将模型预测结果转为numpy格式](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/faq/vision_result_related_problems.md) +- [Intel GPU(独立显卡/集成显卡)的使用](https://github.com/PaddlePaddle/FastDeploy/blob/develop/tutorials/intel_gpu/README.md) +- [编译CPU部署库](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install/cpu.md) +- [编译GPU部署库](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install/gpu.md) +- [编译Jetson部署库](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install/jetson.md) diff --git a/deploy/fastdeploy/cpu-gpu/python/infer.py b/deploy/fastdeploy/cpu-gpu/python/infer.py new file mode 100755 index 0000000000000000000000000000000000000000..8eac84599897b8f2c8142151235d88714c934311 --- /dev/null +++ b/deploy/fastdeploy/cpu-gpu/python/infer.py @@ -0,0 +1,218 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import fastdeploy as fd +import cv2 +import os + + +def parse_arguments(): + import argparse + import ast + parser = argparse.ArgumentParser() + parser.add_argument( + "--det_model", required=True, help="Path of Detection model of PPOCR.") + parser.add_argument( + "--cls_model", + required=True, + help="Path of Classification model of PPOCR.") + parser.add_argument( + "--rec_model", + required=True, + help="Path of Recognization model of PPOCR.") + parser.add_argument( + "--rec_label_file", + required=True, + help="Path of Recognization model of PPOCR.") + parser.add_argument( + "--image", type=str, required=True, help="Path of test image file.") + parser.add_argument( + "--device", + type=str, + default='cpu', + help="Type of inference device, support 'cpu' or 'gpu'.") + parser.add_argument( + "--device_id", + type=int, + default=0, + help="Define which GPU card used to run model.") + parser.add_argument( + "--cls_bs", + type=int, + default=1, + help="Classification model inference batch size.") + parser.add_argument( + "--rec_bs", + type=int, + default=6, + help="Recognition model inference batch size") + parser.add_argument( + "--backend", + type=str, + default="default", + help="Type of inference backend, support ort/trt/paddle/openvino, default 'openvino' for cpu, 'tensorrt' for gpu" + ) + + return parser.parse_args() + + +def build_option(args): + + det_option = fd.RuntimeOption() + cls_option = fd.RuntimeOption() + rec_option = fd.RuntimeOption() + + if args.device.lower() == "gpu": + det_option.use_gpu(args.device_id) + cls_option.use_gpu(args.device_id) + rec_option.use_gpu(args.device_id) + + if args.backend.lower() == "trt": + assert args.device.lower( + ) == "gpu", "TensorRT backend require inference on device GPU." + det_option.use_trt_backend() + cls_option.use_trt_backend() + rec_option.use_trt_backend() + + # If use TRT backend, the dynamic shape will be set as follow. + # We recommend that users set the length and height of the detection model to a multiple of 32. + # We also recommend that users set the Trt input shape as follow. + det_option.set_trt_input_shape("x", [1, 3, 64, 64], [1, 3, 640, 640], + [1, 3, 960, 960]) + cls_option.set_trt_input_shape("x", [1, 3, 48, 10], + [args.cls_bs, 3, 48, 320], + [args.cls_bs, 3, 48, 1024]) + rec_option.set_trt_input_shape("x", [1, 3, 48, 10], + [args.rec_bs, 3, 48, 320], + [args.rec_bs, 3, 48, 2304]) + + # Users could save TRT cache file to disk as follow. + det_option.set_trt_cache_file(args.det_model + "/det_trt_cache.trt") + cls_option.set_trt_cache_file(args.cls_model + "/cls_trt_cache.trt") + rec_option.set_trt_cache_file(args.rec_model + "/rec_trt_cache.trt") + + elif args.backend.lower() == "pptrt": + assert args.device.lower( + ) == "gpu", "Paddle-TensorRT backend require inference on device GPU." 
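+ # Note: enable_trt switches Paddle Inference to its TensorRT subgraph engine,
+ # and collect_trt_shape lets it gather dynamic-shape ranges at runtime; the
+ # explicit set_trt_input_shape calls below still declare the expected ranges.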
+ det_option.use_paddle_infer_backend() + det_option.paddle_infer_option.collect_trt_shape = True + det_option.paddle_infer_option.enable_trt = True + + cls_option.use_paddle_infer_backend() + cls_option.paddle_infer_option.collect_trt_shape = True + cls_option.paddle_infer_option.enable_trt = True + + rec_option.use_paddle_infer_backend() + rec_option.paddle_infer_option.collect_trt_shape = True + rec_option.paddle_infer_option.enable_trt = True + + # If use TRT backend, the dynamic shape will be set as follow. + # We recommend that users set the length and height of the detection model to a multiple of 32. + # We also recommend that users set the Trt input shape as follow. + det_option.set_trt_input_shape("x", [1, 3, 64, 64], [1, 3, 640, 640], + [1, 3, 960, 960]) + cls_option.set_trt_input_shape("x", [1, 3, 48, 10], + [args.cls_bs, 3, 48, 320], + [args.cls_bs, 3, 48, 1024]) + rec_option.set_trt_input_shape("x", [1, 3, 48, 10], + [args.rec_bs, 3, 48, 320], + [args.rec_bs, 3, 48, 2304]) + + # Users could save TRT cache file to disk as follow. + det_option.set_trt_cache_file(args.det_model) + cls_option.set_trt_cache_file(args.cls_model) + rec_option.set_trt_cache_file(args.rec_model) + + elif args.backend.lower() == "ort": + det_option.use_ort_backend() + cls_option.use_ort_backend() + rec_option.use_ort_backend() + + elif args.backend.lower() == "paddle": + det_option.use_paddle_infer_backend() + cls_option.use_paddle_infer_backend() + rec_option.use_paddle_infer_backend() + + elif args.backend.lower() == "openvino": + assert args.device.lower( + ) == "cpu", "OpenVINO backend require inference on device CPU." + det_option.use_openvino_backend() + cls_option.use_openvino_backend() + rec_option.use_openvino_backend() + + elif args.backend.lower() == "pplite": + assert args.device.lower( + ) == "cpu", "Paddle Lite backend require inference on device CPU." + det_option.use_lite_backend() + cls_option.use_lite_backend() + rec_option.use_lite_backend() + + return det_option, cls_option, rec_option + + +args = parse_arguments() + +det_model_file = os.path.join(args.det_model, "inference.pdmodel") +det_params_file = os.path.join(args.det_model, "inference.pdiparams") + +cls_model_file = os.path.join(args.cls_model, "inference.pdmodel") +cls_params_file = os.path.join(args.cls_model, "inference.pdiparams") + +rec_model_file = os.path.join(args.rec_model, "inference.pdmodel") +rec_params_file = os.path.join(args.rec_model, "inference.pdiparams") +rec_label_file = args.rec_label_file + +det_option, cls_option, rec_option = build_option(args) + +det_model = fd.vision.ocr.DBDetector( + det_model_file, det_params_file, runtime_option=det_option) + +cls_model = fd.vision.ocr.Classifier( + cls_model_file, cls_params_file, runtime_option=cls_option) + +rec_model = fd.vision.ocr.Recognizer( + rec_model_file, rec_params_file, rec_label_file, runtime_option=rec_option) + +# Parameters settings for pre and post processing of Det/Cls/Rec Models. +# All parameters are set to default values. +det_model.preprocessor.max_side_len = 960 +det_model.postprocessor.det_db_thresh = 0.3 +det_model.postprocessor.det_db_box_thresh = 0.6 +det_model.postprocessor.det_db_unclip_ratio = 1.5 +det_model.postprocessor.det_db_score_mode = "slow" +det_model.postprocessor.use_dilation = False +cls_model.postprocessor.cls_thresh = 0.9 + +# Create PP-OCRv3, if cls_model is not needed, just set cls_model=None . 
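+# As described in the README above, PP-OCRv2 models can be wired up the same
+# way by replacing fd.vision.ocr.PPOCRv3 with fd.vision.ocr.PPOCRv2.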
+ppocr_v3 = fd.vision.ocr.PPOCRv3( + det_model=det_model, cls_model=cls_model, rec_model=rec_model) + +# Set inference batch size for cls model and rec model, the value could be -1 and 1 to positive infinity. +# When inference batch size is set to -1, it means that the inference batch size +# of the cls and rec models will be the same as the number of boxes detected by the det model. +ppocr_v3.cls_batch_size = args.cls_bs +ppocr_v3.rec_batch_size = args.rec_bs + +# Read the input image +im = cv2.imread(args.image) + +# Predict and reutrn the results +result = ppocr_v3.predict(im) + +print(result) + +# Visuliaze the results. +vis_im = fd.vision.vis_ppocr(im, result) +cv2.imwrite("visualized_result.jpg", vis_im) +print("Visualized result save in ./visualized_result.jpg") diff --git a/deploy/fastdeploy/cpu-gpu/python/infer_cls.py b/deploy/fastdeploy/cpu-gpu/python/infer_cls.py new file mode 100755 index 0000000000000000000000000000000000000000..b34868daef9e46ae59b5cb60fce1ff66fdf1bfd2 --- /dev/null +++ b/deploy/fastdeploy/cpu-gpu/python/infer_cls.py @@ -0,0 +1,77 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import fastdeploy as fd +import cv2 +import os + + +def parse_arguments(): + import argparse + import ast + parser = argparse.ArgumentParser() + parser.add_argument( + "--cls_model", + required=True, + help="Path of Classification model of PPOCR.") + parser.add_argument( + "--image", type=str, required=True, help="Path of test image file.") + parser.add_argument( + "--device", + type=str, + default='cpu', + help="Type of inference device, support 'cpu', 'kunlunxin' or 'gpu'.") + parser.add_argument( + "--device_id", + type=int, + default=0, + help="Define which GPU card used to run model.") + return parser.parse_args() + + +def build_option(args): + + cls_option = fd.RuntimeOption() + + if args.device.lower() == "gpu": + cls_option.use_gpu(args.device_id) + + return cls_option + + +args = parse_arguments() + +cls_model_file = os.path.join(args.cls_model, "inference.pdmodel") +cls_params_file = os.path.join(args.cls_model, "inference.pdiparams") + +# Set the runtime option +cls_option = build_option(args) + +# Create the cls_model +cls_model = fd.vision.ocr.Classifier( + cls_model_file, cls_params_file, runtime_option=cls_option) + +# Set the postprocessing parameters +cls_model.postprocessor.cls_thresh = 0.9 + +# Read the image +im = cv2.imread(args.image) + +# Predict and return the results +result = cls_model.predict(im) + +# User can infer a batch of images by following code. +# result = cls_model.batch_predict([im]) + +print(result) diff --git a/deploy/fastdeploy/cpu-gpu/python/infer_det.py b/deploy/fastdeploy/cpu-gpu/python/infer_det.py new file mode 100755 index 0000000000000000000000000000000000000000..7a7f5a07b7f57932ddc2aa33b4624f0399691bb0 --- /dev/null +++ b/deploy/fastdeploy/cpu-gpu/python/infer_det.py @@ -0,0 +1,82 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import fastdeploy as fd +import cv2 +import os + + +def parse_arguments(): + import argparse + import ast + parser = argparse.ArgumentParser() + parser.add_argument( + "--det_model", required=True, help="Path of Detection model of PPOCR.") + parser.add_argument( + "--image", type=str, required=True, help="Path of test image file.") + parser.add_argument( + "--device", + type=str, + default='cpu', + help="Type of inference device, support 'cpu', 'kunlunxin' or 'gpu'.") + parser.add_argument( + "--device_id", + type=int, + default=0, + help="Define which GPU card used to run model.") + return parser.parse_args() + + +def build_option(args): + + det_option = fd.RuntimeOption() + + if args.device.lower() == "gpu": + det_option.use_gpu(args.device_id) + + return det_option + + +args = parse_arguments() + +det_model_file = os.path.join(args.det_model, "inference.pdmodel") +det_params_file = os.path.join(args.det_model, "inference.pdiparams") + +# Set the runtime option +det_option = build_option(args) + +# Create the det_model +det_model = fd.vision.ocr.DBDetector( + det_model_file, det_params_file, runtime_option=det_option) + +# Set the preporcessing parameters +det_model.preprocessor.max_side_len = 960 +det_model.postprocessor.det_db_thresh = 0.3 +det_model.postprocessor.det_db_box_thresh = 0.6 +det_model.postprocessor.det_db_unclip_ratio = 1.5 +det_model.postprocessor.det_db_score_mode = "slow" +det_model.postprocessor.use_dilation = False + +# Read the image +im = cv2.imread(args.image) + +# Predict and return the results +result = det_model.predict(im) + +print(result) + +# Visualize the results +vis_im = fd.vision.vis_ppocr(im, result) +cv2.imwrite("visualized_result.jpg", vis_im) +print("Visualized result save in ./visualized_result.jpg") diff --git a/deploy/fastdeploy/cpu-gpu/python/infer_rec.py b/deploy/fastdeploy/cpu-gpu/python/infer_rec.py new file mode 100755 index 0000000000000000000000000000000000000000..6f9e03b20ec3a3d4e382a7b237564cc496e09c25 --- /dev/null +++ b/deploy/fastdeploy/cpu-gpu/python/infer_rec.py @@ -0,0 +1,79 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
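+
+# Standalone recognizer example: loads a single Recognizer model and prints
+# the recognition result predicted for one input image.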
+ +import fastdeploy as fd +import cv2 +import os + + +def parse_arguments(): + import argparse + import ast + parser = argparse.ArgumentParser() + parser.add_argument( + "--rec_model", + required=True, + help="Path of Recognization model of PPOCR.") + parser.add_argument( + "--rec_label_file", + required=True, + help="Path of Recognization model of PPOCR.") + parser.add_argument( + "--image", type=str, required=True, help="Path of test image file.") + parser.add_argument( + "--device", + type=str, + default='cpu', + help="Type of inference device, support 'cpu', 'kunlunxin' or 'gpu'.") + parser.add_argument( + "--device_id", + type=int, + default=0, + help="Define which GPU card used to run model.") + return parser.parse_args() + + +def build_option(args): + + rec_option = fd.RuntimeOption() + + if args.device.lower() == "gpu": + rec_option.use_gpu(args.device_id) + + return rec_option + + +args = parse_arguments() + +rec_model_file = os.path.join(args.rec_model, "inference.pdmodel") +rec_params_file = os.path.join(args.rec_model, "inference.pdiparams") +rec_label_file = args.rec_label_file + +# Set the runtime option +rec_option = build_option(args) + +# Create the rec_model +rec_model = fd.vision.ocr.Recognizer( + rec_model_file, rec_params_file, rec_label_file, runtime_option=rec_option) + +# Read the image +im = cv2.imread(args.image) + +# Predict and return the result +result = rec_model.predict(im) + +# User can infer a batch of images by following code. +# result = rec_model.batch_predict([im]) + +print(result) diff --git a/deploy/fastdeploy/kunlunxin/README.md b/deploy/fastdeploy/kunlunxin/README.md new file mode 100644 index 0000000000000000000000000000000000000000..16487674c9d503ed5db3c6422938dfe1993074ef --- /dev/null +++ b/deploy/fastdeploy/kunlunxin/README.md @@ -0,0 +1,32 @@ +[English](README.md) | 简体中文 + +# PaddleOCR 在昆仑芯上部署方案-FastDeploy + +## 1. 说明 +PaddleOCR支持利用FastDeploy在昆仑芯片上部署模型. + +支持如下芯片的部署 +- 昆仑 818-100(推理芯片) +- 昆仑 818-300(训练芯片) + +支持如下芯片的设备 +- K100/K200 昆仑 AI 加速卡 +- R200 昆仑芯 AI 加速卡 + +## 2. 支持的PaddleOCR推理模型 + +下表中的推理模型为FastDeploy测试过的模型, 下载链接由PaddleOCR模型库提供, +更多的模型, 详见[PP-OCR系列模型列表](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6/doc/doc_ch/models_list.md), 欢迎用户尝试. 
+ +| PaddleOCR版本 | 文本框检测 | 方向分类模型 | 文字识别 |字典文件| 说明 | +|:----|:----|:----|:----|:----|:--------| +| ch_PP-OCRv3[推荐] |[ch_PP-OCRv3_det](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) | [ch_ppocr_mobile_v2.0_cls](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) | [ch_PP-OCRv3_rec](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) | [ppocr_keys_v1.txt](https://bj.bcebos.com/paddlehub/fastdeploy/ppocr_keys_v1.txt) | OCRv3系列原始超轻量模型,支持中英文、多语种文本检测 | +| en_PP-OCRv3[推荐] |[en_PP-OCRv3_det](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar) | [ch_ppocr_mobile_v2.0_cls](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) | [en_PP-OCRv3_rec](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) | [en_dict.txt](https://bj.bcebos.com/paddlehub/fastdeploy/en_dict.txt) | OCRv3系列原始超轻量模型,支持英文与数字识别,除检测模型和识别模型的训练数据与中文模型不同以外,无其他区别 | +| ch_PP-OCRv2 |[ch_PP-OCRv2_det](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) | [ch_ppocr_mobile_v2.0_cls](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) | [ch_PP-OCRv2_rec](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) | [ppocr_keys_v1.txt](https://bj.bcebos.com/paddlehub/fastdeploy/ppocr_keys_v1.txt) | OCRv2系列原始超轻量模型,支持中英文、多语种文本检测 | +| ch_PP-OCRv2_mobile |[ch_ppocr_mobile_v2.0_det](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) | [ch_ppocr_mobile_v2.0_cls](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) | [ch_ppocr_mobile_v2.0_rec](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) | [ppocr_keys_v1.txt](https://bj.bcebos.com/paddlehub/fastdeploy/ppocr_keys_v1.txt) | OCRv2系列原始超轻量模型,支持中英文、多语种文本检测,比PPOCRv2更加轻量 | +| ch_PP-OCRv2_server |[ch_ppocr_server_v2.0_det](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) | [ch_ppocr_mobile_v2.0_cls](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) | [ch_ppocr_server_v2.0_rec](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) |[ppocr_keys_v1.txt](https://bj.bcebos.com/paddlehub/fastdeploy/ppocr_keys_v1.txt) | OCRv2服务器系列模型, 支持中英文、多语种文本检测,比超轻量模型更大,但效果更好| + + +## 3. 
详细部署的部署示例 +- [Python部署](python) +- [C++部署](cpp) diff --git a/deploy/fastdeploy/kunlunxin/cpp/CMakeLists.txt b/deploy/fastdeploy/kunlunxin/cpp/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..93540a7e83e05228bcb38042a91166c858c95137 --- /dev/null +++ b/deploy/fastdeploy/kunlunxin/cpp/CMakeLists.txt @@ -0,0 +1,14 @@ +PROJECT(infer_demo C CXX) +CMAKE_MINIMUM_REQUIRED (VERSION 3.10) + +# 指定下载解压后的fastdeploy库路径 +option(FASTDEPLOY_INSTALL_DIR "Path of downloaded fastdeploy sdk.") + +include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake) + +# 添加FastDeploy依赖头文件 +include_directories(${FASTDEPLOY_INCS}) + +add_executable(infer_demo ${PROJECT_SOURCE_DIR}/infer.cc) +# 添加FastDeploy库依赖 +target_link_libraries(infer_demo ${FASTDEPLOY_LIBS}) diff --git a/deploy/fastdeploy/kunlunxin/cpp/README.md b/deploy/fastdeploy/kunlunxin/cpp/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3725a807e1ea2c76cfe4f059cc207533a433e1d8 --- /dev/null +++ b/deploy/fastdeploy/kunlunxin/cpp/README.md @@ -0,0 +1,58 @@ +[English](README.md) | 简体中文 +# PP-OCRv3 昆仑芯XPU C++部署示例 + +本目录下提供`infer.cc`, 供用户完成PP-OCRv3在昆仑芯XPU上的部署. + +## 1. 部署环境准备 +在部署前,需自行编译基于昆仑芯XPU的预测库,参考文档[昆仑芯XPU部署环境编译安装](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install#自行编译安装) + +## 2.部署模型准备 +在部署前, 请准备好您所需要运行的推理模型, 您可以在[FastDeploy支持的PaddleOCR模型列表](../README.md)中下载所需模型. + +## 3.运行部署示例 +``` +# 下载部署示例代码 +git clone https://github.com/PaddlePaddle/FastDeploy.git +cd FastDeploy/examples/vision/ocr/PP-OCR/kunlunxin/cpp + +# 如果您希望从PaddleOCR下载示例代码,请运行 +git clone https://github.com/PaddlePaddle/PaddleOCR.git +# 注意:如果当前分支找不到下面的fastdeploy测试代码,请切换到dygraph分支 +git checkout dygraph +cd PaddleOCR/deploy/fastdeploy/kunlunxin/cpp + +mkdir build +cd build +# 使用编译完成的FastDeploy库编译infer_demo +cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-kunlunxin +make -j + +# 下载PP-OCRv3文字检测模型 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar +tar -xvf ch_PP-OCRv3_det_infer.tar +# 下载文字方向分类器模型 +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar +tar -xvf ch_ppocr_mobile_v2.0_cls_infer.tar +# 下载PP-OCRv3文字识别模型 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar +tar -xvf ch_PP-OCRv3_rec_infer.tar + +# 下载预测图片与字典文件 +wget https://gitee.com/paddlepaddle/PaddleOCR/raw/release/2.6/doc/imgs/12.jpg +wget https://gitee.com/paddlepaddle/PaddleOCR/raw/release/2.6/ppocr/utils/ppocr_keys_v1.txt + +./infer_demo ./ch_PP-OCRv3_det_infer ./ch_ppocr_mobile_v2.0_cls_infer ./ch_PP-OCRv3_rec_infer ./ppocr_keys_v1.txt ./12.jpg + +``` + +运行完成可视化结果如下图所示 + +
+ +
+ +## 4. 更多指南 +- [PP-OCR系列 C++ API查阅](https://www.paddlepaddle.org.cn/fastdeploy-api-doc/cpp/html/namespacefastdeploy_1_1vision_1_1ocr.html) +- [FastDeploy部署PaddleOCR模型概览](../../) +- [PP-OCRv3 Python部署](../python) +- 如果用户想要调整前后处理超参数、单独使用文字检测识别模型、使用其他模型等,更多详细文档与说明请参考[PP-OCR系列在CPU/GPU上的部署](../../cpu-gpu/cpp/README.md) diff --git a/deploy/fastdeploy/kunlunxin/cpp/infer.cc b/deploy/fastdeploy/kunlunxin/cpp/infer.cc new file mode 100644 index 0000000000000000000000000000000000000000..3342b53d16382c683d913d172423f790351ddd3b --- /dev/null +++ b/deploy/fastdeploy/kunlunxin/cpp/infer.cc @@ -0,0 +1,115 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision.h" +#ifdef WIN32 +const char sep = '\\'; +#else +const char sep = '/'; +#endif + +void KunlunXinInfer(const std::string &det_model_dir, + const std::string &cls_model_dir, + const std::string &rec_model_dir, + const std::string &rec_label_file, + const std::string &image_file) { + auto det_model_file = det_model_dir + sep + "inference.pdmodel"; + auto det_params_file = det_model_dir + sep + "inference.pdiparams"; + + auto cls_model_file = cls_model_dir + sep + "inference.pdmodel"; + auto cls_params_file = cls_model_dir + sep + "inference.pdiparams"; + + auto rec_model_file = rec_model_dir + sep + "inference.pdmodel"; + auto rec_params_file = rec_model_dir + sep + "inference.pdiparams"; + + auto option = fastdeploy::RuntimeOption(); + option.UseKunlunXin(); + + auto det_option = option; + auto cls_option = option; + auto rec_option = option; + + // The cls and rec model can inference a batch of images now. + // User could initialize the inference batch size and set them after create + // PP-OCR model. + int cls_batch_size = 1; + int rec_batch_size = 6; + + auto det_model = fastdeploy::vision::ocr::DBDetector( + det_model_file, det_params_file, det_option); + auto cls_model = fastdeploy::vision::ocr::Classifier( + cls_model_file, cls_params_file, cls_option); + auto rec_model = fastdeploy::vision::ocr::Recognizer( + rec_model_file, rec_params_file, rec_label_file, rec_option); + + assert(det_model.Initialized()); + assert(cls_model.Initialized()); + assert(rec_model.Initialized()); + + // The classification model is optional, so the PP-OCR can also be connected + // in series as follows + // auto ppocr_v3 = fastdeploy::pipeline::PPOCRv3(&det_model, &rec_model); + auto ppocr_v3 = + fastdeploy::pipeline::PPOCRv3(&det_model, &cls_model, &rec_model); + + // Set inference batch size for cls model and rec model, the value could be -1 + // and 1 to positive infinity. + // When inference batch size is set to -1, it means that the inference batch + // size + // of the cls and rec models will be the same as the number of boxes detected + // by the det model. + ppocr_v3.SetClsBatchSize(cls_batch_size); + ppocr_v3.SetRecBatchSize(rec_batch_size); + + if (!ppocr_v3.Initialized()) { + std::cerr << "Failed to initialize PP-OCR." 
<< std::endl; + return; + } + + auto im = cv::imread(image_file); + auto im_bak = im.clone(); + + fastdeploy::vision::OCRResult result; + if (!ppocr_v3.Predict(&im, &result)) { + std::cerr << "Failed to predict." << std::endl; + return; + } + + std::cout << result.Str() << std::endl; + + auto vis_im = fastdeploy::vision::VisOcr(im_bak, result); + cv::imwrite("vis_result.jpg", vis_im); + std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl; +} + +int main(int argc, char *argv[]) { + if (argc < 6) { + std::cout << "Usage: infer_demo path/to/det_model path/to/cls_model " + "path/to/rec_model path/to/rec_label_file path/to/image " + "e.g ./infer_demo ./ch_PP-OCRv3_det_infer " + "./ch_ppocr_mobile_v2.0_cls_infer ./ch_PP-OCRv3_rec_infer " + "./ppocr_keys_v1.txt ./12.jpg" + << std::endl; + return -1; + } + + std::string det_model_dir = argv[1]; + std::string cls_model_dir = argv[2]; + std::string rec_model_dir = argv[3]; + std::string rec_label_file = argv[4]; + std::string test_image = argv[5]; + KunlunXinInfer(det_model_dir, cls_model_dir, rec_model_dir, rec_label_file, + test_image); + return 0; +} diff --git a/deploy/fastdeploy/kunlunxin/python/README.md b/deploy/fastdeploy/kunlunxin/python/README.md new file mode 100644 index 0000000000000000000000000000000000000000..724fad27157e15914685d55ab38be933c070cf35 --- /dev/null +++ b/deploy/fastdeploy/kunlunxin/python/README.md @@ -0,0 +1,54 @@ +[English](README.md) | 简体中文 +# PP-OCRv3 昆仑芯XPU Python部署示例 + +本目录下提供`infer.py`, 供用户完成PP-OCRv3在昆仑芯XPU上的部署. + +## 1. 部署环境准备 +在部署前,需自行编译基于昆仑XPU的FastDeploy python wheel包并安装,参考文档[昆仑芯XPU部署环境](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install#自行编译安装) + +## 2.部署模型准备 +在部署前, 请准备好您所需要运行的推理模型, 您可以在[FastDeploy支持的PaddleOCR模型列表](../README.md)中下载所需模型. + +## 3.运行部署示例 +``` +# 下载部署示例代码 +git clone https://github.com/PaddlePaddle/FastDeploy.git +cd FastDeploy/examples/vision/ocr/PP-OCR/kunlunxin/python + +# 如果您希望从PaddleOCR下载示例代码,请运行 +git clone https://github.com/PaddlePaddle/PaddleOCR.git +# 注意:如果当前分支找不到下面的fastdeploy测试代码,请切换到dygraph分支 +git checkout dygraph +cd PaddleOCR/deploy/fastdeploy/kunlunxin/python + +# 下载PP-OCRv3文字检测模型 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar +tar -xvf ch_PP-OCRv3_det_infer.tar +# 下载文字方向分类器模型 +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar +tar -xvf ch_ppocr_mobile_v2.0_cls_infer.tar +# 下载PP-OCRv3文字识别模型 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar +tar -xvf ch_PP-OCRv3_rec_infer.tar + +# 下载预测图片与字典文件 +wget https://gitee.com/paddlepaddle/PaddleOCR/raw/release/2.6/doc/imgs/12.jpg +wget https://gitee.com/paddlepaddle/PaddleOCR/raw/release/2.6/ppocr/utils/ppocr_keys_v1.txt + +python infer.py --det_model ch_PP-OCRv3_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv3_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg +``` + +运行完成可视化结果如下图所示 + +
+ +
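在上面示例的基础上,如需在代码中直接遍历识别结果,可参考下面的示意(仅为示意:假设 `OCRResult` 按 FastDeploy 视觉结果说明提供 `boxes`、`text`、`rec_scores` 字段,请以实际安装版本的 API 为准):

```python
# 接在 infer.py 中 result = ppocr_v3.predict(im) 之后(示意代码)
for box, text, score in zip(result.boxes, result.text, result.rec_scores):
    # box 为 8 个整数组成的四个角点坐标, text 为识别文本, score 为识别置信度
    print(box, text, score)
```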
+ +## 4. 更多指南 +- [PP-OCR系列 Python API查阅](https://www.paddlepaddle.org.cn/fastdeploy-api-doc/python/html/ocr.html) +- [FastDeploy部署PaddleOCR模型概览](../../) +- [PP-OCRv3 C++部署](../cpp) +- 如果用户想要调整前后处理超参数、单独使用文字检测识别模型、使用其他模型等,更多详细文档与说明请参考[PP-OCR系列在CPU/GPU上的部署](../../cpu-gpu/python/README.md) + +## 5. 常见问题 +- [如何将视觉模型预测结果转为numpy格式](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/faq/vision_result_related_problems.md) diff --git a/deploy/fastdeploy/kunlunxin/python/infer.py b/deploy/fastdeploy/kunlunxin/python/infer.py new file mode 100755 index 0000000000000000000000000000000000000000..4780df832c9fb2c92a6ba81519914d99b4d81aad --- /dev/null +++ b/deploy/fastdeploy/kunlunxin/python/infer.py @@ -0,0 +1,111 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import fastdeploy as fd +import cv2 +import os + + +def parse_arguments(): + import argparse + import ast + parser = argparse.ArgumentParser() + parser.add_argument( + "--det_model", required=True, help="Path of Detection model of PPOCR.") + parser.add_argument( + "--cls_model", + required=True, + help="Path of Classification model of PPOCR.") + parser.add_argument( + "--rec_model", + required=True, + help="Path of Recognization model of PPOCR.") + parser.add_argument( + "--rec_label_file", + required=True, + help="Path of Recognization model of PPOCR.") + parser.add_argument( + "--image", type=str, required=True, help="Path of test image file.") + parser.add_argument( + "--cls_bs", + type=int, + default=1, + help="Classification model inference batch size.") + parser.add_argument( + "--rec_bs", + type=int, + default=6, + help="Recognition model inference batch size") + return parser.parse_args() + + +def build_option(args): + + det_option = fd.RuntimeOption() + cls_option = fd.RuntimeOption() + rec_option = fd.RuntimeOption() + + det_option.use_kunlunxin() + cls_option.use_kunlunxin() + rec_option.use_kunlunxin() + + return det_option, cls_option, rec_option + + +args = parse_arguments() + +det_model_file = os.path.join(args.det_model, "inference.pdmodel") +det_params_file = os.path.join(args.det_model, "inference.pdiparams") + +cls_model_file = os.path.join(args.cls_model, "inference.pdmodel") +cls_params_file = os.path.join(args.cls_model, "inference.pdiparams") + +rec_model_file = os.path.join(args.rec_model, "inference.pdmodel") +rec_params_file = os.path.join(args.rec_model, "inference.pdiparams") +rec_label_file = args.rec_label_file + +det_option, cls_option, rec_option = build_option(args) + +det_model = fd.vision.ocr.DBDetector( + det_model_file, det_params_file, runtime_option=det_option) + +cls_model = fd.vision.ocr.Classifier( + cls_model_file, cls_params_file, runtime_option=cls_option) + +rec_model = fd.vision.ocr.Recognizer( + rec_model_file, rec_params_file, rec_label_file, runtime_option=rec_option) + +# Create PP-OCRv3, if cls_model is not needed, +# just set cls_model=None . 
+ppocr_v3 = fd.vision.ocr.PPOCRv3( + det_model=det_model, cls_model=cls_model, rec_model=rec_model) + +# Set inference batch size for cls model and rec model, the value could be -1 and 1 to positive infinity. +# When inference batch size is set to -1, it means that the inference batch size +# of the cls and rec models will be the same as the number of boxes detected by the det model. +ppocr_v3.cls_batch_size = args.cls_bs +ppocr_v3.rec_batch_size = args.rec_bs + +# Prepare image. +im = cv2.imread(args.image) + +# Print the results. +result = ppocr_v3.predict(im) + +print(result) + +# Visuliaze the output. +vis_im = fd.vision.vis_ppocr(im, result) +cv2.imwrite("visualized_result.jpg", vis_im) +print("Visualized result save in ./visualized_result.jpg") diff --git a/deploy/fastdeploy/rockchip/README.md b/deploy/fastdeploy/rockchip/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b38f7f89631c4903751e879e8d3c434ff0e47051 --- /dev/null +++ b/deploy/fastdeploy/rockchip/README.md @@ -0,0 +1,23 @@ +[English](README.md) | 简体中文 + +# PaddleOCR 模型在RKNPU2上部署方案-FastDeploy + +## 1. 说明 +PaddleOCR支持通过FastDeploy在RKNPU2上部署相关模型. + +## 2. 支持模型列表 + +下表中的模型下载链接由PaddleOCR模型库提供, 详见[PP-OCR系列模型列表](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6/doc/doc_ch/models_list.md) + +| PaddleOCR版本 | 文本框检测 | 方向分类模型 | 文字识别 |字典文件| 说明 | +|:----|:----|:----|:----|:----|:--------| +| ch_PP-OCRv3[推荐] |[ch_PP-OCRv3_det](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) | [ch_ppocr_mobile_v2.0_cls](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) | [ch_PP-OCRv3_rec](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) | [ppocr_keys_v1.txt](https://bj.bcebos.com/paddlehub/fastdeploy/ppocr_keys_v1.txt) | OCRv3系列原始超轻量模型,支持中英文、多语种文本检测 | +| en_PP-OCRv3[推荐] |[en_PP-OCRv3_det](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar) | [ch_ppocr_mobile_v2.0_cls](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) | [en_PP-OCRv3_rec](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) | [en_dict.txt](https://bj.bcebos.com/paddlehub/fastdeploy/en_dict.txt) | OCRv3系列原始超轻量模型,支持英文与数字识别,除检测模型和识别模型的训练数据与中文模型不同以外,无其他区别 | +| ch_PP-OCRv2 |[ch_PP-OCRv2_det](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) | [ch_ppocr_mobile_v2.0_cls](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) | [ch_PP-OCRv2_rec](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) | [ppocr_keys_v1.txt](https://bj.bcebos.com/paddlehub/fastdeploy/ppocr_keys_v1.txt) | OCRv2系列原始超轻量模型,支持中英文、多语种文本检测 | +| ch_PP-OCRv2_mobile |[ch_ppocr_mobile_v2.0_det](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) | [ch_ppocr_mobile_v2.0_cls](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) | [ch_ppocr_mobile_v2.0_rec](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) | [ppocr_keys_v1.txt](https://bj.bcebos.com/paddlehub/fastdeploy/ppocr_keys_v1.txt) | OCRv2系列原始超轻量模型,支持中英文、多语种文本检测,比PPOCRv2更加轻量 | +| ch_PP-OCRv2_server |[ch_ppocr_server_v2.0_det](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) | [ch_ppocr_mobile_v2.0_cls](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) | 
[ch_ppocr_server_v2.0_rec](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) |[ppocr_keys_v1.txt](https://bj.bcebos.com/paddlehub/fastdeploy/ppocr_keys_v1.txt) | OCRv2服务器系列模型, 支持中英文、多语种文本检测,比超轻量模型更大,但效果更好| + + +## 3. 详细部署的部署示例 +- [Python部署](python) +- [C++部署](cpp) diff --git a/deploy/fastdeploy/rockchip/cpp/CMakeLists.txt b/deploy/fastdeploy/rockchip/cpp/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..93540a7e83e05228bcb38042a91166c858c95137 --- /dev/null +++ b/deploy/fastdeploy/rockchip/cpp/CMakeLists.txt @@ -0,0 +1,14 @@ +PROJECT(infer_demo C CXX) +CMAKE_MINIMUM_REQUIRED (VERSION 3.10) + +# 指定下载解压后的fastdeploy库路径 +option(FASTDEPLOY_INSTALL_DIR "Path of downloaded fastdeploy sdk.") + +include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake) + +# 添加FastDeploy依赖头文件 +include_directories(${FASTDEPLOY_INCS}) + +add_executable(infer_demo ${PROJECT_SOURCE_DIR}/infer.cc) +# 添加FastDeploy库依赖 +target_link_libraries(infer_demo ${FASTDEPLOY_LIBS}) diff --git a/deploy/fastdeploy/rockchip/cpp/README.md b/deploy/fastdeploy/rockchip/cpp/README.md new file mode 100755 index 0000000000000000000000000000000000000000..f5fb212d94f2e30ed66b16bf8a7586cd8b82f022 --- /dev/null +++ b/deploy/fastdeploy/rockchip/cpp/README.md @@ -0,0 +1,128 @@ +[English](README_CN.md) | 简体中文 +# PP-OCRv3 RKNPU2 C++部署示例 + +本目录下提供`infer.cc`, 供用户完成PP-OCRv3在RKNPU2的部署. + + +## 1. 部署环境准备 +在部署前,需确认以下两个步骤 +- 1. 在部署前,需自行编译基于RKNPU2的预测库,参考文档[RKNPU2部署环境编译](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install#自行编译安装) +- 2. 同时请用户参考[FastDeploy RKNPU2资源导航](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install/rknpu2.md) + +## 2.部署模型准备 +在部署前, 请准备好您所需要运行的推理模型, 您可以在[FastDeploy支持的PaddleOCR模型列表](../README.md)中下载所需模型. +同时, 在RKNPU2上部署PP-OCR系列模型时,我们需要把Paddle的推理模型转为RKNN模型. +由于rknn_toolkit2工具暂不支持直接从Paddle直接转换为RKNN模型,因此我们需要先将Paddle推理模型转为ONNX模型, 最后转为RKNN模型, 示例如下. 
+ +```bash +# 下载PP-OCRv3文字检测模型 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar +tar -xvf ch_PP-OCRv3_det_infer.tar +# 下载文字方向分类器模型 +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar +tar -xvf ch_ppocr_mobile_v2.0_cls_infer.tar +# 下载PP-OCRv3文字识别模型 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar +tar -xvf ch_PP-OCRv3_rec_infer.tar + +# 请用户自行安装最新发布版本的paddle2onnx, 转换模型到ONNX格式的模型 +paddle2onnx --model_dir ch_PP-OCRv3_det_infer \ + --model_filename inference.pdmodel \ + --params_filename inference.pdiparams \ + --save_file ch_PP-OCRv3_det_infer/ch_PP-OCRv3_det_infer.onnx \ + --enable_dev_version True +paddle2onnx --model_dir ch_ppocr_mobile_v2.0_cls_infer \ + --model_filename inference.pdmodel \ + --params_filename inference.pdiparams \ + --save_file ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.onnx \ + --enable_dev_version True +paddle2onnx --model_dir ch_PP-OCRv3_rec_infer \ + --model_filename inference.pdmodel \ + --params_filename inference.pdiparams \ + --save_file ch_PP-OCRv3_rec_infer/ch_PP-OCRv3_rec_infer.onnx \ + --enable_dev_version True + +# 固定模型的输入shape +python -m paddle2onnx.optimize --input_model ch_PP-OCRv3_det_infer/ch_PP-OCRv3_det_infer.onnx \ + --output_model ch_PP-OCRv3_det_infer/ch_PP-OCRv3_det_infer.onnx \ + --input_shape_dict "{'x':[1,3,960,960]}" +python -m paddle2onnx.optimize --input_model ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.onnx \ + --output_model ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.onnx \ + --input_shape_dict "{'x':[1,3,48,192]}" +python -m paddle2onnx.optimize --input_model ch_PP-OCRv3_rec_infer/ch_PP-OCRv3_rec_infer.onnx \ + --output_model ch_PP-OCRv3_rec_infer/ch_PP-OCRv3_rec_infer.onnx \ + --input_shape_dict "{'x':[1,3,48,320]}" + +# 在rockchip/rknpu2_tools/目录下, 我们为用户提供了转换ONNX模型到RKNN模型的工具 +python rockchip/rknpu2_tools/export.py --config_path tools/rknpu2/config/ppocrv3_det.yaml \ + --target_platform rk3588 +python rockchip/rknpu2_tools/export.py --config_path tools/rknpu2/config/ppocrv3_rec.yaml \ + --target_platform rk3588 +python rockchip/rknpu2_tools/export.py --config_path tools/rknpu2/config/ppocrv3_cls.yaml \ + --target_platform rk3588 +``` + +## 3.运行部署示例 +在本目录执行如下命令即可完成编译测试,支持此模型需保证FastDeploy版本1.0.3以上(x.x.x>1.0.3), RKNN版本在1.4.1b22以上。 + +``` +# 下载部署示例代码 +git clone https://github.com/PaddlePaddle/FastDeploy.git +cd FastDeploy/examples/vision/ocr/PP-OCR/rockchip/cpp + +# 如果您希望从PaddleOCR下载示例代码,请运行 +git clone https://github.com/PaddlePaddle/PaddleOCR.git +# 注意:如果当前分支找不到下面的fastdeploy测试代码,请切换到dygraph分支 +git checkout dygraph +cd PaddleOCR/deploy/fastdeploy/rockchip/cpp + +mkdir build +cd build +# 使用编译完成的FastDeploy库编译infer_demo +cmake .. 
-DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-rockchip +make -j + +# 下载图片和字典文件 +wget https://gitee.com/paddlepaddle/PaddleOCR/raw/release/2.6/doc/imgs/12.jpg +wget https://gitee.com/paddlepaddle/PaddleOCR/raw/release/2.6/ppocr/utils/ppocr_keys_v1.txt + +# 拷贝RKNN模型到build目录 + +# CPU推理 +./infer_demo ./ch_PP-OCRv3_det_infer/ch_PP-OCRv3_det_infer.onnx \ + ./ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.onnx \ + ./ch_PP-OCRv3_rec_infer/ch_PP-OCRv3_rec_infer.onnx \ + ./ppocr_keys_v1.txt \ + ./12.jpg \ + 0 +# RKNPU推理 +./infer_demo ./ch_PP-OCRv3_det_infer/ch_PP-OCRv3_det_infer_rk3588_unquantized.rknn \ + ./ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v20_cls_infer_rk3588_unquantized.rknn \ + ./ch_PP-OCRv3_rec_infer/ch_PP-OCRv3_rec_infer_rk3588_unquantized.rknn \ + ./ppocr_keys_v1.txt \ + ./12.jpg \ + 1 +``` + +运行完成可视化结果如下图所示: + + + +结果输出如下: + +```text +det boxes: [[276,174],[285,173],[285,178],[276,179]]rec text: rec score:0.000000 cls label: 1 cls score: 0.766602 +det boxes: [[43,408],[483,390],[483,431],[44,449]]rec text: 上海斯格威铂尔曼大酒店 rec score:0.888450 cls label: 0 cls score: 1.000000 +det boxes: [[186,456],[399,448],[399,480],[186,488]]rec text: 打浦路15号 rec score:0.988769 cls label: 0 cls score: 1.000000 +det boxes: [[18,501],[513,485],[514,537],[18,554]]rec text: 绿洲仕格维花园公寓 rec score:0.992730 cls label: 0 cls score: 1.000000 +det boxes: [[78,553],[404,541],[404,573],[78,585]]rec text: 打浦路252935号 rec score:0.983545 cls label: 0 cls score: 1.000000 +Visualized result saved in ./vis_result.jpg +``` + +## 4. 更多指南 + +- [PP-OCR系列 C++ API查阅](https://www.paddlepaddle.org.cn/fastdeploy-api-doc/cpp/html/namespacefastdeploy_1_1vision_1_1ocr.html) +- [FastDeploy部署PaddleOCR模型概览](../../) +- [PP-OCRv3 Python部署](../python) +- [FastDeploy RKNPU2资源导航](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install/rknpu2.md) +- 如果用户想要调整前后处理超参数、单独使用文字检测识别模型、使用其他模型等,更多详细文档与说明请参考[PP-OCR系列在CPU/GPU上的部署](../../cpu-gpu/cpp/README.md) diff --git a/deploy/fastdeploy/rockchip/cpp/infer.cc b/deploy/fastdeploy/rockchip/cpp/infer.cc new file mode 100644 index 0000000000000000000000000000000000000000..7add35688a0f3d2c6f18b3c848be79f6d02db431 --- /dev/null +++ b/deploy/fastdeploy/rockchip/cpp/infer.cc @@ -0,0 +1,126 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "fastdeploy/vision.h" + +void InitAndInfer(const std::string &det_model_file, + const std::string &cls_model_file, + const std::string &rec_model_file, + const std::string &rec_label_file, + const std::string &image_file, + const fastdeploy::RuntimeOption &option, + const fastdeploy::ModelFormat &format) { + auto det_params_file = ""; + auto cls_params_file = ""; + auto rec_params_file = ""; + + auto det_option = option; + auto cls_option = option; + auto rec_option = option; + + if (format == fastdeploy::ONNX) { + std::cout << "ONNX Model" << std::endl; + } + + auto det_model = fastdeploy::vision::ocr::DBDetector( + det_model_file, det_params_file, det_option, format); + auto cls_model = fastdeploy::vision::ocr::Classifier( + cls_model_file, cls_params_file, cls_option, format); + auto rec_model = fastdeploy::vision::ocr::Recognizer( + rec_model_file, rec_params_file, rec_label_file, rec_option, format); + + if (format == fastdeploy::RKNN) { + cls_model.GetPreprocessor().DisableNormalize(); + cls_model.GetPreprocessor().DisablePermute(); + + det_model.GetPreprocessor().DisableNormalize(); + det_model.GetPreprocessor().DisablePermute(); + + rec_model.GetPreprocessor().DisableNormalize(); + rec_model.GetPreprocessor().DisablePermute(); + } + det_model.GetPreprocessor().SetStaticShapeInfer(true); + rec_model.GetPreprocessor().SetStaticShapeInfer(true); + + assert(det_model.Initialized()); + assert(cls_model.Initialized()); + assert(rec_model.Initialized()); + + // The classification model is optional, so the PP-OCR can also be connected + // in series as follows auto ppocr_v3 = + // fastdeploy::pipeline::PPOCRv3(&det_model, &rec_model); + auto ppocr_v3 = + fastdeploy::pipeline::PPOCRv3(&det_model, &cls_model, &rec_model); + + // When users enable static shape infer for rec model, the batch size of cls + // and rec model must to be set to 1. + ppocr_v3.SetClsBatchSize(1); + ppocr_v3.SetRecBatchSize(1); + + if (!ppocr_v3.Initialized()) { + std::cerr << "Failed to initialize PP-OCR." << std::endl; + return; + } + + auto im = cv::imread(image_file); + + fastdeploy::vision::OCRResult result; + if (!ppocr_v3.Predict(im, &result)) { + std::cerr << "Failed to predict." << std::endl; + return; + } + + std::cout << result.Str() << std::endl; + + auto vis_im = fastdeploy::vision::VisOcr(im, result); + cv::imwrite("vis_result.jpg", vis_im); + std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl; +} + +int main(int argc, char *argv[]) { + if (argc < 7) { + std::cout << "Usage: infer_demo path/to/det_model path/to/cls_model " + "path/to/rec_model path/to/rec_label_file path/to/image " + "run_option, " + "e.g ./infer_demo ./ch_PP-OCRv3_det_infer " + "./ch_ppocr_mobile_v2.0_cls_infer ./ch_PP-OCRv3_rec_infer " + "./ppocr_keys_v1.txt ./12.jpg 0" + << std::endl; + std::cout << "The data type of run_option is int, 0: run with cpu; 1: run " + "with ascend." 
+ << std::endl; + return -1; + } + + fastdeploy::RuntimeOption option; + fastdeploy::ModelFormat format; + int flag = std::atoi(argv[6]); + + if (flag == 0) { + option.UseCpu(); + format = fastdeploy::ONNX; + } else if (flag == 1) { + option.UseRKNPU2(); + format = fastdeploy::RKNN; + } + + std::string det_model_dir = argv[1]; + std::string cls_model_dir = argv[2]; + std::string rec_model_dir = argv[3]; + std::string rec_label_file = argv[4]; + std::string test_image = argv[5]; + InitAndInfer(det_model_dir, cls_model_dir, rec_model_dir, rec_label_file, + test_image, option, format); + return 0; +} diff --git a/deploy/fastdeploy/rockchip/python/README.md b/deploy/fastdeploy/rockchip/python/README.md new file mode 100755 index 0000000000000000000000000000000000000000..00d97dd96914a5a8dd3d227069aa1083a0090243 --- /dev/null +++ b/deploy/fastdeploy/rockchip/python/README.md @@ -0,0 +1,112 @@ +[English](README_CN.md) | 简体中文 +# PP-OCRv3 RKNPU2 Python部署示例 +本目录下提供`infer.py`, 供用户完成PP-OCRv3在RKNPU2的部署. + + +## 1. 部署环境准备 +在部署前,需确认以下两个步骤 +- 1. 在部署前,需自行编译基于RKNPU2的Python预测库,参考文档[RKNPU2部署环境编译](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install#自行编译安装) +- 2. 同时请用户参考[FastDeploy RKNPU2资源导航](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install/rknpu2.md) + +## 2.部署模型准备 +在部署前, 请准备好您所需要运行的推理模型, 您可以在[FastDeploy支持的PaddleOCR模型列表](../README.md)中下载所需模型. +同时, 在RKNPU2上部署PP-OCR系列模型时,我们需要把Paddle的推理模型转为RKNN模型. +由于rknn_toolkit2工具暂不支持直接从Paddle直接转换为RKNN模型,因此我们需要先将Paddle推理模型转为ONNX模型, 最后转为RKNN模型, 示例如下. + +```bash +# 下载PP-OCRv3文字检测模型 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar +tar -xvf ch_PP-OCRv3_det_infer.tar +# 下载文字方向分类器模型 +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar +tar -xvf ch_ppocr_mobile_v2.0_cls_infer.tar +# 下载PP-OCRv3文字识别模型 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar +tar -xvf ch_PP-OCRv3_rec_infer.tar + +# 请用户自行安装最新发布版本的paddle2onnx, 转换模型到ONNX格式的模型 +paddle2onnx --model_dir ch_PP-OCRv3_det_infer \ + --model_filename inference.pdmodel \ + --params_filename inference.pdiparams \ + --save_file ch_PP-OCRv3_det_infer/ch_PP-OCRv3_det_infer.onnx \ + --enable_dev_version True +paddle2onnx --model_dir ch_ppocr_mobile_v2.0_cls_infer \ + --model_filename inference.pdmodel \ + --params_filename inference.pdiparams \ + --save_file ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.onnx \ + --enable_dev_version True +paddle2onnx --model_dir ch_PP-OCRv3_rec_infer \ + --model_filename inference.pdmodel \ + --params_filename inference.pdiparams \ + --save_file ch_PP-OCRv3_rec_infer/ch_PP-OCRv3_rec_infer.onnx \ + --enable_dev_version True + +# 固定模型的输入shape +python -m paddle2onnx.optimize --input_model ch_PP-OCRv3_det_infer/ch_PP-OCRv3_det_infer.onnx \ + --output_model ch_PP-OCRv3_det_infer/ch_PP-OCRv3_det_infer.onnx \ + --input_shape_dict "{'x':[1,3,960,960]}" +python -m paddle2onnx.optimize --input_model ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.onnx \ + --output_model ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.onnx \ + --input_shape_dict "{'x':[1,3,48,192]}" +python -m paddle2onnx.optimize --input_model ch_PP-OCRv3_rec_infer/ch_PP-OCRv3_rec_infer.onnx \ + --output_model ch_PP-OCRv3_rec_infer/ch_PP-OCRv3_rec_infer.onnx \ + --input_shape_dict "{'x':[1,3,48,320]}" + +# 在rockchip/rknpu2_tools/目录下, 我们为用户提供了转换ONNX模型到RKNN模型的工具 +python rockchip/rknpu2_tools/export.py --config_path tools/rknpu2/config/ppocrv3_det.yaml 
\ + --target_platform rk3588 +python rockchip/rknpu2_tools/export.py --config_path tools/rknpu2/config/ppocrv3_rec.yaml \ + --target_platform rk3588 +python rockchip/rknpu2_tools/export.py --config_path tools/rknpu2/config/ppocrv3_cls.yaml \ + --target_platform rk3588 +``` + + +## 3.运行部署示例 +在本目录执行如下命令即可完成编译测试,支持此模型需保证FastDeploy版本1.0.3以上(x.x.x>1.0.3), RKNN版本在1.4.1b22以上。 + +``` +# 下载图片和字典文件 +wget https://gitee.com/paddlepaddle/PaddleOCR/raw/release/2.6/doc/imgs/12.jpg +wget https://gitee.com/paddlepaddle/PaddleOCR/raw/release/2.6/ppocr/utils/ppocr_keys_v1.txt + +# 下载部署示例代码 +# 下载部署示例代码 +git clone https://github.com/PaddlePaddle/FastDeploy.git +cd FastDeploy/examples/vision/ocr/PP-OCR/rockchip/python + +# 如果您希望从PaddleOCR下载示例代码,请运行 +git clone https://github.com/PaddlePaddle/PaddleOCR.git +# 注意:如果当前分支找不到下面的fastdeploy测试代码,请切换到dygraph分支 +git checkout dygraph +cd PaddleOCR/deploy/fastdeploy/rockchip/python + + +# CPU推理 +python3 infer.py \ + --det_model ./ch_PP-OCRv3_det_infer/ch_PP-OCRv3_det_infer.onnx \ + --cls_model ./ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.onnx \ + --rec_model ./ch_PP-OCRv3_rec_infer/ch_PP-OCRv3_rec_infer.onnx \ + --rec_label_file ./ppocr_keys_v1.txt \ + --image 12.jpg \ + --device cpu + +# NPU推理 +python3 infer.py \ + --det_model ./ch_PP-OCRv3_det_infer/ch_PP-OCRv3_det_infer_rk3588_unquantized.rknn \ + --cls_model ./ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v20_cls_infer_rk3588_unquantized.rknn \ + --rec_model ./ch_PP-OCRv3_rec_infer/ch_PP-OCRv3_rec_infer_rk3588_unquantized.rknn \ + --rec_label_file ppocr_keys_v1.txt \ + --image 12.jpg \ + --device npu +``` + +运行完成可视化结果如下图所示 + + +## 4. 更多指南 +- [PP-OCR系列 Python API查阅](https://www.paddlepaddle.org.cn/fastdeploy-api-doc/python/html/ocr.html) +- [FastDeploy部署PaddleOCR模型概览](../../) +- [PP-OCRv3 C++部署](../cpp) +- [FastDeploy RKNPU2资源导航](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install/rknpu2.md) +- 如果用户想要调整前后处理超参数、单独使用文字检测识别模型、使用其他模型等,更多详细文档与说明请参考[PP-OCR系列在CPU/GPU上的部署](../../cpu-gpu/python/README.md) diff --git a/deploy/fastdeploy/rockchip/python/infer.py b/deploy/fastdeploy/rockchip/python/infer.py new file mode 100755 index 0000000000000000000000000000000000000000..7aa1382179b41c4ec3d1e634a75645fccd346256 --- /dev/null +++ b/deploy/fastdeploy/rockchip/python/infer.py @@ -0,0 +1,144 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
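# Note: parse_arguments() below accepts a --cpu_thread_num flag that
# build_option() does not currently consume. A hedged sketch of wiring it up
# (assuming fd.RuntimeOption.set_cpu_thread_num is available, as in other
# FastDeploy examples) could look like:
#
#     option = fd.RuntimeOption()
#     option.set_cpu_thread_num(args.cpu_thread_num)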
+ +import fastdeploy as fd +import cv2 +import os + + +def parse_arguments(): + import argparse + import ast + parser = argparse.ArgumentParser() + parser.add_argument( + "--det_model", required=True, help="Path of Detection model of PPOCR.") + parser.add_argument( + "--cls_model", + required=True, + help="Path of Classification model of PPOCR.") + parser.add_argument( + "--rec_model", + required=True, + help="Path of Recognization model of PPOCR.") + parser.add_argument( + "--rec_label_file", + required=True, + help="Path of Recognization model of PPOCR.") + parser.add_argument( + "--image", type=str, required=True, help="Path of test image file.") + parser.add_argument( + "--device", + type=str, + default='cpu', + help="Type of inference device, support 'cpu', 'kunlunxin' or 'gpu'.") + parser.add_argument( + "--cpu_thread_num", + type=int, + default=9, + help="Number of threads while inference on CPU.") + return parser.parse_args() + + +def build_option(args): + + det_option = fd.RuntimeOption() + cls_option = fd.RuntimeOption() + rec_option = fd.RuntimeOption() + if args.device == "npu": + det_option.use_rknpu2() + cls_option.use_rknpu2() + rec_option.use_rknpu2() + + return det_option, cls_option, rec_option + + +def build_format(args): + det_format = fd.ModelFormat.ONNX + cls_format = fd.ModelFormat.ONNX + rec_format = fd.ModelFormat.ONNX + if args.device == "npu": + det_format = fd.ModelFormat.RKNN + cls_format = fd.ModelFormat.RKNN + rec_format = fd.ModelFormat.RKNN + + return det_format, cls_format, rec_format + + +args = parse_arguments() + +# Detection模型, 检测文字框 +det_model_file = args.det_model +det_params_file = "" +# Classification模型,方向分类,可选 +cls_model_file = args.cls_model +cls_params_file = "" +# Recognition模型,文字识别模型 +rec_model_file = args.rec_model +rec_params_file = "" +rec_label_file = args.rec_label_file + +det_option, cls_option, rec_option = build_option(args) +det_format, cls_format, rec_format = build_format(args) + +det_model = fd.vision.ocr.DBDetector( + det_model_file, + det_params_file, + runtime_option=det_option, + model_format=det_format) + +cls_model = fd.vision.ocr.Classifier( + cls_model_file, + cls_params_file, + runtime_option=cls_option, + model_format=cls_format) + +rec_model = fd.vision.ocr.Recognizer( + rec_model_file, + rec_params_file, + rec_label_file, + runtime_option=rec_option, + model_format=rec_format) + +# Det,Rec模型启用静态shape推理 +det_model.preprocessor.static_shape_infer = True +rec_model.preprocessor.static_shape_infer = True + +if args.device == "npu": + det_model.preprocessor.disable_normalize() + det_model.preprocessor.disable_permute() + cls_model.preprocessor.disable_normalize() + cls_model.preprocessor.disable_permute() + rec_model.preprocessor.disable_normalize() + rec_model.preprocessor.disable_permute() + +# 创建PP-OCR,串联3个模型,其中cls_model可选,如无需求,可设置为None +ppocr_v3 = fd.vision.ocr.PPOCRv3( + det_model=det_model, cls_model=cls_model, rec_model=rec_model) + +# Cls模型和Rec模型的batch size 必须设置为1, 开启静态shape推理 +ppocr_v3.cls_batch_size = 1 +ppocr_v3.rec_batch_size = 1 + +# 预测图片准备 +im = cv2.imread(args.image) + +#预测并打印结果 +result = ppocr_v3.predict(im) + +print(result) + +# 可视化结果 +vis_im = fd.vision.vis_ppocr(im, result) +cv2.imwrite("visualized_result.jpg", vis_im) +print("Visualized result save in ./visualized_result.jpg") diff --git a/deploy/fastdeploy/rockchip/rknpu2_tools/config/ppocrv3_cls.yaml b/deploy/fastdeploy/rockchip/rknpu2_tools/config/ppocrv3_cls.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..197becc2f25dd40e1b7cb1b7bebeb8527401c355 --- /dev/null +++ b/deploy/fastdeploy/rockchip/rknpu2_tools/config/ppocrv3_cls.yaml @@ -0,0 +1,15 @@ +mean: + - + - 127.5 + - 127.5 + - 127.5 +std: + - + - 127.5 + - 127.5 + - 127.5 +model_path: ./ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.onnx +outputs_nodes: +do_quantization: False +dataset: +output_folder: "./ch_ppocr_mobile_v2.0_cls_infer" diff --git a/deploy/fastdeploy/rockchip/rknpu2_tools/config/ppocrv3_det.yaml b/deploy/fastdeploy/rockchip/rknpu2_tools/config/ppocrv3_det.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2897c5f74b2c92713b2d936794e5242a6ff48514 --- /dev/null +++ b/deploy/fastdeploy/rockchip/rknpu2_tools/config/ppocrv3_det.yaml @@ -0,0 +1,15 @@ +mean: + - + - 123.675 + - 116.28 + - 103.53 +std: + - + - 58.395 + - 57.12 + - 57.375 +model_path: ./ch_PP-OCRv3_det_infer/ch_PP-OCRv3_det_infer.onnx +outputs_nodes: +do_quantization: False +dataset: +output_folder: "./ch_PP-OCRv3_det_infer" diff --git a/deploy/fastdeploy/rockchip/rknpu2_tools/config/ppocrv3_rec.yaml b/deploy/fastdeploy/rockchip/rknpu2_tools/config/ppocrv3_rec.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8a22a39a2eee1b24f6fe1d99e71bf3d4b82195e8 --- /dev/null +++ b/deploy/fastdeploy/rockchip/rknpu2_tools/config/ppocrv3_rec.yaml @@ -0,0 +1,15 @@ +mean: + - + - 127.5 + - 127.5 + - 127.5 +std: + - + - 127.5 + - 127.5 + - 127.5 +model_path: ./ch_PP-OCRv3_rec_infer/ch_PP-OCRv3_rec_infer.onnx +outputs_nodes: +do_quantization: False +dataset: +output_folder: "./ch_PP-OCRv3_rec_infer" diff --git a/deploy/fastdeploy/rockchip/rknpu2_tools/export.py b/deploy/fastdeploy/rockchip/rknpu2_tools/export.py new file mode 100644 index 0000000000000000000000000000000000000000..a94b348859cc87999c3944e53884dea5d11638af --- /dev/null +++ b/deploy/fastdeploy/rockchip/rknpu2_tools/export.py @@ -0,0 +1,80 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
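# The ppocrv3_det/cls/rec.yaml configs above are the inputs of this script.
# For reference, the keys read below are: mean / std (per-channel preprocessing
# values), model_path (source ONNX model), outputs_nodes (optional output node
# names), do_quantization and dataset (INT8 switch and calibration data), and
# output_folder (where the exported .rknn file is written).
# Typical invocation, as shown in the rockchip READMEs:
#   python rockchip/rknpu2_tools/export.py \
#       --config_path tools/rknpu2/config/ppocrv3_det.yaml \
#       --target_platform rk3588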
+import os +import yaml +import argparse +from rknn.api import RKNN + + +def get_config(): + parser = argparse.ArgumentParser() + parser.add_argument("--verbose", default=True, help="rknntoolkit verbose") + parser.add_argument("--config_path") + parser.add_argument("--target_platform") + args = parser.parse_args() + return args + + +if __name__ == "__main__": + config = get_config() + with open(config.config_path) as file: + file_data = file.read() + yaml_config = yaml.safe_load(file_data) + print(yaml_config) + model = RKNN(config.verbose) + + # Config + mean_values = yaml_config["mean"] + std_values = yaml_config["std"] + model.config( + mean_values=mean_values, + std_values=std_values, + target_platform=config.target_platform) + + # Load ONNX model + if yaml_config["outputs_nodes"] is None: + ret = model.load_onnx(model=yaml_config["model_path"]) + else: + ret = model.load_onnx( + model=yaml_config["model_path"], + outputs=yaml_config["outputs_nodes"]) + assert ret == 0, "Load model failed!" + + # Build model + ret = model.build( + do_quantization=yaml_config["do_quantization"], + dataset=yaml_config["dataset"]) + assert ret == 0, "Build model failed!" + + # Init Runtime + ret = model.init_runtime() + assert ret == 0, "Init runtime environment failed!" + + # Export + if not os.path.exists(yaml_config["output_folder"]): + os.mkdir(yaml_config["output_folder"]) + + name_list = os.path.basename(yaml_config["model_path"]).split(".") + model_base_name = "" + for name in name_list[0:-1]: + model_base_name += name + model_device_name = config.target_platform.lower() + if yaml_config["do_quantization"]: + model_save_name = model_base_name + "_" + model_device_name + "_quantized" + ".rknn" + else: + model_save_name = model_base_name + "_" + model_device_name + "_unquantized" + ".rknn" + ret = model.export_rknn( + os.path.join(yaml_config["output_folder"], model_save_name)) + assert ret == 0, "Export rknn model failed!" + print("Export OK!") diff --git a/deploy/fastdeploy/serving/README.md b/deploy/fastdeploy/serving/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1d52fec45e52a253139bf61d5d555cd3f474722b --- /dev/null +++ b/deploy/fastdeploy/serving/README.md @@ -0,0 +1,24 @@ +[English](README.md) | 简体中文 +# PaddleOCR 使用 FastDeploy 服务化部署PP-OCR系列模型 +## 1. FastDeploy 服务化部署介绍 +在线推理作为企业或个人线上部署模型的最后一环,是工业界必不可少的环节,其中最重要的就是服务化推理框架。FastDeploy 目前提供两种服务化部署方式:simple_serving和fastdeploy_serving +- simple_serving:适用于只需要通过http等调用AI推理任务,没有高并发需求的场景。simple_serving基于Flask框架具有简单高效的特点,可以快速验证线上部署模型的可行性 +- fastdeploy_serving:适用于高并发、高吞吐量请求的场景。基于Triton Inference Server框架,是一套可用于实际生产的完备且性能卓越的服务化部署框架 + +## 2. 支持的PaddleOCR推理模型 + +下表中的推理模型为FastDeploy测试过的模型, 下载链接由PaddleOCR模型库提供, +更多的模型, 详见[PP-OCR系列模型列表](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6/doc/doc_ch/models_list.md), 欢迎用户尝试. 
+ +| PaddleOCR版本 | 文本框检测 | 方向分类模型 | 文字识别 |字典文件| 说明 | +|:----|:----|:----|:----|:----|:--------| +| ch_PP-OCRv3[推荐] |[ch_PP-OCRv3_det](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) | [ch_ppocr_mobile_v2.0_cls](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) | [ch_PP-OCRv3_rec](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) | [ppocr_keys_v1.txt](https://bj.bcebos.com/paddlehub/fastdeploy/ppocr_keys_v1.txt) | OCRv3系列原始超轻量模型,支持中英文、多语种文本检测 | +| en_PP-OCRv3[推荐] |[en_PP-OCRv3_det](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar) | [ch_ppocr_mobile_v2.0_cls](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) | [en_PP-OCRv3_rec](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) | [en_dict.txt](https://bj.bcebos.com/paddlehub/fastdeploy/en_dict.txt) | OCRv3系列原始超轻量模型,支持英文与数字识别,除检测模型和识别模型的训练数据与中文模型不同以外,无其他区别 | +| ch_PP-OCRv2 |[ch_PP-OCRv2_det](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) | [ch_ppocr_mobile_v2.0_cls](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) | [ch_PP-OCRv2_rec](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) | [ppocr_keys_v1.txt](https://bj.bcebos.com/paddlehub/fastdeploy/ppocr_keys_v1.txt) | OCRv2系列原始超轻量模型,支持中英文、多语种文本检测 | +| ch_PP-OCRv2_mobile |[ch_ppocr_mobile_v2.0_det](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) | [ch_ppocr_mobile_v2.0_cls](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) | [ch_ppocr_mobile_v2.0_rec](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) | [ppocr_keys_v1.txt](https://bj.bcebos.com/paddlehub/fastdeploy/ppocr_keys_v1.txt) | OCRv2系列原始超轻量模型,支持中英文、多语种文本检测,比PPOCRv2更加轻量 | +| ch_PP-OCRv2_server |[ch_ppocr_server_v2.0_det](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) | [ch_ppocr_mobile_v2.0_cls](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) | [ch_ppocr_server_v2.0_rec](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) |[ppocr_keys_v1.txt](https://bj.bcebos.com/paddlehub/fastdeploy/ppocr_keys_v1.txt) | OCRv2服务器系列模型, 支持中英文、多语种文本检测,比超轻量模型更大,但效果更好| + +## 3. 详细的部署示例 + +- [fastdeploy serving](fastdeploy_serving) +- [simple serving](simple_serving) diff --git a/deploy/fastdeploy/serving/fastdeploy_serving/README.md b/deploy/fastdeploy/serving/fastdeploy_serving/README.md new file mode 100755 index 0000000000000000000000000000000000000000..7da6ce6fd9687d98edf8eb52d728bb98e26fff28 --- /dev/null +++ b/deploy/fastdeploy/serving/fastdeploy_serving/README.md @@ -0,0 +1,127 @@ +[English](README.md) | 简体中文 +# PaddleOCR服务化部署示例 + +PaddleOCR 服务化部署示例是利用FastDeploy Serving搭建的服务化部署示例。FastDeploy Serving是基于Triton Inference Server框架封装的适用于高并发、高吞吐量请求的服务化部署框架,是一套可用于实际生产的完备且性能卓越的服务化部署框架。如没有高并发,高吞吐场景的需求,只想快速检验模型线上部署的可行性,请参考[simple_serving](../simple_serving/) + +## 1. 部署环境准备 +在服务化部署前,需确认服务化镜像的软硬件环境要求和镜像拉取命令,请参考[FastDeploy服务化部署](https://github.com/PaddlePaddle/FastDeploy/blob/develop/serving/README_CN.md) + +## 2. PP-OCRv3服务化部署介绍 +本文介绍了使用FastDeploy搭建PP-OCRv3模型服务的方法. +服务端必须在docker内启动,而客户端不是必须在docker容器内. + +**本文所在路径($PWD)下的models里包含模型的配置和代码(服务端会加载模型和代码以启动服务), 需要将其映射到docker中使用.** + +PP-OCRv3由det(检测)、cls(分类)和rec(识别)三个模型组成. 
+ +服务化部署串联的示意图如下图所示,其中`pp_ocr`串联了`det_preprocess`、`det_runtime`和`det_postprocess`,`cls_pp`串联了`cls_runtime`和`cls_postprocess`,`rec_pp`串联了`rec_runtime`和`rec_postprocess`. + +特别的是,在`det_postprocess`中会多次调用`cls_pp`和`rec_pp`服务,来实现对检测结果(多个框)进行分类和识别,最后返回给用户最终的识别结果。 + +
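下面给出一个极简的 Triton BLS 调用示意,说明 `det_postprocess` 这类 Python backend 模型可以如何在服务内部调用 `cls_pp`/`rec_pp`(仅为示意:输入输出张量名均为假设,实际实现请以 `models/det_postprocess/1/model.py` 为准):

```python
import triton_python_backend_utils as pb_utils

def call_rec_pp(image_tensor):
    # image_tensor 为 pb_utils.Tensor,其名称需与 rec_pp 的 config.pbtxt 中的输入名一致(此处假设为 "x")
    # 输出名 "rec_texts"/"rec_scores" 同样为假设
    infer_request = pb_utils.InferenceRequest(
        model_name="rec_pp",
        requested_output_names=["rec_texts", "rec_scores"],
        inputs=[image_tensor])
    infer_response = infer_request.exec()
    if infer_response.has_error():
        raise pb_utils.TritonModelException(infer_response.error().message())
    texts = pb_utils.get_output_tensor_by_name(infer_response, "rec_texts")
    scores = pb_utils.get_output_tensor_by_name(infer_response, "rec_scores")
    return texts, scores
```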

+
+ +
+

+ + +## 3. 服务端的使用 + +### 3.1 下载模型并使用服务化Docker +```bash +# 下载仓库代码 +# 下载部署示例代码 +git clone https://github.com/PaddlePaddle/FastDeploy.git +cd FastDeploy/examples/vision/ocr/PP-OCR/serving/fastdeploy_serving + +# 如果您希望从PaddleOCR下载示例代码,请运行 +git clone https://github.com/PaddlePaddle/PaddleOCR.git +# 注意:如果当前分支找不到下面的fastdeploy测试代码,请切换到dygraph分支 +git checkout dygraph +cd PaddleOCR/deploy/fastdeploy/serving/fastdeploy_serving + +# 下载模型,图片和字典文件 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar +tar xvf ch_PP-OCRv3_det_infer.tar && mv ch_PP-OCRv3_det_infer 1 +mv 1/inference.pdiparams 1/model.pdiparams && mv 1/inference.pdmodel 1/model.pdmodel +mv 1 models/det_runtime/ && rm -rf ch_PP-OCRv3_det_infer.tar + +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar +tar xvf ch_ppocr_mobile_v2.0_cls_infer.tar && mv ch_ppocr_mobile_v2.0_cls_infer 1 +mv 1/inference.pdiparams 1/model.pdiparams && mv 1/inference.pdmodel 1/model.pdmodel +mv 1 models/cls_runtime/ && rm -rf ch_ppocr_mobile_v2.0_cls_infer.tar + +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar +tar xvf ch_PP-OCRv3_rec_infer.tar && mv ch_PP-OCRv3_rec_infer 1 +mv 1/inference.pdiparams 1/model.pdiparams && mv 1/inference.pdmodel 1/model.pdmodel +mv 1 models/rec_runtime/ && rm -rf ch_PP-OCRv3_rec_infer.tar + +mkdir models/pp_ocr/1 && mkdir models/rec_pp/1 && mkdir models/cls_pp/1 + +wget https://gitee.com/paddlepaddle/PaddleOCR/raw/release/2.6/ppocr/utils/ppocr_keys_v1.txt +mv ppocr_keys_v1.txt models/rec_postprocess/1/ + +wget https://gitee.com/paddlepaddle/PaddleOCR/raw/release/2.6/doc/imgs/12.jpg + +# x.y.z为镜像版本号,需参照serving文档替换为数字 +docker pull registry.baidubce.com/paddlepaddle/fastdeploy:x.y.z-gpu-cuda11.4-trt8.4-21.10 +docker run -dit --net=host --name fastdeploy --shm-size="1g" -v $PWD:/ocr_serving registry.baidubce.com/paddlepaddle/fastdeploy:x.y.z-gpu-cuda11.4-trt8.4-21.10 bash +docker exec -it -u root fastdeploy bash +``` + +### 3.2 安装(在docker内) +```bash +ldconfig +apt-get install libgl1 +``` + +#### 3.3 启动服务端(在docker内) +```bash +fastdeployserver --model-repository=/ocr_serving/models +``` + +参数: + - `model-repository`(required): 整套模型streaming_pp_tts存放的路径. + - `http-port`(optional): HTTP服务的端口号. 默认: `8000`. 本示例中未使用该端口. + - `grpc-port`(optional): GRPC服务的端口号. 默认: `8001`. + - `metrics-port`(optional): 服务端指标的端口号. 默认: `8002`. 本示例中未使用该端口. + + +## 4. 客户端的使用 +### 4.1 安装 +```bash +pip3 install tritonclient[all] +``` + +### 4.2 发送请求 +```bash +python3 client.py +``` + +## 5.配置修改 +当前默认配置在GPU上运行, 如果要在CPU或其他推理引擎上运行。 需要修改`models/runtime/config.pbtxt`中配置,详情请参考[配置文档](../../../../../serving/docs/zh_CN/model_configuration.md) + +## 6. 其他指南 + +- 使用PP-OCRv2进行服务化部署, 除了自行准备PP-OCRv2模型之外, 只需手动添加一行代码即可. +在[model.py](./models/det_postprocess/1/model.py#L109)文件**109行添加以下代码**: +``` +self.rec_preprocessor.cls_image_shape[1] = 32 +``` + +- [使用 VisualDL 进行 Serving 可视化部署](https://github.com/PaddlePaddle/FastDeploy/blob/develop/serving/docs/zh_CN/vdl_management.md) +通过VisualDL的可视化界面对PP-OCRv3进行服务化部署只需要如下三步: +```text +1. 载入模型库:./vision/ocr/PP-OCRv3/serving +2. 下载模型资源文件:点击det_runtime模型,点击版本号1添加预训练模型,选择文字识别模型ch_PP-OCRv3_det进行下载。点击cls_runtime模型,点击版本号1添加预训练模型,选择文字识别模型ch_ppocr_mobile_v2.0_cls进行下载。点击rec_runtime模型,点击版本号1添加预训练模型,选择文字识别模型ch_PP-OCRv3_rec进行下载。点击rec_postprocess模型,点击版本号1添加预训练模型,选择文字识别模型ch_PP-OCRv3_rec进行下载。 +3. 启动服务:点击启动服务按钮,输入启动参数。 +``` +

+ +

+ +## 7. 常见问题 +- [如何编写客户端 HTTP/GRPC 请求](https://github.com/PaddlePaddle/FastDeploy/blob/develop/serving/docs/zh_CN/client.md) +- [如何编译服务化部署镜像](https://github.com/PaddlePaddle/FastDeploy/blob/develop/serving/docs/zh_CN/compile.md) +- [服务化部署原理及动态Batch介绍](https://github.com/PaddlePaddle/FastDeploy/blob/develop/serving/docs/zh_CN/demo.md) +- [模型仓库介绍](https://github.com/PaddlePaddle/FastDeploy/blob/develop/serving/docs/zh_CN/model_repository.md) diff --git a/deploy/fastdeploy/serving/fastdeploy_serving/client.py b/deploy/fastdeploy/serving/fastdeploy_serving/client.py new file mode 100755 index 0000000000000000000000000000000000000000..6b758c5e39ac0fada03e4e7a561e4a4d0192c6e0 --- /dev/null +++ b/deploy/fastdeploy/serving/fastdeploy_serving/client.py @@ -0,0 +1,109 @@ +import logging +import numpy as np +import time +from typing import Optional +import cv2 +import json + +from tritonclient import utils as client_utils +from tritonclient.grpc import InferenceServerClient, InferInput, InferRequestedOutput, service_pb2_grpc, service_pb2 + +LOGGER = logging.getLogger("run_inference_on_triton") + + +class SyncGRPCTritonRunner: + DEFAULT_MAX_RESP_WAIT_S = 120 + + def __init__( + self, + server_url: str, + model_name: str, + model_version: str, + *, + verbose=False, + resp_wait_s: Optional[float]=None, ): + self._server_url = server_url + self._model_name = model_name + self._model_version = model_version + self._verbose = verbose + self._response_wait_t = self.DEFAULT_MAX_RESP_WAIT_S if resp_wait_s is None else resp_wait_s + + self._client = InferenceServerClient( + self._server_url, verbose=self._verbose) + error = self._verify_triton_state(self._client) + if error: + raise RuntimeError( + f"Could not communicate to Triton Server: {error}") + + LOGGER.debug( + f"Triton server {self._server_url} and model {self._model_name}:{self._model_version} " + f"are up and ready!") + + model_config = self._client.get_model_config(self._model_name, + self._model_version) + model_metadata = self._client.get_model_metadata(self._model_name, + self._model_version) + LOGGER.info(f"Model config {model_config}") + LOGGER.info(f"Model metadata {model_metadata}") + + self._inputs = {tm.name: tm for tm in model_metadata.inputs} + self._input_names = list(self._inputs) + self._outputs = {tm.name: tm for tm in model_metadata.outputs} + self._output_names = list(self._outputs) + self._outputs_req = [ + InferRequestedOutput(name) for name in self._outputs + ] + + def Run(self, inputs): + """ + Args: + inputs: list, Each value corresponds to an input name of self._input_names + Returns: + results: dict, {name : numpy.array} + """ + infer_inputs = [] + for idx, data in enumerate(inputs): + infer_input = InferInput(self._input_names[idx], data.shape, + "UINT8") + infer_input.set_data_from_numpy(data) + infer_inputs.append(infer_input) + + results = self._client.infer( + model_name=self._model_name, + model_version=self._model_version, + inputs=infer_inputs, + outputs=self._outputs_req, + client_timeout=self._response_wait_t, ) + results = {name: results.as_numpy(name) for name in self._output_names} + return results + + def _verify_triton_state(self, triton_client): + if not triton_client.is_server_live(): + return f"Triton server {self._server_url} is not live" + elif not triton_client.is_server_ready(): + return f"Triton server {self._server_url} is not ready" + elif not triton_client.is_model_ready(self._model_name, + self._model_version): + return f"Model {self._model_name}:{self._model_version} is not ready" + return None + 
+ +if __name__ == "__main__": + model_name = "pp_ocr" + model_version = "1" + url = "localhost:8001" + runner = SyncGRPCTritonRunner(url, model_name, model_version) + im = cv2.imread("12.jpg") + im = np.array([im, ]) + for i in range(1): + result = runner.Run([im, ]) + batch_texts = result['rec_texts'] + batch_scores = result['rec_scores'] + batch_bboxes = result['det_bboxes'] + for i_batch in range(len(batch_texts)): + texts = batch_texts[i_batch] + scores = batch_scores[i_batch] + bboxes = batch_bboxes[i_batch] + for i_box in range(len(texts)): + print('text=', texts[i_box].decode('utf-8'), ' score=', + scores[i_box], ' bbox=', bboxes[i_box]) diff --git a/deploy/fastdeploy/serving/fastdeploy_serving/models/cls_postprocess/1/model.py b/deploy/fastdeploy/serving/fastdeploy_serving/models/cls_postprocess/1/model.py new file mode 100644 index 0000000000000000000000000000000000000000..891db5f24b8f117c6d499e258dd5e16ee7a7f356 --- /dev/null +++ b/deploy/fastdeploy/serving/fastdeploy_serving/models/cls_postprocess/1/model.py @@ -0,0 +1,105 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import numpy as np +import time + +import fastdeploy as fd + +# triton_python_backend_utils is available in every Triton Python model. You +# need to use this module to create inference requests and responses. It also +# contains some utility functions for extracting information from model_config +# and converting Triton input/output types to numpy types. +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to intialize any state associated with this model. + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + # You must parse model_config. 
JSON string is not parsed here + self.model_config = json.loads(args['model_config']) + print("model_config:", self.model_config) + + self.input_names = [] + for input_config in self.model_config["input"]: + self.input_names.append(input_config["name"]) + print("postprocess input names:", self.input_names) + + self.output_names = [] + self.output_dtype = [] + for output_config in self.model_config["output"]: + self.output_names.append(output_config["name"]) + dtype = pb_utils.triton_string_to_numpy(output_config["data_type"]) + self.output_dtype.append(dtype) + print("postprocess output names:", self.output_names) + self.postprocessor = fd.vision.ocr.ClassifierPostprocessor() + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference is requested + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse. + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + responses = [] + for request in requests: + infer_outputs = pb_utils.get_input_tensor_by_name( + request, self.input_names[0]) + infer_outputs = infer_outputs.as_numpy() + results = self.postprocessor.run([infer_outputs]) + out_tensor_0 = pb_utils.Tensor(self.output_names[0], + np.array(results[0])) + out_tensor_1 = pb_utils.Tensor(self.output_names[1], + np.array(results[1])) + inference_response = pb_utils.InferenceResponse( + output_tensors=[out_tensor_0, out_tensor_1]) + responses.append(inference_response) + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is optional. This function allows + the model to perform any necessary clean ups before exit. 
+ """ + print('Cleaning up...') diff --git a/deploy/fastdeploy/serving/fastdeploy_serving/models/cls_postprocess/config.pbtxt b/deploy/fastdeploy/serving/fastdeploy_serving/models/cls_postprocess/config.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..18ab2facc6389217da7b16fc91804b1a52b0ce30 --- /dev/null +++ b/deploy/fastdeploy/serving/fastdeploy_serving/models/cls_postprocess/config.pbtxt @@ -0,0 +1,30 @@ +name: "cls_postprocess" +backend: "python" +max_batch_size: 128 +input [ + { + name: "POST_INPUT_0" + data_type: TYPE_FP32 + dims: [ 2 ] + } +] + +output [ + { + name: "POST_OUTPUT_0" + data_type: TYPE_INT32 + dims: [ 1 ] + }, + { + name: "POST_OUTPUT_1" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +instance_group [ + { + count: 1 + kind: KIND_CPU + } +] diff --git a/deploy/fastdeploy/serving/fastdeploy_serving/models/cls_pp/config.pbtxt b/deploy/fastdeploy/serving/fastdeploy_serving/models/cls_pp/config.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..068b1e7d87954eb66b59b99a74b7693a98060e33 --- /dev/null +++ b/deploy/fastdeploy/serving/fastdeploy_serving/models/cls_pp/config.pbtxt @@ -0,0 +1,54 @@ +name: "cls_pp" +platform: "ensemble" +max_batch_size: 128 +input [ + { + name: "x" + data_type: TYPE_FP32 + dims: [ 3, -1, -1 ] + } +] +output [ + { + name: "cls_labels" + data_type: TYPE_INT32 + dims: [ 1 ] + }, + { + name: "cls_scores" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] +ensemble_scheduling { + step [ + { + model_name: "cls_runtime" + model_version: 1 + input_map { + key: "x" + value: "x" + } + output_map { + key: "softmax_0.tmp_0" + value: "infer_output" + } + }, + { + model_name: "cls_postprocess" + model_version: 1 + input_map { + key: "POST_INPUT_0" + value: "infer_output" + } + output_map { + key: "POST_OUTPUT_0" + value: "cls_labels" + } + output_map { + key: "POST_OUTPUT_1" + value: "cls_scores" + } + } + ] +} diff --git a/deploy/fastdeploy/serving/fastdeploy_serving/models/cls_runtime/config.pbtxt b/deploy/fastdeploy/serving/fastdeploy_serving/models/cls_runtime/config.pbtxt new file mode 100755 index 0000000000000000000000000000000000000000..eb7b2550366a9c69cc90e002d5390eee99e31abb --- /dev/null +++ b/deploy/fastdeploy/serving/fastdeploy_serving/models/cls_runtime/config.pbtxt @@ -0,0 +1,52 @@ +# optional, If name is specified it must match the name of the model repository directory containing the model. 
+name: "cls_runtime" +backend: "fastdeploy" +max_batch_size: 128 + +# Input configuration of the model +input [ + { + # input name + name: "x" + # input type such as TYPE_FP32、TYPE_UINT8、TYPE_INT8、TYPE_INT16、TYPE_INT32、TYPE_INT64、TYPE_FP16、TYPE_STRING + data_type: TYPE_FP32 + # input shape, The batch dimension is omitted and the actual shape is [batch, c, h, w] + dims: [ 3, -1, -1 ] + } +] + +# The output of the model is configured in the same format as the input +output [ + { + name: "softmax_0.tmp_0" + data_type: TYPE_FP32 + dims: [ 2 ] + } +] + +# Number of instances of the model +instance_group [ + { + # The number of instances is 1 + count: 1 + # Use GPU, CPU inference option is:KIND_CPU + kind: KIND_GPU + # The instance is deployed on the 0th GPU card + gpus: [0] + } +] + +optimization { + execution_accelerators { + # GPU推理配置, 配合KIND_GPU使用 + gpu_execution_accelerator : [ + { + name : "paddle" + # 设置推理并行计算线程数为4 + parameters { key: "cpu_threads" value: "4" } + # 开启mkldnn加速,设置为0关闭mkldnn + parameters { key: "use_mkldnn" value: "1" } + } + ] + } +} diff --git a/deploy/fastdeploy/serving/fastdeploy_serving/models/det_postprocess/1/model.py b/deploy/fastdeploy/serving/fastdeploy_serving/models/det_postprocess/1/model.py new file mode 100644 index 0000000000000000000000000000000000000000..87115c2d949762adfe3796487e93bc6e94483a60 --- /dev/null +++ b/deploy/fastdeploy/serving/fastdeploy_serving/models/det_postprocess/1/model.py @@ -0,0 +1,238 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import numpy as np +import time +import math +import cv2 +import fastdeploy as fd + +# triton_python_backend_utils is available in every Triton Python model. You +# need to use this module to create inference requests and responses. It also +# contains some utility functions for extracting information from model_config +# and converting Triton input/output types to numpy types. 
+import triton_python_backend_utils as pb_utils + + +def get_rotate_crop_image(img, box): + ''' + img_height, img_width = img.shape[0:2] + left = int(np.min(points[:, 0])) + right = int(np.max(points[:, 0])) + top = int(np.min(points[:, 1])) + bottom = int(np.max(points[:, 1])) + img_crop = img[top:bottom, left:right, :].copy() + points[:, 0] = points[:, 0] - left + points[:, 1] = points[:, 1] - top + ''' + points = [] + for i in range(4): + points.append([box[2 * i], box[2 * i + 1]]) + points = np.array(points, dtype=np.float32) + img = img.astype(np.float32) + assert len(points) == 4, "shape of points must be 4*2" + img_crop_width = int( + max( + np.linalg.norm(points[0] - points[1]), + np.linalg.norm(points[2] - points[3]))) + img_crop_height = int( + max( + np.linalg.norm(points[0] - points[3]), + np.linalg.norm(points[1] - points[2]))) + pts_std = np.float32([[0, 0], [img_crop_width, 0], + [img_crop_width, img_crop_height], + [0, img_crop_height]]) + M = cv2.getPerspectiveTransform(points, pts_std) + dst_img = cv2.warpPerspective( + img, + M, (img_crop_width, img_crop_height), + borderMode=cv2.BORDER_REPLICATE, + flags=cv2.INTER_CUBIC) + dst_img_height, dst_img_width = dst_img.shape[0:2] + if dst_img_height * 1.0 / dst_img_width >= 1.5: + dst_img = np.rot90(dst_img) + return dst_img + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to intialize any state associated with this model. + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + # You must parse model_config. JSON string is not parsed here + self.model_config = json.loads(args['model_config']) + print("model_config:", self.model_config) + + self.input_names = [] + for input_config in self.model_config["input"]: + self.input_names.append(input_config["name"]) + print("postprocess input names:", self.input_names) + + self.output_names = [] + self.output_dtype = [] + for output_config in self.model_config["output"]: + self.output_names.append(output_config["name"]) + dtype = pb_utils.triton_string_to_numpy(output_config["data_type"]) + self.output_dtype.append(dtype) + print("postprocess output names:", self.output_names) + self.postprocessor = fd.vision.ocr.DBDetectorPostprocessor() + self.cls_preprocessor = fd.vision.ocr.ClassifierPreprocessor() + self.rec_preprocessor = fd.vision.ocr.RecognizerPreprocessor() + self.cls_threshold = 0.9 + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference is requested + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. 
If there is an error, you can
+        set the error argument when creating a pb_utils.InferenceResponse.
+        Parameters
+        ----------
+        requests : list
+            A list of pb_utils.InferenceRequest
+        Returns
+        -------
+        list
+            A list of pb_utils.InferenceResponse. The length of this list must
+            be the same as `requests`
+        """
+        responses = []
+        for request in requests:
+            infer_outputs = pb_utils.get_input_tensor_by_name(
+                request, self.input_names[0])
+            im_infos = pb_utils.get_input_tensor_by_name(request,
+                                                         self.input_names[1])
+            ori_imgs = pb_utils.get_input_tensor_by_name(request,
+                                                         self.input_names[2])
+
+            infer_outputs = infer_outputs.as_numpy()
+            im_infos = im_infos.as_numpy()
+            ori_imgs = ori_imgs.as_numpy()
+
+            results = self.postprocessor.run([infer_outputs], im_infos)
+            batch_rec_texts = []
+            batch_rec_scores = []
+            batch_box_list = []
+            for i_batch in range(len(results)):
+
+                cls_labels = []
+                cls_scores = []
+                rec_texts = []
+                rec_scores = []
+
+                box_list = fd.vision.ocr.sort_boxes(results[i_batch])
+                image_list = []
+                if len(box_list) == 0:
+                    image_list.append(ori_imgs[i_batch])
+                else:
+                    for box in box_list:
+                        crop_img = get_rotate_crop_image(ori_imgs[i_batch], box)
+                        image_list.append(crop_img)
+
+                batch_box_list.append(box_list)
+
+                cls_pre_tensors = self.cls_preprocessor.run(image_list)
+                cls_dlpack_tensor = cls_pre_tensors[0].to_dlpack()
+                cls_input_tensor = pb_utils.Tensor.from_dlpack(
+                    "x", cls_dlpack_tensor)
+
+                inference_request = pb_utils.InferenceRequest(
+                    model_name='cls_pp',
+                    requested_output_names=['cls_labels', 'cls_scores'],
+                    inputs=[cls_input_tensor])
+                inference_response = inference_request.exec()
+                if inference_response.has_error():
+                    raise pb_utils.TritonModelException(
+                        inference_response.error().message())
+                else:
+                    # Extract the output tensors from the inference response.
+                    cls_labels = pb_utils.get_output_tensor_by_name(
+                        inference_response, 'cls_labels')
+                    cls_labels = cls_labels.as_numpy()
+
+                    cls_scores = pb_utils.get_output_tensor_by_name(
+                        inference_response, 'cls_scores')
+                    cls_scores = cls_scores.as_numpy()
+
+                    for index in range(len(image_list)):
+                        if cls_labels[index] == 1 and cls_scores[
+                                index] > self.cls_threshold:
+                            image_list[index] = cv2.rotate(
+                                image_list[index].astype(np.float32), 1)
+                            # cast back to uint8 after the float32 rotation
+                            image_list[index] = image_list[index].astype(np.uint8)
+
+                rec_pre_tensors = self.rec_preprocessor.run(image_list)
+                rec_dlpack_tensor = rec_pre_tensors[0].to_dlpack()
+                rec_input_tensor = pb_utils.Tensor.from_dlpack(
+                    "x", rec_dlpack_tensor)
+
+                inference_request = pb_utils.InferenceRequest(
+                    model_name='rec_pp',
+                    requested_output_names=['rec_texts', 'rec_scores'],
+                    inputs=[rec_input_tensor])
+                inference_response = inference_request.exec()
+                if inference_response.has_error():
+                    raise pb_utils.TritonModelException(
+                        inference_response.error().message())
+                else:
+                    # Extract the output tensors from the inference response.
+ rec_texts = pb_utils.get_output_tensor_by_name( + inference_response, 'rec_texts') + rec_texts = rec_texts.as_numpy() + + rec_scores = pb_utils.get_output_tensor_by_name( + inference_response, 'rec_scores') + rec_scores = rec_scores.as_numpy() + + batch_rec_texts.append(rec_texts) + batch_rec_scores.append(rec_scores) + + out_tensor_0 = pb_utils.Tensor( + self.output_names[0], + np.array( + batch_rec_texts, dtype=np.object_)) + out_tensor_1 = pb_utils.Tensor(self.output_names[1], + np.array(batch_rec_scores)) + out_tensor_2 = pb_utils.Tensor(self.output_names[2], + np.array(batch_box_list)) + inference_response = pb_utils.InferenceResponse( + output_tensors=[out_tensor_0, out_tensor_1, out_tensor_2]) + responses.append(inference_response) + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is optional. This function allows + the model to perform any necessary clean ups before exit. + """ + print('Cleaning up...') diff --git a/deploy/fastdeploy/serving/fastdeploy_serving/models/det_postprocess/config.pbtxt b/deploy/fastdeploy/serving/fastdeploy_serving/models/det_postprocess/config.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..378b7bab64f76a71163177f071f776b104c00df3 --- /dev/null +++ b/deploy/fastdeploy/serving/fastdeploy_serving/models/det_postprocess/config.pbtxt @@ -0,0 +1,45 @@ +name: "det_postprocess" +backend: "python" +max_batch_size: 128 +input [ + { + name: "POST_INPUT_0" + data_type: TYPE_FP32 + dims: [ 1, -1, -1] + }, + { + name: "POST_INPUT_1" + data_type: TYPE_INT32 + dims: [ 4 ] + }, + { + name: "ORI_IMG" + data_type: TYPE_UINT8 + dims: [ -1, -1, 3 ] + } +] + +output [ + { + name: "POST_OUTPUT_0" + data_type: TYPE_STRING + dims: [ -1, 1 ] + }, + { + name: "POST_OUTPUT_1" + data_type: TYPE_FP32 + dims: [ -1, 1 ] + }, + { + name: "POST_OUTPUT_2" + data_type: TYPE_FP32 + dims: [ -1, -1, 1 ] + } +] + +instance_group [ + { + count: 1 + kind: KIND_CPU + } +] diff --git a/deploy/fastdeploy/serving/fastdeploy_serving/models/det_preprocess/1/model.py b/deploy/fastdeploy/serving/fastdeploy_serving/models/det_preprocess/1/model.py new file mode 100644 index 0000000000000000000000000000000000000000..28e838da5b6394b7ae14d0ad5f99bded996b14d8 --- /dev/null +++ b/deploy/fastdeploy/serving/fastdeploy_serving/models/det_preprocess/1/model.py @@ -0,0 +1,107 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import numpy as np +import time + +import fastdeploy as fd + +# triton_python_backend_utils is available in every Triton Python model. You +# need to use this module to create inference requests and responses. It also +# contains some utility functions for extracting information from model_config +# and converting Triton input/output types to numpy types. +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """Your Python model must use the same class name. 
Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to intialize any state associated with this model. + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + # You must parse model_config. JSON string is not parsed here + self.model_config = json.loads(args['model_config']) + print("model_config:", self.model_config) + + self.input_names = [] + for input_config in self.model_config["input"]: + self.input_names.append(input_config["name"]) + print("preprocess input names:", self.input_names) + + self.output_names = [] + self.output_dtype = [] + for output_config in self.model_config["output"]: + self.output_names.append(output_config["name"]) + dtype = pb_utils.triton_string_to_numpy(output_config["data_type"]) + self.output_dtype.append(dtype) + print("preprocess output names:", self.output_names) + self.preprocessor = fd.vision.ocr.DBDetectorPreprocessor() + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference is requested + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse. + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + responses = [] + for request in requests: + data = pb_utils.get_input_tensor_by_name(request, + self.input_names[0]) + data = data.as_numpy() + outputs, im_infos = self.preprocessor.run(data) + dlpack_tensor = outputs[0].to_dlpack() + output_tensor_0 = pb_utils.Tensor.from_dlpack(self.output_names[0], + dlpack_tensor) + output_tensor_1 = pb_utils.Tensor( + self.output_names[1], np.array( + im_infos, dtype=np.int32)) + inference_response = pb_utils.InferenceResponse( + output_tensors=[output_tensor_0, output_tensor_1]) + responses.append(inference_response) + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is optional. This function allows + the model to perform any necessary clean ups before exit. 
+ """ + print('Cleaning up...') diff --git a/deploy/fastdeploy/serving/fastdeploy_serving/models/det_preprocess/config.pbtxt b/deploy/fastdeploy/serving/fastdeploy_serving/models/det_preprocess/config.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..93aa1d062ebb440429e588f0cefe9bb6235a2932 --- /dev/null +++ b/deploy/fastdeploy/serving/fastdeploy_serving/models/det_preprocess/config.pbtxt @@ -0,0 +1,37 @@ +# optional, If name is specified it must match the name of the model repository directory containing the model. +name: "det_preprocess" +backend: "python" +max_batch_size: 1 + +# Input configuration of the model +input [ + { + # input name + name: "INPUT_0" + # input type such as TYPE_FP32、TYPE_UINT8、TYPE_INT8、TYPE_INT16、TYPE_INT32、TYPE_INT64、TYPE_FP16、TYPE_STRING + data_type: TYPE_UINT8 + # input shape, The batch dimension is omitted and the actual shape is [batch, c, h, w] + dims: [ -1, -1, 3 ] + } +] + +# The output of the model is configured in the same format as the input +output [ + { + name: "OUTPUT_0" + data_type: TYPE_FP32 + dims: [ 3, -1, -1 ] + }, + { + name: "OUTPUT_1" + data_type: TYPE_INT32 + dims: [ 4 ] + } +] + +instance_group [ + { + count: 1 + kind: KIND_CPU + } +] diff --git a/deploy/fastdeploy/serving/fastdeploy_serving/models/det_runtime/config.pbtxt b/deploy/fastdeploy/serving/fastdeploy_serving/models/det_runtime/config.pbtxt new file mode 100755 index 0000000000000000000000000000000000000000..96d85e3e1941293b049242b1c2b1cf207bb108bc --- /dev/null +++ b/deploy/fastdeploy/serving/fastdeploy_serving/models/det_runtime/config.pbtxt @@ -0,0 +1,52 @@ +# optional, If name is specified it must match the name of the model repository directory containing the model. +name: "det_runtime" +backend: "fastdeploy" +max_batch_size: 1 + +# Input configuration of the model +input [ + { + # input name + name: "x" + # input type such as TYPE_FP32、TYPE_UINT8、TYPE_INT8、TYPE_INT16、TYPE_INT32、TYPE_INT64、TYPE_FP16、TYPE_STRING + data_type: TYPE_FP32 + # input shape, The batch dimension is omitted and the actual shape is [batch, c, h, w] + dims: [ 3, -1, -1 ] + } +] + +# The output of the model is configured in the same format as the input +output [ + { + name: "sigmoid_0.tmp_0" + data_type: TYPE_FP32 + dims: [ 1, -1, -1 ] + } +] + +# Number of instances of the model +instance_group [ + { + # The number of instances is 1 + count: 1 + # Use GPU, CPU inference option is:KIND_CPU + kind: KIND_GPU + # The instance is deployed on the 0th GPU card + gpus: [0] + } +] + +optimization { + execution_accelerators { + # GPU推理配置, 配合KIND_GPU使用 + gpu_execution_accelerator : [ + { + name : "paddle" + # 设置推理并行计算线程数为4 + parameters { key: "cpu_threads" value: "4" } + # 开启mkldnn加速,设置为0关闭mkldnn + parameters { key: "use_mkldnn" value: "1" } + } + ] + } +} \ No newline at end of file diff --git a/deploy/fastdeploy/serving/fastdeploy_serving/models/pp_ocr/config.pbtxt b/deploy/fastdeploy/serving/fastdeploy_serving/models/pp_ocr/config.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..5ef951107e4f36696a46ce7396ddedc5c9316cee --- /dev/null +++ b/deploy/fastdeploy/serving/fastdeploy_serving/models/pp_ocr/config.pbtxt @@ -0,0 +1,87 @@ +name: "pp_ocr" +platform: "ensemble" +max_batch_size: 1 +input [ + { + name: "INPUT" + data_type: TYPE_UINT8 + dims: [ -1, -1, 3 ] + } +] +output [ + { + name: "rec_texts" + data_type: TYPE_STRING + dims: [ -1, 1 ] + }, + { + name: "rec_scores" + data_type: TYPE_FP32 + dims: [ -1, 1 ] + }, + { + name: "det_bboxes" + data_type: TYPE_FP32 + 
dims: [ -1, -1, 1 ] + } +] +ensemble_scheduling { + step [ + { + model_name: "det_preprocess" + model_version: 1 + input_map { + key: "INPUT_0" + value: "INPUT" + } + output_map { + key: "OUTPUT_0" + value: "infer_input" + } + output_map { + key: "OUTPUT_1" + value: "infos" + } + }, + { + model_name: "det_runtime" + model_version: 1 + input_map { + key: "x" + value: "infer_input" + } + output_map { + key: "sigmoid_0.tmp_0" + value: "infer_output" + } + }, + { + model_name: "det_postprocess" + model_version: 1 + input_map { + key: "POST_INPUT_0" + value: "infer_output" + } + input_map { + key: "POST_INPUT_1" + value: "infos" + } + input_map { + key: "ORI_IMG" + value: "INPUT" + } + output_map { + key: "POST_OUTPUT_0" + value: "rec_texts" + } + output_map { + key: "POST_OUTPUT_1" + value: "rec_scores" + } + output_map { + key: "POST_OUTPUT_2" + value: "det_bboxes" + } + } + ] +} diff --git a/deploy/fastdeploy/serving/fastdeploy_serving/models/rec_postprocess/1/model.py b/deploy/fastdeploy/serving/fastdeploy_serving/models/rec_postprocess/1/model.py new file mode 100755 index 0000000000000000000000000000000000000000..c046cd929b75175bcbeceea80f14a8fb04c733ca --- /dev/null +++ b/deploy/fastdeploy/serving/fastdeploy_serving/models/rec_postprocess/1/model.py @@ -0,0 +1,112 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import numpy as np +import time +import os +import sys +import codecs +import fastdeploy as fd + +# triton_python_backend_utils is available in every Triton Python model. You +# need to use this module to create inference requests and responses. It also +# contains some utility functions for extracting information from model_config +# and converting Triton input/output types to numpy types. +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to intialize any state associated with this model. + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + # You must parse model_config. 
JSON string is not parsed here + self.model_config = json.loads(args['model_config']) + print("model_config:", self.model_config) + + self.input_names = [] + for input_config in self.model_config["input"]: + self.input_names.append(input_config["name"]) + print("postprocess input names:", self.input_names) + + self.output_names = [] + self.output_dtype = [] + for output_config in self.model_config["output"]: + self.output_names.append(output_config["name"]) + dtype = pb_utils.triton_string_to_numpy(output_config["data_type"]) + self.output_dtype.append(dtype) + print("postprocess output names:", self.output_names) + + dir_name = os.path.dirname(os.path.realpath(__file__)) + "/" + file_name = dir_name + "ppocr_keys_v1.txt" + #self.label_list = load_dict() + self.postprocessor = fd.vision.ocr.RecognizerPostprocessor(file_name) + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference is requested + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse. + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + responses = [] + for request in requests: + infer_outputs = pb_utils.get_input_tensor_by_name( + request, self.input_names[0]) + infer_outputs = infer_outputs.as_numpy() + results = self.postprocessor.run([infer_outputs]) + out_tensor_0 = pb_utils.Tensor( + self.output_names[0], np.array( + results[0], dtype=np.object_)) + out_tensor_1 = pb_utils.Tensor(self.output_names[1], + np.array(results[1])) + inference_response = pb_utils.InferenceResponse( + output_tensors=[out_tensor_0, out_tensor_1]) + responses.append(inference_response) + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is optional. This function allows + the model to perform any necessary clean ups before exit. 
+ """ + print('Cleaning up...') diff --git a/deploy/fastdeploy/serving/fastdeploy_serving/models/rec_postprocess/config.pbtxt b/deploy/fastdeploy/serving/fastdeploy_serving/models/rec_postprocess/config.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..c125140c8b15f8d090ed7ef72ee855454059aa42 --- /dev/null +++ b/deploy/fastdeploy/serving/fastdeploy_serving/models/rec_postprocess/config.pbtxt @@ -0,0 +1,30 @@ +name: "rec_postprocess" +backend: "python" +max_batch_size: 128 +input [ + { + name: "POST_INPUT_0" + data_type: TYPE_FP32 + dims: [ -1, 6625 ] + } +] + +output [ + { + name: "POST_OUTPUT_0" + data_type: TYPE_STRING + dims: [ 1 ] + }, + { + name: "POST_OUTPUT_1" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +instance_group [ + { + count: 1 + kind: KIND_CPU + } +] diff --git a/deploy/fastdeploy/serving/fastdeploy_serving/models/rec_pp/config.pbtxt b/deploy/fastdeploy/serving/fastdeploy_serving/models/rec_pp/config.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..bb79f90012ba70fc1eac7779218395c3135be8f4 --- /dev/null +++ b/deploy/fastdeploy/serving/fastdeploy_serving/models/rec_pp/config.pbtxt @@ -0,0 +1,54 @@ +name: "rec_pp" +platform: "ensemble" +max_batch_size: 128 +input [ + { + name: "x" + data_type: TYPE_FP32 + dims: [ 3, 48, -1 ] + } +] +output [ + { + name: "rec_texts" + data_type: TYPE_STRING + dims: [ 1 ] + }, + { + name: "rec_scores" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] +ensemble_scheduling { + step [ + { + model_name: "rec_runtime" + model_version: 1 + input_map { + key: "x" + value: "x" + } + output_map { + key: "softmax_5.tmp_0" + value: "infer_output" + } + }, + { + model_name: "rec_postprocess" + model_version: 1 + input_map { + key: "POST_INPUT_0" + value: "infer_output" + } + output_map { + key: "POST_OUTPUT_0" + value: "rec_texts" + } + output_map { + key: "POST_OUTPUT_1" + value: "rec_scores" + } + } + ] +} diff --git a/deploy/fastdeploy/serving/fastdeploy_serving/models/rec_runtime/config.pbtxt b/deploy/fastdeploy/serving/fastdeploy_serving/models/rec_runtime/config.pbtxt new file mode 100755 index 0000000000000000000000000000000000000000..037d7a9f285550c8946bcf3f3cb9191c667a792c --- /dev/null +++ b/deploy/fastdeploy/serving/fastdeploy_serving/models/rec_runtime/config.pbtxt @@ -0,0 +1,52 @@ +# optional, If name is specified it must match the name of the model repository directory containing the model. 
+name: "rec_runtime" +backend: "fastdeploy" +max_batch_size: 128 + +# Input configuration of the model +input [ + { + # input name + name: "x" + # input type such as TYPE_FP32、TYPE_UINT8、TYPE_INT8、TYPE_INT16、TYPE_INT32、TYPE_INT64、TYPE_FP16、TYPE_STRING + data_type: TYPE_FP32 + # input shape, The batch dimension is omitted and the actual shape is [batch, c, h, w] + dims: [ 3, 48, -1 ] + } +] + +# The output of the model is configured in the same format as the input +output [ + { + name: "softmax_5.tmp_0" + data_type: TYPE_FP32 + dims: [ -1, 6625 ] + } +] + +# Number of instances of the model +instance_group [ + { + # The number of instances is 1 + count: 1 + # Use GPU, CPU inference option is:KIND_CPU + kind: KIND_GPU + # The instance is deployed on the 0th GPU card + gpus: [0] + } +] + +optimization { + execution_accelerators { + # GPU推理配置, 配合KIND_GPU使用 + gpu_execution_accelerator : [ + { + name : "paddle" + # 设置推理并行计算线程数为4 + parameters { key: "cpu_threads" value: "4" } + # 开启mkldnn加速,设置为0关闭mkldnn + parameters { key: "use_mkldnn" value: "1" } + } + ] + } +} \ No newline at end of file diff --git a/deploy/fastdeploy/serving/fastdeploy_serving/ppocr.png b/deploy/fastdeploy/serving/fastdeploy_serving/ppocr.png new file mode 100644 index 0000000000000000000000000000000000000000..db12eddc49c9afe0d2d6ea661633abd8eff50c1b Binary files /dev/null and b/deploy/fastdeploy/serving/fastdeploy_serving/ppocr.png differ diff --git a/deploy/fastdeploy/serving/simple_serving/README.md b/deploy/fastdeploy/serving/simple_serving/README.md new file mode 100644 index 0000000000000000000000000000000000000000..913475c79ea9f304719afec335cf0ade3531d691 --- /dev/null +++ b/deploy/fastdeploy/serving/simple_serving/README.md @@ -0,0 +1,54 @@ +简体中文 | [English](README.md) + + +# PaddleOCR Python轻量服务化部署示例 + +PaddleOCR Python轻量服务化部署是FastDeploy基于Flask框架搭建的可快速验证线上模型部署可行性的服务化部署示例,基于http请求完成AI推理任务,适用于无并发推理的简单场景,如有高并发,高吞吐场景的需求请参考[fastdeploy_serving](../fastdeploy_serving/) + + +## 1. 部署环境准备 + +在部署前,需确认软硬件环境,同时下载预编译python wheel 包,参考文档[FastDeploy预编译库安装](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install#FastDeploy预编译库安装) + + +## 2. 启动服务 +```bash +# 下载部署示例代码 +git clone https://github.com/PaddlePaddle/FastDeploy.git +cd FastDeploy/examples/vision/ocr/PP-OCR/serving/simple_serving + +# 如果您希望从PaddleOCR下载示例代码,请运行 +git clone https://github.com/PaddlePaddle/PaddleOCR.git +# 注意:如果当前分支找不到下面的fastdeploy测试代码,请切换到dygraph分支 +git checkout dygraph +cd PaddleOCR/deploy/fastdeploy/serving/simple_serving + +# 下载模型和字典文件 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar +tar xvf ch_PP-OCRv3_det_infer.tar + +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar +tar -xvf ch_ppocr_mobile_v2.0_cls_infer.tar + +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar +tar xvf ch_PP-OCRv3_rec_infer.tar + +wget https://gitee.com/paddlepaddle/PaddleOCR/raw/release/2.6/ppocr/utils/ppocr_keys_v1.txt + +# 启动服务,可修改server.py中的配置项来指定硬件、后端等 +# 可通过--host、--port指定IP和端口号 +fastdeploy simple_serving --app server:app +``` + +## 3. 
客户端请求 +```bash +# 下载部署示例代码 +git clone https://github.com/PaddlePaddle/PaddleOCR.git +cd PaddleOCR/deploy/fastdeploy/serving/simple_serving + +# 下载测试图片 +wget https://gitee.com/paddlepaddle/PaddleOCR/raw/release/2.6/doc/imgs/12.jpg + +# 请求服务,获取推理结果(如有必要,请修改脚本中的IP和端口号) +python client.py +``` diff --git a/deploy/fastdeploy/serving/simple_serving/client.py b/deploy/fastdeploy/serving/simple_serving/client.py new file mode 100644 index 0000000000000000000000000000000000000000..6849c22046e67492c6a7d9db15fcf3f6b5a40d5d --- /dev/null +++ b/deploy/fastdeploy/serving/simple_serving/client.py @@ -0,0 +1,24 @@ +import requests +import json +import cv2 +import fastdeploy as fd +from fastdeploy.serving.utils import cv2_to_base64 + +if __name__ == '__main__': + url = "http://127.0.0.1:8000/fd/ppocrv3" + headers = {"Content-Type": "application/json"} + + im = cv2.imread("12.jpg") + data = {"data": {"image": cv2_to_base64(im)}, "parameters": {}} + + resp = requests.post(url=url, headers=headers, data=json.dumps(data)) + if resp.status_code == 200: + r_json = json.loads(resp.json()["result"]) + print(r_json) + ocr_result = fd.vision.utils.json_to_ocr(r_json) + vis_im = fd.vision.vis_ppocr(im, ocr_result) + cv2.imwrite("visualized_result.jpg", vis_im) + print("Visualized result save in ./visualized_result.jpg") + else: + print("Error code:", resp.status_code) + print(resp.text) diff --git a/deploy/fastdeploy/serving/simple_serving/server.py b/deploy/fastdeploy/serving/simple_serving/server.py new file mode 100644 index 0000000000000000000000000000000000000000..0078b7112f91004926ced6623253589cdc68cab2 --- /dev/null +++ b/deploy/fastdeploy/serving/simple_serving/server.py @@ -0,0 +1,80 @@ +import fastdeploy as fd +from fastdeploy.serving.server import SimpleServer +import os +import logging + +logging.getLogger().setLevel(logging.INFO) + +# Configurations +det_model_dir = 'ch_PP-OCRv3_det_infer' +cls_model_dir = 'ch_ppocr_mobile_v2.0_cls_infer' +rec_model_dir = 'ch_PP-OCRv3_rec_infer' +rec_label_file = 'ppocr_keys_v1.txt' +device = 'cpu' +# backend: ['paddle', 'trt'], you can also use other backends, but need to modify +# the runtime option below +backend = 'paddle' + +# Prepare models +# Detection model +det_model_file = os.path.join(det_model_dir, "inference.pdmodel") +det_params_file = os.path.join(det_model_dir, "inference.pdiparams") +# Classification model +cls_model_file = os.path.join(cls_model_dir, "inference.pdmodel") +cls_params_file = os.path.join(cls_model_dir, "inference.pdiparams") +# Recognition model +rec_model_file = os.path.join(rec_model_dir, "inference.pdmodel") +rec_params_file = os.path.join(rec_model_dir, "inference.pdiparams") + +# Setup runtime option to select hardware, backend, etc. 
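+# Note: the set_trt_input_shape(...) calls below only take effect when the
+# TensorRT backend is selected (backend = 'trt' above); with the default
+# backend = 'paddle' they are stored on the option but not used.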
+option = fd.RuntimeOption() +if device.lower() == 'gpu': + option.use_gpu() +if backend == 'trt': + option.use_trt_backend() +else: + option.use_paddle_infer_backend() + +det_option = option +det_option.set_trt_input_shape("x", [1, 3, 64, 64], [1, 3, 640, 640], + [1, 3, 960, 960]) + +# det_option.set_trt_cache_file("det_trt_cache.trt") +print(det_model_file, det_params_file) +det_model = fd.vision.ocr.DBDetector( + det_model_file, det_params_file, runtime_option=det_option) + +cls_batch_size = 1 +rec_batch_size = 6 + +cls_option = option +cls_option.set_trt_input_shape("x", [1, 3, 48, 10], + [cls_batch_size, 3, 48, 320], + [cls_batch_size, 3, 48, 1024]) + +# cls_option.set_trt_cache_file("cls_trt_cache.trt") +cls_model = fd.vision.ocr.Classifier( + cls_model_file, cls_params_file, runtime_option=cls_option) + +rec_option = option +rec_option.set_trt_input_shape("x", [1, 3, 48, 10], + [rec_batch_size, 3, 48, 320], + [rec_batch_size, 3, 48, 2304]) + +# rec_option.set_trt_cache_file("rec_trt_cache.trt") +rec_model = fd.vision.ocr.Recognizer( + rec_model_file, rec_params_file, rec_label_file, runtime_option=rec_option) + +# Create PPOCRv3 pipeline +ppocr_v3 = fd.vision.ocr.PPOCRv3( + det_model=det_model, cls_model=cls_model, rec_model=rec_model) + +ppocr_v3.cls_batch_size = cls_batch_size +ppocr_v3.rec_batch_size = rec_batch_size + +# Create server, setup REST API +app = SimpleServer() +app.register( + task_name="fd/ppocrv3", + model_handler=fd.serving.handler.VisionModelHandler, + predictor=ppocr_v3) diff --git a/deploy/fastdeploy/sophgo/README.md b/deploy/fastdeploy/sophgo/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9fd2e9563f48263cda37e98a6b4cf49ec5d53b4f --- /dev/null +++ b/deploy/fastdeploy/sophgo/README.md @@ -0,0 +1,102 @@ +[English](README.md) | 简体中文 + +# PaddleOCR 模型在SOPHGO上部署方案-FastDeploy + +## 1. 说明 +PaddleOCR支持通过FastDeploy在SOPHGO上部署相关模型. 
+ +## 2.支持模型列表 + +下表中的模型下载链接由PaddleOCR模型库提供, 详见[PP-OCR系列模型列表](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6/doc/doc_ch/models_list.md) + +| PaddleOCR版本 | 文本框检测 | 方向分类模型 | 文字识别 |字典文件| 说明 | +|:----|:----|:----|:----|:----|:--------| +| ch_PP-OCRv3[推荐] |[ch_PP-OCRv3_det](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) | [ch_ppocr_mobile_v2.0_cls](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) | [ch_PP-OCRv3_rec](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) | [ppocr_keys_v1.txt](https://bj.bcebos.com/paddlehub/fastdeploy/ppocr_keys_v1.txt) | OCRv3系列原始超轻量模型,支持中英文、多语种文本检测 | +| en_PP-OCRv3[推荐] |[en_PP-OCRv3_det](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar) | [ch_ppocr_mobile_v2.0_cls](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) | [en_PP-OCRv3_rec](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) | [en_dict.txt](https://bj.bcebos.com/paddlehub/fastdeploy/en_dict.txt) | OCRv3系列原始超轻量模型,支持英文与数字识别,除检测模型和识别模型的训练数据与中文模型不同以外,无其他区别 | +| ch_PP-OCRv2 |[ch_PP-OCRv2_det](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) | [ch_ppocr_mobile_v2.0_cls](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) | [ch_PP-OCRv2_rec](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) | [ppocr_keys_v1.txt](https://bj.bcebos.com/paddlehub/fastdeploy/ppocr_keys_v1.txt) | OCRv2系列原始超轻量模型,支持中英文、多语种文本检测 | +| ch_PP-OCRv2_mobile |[ch_ppocr_mobile_v2.0_det](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) | [ch_ppocr_mobile_v2.0_cls](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) | [ch_ppocr_mobile_v2.0_rec](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) | [ppocr_keys_v1.txt](https://bj.bcebos.com/paddlehub/fastdeploy/ppocr_keys_v1.txt) | OCRv2系列原始超轻量模型,支持中英文、多语种文本检测,比PPOCRv2更加轻量 | +| ch_PP-OCRv2_server |[ch_ppocr_server_v2.0_det](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) | [ch_ppocr_mobile_v2.0_cls](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) | [ch_ppocr_server_v2.0_rec](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) |[ppocr_keys_v1.txt](https://bj.bcebos.com/paddlehub/fastdeploy/ppocr_keys_v1.txt) | OCRv2服务器系列模型, 支持中英文、多语种文本检测,比超轻量模型更大,但效果更好| + +## 3. 准备PP-OCR推理模型以及转换模型 + +PP-OCRv3包括文本检测模型(ch_PP-OCRv3_det)、方向分类模型(ch_ppocr_mobile_v2.0_cls)、文字识别模型(ch_PP-OCRv3_rec) +SOPHGO-TPU部署模型前需要将以上Paddle模型转换成bmodel模型,我们以ch_PP-OCRv3_det模型为例,具体步骤如下: +- 下载Paddle模型[ch_PP-OCRv3_det](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) +- Pddle模型转换为ONNX模型,请参考[Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX) +- ONNX模型转换bmodel模型的过程,请参考[TPU-MLIR](https://github.com/sophgo/tpu-mlir) +下面我们提供一个example, 供用户参考,完成模型的转换. 
+ +### 3.1 下载ch_PP-OCRv3_det模型,并转换为ONNX模型 +```shell +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar +tar xvf ch_PP-OCRv3_det_infer.tar + +# 修改ch_PP-OCRv3_det模型的输入shape,由动态输入变成固定输入 +python paddle_infer_shape.py --model_dir ch_PP-OCRv3_det_infer \ + --model_filename inference.pdmodel \ + --params_filename inference.pdiparams \ + --save_dir ch_PP-OCRv3_det_infer_fix \ + --input_shape_dict="{'x':[1,3,960,608]}" + +# 请用户自行安装最新发布版本的paddle2onnx, 转换模型到ONNX格式的模型 +paddle2onnx --model_dir ch_PP-OCRv3_det_infer_fix \ + --model_filename inference.pdmodel \ + --params_filename inference.pdiparams \ + --save_file ch_PP-OCRv3_det_infer_fix.onnx \ + --enable_dev_version True +``` + +### 3.2 导出bmodel模型 + +以转换BM1684x的bmodel模型为例子,我们需要下载[TPU-MLIR](https://github.com/sophgo/tpu-mlir)工程,安装过程具体参见[TPU-MLIR文档](https://github.com/sophgo/tpu-mlir/blob/master/README.md)。 +#### 3.2.1 安装 +``` shell +docker pull sophgo/tpuc_dev:latest + +# myname1234是一个示例,也可以设置其他名字 +docker run --privileged --name myname1234 -v $PWD:/workspace -it sophgo/tpuc_dev:latest + +source ./envsetup.sh +./build.sh +``` + +#### 3.2.2 ONNX模型转换为bmodel模型 +``` shell +mkdir ch_PP-OCRv3_det && cd ch_PP-OCRv3_det + +#在该文件中放入测试图片,同时将上一步转换的ch_PP-OCRv3_det_infer_fix.onnx放入该文件夹中 +cp -rf ${REGRESSION_PATH}/dataset/COCO2017 . +cp -rf ${REGRESSION_PATH}/image . +#放入onnx模型文件ch_PP-OCRv3_det_infer_fix.onnx + +mkdir workspace && cd workspace + +#将ONNX模型转换为mlir模型,其中参数--output_names可以通过NETRON查看 +model_transform.py \ + --model_name ch_PP-OCRv3_det \ + --model_def ../ch_PP-OCRv3_det_infer_fix.onnx \ + --input_shapes [[1,3,960,608]] \ + --mean 0.0,0.0,0.0 \ + --scale 0.0039216,0.0039216,0.0039216 \ + --keep_aspect_ratio \ + --pixel_format rgb \ + --output_names sigmoid_0.tmp_0 \ + --test_input ../image/dog.jpg \ + --test_result ch_PP-OCRv3_det_top_outputs.npz \ + --mlir ch_PP-OCRv3_det.mlir + +#将mlir模型转换为BM1684x的F32 bmodel模型 +model_deploy.py \ + --mlir ch_PP-OCRv3_det.mlir \ + --quantize F32 \ + --chip bm1684x \ + --test_input ch_PP-OCRv3_det_in_f32.npz \ + --test_reference ch_PP-OCRv3_det_top_outputs.npz \ + --model ch_PP-OCRv3_det_1684x_f32.bmodel +``` +最终获得可以在BM1684x上能够运行的bmodel模型ch_PP-OCRv3_det_1684x_f32.bmodel。按照上面同样的方法,可以将ch_ppocr_mobile_v2.0_cls,ch_PP-OCRv3_rec转换为bmodel的格式。如果需要进一步对模型进行加速,可以将ONNX模型转换为INT8 bmodel,具体步骤参见[TPU-MLIR文档](https://github.com/sophgo/tpu-mlir/blob/master/README.md)。 + + +## 4. 
详细部署的部署示例 +- [Python部署](python) +- [C++部署](cpp) diff --git a/deploy/fastdeploy/sophgo/cpp/CMakeLists.txt b/deploy/fastdeploy/sophgo/cpp/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..b32846afedc66b290e53721191493645b04fb707 --- /dev/null +++ b/deploy/fastdeploy/sophgo/cpp/CMakeLists.txt @@ -0,0 +1,13 @@ +PROJECT(infer_demo C CXX) +CMAKE_MINIMUM_REQUIRED (VERSION 3.10) +# 指定下载解压后的fastdeploy库路径 +option(FASTDEPLOY_INSTALL_DIR "Path of downloaded fastdeploy sdk.") + +include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake) + +# 添加FastDeploy依赖头文件 +include_directories(${FASTDEPLOY_INCS}) + +add_executable(infer_demo ${PROJECT_SOURCE_DIR}/infer.cc) +# 添加FastDeploy库依赖 +target_link_libraries(infer_demo ${FASTDEPLOY_LIBS}) diff --git a/deploy/fastdeploy/sophgo/cpp/README.md b/deploy/fastdeploy/sophgo/cpp/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0b17f7df879684c77b3a83b01adf4cc550f1f49f --- /dev/null +++ b/deploy/fastdeploy/sophgo/cpp/README.md @@ -0,0 +1,66 @@ +[English](README_CN.md) | 简体中文 +# PP-OCRv3 SOPHGO C++部署示例 +本目录下提供`infer.cc`快速完成PPOCRv3模型在SOPHGO BM1684x板子上加速部署的示例。 + +## 1. 部署环境准备 +在部署前,需自行编译基于SOPHGO硬件的预测库,参考文档[SOPHGO硬件部署环境](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install#算能硬件部署环境) + +## 2. 生成基本目录文件 + +该例程由以下几个部分组成 +```text +. +├── CMakeLists.txt +├── fastdeploy-sophgo # 编译好的SDK文件夹 +├── image # 存放图片的文件夹 +├── infer.cc +└── model # 存放模型文件的文件夹 +``` + +## 3.部署示例 + +### 3.1 下载部署示例代码 +```bash +# 下载部署示例代码 +git clone https://github.com/PaddlePaddle/FastDeploy.git +cd FastDeploy/examples/vision/ocr/PP-OCR/sophgo/cpp + +# 如果您希望从PaddleOCR下载示例代码,请运行 +git clone https://github.com/PaddlePaddle/PaddleOCR.git +# 注意:如果当前分支找不到下面的fastdeploy测试代码,请切换到dygraph分支 +git checkout dygraph +cd PaddleOCR/deploy/fastdeploy/sophgo/cpp +``` + +### 3.2 拷贝bmodel模型文至model文件夹 +将Paddle模型转换为SOPHGO bmodel模型,转换步骤参考[文档](../README.md). 将转换后的SOPHGO bmodel模型文件拷贝至model中. + +### 3.3 准备测试图片至image文件夹,以及字典文件 +```bash +wget https://gitee.com/paddlepaddle/PaddleOCR/raw/release/2.6/doc/imgs/12.jpg +cp 12.jpg image/ + +wget https://gitee.com/paddlepaddle/PaddleOCR/raw/release/2.6/ppocr/utils/ppocr_keys_v1.txt +``` + +### 3.4 编译example + +```bash +cd build +cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-0.0.3 +make +``` + +### 3.5 运行例程 + +```bash +./infer_demo model ./ppocr_keys_v1.txt image/12.jpeg +``` + + +## 4. 更多指南 + +- [PP-OCR系列 C++ API查阅](https://www.paddlepaddle.org.cn/fastdeploy-api-doc/cpp/html/namespacefastdeploy_1_1vision_1_1ocr.html) +- [FastDeploy部署PaddleOCR模型概览](../../) +- [PP-OCRv3 Python部署](../python) +- 如果用户想要调整前后处理超参数、单独使用文字检测识别模型、使用其他模型等,更多详细文档与说明请参考[PP-OCR系列在CPU/GPU上的部署](../../cpu-gpu/cpp/README.md) diff --git a/deploy/fastdeploy/sophgo/cpp/infer.cc b/deploy/fastdeploy/sophgo/cpp/infer.cc new file mode 100644 index 0000000000000000000000000000000000000000..181561b39e94c6e242502de24c17aadcda2d34c7 --- /dev/null +++ b/deploy/fastdeploy/sophgo/cpp/infer.cc @@ -0,0 +1,136 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision.h" +#ifdef WIN32 +const char sep = '\\'; +#else +const char sep = '/'; +#endif + +void InitAndInfer(const std::string &det_model_dir, + const std::string &rec_label_file, + const std::string &image_file, + const fastdeploy::RuntimeOption &option) { + auto det_model_file = + det_model_dir + sep + "ch_PP-OCRv3_det_1684x_f32.bmodel"; + auto det_params_file = det_model_dir + sep + ""; + + auto cls_model_file = + det_model_dir + sep + "ch_ppocr_mobile_v2.0_cls_1684x_f32.bmodel"; + auto cls_params_file = det_model_dir + sep + ""; + + auto rec_model_file = + det_model_dir + sep + "ch_PP-OCRv3_rec_1684x_f32.bmodel"; + auto rec_params_file = det_model_dir + sep + ""; + + auto format = fastdeploy::ModelFormat::SOPHGO; + + auto det_option = option; + auto cls_option = option; + auto rec_option = option; + + // The cls and rec model can inference a batch of images now. + // User could initialize the inference batch size and set them after create + // PPOCR model. + int cls_batch_size = 1; + int rec_batch_size = 1; + + // If use TRT backend, the dynamic shape will be set as follow. + // We recommend that users set the length and height of the detection model to + // a multiple of 32. We also recommend that users set the Trt input shape as + // follow. + det_option.SetTrtInputShape("x", {1, 3, 64, 64}, {1, 3, 640, 640}, + {1, 3, 960, 960}); + cls_option.SetTrtInputShape("x", {1, 3, 48, 10}, {cls_batch_size, 3, 48, 320}, + {cls_batch_size, 3, 48, 1024}); + rec_option.SetTrtInputShape("x", {1, 3, 48, 10}, {rec_batch_size, 3, 48, 320}, + {rec_batch_size, 3, 48, 2304}); + + // Users could save TRT cache file to disk as follow. + // det_option.SetTrtCacheFile(det_model_dir + sep + "det_trt_cache.trt"); + // cls_option.SetTrtCacheFile(cls_model_dir + sep + "cls_trt_cache.trt"); + // rec_option.SetTrtCacheFile(rec_model_dir + sep + "rec_trt_cache.trt"); + + auto det_model = fastdeploy::vision::ocr::DBDetector( + det_model_file, det_params_file, det_option, format); + auto cls_model = fastdeploy::vision::ocr::Classifier( + cls_model_file, cls_params_file, cls_option, format); + auto rec_model = fastdeploy::vision::ocr::Recognizer( + rec_model_file, rec_params_file, rec_label_file, rec_option, format); + + // Users could enable static shape infer for rec model when deploy PP-OCR on + // hardware which can not support dynamic shape infer well, like Huawei Ascend + // series. + rec_model.GetPreprocessor().SetStaticShapeInfer(true); + rec_model.GetPreprocessor().SetRecImageShape({3, 48, 584}); + + assert(det_model.Initialized()); + assert(cls_model.Initialized()); + assert(rec_model.Initialized()); + + // The classification model is optional, so the PP-OCR can also be connected + // in series as follows auto ppocr_v3 = + // fastdeploy::pipeline::PPOCRv3(&det_model, &rec_model); + auto ppocr_v3 = + fastdeploy::pipeline::PPOCRv3(&det_model, &cls_model, &rec_model); + + // Set inference batch size for cls model and rec model, the value could be -1 + // and 1 to positive infinity. When inference batch size is set to -1, it + // means that the inference batch size of the cls and rec models will be the + // same as the number of boxes detected by the det model. + ppocr_v3.SetClsBatchSize(cls_batch_size); + ppocr_v3.SetRecBatchSize(rec_batch_size); + + if (!ppocr_v3.Initialized()) { + std::cerr << "Failed to initialize PP-OCR." 
<< std::endl; + return; + } + + auto im = cv::imread(image_file); + auto im_bak = im.clone(); + + fastdeploy::vision::OCRResult result; + if (!ppocr_v3.Predict(&im, &result)) { + std::cerr << "Failed to predict." << std::endl; + return; + } + + std::cout << result.Str() << std::endl; + + auto vis_im = fastdeploy::vision::VisOcr(im_bak, result); + cv::imwrite("vis_result.jpg", vis_im); + std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl; +} + +int main(int argc, char *argv[]) { + if (argc < 4) { + std::cout << "Usage: infer_demo path/to/model " + "path/to/rec_label_file path/to/image " + "e.g ./infer_demo ./ocr_bmodel " + "./ppocr_keys_v1.txt ./12.jpg" + << std::endl; + return -1; + } + + fastdeploy::RuntimeOption option; + option.UseSophgo(); + option.UseSophgoBackend(); + + std::string model_dir = argv[1]; + std::string rec_label_file = argv[2]; + std::string test_image = argv[3]; + InitAndInfer(model_dir, rec_label_file, test_image, option); + return 0; +} diff --git a/deploy/fastdeploy/sophgo/python/README.md b/deploy/fastdeploy/sophgo/python/README.md new file mode 100644 index 0000000000000000000000000000000000000000..27dbe2694cd7ca71b9f16960937fde8564a5b4d4 --- /dev/null +++ b/deploy/fastdeploy/sophgo/python/README.md @@ -0,0 +1,52 @@ +[English](README.md) | 简体中文 +# PP-OCRv3 SOPHGO Python部署示例 +本目录下提供`infer.py`快速完成 PP-OCRv3 在SOPHGO TPU上部署的示例。 + +## 1. 部署环境准备 + +在部署前,需自行编译基于算能硬件的FastDeploy python wheel包并安装,参考文档[算能硬件部署环境](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install#算能硬件部署环境) + + +## 2.运行部署示例 + +### 2.1 模型准备 +将Paddle模型转换为SOPHGO bmodel模型, 转换步骤参考[文档](../README.md) + +### 2.2 开始部署 +```bash +# 下载部署示例代码 +git clone https://github.com/PaddlePaddle/FastDeploy.git +cd FastDeploy/examples/vision/ocr/PP-OCR/sophgo/python + +# 如果您希望从PaddleOCR下载示例代码,请运行 +git clone https://github.com/PaddlePaddle/PaddleOCR.git +# 注意:如果当前分支找不到下面的fastdeploy测试代码,请切换到dygraph分支 +git checkout dygraph +cd PaddleOCR/deploy/fastdeploy/sophgo/python + +# 下载图片 +wget https://gitee.com/paddlepaddle/PaddleOCR/raw/release/2.6/doc/imgs/12.jpg + +#下载字典文件 +wget https://gitee.com/paddlepaddle/PaddleOCR/raw/release/2.6/ppocr/utils/ppocr_keys_v1.txt + +# 推理 +python3 infer.py --det_model ocr_bmodel/ch_PP-OCRv3_det_1684x_f32.bmodel \ + --cls_model ocr_bmodel/ch_ppocr_mobile_v2.0_cls_1684x_f32.bmodel \ + --rec_model ocr_bmodel/ch_PP-OCRv3_rec_1684x_f32.bmodel \ + --rec_label_file ../ppocr_keys_v1.txt \ + --image ../12.jpg + +# 运行完成后返回结果如下所示 +det boxes: [[42,413],[483,391],[484,428],[43,450]]rec text: 上海斯格威铂尔大酒店 rec score:0.952958 cls label: 0 cls score: 1.000000 +det boxes: [[187,456],[399,448],[400,480],[188,488]]rec text: 打浦路15号 rec score:0.897335 cls label: 0 cls score: 1.000000 +det boxes: [[23,507],[513,488],[515,529],[24,548]]rec text: 绿洲仕格维花园公寓 rec score:0.994589 cls label: 0 cls score: 1.000000 +det boxes: [[74,553],[427,542],[428,571],[75,582]]rec text: 打浦路252935号 rec score:0.900663 cls label: 0 cls score: 1.000000 + +可视化结果保存在sophgo_result.jpg中 +``` + +## 3. 
其它文档 +- [PP-OCRv3 C++部署](../cpp) +- [转换 PP-OCRv3 SOPHGO模型文档](../README.md) +- 如果用户想要调整前后处理超参数、单独使用文字检测识别模型、使用其他模型等,更多详细文档与说明请参考[PP-OCR系列在CPU/GPU上的部署](../../cpu-gpu/cpp/README.md) diff --git a/deploy/fastdeploy/sophgo/python/infer.py b/deploy/fastdeploy/sophgo/python/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..356317099f4072bad3830c403c14913c74f573cd --- /dev/null +++ b/deploy/fastdeploy/sophgo/python/infer.py @@ -0,0 +1,116 @@ +import fastdeploy as fd +import cv2 +import os + + +def parse_arguments(): + import argparse + import ast + parser = argparse.ArgumentParser() + parser.add_argument( + "--det_model", required=True, help="Path of Detection model of PPOCR.") + parser.add_argument( + "--cls_model", + required=True, + help="Path of Classification model of PPOCR.") + parser.add_argument( + "--rec_model", + required=True, + help="Path of Recognization model of PPOCR.") + parser.add_argument( + "--rec_label_file", + required=True, + help="Path of Recognization label of PPOCR.") + parser.add_argument( + "--image", type=str, required=True, help="Path of test image file.") + + return parser.parse_args() + + +args = parse_arguments() + +# 配置runtime,加载模型 +runtime_option = fd.RuntimeOption() +runtime_option.use_sophgo() + +# Detection模型, 检测文字框 +det_model_file = args.det_model +det_params_file = "" +# Classification模型,方向分类,可选 +cls_model_file = args.cls_model +cls_params_file = "" +# Recognition模型,文字识别模型 +rec_model_file = args.rec_model +rec_params_file = "" +rec_label_file = args.rec_label_file + +# PPOCR的cls和rec模型现在已经支持推理一个Batch的数据 +# 定义下面两个变量后, 可用于设置trt输入shape, 并在PPOCR模型初始化后, 完成Batch推理设置 +cls_batch_size = 1 +rec_batch_size = 1 + +# 当使用TRT时,分别给三个模型的runtime设置动态shape,并完成模型的创建. +# 注意: 需要在检测模型创建完成后,再设置分类模型的动态输入并创建分类模型, 识别模型同理. +# 如果用户想要自己改动检测模型的输入shape, 我们建议用户把检测模型的长和高设置为32的倍数. 
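+# Note: this example runs on SOPHGO (use_sophgo() above), where the bmodel input
+# shapes were already fixed during model conversion; the set_trt_input_shape(...)
+# calls below only apply when a TensorRT backend is used.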
+det_option = runtime_option +det_option.set_trt_input_shape("x", [1, 3, 64, 64], [1, 3, 640, 640], + [1, 3, 960, 960]) +# 用户可以把TRT引擎文件保存至本地 +# det_option.set_trt_cache_file(args.det_model + "/det_trt_cache.trt") +det_model = fd.vision.ocr.DBDetector( + det_model_file, + det_params_file, + runtime_option=det_option, + model_format=fd.ModelFormat.SOPHGO) + +cls_option = runtime_option +cls_option.set_trt_input_shape("x", [1, 3, 48, 10], + [cls_batch_size, 3, 48, 320], + [cls_batch_size, 3, 48, 1024]) +# 用户可以把TRT引擎文件保存至本地 +# cls_option.set_trt_cache_file(args.cls_model + "/cls_trt_cache.trt") +cls_model = fd.vision.ocr.Classifier( + cls_model_file, + cls_params_file, + runtime_option=cls_option, + model_format=fd.ModelFormat.SOPHGO) + +rec_option = runtime_option +rec_option.set_trt_input_shape("x", [1, 3, 48, 10], + [rec_batch_size, 3, 48, 320], + [rec_batch_size, 3, 48, 2304]) +# 用户可以把TRT引擎文件保存至本地 +# rec_option.set_trt_cache_file(args.rec_model + "/rec_trt_cache.trt") +rec_model = fd.vision.ocr.Recognizer( + rec_model_file, + rec_params_file, + rec_label_file, + runtime_option=rec_option, + model_format=fd.ModelFormat.SOPHGO) + +# 创建PP-OCR,串联3个模型,其中cls_model可选,如无需求,可设置为None +ppocr_v3 = fd.vision.ocr.PPOCRv3( + det_model=det_model, cls_model=cls_model, rec_model=rec_model) + +# 需要使用下行代码, 来启用rec模型的静态shape推理,这里rec模型的静态输入为[3, 48, 584] +rec_model.preprocessor.static_shape_infer = True +rec_model.preprocessor.rec_image_shape = [3, 48, 584] + +# 给cls和rec模型设置推理时的batch size +# 此值能为-1, 和1到正无穷 +# 当此值为-1时, cls和rec模型的batch size将默认和det模型检测出的框的数量相同 +ppocr_v3.cls_batch_size = cls_batch_size +ppocr_v3.rec_batch_size = rec_batch_size + +# 预测图片准备 +im = cv2.imread(args.image) + +#预测并打印结果 +result = ppocr_v3.predict(im) + +print(result) + +# 可视化结果 +vis_im = fd.vision.vis_ppocr(im, result) +cv2.imwrite("sophgo_result.jpg", vis_im) +print("Visualized result save in ./sophgo_result.jpg") diff --git a/deploy/fastdeploy/web/README.md b/deploy/fastdeploy/web/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5ca9628d30630d538479e6df90c744a1eea52261 --- /dev/null +++ b/deploy/fastdeploy/web/README.md @@ -0,0 +1,33 @@ +[English](README.md) | 简体中文 +# PP-OCRv3 前端部署示例 + +本节介绍部署PaddleOCR的PP-OCRv3模型在浏览器中运行,以及@paddle-js-models/ocr npm包中的js接口。 + + +## 1. 前端部署PP-OCRv3模型 +PP-OCRv3模型web demo使用[**参考文档**](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/application/js/web_demo) + +## 2. 
PP-OCRv3 js接口 + +``` +import * as ocr from "@paddle-js-models/ocr"; +await ocr.init(detConfig, recConfig); +const res = await ocr.recognize(img, option, postConfig); +``` +ocr模型加载和初始化,其中模型为Paddle.js模型格式,js模型转换方式参考[文档](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/application/js/web_demo/README.md) + +**init函数参数** + +> * **detConfig**(dict): 文本检测模型配置参数,默认值为 {modelPath: 'https://js-models.bj.bcebos.com/PaddleOCR/PP-OCRv3/ch_PP-OCRv3_det_infer_js_960/model.json', fill: '#fff', mean: [0.485, 0.456, 0.406],std: [0.229, 0.224, 0.225]}; 其中,modelPath为文本检测模型路径,fill 为图像预处理padding的值,mean和std分别为预处理的均值和标准差 +> * **recConfig**(dict)): 文本识别模型配置参数,默认值为 {modelPath: 'https://js-models.bj.bcebos.com/PaddleOCR/PP-OCRv3/ch_PP-OCRv3_rec_infer_js/model.json', fill: '#000', mean: [0.5, 0.5, 0.5], std: [0.5, 0.5, 0.5]}; 其中,modelPath为文本检测模型路径,fill 为图像预处理padding的值,mean和std分别为预处理的均值和标准差 + + +**recognize函数参数** + +> * **img**(HTMLImageElement): 输入图像参数,类型为HTMLImageElement。 +> * **option**(dict): 可视化文本检测框的canvas参数,可不用设置。 +> * **postConfig**(dict): 文本检测后处理参数,默认值为:{shape: 960, thresh: 0.3, box_thresh: 0.6, unclip_ratio:1.5}; thresh是输出预测图的二值化阈值;box_thresh是输出框的阈值,低于此值的预测框会被丢弃,unclip_ratio是输出框扩大的比例。 + + +## 其它文档 +- [PP-OCRv3 微信小程序部署文档](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/application/js/mini_program) diff --git a/deploy/hubserving/ocr_det/module.py b/deploy/hubserving/ocr_det/module.py index 8fef3be017eef1c6a52395348624f5bfcb6260e7..3dbaf161cdfd59528a36b5d5f645656cade2f1fc 100644 --- a/deploy/hubserving/ocr_det/module.py +++ b/deploy/hubserving/ocr_det/module.py @@ -122,7 +122,7 @@ class OCRDet(hub.Module): rec_res_final = [] for dno in range(len(dt_boxes)): rec_res_final.append({ - 'text_region': dt_boxes[dno].astype(np.int).tolist() + 'text_region': dt_boxes[dno].astype(np.int32).tolist() }) all_results.append(rec_res_final) return all_results diff --git a/deploy/hubserving/ocr_system/module.py b/deploy/hubserving/ocr_system/module.py index dff3abb48010946a9817b832383f1c95b7053970..192fff9650901df7889ae7b0620beea1d8b03e81 100644 --- a/deploy/hubserving/ocr_system/module.py +++ b/deploy/hubserving/ocr_system/module.py @@ -130,7 +130,7 @@ class OCRSystem(hub.Module): rec_res_final.append({ 'text': text, 'confidence': float(score), - 'text_region': dt_boxes[dno].astype(np.int).tolist() + 'text_region': dt_boxes[dno].astype(np.int32).tolist() }) all_results.append(rec_res_final) return all_results diff --git a/deploy/paddlejs/README.md b/deploy/paddlejs/README.md index e80b7cce8249482f0e3e5ef06ed1e9e22b261644..eef39b6c9583e056c1413e335ed989dc957afa51 100644 --- a/deploy/paddlejs/README.md +++ b/deploy/paddlejs/README.md @@ -1,14 +1,26 @@ -English| [简体中文](README_ch.md) +English| [简体中文](README_ch.md) -# Paddle.js +# Paddle.js Introduction -[Paddle.js](https://github.com/PaddlePaddle/Paddle.js) is a web project for Baidu PaddlePaddle, which is an open source deep learning framework running in the browser. Paddle.js can either load a pre-trained model, or transforming a model from paddle-hub with model transforming tools provided by Paddle.js. It could run in every browser with WebGL/WebGPU/WebAssembly supported. It could also run in Baidu Smartprogram and WX miniprogram. +[Paddle.js](https://github.com/PaddlePaddle/Paddle.js) is a web project for Baidu PaddlePaddle, which is an open source deep learning framework running in the browser. Paddle.js can either load a pre-trained model, or transforming a model from paddle-hub with model transforming tools provided by Paddle.js. 
It can run in every browser that supports WebGL/WebGPU/WebAssembly, and also in Baidu smart programs and WeChat mini programs. +## Web Demo +To run the OCR demo in a browser, refer to the [tutorial](https://github.com/PaddlePaddle/FastDeploy/blob/develop/examples/application/js/WebDemo.md). -- [Online experience](https://paddlejs.baidu.com/ocr) -- [Tutorial](https://github.com/PaddlePaddle/Paddle.js/blob/release/v2.2.3/packages/paddlejs-models/ocr/README_cn.md) -- Visualization: +|demo|web demo directory|visualization| +|-|-|-| +|PP-OCRv3|[TextDetection、TextRecognition](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/application/js/web_demo/src/pages/cv/ocr/)|| + + +## Mini Program Demo +To run the OCR demo in a WeChat mini program, refer to the [tutorial](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/application/js/mini_program). + +|demo|directory| +|-|-| +|Text Detection| [ocrdetectXcx](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/application/js/mini_program/ocrdetectXcx/) | +|Text Recognition| [ocrXcx](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/application/js/mini_program/ocrXcx/) |
-
\ No newline at end of file + diff --git a/deploy/paddlejs/README_ch.md b/deploy/paddlejs/README_ch.md index 9e514df085b052e4163812d2b989bc27b15e2ba4..466661305962353e0fdb33ca9834d5880b21cf7a 100644 --- a/deploy/paddlejs/README_ch.md +++ b/deploy/paddlejs/README_ch.md @@ -5,10 +5,27 @@ [Paddle.js](https://github.com/PaddlePaddle/Paddle.js) 是百度 PaddlePaddle 的 web 方向子项目,是一个运行在浏览器中的开源深度学习框架。Paddle.js 可以加载提前训练好的 paddle 模型,通过 Paddle.js 的模型转换工具 paddlejs-converter 变成浏览器友好的模型进行在线推理预测使用。目前,Paddle.js 可以在支持 WebGL/WebGPU/WebAssembly 的浏览器中运行,也可以在百度小程序和微信小程序环境下运行。 -- [在线体验](https://paddlejs.baidu.com/ocr) -- [直达教程](https://github.com/PaddlePaddle/Paddle.js/blob/release/v2.2.3/packages/paddlejs-models/ocr/README_cn.md) +## Web Demo使用 + +在浏览器中直接运行官方OCR demo参考[教程](https://github.com/PaddlePaddle/FastDeploy/blob/develop/examples/application/js/WebDemo.md) + +|demo名称|web demo目录|可视化| +|-|-|-| +|PP-OCRv3|[TextDetection、TextRecognition](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/application/js/web_demo/src/pages/cv/ocr/)|| + + +## 微信小程序Demo使用 + +在微信小程序运行官方demo参考[教程](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/application/js/mini_program) + +|名称|目录| +|-|-| +|OCR文本检测| [ocrdetecXcx](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/application/js/mini_program/ocrdetectXcx/) | +|OCR文本识别| [ocrXcx](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/application/js/mini_program/ocrXcx/) | + + - 效果:
- -
\ No newline at end of file + + diff --git a/deploy/slim/prune/export_prune_model.py b/deploy/slim/prune/export_prune_model.py index f4385972009e1b5382504754dc655381f0cc7717..b64b1d4c1e82b9d7db761aa65ba85f180f3299c6 100644 --- a/deploy/slim/prune/export_prune_model.py +++ b/deploy/slim/prune/export_prune_model.py @@ -25,7 +25,7 @@ sys.path.append(os.path.join(__dir__, '..', '..', '..')) sys.path.append(os.path.join(__dir__, '..', '..', '..', 'tools')) import paddle -from ppocr.data import build_dataloader +from ppocr.data import build_dataloader, set_signal_handlers from ppocr.modeling.architectures import build_model from ppocr.postprocess import build_post_process @@ -39,6 +39,7 @@ def main(config, device, logger, vdl_writer): global_config = config['Global'] # build dataloader + set_signal_handlers() valid_dataloader = build_dataloader(config, 'Eval', device, logger) # build post process diff --git a/deploy/slim/prune/sensitivity_anal.py b/deploy/slim/prune/sensitivity_anal.py index be64a6bcf860c3e2e7a8a6fa20c4c241149a147b..ded8ac04255a1c83f72a28ebba6e19c0387103ac 100644 --- a/deploy/slim/prune/sensitivity_anal.py +++ b/deploy/slim/prune/sensitivity_anal.py @@ -26,7 +26,7 @@ sys.path.append(os.path.join(__dir__, '..', '..', '..', 'tools')) import paddle import paddle.distributed as dist -from ppocr.data import build_dataloader +from ppocr.data import build_dataloader, set_signal_handlers from ppocr.modeling.architectures import build_model from ppocr.losses import build_loss from ppocr.optimizer import build_optimizer @@ -57,6 +57,7 @@ def main(config, device, logger, vdl_writer): global_config = config['Global'] # build dataloader + set_signal_handlers() train_dataloader = build_dataloader(config, 'Train', device, logger) if config['Eval']: valid_dataloader = build_dataloader(config, 'Eval', device, logger) diff --git a/deploy/slim/quantization/README.md b/deploy/slim/quantization/README.md index 7f1ff7ae22e78cded28f1689d66a5e41dd8950a2..d401d3ba0c8ba209994c43b72a7dbf240fe9dd3d 100644 --- a/deploy/slim/quantization/README.md +++ b/deploy/slim/quantization/README.md @@ -54,4 +54,7 @@ python deploy/slim/quantization/export_model.py -c configs/det/ch_PP-OCRv3/ch_PP ### 5. 
量化模型部署 上述步骤导出的量化模型,参数精度仍然是FP32,但是参数的数值范围是int8,导出的模型可以通过PaddleLite的opt模型转换工具完成模型转换。 -量化模型部署的可参考 [移动端模型部署](../../lite/readme.md) + +量化模型移动端部署的可参考 [移动端模型部署](../../lite/readme.md) + +备注:量化训练后的模型参数是float32类型,转inference model预测时相对不量化无加速效果,原因是量化后模型结构之间存在量化和反量化算子,如果要使用量化模型部署,建议使用TensorRT并设置precision为INT8加速量化模型的预测时间。 diff --git a/deploy/slim/quantization/export_model.py b/deploy/slim/quantization/export_model.py index bd132b625181cab853961efd2e2c38c411e9edf4..30696f3e3606da695156ecb3c7adbddf1a0071bb 100755 --- a/deploy/slim/quantization/export_model.py +++ b/deploy/slim/quantization/export_model.py @@ -34,7 +34,7 @@ from tools.program import load_config, merge_config, ArgsParser from ppocr.metrics import build_metric import tools.program as program from paddleslim.dygraph.quant import QAT -from ppocr.data import build_dataloader +from ppocr.data import build_dataloader, set_signal_handlers from tools.export_model import export_single_model @@ -134,6 +134,7 @@ def main(): eval_class = build_metric(config['Metric']) # build dataloader + set_signal_handlers() valid_dataloader = build_dataloader(config, 'Eval', device, logger) use_srn = config['Architecture']['algorithm'] == "SRN" diff --git a/deploy/slim/quantization/quant.py b/deploy/slim/quantization/quant.py index ef2c3e28f94e8b72d1aa7822fc88ecfd5c406b89..a580ce4346dab8c5593ed70f19114e33f9a7738b 100755 --- a/deploy/slim/quantization/quant.py +++ b/deploy/slim/quantization/quant.py @@ -31,7 +31,7 @@ import paddle.distributed as dist paddle.seed(2) -from ppocr.data import build_dataloader +from ppocr.data import build_dataloader, set_signal_handlers from ppocr.modeling.architectures import build_model from ppocr.losses import build_loss from ppocr.optimizer import build_optimizer @@ -95,6 +95,7 @@ def main(config, device, logger, vdl_writer): global_config = config['Global'] # build dataloader + set_signal_handlers() train_dataloader = build_dataloader(config, 'Train', device, logger) if config['Eval']: valid_dataloader = build_dataloader(config, 'Eval', device, logger) diff --git a/deploy/slim/quantization/quant_kl.py b/deploy/slim/quantization/quant_kl.py index 73e1a957e8606fd7cc8269e96eec1e274484db06..fa2d16e8d3a167a87034d23bcd5794cb4acb8f84 100755 --- a/deploy/slim/quantization/quant_kl.py +++ b/deploy/slim/quantization/quant_kl.py @@ -31,7 +31,7 @@ import paddle.distributed as dist paddle.seed(2) -from ppocr.data import build_dataloader +from ppocr.data import build_dataloader, set_signal_handlers from ppocr.modeling.architectures import build_model from ppocr.losses import build_loss from ppocr.optimizer import build_optimizer @@ -117,6 +117,7 @@ def main(config, device, logger, vdl_writer): global_config = config['Global'] # build dataloader + set_signal_handlers() config['Train']['loader']['num_workers'] = 0 is_layoutxlm_ser = config['Architecture']['model_type'] =='kie' and config['Architecture']['Backbone']['name'] == 'LayoutXLMForSer' train_dataloader = build_dataloader(config, 'Train', device, logger) diff --git a/doc/doc_ch/PP-OCRv4_introduction.md b/doc/doc_ch/PP-OCRv4_introduction.md new file mode 100644 index 0000000000000000000000000000000000000000..cf1ac63f864698c7b810b4efb91a3937f57de3e8 --- /dev/null +++ b/doc/doc_ch/PP-OCRv4_introduction.md @@ -0,0 +1,179 @@ +# PP-OCRv4 + +- [1. 简介](#1) +- [2. 检测优化](#2) +- [3. 识别优化](#3) +- [4. 端到端评估](#4) + + + +## 1. 简介 + +PP-OCRv4在PP-OCRv3的基础上进一步升级。整体的框架图保持了与PP-OCRv3相同的pipeline,针对检测模型和识别模型进行了数据、网络结构、训练策略等多个模块的优化。 PP-OCRv4系统框图如下所示: + +
+ (figure: overall PP-OCRv4 system framework)
+ + +从算法改进思路上看,分别针对检测和识别模型,进行了共10个方面的改进: +* 检测模块: + * LCNetV3:精度更高的骨干网络 + * PFHead:并行head分支融合结构 + * DSR: 训练中动态增加shrink ratio + * CML:添加Student和Teacher网络输出的KL div loss +* 识别模块: + * SVTR_LCNetV3:精度更高的骨干网络 + * Lite-Neck:精简的Neck结构 + * GTC-NRTR:稳定的Attention指导分支 + * Multi-Scale:多尺度训练策略 + * DF: 数据挖掘方案 + * DKD :DKD蒸馏策略 + +从效果上看,速度可比情况下,多种场景精度均有大幅提升: +* 中文场景,相对于PP-OCRv3中文模型提升超4%; +* 英文数字场景,相比于PP-OCRv3英文模型提升6%; +* 多语言场景,优化80个语种识别效果,平均准确率提升超8%。 + + + +## 2. 检测优化 + +PP-OCRv4检测模型在PP-OCRv3检测模型的基础上,在网络结构,训练策略,蒸馏策略三个方面做了优化。首先,PP-OCRv4检测模型使用PP-LCNetV3替换MobileNetv3,并提出并行分支融合的PFhead结构;其次,训练时动态调整shrink ratio的比例;最后,PP-OCRv4对CML的蒸馏loss进行优化,进一步提升文字检测效果。 + +消融实验如下: + +|序号|策略|模型大小|hmean|速度(cpu + mkldnn)| +|-|-|-|-|-| +|baseline|PP-OCRv3|3.4M|78.84%|69ms| +|baseline student|PP-OCRv3 student|3.4M|76.22%|69ms| +|01|+PFHead|3.6M|76.97%|96ms| +|02|+Dynamic Shrink Ratio|3.6M|78.24%|96ms| +|03|+PP-LCNetv3|4.8M|79.08%|94ms| +|03|+CML|4.8M|79.87%|67ms| + +测试环境: Intel Gold 6148 CPU,预测引擎使用openvino。 + +**(1)PFhead:多分支融合Head结构** + +PFhead结构如下图所示,PFHead在经过第一个转置卷积后,分别进行上采样和转置卷积,上采样的输出通过3x3卷积得到输出结果,然后和转置卷积的分支的结果级联并经过1x1卷积层,最后1x1卷积的结果和转置卷积的结果相加得到最后输出的概率图。PP-OCRv4学生检测模型使用PFhead,hmean从76.22%增加到76.97%。 + +
+ (figure: PFHead structure)
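To make the branch layout described above concrete, the following is a minimal Paddle sketch; the class name `PFHeadSketch`, the channel counts, and the kernel/stride choices are illustrative assumptions, not the actual PaddleOCR implementation:

```python
import paddle
import paddle.nn as nn
import paddle.nn.functional as F

class PFHeadSketch(nn.Layer):
    def __init__(self, in_channels=256, mid_channels=64):
        super().__init__()
        # first transposed convolution shared by both branches
        self.deconv1 = nn.Conv2DTranspose(in_channels, mid_channels, 2, stride=2)
        # transposed-convolution branch
        self.deconv2 = nn.Conv2DTranspose(mid_channels, 1, 2, stride=2)
        # upsample branch: 3x3 convolution on the upsampled feature
        self.conv3x3 = nn.Conv2D(mid_channels, 1, 3, padding=1)
        # 1x1 convolution applied to the concatenated branch outputs
        self.fuse1x1 = nn.Conv2D(2, 1, 1)

    def forward(self, x):
        x = self.deconv1(x)
        up_branch = self.conv3x3(F.interpolate(x, scale_factor=2, mode="nearest"))
        deconv_branch = self.deconv2(x)
        fused = self.fuse1x1(paddle.concat([up_branch, deconv_branch], axis=1))
        # the 1x1 result is added back to the transposed-convolution branch
        return F.sigmoid(fused + deconv_branch)
```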
+ +**(2)DSR: 收缩比例动态调整策略** + +动态shrink ratio(dynamic shrink ratio): 在训练中,shrink ratio由固定值调整为动态变化,随着训练epoch的增加,shrink ratio从0.4线性增加到0.6。该策略在PP-OCRv4学生检测模型上,hmean从76.97%提升到78.24%。 + +**(3) PP-LCNetV3:精度更高的骨干网络** + +PP-LCNetV3系列模型是PP-LCNet系列模型的延续,覆盖了更大的精度范围,能够适应不同下游任务的需要。PP-LCNetV3系列模型从多个方面进行了优化,提出了可学习仿射变换模块,对重参数化策略、激活函数进行了改进,同时调整了网络深度与宽度。最终,PP-LCNetV3系列模型能够在性能与效率之间达到最佳的平衡,在不同精度范围内取得极致的推理速度。使用PP-LCNetV3替换MobileNetv3 backbone,PP-OCRv4学生检测模型hmean从78.24%提升到79.08%。 + +**(4)CML: 融合KD的互学习策略** + +PP-OCRv4检测模型对PP-OCRv3中的CML(Collaborative Mutual Learning) 协同互学习文本检测蒸馏策略进行了优化。如下图所示,在计算Student Model和Teacher Model的distill Loss时,额外添加KL div loss,让两者输出的response maps分布接近,由此进一步提升Student网络的精度,检测Hmean从79.08%增加到79.56%,端到端指标从61.31%增加到61.87%。 + +
+ (figure: CML distillation with the added KL divergence loss)
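As a rough illustration of the added term, the sketch below normalizes the two response maps and computes a KL divergence between them; the normalization details and the function name are assumptions for readability, not the exact loss used in PP-OCRv4:

```python
import paddle
import paddle.nn.functional as F

def response_map_kl_loss(student_map, teacher_map, eps=1e-6):
    # Flatten the spatial dimensions and normalize each map so it can be
    # treated as a probability distribution over positions.
    s = student_map.flatten(start_axis=1)
    t = teacher_map.flatten(start_axis=1)
    s = s / (s.sum(axis=1, keepdim=True) + eps)
    t = t / (t.sum(axis=1, keepdim=True) + eps)
    # paddle's kl_div expects log-probabilities as the first argument.
    return F.kl_div(paddle.log(s + eps), t, reduction="mean")
```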
+ + +## 3. 识别优化 + +PP-OCRv3的识别模块是基于文本识别算法[SVTR](https://arxiv.org/abs/2205.00159)优化。SVTR不再采用RNN结构,通过引入Transformers结构更加有效地挖掘文本行图像的上下文信息,从而提升文本识别能力。直接将PP-OCRv2的识别模型,替换成SVTR_Tiny,识别准确率从74.8%提升到80.1%(+5.3%),但是预测速度慢了将近11倍,CPU上预测一条文本行,将近100ms。因此,如下图所示,PP-OCRv3采用如下6个优化策略进行识别模型加速。 + +
+ (figure: the six recognition optimization strategies)
+ +基于上述策略,PP-OCRv4识别模型相比PP-OCRv3,在速度可比的情况下,精度进一步提升4%。 具体消融实验如下所示: + +| ID | 策略 | 模型大小 | 精度 | 预测耗时(CPU openvino)| +|-----|-----|--------|----| --- | +| 01 | PP-OCRv3 | 12M | 71.50% | 8.54ms | +| 02 | +DF | 12M | 72.70% | 8.54ms | +| 03 | + LiteNeck + GTC | 9.6M | 73.21% | 9.09ms | +| 04 | + PP-LCNetV3 | 11M | 74.18% | 9.8ms | +| 05 | + multi-scale | 11M | 74.20% | 9.8ms | +| 06 | + TextConAug | 11M | 74.72% | 9.8ms | +| 08 | + UDML | 11M | 75.45% | 9.8ms | + +注: 测试速度时,输入图片尺寸均为(3,48,320)。在实际预测时,图像为变长输入,速度会有所变化。测试环境: Intel Gold 6148 CPU,预测时使用Openvino预测引擎。 + +**(1)DF:数据挖掘方案** + +DF(Data Filter) 是一种简单有效的数据挖掘方案。核心思想是利用已有模型预测训练数据,通过置信度和预测结果等信息,对全量数据进行筛选。具体的:首先使用少量数据快速训练得到一个低精度模型,使用该低精度模型对千万级的数据进行预测,去除置信度大于0.95的样本,该部分被认为是对提升模型精度无效的冗余数据。其次使用PP-OCRv3作为高精度模型,对剩余数据进行预测,去除置信度小于0.15的样本,该部分被认为是难以识别或质量很差的样本。 +使用该策略,千万级别训练数据被精简至百万级,显著提升模型训练效率,模型训练时间从2周减少到5天,同时精度提升至72.7%(+1.2%)。 + + +
+ (figure: DF data mining pipeline)
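The filtering procedure can be summarized in a few lines; the sketch below assumes hypothetical `low_acc_predict` / `high_acc_predict` callables that return `(text, confidence)`, and is only meant to illustrate the two thresholds mentioned above:

```python
def filter_samples(samples, low_acc_predict, high_acc_predict,
                   easy_thresh=0.95, hard_thresh=0.15):
    """Keep samples that are neither trivially easy nor unrecognizable."""
    kept = []
    for image, label in samples:
        _, low_conf = low_acc_predict(image)
        if low_conf > easy_thresh:       # redundant sample, adds little to training
            continue
        _, high_conf = high_acc_predict(image)
        if high_conf < hard_thresh:      # hard to recognize or very low quality
            continue
        kept.append((image, label))
    return kept
```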
+ + +**(2)PP-LCNetV3:精度更优的骨干网络** + +PP-LCNetV3系列模型是PP-LCNet系列模型的延续,覆盖了更大的精度范围,能够适应不同下游任务的需要。PP-LCNetV3系列模型从多个方面进行了优化,提出了可学习仿射变换模块,对重参数化策略、激活函数进行了改进,同时调整了网络深度与宽度。最终,PP-LCNetV3系列模型能够在性能与效率之间达到最佳的平衡,在不同精度范围内取得极致的推理速度。 + +**(3)Lite-Neck:精简参数的Neck结构** + +Lite-Neck整体结构沿用PP-OCRv3版本,在参数上稍作精简,识别模型整体的模型大小可从12M降低到8.5M,而精度不变;在CTCHead中,将Neck输出特征的维度从64提升到120,此时模型大小从8.5M提升到9.6M,精度提升0.5%。 + + +**(4)GTC-NRTR:Attention指导CTC训练策略** + +GTC(Guided Training of CTC),是在PP-OCRv3中使用过的策略,融合多种文本特征的表达,有效的提升文本识别精度。在PP-OCRv4中使用训练更稳定的Transformer模型NRTR作为指导,相比SAR基于循环神经网络的结构,NRTR基于Transformer实现解码过程泛化能力更强,能有效指导CTC分支学习。解决简单场景下快速过拟合的问题。模型大小不变,识别精度提升至73.21%(+0.5%)。 + +
+ (figure: GTC-NRTR guided training structure)
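The guidance idea can be illustrated with a toy two-branch module; the heads below are placeholders rather than real CTC/NRTR implementations, and the point is only that the guidance branch participates in training and is dropped at inference time:

```python
import paddle.nn as nn

class GuidedCTCSketch(nn.Layer):
    def __init__(self, backbone, ctc_head, guide_head):
        super().__init__()
        self.backbone = backbone      # shared feature extractor
        self.ctc_head = ctc_head      # branch kept for deployment
        self.guide_head = guide_head  # NRTR-style branch, used during training only

    def forward(self, x):
        feat = self.backbone(x)
        ctc_out = self.ctc_head(feat)
        if not self.training:
            return ctc_out
        # During training the guidance branch also back-propagates into `feat`,
        # steering what the shared backbone learns.
        return ctc_out, self.guide_head(feat)
```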
+ + +**(5)Multi-Scale:多尺度训练策略** + +动态尺度训练策略,是在训练过程中随机resize输入图片的高度,以增大模型的鲁棒性。在训练过程中随机选择(32,48,64)三种高度进行resize,实验证明在测试集上评估精度不掉,在端到端串联推理时,指标可以提升0.5%。 + +
+ (figure: multi-scale training strategy)
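A minimal version of this augmentation might look like the sketch below; the function name and the use of OpenCV for resizing are assumptions for illustration:

```python
import random
import cv2

def multi_scale_resize(img, heights=(32, 48, 64)):
    # Randomly pick one of the allowed heights and keep the aspect ratio.
    target_h = random.choice(heights)
    scale = target_h / img.shape[0]
    target_w = max(1, int(round(img.shape[1] * scale)))
    return cv2.resize(img, (target_w, target_h))
```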
+ + +**(6)DKD:蒸馏策略** + +识别模型的蒸馏包含两个部分,NRTRhead蒸馏和CTCHead蒸馏; + +对于NRTR head,使用了DKD loss蒸馏,使学生模型NRTR head输出的logits与教师NRTR head接近。最终NRTR head的loss是学生与教师间的DKD loss和与ground truth的cross entropy loss的加权和,用于监督学生模型的backbone训练。通过实验,我们发现加入DKD loss后,计算与ground truth的cross entropy loss时去除label smoothing可以进一步提高精度,因此我们在这里使用的是不带label smoothing的cross entropy loss。 + +对于CTCHead,由于CTC的输出中存在Blank位,即使教师模型和学生模型的预测结果一样,二者的输出的logits分布也会存在差异,影响教师模型向学生模型的知识传递。PP-OCRv4识别模型蒸馏策略中,将CTC输出logits沿着文本长度维度计算均值,将多字符识别问题转换为多字符分类问题,用于监督CTC Head的训练。使用该策略融合NRTRhead DKD蒸馏策略,指标从0.7377提升到0.7545。 + + + + +## 4. 端到端评估 + +经过以上优化,最终PP-OCRv4在速度可比情况下,中文场景端到端Hmean指标相比于PP-OCRv3提升4.5%,效果大幅提升。具体指标如下表所示: + +| Model | Hmean | Model Size (M) | Time Cost (CPU, ms) | +|-----|-----|--------|----| +| PP-OCRv3 | 57.99% | 15.6 | 78 | +| PP-OCRv4 | 62.24% | 15.8 | 76 | + +测试环境:CPU型号为Intel Gold 6148,CPU预测时使用openvino。 + +除了更新中文模型,本次升级也优化了英文数字模型,在自有评估集上文本识别准确率提升6%,如下表所示: + +| Model | ACC | +|-----|-----| +| PP-OCR_en | 54.38% | +| PP-OCRv3_en | 64.04% | +| PP-OCRv4_en | 70.1% | + +同时,也对已支持的80余种语言识别模型进行了升级更新,在有评估集的四种语系识别准确率平均提升5%以上,如下表所示: + +| Model | 拉丁语系 | 阿拉伯语系 | 日语 | 韩语 | +|-----|-----|--------|----| --- | +| PP-OCR_mul | 69.60% | 40.50% | 38.50% | 55.40% | +| PP-OCRv3_mul | 75.20%| 45.37% | 45.80% | 60.10% | +| PP-OCRv4_mul | 80.00%| 75.48% | 56.50% | 83.25% | + diff --git a/doc/doc_ch/PPOCRv3_det_train.md b/doc/doc_ch/PPOCRv3_det_train.md index 601acddee1ba68c90d9a768c16376496080bd711..bcddd249ab7fb61ea757957e7c6c6f852e18d8ab 100644 --- a/doc/doc_ch/PPOCRv3_det_train.md +++ b/doc/doc_ch/PPOCRv3_det_train.md @@ -2,13 +2,13 @@ # PP-OCRv3 文本检测模型训练 - [1. 简介](#1) -- [2. PPOCRv3检测训练](#2) -- [3. 基于PPOCRv3检测的finetune训练](#3) +- [2. PP-OCRv3检测训练](#2) +- [3. 基于PP-OCRv3检测的finetune训练](#3) ## 1. 简介 -PP-OCRv3在PP-OCRv2的基础上进一步升级。本节介绍PP-OCRv3检测模型的训练步骤。有关PPOCRv3策略介绍参考[文档](./PP-OCRv3_introduction.md)。 +PP-OCRv3在PP-OCRv2的基础上进一步升级。本节介绍PP-OCRv3检测模型的训练步骤。有关PP-OCRv3策略介绍参考[文档](./PP-OCRv3_introduction.md)。 @@ -55,10 +55,10 @@ python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/ 训练过程中保存的模型在output目录下,包含以下文件: ``` -best_accuracy.states +best_accuracy.states best_accuracy.pdparams # 默认保存最优精度的模型参数 best_accuracy.pdopt # 默认保存最优精度的优化器相关参数 -latest.states +latest.states latest.pdparams # 默认保存的最新模型参数 latest.pdopt # 默认保存的最新模型的优化器相关参数 ``` @@ -145,19 +145,19 @@ paddle.save(s_params, "./pretrain_models/cml_student.pdparams") -## 3. 基于PPOCRv3检测finetune训练 +## 3. 
基于PP-OCRv3检测finetune训练 -本节介绍如何使用PPOCRv3检测模型在其他场景上的finetune训练。 +本节介绍如何使用PP-OCRv3检测模型在其他场景上的finetune训练。 finetune训练适用于三种场景: -- 基于CML蒸馏方法的finetune训练,适用于教师模型在使用场景上精度高于PPOCRv3检测模型,且希望得到一个轻量检测模型。 -- 基于PPOCRv3轻量检测模型的finetune训练,无需训练教师模型,希望在PPOCRv3检测模型基础上提升使用场景上的精度。 +- 基于CML蒸馏方法的finetune训练,适用于教师模型在使用场景上精度高于PP-OCRv3检测模型,且希望得到一个轻量检测模型。 +- 基于PP-OCRv3轻量检测模型的finetune训练,无需训练教师模型,希望在PP-OCRv3检测模型基础上提升使用场景上的精度。 - 基于DML蒸馏方法的finetune训练,适用于采用DML方法进一步提升精度的场景。 **基于CML蒸馏方法的finetune训练** -下载PPOCRv3训练模型: +下载PP-OCRv3训练模型: ``` wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar tar xf ch_PP-OCRv3_det_distill_train.tar @@ -177,10 +177,10 @@ python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs Global.save_model_dir=./output/ ``` -**基于PPOCRv3轻量检测模型的finetune训练** +**基于PP-OCRv3轻量检测模型的finetune训练** -下载PPOCRv3训练模型,并提取Student结构的模型参数: +下载PP-OCRv3训练模型,并提取Student结构的模型参数: ``` wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar tar xf ch_PP-OCRv3_det_distill_train.tar @@ -248,5 +248,3 @@ python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/ Architecture.Models.Student2.pretrained=./teacher \ Global.save_model_dir=./output/ ``` - - diff --git a/doc/doc_ch/algorithm_det_east.md b/doc/doc_ch/algorithm_det_east.md index 94a0d097d803cf5a74461be8faaadcabbd28938d..ef60e1e0752d61ea468c044e427d0df963b64b0a 100644 --- a/doc/doc_ch/algorithm_det_east.md +++ b/doc/doc_ch/algorithm_det_east.md @@ -26,8 +26,8 @@ |模型|骨干网络|配置文件|precision|recall|Hmean|下载链接| | --- | --- | --- | --- | --- | --- | --- | -|EAST|ResNet50_vd|88.71%| 81.36%| 84.88%| [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_east_v2.0_train.tar)| -|EAST| MobileNetV3| 78.20%| 79.10%| 78.65%| [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_east_v2.0_train.tar)| +|EAST|ResNet50_vd| [det_r50_vd_east.yml](../../configs/det/det_r50_vd_east.yml)|88.71%| 81.36%| 84.88%| [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_east_v2.0_train.tar)| +|EAST|MobileNetV3|[det_mv3_east.yml](../../configs/det/det_mv3_east.yml) | 78.20%| 79.10%| 78.65%| [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_east_v2.0_train.tar)| diff --git a/doc/doc_ch/algorithm_det_sast.md b/doc/doc_ch/algorithm_det_sast.md index 038d73fc15f3203bbcc17997c1a8e1c208f80ba8..f18eaf1a44cb18430fbc3f28d2451ac85e524863 100644 --- a/doc/doc_ch/algorithm_det_sast.md +++ b/doc/doc_ch/algorithm_det_sast.md @@ -73,9 +73,9 @@ python3 tools/export_model.py -c configs/det/det_r50_vd_sast_totaltext.yml -o Gl ``` -SAST文本检测模型推理,需要设置参数`--det_algorithm="SAST"`,同时,还需要增加参数`--det_sast_polygon=True`,可以执行如下命令: +SAST文本检测模型推理,需要设置参数`--det_algorithm="SAST"`,同时,还需要增加参数`--det_box_type=poly`,可以执行如下命令: ``` -python3 tools/infer/predict_det.py --det_algorithm="SAST" --image_dir="./doc/imgs_en/img623.jpg" --det_model_dir="./inference/det_sast_tt/" --det_sast_polygon=True +python3 tools/infer/predict_det.py --det_algorithm="SAST" --image_dir="./doc/imgs_en/img623.jpg" --det_model_dir="./inference/det_sast_tt/" --det_box_type='poly' ``` 可视化文本检测结果默认保存到`./inference_results`文件夹里面,结果文件的名称前缀为'det_res'。结果示例如下: diff --git a/doc/doc_ch/algorithm_overview.md b/doc/doc_ch/algorithm_overview.md index 7f6919c13aad833d8e3fda960bdc172c5fec6c7b..ed556ed9c9e11483b1ba4954f0a2e44219e8219a 100755 --- a/doc/doc_ch/algorithm_overview.md +++ b/doc/doc_ch/algorithm_overview.md @@ -3,6 +3,8 @@ - [1. 
两阶段OCR算法](#1) - [1.1 文本检测算法](#11) - [1.2 文本识别算法](#12) + - [1.3 文本超分辨率算法](#13) + - [1.4 公式识别算法](#14) - [2. 端到端OCR算法](#2) - [3. 表格识别算法](#3) - [4. 关键信息抽取算法](#4) @@ -30,6 +32,7 @@ PaddleOCR将**持续新增**支持OCR领域前沿算法与模型,**欢迎广 - [x] [PSENet](./algorithm_det_psenet.md) - [x] [FCENet](./algorithm_det_fcenet.md) - [x] [DRRG](./algorithm_det_drrg.md) +- [x] [CT](./algorithm_det_ct.md) 在ICDAR2015文本检测公开数据集上,算法效果如下: @@ -49,6 +52,7 @@ PaddleOCR将**持续新增**支持OCR领域前沿算法与模型,**欢迎广 |模型|骨干网络|precision|recall|Hmean|下载链接| | --- | --- | --- | --- | --- | --- | |SAST|ResNet50_vd|89.63%|78.44%|83.66%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_totaltext_v2.0_train.tar)| +|CT|ResNet18_vd|88.68%|81.70%|85.05%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r18_ct_train.tar)| 在CTW1500文本检测公开数据集上,算法效果如下: @@ -107,6 +111,34 @@ PaddleOCR将**持续新增**支持OCR领域前沿算法与模型,**欢迎广 |RobustScanner|ResNet31| 87.77% | rec_r31_robustscanner | [训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_r31_robustscanner.tar)| |RFL|ResNetRFL| 88.63% | rec_resnet_rfl_att | [训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl_att_train.tar) | + + + +### 1.3 文本超分辨率算法 +已支持的文本超分辨率算法列表(戳链接获取使用教程): +- [x] [Text Gestalt](./algorithm_sr_gestalt.md) +- [x] [Text Telescope](./algorithm_sr_telescope.md) + +在TextZoom公开数据集上,算法效果如下: + +|模型|骨干网络|PSNR_Avg|SSIM_Avg|配置文件|下载链接| +|---|---|---|---|---|---| +|Text Gestalt|tsrn|19.28|0.6560| [configs/sr/sr_tsrn_transformer_strock.yml](../../configs/sr/sr_tsrn_transformer_strock.yml)|[训练模型](https://paddleocr.bj.bcebos.com/sr_tsrn_transformer_strock_train.tar)| +|Text Telescope|tbsrn|21.56|0.7411| [configs/sr/sr_telescope.yml](../../configs/sr/sr_telescope.yml)|[训练模型](https://paddleocr.bj.bcebos.com/contribution/sr_telescope_train.tar)| + + + +### 1.4 公式识别算法 + +已支持的公式识别算法列表(戳链接获取使用教程): +- [x] [CAN](./algorithm_rec_can.md) + +在CROHME手写公式数据集上,算法效果如下: + +|模型 |骨干网络|配置文件|ExpRate|下载链接| +| ----- | ----- | ----- | ----- | ----- | +|CAN|DenseNet|[rec_d28_can.yml](../../configs/rec/rec_d28_can.yml)|51.72%|[训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_d28_can_train.tar)| + ## 2. 端到端算法 diff --git a/doc/doc_ch/algorithm_rec_svtr.md b/doc/doc_ch/algorithm_rec_svtr.md index c0e26433e92d8de722b80951ce8ccf17d28d19c3..42a1a9a415126e905daed820ea11aa3927f9b736 100644 --- a/doc/doc_ch/algorithm_rec_svtr.md +++ b/doc/doc_ch/algorithm_rec_svtr.md @@ -159,7 +159,23 @@ Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9999998807907104) ## 5. FAQ -1. 由于`SVTR`使用的算子大多为矩阵相乘,在GPU环境下,速度具有优势,但在CPU开启mkldnn加速环境下,`SVTR`相比于被优化的卷积网络没有优势。 +- 1. GPU和CPU速度对比 + - 由于`SVTR`使用的算子大多为矩阵相乘,在GPU环境下,速度具有优势,但在CPU开启mkldnn加速环境下,`SVTR`相比于被优化的卷积网络没有优势。 +- 2. SVTR模型转ONNX失败 + - 保证`paddle2onnx`和`onnxruntime`版本最新,转onnx命令参考[SVTR模型转onnx步骤实例](https://github.com/PaddlePaddle/PaddleOCR/issues/7821#issuecomment-1271214273)。 +- 3. SVTR转ONNX成功但是推理结果不正确 + - 可能的原因模型参数`out_char_num`设置不正确,应设置为W//4、W//8或者W//12,可以参考[高精度中文场景文本识别模型SVTR的3.3.3章节](https://aistudio.baidu.com/aistudio/projectdetail/5073182?contributionType=1)。 +- 4. 长文本识别优化 + - 参考[高精度中文场景文本识别模型SVTR的3.3章节](https://aistudio.baidu.com/aistudio/projectdetail/5073182?contributionType=1)。 +- 5. 论文结果复现注意事项 + - 数据集使用[ABINet](https://github.com/FangShancheng/ABINet)提供的数据集; + - 默认使用4卡GPU训练,单卡Batchsize默认为512,总Batchsize为2048,对应的学习率为0.0005,当修改Batchsize或者改变GPU卡数,学习率应等比例修改。 +- 6. 
进一步优化的探索点 + - 学习率调整:可以调整为默认的两倍保持Batchsize不变;或者将Batchsize减小为默认的1/2,保持学习率不变; + - 数据增强策略:可选`RecConAug`和`RecAug`; + - 如果不使用STN时,可以将`mixer`的`Local`替换为`Conv`、`local_mixer`全部修改为`[5, 5]`; + - 网格搜索最优的`embed_dim`、`depth`、`num_heads`配置; + - 使用`后Normalization策略`,即是将模型配置`prenorm`修改为`True`。 ## 引用 diff --git a/doc/doc_ch/finetune.md b/doc/doc_ch/finetune.md index 973c4cb103cbf9146993dc3e52b80b6924da6de2..ec4bd06590f8881122799e10072b81a4e0e7ad3f 100644 --- a/doc/doc_ch/finetune.md +++ b/doc/doc_ch/finetune.md @@ -26,21 +26,11 @@ PaddleOCR提供的PP-OCR系列模型在通用场景中性能优异,能够解 ### 2.2 模型选择 -建议选择PP-OCRv2模型(配置文件:[ch_PP-OCRv2_det_student.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_student.yml),预训练模型:[ch_PP-OCRv2_det_distill_train.tar](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar))进行微调,其精度与泛化性能是目前提供的最优预训练模型。 +建议选择PP-OCRv3模型(配置文件:[ch_PP-OCRv3_det_student.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml),预训练模型:[ch_PP-OCRv3_det_distill_train.tar](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar))进行微调,其精度与泛化性能是目前提供的最优预训练模型。 -更多PP-OCR系列模型,请参考[PaddleOCR 首页说明文档](../../README_ch.md)。 +更多PP-OCR系列模型,请参考[PP-OCR 系列模型库](./models_list.md)。 -注意:在使用上述预训练模型的时候,由于保存的模型中包含教师模型,因此需要将其中的学生模型单独提取出来,再加载学生模型即可进行模型微调。 - -```python -import paddle -# 加载完整的检测预训练模型 -a = paddle.load("ch_PP-OCRv2_det_distill_train/best_accuracy.pdparams") -# 提取学生模型的参数 -b = {k[len("student_model."):]: a[k] for k in a if "student_model." in k} -# 保存模型,用于后续模型微调 -paddle.save(b, "ch_PP-OCRv2_det_student.pdparams") -``` +注意:在使用上述预训练模型的时候,需要使用文件夹中的`student.pdparams`文件作为预训练模型,即,仅使用学生模型。 ### 2.3 训练超参选择 @@ -49,7 +39,7 @@ paddle.save(b, "ch_PP-OCRv2_det_student.pdparams") ```yaml Global: - pretrained_model: ./pretrain_models/student.pdparams # 预训练模型路径 + pretrained_model: ./ch_PP-OCRv3_det_distill_train/student.pdparams # 预训练模型路径 Optimizer: lr: name: Cosine @@ -67,7 +57,7 @@ Train: num_workers: 4 ``` -上述配置文件中,首先需要将`pretrained_model`字段指定为2.2章节中提取出来的`ch_PP-OCRv2_det_student.pdparams`文件路径。 +上述配置文件中,首先需要将`pretrained_model`字段指定为`student.pdparams`文件路径。 PaddleOCR提供的配置文件是在8卡训练(相当于总的batch size是`8*8=64`)、且没有加载预训练模型情况下的配置文件,因此您的场景中,学习率与总的batch size需要对应线性调整,例如 @@ -88,7 +78,7 @@ PaddleOCR提供的配置文件是在8卡训练(相当于总的batch size是`8* | det_db_score_mode | str | "fast" | DB的检测结果得分计算方法,支持`fast`和`slow`,`fast`是根据polygon的外接矩形边框内的所有像素计算平均得分,`slow`是根据原始polygon内的所有像素计算平均得分,计算速度相对较慢一些,但是更加准确一些。 | -更多关于推理方法的介绍可以参考[Paddle Inference推理教程](./inference.md)。 +更多关于推理方法的介绍可以参考[Paddle Inference推理教程](././inference_ppocr.md)。 ## 3. 
文本识别模型微调 @@ -109,10 +99,70 @@ PaddleOCR提供的配置文件是在8卡训练(相当于总的batch size是`8* ### 3.2 模型选择 -建议选择PP-OCRv2模型(配置文件:[ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml),预训练模型:[ch_PP-OCRv2_rec_train.tar](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar))进行微调,其精度与泛化性能是目前提供的最优预训练模型。 +建议选择PP-OCRv3模型(配置文件:[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml),预训练模型:[ch_PP-OCRv3_rec_train.tar](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar))进行微调,其精度与泛化性能是目前提供的最优预训练模型。 -更多PP-OCR系列,模型请参考[PaddleOCR 首页说明文档](../../README_ch.md)。 +更多PP-OCR系列模型,请参考[PP-OCR 系列模型库](./models_list.md)。 +PP-OCRv3 模型使用了GTC策略,其中SAR分支参数量大,当训练数据为简单场景时模型容易过拟合,导致微调效果不佳,建议去除GTC策略,模型结构部分配置文件修改如下: + +```yaml +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Neck: + name: SequenceEncoder + encoder_type: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: False + Head: + name: CTCHead + fc_decay: 0.00001 +Loss: + name: CTCLoss + +Train: + dataset: + ...... + transforms: + # 去除 RecConAug 增广 + # - RecConAug: + # prob: 0.5 + # ext_data_num: 2 + # image_shape: [48, 320, 3] + # max_text_length: *max_text_length + - RecAug: + # 修改 Encode 方式 + - CTCLabelEncode: + - KeepKeys: + keep_keys: + - image + - label + - length +... + +Eval: + dataset: + ... + transforms: + ... + - CTCLabelEncode: + - KeepKeys: + keep_keys: + - image + - label + - length +... + + +``` ### 3.3 训练超参选择 @@ -147,7 +197,7 @@ Train: ``` -上述配置文件中,首先需要将`pretrained_model`字段指定为2.2章节中解压得到的`ch_PP-OCRv2_rec_train/best_accuracy.pdparams`文件路径。 +上述配置文件中,首先需要将`pretrained_model`字段指定为3.2章节中解压得到的`ch_PP-OCRv3_rec_train/best_accuracy.pdparams`文件路径。 PaddleOCR提供的配置文件是在8卡训练(相当于总的batch size是`8*128=1024`)、且没有加载预训练模型情况下的配置文件,因此您的场景中,学习率与总的batch size需要对应线性调整,例如: @@ -173,7 +223,9 @@ Train: ratio_list: [1.0, 0.1] ``` + ### 3.4 训练调优 -训练过程并非一蹴而就的,完成一个阶段的训练评估后,建议收集分析当前模型在真实场景中的 badcase,有针对性的调整训练数据比例,或者进一步新增合成数据。 -通过多次迭代训练,不断优化模型效果。 +训练过程并非一蹴而就的,完成一个阶段的训练评估后,建议收集分析当前模型在真实场景中的 badcase,有针对性的调整训练数据比例,或者进一步新增合成数据。通过多次迭代训练,不断优化模型效果。 + +如果在训练时修改了自定义字典,由于无法加载最后一层FC的参数,在迭代初期acc=0是正常的情况,不必担心,加载预训练模型依然可以加快模型收敛。 diff --git a/doc/doc_ch/inference_args.md b/doc/doc_ch/inference_args.md index 24e7223e397c94fe65b0f26d993fc507b323ed16..aad7973b16ee5d501d48ce6b8d2bc46d05c27c2a 100644 --- a/doc/doc_ch/inference_args.md +++ b/doc/doc_ch/inference_args.md @@ -70,7 +70,7 @@ SAST算法相关参数如下 | :--: | :--: | :--: | :--: | | det_sast_score_thresh | float | 0.5 | SAST后处理中的得分阈值 | | det_sast_nms_thresh | float | 0.5 | SAST后处理中nms的阈值 | -| det_sast_polygon | bool | False | 是否多边形检测,弯曲文本场景(如Total-Text)设置为True | +| det_box_type | str | quad | 是否多边形检测,弯曲文本场景(如Total-Text)设置为'poly' | PSE算法相关参数如下 @@ -79,7 +79,7 @@ PSE算法相关参数如下 | det_pse_thresh | float | 0.0 | 对输出图做二值化的阈值 | | det_pse_box_thresh | float | 0.85 | 对box进行过滤的阈值,低于此阈值的丢弃 | | det_pse_min_area | float | 16 | box的最小面积,低于此阈值的丢弃 | -| det_pse_box_type | str | "box" | 返回框的类型,box:四点坐标,poly: 弯曲文本的所有点坐标 | +| det_box_type | str | "quad" | 返回框的类型,quad:四点坐标,poly: 弯曲文本的所有点坐标 | | det_pse_scale | int | 1 | 输入图像相对于进后处理的图的比例,如`640*640`的图像,网络输出为`160*160`,scale为2的情况下,进后处理的图片shape为`320*320`。这个值调大可以加快后处理速度,但是会带来精度的下降 | * 文本识别模型相关 @@ -88,7 +88,7 @@ PSE算法相关参数如下 | :--: | :--: | :--: | :--: | | rec_algorithm | str | "CRNN" | 文本识别算法名称,目前支持`CRNN`, `SRN`, `RARE`, `NETR`, `SAR`, `ViTSTR`, `ABINet`, `VisionLAN`, `SPIN`, `RobustScanner`, `SVTR`, `SVTR_LCNet` | | 
rec_model_dir | str | 无,如果使用识别模型,该项是必填项 | 识别inference模型路径 | -| rec_image_shape | list | [3, 48, 320] | 识别时的图像尺寸 | +| rec_image_shape | str | "3,48,320" | 识别时的图像尺寸 | | rec_batch_num | int | 6 | 识别的batch size | | max_text_length | int | 25 | 识别结果最大长度,在`SRN`中有效 | | rec_char_dict_path | str | "./ppocr/utils/ppocr_keys_v1.txt" | 识别的字符字典文件 | @@ -115,7 +115,7 @@ PSE算法相关参数如下 | :--: | :--: | :--: | :--: | | use_angle_cls | bool | False | 是否使用方向分类器 | | cls_model_dir | str | 无,如果需要使用,则必须显式指定路径 | 方向分类器inference模型路径 | -| cls_image_shape | list | [3, 48, 192] | 预测尺度 | +| cls_image_shape | str | "3,48,192" | 预测尺度 | | label_list | list | ['0', '180'] | class id对应的角度值 | | cls_batch_num | int | 6 | 方向分类器预测的batch size | | cls_thresh | float | 0.9 | 预测阈值,模型预测结果为180度,且得分大于该阈值时,认为最终预测结果为180度,需要翻转 | diff --git a/doc/doc_ch/inference_ppocr.md b/doc/doc_ch/inference_ppocr.md index 2061f059d13cebce0586334fa536e7c92b0a6bb6..085802fc2cd41bbe922f0d794bcbc724ebdbe49d 100644 --- a/doc/doc_ch/inference_ppocr.md +++ b/doc/doc_ch/inference_ppocr.md @@ -11,7 +11,7 @@ - [2.3 多语言模型的推理](#23-多语言模型的推理) - [3. 方向分类模型推理](#3-方向分类模型推理) - [4. 文本检测、方向分类和文字识别串联推理](#4-文本检测方向分类和文字识别串联推理) - - [5. TensorRT推理](5-TensorRT推理) + - [5. TensorRT推理](#5-tensorrt推理) @@ -144,7 +144,7 @@ Predicts of ./doc/imgs_words/ch/word_4.jpg:['0', 0.9999982] **注意** `PP-OCRv3`的识别模型使用的输入shape为`3,48,320`, 如果使用其他识别模型,则需根据模型设置参数`--rec_image_shape`。此外,`PP-OCRv3`的识别模型默认使用的`rec_algorithm`为`SVTR_LCNet`,注意和原始`SVTR`的区别。 -以超轻量中文OCR模型推理为例,在执行预测时,需要通过参数`image_dir`指定单张图像或者图像集合的路径,也支持PDF文件、参数`det_model_dir`,`cls_model_dir`和`rec_model_dir`分别指定检测,方向分类和识别的inference模型路径。参数`use_angle_cls`用于控制是否启用方向分类模型。`use_mp`表示是否使用多进程。`total_process_num`表示在使用多进程时的进程数。可视化识别结果默认保存到 ./inference_results 文件夹里面。 +以超轻量中文OCR模型推理为例,在执行预测时,需要通过参数`image_dir`指定单张图像或者图像集合的路径,也支持PDF文件、参数`det_model_dir`,`cls_model_dir`和`rec_model_dir`分别指定检测,方向分类和识别的inference模型路径。参数`use_angle_cls`用于控制是否启用方向分类模型。`use_mp`表示是否使用多进程(Paddle Inference并不是线程安全,建议使用多进程)。`total_process_num`表示在使用多进程时的进程数。可视化识别结果默认保存到 ./inference_results 文件夹里面。 ```shell # 使用方向分类器 diff --git a/doc/doc_ch/kie.md b/doc/doc_ch/kie.md index b6f38a662fd98597011c5a51ff29c417d880ca17..26d2e560fce4d5208eb72a033d315d27da1a5577 100644 --- a/doc/doc_ch/kie.md +++ b/doc/doc_ch/kie.md @@ -438,7 +438,25 @@ inference/ser_vi_layoutxlm/ └── inference.pdmodel # inference模型的模型结构文件 ``` -RE任务的动转静过程适配中,敬请期待。 +信息抽取模型中的RE任务转inference模型步骤如下: + +``` bash +# -c 后面设置训练算法的yml配置文件 +# -o 配置可选参数 +# Architecture.Backbone.checkpoints 参数设置待转换的训练模型地址 +# Global.save_inference_dir 参数设置转换的模型将保存的地址 + +python3 tools/export_model.py -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./output/re_vi_layoutxlm_xfund_zh/best_accuracy Global.save_inference_dir=./inference/re_vi_layoutxlm +``` + +转换成功后,在目录下有三个文件: + +``` +inference/re_vi_layoutxlm/ + ├── inference.pdiparams # inference模型的参数文件 + ├── inference.pdiparams.info # inference模型的参数信息,可忽略 + └── inference.pdmodel # inference模型的模型结构文件 +``` ## 4.2 模型推理 @@ -461,6 +479,26 @@ python3 kie/predict_kie_token_ser.py \ +VI-LayoutXLM模型基于RE任务进行推理,可以执行如下命令: + +```bash +cd ppstructure +python3 kie/predict_kie_token_ser_re.py \ + --kie_algorithm=LayoutXLM \ + --re_model_dir=../inference/re_vi_layoutxlm \ + --ser_model_dir=../inference/ser_vi_layoutxlm \ + --use_visual_backbone=False \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../train_data/XFUND/class_list_xfun.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" +``` + +RE可视化结果默认保存到`./output`文件夹里面,结果示例如下: + +
+ (figure: example of the RE visualization result)
# 5. FAQ diff --git a/doc/doc_ch/models_list.md b/doc/doc_ch/models_list.md index c6cbd6873f776c2b8eab49be496fa847929d85a0..7126a1a3cc9e4f53a9bd73306b2c0055ab02a554 100644 --- a/doc/doc_ch/models_list.md +++ b/doc/doc_ch/models_list.md @@ -1,12 +1,13 @@ -# PP-OCR系列模型列表(V3,2022年4月28日更新) +# PP-OCR系列模型列表(V4,2023年8月1日更新) > **说明** -> 1. V3版模型相比V2版模型,在模型精度上有进一步提升 -> 2. 2.0+版模型和[1.1版模型](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_ch/models_list.md) 的主要区别在于动态图训练vs.静态图训练,模型性能上无明显差距。 -> 3. 本文档提供的是PPOCR自研模型列表,更多基于公开数据集的算法介绍与预训练模型可以参考:[算法概览文档](./algorithm_overview.md)。 +> 1. V4版模型相比V3版模型,在模型精度上有进一步提升 +> 2. V3版模型相比V2版模型,在模型精度上有进一步提升 +> 3. 2.0+版模型和[1.1版模型](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_ch/models_list.md) 的主要区别在于动态图训练vs.静态图训练,模型性能上无明显差距。 +> 4. 本文档提供的是PPOCR自研模型列表,更多基于公开数据集的算法介绍与预训练模型可以参考:[算法概览文档](./algorithm_overview.md)。 -- PP-OCR系列模型列表(V3,2022年4月28日更新) +- PP-OCR系列模型列表(V4,2023年8月1日更新) - [1. 文本检测模型](#1-文本检测模型) - [1.1 中文检测模型](#1.1) - [2.2 英文检测模型](#1.2) @@ -41,8 +42,10 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训 |模型名称|模型简介|配置文件|推理模型大小|下载地址| | --- | --- | --- | --- | --- | -|ch_PP-OCRv3_det_slim|【最新】slim量化+蒸馏版超轻量模型,支持中英文、多语种文本检测|[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 1.1M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_distill_train.tar) / [nb模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.nb)| -|ch_PP-OCRv3_det| 【最新】原始超轻量模型,支持中英文、多语种文本检测 |[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.80M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar)| +|ch_PP-OCRv4_det| 【最新】原始超轻量模型,支持中英文、多语种文本检测 |[ch_PP-OCRv4_det_cml.yml](../../configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det_cml.yml)| 4.70M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_det_train.tar)| +|ch_PP-OCRv4_server_det| 【最新】原始高精度模型,支持中英文、多语种文本检测 |[ch_PP-OCRv4_det_teacher.yml](../../configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det_teacher.yml)| 110M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_det_server_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_det_server_train.tar)| +|ch_PP-OCRv3_det_slim|slim量化+蒸馏版超轻量模型,支持中英文、多语种文本检测|[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 1.1M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_distill_train.tar) / [nb模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.nb)| +|ch_PP-OCRv3_det| 原始超轻量模型,支持中英文、多语种文本检测 |[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.80M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar)| |ch_PP-OCRv2_det_slim| slim量化+蒸馏版超轻量模型,支持中英文、多语种文本检测|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)| 3.0M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar)| |ch_PP-OCRv2_det| 
原始超轻量模型,支持中英文、多语种文本检测|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)|3.0M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)| |ch_ppocr_mobile_slim_v2.0_det|slim裁剪版超轻量模型,支持中英文、多语种文本检测|[ch_det_mv3_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml)| 2.60M |[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/slim/ch_ppocr_mobile_v2.0_det_prune_infer.tar)| @@ -81,8 +84,10 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训 |模型名称|模型简介|配置文件|推理模型大小|下载地址| | --- | --- | --- | --- | --- | -|ch_PP-OCRv3_rec_slim |【最新】slim量化版超轻量模型,支持中英文、数字识别|[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 4.9M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_train.tar) / [nb模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.nb) | -|ch_PP-OCRv3_rec|【最新】原始超轻量模型,支持中英文、数字识别|[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 12.4M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) | +|ch_PP-OCRv4_rec|【最新】超轻量模型,支持中英文、数字识别|[ch_PP-OCRv4_rec_distill.yml](../../configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_distill.yml)| 10M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_rec_train.tar) | +|ch_PP-OCRv4_server_rec|【最新】高精度模型,支持中英文、数字识别|[ch_PP-OCRv4_rec_hgnet.yml](../../configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_hgnet.yml)| 88M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_rec_server_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_rec_server_train.tar) | +|ch_PP-OCRv3_rec_slim |slim量化版超轻量模型,支持中英文、数字识别|[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 4.9M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_train.tar) / [nb模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.nb) | +|ch_PP-OCRv3_rec|原始超轻量模型,支持中英文、数字识别|[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 12.4M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) | |ch_PP-OCRv2_rec_slim| slim量化版超轻量模型,支持中英文、数字识别|[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml)| 9.0M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_train.tar) | |ch_PP-OCRv2_rec| 原始超轻量模型,支持中英文、数字识别|[ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml)|8.50M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) | |ch_ppocr_mobile_slim_v2.0_rec|slim裁剪量化版超轻量模型,支持中英文、数字识别|[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml)| 6.0M 
|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_train.tar) | @@ -96,8 +101,9 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训 |模型名称|模型简介|配置文件|推理模型大小|下载地址| | --- | --- | --- | --- | --- | -|en_PP-OCRv3_rec_slim |【最新】slim量化版超轻量模型,支持英文、数字识别 | [en_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml)| 3.2M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_train.tar) / [nb模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.nb) | -|en_PP-OCRv3_rec |【最新】原始超轻量模型,支持英文、数字识别|[en_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml)| 9.6M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar) | +|en_PP-OCRv4_rec |【最新】原始超轻量模型,支持英文、数字识别|[en_PP-OCRv4_rec.yml](../../configs/rec/PP-OCRv4/en_PP-OCRv4_rec.yml)| 9.7M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_train.tar) | +|en_PP-OCRv3_rec_slim |slim量化版超轻量模型,支持英文、数字识别 | [en_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml)| 3.2M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_train.tar) / [nb模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.nb) | +|en_PP-OCRv3_rec |原始超轻量模型,支持英文、数字识别|[en_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml)| 9.6M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar) | |en_number_mobile_slim_v2.0_rec|slim裁剪量化版超轻量模型,支持英文、数字识别|[rec_en_number_lite_train.yml](../../configs/rec/multi_language/rec_en_number_lite_train.yml)| 2.7M | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/en_number_mobile_v2.0_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/en_number_mobile_v2.0_rec_slim_train.tar) | |en_number_mobile_v2.0_rec|原始超轻量模型,支持英文、数字识别|[rec_en_number_lite_train.yml](../../configs/rec/multi_language/rec_en_number_lite_train.yml)|2.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_train.tar) | diff --git a/doc/doc_ch/ocr_book.md b/doc/doc_ch/ocr_book.md index 03a6011b6b921eff82ab41863058341fc599e41b..420bc8860ab8d0d34edd78587683f8f09b6c910f 100644 --- a/doc/doc_ch/ocr_book.md +++ b/doc/doc_ch/ocr_book.md @@ -1,6 +1,6 @@ # 《动手学OCR》电子书 -《动手学OCR》是PaddleOCR团队携手复旦大学青年研究员陈智能、中国移动研究院视觉领域资深专家黄文辉等产学研同仁,以及OCR开发者共同打造的结合OCR前沿理论与代码实践的教材。主要特色如下: +《动手学OCR》是PaddleOCR团队携手华中科技大学博导/教授,IAPR Fellow 白翔、复旦大学青年研究员陈智能、中国移动研究院视觉领域资深专家黄文辉、中国工商银行大数据人工智能实验室研究员等产学研同仁,以及OCR开发者共同打造的结合OCR前沿理论与代码实践的教材。主要特色如下: - 覆盖从文本检测识别到文档分析的OCR全栈技术 - 紧密结合理论实践,跨越代码实现鸿沟,并配套教学视频 @@ -21,5 +21,5 @@ ## 资料地址 - 中文版电子书下载请扫描首页二维码入群后领取 -- [notebook教程](../../notebook/notebook_ch/) +- [notebook教程](https://github.com/PaddleOCR-Community/Dive-into-OCR) - [教学视频](https://aistudio.baidu.com/aistudio/education/group/info/25207) diff --git a/doc/doc_ch/quickstart.md b/doc/doc_ch/quickstart.md index 
cac7664c2fb38b91efa4b3f2daa388b90e1ee1f8..0600d1642c34f495fef9ec40406b62cef7131794 100644 --- a/doc/doc_ch/quickstart.md +++ b/doc/doc_ch/quickstart.md @@ -107,9 +107,10 @@ cd /path/to/ppocr_img ``` **版本说明** -paddleocr默认使用PP-OCRv3模型(`--ocr_version PP-OCRv3`),如需使用其他版本可通过设置参数`--ocr_version`,具体版本说明如下: +paddleocr默认使用PP-OCRv4模型(`--ocr_version PP-OCRv4`),如需使用其他版本可通过设置参数`--ocr_version`,具体版本说明如下: | 版本名称 | 版本说明 | | --- | --- | +| PP-OCRv4 | 支持中、英文检测和识别,方向分类器,支持多语种识别 | | PP-OCRv3 | 支持中、英文检测和识别,方向分类器,支持多语种识别 | | PP-OCRv2 | 支持中英文的检测和识别,方向分类器,多语言暂未更新 | | PP-OCR | 支持中、英文检测和识别,方向分类器,支持多语种识别 | @@ -210,7 +211,7 @@ from paddleocr import PaddleOCR, draw_ocr # Paddleocr目前支持的多语言语种可以通过修改lang参数进行切换 # 例如`ch`, `en`, `fr`, `german`, `korean`, `japan` -ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=2) # need to run only once to download and load model into memory +ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=2) # need to run only once to download and load model into memory img_path = './xxx.pdf' result = ocr.ocr(img_path, cls=True) for idx in range(len(result)): @@ -251,4 +252,4 @@ for idx in range(len(result)): 通过本节内容,相信您已经熟练掌握PaddleOCR whl包的使用方法并获得了初步效果。 -PaddleOCR是一套丰富领先实用的OCR工具库,打通数据、模型训练、压缩和推理部署全流程,您可以参考[文档教程](../../README_ch.md#文档教程),正式开启PaddleOCR的应用之旅。 +飞桨AI套件(PaddleX)提供了飞桨生态优质模型,是训压推一站式全流程高效率开发平台,其使命是助力AI技术快速落地,愿景是使人人成为AI Developer!目前PP-OCRv4已上线PaddleX,您可以进入[通用OCR](https://aistudio.baidu.com/aistudio/modelsdetail?modelId=286)体验模型训练、压缩和推理部署全流程。 diff --git a/doc/doc_ch/table_recognition.md b/doc/doc_ch/table_recognition.md index 156ba80e37d268ab419ca8f301ed5703563f9ea7..8b8dad661fcc358af5f094ab31e8a9ea2bea25b0 100644 --- a/doc/doc_ch/table_recognition.md +++ b/doc/doc_ch/table_recognition.md @@ -6,6 +6,7 @@ - [1.1. 数据集格式](#11-数据集格式) - [1.2. 数据下载](#12-数据下载) - [1.3. 数据集生成](#13-数据集生成) + - [1.4 数据标注](#14-数据标注) - [2. 开始训练](#2-开始训练) - [2.1. 启动训练](#21-启动训练) - [2.2. 断点训练](#22-断点训练) @@ -14,6 +15,9 @@ - [2.5. 分布式训练](#25-分布式训练) - [2.6. 其他训练环境](#26-其他训练环境) - [2.7. 模型微调](#27-模型微调) + - [2.7.1 数据选择](#271-数据选择) + - [2.7.2 模型选择](#272-模型选择) + - [2.7.3 训练超参选择](#273-训练超参选择) - [3. 模型评估与预测](#3-模型评估与预测) - [3.1. 指标评估](#31-指标评估) - [3.2. 测试表格结构识别效果](#32-测试表格结构识别效果) @@ -36,15 +40,15 @@ img_label 每一行的json格式为: ```txt { - 'filename': PMC5755158_010_01.png, # 图像名 - 'split': ’train‘, # 图像属于训练集还是验证集 - 'imgid': 0, # 图像的index + 'filename': PMC5755158_010_01.png, # 图像名 + 'split': ’train‘, # 图像属于训练集还是验证集 + 'imgid': 0, # 图像的index 'html': { - 'structure': {'tokens': ['', '', '', ...]}, # 表格的HTML字符串 + 'structure': {'tokens': ['', '', '', ...]}, # 表格的HTML字符串 'cells': [ { - 'tokens': ['P', 'a', 'd', 'd', 'l', 'e', 'P', 'a', 'd', 'd', 'l', 'e'], # 表格中的单个文本 - 'bbox': [x0, y0, x1, y1] # 表格中的单个文本的坐标 + 'tokens': ['P', 'a', 'd', 'd', 'l', 'e', 'P', 'a', 'd', 'd', 'l', 'e'], # 表格中的单个文本 + 'bbox': [x0, y0, x1, y1] # 表格中的单个文本的坐标 } ] } @@ -75,6 +79,10 @@ TableGeneration是一个开源表格数据集生成工具,其通过浏览器 |简单表格|![](https://raw.githubusercontent.com/WenmuZhou/TableGeneration/main/imgs/simple.jpg)| |彩色表格|![](https://raw.githubusercontent.com/WenmuZhou/TableGeneration/main/imgs/color.jpg)| +## 1.4 数据标注 + +数据标注可参考[PPOCRLabel](../../PPOCRLabel/README_ch.md) + # 2. 开始训练 PaddleOCR提供了训练脚本、评估脚本和预测脚本,本节将以 [SLANet](../../configs/table/SLANet.yml) 模型训练PubTabNet英文数据集为例: @@ -219,7 +227,39 @@ DCU设备上运行需要设置环境变量 `export HIP_VISIBLE_DEVICES=0,1,2,3` ## 2.7. 
模型微调 -实际使用过程中,建议加载官方提供的预训练模型,在自己的数据集中进行微调,关于模型的微调方法,请参考:[模型微调教程](./finetune.md)。 +### 2.7.1 数据选择 + +数据量:建议至少准备2000张的表格识别数据集用于模型微调。 + +### 2.7.2 模型选择 + +建议选择SLANet模型(配置文件:[SLANet_ch.yml](../../configs/table/SLANet_ch.yml),预训练模型:[ch_ppstructure_mobile_v2.0_SLANet_train.tar](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar))进行微调,其精度与泛化性能是目前提供的最优中文表格预训练模型。 + +更多表格识别模型,请参考[PP-Structure 系列模型库](../../ppstructure/docs/models_list.md)。 + +### 2.7.3 训练超参选择 + +在模型微调的时候,最重要的超参就是预训练模型路径`pretrained_model`, 学习率`learning_rate`,部分配置文件如下所示。 + +```yaml +Global: + pretrained_model: ./ch_ppstructure_mobile_v2.0_SLANet_train/best_accuracy.pdparams # 预训练模型路径 +Optimizer: + lr: + name: Cosine + learning_rate: 0.001 # + warmup_epoch: 0 + regularizer: + name: 'L2' + factor: 0 +``` + +上述配置文件中,首先需要将`pretrained_model`字段指定为`best_accuracy.pdparams`文件路径。 + +PaddleOCR提供的配置文件是在4卡训练(相当于总的batch size是`4*48=192`)、且没有加载预训练模型情况下的配置文件,因此您的场景中,学习率与总的batch size需要对应线性调整,例如 + +* 如果您的场景中是单卡训练,单卡batch_size=48,则总的batch_size=48,建议将学习率调整为`0.00025`左右。 +* 如果您的场景中是单卡训练,由于显存限制,只能设置单卡batch_size=32,则总的batch_size=32,建议将学习率调整为`0.00017`左右。 # 3. 模型评估与预测 diff --git a/doc/doc_ch/whl.md b/doc/doc_ch/whl.md index 83f062801a343289f11681995549dded97982397..ba955c832bdeb8a686d70c089f6f8287c194aaef 100644 --- a/doc/doc_ch/whl.md +++ b/doc/doc_ch/whl.md @@ -294,7 +294,7 @@ paddleocr --image_dir PaddleOCR/doc/imgs_words/ch/word_1.jpg --use_angle_cls tru ## 3 自定义模型 -当内置模型无法满足需求时,需要使用到自己训练的模型。 首先,参照[inference.md](./inference.md) 第一节转换将检测、分类和识别模型转换为inference模型,然后按照如下方式使用 +当内置模型无法满足需求时,需要使用到自己训练的模型。 首先,参照[模型导出](./detection.md#4-模型导出与预测)将检测、分类和识别模型转换为inference模型,然后按照如下方式使用 ### 3.1 代码使用 diff --git a/doc/doc_en/algorithm_det_east_en.md b/doc/doc_en/algorithm_det_east_en.md index 3848464abfd275fd319a24b0d3f6b3522c06c4a2..85440debfabc9fc8edf9701ba991d173b9da58cb 100644 --- a/doc/doc_en/algorithm_det_east_en.md +++ b/doc/doc_en/algorithm_det_east_en.md @@ -26,8 +26,9 @@ On the ICDAR2015 dataset, the text detection result is as follows: |Model|Backbone|Configuration|Precision|Recall|Hmean|Download| | --- | --- | --- | --- | --- | --- | --- | -|EAST|ResNet50_vd|88.71%| 81.36%| 84.88%| [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_east_v2.0_train.tar)| -|EAST| MobileNetV3| 78.20%| 79.10%| 78.65%| [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_east_v2.0_train.tar)| +|EAST|ResNet50_vd| [det_r50_vd_east.yml](../../configs/det/det_r50_vd_east.yml)|88.71%| 81.36%| 84.88%| [model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_east_v2.0_train.tar)| +|EAST|MobileNetV3|[det_mv3_east.yml](../../configs/det/det_mv3_east.yml) | 78.20%| 79.10%| 78.65%| [model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_east_v2.0_train.tar)| + diff --git a/doc/doc_en/algorithm_det_sast_en.md b/doc/doc_en/algorithm_det_sast_en.md index e3437d22be9d75835aaa43e72363b498225db9e1..dde8eb32dc1d75270fa18155548a9fa6242c4215 100644 --- a/doc/doc_en/algorithm_det_sast_en.md +++ b/doc/doc_en/algorithm_det_sast_en.md @@ -74,10 +74,10 @@ First, convert the model saved in the SAST text detection training process into python3 tools/export_model.py -c configs/det/det_r50_vd_sast_totaltext.yml -o Global.pretrained_model=./det_r50_vd_sast_totaltext_v2.0_train/best_accuracy Global.save_inference_dir=./inference/det_sast_tt ``` -For SAST curved text detection model inference, you need to set the parameter `--det_algorithm="SAST"` and `--det_sast_polygon=True`, run the following command: +For SAST curved 
text detection model inference, you need to set the parameter `--det_algorithm="SAST"` and `--det_box_type=poly`, run the following command: ``` -python3 tools/infer/predict_det.py --det_algorithm="SAST" --image_dir="./doc/imgs_en/img623.jpg" --det_model_dir="./inference/det_sast_tt/" --det_sast_polygon=True +python3 tools/infer/predict_det.py --det_algorithm="SAST" --image_dir="./doc/imgs_en/img623.jpg" --det_model_dir="./inference/det_sast_tt/" --det_box_type='poly' ``` The visualized text detection results are saved to the `./inference_results` folder by default, and the name of the result file is prefixed with 'det_res'. Examples of results are as follows: diff --git a/doc/doc_en/algorithm_overview_en.md b/doc/doc_en/algorithm_overview_en.md index 309d074ed4fc3cb39e53134d51a07fa07e1be621..2e25746dc071f25d4c17cabd6fb5fdcb85f4615d 100755 --- a/doc/doc_en/algorithm_overview_en.md +++ b/doc/doc_en/algorithm_overview_en.md @@ -3,6 +3,8 @@ - [1. Two-stage OCR Algorithms](#1) - [1.1 Text Detection Algorithms](#11) - [1.2 Text Recognition Algorithms](#12) + - [1.3 Text Super-Resolution Algorithms](#13) + - [1.4 Formula Recognition Algorithm](#14) - [2. End-to-end OCR Algorithms](#2) - [3. Table Recognition Algorithms](#3) - [4. Key Information Extraction Algorithms](#4) @@ -28,6 +30,7 @@ Supported text detection algorithms (Click the link to get the tutorial): - [x] [PSENet](./algorithm_det_psenet_en.md) - [x] [FCENet](./algorithm_det_fcenet_en.md) - [x] [DRRG](./algorithm_det_drrg_en.md) +- [x] [CT](./algorithm_det_ct_en.md) On the ICDAR2015 dataset, the text detection result is as follows: @@ -47,6 +50,7 @@ On Total-Text dataset, the text detection result is as follows: |Model|Backbone|Precision|Recall|Hmean|Download link| | --- | --- | --- | --- | --- | --- | |SAST|ResNet50_vd|89.63%|78.44%|83.66%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_totaltext_v2.0_train.tar)| +|CT|ResNet18_vd|88.68%|81.70%|85.05%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r18_ct_train.tar)| On CTW1500 dataset, the text detection result is as follows: @@ -104,6 +108,36 @@ Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation r |RobustScanner|ResNet31| 87.77% | rec_r31_robustscanner | [trained model](https://paddleocr.bj.bcebos.com/contribution/rec_r31_robustscanner.tar)| |RFL|ResNetRFL| 88.63% | rec_resnet_rfl_att | [trained model](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl_att_train.tar) | + + +### 1.3 Text Super-Resolution Algorithms + +Supported text super-resolution algorithms (Click the link to get the tutorial): +- [x] [Text Gestalt](./algorithm_sr_gestalt.md) +- [x] [Text Telescope](./algorithm_sr_telescope.md) + +On the TextZoom public dataset, the effect of the algorithm is as follows: + +|Model|Backbone|PSNR_Avg|SSIM_Avg|Config|Download link| +|---|---|---|---|---|---| +|Text Gestalt|tsrn|19.28|0.6560| [configs/sr/sr_tsrn_transformer_strock.yml](../../configs/sr/sr_tsrn_transformer_strock.yml)|[trained model](https://paddleocr.bj.bcebos.com/sr_tsrn_transformer_strock_train.tar)| +|Text Telescope|tbsrn|21.56|0.7411| [configs/sr/sr_telescope.yml](../../configs/sr/sr_telescope.yml)|[trained model](https://paddleocr.bj.bcebos.com/contribution/sr_telescope_train.tar)| + + + +### 1.4 Formula Recognition Algorithm + +Supported formula recognition algorithms (Click the link to get the tutorial): + +- [x] [CAN](./algorithm_rec_can_en.md) + +On the CROHME handwritten formula dataset, the effect of the algorithm is 
as follows:
+
+|Model    |Backbone|Config|ExpRate|Download link|
+| ----- | ----- | ----- | ----- | ----- |
+|CAN|DenseNet|[rec_d28_can.yml](../../configs/rec/rec_d28_can.yml)|51.72%|[trained model](https://paddleocr.bj.bcebos.com/contribution/rec_d28_can_train.tar)|
+
+
 ## 2. End-to-end OCR Algorithms
@@ -122,7 +156,7 @@ On the PubTabNet dataset, the algorithm result is as follows:
 |Model|Backbone|Config|Acc|Download link|
 |---|---|---|---|---|
-|TableMaster|TableResNetExtra|[configs/table/table_master.yml](../../configs/table/table_master.yml)|77.47%|[trained](https://paddleocr.bj.bcebos.com/ppstructure/models/tablemaster/table_structure_tablemaster_train.tar) / [inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/tablemaster/table_structure_tablemaster_infer.tar)|
+|TableMaster|TableResNetExtra|[configs/table/table_master.yml](../../configs/table/table_master.yml)|77.47%|[trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/tablemaster/table_structure_tablemaster_train.tar) / [inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/tablemaster/table_structure_tablemaster_infer.tar)|
diff --git a/doc/doc_en/algorithm_rec_svtr_en.md b/doc/doc_en/algorithm_rec_svtr_en.md
index 37cd35f35a2025cbb55ff85fe27b50e5d6e556aa..d22fe73e6fed1e30b4dc8317fb5fce69c4998a56 100644
--- a/doc/doc_en/algorithm_rec_svtr_en.md
+++ b/doc/doc_en/algorithm_rec_svtr_en.md
@@ -130,7 +130,23 @@ Not supported

 ## 5. FAQ

-1. Since most of the operators used by `SVTR` are matrix multiplication, in the GPU environment, the speed has an advantage, but in the environment where mkldnn is enabled on the CPU, `SVTR` has no advantage over the optimized convolutional network.
+- 1. Speed on CPU and GPU
+  - Since most of the operators used by `SVTR` are matrix multiplication, in the GPU environment, the speed has an advantage, but in the environment where mkldnn is enabled on the CPU, `SVTR` has no advantage over the optimized convolutional network.
+- 2. SVTR model conversion to ONNX fails
+  - Ensure that the `paddle2onnx` and `onnxruntime` versions are up to date; refer to the [SVTR model to onnx step-by-step example](https://github.com/PaddlePaddle/PaddleOCR/issues/7821#issuecomment-1271214273) for the ONNX conversion command.
+- 3. SVTR model converts to ONNX successfully but the inference result is incorrect
+  - The possible reason is that the model parameter `out_char_num` is not set correctly; it should be set to W//4, W//8 or W//12. Please refer to [Section 3.3.3 of SVTR, a high-precision Chinese scene text recognition model](https://aistudio.baidu.com/aistudio/projectdetail/5073182?contributionType=1).
+- 4. Optimization of long text recognition
+  - Refer to [Section 3.3 of SVTR, a high-precision Chinese scene text recognition model](https://aistudio.baidu.com/aistudio/projectdetail/5073182?contributionType=1).
+- 5. Notes on the reproduction of the paper results
+  - The dataset used is the one provided by [ABINet](https://github.com/FangShancheng/ABINet).
+  - By default, 4 GPU cards are used for training, the default Batchsize of a single card is 512, and the total Batchsize is 2048, corresponding to a learning rate of 0.0005. When modifying the Batchsize or changing the number of GPU cards, the learning rate should be modified in equal proportion.
+- 6. Exploration directions for further optimization
+  - Learning rate adjustment: adjusting to twice the default to keep the Batchsize unchanged; or reducing the Batchsize to 1/2 the default to keep the learning rate unchanged.
+ - Data augmentation strategies: optionally `RecConAug` and `RecAug`. + - If STN is not used, `Local` of `mixer` can be replaced by `Conv` and `local_mixer` can all be modified to `[5, 5]`. + - Grid search for optimal `embed_dim`, `depth`, `num_heads` configurations. + - Use the `Post-Normalization strategy`, which is to modify the model configuration `prenorm` to `True`. ## Citation diff --git a/doc/doc_en/algorithm_sr_telescope_en.md b/doc/doc_en/algorithm_sr_telescope_en.md index 9acb524312fc037bfc48b3c16e6f66024eb132b7..334b58b6e8238fe3b825f625527f6c84df94a510 100644 --- a/doc/doc_en/algorithm_sr_telescope_en.md +++ b/doc/doc_en/algorithm_sr_telescope_en.md @@ -27,7 +27,7 @@ Paper: Referring to the [FudanOCR](https://github.com/FudanVI/FudanOCR/tree/main/scene-text-telescope) data download instructions, the effect of the super-score algorithm on the TextZoom test set is as follows: |Model|Backbone|config|Acc|Download link| -|---|---|---|---|---|---| +|---|---|---|---|---| |Text Gestalt|tsrn|21.56|0.7411| [configs/sr/sr_telescope.yml](../../configs/sr/sr_telescope.yml)|[train model](https://paddleocr.bj.bcebos.com/contribution/sr_telescope_train.tar)| The [TextZoom dataset](https://paddleocr.bj.bcebos.com/dataset/TextZoom.tar) comes from two superfraction data sets, RealSR and SR-RAW, both of which contain LR-HR pairs. TextZoom has 17367 pairs of training data and 4373 pairs of test data. diff --git a/doc/doc_en/detection_en.md b/doc/doc_en/detection_en.md index c215e1a46636a84d372245097b460c095e9cb7fd..ab2e868c5401b8613425d09299b9473c79e3b819 100644 --- a/doc/doc_en/detection_en.md +++ b/doc/doc_en/detection_en.md @@ -13,6 +13,7 @@ This section uses the icdar2015 dataset as an example to introduce the training, * [2.5 Distributed Training](#25-distributed-training) * [2.6 Training with knowledge distillation](#26) * [2.7 Training on other platform(Windows/macOS/Linux DCU)](#27) + * [2.8 Fine-tuning](#28) - [3. Evaluation and Test](#3-evaluation-and-test) - [3.1 Evaluation](#31-evaluation) - [3.2 Test](#32-test) @@ -178,6 +179,10 @@ GPU mode is not supported, you need to set `use_gpu` to False in the configurati - Linux DCU Running on a DCU device requires setting the environment variable `export HIP_VISIBLE_DEVICES=0,1,2,3`, and the rest of the training and evaluation prediction commands are exactly the same as the Linux GPU. +### 2.8 Fine-tuning + +In actual use, it is recommended to load the official pre-trained model and fine-tune it in your own data set. For the fine-tuning method of the detection model, please refer to: [Model Fine-tuning Tutorial](./finetune_en.md). + ## 3. Evaluation and Test ### 3.1 Evaluation diff --git a/doc/doc_en/finetune_en.md b/doc/doc_en/finetune_en.md new file mode 100644 index 0000000000000000000000000000000000000000..e76eb1e26a257e41f16675988948f0ca178f8890 --- /dev/null +++ b/doc/doc_en/finetune_en.md @@ -0,0 +1,229 @@ +# Fine-tune + +## 1. background and meaning + +The PP-OCR series models provided by PaddleOCR have excellent performance in general scenarios and can solve detection and recognition problems in most cases. In vertical scenarios, if you want to obtain better model, you can further improve the accuracy of the PP-OCR series detection and recognition models through fine-tune. + +This article mainly introduces some precautions when fine-tuning the text detection and recognition model. Finally, you can obtain a text detection and recognition model with higher accuracy through model fine-tuning in your own scenarios. 
+
+The core points of this article are as follows:
+
+1. The pre-trained models provided by PP-OCR have good generalization ability.
+2. Adding a small amount of real data (detection: >=500, recognition: >=5000) will greatly improve the detection and recognition effect in vertical scenes.
+3. When fine-tuning the model, adding real general-scene data can further improve the model accuracy and generalization performance.
+4. In the text detection task, increasing the prediction shape of the image can further improve the detection effect on smaller text areas.
+5. When fine-tuning the model, it is necessary to properly adjust the hyperparameters (learning rate and batch size are the most important) to obtain a better fine-tuning effect.
+
+For more details, please refer to Chapter 2 and Chapter 3.
+
+## 2. Text detection model fine-tuning
+
+### 2.1 Dataset
+
+* Dataset: It is recommended to prepare at least 500 images for text detection model fine-tuning.
+
+* Dataset annotation: single-line text annotation format. It is recommended that the labeled detection box be consistent with the actual semantic content. For example, in the train ticket scene, the surname and first name may be far apart, but they belong to the same detection field semantically, so the entire name needs to be marked as one detection box.
+
+### 2.2 Model
+
+It is recommended to choose the PP-OCRv3 model (configuration file: [ch_PP-OCRv3_det_student.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml), pre-trained model: [ch_PP-OCRv3_det_distill_train.tar](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar)), as its accuracy and generalization performance are the best among the pre-trained models currently available.
+
+For more PP-OCR series models, please refer to the [PP-OCR Series Model Library](./models_list_en.md).
+
+Note: When using the above pre-trained model, you need to use the `student.pdparams` file in the folder as the pre-trained model, that is, only use the student model.
+
+
+### 2.3 Training hyperparameter
+
+When fine-tuning the model, the most important hyperparameters are the pre-trained model path `pretrained_model`, the learning rate `learning_rate` and the batch size `batch_size`; some of the hyperparameters are as follows:
+
+```yaml
+Global:
+  pretrained_model: ./ch_PP-OCRv3_det_distill_train/student.pdparams # pre-training model path
+Optimizer:
+  lr:
+    name: Cosine
+    learning_rate: 0.001 # learning_rate
+    warmup_epoch: 2
+  regularizer:
+    name: 'L2'
+    factor: 0
+
+Train:
+  loader:
+    shuffle: True
+    drop_last: False
+    batch_size_per_card: 8 # single gpu batch size
+    num_workers: 4
+```
+
+In the above configuration file, you need to specify the `pretrained_model` field as the `student.pdparams` file path.
+
+The configuration file provided by PaddleOCR is for 8-GPU training (equivalent to a total batch size of `8*8=64`) with no pre-trained model loaded. Therefore, in your scenario, the learning rate and the total batch size need to be adjusted linearly. For example (a short calculation sketch follows this list):
+
+* If your scenario is single-GPU training with a single-GPU batch_size=8, then the total batch_size=8, and it is recommended to adjust the learning rate to about `1e-4`.
+* If your scenario is single-GPU training and, due to memory limitations, you can only set batch_size=4 for a single GPU, then the total batch_size=4, and it is recommended to adjust the learning rate to about `5e-5`.
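To make the linear scaling rule above concrete, here is a minimal Python sketch. It is illustrative only and not part of the PaddleOCR codebase or of this change; the helper name `scaled_learning_rate` and its defaults are hypothetical, taken from the reference configuration of this section (total batch size 64, learning rate 0.001).

```python
# Illustrative only: linear scaling of the learning rate with the total batch size,
# relative to the reference configuration above (8 GPUs x batch_size_per_card 8 = 64, lr = 0.001).
def scaled_learning_rate(total_batch_size, ref_batch_size=64, ref_learning_rate=0.001):
    """Return the learning rate scaled linearly with the total batch size."""
    return ref_learning_rate * total_batch_size / ref_batch_size


if __name__ == "__main__":
    print(scaled_learning_rate(8))  # 0.000125 -> roughly the recommended 1e-4
    print(scaled_learning_rate(4))  # 6.25e-05 -> roughly the recommended 5e-5
```

The same rule applies to the recognition configurations later in this document; only the reference batch size and learning rate change.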
+ +### 2.4 Prediction hyperparameter + +When exporting and inferring the trained model, you can further adjust the predicted image scale to improve the detection effect of small-area text. The following are some hyperparameters during DBNet inference, which can be adjusted appropriately to improve the effect. + +| hyperparameter | type | default | meaning | +| :--: | :--: | :--: | :--: | +| det_db_thresh | float | 0.3 | In the probability map output by DB, pixels with a score greater than the threshold will be considered as text pixels | +| det_db_box_thresh | float | 0.6 | When the average score of all pixels within the frame of the detection result is greater than the threshold, the result will be considered as a text area | +| det_db_unclip_ratio | float | 1.5 | The expansion coefficient of `Vatti clipping`, using this method to expand the text area | +| max_batch_size | int | 10 | batch size | +| use_dilation | bool | False | Whether to expand the segmentation results to obtain better detection results | +| det_db_score_mode | str | "fast" | DB's detection result score calculation method supports `fast` and `slow`. `fast` calculates the average score based on all pixels in the polygon’s circumscribed rectangle border, and `slow` calculates the average score based on all pixels in the original polygon. The calculation speed is relatively slower, but more accurate. | + + +For more information on inference methods, please refer to[Paddle Inference doc](././inference_ppocr_en.md)。 + + +## 3. Text recognition model fine-tuning + + +### 3.1 Dataset + +* Dataset:If the dictionary is not changed, it is recommended to prepare at least 5,000 text recognition datasets for model fine-tuning; if the dictionary is changed (not recommended), more quantities are required. + +* Data distribution: It is recommended that the distribution be as consistent as possible with the actual measurement scenario. If the actual scene contains a lot of short text, it is recommended to include more short text in the training data. If the actual scene has high requirements for the recognition effect of spaces, it is recommended to include more text content with spaces in the training data. + +* Data synthesis: In the case of some character recognition errors, it is recommended to obtain a batch of specific character dataset, add it to the original dataset and use a small learning rate for fine-tuning. The ratio of original dataset to new dataset can be 10:1 to 5:1 to avoid overfitting of the model caused by too much data in a single scene. At the same time, try to balance the word frequency of the corpus to ensure that the frequency of common words will not be too low. + + Specific characters can be generated using the TextRenderer tool, for synthesis examples, please refer to [data synthesis](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6/applications/%E5%85%89%E5%8A%9F%E7%8E%87%E8%AE%A1%E6%95%B0%E7%A0%81%E7%AE%A1%E5%AD%97%E7%AC%A6%E8%AF%86%E5%88%AB/%E5%85%89%E5%8A%9F%E7%8E%87%E8%AE%A1%E6%95%B0%E7%A0%81%E7%AE%A1%E5%AD%97%E7%AC%A6%E8%AF%86%E5%88%AB.md#31-%E6%95%B0%E6%8D%AE%E5%87%86%E5%A4%87) + . The synthetic data corpus should come from real usage scenarios as much as possible, and keep the richness of fonts and backgrounds on the basis of being close to the real scene, which will help improve the model effect. 
+ +* Common Chinese and English data: During training, common real data can be added to the training set (for example, in the fine-tuning scenario without changing the dictionary, it is recommended to add real data such as LSVT, RCTW, MTWI) to further improve the generalization performance of the model. + +### 3.2 Model + +It is recommended to choose the PP-OCRv3 model (configuration file: [ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml),pre-trained model: [ch_PP-OCRv3_rec_train.tar](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar),its accuracy and generalization performance is the best pre-training model currently available. + +For more PP-OCR series models, please refer to [PP-OCR Series Model Library](./models_list_en.md)。 + +The PP-OCRv3 model uses the GTC strategy. The SAR branch has a large number of parameters. When the training data is a simple scene, the model is easy to overfit, resulting in poor fine-tuning effect. It is recommended to remove the GTC strategy. The configuration file of the model structure is modified as follows: + +```yaml +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Neck: + name: SequenceEncoder + encoder_type: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: False + Head: + name: CTCHead + fc_decay: 0.00001 +Loss: + name: CTCLoss + +Train: + dataset: + ...... + transforms: + # remove RecConAug + # - RecConAug: + # prob: 0.5 + # ext_data_num: 2 + # image_shape: [48, 320, 3] + # max_text_length: *max_text_length + - RecAug: + # modify Encode + - CTCLabelEncode: + - KeepKeys: + keep_keys: + - image + - label + - length +... + +Eval: + dataset: + ... + transforms: + ... + - CTCLabelEncode: + - KeepKeys: + keep_keys: + - image + - label + - length +... + + +``` + +### 3.3 Training hyperparameter + +Similar to text detection task fine-tuning, when fine-tuning the recognition model, the most important hyperparameters are the pre-trained model path `pretrained_model`, `learning_rate` and `batch_size`, some default configuration files are shown below. + +```yaml +Global: + pretrained_model: # pre-training model path +Optimizer: + lr: + name: Piecewise + decay_epochs : [700, 800] + values : [0.001, 0.0001] # learning_rate + warmup_epoch: 5 + regularizer: + name: 'L2' + factor: 0 + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: + - ./train_data/train_list.txt + ratio_list: [1.0] # Sampling ratio, the default value is [1.0] + loader: + shuffle: True + drop_last: False + batch_size_per_card: 128 # single gpu batch size + num_workers: 8 + +``` + + +In the above configuration file, you first need to specify the `pretrained_model` field as the `ch_PP-OCRv3_rec_train/best_accuracy.pdparams` file path decompressed in Chapter 3.2. + +The configuration file provided by PaddleOCR is for 8-gpu training (equivalent to a total batch size of `8*128=1024`) and no pre-trained model is loaded. Therefore, in your scenario, the learning rate is the same as the total The batch size needs to be adjusted linearly, for example: + +* If your scenario is single-gpu training, single gpu batch_size=128, then the total batch_size=128, in the case of loading the pre-trained model, it is recommended to adjust the learning rate to about `[1e-4, 2e-5]` (For the piecewise learning rate strategy, two values need to be set, the same below). 
+* If your scenario is for single-gpu training, due to memory limitations, you can only set batch_size=64 for a single gpu, and the total batch_size=64. When loading the pre-trained model, it is recommended to adjust the learning rate to `[5e-5 , 1e-5]`about. + + +If there is general real scene data added, it is recommended that in each epoch, the amount of vertical scene data and real scene data should be kept at about 1:1. + +For example: your own vertical scene recognition data volume is 1W, the data label file is `vertical.txt`, the collected general scene recognition data volume is 10W, and the data label file is `general.txt`. + +Then, the `label_file_list` and `ratio_list` parameters can be set as shown below. In each epoch, `vertical.txt` will be fully sampled (sampling ratio is 1.0), including 1W pieces of data; `general.txt` will be sampled according to a sampling ratio of 0.1, including `10W*0.1=1W` pieces of data, the final ratio of the two is `1:1`. + +```yaml +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: + - vertical.txt + - general.txt + ratio_list: [1.0, 0.1] +``` + +### 3.4 training optimization + +The training process does not happen overnight. After completing a stage of training evaluation, it is recommended to collect and analyze the badcase of the current model in the real scene, adjust the proportion of training data in a targeted manner, or further add synthetic data. Through multiple iterations of training, the model effect is continuously optimized. + +If you modify the custom dictionary during training, since the parameters of the last layer of FC cannot be loaded, it is normal for acc=0 at the beginning of the iteration. Don't worry, loading the pre-trained model can still speed up the model convergence. diff --git a/doc/doc_en/inference_args_en.md b/doc/doc_en/inference_args_en.md index b28cd8436da62dcd10f96f17751db9384ebcaa8d..ee2faedf403a812f688532edcacdb4560027080b 100644 --- a/doc/doc_en/inference_args_en.md +++ b/doc/doc_en/inference_args_en.md @@ -1,6 +1,6 @@ # PaddleOCR Model Inference Parameter Explanation -When using PaddleOCR for model inference, you can customize the modification parameters to modify the model, data, preprocessing, postprocessing, etc.(parameter file:[utility.py](../../tools/infer/utility.py)),The detailed parameter explanation is as follows: +When using PaddleOCR for model inference, you can customize the modification parameters to modify the model, data, preprocessing, postprocessing, etc. 
(parameter file: [utility.py](../../tools/infer/utility.py)),The detailed parameter explanation is as follows: * Global parameters @@ -70,7 +70,7 @@ The relevant parameters of the SAST algorithm are as follows | :--: | :--: | :--: | :--: | | det_sast_score_thresh | float | 0.5 | Score thresholds in SAST postprocess | | det_sast_nms_thresh | float | 0.5 | Thresholding of nms in SAST postprocess | -| det_sast_polygon | bool | False | Whether polygon detection, curved text scene (such as Total-Text) is set to True | +| det_box_type | str | 'quad' | Whether polygon detection, curved text scene (such as Total-Text) is set to 'poly' | The relevant parameters of the PSE algorithm are as follows @@ -79,7 +79,7 @@ The relevant parameters of the PSE algorithm are as follows | det_pse_thresh | float | 0.0 | Threshold for binarizing the output image | | det_pse_box_thresh | float | 0.85 | Threshold for filtering boxes, below this threshold is discarded | | det_pse_min_area | float | 16 | The minimum area of the box, below this threshold is discarded | -| det_pse_box_type | str | "box" | The type of the returned box, box: four point coordinates, poly: all point coordinates of the curved text | +| det_box_type | str | "quad" | The type of the returned box, quad: four point coordinates, poly: all point coordinates of the curved text | | det_pse_scale | int | 1 | The ratio of the input image relative to the post-processed image, such as an image of `640*640`, the network output is `160*160`, and when the scale is 2, the shape of the post-processed image is `320*320`. Increasing this value can speed up the post-processing speed, but it will bring about a decrease in accuracy | * Text recognition model related parameters @@ -88,7 +88,7 @@ The relevant parameters of the PSE algorithm are as follows | :--: | :--: | :--: | :--: | | rec_algorithm | str | "CRNN" | Text recognition algorithm name, currently supports `CRNN`, `SRN`, `RARE`, `NETR`, `SAR`, `ViTSTR`, `ABINet`, `VisionLAN`, `SPIN`, `RobustScanner`, `SVTR`, `SVTR_LCNet` | | rec_model_dir | str | None, it is required if using the recognition model | recognition inference model paths | -| rec_image_shape | list | [3, 48, 320] | Image size at the time of recognition | +| rec_image_shape | str | "3,48,320" ] | Image size at the time of recognition | | rec_batch_num | int | 6 | batch size | | max_text_length | int | 25 | The maximum length of the recognition result, valid in `SRN` | | rec_char_dict_path | str | "./ppocr/utils/ppocr_keys_v1.txt" | character dictionary file | @@ -115,7 +115,16 @@ The relevant parameters of the PSE algorithm are as follows | :--: | :--: | :--: | :--: | | use_angle_cls | bool | False | whether to use an angle classifier | | cls_model_dir | str | None, if you need to use, you must specify the path explicitly | angle classifier inference model path | -| cls_image_shape | list | [3, 48, 192] | prediction shape | +| cls_image_shape | str | "3,48,192" | prediction shape | | label_list | list | ['0', '180'] | The angle value corresponding to the class id | | cls_batch_num | int | 6 | batch size | | cls_thresh | float | 0.9 | Prediction threshold, when the model prediction result is 180 degrees, and the score is greater than the threshold, the final prediction result is considered to be 180 degrees and needs to be flipped | + + +* OCR image preprocessing parameters + +| parameters | type | default | implication | +| :--: | :--: | :--: | :--: | +| invert | bool | False | whether to invert image before processing | +| binarize | bool | 
False | whether to threshold binarize image before processing | +| alphacolor | tuple | "255,255,255" | Replacement color for the alpha channel, if the latter is present; R,G,B integers | diff --git a/doc/doc_en/inference_en.md b/doc/doc_en/inference_en.md index d1233780d89c175729e835d069db1bcc0bb9273f..95ac2c96bfdc84da53124c359e15c9b8f01a8ff2 100755 --- a/doc/doc_en/inference_en.md +++ b/doc/doc_en/inference_en.md @@ -10,30 +10,28 @@ For more details, please refer to the document [Classification Framework](https: Next, we first introduce how to convert a trained model into an inference model, and then we will introduce text detection, text recognition, angle class, and the concatenation of them based on inference model. -- [1. Convert Training Model to Inference Model](#CONVERT) - - [1.1 Convert Detection Model to Inference Model](#Convert_detection_model) - - [1.2 Convert Recognition Model to Inference Model](#Convert_recognition_model) - - [1.3 Convert Angle Classification Model to Inference Model](#Convert_angle_class_model) - - -- [2. Text Detection Model Inference](#DETECTION_MODEL_INFERENCE) - - [2.1 Lightweight Chinese Detection Model Inference](#LIGHTWEIGHT_DETECTION) - - [2.2 DB Text Detection Model Inference](#DB_DETECTION) - - [2.3 East Text Detection Model Inference](#EAST_DETECTION) - - [2.4 Sast Text Detection Model Inference](#SAST_DETECTION) - -- [3. Text Recognition Model Inference](#RECOGNITION_MODEL_INFERENCE) - - [3.1 Lightweight Chinese Text Recognition Model Reference](#LIGHTWEIGHT_RECOGNITION) - - [3.2 CTC-Based Text Recognition Model Inference](#CTC-BASED_RECOGNITION) - - [3.3 SRN-Based Text Recognition Model Inference](#SRN-BASED_RECOGNITION) - - [3.4 Text Recognition Model Inference Using Custom Characters Dictionary](#USING_CUSTOM_CHARACTERS) - - [3.5 Multilingual Model Inference](#MULTILINGUAL_MODEL_INFERENCE) - -- [4. Angle Classification Model Inference](#ANGLE_CLASS_MODEL_INFERENCE) - -- [5. Text Detection Angle Classification And Recognition Inference Concatenation](#CONCATENATION) - - [5.1 Lightweight Chinese Model](#LIGHTWEIGHT_CHINESE_MODEL) - - [5.2 Other Models](#OTHER_MODELS) +- [Inference Based on Python Prediction Engine](#inference-based-on-python-prediction-engine) + - [1. Convert Training Model to Inference Model](#1-convert-training-model-to-inference-model) + - [1.1 Convert Detection Model to Inference Model](#11-convert-detection-model-to-inference-model) + - [1.2 Convert Recognition Model to Inference Model](#12-convert-recognition-model-to-inference-model) + - [1.3 Convert Angle Classification Model to Inference Model](#13-convert-angle-classification-model-to-inference-model) + - [2. Text Detection Model Inference](#2-text-detection-model-inference) + - [2.1 Lightweight Chinese Detection Model Inference](#21-lightweight-chinese-detection-model-inference) + - [2.2 DB Text Detection Model Inference](#22-db-text-detection-model-inference) + - [2.3 EAST TEXT DETECTION MODEL INFERENCE](#23-east-text-detection-model-inference) + - [2.4 Sast Text Detection Model Inference](#24-sast-text-detection-model-inference) + - [(1). Quadrangle text detection model (ICDAR2015)](#1-quadrangle-text-detection-model-icdar2015) + - [(2). Curved text detection model (Total-Text)](#2-curved-text-detection-model-total-text) + - [3. 
Text Recognition Model Inference](#3-text-recognition-model-inference) + - [3.1 Lightweight Chinese Text Recognition Model Reference](#31-lightweight-chinese-text-recognition-model-reference) + - [3.2 CTC-Based Text Recognition Model Inference](#32-ctc-based-text-recognition-model-inference) + - [3.3 SRN-Based Text Recognition Model Inference](#33-srn-based-text-recognition-model-inference) + - [3.4 Text Recognition Model Inference Using Custom Characters Dictionary](#34-text-recognition-model-inference-using-custom-characters-dictionary) + - [3.5 Multilingual Model Inference](#35-multilingual-model-inference) + - [4. Angle Classification Model Inference](#4-angle-classification-model-inference) + - [5. Text Detection Angle Classification and Recognition Inference Concatenation](#5-text-detection-angle-classification-and-recognition-inference-concatenation) + - [5.1 Lightweight Chinese Model](#51-lightweight-chinese-model) + - [5.2 Other Models](#52-other-models) ## 1. Convert Training Model to Inference Model @@ -371,7 +369,7 @@ After executing the command, the prediction results (classification angle and sc ### 5.1 Lightweight Chinese Model -When performing prediction, you need to specify the path of a single image or a folder of images through the parameter `image_dir`, the parameter `det_model_dir` specifies the path to detect the inference model, the parameter `cls_model_dir` specifies the path to angle classification inference model and the parameter `rec_model_dir` specifies the path to identify the inference model. The parameter `use_angle_cls` is used to control whether to enable the angle classification model. The parameter `use_mp` specifies whether to use multi-process to infer `total_process_num` specifies process number when using multi-process. The parameter . The visualized recognition results are saved to the `./inference_results` folder by default. +When performing prediction, you need to specify the path of a single image or a folder of images through the parameter `image_dir`, the parameter `det_model_dir` specifies the path to detect the inference model, the parameter `cls_model_dir` specifies the path to angle classification inference model and the parameter `rec_model_dir` specifies the path to identify the inference model. The parameter `use_angle_cls` is used to control whether to enable the angle classification model. The parameter `use_mp` specifies whether to use multi-process to infer `total_process_num` specifies process number when using multi-process. The parameter(Paddle Inference is not thread-safe, it is recommended to use multi-process) . The visualized recognition results are saved to the `./inference_results` folder by default. ```shell # use direction classifier diff --git a/doc/doc_en/kie_en.md b/doc/doc_en/kie_en.md index 0c335a5ceb8991b80bc0cab6facdf402878abb50..cd1fffb27ac1c2a399a916e1ba5f5c3f87032515 100644 --- a/doc/doc_en/kie_en.md +++ b/doc/doc_en/kie_en.md @@ -457,14 +457,31 @@ inference/ser_vi_layoutxlm/ └── inference.pdmodel # The program file of recognition ``` -Export of RE model is also in adaptation. +The RE model can be converted to the inference model using the following command. + +```bash +# -c Set the training algorithm yml configuration file. +# -o Set optional parameters. +# Architecture.Backbone.checkpoints Set the training model address. +# Global.save_inference_dir Set the address where the converted model will be saved. 
+python3 tools/export_model.py -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./output/re_vi_layoutxlm_xfund_zh/best_accuracy Global.save_inference_dir=./inference/re_vi_layoutxlm +``` + +After the conversion is successful, there are three files in the model save directory: + +``` +inference/re_vi_layoutxlm/ + ├── inference.pdiparams # The parameter file of recognition inference model + ├── inference.pdiparams.info # The parameter information of recognition inference model, which can be ignored + └── inference.pdmodel # The program file of recognition +``` ## 4.2 Model inference The VI layoutxlm model performs reasoning based on the ser task, and can execute the following commands: -Using the following command to infer the VI-LayoutXLM model. +Using the following command to infer the VI-LayoutXLM SER model. ```bash cd ppstructure @@ -483,6 +500,26 @@ The visualized result will be saved in `./output`, which is shown as follows. +Using the following command to infer the VI-LayoutXLM RE model. + +```bash +cd ppstructure +python3 kie/predict_kie_token_ser_re.py \ + --kie_algorithm=LayoutXLM \ + --re_model_dir=../inference/re_vi_layoutxlm \ + --ser_model_dir=../inference/ser_vi_layoutxlm \ + --use_visual_backbone=False \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../train_data/XFUND/class_list_xfun.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" +``` + +The visualized result will be saved in `./output`, which is shown as follows. + +
+ +
# 5. FAQ diff --git a/doc/doc_en/ocr_book_en.md b/doc/doc_en/ocr_book_en.md index b0455fe61afe8ae456f224e57d346b1fed553eb4..63162be566d515dd7f4f181f80a140cdd0376f91 100644 --- a/doc/doc_en/ocr_book_en.md +++ b/doc/doc_en/ocr_book_en.md @@ -1,6 +1,6 @@ # E-book: *Dive Into OCR* -"Dive Into OCR" is a textbook that combines OCR theory and practice, written by the PaddleOCR team, Chen Zhineng, a Pre-tenure Professor at Fudan University, Huang Wenhui, a senior expert in the field of vision at China Mobile Research Institute, and other industry-university-research colleagues, as well as OCR developers. The main features are as follows: +"Dive Into OCR" is a textbook that combines OCR theory and practice, written by the PaddleOCR community. The main features are as follows: - OCR full-stack technology covering text detection, recognition and document analysis - Closely integrate theory and practice, cross the code implementation gap, and supporting instructional videos @@ -8,6 +8,10 @@ ## Structure +
+ +
+ - The first part is the preliminary knowledge of the book, including the knowledge index and resource links needed in the process of positioning and using the book content of the book - The second part is chapters 4-8 of the book, which introduce the concepts, applications, and industry practices related to the detection and identification capabilities of the OCR engine. In the "Introduction to OCR Technology", the application scenarios and challenges of OCR, the basic concepts of technology, and the pain points in industrial applications are comprehensively explained. Then, in the two chapters of "Text Detection" and "Text Recognition", the two basic tasks of OCR are introduced. In each chapter, an algorithm is accompanied by a detailed explanation of the code and practical exercises. Chapters 6 and 7 are a detailed introduction to the PP-OCR series model, PP-OCR is a set of OCR systems for industrial applications, on the basis of the basic detection and identification model, after a series of optimization strategies to achieve the general field of industrial SOTA model, while opening up a variety of predictive deployment solutions, enabling enterprises to quickly land OCR applications. @@ -16,6 +20,11 @@ ## Address -- [E-book: *Dive Into OCR* (link generating)]() -- [Jupyter notebook](../../notebook/notebook_en/) -- [videos (Chinese only)](https://aistudio.baidu.com/aistudio/education/group/info/25207) +- [E-book: *Dive Into OCR* (PDF)](https://paddleocr.bj.bcebos.com/ebook/Dive_into_OCR.pdf) +- [Notebook (.ipynb)](https://github.com/PaddleOCR-Community/Dive-into-OCR) +- [Videos (Chinese only)](https://aistudio.baidu.com/aistudio/education/group/info/25207) + + +trackgit-views + + diff --git a/doc/doc_en/quickstart_en.md b/doc/doc_en/quickstart_en.md index ea38845f503192705a4d87f3faacdaf25bb27ba9..430f1a7a29ebeeb5401115a439c3a274e456e1d9 100644 --- a/doc/doc_en/quickstart_en.md +++ b/doc/doc_en/quickstart_en.md @@ -28,13 +28,13 @@ - If you have CUDA 9 or CUDA 10 installed on your machine, please run the following command to install ```bash - python3 -m pip install paddlepaddle-gpu -i https://mirror.baidu.com/pypi/simple + python -m pip install paddlepaddle-gpu -i https://pypi.tuna.tsinghua.edu.cn/simple ``` - If you have no available GPU on your machine, please run the following command to install the CPU version ```bash - python3 -m pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple + python -m pip install paddlepaddle -i https://pypi.tuna.tsinghua.edu.cn/simple ``` For more software version requirements, please refer to the instructions in [Installation Document](https://www.paddlepaddle.org.cn/install/quick) for operation. @@ -120,9 +120,10 @@ If you do not use the provided test image, you can replace the following `--imag ``` **Version** -paddleocr uses the PP-OCRv3 model by default(`--ocr_version PP-OCRv3`). If you want to use other versions, you can set the parameter `--ocr_version`, the specific version description is as follows: +paddleocr uses the PP-OCRv4 model by default(`--ocr_version PP-OCRv4`). 
If you want to use other versions, you can set the parameter `--ocr_version`, the specific version description is as follows: | version name | description | | --- | --- | +| PP-OCRv4 | support Chinese and English detection and recognition, direction classifier, support multilingual recognition | | PP-OCRv3 | support Chinese and English detection and recognition, direction classifier, support multilingual recognition | | PP-OCRv2 | only supports Chinese and English detection and recognition, direction classifier, multilingual model is not updated | | PP-OCR | support Chinese and English detection and recognition, direction classifier, support multilingual recognition | @@ -223,7 +224,7 @@ from paddleocr import PaddleOCR, draw_ocr # Paddleocr supports Chinese, English, French, German, Korean and Japanese. # You can set the parameter `lang` as `ch`, `en`, `fr`, `german`, `korean`, `japan` # to switch the language model in order. -ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=2) # need to run only once to download and load model into memory +ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=2) # need to run only once to download and load model into memory img_path = './xxx.pdf' result = ocr.ocr(img_path, cls=True) for idx in range(len(result)): @@ -266,4 +267,4 @@ for idx in range(len(result)): In this section, you have mastered the use of PaddleOCR whl package. -PaddleOCR is a rich and practical OCR tool library that get through the whole process of data production, model training, compression, inference and deployment, please refer to the [tutorials](../../README.md#tutorials) to start the journey of PaddleOCR. +PaddleX provides a high-quality ecological model of the paddle. It is a one-stop full-process high-efficiency development platform for training, pressing and pushing. Its mission is to help AI technology to be implemented quickly. The vision is to make everyone an AI Developer! Currently PP-OCRv4 has been launched on PaddleX, you can enter [General OCR](https://aistudio.baidu.com/aistudio/modelsdetail?modelId=286) to experience the whole process of model training, compression and inference deployment. diff --git a/doc/doc_en/recognition_en.md b/doc/doc_en/recognition_en.md index 7d31b0ffe28c59ad3397d06fa178bcf8cbb822e9..78917aea90c2082a5fcff8be1342b21bfb8e88d8 100644 --- a/doc/doc_en/recognition_en.md +++ b/doc/doc_en/recognition_en.md @@ -15,6 +15,7 @@ * [2.6 Training with knowledge distillation](#kd) * [2.7 Multi-language Training](#Multi_language) * [2.8 Training on other platform(Windows/macOS/Linux DCU)](#28) + * [2.9 Fine-tuning](#29) - [3. Evaluation and Test](#3-evaluation-and-test) * [3.1 Evaluation](#31-evaluation) * [3.2 Test](#32-test) @@ -384,6 +385,11 @@ GPU mode is not supported, you need to set `use_gpu` to False in the configurati - Linux DCU Running on a DCU device requires setting the environment variable `export HIP_VISIBLE_DEVICES=0,1,2,3`, and the rest of the training and evaluation prediction commands are exactly the same as the Linux GPU. + +## 2.9 Fine-tuning + +In actual use, it is recommended to load the official pre-trained model and fine-tune it in your own data set. For the fine-tuning method of the recognition model, please refer to: [Model Fine-tuning Tutorial](./finetune_en.md). + ## 3. 
Evaluation and Test
diff --git a/doc/doc_en/table_recognition_en.md b/doc/doc_en/table_recognition_en.md
index cff2933df22249353b47f5a0a74098be7dd6a2ae..c0a1aa9d61ebb00e4f7e013ee5feb59b1835c78c 100644
--- a/doc/doc_en/table_recognition_en.md
+++ b/doc/doc_en/table_recognition_en.md
@@ -6,6 +6,7 @@ This article provides a full-process guide for the PaddleOCR table recognition m
   - [1.1. DataSet Format](#11-dataset-format)
   - [1.2. Data Download](#12-data-download)
   - [1.3. Dataset Generation](#13-dataset-generation)
+  - [1.4 Data annotation](#14-data-annotation)
 - [2. Training](#2-training)
   - [2.1. Start Training](#21-start-training)
   - [2.2. Resume Training](#22-resume-training)
@@ -14,6 +15,9 @@ This article provides a full-process guide for the PaddleOCR table recognition m
   - [2.5. Distributed Training](#25-distributed-training)
   - [2.6. Training on other platform(Windows/macOS/Linux DCU)](#26-training-on-other-platformwindowsmacoslinux-dcu)
   - [2.7. Fine-tuning](#27-fine-tuning)
+    - [2.7.1 Dataset](#271-dataset)
+    - [2.7.2 Model selection](#272-model-selection)
+    - [2.7.3 Training hyperparameter selection](#273-training-hyperparameter-selection)
 - [3. Evaluation and Test](#3-evaluation-and-test)
   - [3.1. Evaluation](#31-evaluation)
   - [3.2. Test table structure recognition effect](#32-test-table-structure-recognition-effect)
@@ -77,6 +81,10 @@ Some samples are as follows:
 |Simple Table|![](https://raw.githubusercontent.com/WenmuZhou/TableGeneration/main/imgs/simple.jpg)|
 |Simple Color Table|![](https://raw.githubusercontent.com/WenmuZhou/TableGeneration/main/imgs/color.jpg)|
+## 1.4 Data annotation
+
+For data annotation, please refer to [PPOCRLabel](../../PPOCRLabel/README.md).
+
 # 2. Training

 PaddleOCR provides training scripts, evaluation scripts, and prediction scripts. In this section, the [SLANet](../../configs/table/SLANet.yml) model will be used as an example:
@@ -226,8 +234,40 @@ Running on a DCU device requires setting the environment variable `export HIP_VI

 ## 2.7. Fine-tuning

-In the actual use process, it is recommended to load the officially provided pre-training model and fine-tune it in your own data set. For the fine-tuning method of the table recognition model, please refer to: [Model fine-tuning tutorial](./finetune.md).
+### 2.7.1 Dataset
+
+Data volume: It is recommended to prepare at least 2,000 table recognition images for model fine-tuning.
+
+### 2.7.2 Model selection
+
+It is recommended to choose the SLANet model (configuration file: [SLANet_ch.yml](../../configs/table/SLANet_ch.yml), pre-trained model: [ch_ppstructure_mobile_v2.0_SLANet_train.tar](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar)) for fine-tuning, as its accuracy and generalization performance are the best among the Chinese table pre-trained models currently available.
+
+For more table recognition models, please refer to the [PP-Structure Series Model Library](../../ppstructure/docs/models_list.md).
+
+### 2.7.3 Training hyperparameter selection
+
+When fine-tuning the model, the most important hyperparameters are the pre-trained model path `pretrained_model` and the learning rate `learning_rate`; part of the configuration file is shown below.
+ +```yaml +Global: + pretrained_model: ./ch_ppstructure_mobile_v2.0_SLANet_train/best_accuracy.pdparams # Pre-trained model path +Optimizer: + lr: + name: Cosine + learning_rate: 0.001 # + warmup_epoch: 0 + regularizer: + name: 'L2' + factor: 0 +``` + +In the above configuration file, you first need to specify the `pretrained_model` field as the `best_accuracy.pdparams` file path. + +The configuration file provided by PaddleOCR is for 4-card training (equivalent to a total batch size of `4*48=192`) and no pre-trained model is loaded. Therefore, in your scenario, the learning rate is the same as the total The batch size needs to be adjusted linearly, for example + +* If your scenario is single card training, single card batch_size=48, then the total batch_size=48, it is recommended to adjust the learning rate to about `0.00025`. +* If your scenario is for single-card training, due to memory limitations, you can only set batch_size=32 for a single card, then the total batch_size=32, it is recommended to adjust the learning rate to about `0.00017`. # 3. Evaluation and Test diff --git a/doc/doc_en/whl_en.md b/doc/doc_en/whl_en.md index 77e80faa688392db5b2959f4fd1705275cb37d6b..5283391e5ef8b35eb56f0355fd70049f40a4ae04 100644 --- a/doc/doc_en/whl_en.md +++ b/doc/doc_en/whl_en.md @@ -261,7 +261,7 @@ Output will be a list, each item contains classification result and confidence ## 3 Use custom model When the built-in model cannot meet the needs, you need to use your own trained model. -First, refer to the first section of [inference_en.md](./inference_en.md) to convert your det and rec model to inference model, and then use it as follows +First, refer to [export](./detection_en.md#4-inference) doc to convert your det and rec model to inference model, and then use it as follows ### 3.1 Use by code @@ -335,7 +335,7 @@ ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to downlo img_path = 'PaddleOCR/doc/imgs/11.jpg' img = cv2.imread(img_path) # img = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY), If your own training model supports grayscale images, you can uncomment this line -result = ocr.ocr(img_path, cls=True) +result = ocr.ocr(img, cls=True) for idx in range(len(result)): res = result[idx] for line in res: diff --git a/doc/joinus.PNG b/doc/joinus.PNG index 6489247e05d70896e2ca8a5929948437c6c82b5f..aef92ec8d1f61f6fa8766e8e7a9e56993a3e6f13 100644 Binary files a/doc/joinus.PNG and b/doc/joinus.PNG differ diff --git a/doc/joinus_paddlex.jpg b/doc/joinus_paddlex.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6b70dda865534210d35caa16497d49328d6dd25b Binary files /dev/null and b/doc/joinus_paddlex.jpg differ diff --git a/doc/ppocr_v4/DF.png b/doc/ppocr_v4/DF.png new file mode 100644 index 0000000000000000000000000000000000000000..f14953d4811adb4d77fe020eaaa325c89dffc4ce Binary files /dev/null and b/doc/ppocr_v4/DF.png differ diff --git a/doc/ppocr_v4/PFHead.png b/doc/ppocr_v4/PFHead.png new file mode 100644 index 0000000000000000000000000000000000000000..3728dc44e5c86c0ba80705ad83ada65da390928d Binary files /dev/null and b/doc/ppocr_v4/PFHead.png differ diff --git a/doc/ppocr_v4/multi_scale.png b/doc/ppocr_v4/multi_scale.png new file mode 100644 index 0000000000000000000000000000000000000000..673d306399db004cbb66474368ac5055e48dbe8f Binary files /dev/null and b/doc/ppocr_v4/multi_scale.png differ diff --git a/doc/ppocr_v4/ppocrv4_det_cml.png b/doc/ppocr_v4/ppocrv4_det_cml.png new file mode 100644 index 
0000000000000000000000000000000000000000..9132c0a67c4215cfe19af27628ee37cfbab44720 Binary files /dev/null and b/doc/ppocr_v4/ppocrv4_det_cml.png differ diff --git a/doc/ppocr_v4/ppocrv4_framework.png b/doc/ppocr_v4/ppocrv4_framework.png new file mode 100644 index 0000000000000000000000000000000000000000..4aac40bae8e67b0b4964ddd4e84445845049bbad Binary files /dev/null and b/doc/ppocr_v4/ppocrv4_framework.png differ diff --git a/doc/ppocr_v4/ppocrv4_gtc.png b/doc/ppocr_v4/ppocrv4_gtc.png new file mode 100644 index 0000000000000000000000000000000000000000..7e6a3f5c13ca4c3012d0dd98ba857153c75e607a Binary files /dev/null and b/doc/ppocr_v4/ppocrv4_gtc.png differ diff --git a/doc/ppocr_v4/v4_rec_pipeline.png b/doc/ppocr_v4/v4_rec_pipeline.png new file mode 100644 index 0000000000000000000000000000000000000000..b1ec7a96892c6fa992c79531da4979164027b99c Binary files /dev/null and b/doc/ppocr_v4/v4_rec_pipeline.png differ diff --git a/paddleocr.py b/paddleocr.py index af0145b48b7d8a8e6860cfb69e36b7a973a1149c..dc92cbf6b7c1789af629ba764ad1c0c12b936e4c 100644 --- a/paddleocr.py +++ b/paddleocr.py @@ -26,37 +26,133 @@ import cv2 import logging import numpy as np from pathlib import Path +import base64 +from io import BytesIO +from PIL import Image -tools = importlib.import_module('.', 'tools') -ppocr = importlib.import_module('.', 'ppocr') -ppstructure = importlib.import_module('.', 'ppstructure') -from tools.infer import predict_system -from ppocr.utils.logging import get_logger +def _import_file(module_name, file_path, make_importable=False): + spec = importlib.util.spec_from_file_location(module_name, file_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + if make_importable: + sys.modules[module_name] = module + return module -logger = get_logger() -from ppocr.utils.utility import check_and_read, get_image_file_list + +tools = _import_file( + 'tools', os.path.join(__dir__, 'tools/__init__.py'), make_importable=True) +ppocr = importlib.import_module('ppocr', 'paddleocr') +ppstructure = importlib.import_module('ppstructure', 'paddleocr') +from ppocr.utils.logging import get_logger +from tools.infer import predict_system +from ppocr.utils.utility import check_and_read, get_image_file_list, alpha_to_color, binarize_img from ppocr.utils.network import maybe_download, download_with_progressbar, is_link, confirm_model_dir_url from tools.infer.utility import draw_ocr, str2bool, check_gpu from ppstructure.utility import init_args, draw_structure_result from ppstructure.predict_system import StructureSystem, save_structure_res, to_excel +logger = get_logger() __all__ = [ 'PaddleOCR', 'PPStructure', 'draw_ocr', 'draw_structure_result', 'save_structure_res', 'download_with_progressbar', 'to_excel' ] SUPPORT_DET_MODEL = ['DB'] -VERSION = '2.6.1.0' +VERSION = '2.7.0.2' SUPPORT_REC_MODEL = ['CRNN', 'SVTR_LCNet'] BASE_DIR = os.path.expanduser("~/.paddleocr/") -DEFAULT_OCR_MODEL_VERSION = 'PP-OCRv3' -SUPPORT_OCR_MODEL_VERSION = ['PP-OCR', 'PP-OCRv2', 'PP-OCRv3'] +DEFAULT_OCR_MODEL_VERSION = 'PP-OCRv4' +SUPPORT_OCR_MODEL_VERSION = ['PP-OCR', 'PP-OCRv2', 'PP-OCRv3', 'PP-OCRv4'] DEFAULT_STRUCTURE_MODEL_VERSION = 'PP-StructureV2' SUPPORT_STRUCTURE_MODEL_VERSION = ['PP-Structure', 'PP-StructureV2'] MODEL_URLS = { 'OCR': { + 'PP-OCRv4': { + 'det': { + 'ch': { + 'url': + 'https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_det_infer.tar', + }, + 'en': { + 'url': + 'https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar', + }, + 'ml': { + 'url': + 
'https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_infer.tar' + } + }, + 'rec': { + 'ch': { + 'url': + 'https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_rec_infer.tar', + 'dict_path': './ppocr/utils/ppocr_keys_v1.txt' + }, + 'en': { + 'url': + 'https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar', + 'dict_path': './ppocr/utils/en_dict.txt' + }, + 'korean': { + 'url': + 'https://paddleocr.bj.bcebos.com/PP-OCRv4/multilingual/korean_PP-OCRv4_rec_infer.tar', + 'dict_path': './ppocr/utils/dict/korean_dict.txt' + }, + 'japan': { + 'url': + 'https://paddleocr.bj.bcebos.com/PP-OCRv4/multilingual/japan_PP-OCRv4_rec_infer.tar', + 'dict_path': './ppocr/utils/dict/japan_dict.txt' + }, + 'chinese_cht': { + 'url': + 'https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/chinese_cht_PP-OCRv3_rec_infer.tar', + 'dict_path': './ppocr/utils/dict/chinese_cht_dict.txt' + }, + 'ta': { + 'url': + 'https://paddleocr.bj.bcebos.com/PP-OCRv4/multilingual/ta_PP-OCRv4_rec_infer.tar', + 'dict_path': './ppocr/utils/dict/ta_dict.txt' + }, + 'te': { + 'url': + 'https://paddleocr.bj.bcebos.com/PP-OCRv4/multilingual/te_PP-OCRv4_rec_infer.tar', + 'dict_path': './ppocr/utils/dict/te_dict.txt' + }, + 'ka': { + 'url': + 'https://paddleocr.bj.bcebos.com/PP-OCRv4/multilingual/ka_PP-OCRv4_rec_infer.tar', + 'dict_path': './ppocr/utils/dict/ka_dict.txt' + }, + 'latin': { + 'url': + 'https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/latin_PP-OCRv3_rec_infer.tar', + 'dict_path': './ppocr/utils/dict/latin_dict.txt' + }, + 'arabic': { + 'url': + 'https://paddleocr.bj.bcebos.com/PP-OCRv4/multilingual/arabic_PP-OCRv4_rec_infer.tar', + 'dict_path': './ppocr/utils/dict/arabic_dict.txt' + }, + 'cyrillic': { + 'url': + 'https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/cyrillic_PP-OCRv3_rec_infer.tar', + 'dict_path': './ppocr/utils/dict/cyrillic_dict.txt' + }, + 'devanagari': { + 'url': + 'https://paddleocr.bj.bcebos.com/PP-OCRv4/multilingual/devanagari_PP-OCRv4_rec_infer.tar', + 'dict_path': './ppocr/utils/dict/devanagari_dict.txt' + }, + }, + 'cls': { + 'ch': { + 'url': + 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar', + } + }, + }, 'PP-OCRv3': { 'det': { 'ch': { @@ -312,13 +408,14 @@ def parse_args(mMain=True): parser.add_argument("--det", type=str2bool, default=True) parser.add_argument("--rec", type=str2bool, default=True) parser.add_argument("--type", type=str, default='ocr') + parser.add_argument("--savefile", type=str2bool, default=False) parser.add_argument( "--ocr_version", type=str, choices=SUPPORT_OCR_MODEL_VERSION, - default='PP-OCRv3', + default='PP-OCRv4', help='OCR Model version, the current model support list is as follows: ' - '1. PP-OCRv3 Support Chinese and English detection and recognition model, and direction classifier model' + '1. PP-OCRv4/v3 Support Chinese and English detection and recognition model, and direction classifier model' '2. PP-OCRv2 Support Chinese detection and recognition model. ' '3. PP-OCR support Chinese detection, recognition and direction classifier and multilingual recognition model.' 
) @@ -416,7 +513,7 @@ def get_model_config(type, version, model_type, lang): def img_decode(content: bytes): np_arr = np.frombuffer(content, dtype=np.uint8) - return cv2.imdecode(np_arr, cv2.IMREAD_COLOR) + return cv2.imdecode(np_arr, cv2.IMREAD_UNCHANGED) def check_img(img): @@ -431,7 +528,25 @@ def check_img(img): img, flag_gif, flag_pdf = check_and_read(image_file) if not flag_gif and not flag_pdf: with open(image_file, 'rb') as f: - img = img_decode(f.read()) + img_str = f.read() + img = img_decode(img_str) + if img is None: + try: + buf = BytesIO() + image = BytesIO(img_str) + im = Image.open(image) + rgb = im.convert('RGB') + rgb.save(buf, 'jpeg') + buf.seek(0) + image_bytes = buf.read() + data_base64 = str(base64.b64encode(image_bytes), + encoding="utf-8") + image_decode = base64.b64decode(data_base64) + img_array = np.frombuffer(image_decode, np.uint8) + img = cv2.imdecode(img_array, cv2.IMREAD_COLOR) + except: + logger.error("error in loading image:{}".format(image_file)) + return None if img is None: logger.error("error in loading image:{}".format(image_file)) return None @@ -476,7 +591,7 @@ class PaddleOCR(predict_system.TextSystem): params.cls_model_dir, cls_url = confirm_model_dir_url( params.cls_model_dir, os.path.join(BASE_DIR, 'whl', 'cls'), cls_model_config['url']) - if params.ocr_version == 'PP-OCRv3': + if params.ocr_version in ['PP-OCRv3', 'PP-OCRv4']: params.rec_image_shape = "3, 48, 320" else: params.rec_image_shape = "3, 32, 320" @@ -502,14 +617,17 @@ class PaddleOCR(predict_system.TextSystem): super().__init__(params) self.page_num = params.page_num - def ocr(self, img, det=True, rec=True, cls=True): + def ocr(self, img, det=True, rec=True, cls=True, bin=False, inv=False, alpha_color=(255, 255, 255)): """ - ocr with paddleocr - args: - img: img for ocr, support ndarray, img_path and list or ndarray - det: use text detection or not. If false, only rec will be exec. Default is True - rec: use text recognition or not. If false, only det will be exec. Default is True - cls: use angle classifier or not. Default is True. If true, the text with rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance. Text with rotation of 90 or 270 degrees can be recognized even if cls=False. + OCR with PaddleOCR + args: + img: img for OCR, support ndarray, img_path and list or ndarray + det: use text detection or not. If False, only rec will be exec. Default is True + rec: use text recognition or not. If False, only det will be exec. Default is True + cls: use angle classifier or not. Default is True. If True, the text with rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance. Text with rotation of 90 or 270 degrees can be recognized even if cls=False. + bin: binarize image to black and white. Default is False. + inv: invert image colors. Default is False. + alpha_color: set RGB color Tuple for transparent parts replacement. Default is pure white. 
""" assert isinstance(img, (np.ndarray, list, str, bytes)) if isinstance(img, list) and det == True: @@ -517,7 +635,7 @@ class PaddleOCR(predict_system.TextSystem): exit(0) if cls == True and self.use_angle_cls == False: logger.warning( - 'Since the angle classifier is not initialized, the angle classifier will not be uesd during the forward process' + 'Since the angle classifier is not initialized, it will not be used during the forward process' ) img = check_img(img) @@ -528,10 +646,23 @@ class PaddleOCR(predict_system.TextSystem): imgs = img[:self.page_num] else: imgs = [img] + + def preprocess_image(_image): + _image = alpha_to_color(_image, alpha_color) + if inv: + _image = cv2.bitwise_not(_image) + if bin: + _image = binarize_img(_image) + return _image + if det and rec: ocr_res = [] for idx, img in enumerate(imgs): + img = preprocess_image(img) dt_boxes, rec_res, _ = self.__call__(img, cls) + if not dt_boxes and not rec_res: + ocr_res.append(None) + continue tmp_res = [[box.tolist(), res] for box, res in zip(dt_boxes, rec_res)] ocr_res.append(tmp_res) @@ -539,7 +670,11 @@ class PaddleOCR(predict_system.TextSystem): elif det and not rec: ocr_res = [] for idx, img in enumerate(imgs): + img = preprocess_image(img) dt_boxes, elapse = self.text_detector(img) + if not dt_boxes: + ocr_res.append(None) + continue tmp_res = [box.tolist() for box in dt_boxes] ocr_res.append(tmp_res) return ocr_res @@ -548,6 +683,7 @@ class PaddleOCR(predict_system.TextSystem): cls_res = [] for idx, img in enumerate(imgs): if not isinstance(img, list): + img = preprocess_image(img) img = [img] if self.use_angle_cls and cls: img, cls_res_tmp, elapse = self.text_classifier(img) @@ -649,15 +785,35 @@ def main(): img_name = os.path.basename(img_path).split('.')[0] logger.info('{}{}{}'.format('*' * 10, img_path, '*' * 10)) if args.type == 'ocr': - result = engine.ocr(img_path, - det=args.det, - rec=args.rec, - cls=args.use_angle_cls) + result = engine.ocr( + img_path, + det=args.det, + rec=args.rec, + cls=args.use_angle_cls, + bin=args.binarize, + inv=args.invert, + alpha_color=args.alphacolor + ) if result is not None: + lines = [] for idx in range(len(result)): res = result[idx] for line in res: logger.info(line) + val = '[' + for box in line[0]: + val += str(box[0]) + ',' + str(box[1]) + ',' + + val = val[:-1] + val += '],' + line[1][0] + ',' + str(line[1][1]) + '\n' + lines.append(val) + if args.savefile: + if os.path.exists(args.output) is False: + os.mkdir(args.output) + outfile = args.output + '/' + img_name + '.txt' + with open(outfile,'w',encoding='utf-8') as f: + f.writelines(lines) + elif args.type == 'structure': img, flag_gif, flag_pdf = check_and_read(img_path) if not flag_gif and not flag_pdf: @@ -694,7 +850,7 @@ def main(): logger.info('processing {}/{} page:'.format(index + 1, len(img_paths))) new_img_name = os.path.basename(new_img_path).split('.')[0] - result = engine(new_img_path, img_idx=index) + result = engine(img, img_idx=index) save_structure_res(result, args.output, img_name, index) if args.recovery and result != []: diff --git a/ppocr/data/__init__.py b/ppocr/data/__init__.py index b602a346dbe4b0d45af287f25f05ead0f62daf44..48cd8ad8c5ccef9b0dd3c9a0c66eb028a70c8334 100644 --- a/ppocr/data/__init__.py +++ b/ppocr/data/__init__.py @@ -33,12 +33,22 @@ from paddle.io import Dataset, DataLoader, BatchSampler, DistributedBatchSampler import paddle.distributed as dist from ppocr.data.imaug import transform, create_operators -from ppocr.data.simple_dataset import SimpleDataSet -from 
ppocr.data.lmdb_dataset import LMDBDataSet, LMDBDataSetSR +from ppocr.data.simple_dataset import SimpleDataSet, MultiScaleDataSet +from ppocr.data.lmdb_dataset import LMDBDataSet, LMDBDataSetSR, LMDBDataSetTableMaster from ppocr.data.pgnet_dataset import PGDataSet from ppocr.data.pubtab_dataset import PubTabDataSet +from ppocr.data.multi_scale_sampler import MultiScaleSampler -__all__ = ['build_dataloader', 'transform', 'create_operators'] +# for PaddleX dataset_type +TextDetDataset = SimpleDataSet +TextRecDataset = SimpleDataSet +MSTextRecDataset = MultiScaleDataSet +PubTabTableRecDataset = PubTabDataSet +KieDataset = SimpleDataSet + +__all__ = [ + 'build_dataloader', 'transform', 'create_operators', 'set_signal_handlers' +] def term_mp(sig_num, frame): @@ -50,12 +60,43 @@ def term_mp(sig_num, frame): os.killpg(pgid, signal.SIGKILL) +def set_signal_handlers(): + pid = os.getpid() + try: + pgid = os.getpgid(pid) + except AttributeError: + # In case `os.getpgid` is not available, no signal handler will be set, + # because we cannot do safe cleanup. + pass + else: + # XXX: `term_mp` kills all processes in the process group, which in + # some cases includes the parent process of current process and may + # cause unexpected results. To solve this problem, we set signal + # handlers only when current process is the group leader. In the + # future, it would be better to consider killing only descendants of + # the current process. + if pid == pgid: + # support exit using ctrl+c + signal.signal(signal.SIGINT, term_mp) + signal.signal(signal.SIGTERM, term_mp) + + def build_dataloader(config, mode, device, logger, seed=None): config = copy.deepcopy(config) support_dict = [ - 'SimpleDataSet', 'LMDBDataSet', 'PGDataSet', 'PubTabDataSet', - 'LMDBDataSetSR' + 'SimpleDataSet', + 'LMDBDataSet', + 'PGDataSet', + 'PubTabDataSet', + 'LMDBDataSetSR', + 'LMDBDataSetTableMaster', + 'MultiScaleDataSet', + 'TextDetDataset', + 'TextRecDataset', + 'MSTextRecDataset', + 'PubTabTableRecDataset', + 'KieDataset', ] module_name = config[mode]['dataset']['name'] assert module_name in support_dict, Exception( @@ -76,11 +117,16 @@ def build_dataloader(config, mode, device, logger, seed=None): if mode == "Train": # Distribute data to multiple cards - batch_sampler = DistributedBatchSampler( - dataset=dataset, - batch_size=batch_size, - shuffle=shuffle, - drop_last=drop_last) + if 'sampler' in config[mode]: + config_sampler = config[mode]['sampler'] + sampler_name = config_sampler.pop("name") + batch_sampler = eval(sampler_name)(dataset, **config_sampler) + else: + batch_sampler = DistributedBatchSampler( + dataset=dataset, + batch_size=batch_size, + shuffle=shuffle, + drop_last=drop_last) else: # Distribute data to single card batch_sampler = BatchSampler( @@ -103,8 +149,4 @@ def build_dataloader(config, mode, device, logger, seed=None): use_shared_memory=use_shared_memory, collate_fn=collate_fn) - # support exit using ctrl+c - signal.signal(signal.SIGINT, term_mp) - signal.signal(signal.SIGTERM, term_mp) - return data_loader diff --git a/ppocr/data/imaug/__init__.py b/ppocr/data/imaug/__init__.py index 93d97446d44070b9c10064fbe10b0b5e05628a6a..121582b4908750fca6612dc592a3671ef4dcb328 100644 --- a/ppocr/data/imaug/__init__.py +++ b/ppocr/data/imaug/__init__.py @@ -27,7 +27,7 @@ from .make_pse_gt import MakePseGt from .rec_img_aug import BaseDataAugmentation, RecAug, RecConAug, RecResizeImg, ClsResizeImg, \ SRNRecResizeImg, GrayRecResizeImg, SARRecResizeImg, PRENResizeImg, \ ABINetRecResizeImg, SVTRRecResizeImg, 
ABINetRecAug, VLRecResizeImg, SPINRecResizeImg, RobustScannerRecResizeImg, \ - RFLRecResizeImg + RFLRecResizeImg, SVTRRecAug from .ssl_img_aug import SSLRotateResize from .randaugment import RandAugment from .copy_paste import CopyPaste diff --git a/ppocr/data/imaug/abinet_aug.py b/ppocr/data/imaug/abinet_aug.py index eefdc75d5a5c0ac3f7136bf22a2adb31129bd313..bcbdadb1bae06d0a58de4743df9ce0d8c15fcfa1 100644 --- a/ppocr/data/imaug/abinet_aug.py +++ b/ppocr/data/imaug/abinet_aug.py @@ -205,7 +205,7 @@ class CVRandomAffine(object): for x, y in startpoints] rect = cv2.minAreaRect(np.array(endpoints)) - bbox = cv2.boxPoints(rect).astype(dtype=np.int) + bbox = cv2.boxPoints(rect).astype(dtype=np.int32) max_x, max_y = bbox[:, 0].max(), bbox[:, 1].max() min_x, min_y = bbox[:, 0].min(), bbox[:, 1].min() @@ -234,9 +234,9 @@ class CVRandomPerspective(object): def get_params(self, width, height, distortion): offset_h = sample_asym( - distortion * height / 2, size=4).astype(dtype=np.int) + distortion * height / 2, size=4).astype(dtype=np.int32) offset_w = sample_asym( - distortion * width / 2, size=4).astype(dtype=np.int) + distortion * width / 2, size=4).astype(dtype=np.int32) topleft = (offset_w[0], offset_h[0]) topright = (width - 1 - offset_w[1], offset_h[1]) botright = (width - 1 - offset_w[2], height - 1 - offset_h[2]) @@ -256,7 +256,7 @@ class CVRandomPerspective(object): # TODO: more robust way to crop image rect = cv2.minAreaRect(endpoints) - bbox = cv2.boxPoints(rect).astype(dtype=np.int) + bbox = cv2.boxPoints(rect).astype(dtype=np.int32) max_x, max_y = bbox[:, 0].max(), bbox[:, 1].max() min_x, min_y = bbox[:, 0].min(), bbox[:, 1].min() min_x, min_y = max(min_x, 0), max(min_y, 0) @@ -405,3 +405,55 @@ class CVColorJitter(object): def __call__(self, img): if random.random() < self.p: return self.transforms(img) else: return img + + +class SVTRDeterioration(object): + def __init__(self, var, degrees, factor, p=0.5): + self.p = p + transforms = [] + if var is not None: + transforms.append(CVGaussianNoise(var=var)) + if degrees is not None: + transforms.append(CVMotionBlur(degrees=degrees)) + if factor is not None: + transforms.append(CVRescale(factor=factor)) + self.transforms = transforms + + def __call__(self, img): + if random.random() < self.p: + random.shuffle(self.transforms) + transforms = Compose(self.transforms) + return transforms(img) + else: + return img + + +class SVTRGeometry(object): + def __init__(self, + aug_type=0, + degrees=15, + translate=(0.3, 0.3), + scale=(0.5, 2.), + shear=(45, 15), + distortion=0.5, + p=0.5): + self.aug_type = aug_type + self.p = p + self.transforms = [] + self.transforms.append(CVRandomRotation(degrees=degrees)) + self.transforms.append( + CVRandomAffine( + degrees=degrees, translate=translate, scale=scale, shear=shear)) + self.transforms.append(CVRandomPerspective(distortion=distortion)) + + def __call__(self, img): + if random.random() < self.p: + if self.aug_type: + random.shuffle(self.transforms) + transforms = Compose(self.transforms[:random.randint(1, 3)]) + img = transforms(img) + else: + img = self.transforms[random.randint(0, 2)](img) + return img + else: + return img diff --git a/ppocr/data/imaug/ct_process.py b/ppocr/data/imaug/ct_process.py index 59715090036e1020800950b02b9ea06ab5c8d4c2..933d42f98c068780c2140740eddbc553cec02ee6 100644 --- a/ppocr/data/imaug/ct_process.py +++ b/ppocr/data/imaug/ct_process.py @@ -19,7 +19,8 @@ import pyclipper import paddle import numpy as np -import Polygon as plg +from ppocr.utils.utility import check_install 
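The two augmentation classes added above (`SVTRGeometry`, `SVTRDeterioration`) are composed with the existing `CVColorJitter` by the `SVTRRecAug` wrapper that appears later in this diff; a hedged stand-alone sketch of that composition (input path and parameter values are illustrative):

```python
# Hedged sketch: composing the new SVTR augmentations on a single text crop.
import cv2
from paddle.vision.transforms import Compose
from ppocr.data.imaug.abinet_aug import (SVTRGeometry, SVTRDeterioration,
                                         CVColorJitter)

aug = Compose([
    SVTRGeometry(aug_type=1, degrees=45, translate=(0.0, 0.0),
                 scale=(0.5, 2.0), shear=(45, 15), distortion=0.5, p=0.5),
    SVTRDeterioration(var=20, degrees=6, factor=4, p=0.25),
    CVColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.1, p=0.25),
])

img = cv2.imread('word_crop.jpg')   # HWC uint8 crop, path is illustrative
aug_img = aug(img)
```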
+ import scipy.io as scio from PIL import Image @@ -70,6 +71,8 @@ class MakeShrink(): return peri def shrink(self, bboxes, rate, max_shr=20): + check_install('Polygon', 'Polygon3') + import Polygon as plg rate = rate * rate shrinked_bboxes = [] for bbox in bboxes: diff --git a/ppocr/data/imaug/drrg_targets.py b/ppocr/data/imaug/drrg_targets.py index c56e878b837328ef2efde40b96b5571dffbb4791..7fdfd096819b266290353d842ef531e8220c586c 100644 --- a/ppocr/data/imaug/drrg_targets.py +++ b/ppocr/data/imaug/drrg_targets.py @@ -18,7 +18,7 @@ https://github.com/open-mmlab/mmocr/blob/main/mmocr/datasets/pipelines/textdet_t import cv2 import numpy as np -from lanms import merge_quadrangle_n9 as la_nms +from ppocr.utils.utility import check_install from numpy.linalg import norm @@ -543,6 +543,8 @@ class DRRGTargets(object): score = np.ones((text_comps.shape[0], 1), dtype=np.float32) text_comps = np.hstack([text_comps, score]) + check_install('lanms', 'lanms-neo') + from lanms import merge_quadrangle_n9 as la_nms text_comps = la_nms(text_comps, self.text_comp_nms_thr) if text_comps.shape[0] >= 1: diff --git a/ppocr/data/imaug/fce_aug.py b/ppocr/data/imaug/fce_aug.py index 66bafef13caaaa958c89f865bde04cb25f031329..baaaa3355558bb6919f6b0b3b58016680f15f31c 100644 --- a/ppocr/data/imaug/fce_aug.py +++ b/ppocr/data/imaug/fce_aug.py @@ -208,7 +208,7 @@ class RandomCropFlip: for polygon in all_polys: rect = cv2.minAreaRect(polygon.astype(np.int32).reshape(-1, 2)) box = cv2.boxPoints(rect) - box = np.int0(box) + box = np.int64(box) text_polys.append([box[0], box[1], box[2], box[3]]) polys = np.array(text_polys, dtype=np.int32) diff --git a/ppocr/data/imaug/fce_targets.py b/ppocr/data/imaug/fce_targets.py index 8c64276e26665d2779d35154bf9cd77edddad580..054631cb2ddee4e4fd4d5532538f318379568ded 100644 --- a/ppocr/data/imaug/fce_targets.py +++ b/ppocr/data/imaug/fce_targets.py @@ -22,10 +22,12 @@ from numpy.fft import fft from numpy.linalg import norm import sys + def vector_slope(vec): assert len(vec) == 2 return abs(vec[1] / (vec[0] + 1e-8)) + class FCENetTargets: """Generate the ground truth targets of FCENet: Fourier Contour Embedding for Arbitrary-Shaped Text Detection. 
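`ct_process.py` and `drrg_targets.py` now defer their optional native dependencies (`Polygon3`, `lanms-neo`) to first use through `check_install`; a rough sketch of what such a helper is assumed to do (try the import, pip-install the wheel if it is missing) is below — the real implementation lives in `ppocr/utils/utility.py` and may differ:

```python
# Hedged sketch of a check_install-style helper (not the repo's exact code).
import importlib
import subprocess
import sys


def check_install(module_name, install_name):
    """Import module_name; if it is missing, pip-install install_name."""
    try:
        importlib.import_module(module_name)
    except ImportError:
        print('{} not found, installing {} ...'.format(module_name, install_name))
        subprocess.check_call(
            [sys.executable, '-m', 'pip', 'install', install_name])


# usage mirrors the diff: check first, then import where the dependency is needed
check_install('Polygon', 'Polygon3')
import Polygon as plg  # noqa: E402
```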
@@ -107,7 +109,9 @@ class FCENetTargets: for i in range(1, n): current_line_len = i * delta_length - while current_edge_ind + 1 < len(length_cumsum) and current_line_len >= length_cumsum[current_edge_ind + 1]: + while current_edge_ind + 1 < len( + length_cumsum) and current_line_len >= length_cumsum[ + current_edge_ind + 1]: current_edge_ind += 1 current_edge_end_shift = current_line_len - length_cumsum[ @@ -239,10 +243,9 @@ class FCENetTargets: head_inds = [head_start, head_end] tail_inds = [tail_start, tail_end] else: - if vector_slope(points[1] - points[0]) + vector_slope( - points[3] - points[2]) < vector_slope(points[ - 2] - points[1]) + vector_slope(points[0] - points[ - 3]): + if vector_slope(points[1] - points[0]) + vector_slope(points[ + 3] - points[2]) < vector_slope(points[2] - points[ + 1]) + vector_slope(points[0] - points[3]): horizontal_edge_inds = [[0, 1], [2, 3]] vertical_edge_inds = [[3, 0], [1, 2]] else: @@ -582,7 +585,7 @@ class FCENetTargets: lv_ignore_polys = [[] for i in range(len(lv_size_divs))] level_maps = [] for poly in text_polys: - polygon = np.array(poly, dtype=np.int).reshape((1, -1, 2)) + polygon = np.array(poly, dtype=np.int32).reshape((1, -1, 2)) _, _, box_w, box_h = cv2.boundingRect(polygon) proportion = max(box_h, box_w) / (h + 1e-8) @@ -591,7 +594,7 @@ class FCENetTargets: lv_text_polys[ind].append(poly / lv_size_divs[ind]) for ignore_poly in ignore_polys: - polygon = np.array(ignore_poly, dtype=np.int).reshape((1, -1, 2)) + polygon = np.array(ignore_poly, dtype=np.int32).reshape((1, -1, 2)) _, _, box_w, box_h = cv2.boundingRect(polygon) proportion = max(box_h, box_w) / (h + 1e-8) diff --git a/ppocr/data/imaug/label_ops.py b/ppocr/data/imaug/label_ops.py index 63c5d6aa7851422e21a567dfe938c417793ca7ea..148b09368717a09fcc20e0852c69a948311cb511 100644 --- a/ppocr/data/imaug/label_ops.py +++ b/ppocr/data/imaug/label_ops.py @@ -64,7 +64,7 @@ class DetLabelEncode(object): return None boxes = self.expand_points_num(boxes) boxes = np.array(boxes, dtype=np.float32) - txt_tags = np.array(txt_tags, dtype=np.bool) + txt_tags = np.array(txt_tags, dtype=np.bool_) data['polys'] = boxes data['texts'] = txts @@ -218,7 +218,7 @@ class E2ELabelEncodeTest(BaseRecLabelEncode): else: txt_tags.append(False) boxes = np.array(boxes, dtype=np.float32) - txt_tags = np.array(txt_tags, dtype=np.bool) + txt_tags = np.array(txt_tags, dtype=np.bool_) data['polys'] = boxes data['ignore_tags'] = txt_tags temp_texts = [] @@ -254,7 +254,7 @@ class E2ELabelEncodeTrain(object): else: txt_tags.append(False) boxes = np.array(boxes, dtype=np.float32) - txt_tags = np.array(txt_tags, dtype=np.bool) + txt_tags = np.array(txt_tags, dtype=np.bool_) data['polys'] = boxes data['texts'] = txts @@ -886,6 +886,62 @@ class SARLabelEncode(BaseRecLabelEncode): return [self.padding_idx] +class SATRNLabelEncode(BaseRecLabelEncode): + """ Convert between text-label and text-index """ + + def __init__(self, + max_text_length, + character_dict_path=None, + use_space_char=False, + lower=False, + **kwargs): + super(SATRNLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char) + self.lower = lower + + def add_special_char(self, dict_character): + beg_end_str = "" + unknown_str = "" + padding_str = "" + dict_character = dict_character + [unknown_str] + self.unknown_idx = len(dict_character) - 1 + dict_character = dict_character + [beg_end_str] + self.start_idx = len(dict_character) - 1 + self.end_idx = len(dict_character) - 1 + dict_character = dict_character + [padding_str] + 
self.padding_idx = len(dict_character) - 1 + + return dict_character + + def encode(self, text): + if self.lower: + text = text.lower() + text_list = [] + for char in text: + text_list.append(self.dict.get(char, self.unknown_idx)) + if len(text_list) == 0: + return None + return text_list + + def __call__(self, data): + text = data['label'] + text = self.encode(text) + if text is None: + return None + data['length'] = np.array(len(text)) + target = [self.start_idx] + text + [self.end_idx] + padded_text = [self.padding_idx for _ in range(self.max_text_len)] + if len(target) > self.max_text_len: + padded_text = target[:self.max_text_len] + else: + padded_text[:len(target)] = target + data['label'] = np.array(padded_text) + return data + + def get_ignored_tokens(self): + return [self.padding_idx] + + class PRENLabelEncode(BaseRecLabelEncode): def __init__(self, max_text_length, @@ -1185,27 +1241,36 @@ class MultiLabelEncode(BaseRecLabelEncode): max_text_length, character_dict_path=None, use_space_char=False, + gtc_encode=None, **kwargs): super(MultiLabelEncode, self).__init__( max_text_length, character_dict_path, use_space_char) self.ctc_encode = CTCLabelEncode(max_text_length, character_dict_path, use_space_char, **kwargs) - self.sar_encode = SARLabelEncode(max_text_length, character_dict_path, - use_space_char, **kwargs) + self.gtc_encode_type = gtc_encode + if gtc_encode is None: + self.gtc_encode = SARLabelEncode( + max_text_length, character_dict_path, use_space_char, **kwargs) + else: + self.gtc_encode = eval(gtc_encode)( + max_text_length, character_dict_path, use_space_char, **kwargs) def __call__(self, data): data_ctc = copy.deepcopy(data) - data_sar = copy.deepcopy(data) + data_gtc = copy.deepcopy(data) data_out = dict() data_out['img_path'] = data.get('img_path', None) data_out['image'] = data['image'] ctc = self.ctc_encode.__call__(data_ctc) - sar = self.sar_encode.__call__(data_sar) - if ctc is None or sar is None: + gtc = self.gtc_encode.__call__(data_gtc) + if ctc is None or gtc is None: return None data_out['label_ctc'] = ctc['label'] - data_out['label_sar'] = sar['label'] + if self.gtc_encode_type is not None: + data_out['label_gtc'] = gtc['label'] + else: + data_out['label_sar'] = gtc['label'] data_out['length'] = ctc['length'] return data_out @@ -1396,10 +1461,9 @@ class VLLabelEncode(BaseRecLabelEncode): max_text_length, character_dict_path=None, use_space_char=False, - lower=True, **kwargs): - super(VLLabelEncode, self).__init__( - max_text_length, character_dict_path, use_space_char, lower) + super(VLLabelEncode, self).__init__(max_text_length, + character_dict_path, use_space_char) self.dict = {} for i, char in enumerate(self.character): self.dict[char] = i diff --git a/ppocr/data/imaug/make_border_map.py b/ppocr/data/imaug/make_border_map.py index abab38368db2de84e54b060598fc509a65219296..03b7817cfbe2068184981b18a7aa539c8d350e3b 100644 --- a/ppocr/data/imaug/make_border_map.py +++ b/ppocr/data/imaug/make_border_map.py @@ -44,6 +44,10 @@ class MakeBorderMap(object): self.shrink_ratio = shrink_ratio self.thresh_min = thresh_min self.thresh_max = thresh_max + if 'total_epoch' in kwargs and 'epoch' in kwargs and kwargs[ + 'epoch'] != "None": + self.shrink_ratio = self.shrink_ratio + 0.2 * kwargs[ + 'epoch'] / float(kwargs['total_epoch']) def __call__(self, data): diff --git a/ppocr/data/imaug/make_shrink_map.py b/ppocr/data/imaug/make_shrink_map.py index 6c65c20e5621f91a5b1fba549b059c92923fca6f..d0317b61fe05ce75c479a2485cef540742f489e0 100644 --- 
a/ppocr/data/imaug/make_shrink_map.py +++ b/ppocr/data/imaug/make_shrink_map.py @@ -38,6 +38,10 @@ class MakeShrinkMap(object): def __init__(self, min_text_size=8, shrink_ratio=0.4, **kwargs): self.min_text_size = min_text_size self.shrink_ratio = shrink_ratio + if 'total_epoch' in kwargs and 'epoch' in kwargs and kwargs[ + 'epoch'] != "None": + self.shrink_ratio = self.shrink_ratio + 0.2 * kwargs[ + 'epoch'] / float(kwargs['total_epoch']) def __call__(self, data): image = data['image'] diff --git a/ppocr/data/imaug/rec_img_aug.py b/ppocr/data/imaug/rec_img_aug.py index e22153bdeab06565feed79715633172a275aecc7..9780082f1cc3629c7b05a24747537d473d2a42a4 100644 --- a/ppocr/data/imaug/rec_img_aug.py +++ b/ppocr/data/imaug/rec_img_aug.py @@ -18,8 +18,9 @@ import numpy as np import random import copy from PIL import Image +import PIL from .text_image_aug import tia_perspective, tia_stretch, tia_distort -from .abinet_aug import CVGeometry, CVDeterioration, CVColorJitter +from .abinet_aug import CVGeometry, CVDeterioration, CVColorJitter, SVTRGeometry, SVTRDeterioration from paddle.vision.transforms import Compose @@ -69,6 +70,8 @@ class BaseDataAugmentation(object): self.jitter_prob = jitter_prob self.blur_prob = blur_prob self.hsv_aug_prob = hsv_aug_prob + # for GaussianBlur + self.fil = cv2.getGaussianKernel(ksize=5, sigma=1, ktype=cv2.CV_32F) def __call__(self, data): img = data['image'] @@ -78,7 +81,8 @@ class BaseDataAugmentation(object): img = get_crop(img) if random.random() <= self.blur_prob: - img = blur(img) + # GaussianBlur + img = cv2.sepFilter2D(img, -1, self.fil, self.fil) if random.random() <= self.hsv_aug_prob: img = hsv_aug(img) @@ -169,6 +173,38 @@ class RecConAug(object): return data +class SVTRRecAug(object): + def __init__(self, + aug_type=0, + geometry_p=0.5, + deterioration_p=0.25, + colorjitter_p=0.25, + **kwargs): + self.transforms = Compose([ + SVTRGeometry( + aug_type=aug_type, + degrees=45, + translate=(0.0, 0.0), + scale=(0.5, 2.), + shear=(45, 15), + distortion=0.5, + p=geometry_p), SVTRDeterioration( + var=20, degrees=6, factor=4, p=deterioration_p), + CVColorJitter( + brightness=0.5, + contrast=0.5, + saturation=0.5, + hue=0.1, + p=colorjitter_p) + ]) + + def __call__(self, data): + img = data['image'] + img = self.transforms(img) + data['image'] = img + return data + + class ClsResizeImg(object): def __init__(self, image_shape, **kwargs): self.image_shape = image_shape @@ -184,17 +220,20 @@ class RecResizeImg(object): def __init__(self, image_shape, infer_mode=False, + eval_mode=False, character_dict_path='./ppocr/utils/ppocr_keys_v1.txt', padding=True, **kwargs): self.image_shape = image_shape self.infer_mode = infer_mode + self.eval_mode = eval_mode self.character_dict_path = character_dict_path self.padding = padding def __call__(self, data): img = data['image'] - if self.infer_mode and self.character_dict_path is not None: + if self.eval_mode or (self.infer_mode and + self.character_dict_path is not None): norm_img, valid_ratio = resize_norm_img_chinese(img, self.image_shape) else: @@ -368,7 +407,7 @@ class GrayRecResizeImg(object): def __init__(self, image_shape, resize_type, - inter_type='Image.ANTIALIAS', + inter_type="Image.Resampling.LANCZOS", scale=True, padding=False, **kwargs): @@ -538,7 +577,7 @@ def resize_norm_img_chinese(img, image_shape): max_wh_ratio = imgW * 1.0 / imgH h, w = img.shape[0], img.shape[1] ratio = w * 1.0 / h - max_wh_ratio = min(max(max_wh_ratio, ratio), max_wh_ratio) + max_wh_ratio = max(max_wh_ratio, ratio) imgW = int(imgH * 
max_wh_ratio) if math.ceil(imgH * ratio) > imgW: resized_w = imgW diff --git a/ppocr/data/lmdb_dataset.py b/ppocr/data/lmdb_dataset.py index 295643e401481d30cf433346727f39d4a4c7d2f4..f3efb604285a2dbc0062f80c95ad1ee8a9b3a127 100644 --- a/ppocr/data/lmdb_dataset.py +++ b/ppocr/data/lmdb_dataset.py @@ -18,6 +18,7 @@ import lmdb import cv2 import string import six +import pickle from PIL import Image from .imaug import transform, create_operators @@ -203,3 +204,87 @@ class LMDBDataSetSR(LMDBDataSet): if outs is None: return self.__getitem__(np.random.randint(self.__len__())) return outs + + +class LMDBDataSetTableMaster(LMDBDataSet): + def load_hierarchical_lmdb_dataset(self, data_dir): + lmdb_sets = {} + dataset_idx = 0 + env = lmdb.open( + data_dir, + max_readers=32, + readonly=True, + lock=False, + readahead=False, + meminit=False) + txn = env.begin(write=False) + num_samples = int(pickle.loads(txn.get(b"__len__"))) + lmdb_sets[dataset_idx] = {"dirpath":data_dir, "env":env, \ + "txn":txn, "num_samples":num_samples} + return lmdb_sets + + def get_img_data(self, value): + """get_img_data""" + if not value: + return None + imgdata = np.frombuffer(value, dtype='uint8') + if imgdata is None: + return None + imgori = cv2.imdecode(imgdata, 1) + if imgori is None: + return None + return imgori + + def get_lmdb_sample_info(self, txn, index): + def convert_bbox(bbox_str_list): + bbox_list = [] + for bbox_str in bbox_str_list: + bbox_list.append(int(bbox_str)) + return bbox_list + + try: + data = pickle.loads(txn.get(str(index).encode('utf8'))) + except: + return None + + # img_name, img, info_lines + file_name = data[0] + bytes = data[1] + info_lines = data[2] # raw data from TableMASTER annotation file. + # parse info_lines + raw_data = info_lines.strip().split('\n') + raw_name, text = raw_data[0], raw_data[ + 1] # don't filter the samples's length over max_seq_len. 
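`LMDBDataSetTableMaster` above expects one pickled record per sample key plus a pickled `b"__len__"` count; a hedged writer-side sketch of that layout (the helper name and annotation format are assumptions inferred from the reader code):

```python
# Hedged sketch of the LMDB layout LMDBDataSetTableMaster reads:
#   b"__len__"      -> pickled sample count
#   str(i) (utf-8)  -> pickled [img_name, img_bytes, info_lines]
# where info_lines is the raw TableMASTER annotation text (name, tokens, bboxes).
import pickle
import lmdb


def write_tablemaster_lmdb(out_dir, samples):
    env = lmdb.open(out_dir, map_size=1 << 40)
    with env.begin(write=True) as txn:
        for i, (img_name, img_bytes, info_lines) in enumerate(samples):
            txn.put(str(i).encode('utf8'),
                    pickle.dumps([img_name, img_bytes, info_lines]))
        txn.put(b"__len__", pickle.dumps(len(samples)))
```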
+ text = text.split(',') + bbox_str_list = raw_data[2:] + bbox_split = ',' + bboxes = [{ + 'bbox': convert_bbox(bsl.strip().split(bbox_split)), + 'tokens': ['1', '2'] + } for bsl in bbox_str_list] + + # advance parse bbox + # import pdb;pdb.set_trace() + + line_info = {} + line_info['file_name'] = file_name + line_info['structure'] = text + line_info['cells'] = bboxes + line_info['image'] = bytes + return line_info + + def __getitem__(self, idx): + lmdb_idx, file_idx = self.data_idx_order_list[idx] + lmdb_idx = int(lmdb_idx) + file_idx = int(file_idx) + data = self.get_lmdb_sample_info(self.lmdb_sets[lmdb_idx]['txn'], + file_idx) + if data is None: + return self.__getitem__(np.random.randint(self.__len__())) + outs = transform(data, self.ops) + if outs is None: + return self.__getitem__(np.random.randint(self.__len__())) + return outs + + def __len__(self): + return self.data_idx_order_list.shape[0] diff --git a/ppocr/data/multi_scale_sampler.py b/ppocr/data/multi_scale_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..45793e2ba1f5c5a4f4388dd22e7146725854fa76 --- /dev/null +++ b/ppocr/data/multi_scale_sampler.py @@ -0,0 +1,171 @@ +from paddle.io import Sampler +import paddle.distributed as dist + +import numpy as np +import random +import math + + +class MultiScaleSampler(Sampler): + def __init__(self, + data_source, + scales, + first_bs=128, + fix_bs=True, + divided_factor=[8, 16], + is_training=True, + ratio_wh=0.8, + max_w=480., + seed=None): + """ + multi scale samper + Args: + data_source(dataset) + scales(list): several scales for image resolution + first_bs(int): batch size for the first scale in scales + divided_factor(list[w, h]): ImageNet models down-sample images by a factor, ensure that width and height dimensions are multiples are multiple of devided_factor. + is_training(boolean): mode + """ + # min. and max. spatial dimensions + self.data_source = data_source + self.data_idx_order_list = np.array(data_source.data_idx_order_list) + self.ds_width = data_source.ds_width + self.seed = data_source.seed + if self.ds_width: + self.wh_ratio = data_source.wh_ratio + self.wh_ratio_sort = data_source.wh_ratio_sort + self.n_data_samples = len(self.data_source) + self.ratio_wh = ratio_wh + self.max_w = max_w + + if isinstance(scales[0], list): + width_dims = [i[0] for i in scales] + height_dims = [i[1] for i in scales] + elif isinstance(scales[0], int): + width_dims = scales + height_dims = scales + base_im_w = width_dims[0] + base_im_h = height_dims[0] + base_batch_size = first_bs + + # Get the GPU and node related information + num_replicas = dist.get_world_size() + rank = dist.get_rank() + # adjust the total samples to avoid batch dropping + num_samples_per_replica = int(self.n_data_samples * 1.0 / num_replicas) + + img_indices = [idx for idx in range(self.n_data_samples)] + + self.shuffle = False + if is_training: + # compute the spatial dimensions and corresponding batch size + # ImageNet models down-sample images by a factor of 32. + # Ensure that width and height dimensions are multiples are multiple of 32. 
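The comment above describes how `MultiScaleSampler` snaps each candidate resolution to `divided_factor` multiples and, when `fix_bs` is False, rescales the batch size to keep a roughly constant pixel budget; a small stand-alone sketch of that derivation (scale values are illustrative):

```python
# Hedged illustration of how (width, height, batch_size) pairs are derived.
scales = [[320, 32], [320, 48], [320, 64]]   # candidate (w, h) resolutions
first_bs = 128                                # batch size at the first scale
divided_factor = [8, 16]                      # snap w / h to these multiples
fix_bs = False

base_w, base_h = scales[0]
base_elements = base_w * base_h * first_bs    # pixel budget to keep constant

for w, h in scales:
    w = (w // divided_factor[0]) * divided_factor[0]   # snap width
    h = (h // divided_factor[1]) * divided_factor[1]   # snap height
    bs = first_bs if fix_bs else int(max(1, base_elements / (h * w)))
    print(w, h, bs)
```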
+ width_dims = [ + int((w // divided_factor[0]) * divided_factor[0]) + for w in width_dims + ] + height_dims = [ + int((h // divided_factor[1]) * divided_factor[1]) + for h in height_dims + ] + + img_batch_pairs = list() + base_elements = base_im_w * base_im_h * base_batch_size + for (h, w) in zip(height_dims, width_dims): + if fix_bs: + batch_size = base_batch_size + else: + batch_size = int(max(1, (base_elements / (h * w)))) + img_batch_pairs.append((w, h, batch_size)) + self.img_batch_pairs = img_batch_pairs + self.shuffle = True + else: + self.img_batch_pairs = [(base_im_w, base_im_h, base_batch_size)] + + self.img_indices = img_indices + self.n_samples_per_replica = num_samples_per_replica + self.epoch = 0 + self.rank = rank + self.num_replicas = num_replicas + + self.batch_list = [] + self.current = 0 + last_index = num_samples_per_replica * num_replicas + indices_rank_i = self.img_indices[self.rank:last_index: + self.num_replicas] + while self.current < self.n_samples_per_replica: + for curr_w, curr_h, curr_bsz in self.img_batch_pairs: + end_index = min(self.current + curr_bsz, + self.n_samples_per_replica) + batch_ids = indices_rank_i[self.current:end_index] + n_batch_samples = len(batch_ids) + if n_batch_samples != curr_bsz: + batch_ids += indices_rank_i[:(curr_bsz - n_batch_samples)] + self.current += curr_bsz + + if len(batch_ids) > 0: + batch = [curr_w, curr_h, len(batch_ids)] + self.batch_list.append(batch) + random.shuffle(self.batch_list) + self.length = len(self.batch_list) + self.batchs_in_one_epoch = self.iter() + self.batchs_in_one_epoch_id = [ + i for i in range(len(self.batchs_in_one_epoch)) + ] + + def __iter__(self): + if self.seed is None: + random.seed(self.epoch) + self.epoch += 1 + else: + random.seed(self.seed) + random.shuffle(self.batchs_in_one_epoch_id) + for batch_tuple_id in self.batchs_in_one_epoch_id: + yield self.batchs_in_one_epoch[batch_tuple_id] + + def iter(self): + if self.shuffle: + if self.seed is not None: + random.seed(self.seed) + else: + random.seed(self.epoch) + if not self.ds_width: + random.shuffle(self.img_indices) + random.shuffle(self.img_batch_pairs) + indices_rank_i = self.img_indices[self.rank:len(self.img_indices): + self.num_replicas] + else: + indices_rank_i = self.img_indices[self.rank:len(self.img_indices): + self.num_replicas] + + start_index = 0 + batchs_in_one_epoch = [] + for batch_tuple in self.batch_list: + curr_w, curr_h, curr_bsz = batch_tuple + end_index = min(start_index + curr_bsz, self.n_samples_per_replica) + batch_ids = indices_rank_i[start_index:end_index] + n_batch_samples = len(batch_ids) + if n_batch_samples != curr_bsz: + batch_ids += indices_rank_i[:(curr_bsz - n_batch_samples)] + start_index += curr_bsz + + if len(batch_ids) > 0: + if self.ds_width: + wh_ratio_current = self.wh_ratio[self.wh_ratio_sort[ + batch_ids]] + ratio_current = wh_ratio_current.mean() + ratio_current = ratio_current if ratio_current * curr_h < self.max_w else self.max_w / curr_h + else: + ratio_current = None + batch = [(curr_w, curr_h, b_id, ratio_current) + for b_id in batch_ids] + # yield batch + batchs_in_one_epoch.append(batch) + return batchs_in_one_epoch + + def set_epoch(self, epoch: int): + self.epoch = epoch + + def __len__(self): + return self.length diff --git a/ppocr/data/simple_dataset.py b/ppocr/data/simple_dataset.py index 402f1e38fed9e32722e2dd160f10f779028807a3..f7c4c8f1a21ddb36e27fe4c1a217ce3fa9caff41 100644 --- a/ppocr/data/simple_dataset.py +++ b/ppocr/data/simple_dataset.py @@ -12,6 +12,8 @@ # See the License for 
the specific language governing permissions and # limitations under the License. import numpy as np +import cv2 +import math import os import json import random @@ -48,11 +50,31 @@ class SimpleDataSet(Dataset): self.data_idx_order_list = list(range(len(self.data_lines))) if self.mode == "train" and self.do_shuffle: self.shuffle_data_random() + + self.set_epoch_as_seed(self.seed, dataset_config) + self.ops = create_operators(dataset_config['transforms'], global_config) self.ext_op_transform_idx = dataset_config.get("ext_op_transform_idx", 2) self.need_reset = True in [x < 1 for x in ratio_list] + def set_epoch_as_seed(self, seed, dataset_config): + if self.mode == 'train': + try: + border_map_id = [index + for index, dictionary in enumerate(dataset_config['transforms']) + if 'MakeBorderMap' in dictionary][0] + shrink_map_id = [index + for index, dictionary in enumerate(dataset_config['transforms']) + if 'MakeShrinkMap' in dictionary][0] + dataset_config['transforms'][border_map_id]['MakeBorderMap'][ + 'epoch'] = seed if seed is not None else 0 + dataset_config['transforms'][shrink_map_id]['MakeShrinkMap'][ + 'epoch'] = seed if seed is not None else 0 + except Exception as E: + print(E) + return + def get_image_info_list(self, file_list, ratio_list): if isinstance(file_list, str): file_list = [file_list] @@ -149,3 +171,96 @@ class SimpleDataSet(Dataset): def __len__(self): return len(self.data_idx_order_list) + + +class MultiScaleDataSet(SimpleDataSet): + def __init__(self, config, mode, logger, seed=None): + super(MultiScaleDataSet, self).__init__(config, mode, logger, seed) + self.ds_width = config[mode]['dataset'].get('ds_width', False) + if self.ds_width: + self.wh_aware() + + def wh_aware(self): + data_line_new = [] + wh_ratio = [] + for lins in self.data_lines: + data_line_new.append(lins) + lins = lins.decode('utf-8') + name, label, w, h = lins.strip("\n").split(self.delimiter) + wh_ratio.append(float(w) / float(h)) + + self.data_lines = data_line_new + self.wh_ratio = np.array(wh_ratio) + self.wh_ratio_sort = np.argsort(self.wh_ratio) + self.data_idx_order_list = list(range(len(self.data_lines))) + + def resize_norm_img(self, data, imgW, imgH, padding=True): + img = data['image'] + h = img.shape[0] + w = img.shape[1] + if not padding: + resized_image = cv2.resize( + img, (imgW, imgH), interpolation=cv2.INTER_LINEAR) + resized_w = imgW + else: + ratio = w / float(h) + if math.ceil(imgH * ratio) > imgW: + resized_w = imgW + else: + resized_w = int(math.ceil(imgH * ratio)) + resized_image = cv2.resize(img, (resized_w, imgH)) + resized_image = resized_image.astype('float32') + + resized_image = resized_image.transpose((2, 0, 1)) / 255 + resized_image -= 0.5 + resized_image /= 0.5 + padding_im = np.zeros((3, imgH, imgW), dtype=np.float32) + padding_im[:, :, :resized_w] = resized_image + valid_ratio = min(1.0, float(resized_w / imgW)) + data['image'] = padding_im + data['valid_ratio'] = valid_ratio + return data + + def __getitem__(self, properties): + # properites is a tuple, contains (width, height, index) + img_height = properties[1] + idx = properties[2] + if self.ds_width and properties[3] is not None: + wh_ratio = properties[3] + img_width = img_height * (1 if int(round(wh_ratio)) == 0 else + int(round(wh_ratio))) + file_idx = self.wh_ratio_sort[idx] + else: + file_idx = self.data_idx_order_list[idx] + img_width = properties[0] + wh_ratio = None + + data_line = self.data_lines[file_idx] + try: + data_line = data_line.decode('utf-8') + substr = 
data_line.strip("\n").split(self.delimiter) + file_name = substr[0] + file_name = self._try_parse_filename_list(file_name) + label = substr[1] + img_path = os.path.join(self.data_dir, file_name) + data = {'img_path': img_path, 'label': label} + if not os.path.exists(img_path): + raise Exception("{} does not exist!".format(img_path)) + with open(data['img_path'], 'rb') as f: + img = f.read() + data['image'] = img + data['ext_data'] = self.get_ext_data() + outs = transform(data, self.ops[:-1]) + if outs is not None: + outs = self.resize_norm_img(outs, img_width, img_height) + outs = transform(outs, self.ops[-1:]) + except: + self.logger.error( + "When parsing line {}, error happened with msg: {}".format( + data_line, traceback.format_exc())) + outs = None + if outs is None: + # during evaluation, we should fix the idx to get same results for many times of evaluation. + rnd_idx = (idx + 1) % self.__len__() + return self.__getitem__([img_width, img_height, rnd_idx, wh_ratio]) + return outs diff --git a/ppocr/losses/__init__.py b/ppocr/losses/__init__.py index c7142e3e5e73e25764dde4631a47be939905e3be..9e6a45478e108637494db694d6a05c8db5b5a40e 100644 --- a/ppocr/losses/__init__.py +++ b/ppocr/losses/__init__.py @@ -41,6 +41,8 @@ from .rec_vl_loss import VLLoss from .rec_spin_att_loss import SPINAttentionLoss from .rec_rfl_loss import RFLLoss from .rec_can_loss import CANLoss +from .rec_satrn_loss import SATRNLoss +from .rec_nrtr_loss import NRTRLoss # cls loss from .cls_loss import ClsLoss @@ -73,7 +75,8 @@ def build_loss(config): 'CELoss', 'TableAttentionLoss', 'SARLoss', 'AsterLoss', 'SDMGRLoss', 'VQASerTokenLayoutLMLoss', 'LossFromOutput', 'PRENLoss', 'MultiLoss', 'TableMasterLoss', 'SPINAttentionLoss', 'VLLoss', 'StrokeFocusLoss', - 'SLALoss', 'CTLoss', 'RFLLoss', 'DRRGLoss', 'CANLoss', 'TelescopeLoss' + 'SLALoss', 'CTLoss', 'RFLLoss', 'DRRGLoss', 'CANLoss', 'TelescopeLoss', + 'SATRNLoss', 'NRTRLoss' ] config = copy.deepcopy(config) module_name = config.pop('name') diff --git a/ppocr/losses/basic_loss.py b/ppocr/losses/basic_loss.py index 58410b4db2157074c2cb0f7db590c84021e10ace..9ad854cd120c996e2c18c61f00718e5826b25372 100644 --- a/ppocr/losses/basic_loss.py +++ b/ppocr/losses/basic_loss.py @@ -165,3 +165,79 @@ class LossFromOutput(nn.Layer): elif self.reduction == 'sum': loss = paddle.sum(loss) return {'loss': loss} + + +class KLDivLoss(nn.Layer): + """ + KLDivLoss + """ + + def __init__(self): + super().__init__() + + def _kldiv(self, x, target, mask=None): + eps = 1.0e-10 + loss = target * (paddle.log(target + eps) - x) + if mask is not None: + loss = loss.flatten(0, 1).sum(axis=1) + loss = loss.masked_select(mask).mean() + else: + # batch mean loss + loss = paddle.sum(loss) / loss.shape[0] + return loss + + def forward(self, logits_s, logits_t, mask=None): + log_out_s = F.log_softmax(logits_s, axis=-1) + out_t = F.softmax(logits_t, axis=-1) + loss = self._kldiv(log_out_s, out_t, mask) + return loss + + +class DKDLoss(nn.Layer): + """ + KLDivLoss + """ + + def __init__(self, temperature=1.0, alpha=1.0, beta=1.0): + super().__init__() + self.temperature = temperature + self.alpha = alpha + self.beta = beta + + def _cat_mask(self, t, mask1, mask2): + t1 = (t * mask1).sum(axis=1, keepdim=True) + t2 = (t * mask2).sum(axis=1, keepdim=True) + rt = paddle.concat([t1, t2], axis=1) + return rt + + def _kl_div(self, x, label, mask=None): + y = (label * (paddle.log(label + 1e-10) - x)).sum(axis=1) + if mask is not None: + y = y.masked_select(mask).mean() + else: + y = y.mean() + return y + + def 
forward(self, logits_student, logits_teacher, target, mask=None): + gt_mask = F.one_hot( + target.reshape([-1]), num_classes=logits_student.shape[-1]) + other_mask = 1 - gt_mask + logits_student = logits_student.flatten(0, 1) + logits_teacher = logits_teacher.flatten(0, 1) + pred_student = F.softmax(logits_student / self.temperature, axis=1) + pred_teacher = F.softmax(logits_teacher / self.temperature, axis=1) + pred_student = self._cat_mask(pred_student, gt_mask, other_mask) + pred_teacher = self._cat_mask(pred_teacher, gt_mask, other_mask) + log_pred_student = paddle.log(pred_student) + tckd_loss = self._kl_div(log_pred_student, + pred_teacher) * (self.temperature**2) + pred_teacher_part2 = F.softmax( + logits_teacher / self.temperature - 1000.0 * gt_mask, axis=1) + log_pred_student_part2 = F.log_softmax( + logits_student / self.temperature - 1000.0 * gt_mask, axis=1) + nckd_loss = self._kl_div(log_pred_student_part2, + pred_teacher_part2) * (self.temperature**2) + + loss = self.alpha * tckd_loss + self.beta * nckd_loss + + return loss diff --git a/ppocr/losses/combined_loss.py b/ppocr/losses/combined_loss.py index 8d697d544b51899cdafeff94be2ecce067b907a2..a520f10ffb6a83b444fb98c7d461bbcfaf4ce14d 100644 --- a/ppocr/losses/combined_loss.py +++ b/ppocr/losses/combined_loss.py @@ -20,9 +20,9 @@ from .center_loss import CenterLoss from .ace_loss import ACELoss from .rec_sar_loss import SARLoss -from .distillation_loss import DistillationCTCLoss -from .distillation_loss import DistillationSARLoss -from .distillation_loss import DistillationDMLLoss +from .distillation_loss import DistillationCTCLoss, DistillCTCLogits +from .distillation_loss import DistillationSARLoss, DistillationNRTRLoss +from .distillation_loss import DistillationDMLLoss, DistillationKLDivLoss, DistillationDKDLoss from .distillation_loss import DistillationDistanceLoss, DistillationDBLoss, DistillationDilaDBLoss from .distillation_loss import DistillationVQASerTokenLayoutLMLoss, DistillationSERDMLLoss from .distillation_loss import DistillationLossFromOutput diff --git a/ppocr/losses/det_db_loss.py b/ppocr/losses/det_db_loss.py index 708ffbdb47f349304e2bfd781a836e79348475f4..ce31ef124591ce3e5351460eb94ca50490bcf0e5 100755 --- a/ppocr/losses/det_db_loss.py +++ b/ppocr/losses/det_db_loss.py @@ -20,6 +20,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import paddle from paddle import nn from .det_basic_loss import BalanceLoss, MaskL1Loss, DiceLoss @@ -66,11 +67,21 @@ class DBLoss(nn.Layer): label_shrink_mask) loss_shrink_maps = self.alpha * loss_shrink_maps loss_threshold_maps = self.beta * loss_threshold_maps + # CBN loss + if 'distance_maps' in predicts.keys(): + distance_maps = predicts['distance_maps'] + cbn_maps = predicts['cbn_maps'] + cbn_loss = self.bce_loss(cbn_maps[:, 0, :, :], label_shrink_map, + label_shrink_mask) + else: + dis_loss = paddle.to_tensor([0.]) + cbn_loss = paddle.to_tensor([0.]) loss_all = loss_shrink_maps + loss_threshold_maps \ + loss_binary_maps - losses = {'loss': loss_all, \ + losses = {'loss': loss_all+ cbn_loss, \ "loss_shrink_maps": loss_shrink_maps, \ "loss_threshold_maps": loss_threshold_maps, \ - "loss_binary_maps": loss_binary_maps} + "loss_binary_maps": loss_binary_maps, \ + "loss_cbn": cbn_loss} return losses diff --git a/ppocr/losses/distillation_loss.py b/ppocr/losses/distillation_loss.py index 4bfbed75a338e2bd3bca0b80d16028030bf2f0b5..5812544e91d8357c161e4faa4d0e36ce4dbd9374 100644 --- 
a/ppocr/losses/distillation_loss.py +++ b/ppocr/losses/distillation_loss.py @@ -14,12 +14,14 @@ import paddle import paddle.nn as nn +import paddle.nn.functional as F import numpy as np import cv2 from .rec_ctc_loss import CTCLoss from .rec_sar_loss import SARLoss -from .basic_loss import DMLLoss +from .rec_ce_loss import CELoss +from .basic_loss import DMLLoss, KLDivLoss, DKDLoss from .basic_loss import DistanceLoss from .basic_loss import LossFromOutput from .det_db_loss import DBLoss @@ -102,7 +104,6 @@ class DistillationDMLLoss(DMLLoss): if self.key is not None: out1 = out1[self.key] out2 = out2[self.key] - if self.maps_name is None: if self.multi_head: loss = super().forward(out1[self.dis_head], @@ -133,6 +134,449 @@ class DistillationDMLLoss(DMLLoss): return loss_dict +class DistillationKLDivLoss(KLDivLoss): + """ + """ + + def __init__(self, + model_name_pairs=[], + key=None, + multi_head=False, + dis_head='ctc', + maps_name=None, + name="kl_div"): + super().__init__() + assert isinstance(model_name_pairs, list) + self.key = key + self.multi_head = multi_head + self.dis_head = dis_head + self.model_name_pairs = self._check_model_name_pairs(model_name_pairs) + self.name = name + self.maps_name = self._check_maps_name(maps_name) + + def _check_model_name_pairs(self, model_name_pairs): + if not isinstance(model_name_pairs, list): + return [] + elif isinstance(model_name_pairs[0], list) and isinstance( + model_name_pairs[0][0], str): + return model_name_pairs + else: + return [model_name_pairs] + + def _check_maps_name(self, maps_name): + if maps_name is None: + return None + elif type(maps_name) == str: + return [maps_name] + elif type(maps_name) == list: + return [maps_name] + else: + return None + + def _slice_out(self, outs): + new_outs = {} + for k in self.maps_name: + if k == "thrink_maps": + new_outs[k] = outs[:, 0, :, :] + elif k == "threshold_maps": + new_outs[k] = outs[:, 1, :, :] + elif k == "binary_maps": + new_outs[k] = outs[:, 2, :, :] + else: + continue + return new_outs + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + if self.key is not None: + out1 = out1[self.key] + out2 = out2[self.key] + if self.maps_name is None: + if self.multi_head: + # for nrtr dml loss + max_len = batch[3].max() + tgt = batch[2][:, 1:2 + max_len] + tgt = tgt.reshape([-1]) + non_pad_mask = paddle.not_equal( + tgt, paddle.zeros( + tgt.shape, dtype=tgt.dtype)) + loss = super().forward(out1[self.dis_head], + out2[self.dis_head], non_pad_mask) + else: + loss = super().forward(out1, out2) + if isinstance(loss, dict): + for key in loss: + loss_dict["{}_{}_{}_{}".format(key, pair[0], pair[1], + idx)] = loss[key] + else: + loss_dict["{}_{}".format(self.name, idx)] = loss + else: + outs1 = self._slice_out(out1) + outs2 = self._slice_out(out2) + for _c, k in enumerate(outs1.keys()): + loss = super().forward(outs1[k], outs2[k]) + if isinstance(loss, dict): + for key in loss: + loss_dict["{}_{}_{}_{}_{}".format(key, pair[ + 0], pair[1], self.maps_name, idx)] = loss[key] + else: + loss_dict["{}_{}_{}".format(self.name, self.maps_name[ + _c], idx)] = loss + + loss_dict = _sum_loss(loss_dict) + + return loss_dict + + +class DistillationDKDLoss(DKDLoss): + """ + """ + + def __init__(self, + model_name_pairs=[], + key=None, + multi_head=False, + dis_head='ctc', + maps_name=None, + name="dkd", + temperature=1.0, + alpha=1.0, + beta=1.0): + super().__init__(temperature, alpha, beta) + assert 
isinstance(model_name_pairs, list) + self.key = key + self.multi_head = multi_head + self.dis_head = dis_head + self.model_name_pairs = self._check_model_name_pairs(model_name_pairs) + self.name = name + self.maps_name = self._check_maps_name(maps_name) + + def _check_model_name_pairs(self, model_name_pairs): + if not isinstance(model_name_pairs, list): + return [] + elif isinstance(model_name_pairs[0], list) and isinstance( + model_name_pairs[0][0], str): + return model_name_pairs + else: + return [model_name_pairs] + + def _check_maps_name(self, maps_name): + if maps_name is None: + return None + elif type(maps_name) == str: + return [maps_name] + elif type(maps_name) == list: + return [maps_name] + else: + return None + + def _slice_out(self, outs): + new_outs = {} + for k in self.maps_name: + if k == "thrink_maps": + new_outs[k] = outs[:, 0, :, :] + elif k == "threshold_maps": + new_outs[k] = outs[:, 1, :, :] + elif k == "binary_maps": + new_outs[k] = outs[:, 2, :, :] + else: + continue + return new_outs + + def forward(self, predicts, batch): + loss_dict = dict() + + for idx, pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + if self.key is not None: + out1 = out1[self.key] + out2 = out2[self.key] + if self.maps_name is None: + if self.multi_head: + # for nrtr dml loss + max_len = batch[3].max() + tgt = batch[2][:, 1:2 + + max_len] # [batch_size, max_len + 1] + + tgt = tgt.reshape([-1]) # batch_size * (max_len + 1) + non_pad_mask = paddle.not_equal( + tgt, paddle.zeros( + tgt.shape, + dtype=tgt.dtype)) # batch_size * (max_len + 1) + + loss = super().forward( + out1[self.dis_head], out2[self.dis_head], tgt, + non_pad_mask) # [batch_size, max_len + 1, num_char] + else: + loss = super().forward(out1, out2) + if isinstance(loss, dict): + for key in loss: + loss_dict["{}_{}_{}_{}".format(key, pair[0], pair[1], + idx)] = loss[key] + else: + loss_dict["{}_{}".format(self.name, idx)] = loss + else: + outs1 = self._slice_out(out1) + outs2 = self._slice_out(out2) + for _c, k in enumerate(outs1.keys()): + loss = super().forward(outs1[k], outs2[k]) + if isinstance(loss, dict): + for key in loss: + loss_dict["{}_{}_{}_{}_{}".format(key, pair[ + 0], pair[1], self.maps_name, idx)] = loss[key] + else: + loss_dict["{}_{}_{}".format(self.name, self.maps_name[ + _c], idx)] = loss + + loss_dict = _sum_loss(loss_dict) + + return loss_dict + + +class DistillationNRTRDMLLoss(DistillationDMLLoss): + """ + """ + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + if self.key is not None: + out1 = out1[self.key] + out2 = out2[self.key] + + if self.multi_head: + # for nrtr dml loss + max_len = batch[3].max() + tgt = batch[2][:, 1:2 + max_len] + tgt = tgt.reshape([-1]) + non_pad_mask = paddle.not_equal( + tgt, paddle.zeros( + tgt.shape, dtype=tgt.dtype)) + loss = super().forward(out1[self.dis_head], out2[self.dis_head], + non_pad_mask) + else: + loss = super().forward(out1, out2) + if isinstance(loss, dict): + for key in loss: + loss_dict["{}_{}_{}_{}".format(key, pair[0], pair[1], + idx)] = loss[key] + else: + loss_dict["{}_{}".format(self.name, idx)] = loss + + loss_dict = _sum_loss(loss_dict) + + return loss_dict + + +class DistillationKLDivLoss(KLDivLoss): + """ + """ + + def __init__(self, + model_name_pairs=[], + key=None, + multi_head=False, + dis_head='ctc', + maps_name=None, + name="kl_div"): + super().__init__() + assert 
isinstance(model_name_pairs, list) + self.key = key + self.multi_head = multi_head + self.dis_head = dis_head + self.model_name_pairs = self._check_model_name_pairs(model_name_pairs) + self.name = name + self.maps_name = self._check_maps_name(maps_name) + + def _check_model_name_pairs(self, model_name_pairs): + if not isinstance(model_name_pairs, list): + return [] + elif isinstance(model_name_pairs[0], list) and isinstance( + model_name_pairs[0][0], str): + return model_name_pairs + else: + return [model_name_pairs] + + def _check_maps_name(self, maps_name): + if maps_name is None: + return None + elif type(maps_name) == str: + return [maps_name] + elif type(maps_name) == list: + return [maps_name] + else: + return None + + def _slice_out(self, outs): + new_outs = {} + for k in self.maps_name: + if k == "thrink_maps": + new_outs[k] = outs[:, 0, :, :] + elif k == "threshold_maps": + new_outs[k] = outs[:, 1, :, :] + elif k == "binary_maps": + new_outs[k] = outs[:, 2, :, :] + else: + continue + return new_outs + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + if self.key is not None: + out1 = out1[self.key] + out2 = out2[self.key] + if self.maps_name is None: + if self.multi_head: + # for nrtr dml loss + max_len = batch[3].max() + tgt = batch[2][:, 1:2 + max_len] + tgt = tgt.reshape([-1]) + non_pad_mask = paddle.not_equal( + tgt, paddle.zeros( + tgt.shape, dtype=tgt.dtype)) + loss = super().forward(out1[self.dis_head], + out2[self.dis_head], non_pad_mask) + else: + loss = super().forward(out1, out2) + if isinstance(loss, dict): + for key in loss: + loss_dict["{}_{}_{}_{}".format(key, pair[0], pair[1], + idx)] = loss[key] + else: + loss_dict["{}_{}".format(self.name, idx)] = loss + else: + outs1 = self._slice_out(out1) + outs2 = self._slice_out(out2) + for _c, k in enumerate(outs1.keys()): + loss = super().forward(outs1[k], outs2[k]) + if isinstance(loss, dict): + for key in loss: + loss_dict["{}_{}_{}_{}_{}".format(key, pair[ + 0], pair[1], self.maps_name, idx)] = loss[key] + else: + loss_dict["{}_{}_{}".format(self.name, self.maps_name[ + _c], idx)] = loss + + loss_dict = _sum_loss(loss_dict) + + return loss_dict + + +class DistillationDKDLoss(DKDLoss): + """ + """ + + def __init__(self, + model_name_pairs=[], + key=None, + multi_head=False, + dis_head='ctc', + maps_name=None, + name="dkd", + temperature=1.0, + alpha=1.0, + beta=1.0): + super().__init__(temperature, alpha, beta) + assert isinstance(model_name_pairs, list) + self.key = key + self.multi_head = multi_head + self.dis_head = dis_head + self.model_name_pairs = self._check_model_name_pairs(model_name_pairs) + self.name = name + self.maps_name = self._check_maps_name(maps_name) + + def _check_model_name_pairs(self, model_name_pairs): + if not isinstance(model_name_pairs, list): + return [] + elif isinstance(model_name_pairs[0], list) and isinstance( + model_name_pairs[0][0], str): + return model_name_pairs + else: + return [model_name_pairs] + + def _check_maps_name(self, maps_name): + if maps_name is None: + return None + elif type(maps_name) == str: + return [maps_name] + elif type(maps_name) == list: + return [maps_name] + else: + return None + + def _slice_out(self, outs): + new_outs = {} + for k in self.maps_name: + if k == "thrink_maps": + new_outs[k] = outs[:, 0, :, :] + elif k == "threshold_maps": + new_outs[k] = outs[:, 1, :, :] + elif k == "binary_maps": + new_outs[k] = outs[:, 2, :, :] + else: + 
continue + return new_outs + + def forward(self, predicts, batch): + loss_dict = dict() + + for idx, pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + if self.key is not None: + out1 = out1[self.key] + out2 = out2[self.key] + if self.maps_name is None: + if self.multi_head: + # for nrtr dml loss + max_len = batch[3].max() + tgt = batch[2][:, 1:2 + + max_len] # [batch_size, max_len + 1] + + tgt = tgt.reshape([-1]) # batch_size * (max_len + 1) + non_pad_mask = paddle.not_equal( + tgt, paddle.zeros( + tgt.shape, + dtype=tgt.dtype)) # batch_size * (max_len + 1) + + loss = super().forward( + out1[self.dis_head], out2[self.dis_head], tgt, + non_pad_mask) # [batch_size, max_len + 1, num_char] + else: + loss = super().forward(out1, out2) + if isinstance(loss, dict): + for key in loss: + loss_dict["{}_{}_{}_{}".format(key, pair[0], pair[1], + idx)] = loss[key] + else: + loss_dict["{}_{}".format(self.name, idx)] = loss + else: + outs1 = self._slice_out(out1) + outs2 = self._slice_out(out2) + for _c, k in enumerate(outs1.keys()): + loss = super().forward(outs1[k], outs2[k]) + if isinstance(loss, dict): + for key in loss: + loss_dict["{}_{}_{}_{}_{}".format(key, pair[ + 0], pair[1], self.maps_name, idx)] = loss[key] + else: + loss_dict["{}_{}_{}".format(self.name, self.maps_name[ + _c], idx)] = loss + + loss_dict = _sum_loss(loss_dict) + + return loss_dict + + class DistillationCTCLoss(CTCLoss): def __init__(self, model_name_list=[], @@ -199,6 +643,40 @@ class DistillationSARLoss(SARLoss): return loss_dict +class DistillationNRTRLoss(CELoss): + def __init__(self, + model_name_list=[], + key=None, + multi_head=False, + smoothing=True, + name="loss_nrtr", + **kwargs): + super().__init__(smoothing=smoothing) + self.model_name_list = model_name_list + self.key = key + self.name = name + self.multi_head = multi_head + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, model_name in enumerate(self.model_name_list): + out = predicts[model_name] + if self.key is not None: + out = out[self.key] + if self.multi_head: + assert 'gtc' in out, 'multi head has multi out' + loss = super().forward(out['gtc'], batch[:1] + batch[2:]) + else: + loss = super().forward(out, batch) + if isinstance(loss, dict): + for key in loss: + loss_dict["{}_{}_{}".format(self.name, model_name, + idx)] = loss[key] + else: + loss_dict["{}_{}".format(self.name, model_name)] = loss + return loss_dict + + class DistillationDBLoss(DBLoss): def __init__(self, model_name_list=[], @@ -459,3 +937,212 @@ class DistillationVQADistanceLoss(DistanceLoss): loss_dict["{}_{}_{}_{}".format(self.name, pair[0], pair[1], idx)] = loss return loss_dict + + +class CTCDKDLoss(nn.Layer): + """ + KLDivLoss + """ + + def __init__(self, temperature=0.5, alpha=1.0, beta=1.0): + super().__init__() + self.temperature = temperature + self.alpha = alpha + self.beta = beta + self.eps = 1e-6 + self.t = temperature + self.act = nn.Softmax(axis=-1) + self.use_log = True + + def kl_loss(self, p1, p2): # predict, label + loss = paddle.multiply( + p2, paddle.log((p2 + self.eps) / (p1 + self.eps) + self.eps)) + bs = loss.shape[0] + loss = paddle.sum(loss) / bs + return loss + + def _cat_mask(self, t, mask1, mask2): + t1 = (t * mask1).sum(axis=1, keepdim=True) + t2 = (t * mask2).sum(axis=1, keepdim=True) + rt = paddle.concat([t1, t2], axis=1) + return rt + + def multi_label_mask(self, targets): + + targets = targets.astype("int32") + res = F.one_hot(targets, num_classes=11465) + mask = paddle.clip(paddle.sum(res, 
axis=1), 0, 1) + mask[:, 0] = 0 # ingore ctc blank label + return mask + + def forward(self, logits_student, logits_teacher, targets, mask=None): + + gt_mask = self.multi_label_mask(targets) + other_mask = paddle.ones_like(gt_mask) - gt_mask + + pred_student = F.softmax(logits_student / self.temperature, axis=-1) + pred_teacher = F.softmax(logits_teacher / self.temperature, axis=-1) + + # differents with dkd + pred_student = paddle.mean(pred_student, axis=1) + pred_teacher = paddle.mean(pred_teacher, axis=1) + + pred_student = self._cat_mask(pred_student, gt_mask, other_mask) + pred_teacher = self._cat_mask(pred_teacher, gt_mask, other_mask) + + # differents with dkd + tckd_loss = self.kl_loss(pred_student, pred_teacher) + + gt_mask_ex = paddle.expand_as(gt_mask.unsqueeze(axis=1), logits_teacher) + pred_teacher_part2 = F.softmax( + logits_teacher / self.temperature - 1000.0 * gt_mask_ex, axis=-1) + pred_student_part2 = F.softmax( + logits_student / self.temperature - 1000.0 * gt_mask_ex, axis=-1) + # differents with dkd + pred_teacher_part2 = paddle.mean(pred_teacher_part2, axis=1) + pred_student_part2 = paddle.mean(pred_student_part2, axis=1) + + # differents with dkd + nckd_loss = self.kl_loss(pred_student_part2, pred_teacher_part2) + loss = self.alpha * tckd_loss + self.beta * nckd_loss + return loss + + +class KLCTCLogits(nn.Layer): + def __init__(self, weight=1.0, reduction='mean', mode="mean"): + super().__init__() + self.weight = weight + self.reduction = reduction + self.eps = 1e-6 + self.t = 0.5 + self.act = nn.Softmax(axis=-1) + self.use_log = True + self.mode = mode + self.ctc_dkd_loss = CTCDKDLoss() + + def kl_loss(self, p1, p2): # predict, label + loss = paddle.multiply( + p2, paddle.log((p2 + self.eps) / (p1 + self.eps) + self.eps)) + bs = loss.shape[0] + loss = paddle.sum(loss) / bs + return loss + + def forward_meanmax(self, stu_out, tea_out): + + stu_out = paddle.mean(F.softmax(stu_out / self.t, axis=-1), axis=1) + tea_out = paddle.mean(F.softmax(tea_out / self.t, axis=-1), axis=1) + loss = self.kl_loss(stu_out, tea_out) + + return loss + + def forward_meanlog(self, stu_out, tea_out): + stu_out = paddle.mean(F.softmax(stu_out / self.t, axis=-1), axis=1) + tea_out = paddle.mean(F.softmax(tea_out / self.t, axis=-1), axis=1) + if self.use_log is True: + # for recognition distillation, log is needed for feature map + log_out1 = paddle.log(stu_out) + log_out2 = paddle.log(tea_out) + loss = ( + self._kldiv(log_out1, tea_out) + self._kldiv(log_out2, stu_out) + ) / 2.0 + + return loss + + def forward_sum(self, stu_out, tea_out): + stu_out = paddle.sum(F.softmax(stu_out / self.t, axis=-1), axis=1) + tea_out = paddle.sum(F.softmax(tea_out / self.t, axis=-1), axis=1) + stu_out = paddle.log(stu_out) + bs = stu_out.shape[0] + loss = tea_out * (paddle.log(tea_out + self.eps) - stu_out) + loss = paddle.sum(loss, axis=1) / loss.shape[0] + return loss + + def _kldiv(self, x, target): + eps = 1.0e-10 + loss = target * (paddle.log(target + eps) - x) + loss = paddle.sum(paddle.mean(loss, axis=1)) / loss.shape[0] + return loss + + def forward(self, stu_out, tea_out, targets=None): + if self.mode == "log": + return self.forward_log(stu_out, tea_out) + elif self.mode == "mean": + blank_mask = paddle.ones_like(stu_out) + blank_mask.stop_gradient = True + blank_mask[:, :, 0] = -1 + stu_out *= blank_mask + tea_out *= blank_mask + return self.forward_meanmax(stu_out, tea_out) + elif self.mode == "sum": + return self.forward_sum(stu_out, tea_out) + elif self.mode == "meanlog": + blank_mask = 
paddle.ones_like(stu_out) + blank_mask.stop_gradient = True + blank_mask[:, :, 0] = -1 + stu_out *= blank_mask + tea_out *= blank_mask + return self.forward_meanlog(stu_out, tea_out) + elif self.mode == "ctcdkd": + # ingore ctc blank logits + blank_mask = paddle.ones_like(stu_out) + blank_mask.stop_gradient = True + blank_mask[:, :, 0] = -1 + stu_out *= blank_mask + tea_out *= blank_mask + return self.ctc_dkd_loss(stu_out, tea_out, targets) + else: + raise ValueError("error!!!!!!") + + def forward_log(self, out1, out2): + if self.act is not None: + out1 = self.act(out1) + 1e-10 + out2 = self.act(out2) + 1e-10 + if self.use_log is True: + # for recognition distillation, log is needed for feature map + log_out1 = paddle.log(out1) + log_out2 = paddle.log(out2) + loss = ( + self._kldiv(log_out1, out2) + self._kldiv(log_out2, out1)) / 2.0 + + return loss + + +class DistillCTCLogits(KLCTCLogits): + def __init__(self, + model_name_pairs=[], + key=None, + name="ctc_logits", + reduction="mean"): + super().__init__(reduction=reduction) + self.model_name_pairs = self._check_model_name_pairs(model_name_pairs) + self.key = key + self.name = name + + def _check_model_name_pairs(self, model_name_pairs): + if not isinstance(model_name_pairs, list): + return [] + elif isinstance(model_name_pairs[0], list) and isinstance( + model_name_pairs[0][0], str): + return model_name_pairs + else: + return [model_name_pairs] + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + + if self.key is not None: + out1 = out1[self.key]['ctc'] + out2 = out2[self.key]['ctc'] + + ctc_label = batch[1] + loss = super().forward(out1, out2, ctc_label) + if isinstance(loss, dict): + for key in loss: + loss_dict["{}_{}_{}".format(self.name, model_name, + idx)] = loss[key] + else: + loss_dict["{}_{}".format(self.name, idx)] = loss + return loss_dict diff --git a/ppocr/losses/rec_aster_loss.py b/ppocr/losses/rec_aster_loss.py index 52605e46db35339cc22f7f1e6642456bfaf02f11..9b0a34eeac57089ae1d45ad9d8c0427b234c50c9 100644 --- a/ppocr/losses/rec_aster_loss.py +++ b/ppocr/losses/rec_aster_loss.py @@ -28,7 +28,7 @@ class CosineEmbeddingLoss(nn.Layer): def forward(self, x1, x2, target): similarity = paddle.sum( - x1 * x2, dim=-1) / (paddle.norm( + x1 * x2, axis=-1) / (paddle.norm( x1, axis=-1) * paddle.norm( x2, axis=-1) + self.epsilon) one_list = paddle.full_like(target, fill_value=1) diff --git a/ppocr/losses/rec_multi_loss.py b/ppocr/losses/rec_multi_loss.py index 09f007afe6303e83b9a6948df553ec0fca8b6b2d..4f9365750b29368b6fa70e300e6d1b6562ccd4db 100644 --- a/ppocr/losses/rec_multi_loss.py +++ b/ppocr/losses/rec_multi_loss.py @@ -21,6 +21,7 @@ from paddle import nn from .rec_ctc_loss import CTCLoss from .rec_sar_loss import SARLoss +from .rec_nrtr_loss import NRTRLoss class MultiLoss(nn.Layer): @@ -30,7 +31,6 @@ class MultiLoss(nn.Layer): self.loss_list = kwargs.pop('loss_config_list') self.weight_1 = kwargs.get('weight_1', 1.0) self.weight_2 = kwargs.get('weight_2', 1.0) - self.gtc_loss = kwargs.get('gtc_loss', 'sar') for loss_info in self.loss_list: for name, param in loss_info.items(): if param is not None: @@ -49,6 +49,9 @@ class MultiLoss(nn.Layer): elif name == 'SARLoss': loss = loss_func(predicts['sar'], batch[:1] + batch[2:])['loss'] * self.weight_2 + elif name == 'NRTRLoss': + loss = loss_func(predicts['nrtr'], + batch[:1] + batch[2:])['loss'] * self.weight_2 else: raise NotImplementedError( '{} is not supported in 
MultiLoss yet'.format(name)) diff --git a/ppocr/losses/rec_nrtr_loss.py b/ppocr/losses/rec_nrtr_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..fbd397fbf0be638d9b43ae5abb36ae1cfc3bb9eb --- /dev/null +++ b/ppocr/losses/rec_nrtr_loss.py @@ -0,0 +1,32 @@ +import paddle +from paddle import nn +import paddle.nn.functional as F + + +class NRTRLoss(nn.Layer): + def __init__(self, smoothing=True, ignore_index=0, **kwargs): + super(NRTRLoss, self).__init__() + if ignore_index >= 0 and not smoothing: + self.loss_func = nn.CrossEntropyLoss( + reduction='mean', ignore_index=ignore_index) + self.smoothing = smoothing + + def forward(self, pred, batch): + max_len = batch[2].max() + tgt = batch[1][:, 1:2 + max_len] + pred = pred.reshape([-1, pred.shape[2]]) + tgt = tgt.reshape([-1]) + if self.smoothing: + eps = 0.1 + n_class = pred.shape[1] + one_hot = F.one_hot(tgt, pred.shape[1]) + one_hot = one_hot * (1 - eps) + (1 - one_hot) * eps / (n_class - 1) + log_prb = F.log_softmax(pred, axis=1) + non_pad_mask = paddle.not_equal( + tgt, paddle.zeros( + tgt.shape, dtype=tgt.dtype)) + loss = -(one_hot * log_prb).sum(axis=1) + loss = loss.masked_select(non_pad_mask).mean() + else: + loss = self.loss_func(pred, tgt) + return {'loss': loss} diff --git a/ppocr/losses/rec_satrn_loss.py b/ppocr/losses/rec_satrn_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..fc7b517878d5349154fa6a9c6e05fe6d45a00dd7 --- /dev/null +++ b/ppocr/losses/rec_satrn_loss.py @@ -0,0 +1,46 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
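# --- Editorial example (not part of the patch): exercising the NRTRLoss added above on
# dummy data. The shapes are illustrative assumptions chosen to satisfy the slicing in
# NRTRLoss.forward: batch[1] holds target ids with a start token at position 0, batch[2]
# holds the unpadded target lengths, and the decoder emits one logit row per target step.
import paddle
from ppocr.losses.rec_nrtr_loss import NRTRLoss

loss_fn = NRTRLoss(smoothing=True, ignore_index=0)
batch_size, seq_len, num_classes = 2, 10, 40
pred = paddle.randn([batch_size, seq_len, num_classes])          # decoder logits
tgt = paddle.randint(1, num_classes, [batch_size, seq_len + 1])  # [start] + label ids
lengths = paddle.to_tensor([seq_len - 1, seq_len - 1])           # so 1 + max_len == seq_len
print(loss_fn(pred, [None, tgt, lengths])['loss'])               # scalar label-smoothed CE
# --- end editorial example ---------------------------------------------------------------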
+""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/1.x/mmocr/models/textrecog/module_losses/ce_module_loss.py +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn + + +class SATRNLoss(nn.Layer): + def __init__(self, **kwargs): + super(SATRNLoss, self).__init__() + ignore_index = kwargs.get('ignore_index', 92) # 6626 + self.loss_func = paddle.nn.loss.CrossEntropyLoss( + reduction="none", ignore_index=ignore_index) + + def forward(self, predicts, batch): + predict = predicts[:, : + -1, :] # ignore last index of outputs to be in same seq_len with targets + label = batch[1].astype( + "int64")[:, 1:] # ignore first index of target in loss calculation + batch_size, num_steps, num_classes = predict.shape[0], predict.shape[ + 1], predict.shape[2] + assert len(label.shape) == len(list(predict.shape)) - 1, \ + "The target's shape and inputs's shape is [N, d] and [N, num_steps]" + + inputs = paddle.reshape(predict, [-1, num_classes]) + targets = paddle.reshape(label, [-1]) + loss = self.loss_func(inputs, targets) + return {'loss': loss.mean()} diff --git a/ppocr/modeling/architectures/__init__.py b/ppocr/modeling/architectures/__init__.py index 1c955ef3abe9c38e816616cc9b5399c6832aa5f1..00220d28de9387859102d6cf9bc4bb66c923a3fe 100755 --- a/ppocr/modeling/architectures/__init__.py +++ b/ppocr/modeling/architectures/__init__.py @@ -38,9 +38,11 @@ def build_model(config): def apply_to_static(model, config, logger): if config["Global"].get("to_static", False) is not True: return model - assert "image_shape" in config[ - "Global"], "image_shape must be assigned for static training mode..." - supported_list = ["DB", "SVTR"] + assert "d2s_train_image_shape" in config[ + "Global"], "d2s_train_image_shape must be assigned for static training mode..." 
+ supported_list = [ + "DB", "SVTR_LCNet", "TableMaster", "LayoutXLM", "SLANet", "SVTR" + ] if config["Architecture"]["algorithm"] in ["Distillation"]: algo = list(config["Architecture"]["Models"].values())[0]["algorithm"] else: @@ -49,10 +51,10 @@ def apply_to_static(model, config, logger): specs = [ InputSpec( - [None] + config["Global"]["image_shape"], dtype='float32') + [None] + config["Global"]["d2s_train_image_shape"], dtype='float32') ] - if algo == "SVTR": + if algo == "SVTR_LCNet": specs.append([ InputSpec( [None, config["Global"]["max_text_length"]], @@ -62,7 +64,55 @@ def apply_to_static(model, config, logger): [None], dtype='int64'), InputSpec( [None], dtype='float64') ]) - + elif algo == "TableMaster": + specs.append( + [ + InputSpec( + [None, config["Global"]["max_text_length"]], dtype='int64'), + InputSpec( + [None, config["Global"]["max_text_length"], 4], + dtype='float32'), + InputSpec( + [None, config["Global"]["max_text_length"], 1], + dtype='float32'), + InputSpec( + [None, 6], dtype='float32'), + ]) + elif algo == "LayoutXLM": + specs = [[ + InputSpec( + shape=[None, 512], dtype="int64"), # input_ids + InputSpec( + shape=[None, 512, 4], dtype="int64"), # bbox + InputSpec( + shape=[None, 512], dtype="int64"), # attention_mask + InputSpec( + shape=[None, 512], dtype="int64"), # token_type_ids + InputSpec( + shape=[None, 3, 224, 224], dtype="float32"), # image + InputSpec( + shape=[None, 512], dtype="int64"), # label + ]] + elif algo == "SLANet": + specs.append([ + InputSpec( + [None, config["Global"]["max_text_length"] + 2], dtype='int64'), + InputSpec( + [None, config["Global"]["max_text_length"] + 2, 4], + dtype='float32'), + InputSpec( + [None, config["Global"]["max_text_length"] + 2, 1], + dtype='float32'), + InputSpec( + [None, 6], dtype='float64'), + ]) + elif algo == "SVTR": + specs.append([ + InputSpec( + [None, config["Global"]["max_text_length"]], dtype='int64'), + InputSpec( + [None], dtype='int64') + ]) model = to_static(model, input_spec=specs) logger.info("Successfully to apply @to_static with specs: {}".format(specs)) return model diff --git a/ppocr/modeling/backbones/__init__.py b/ppocr/modeling/backbones/__init__.py index e2c2e9c4a4ed526b36d512d824ae8a8a701c17bc..873e8f6de1249bc8f76c4b720b1555d794ba9c4c 100755 --- a/ppocr/modeling/backbones/__init__.py +++ b/ppocr/modeling/backbones/__init__.py @@ -22,8 +22,11 @@ def build_backbone(config, model_type): from .det_resnet_vd import ResNet_vd from .det_resnet_vd_sast import ResNet_SAST from .det_pp_lcnet import PPLCNet + from .rec_lcnetv3 import PPLCNetV3 + from .rec_hgnet import PPHGNet_small support_dict = [ - "MobileNetV3", "ResNet", "ResNet_vd", "ResNet_SAST", "PPLCNet" + "MobileNetV3", "ResNet", "ResNet_vd", "ResNet_SAST", "PPLCNet", + "PPLCNetV3", "PPHGNet_small" ] if model_type == "table": from .table_master_resnet import TableResNetExtra @@ -44,11 +47,14 @@ def build_backbone(config, model_type): from .rec_vitstr import ViTSTR from .rec_resnet_rfl import ResNetRFL from .rec_densenet import DenseNet + from .rec_shallow_cnn import ShallowCNN + from .rec_lcnetv3 import PPLCNetV3 + from .rec_hgnet import PPHGNet_small support_dict = [ 'MobileNetV1Enhance', 'MobileNetV3', 'ResNet', 'ResNetFPN', 'MTB', 'ResNet31', 'ResNet45', 'ResNet_ASTER', 'MicroNet', 'EfficientNetb3_PREN', 'SVTRNet', 'ViTSTR', 'ResNet32', 'ResNetRFL', - 'DenseNet' + 'DenseNet', 'ShallowCNN', 'PPLCNetV3', 'PPHGNet_small' ] elif model_type == 'e2e': from .e2e_resnet_vd_pg import ResNet diff --git a/ppocr/modeling/backbones/rec_hgnet.py 
b/ppocr/modeling/backbones/rec_hgnet.py new file mode 100644 index 0000000000000000000000000000000000000000..d990453308a47f3e68f2d899c01edf3ecbdae8db --- /dev/null +++ b/ppocr/modeling/backbones/rec_hgnet.py @@ -0,0 +1,350 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import KaimingNormal, Constant +from paddle.nn import Conv2D, BatchNorm2D, ReLU, AdaptiveAvgPool2D, MaxPool2D +from paddle.regularizer import L2Decay +from paddle import ParamAttr + +kaiming_normal_ = KaimingNormal() +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + + +class ConvBNAct(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + groups=1, + use_act=True): + super().__init__() + self.use_act = use_act + self.conv = Conv2D( + in_channels, + out_channels, + kernel_size, + stride, + padding=(kernel_size - 1) // 2, + groups=groups, + bias_attr=False) + self.bn = BatchNorm2D( + out_channels, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + if self.use_act: + self.act = ReLU() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.use_act: + x = self.act(x) + return x + + +class ESEModule(nn.Layer): + def __init__(self, channels): + super().__init__() + self.avg_pool = AdaptiveAvgPool2D(1) + self.conv = Conv2D( + in_channels=channels, + out_channels=channels, + kernel_size=1, + stride=1, + padding=0) + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + identity = x + x = self.avg_pool(x) + x = self.conv(x) + x = self.sigmoid(x) + return paddle.multiply(x=identity, y=x) + + +class HG_Block(nn.Layer): + def __init__( + self, + in_channels, + mid_channels, + out_channels, + layer_num, + identity=False, ): + super().__init__() + self.identity = identity + + self.layers = nn.LayerList() + self.layers.append( + ConvBNAct( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=3, + stride=1)) + for _ in range(layer_num - 1): + self.layers.append( + ConvBNAct( + in_channels=mid_channels, + out_channels=mid_channels, + kernel_size=3, + stride=1)) + + # feature aggregation + total_channels = in_channels + layer_num * mid_channels + self.aggregation_conv = ConvBNAct( + in_channels=total_channels, + out_channels=out_channels, + kernel_size=1, + stride=1) + self.att = ESEModule(out_channels) + + def forward(self, x): + identity = x + output = [] + output.append(x) + for layer in self.layers: + x = layer(x) + output.append(x) + x = paddle.concat(output, axis=1) + x = self.aggregation_conv(x) + x = self.att(x) + if self.identity: + x += identity + return x + + +class HG_Stage(nn.Layer): + def __init__(self, + in_channels, + mid_channels, + out_channels, + block_num, + layer_num, + downsample=True, + stride=[2, 1]): + super().__init__() + self.downsample = downsample + if downsample: + self.downsample = ConvBNAct( + in_channels=in_channels, + 
out_channels=in_channels, + kernel_size=3, + stride=stride, + groups=in_channels, + use_act=False) + + blocks_list = [] + blocks_list.append( + HG_Block( + in_channels, + mid_channels, + out_channels, + layer_num, + identity=False)) + for _ in range(block_num - 1): + blocks_list.append( + HG_Block( + out_channels, + mid_channels, + out_channels, + layer_num, + identity=True)) + self.blocks = nn.Sequential(*blocks_list) + + def forward(self, x): + if self.downsample: + x = self.downsample(x) + x = self.blocks(x) + return x + + +class PPHGNet(nn.Layer): + """ + PPHGNet + Args: + stem_channels: list. Stem channel list of PPHGNet. + stage_config: dict. The configuration of each stage of PPHGNet. such as the number of channels, stride, etc. + layer_num: int. Number of layers of HG_Block. + use_last_conv: boolean. Whether to use a 1x1 convolutional layer before the classification layer. + class_expand: int=2048. Number of channels for the last 1x1 convolutional layer. + dropout_prob: float. Parameters of dropout, 0.0 means dropout is not used. + class_num: int=1000. The number of classes. + Returns: + model: nn.Layer. Specific PPHGNet model depends on args. + """ + + def __init__( + self, + stem_channels, + stage_config, + layer_num, + in_channels=3, + det=False, + out_indices=None, ): + super().__init__() + self.det = det + self.out_indices = out_indices if out_indices is not None else [ + 0, 1, 2, 3 + ] + + # stem + stem_channels.insert(0, in_channels) + self.stem = nn.Sequential(* [ + ConvBNAct( + in_channels=stem_channels[i], + out_channels=stem_channels[i + 1], + kernel_size=3, + stride=2 if i == 0 else 1) for i in range( + len(stem_channels) - 1) + ]) + + if self.det: + self.pool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) + # stages + self.stages = nn.LayerList() + self.out_channels = [] + for block_id, k in enumerate(stage_config): + in_channels, mid_channels, out_channels, block_num, downsample, stride = stage_config[ + k] + self.stages.append( + HG_Stage(in_channels, mid_channels, out_channels, block_num, + layer_num, downsample, stride)) + if block_id in self.out_indices: + self.out_channels.append(out_channels) + + if not self.det: + self.out_channels = stage_config["stage4"][2] + + self._init_weights() + + def _init_weights(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + kaiming_normal_(m.weight) + elif isinstance(m, (nn.BatchNorm2D)): + ones_(m.weight) + zeros_(m.bias) + elif isinstance(m, nn.Linear): + zeros_(m.bias) + + def forward(self, x): + x = self.stem(x) + if self.det: + x = self.pool(x) + + out = [] + for i, stage in enumerate(self.stages): + x = stage(x) + if self.det and i in self.out_indices: + out.append(x) + if self.det: + return out + + if self.training: + x = F.adaptive_avg_pool2d(x, [1, 40]) + else: + x = F.avg_pool2d(x, [3, 2]) + return x + + +def PPHGNet_tiny(pretrained=False, use_ssld=False, **kwargs): + """ + PPHGNet_tiny + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPHGNet_tiny` model depends on args. 
+ """ + stage_config = { + # in_channels, mid_channels, out_channels, blocks, downsample + "stage1": [96, 96, 224, 1, False, [2, 1]], + "stage2": [224, 128, 448, 1, True, [1, 2]], + "stage3": [448, 160, 512, 2, True, [2, 1]], + "stage4": [512, 192, 768, 1, True, [2, 1]], + } + + model = PPHGNet( + stem_channels=[48, 48, 96], + stage_config=stage_config, + layer_num=5, + **kwargs) + return model + + +def PPHGNet_small(pretrained=False, use_ssld=False, det=False, **kwargs): + """ + PPHGNet_small + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPHGNet_small` model depends on args. + """ + stage_config_det = { + # in_channels, mid_channels, out_channels, blocks, downsample + "stage1": [128, 128, 256, 1, False, 2], + "stage2": [256, 160, 512, 1, True, 2], + "stage3": [512, 192, 768, 2, True, 2], + "stage4": [768, 224, 1024, 1, True, 2], + } + + stage_config_rec = { + # in_channels, mid_channels, out_channels, blocks, downsample + "stage1": [128, 128, 256, 1, True, [2, 1]], + "stage2": [256, 160, 512, 1, True, [1, 2]], + "stage3": [512, 192, 768, 2, True, [2, 1]], + "stage4": [768, 224, 1024, 1, True, [2, 1]], + } + + model = PPHGNet( + stem_channels=[64, 64, 128], + stage_config=stage_config_det if det else stage_config_rec, + layer_num=6, + det=det, + **kwargs) + return model + + +def PPHGNet_base(pretrained=False, use_ssld=True, **kwargs): + """ + PPHGNet_base + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPHGNet_base` model depends on args. + """ + stage_config = { + # in_channels, mid_channels, out_channels, blocks, downsample + "stage1": [160, 192, 320, 1, False, [2, 1]], + "stage2": [320, 224, 640, 2, True, [1, 2]], + "stage3": [640, 256, 960, 3, True, [2, 1]], + "stage4": [960, 288, 1280, 2, True, [2, 1]], + } + + model = PPHGNet( + stem_channels=[96, 96, 160], + stage_config=stage_config, + layer_num=7, + dropout_prob=0.2, + **kwargs) + return model diff --git a/ppocr/modeling/backbones/rec_lcnetv3.py b/ppocr/modeling/backbones/rec_lcnetv3.py new file mode 100644 index 0000000000000000000000000000000000000000..ab0951761d0c13d9ad8e0884118086a93f39269b --- /dev/null +++ b/ppocr/modeling/backbones/rec_lcnetv3.py @@ -0,0 +1,491 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn.initializer import Constant, KaimingNormal +from paddle.nn import AdaptiveAvgPool2D, BatchNorm2D, Conv2D, Dropout, Hardsigmoid, Hardswish, Identity, Linear, ReLU +from paddle.regularizer import L2Decay + +NET_CONFIG_det = { + "blocks2": + #k, in_c, out_c, s, use_se + [[3, 16, 32, 1, False]], + "blocks3": [[3, 32, 64, 2, False], [3, 64, 64, 1, False]], + "blocks4": [[3, 64, 128, 2, False], [3, 128, 128, 1, False]], + "blocks5": + [[3, 128, 256, 2, False], [5, 256, 256, 1, False], [5, 256, 256, 1, False], + [5, 256, 256, 1, False], [5, 256, 256, 1, False]], + "blocks6": [[5, 256, 512, 2, True], [5, 512, 512, 1, True], + [5, 512, 512, 1, False], [5, 512, 512, 1, False]] +} + +NET_CONFIG_rec = { + "blocks2": + #k, in_c, out_c, s, use_se + [[3, 16, 32, 1, False]], + "blocks3": [[3, 32, 64, 1, False], [3, 64, 64, 1, False]], + "blocks4": [[3, 64, 128, (2, 1), False], [3, 128, 128, 1, False]], + "blocks5": + [[3, 128, 256, (1, 2), False], [5, 256, 256, 1, False], + [5, 256, 256, 1, False], [5, 256, 256, 1, False], [5, 256, 256, 1, False]], + "blocks6": [[5, 256, 512, (2, 1), True], [5, 512, 512, 1, True], + [5, 512, 512, (2, 1), False], [5, 512, 512, 1, False]] +} + + +def make_divisible(v, divisor=16, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class LearnableAffineBlock(nn.Layer): + def __init__(self, scale_value=1.0, bias_value=0.0, lr_mult=1.0, + lab_lr=0.1): + super().__init__() + self.scale = self.create_parameter( + shape=[1, ], + default_initializer=Constant(value=scale_value), + attr=ParamAttr(learning_rate=lr_mult * lab_lr)) + self.add_parameter("scale", self.scale) + self.bias = self.create_parameter( + shape=[1, ], + default_initializer=Constant(value=bias_value), + attr=ParamAttr(learning_rate=lr_mult * lab_lr)) + self.add_parameter("bias", self.bias) + + def forward(self, x): + return self.scale * x + self.bias + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + groups=1, + lr_mult=1.0): + super().__init__() + self.conv = Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr( + initializer=KaimingNormal(), learning_rate=lr_mult), + bias_attr=False) + + self.bn = BatchNorm2D( + out_channels, + weight_attr=ParamAttr( + regularizer=L2Decay(0.0), learning_rate=lr_mult), + bias_attr=ParamAttr( + regularizer=L2Decay(0.0), learning_rate=lr_mult)) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return x + + +class Act(nn.Layer): + def __init__(self, act="hswish", lr_mult=1.0, lab_lr=0.1): + super().__init__() + if act == "hswish": + self.act = Hardswish() + else: + assert act == "relu" + self.act = ReLU() + self.lab = LearnableAffineBlock(lr_mult=lr_mult, lab_lr=lab_lr) + + def forward(self, x): + return self.lab(self.act(x)) + + +class LearnableRepLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + num_conv_branches=1, + lr_mult=1.0, + lab_lr=0.1): + super().__init__() + self.is_repped = False + self.groups = groups + self.stride = stride + 
self.kernel_size = kernel_size + self.in_channels = in_channels + self.out_channels = out_channels + self.num_conv_branches = num_conv_branches + self.padding = (kernel_size - 1) // 2 + + self.identity = BatchNorm2D( + num_features=in_channels, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult) + ) if out_channels == in_channels and stride == 1 else None + + self.conv_kxk = nn.LayerList([ + ConvBNLayer( + in_channels, + out_channels, + kernel_size, + stride, + groups=groups, + lr_mult=lr_mult) for _ in range(self.num_conv_branches) + ]) + + self.conv_1x1 = ConvBNLayer( + in_channels, + out_channels, + 1, + stride, + groups=groups, + lr_mult=lr_mult) if kernel_size > 1 else None + + self.lab = LearnableAffineBlock(lr_mult=lr_mult, lab_lr=lab_lr) + self.act = Act(lr_mult=lr_mult, lab_lr=lab_lr) + + def forward(self, x): + # for export + if self.is_repped: + out = self.lab(self.reparam_conv(x)) + if self.stride != 2: + out = self.act(out) + return out + + out = 0 + if self.identity is not None: + out += self.identity(x) + + if self.conv_1x1 is not None: + out += self.conv_1x1(x) + + for conv in self.conv_kxk: + out += conv(x) + + out = self.lab(out) + if self.stride != 2: + out = self.act(out) + return out + + def rep(self): + if self.is_repped: + return + kernel, bias = self._get_kernel_bias() + self.reparam_conv = Conv2D( + in_channels=self.in_channels, + out_channels=self.out_channels, + kernel_size=self.kernel_size, + stride=self.stride, + padding=self.padding, + groups=self.groups) + self.reparam_conv.weight.set_value(kernel) + self.reparam_conv.bias.set_value(bias) + self.is_repped = True + + def _pad_kernel_1x1_to_kxk(self, kernel1x1, pad): + if not isinstance(kernel1x1, paddle.Tensor): + return 0 + else: + return nn.functional.pad(kernel1x1, [pad, pad, pad, pad]) + + def _get_kernel_bias(self): + kernel_conv_1x1, bias_conv_1x1 = self._fuse_bn_tensor(self.conv_1x1) + kernel_conv_1x1 = self._pad_kernel_1x1_to_kxk(kernel_conv_1x1, + self.kernel_size // 2) + + kernel_identity, bias_identity = self._fuse_bn_tensor(self.identity) + + kernel_conv_kxk = 0 + bias_conv_kxk = 0 + for conv in self.conv_kxk: + kernel, bias = self._fuse_bn_tensor(conv) + kernel_conv_kxk += kernel + bias_conv_kxk += bias + + kernel_reparam = kernel_conv_kxk + kernel_conv_1x1 + kernel_identity + bias_reparam = bias_conv_kxk + bias_conv_1x1 + bias_identity + return kernel_reparam, bias_reparam + + def _fuse_bn_tensor(self, branch): + if not branch: + return 0, 0 + elif isinstance(branch, ConvBNLayer): + kernel = branch.conv.weight + running_mean = branch.bn._mean + running_var = branch.bn._variance + gamma = branch.bn.weight + beta = branch.bn.bias + eps = branch.bn._epsilon + else: + assert isinstance(branch, BatchNorm2D) + if not hasattr(self, 'id_tensor'): + input_dim = self.in_channels // self.groups + kernel_value = paddle.zeros( + (self.in_channels, input_dim, self.kernel_size, + self.kernel_size), + dtype=branch.weight.dtype) + for i in range(self.in_channels): + kernel_value[i, i % input_dim, self.kernel_size // 2, + self.kernel_size // 2] = 1 + self.id_tensor = kernel_value + kernel = self.id_tensor + running_mean = branch._mean + running_var = branch._variance + gamma = branch.weight + beta = branch.bias + eps = branch._epsilon + std = (running_var + eps).sqrt() + t = (gamma / std).reshape((-1, 1, 1, 1)) + return kernel * t, beta - running_mean * gamma / std + + +class SELayer(nn.Layer): + def __init__(self, channel, reduction=4, lr_mult=1.0): + super().__init__() + 
self.avg_pool = AdaptiveAvgPool2D(1) + self.conv1 = Conv2D( + in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult)) + self.relu = ReLU() + self.conv2 = Conv2D( + in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult)) + self.hardsigmoid = Hardsigmoid() + + def forward(self, x): + identity = x + x = self.avg_pool(x) + x = self.conv1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.hardsigmoid(x) + x = paddle.multiply(x=identity, y=x) + return x + + +class LCNetV3Block(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + dw_size, + use_se=False, + conv_kxk_num=4, + lr_mult=1.0, + lab_lr=0.1): + super().__init__() + self.use_se = use_se + self.dw_conv = LearnableRepLayer( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=dw_size, + stride=stride, + groups=in_channels, + num_conv_branches=conv_kxk_num, + lr_mult=lr_mult, + lab_lr=lab_lr) + if use_se: + self.se = SELayer(in_channels, lr_mult=lr_mult) + self.pw_conv = LearnableRepLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + num_conv_branches=conv_kxk_num, + lr_mult=lr_mult, + lab_lr=lab_lr) + + def forward(self, x): + x = self.dw_conv(x) + if self.use_se: + x = self.se(x) + x = self.pw_conv(x) + return x + + +class PPLCNetV3(nn.Layer): + def __init__(self, + scale=1.0, + conv_kxk_num=4, + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + lab_lr=0.1, + det=False, + **kwargs): + super().__init__() + self.scale = scale + self.lr_mult_list = lr_mult_list + self.det = det + + self.net_config = NET_CONFIG_det if self.det else NET_CONFIG_rec + + assert isinstance(self.lr_mult_list, ( + list, tuple + )), "lr_mult_list should be in (list, tuple) but got {}".format( + type(self.lr_mult_list)) + assert len(self.lr_mult_list + ) == 6, "lr_mult_list length should be 6 but got {}".format( + len(self.lr_mult_list)) + + self.conv1 = ConvBNLayer( + in_channels=3, + out_channels=make_divisible(16 * scale), + kernel_size=3, + stride=2, + lr_mult=self.lr_mult_list[0]) + + self.blocks2 = nn.Sequential(*[ + LCNetV3Block( + in_channels=make_divisible(in_c * scale), + out_channels=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + conv_kxk_num=conv_kxk_num, + lr_mult=self.lr_mult_list[1], + lab_lr=lab_lr) + for i, (k, in_c, out_c, s, se + ) in enumerate(self.net_config["blocks2"]) + ]) + + self.blocks3 = nn.Sequential(*[ + LCNetV3Block( + in_channels=make_divisible(in_c * scale), + out_channels=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + conv_kxk_num=conv_kxk_num, + lr_mult=self.lr_mult_list[2], + lab_lr=lab_lr) + for i, (k, in_c, out_c, s, se + ) in enumerate(self.net_config["blocks3"]) + ]) + + self.blocks4 = nn.Sequential(*[ + LCNetV3Block( + in_channels=make_divisible(in_c * scale), + out_channels=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + conv_kxk_num=conv_kxk_num, + lr_mult=self.lr_mult_list[3], + lab_lr=lab_lr) + for i, (k, in_c, out_c, s, se + ) in enumerate(self.net_config["blocks4"]) + ]) + + self.blocks5 = nn.Sequential(*[ + LCNetV3Block( + in_channels=make_divisible(in_c * scale), + out_channels=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + conv_kxk_num=conv_kxk_num, + 
lr_mult=self.lr_mult_list[4], + lab_lr=lab_lr) + for i, (k, in_c, out_c, s, se + ) in enumerate(self.net_config["blocks5"]) + ]) + + self.blocks6 = nn.Sequential(*[ + LCNetV3Block( + in_channels=make_divisible(in_c * scale), + out_channels=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + conv_kxk_num=conv_kxk_num, + lr_mult=self.lr_mult_list[5], + lab_lr=lab_lr) + for i, (k, in_c, out_c, s, se + ) in enumerate(self.net_config["blocks6"]) + ]) + self.out_channels = make_divisible(512 * scale) + + if self.det: + mv_c = [16, 24, 56, 480] + self.out_channels = [ + make_divisible(self.net_config["blocks3"][-1][2] * scale), + make_divisible(self.net_config["blocks4"][-1][2] * scale), + make_divisible(self.net_config["blocks5"][-1][2] * scale), + make_divisible(self.net_config["blocks6"][-1][2] * scale), + ] + + self.layer_list = nn.LayerList([ + nn.Conv2D(self.out_channels[0], int(mv_c[0] * scale), 1, 1, 0), + nn.Conv2D(self.out_channels[1], int(mv_c[1] * scale), 1, 1, 0), + nn.Conv2D(self.out_channels[2], int(mv_c[2] * scale), 1, 1, 0), + nn.Conv2D(self.out_channels[3], int(mv_c[3] * scale), 1, 1, 0) + ]) + self.out_channels = [ + int(mv_c[0] * scale), int(mv_c[1] * scale), + int(mv_c[2] * scale), int(mv_c[3] * scale) + ] + + def forward(self, x): + out_list = [] + x = self.conv1(x) + + x = self.blocks2(x) + x = self.blocks3(x) + out_list.append(x) + x = self.blocks4(x) + out_list.append(x) + x = self.blocks5(x) + out_list.append(x) + x = self.blocks6(x) + out_list.append(x) + + if self.det: + out_list[0] = self.layer_list[0](out_list[0]) + out_list[1] = self.layer_list[1](out_list[1]) + out_list[2] = self.layer_list[2](out_list[2]) + out_list[3] = self.layer_list[3](out_list[3]) + return out_list + + if self.training: + x = F.adaptive_avg_pool2d(x, [1, 40]) + else: + x = F.avg_pool2d(x, [3, 2]) + return x diff --git a/ppocr/modeling/backbones/rec_mv1_enhance.py b/ppocr/modeling/backbones/rec_mv1_enhance.py index bb6af5e82cf13ac42d9a970787596a65986ade54..2d4efe720991618f33cbc42c0fb84bc795bc7437 100644 --- a/ppocr/modeling/backbones/rec_mv1_enhance.py +++ b/ppocr/modeling/backbones/rec_mv1_enhance.py @@ -108,6 +108,7 @@ class MobileNetV1Enhance(nn.Layer): scale=0.5, last_conv_stride=1, last_pool_type='max', + last_pool_kernel_size=[3, 2], **kwargs): super().__init__() self.scale = scale @@ -214,7 +215,10 @@ class MobileNetV1Enhance(nn.Layer): self.block_list = nn.Sequential(*self.block_list) if last_pool_type == 'avg': - self.pool = nn.AvgPool2D(kernel_size=2, stride=2, padding=0) + self.pool = nn.AvgPool2D( + kernel_size=last_pool_kernel_size, + stride=last_pool_kernel_size, + padding=0) else: self.pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) self.out_channels = int(1024 * scale) diff --git a/ppocr/modeling/backbones/rec_shallow_cnn.py b/ppocr/modeling/backbones/rec_shallow_cnn.py new file mode 100644 index 0000000000000000000000000000000000000000..544f108d26397421ae77ee025b15f31e319ab54c --- /dev/null +++ b/ppocr/modeling/backbones/rec_shallow_cnn.py @@ -0,0 +1,87 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/1.x/mmocr/models/textrecog/backbones/shallow_cnn.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import MaxPool2D +from paddle.nn.initializer import KaimingNormal, Uniform, Constant + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + filter_size, + num_filters, + stride, + padding, + num_groups=1): + super(ConvBNLayer, self).__init__() + + self.conv = nn.Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + weight_attr=ParamAttr(initializer=KaimingNormal()), + bias_attr=False) + + self.bn = nn.BatchNorm2D( + num_filters, + weight_attr=ParamAttr(initializer=Uniform(0, 1)), + bias_attr=ParamAttr(initializer=Constant(0))) + self.relu = nn.ReLU() + + def forward(self, inputs): + y = self.conv(inputs) + y = self.bn(y) + y = self.relu(y) + return y + + +class ShallowCNN(nn.Layer): + def __init__(self, in_channels=1, hidden_dim=512): + super().__init__() + assert isinstance(in_channels, int) + assert isinstance(hidden_dim, int) + + self.conv1 = ConvBNLayer( + in_channels, 3, hidden_dim // 2, stride=1, padding=1) + self.conv2 = ConvBNLayer( + hidden_dim // 2, 3, hidden_dim, stride=1, padding=1) + self.pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) + self.out_channels = hidden_dim + + def forward(self, x): + + x = self.conv1(x) + x = self.pool(x) + + x = self.conv2(x) + x = self.pool(x) + + return x diff --git a/ppocr/modeling/backbones/rec_svtrnet.py b/ppocr/modeling/backbones/rec_svtrnet.py index c2c07f4476929d49237c8e9a10713f881f5f556b..ea865a2da148bc5a0afe9eea4f74c1f26d782649 100644 --- a/ppocr/modeling/backbones/rec_svtrnet.py +++ b/ppocr/modeling/backbones/rec_svtrnet.py @@ -32,7 +32,7 @@ def drop_path(x, drop_prob=0., training=False): """ if drop_prob == 0. 
or not training: return x - keep_prob = paddle.to_tensor(1 - drop_prob) + keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype) shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) random_tensor = paddle.floor(random_tensor) # binarize @@ -155,8 +155,9 @@ class Attention(nn.Layer): proj_drop=0.): super().__init__() self.num_heads = num_heads - head_dim = dim // num_heads - self.scale = qk_scale or head_dim**-0.5 + self.dim = dim + self.head_dim = dim // num_heads + self.scale = qk_scale or self.head_dim**-0.5 self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) self.attn_drop = nn.Dropout(attn_drop) @@ -183,13 +184,9 @@ class Attention(nn.Layer): self.mixer = mixer def forward(self, x): - if self.HW is not None: - N = self.N - C = self.C - else: - _, N, C = x.shape - qkv = self.qkv(x).reshape((0, N, 3, self.num_heads, C // - self.num_heads)).transpose((2, 0, 3, 1, 4)) + qkv = self.qkv(x).reshape( + (0, -1, 3, self.num_heads, self.head_dim)).transpose( + (2, 0, 3, 1, 4)) q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] attn = (q.matmul(k.transpose((0, 1, 3, 2)))) @@ -198,7 +195,7 @@ class Attention(nn.Layer): attn = nn.functional.softmax(attn, axis=-1) attn = self.attn_drop(attn) - x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((0, N, C)) + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((0, -1, self.dim)) x = self.proj(x) x = self.proj_drop(x) return x diff --git a/ppocr/modeling/backbones/vqa_layoutlm.py b/ppocr/modeling/backbones/vqa_layoutlm.py index acb1315cc0a588396549e5b8928bd2e4d3c769be..4357b5664587d74ecf0e1fc54a427a138a2f656e 100644 --- a/ppocr/modeling/backbones/vqa_layoutlm.py +++ b/ppocr/modeling/backbones/vqa_layoutlm.py @@ -54,18 +54,11 @@ class NLPBaseModel(nn.Layer): if checkpoints is not None: # load the trained model self.model = model_class.from_pretrained(checkpoints) else: # load the pretrained-model - pretrained_model_name = pretrained_model_dict[base_model_class][ - mode] - if pretrained is True: - base_model = base_model_class.from_pretrained( - pretrained_model_name) - else: - base_model = base_model_class.from_pretrained(pretrained) + pretrained_model_name = pretrained_model_dict[base_model_class][mode] if type == "ser": - self.model = model_class( - base_model, num_classes=kwargs["num_classes"], dropout=None) + self.model = model_class.from_pretrained(pretrained_model_name, num_classes=kwargs["num_classes"], dropout=0) else: - self.model = model_class(base_model, dropout=None) + self.model = model_class.from_pretrained(pretrained_model_name, dropout=0) self.out_channels = 1 self.use_visual_backbone = True diff --git a/ppocr/modeling/heads/__init__.py b/ppocr/modeling/heads/__init__.py index 65afaf84f4453f2d4199371576ac71bb93a1e6d5..440d9e0293af2c92c3c3bfbba82b39a851fd1331 100755 --- a/ppocr/modeling/heads/__init__.py +++ b/ppocr/modeling/heads/__init__.py @@ -17,14 +17,13 @@ __all__ = ['build_head'] def build_head(config): # det head - from .det_db_head import DBHead + from .det_db_head import DBHead, PFHeadLocal from .det_east_head import EASTHead from .det_sast_head import SASTHead from .det_pse_head import PSEHead from .det_fce_head import FCEHead from .e2e_pg_head import PGHead from .det_ct_head import CT_Head - # rec head from .rec_ctc_head import CTCHead from .rec_att_head import AttentionHead @@ -40,6 +39,7 @@ def build_head(config): from .rec_visionlan_head import VLHead from .rec_rfl_head import RFLHead from .rec_can_head import CANHead + from .rec_satrn_head import SATRNHead # 
cls head from .cls_head import ClsHead @@ -56,7 +56,7 @@ def build_head(config): 'TableAttentionHead', 'SARHead', 'AsterHead', 'SDMGRHead', 'PRENHead', 'MultiHead', 'ABINetHead', 'TableMasterHead', 'SPINAttentionHead', 'VLHead', 'SLAHead', 'RobustScannerHead', 'CT_Head', 'RFLHead', - 'DRRGHead', 'CANHead' + 'DRRGHead', 'CANHead', 'SATRNHead', 'PFHeadLocal' ] if config['name'] == 'DRRGHead': diff --git a/ppocr/modeling/heads/det_db_head.py b/ppocr/modeling/heads/det_db_head.py index a686ae5ab0662ad31ddfd339bd1999c45c370cf0..8db14d7f6f043b53f2df3c929579c202358e5345 100644 --- a/ppocr/modeling/heads/det_db_head.py +++ b/ppocr/modeling/heads/det_db_head.py @@ -21,6 +21,7 @@ import paddle from paddle import nn import paddle.nn.functional as F from paddle import ParamAttr +from ppocr.modeling.backbones.det_mobilenet_v3 import ConvBNLayer def get_bias_attr(k): @@ -31,7 +32,7 @@ def get_bias_attr(k): class Head(nn.Layer): - def __init__(self, in_channels, name_list, kernel_list=[3, 2, 2], **kwargs): + def __init__(self, in_channels, kernel_list=[3, 2, 2], **kwargs): super(Head, self).__init__() self.conv1 = nn.Conv2D( @@ -48,6 +49,7 @@ class Head(nn.Layer): bias_attr=ParamAttr( initializer=paddle.nn.initializer.Constant(value=1e-4)), act='relu') + self.conv2 = nn.Conv2DTranspose( in_channels=in_channels // 4, out_channels=in_channels // 4, @@ -72,13 +74,17 @@ class Head(nn.Layer): initializer=paddle.nn.initializer.KaimingUniform()), bias_attr=get_bias_attr(in_channels // 4), ) - def forward(self, x): + def forward(self, x, return_f=False): x = self.conv1(x) x = self.conv_bn1(x) x = self.conv2(x) x = self.conv_bn2(x) + if return_f is True: + f = x x = self.conv3(x) x = F.sigmoid(x) + if return_f is True: + return x, f return x @@ -93,16 +99,8 @@ class DBHead(nn.Layer): def __init__(self, in_channels, k=50, **kwargs): super(DBHead, self).__init__() self.k = k - binarize_name_list = [ - 'conv2d_56', 'batch_norm_47', 'conv2d_transpose_0', 'batch_norm_48', - 'conv2d_transpose_1', 'binarize' - ] - thresh_name_list = [ - 'conv2d_57', 'batch_norm_49', 'conv2d_transpose_2', 'batch_norm_50', - 'conv2d_transpose_3', 'thresh' - ] - self.binarize = Head(in_channels, binarize_name_list, **kwargs) - self.thresh = Head(in_channels, thresh_name_list, **kwargs) + self.binarize = Head(in_channels, **kwargs) + self.thresh = Head(in_channels, **kwargs) def step_function(self, x, y): return paddle.reciprocal(1 + paddle.exp(-self.k * (x - y))) @@ -116,3 +114,41 @@ class DBHead(nn.Layer): binary_maps = self.step_function(shrink_maps, threshold_maps) y = paddle.concat([shrink_maps, threshold_maps, binary_maps], axis=1) return {'maps': y} + + +class LocalModule(nn.Layer): + def __init__(self, in_c, mid_c, use_distance=True): + super(self.__class__, self).__init__() + self.last_3 = ConvBNLayer(in_c + 1, mid_c, 3, 1, 1, act='relu') + self.last_1 = nn.Conv2D(mid_c, 1, 1, 1, 0) + + def forward(self, x, init_map, distance_map): + outf = paddle.concat([init_map, x], axis=1) + # last Conv + out = self.last_1(self.last_3(outf)) + return out + + +class PFHeadLocal(DBHead): + def __init__(self, in_channels, k=50, mode='small', **kwargs): + super(PFHeadLocal, self).__init__(in_channels, k, **kwargs) + self.mode = mode + + self.up_conv = nn.Upsample(scale_factor=2, mode="nearest", align_mode=1) + if self.mode == 'large': + self.cbn_layer = LocalModule(in_channels // 4, in_channels // 4) + elif self.mode == 'small': + self.cbn_layer = LocalModule(in_channels // 4, in_channels // 8) + + def forward(self, x, targets=None): + shrink_maps, 
f = self.binarize(x, return_f=True) + base_maps = shrink_maps + cbn_maps = self.cbn_layer(self.up_conv(f), shrink_maps, None) + cbn_maps = F.sigmoid(cbn_maps) + if not self.training: + return {'maps': 0.5 * (base_maps + cbn_maps), 'cbn_maps': cbn_maps} + + threshold_maps = self.thresh(x) + binary_maps = self.step_function(shrink_maps, threshold_maps) + y = paddle.concat([cbn_maps, threshold_maps, binary_maps], axis=1) + return {'maps': y, 'distance_maps': cbn_maps, 'cbn_maps': binary_maps} diff --git a/ppocr/modeling/heads/proposal_local_graph.py b/ppocr/modeling/heads/proposal_local_graph.py index 7887c4ff42f8ae9d1826a71f01208cd81bb2d52c..a48656135b2292f32c7285e767f8fea1a7318e08 100644 --- a/ppocr/modeling/heads/proposal_local_graph.py +++ b/ppocr/modeling/heads/proposal_local_graph.py @@ -40,7 +40,7 @@ def fill_hole(input_mask): mask = np.zeros((h + 4, w + 4), np.uint8) cv2.floodFill(canvas, mask, (0, 0), 1) - canvas = canvas[1:h + 1, 1:w + 1].astype(np.bool) + canvas = canvas[1:h + 1, 1:w + 1].astype(np.bool_) return ~canvas | input_mask diff --git a/ppocr/modeling/heads/rec_multi_head.py b/ppocr/modeling/heads/rec_multi_head.py index 2f10e7bdf90025d3304128e720ce561c8bb269c1..0b4fa939eecad15c79f5e37384944720b1879205 100644 --- a/ppocr/modeling/heads/rec_multi_head.py +++ b/ppocr/modeling/heads/rec_multi_head.py @@ -25,12 +25,28 @@ import paddle.nn.functional as F from ppocr.modeling.necks.rnn import Im2Seq, EncoderWithRNN, EncoderWithFC, SequenceEncoder, EncoderWithSVTR from .rec_ctc_head import CTCHead from .rec_sar_head import SARHead +from .rec_nrtr_head import Transformer + + +class FCTranspose(nn.Layer): + def __init__(self, in_channels, out_channels, only_transpose=False): + super().__init__() + self.only_transpose = only_transpose + if not self.only_transpose: + self.fc = nn.Linear(in_channels, out_channels, bias_attr=False) + + def forward(self, x): + if self.only_transpose: + return x.transpose([0, 2, 1]) + else: + return self.fc(x.transpose([0, 2, 1])) class MultiHead(nn.Layer): def __init__(self, in_channels, out_channels_list, **kwargs): super().__init__() self.head_list = kwargs.pop('head_list') + self.gtc_head = 'sar' assert len(self.head_list) >= 2 for idx, head_name in enumerate(self.head_list): @@ -40,12 +56,27 @@ class MultiHead(nn.Layer): sar_args = self.head_list[idx][name] self.sar_head = eval(name)(in_channels=in_channels, \ out_channels=out_channels_list['SARLabelDecode'], **sar_args) + elif name == 'NRTRHead': + gtc_args = self.head_list[idx][name] + max_text_length = gtc_args.get('max_text_length', 25) + nrtr_dim = gtc_args.get('nrtr_dim', 256) + num_decoder_layers = gtc_args.get('num_decoder_layers', 4) + self.before_gtc = nn.Sequential( + nn.Flatten(2), FCTranspose(in_channels, nrtr_dim)) + self.gtc_head = Transformer( + d_model=nrtr_dim, + nhead=nrtr_dim // 32, + num_encoder_layers=-1, + beam_size=-1, + num_decoder_layers=num_decoder_layers, + max_len=max_text_length, + dim_feedforward=nrtr_dim * 4, + out_channels=out_channels_list['NRTRLabelDecode']) elif name == 'CTCHead': # ctc neck self.encoder_reshape = Im2Seq(in_channels) neck_args = self.head_list[idx][name]['Neck'] encoder_type = neck_args.pop('name') - self.encoder = encoder_type self.ctc_encoder = SequenceEncoder(in_channels=in_channels, \ encoder_type=encoder_type, **neck_args) # ctc head @@ -57,6 +88,7 @@ class MultiHead(nn.Layer): '{} is not supported in MultiHead yet'.format(name)) def forward(self, x, targets=None): + ctc_encoder = self.ctc_encoder(x) ctc_out = self.ctc_head(ctc_encoder, 
targets) head_out = dict() @@ -68,6 +100,7 @@ class MultiHead(nn.Layer): if self.gtc_head == 'sar': sar_out = self.sar_head(x, targets[1:]) head_out['sar'] = sar_out - return head_out else: - return head_out + gtc_out = self.gtc_head(self.before_gtc(x), targets[1:]) + head_out['nrtr'] = gtc_out + return head_out diff --git a/ppocr/modeling/heads/rec_nrtr_head.py b/ppocr/modeling/heads/rec_nrtr_head.py index bf9ef56145e6edfb15bd30235b4a62588396ba96..eb279400203b9ef173793bc0d90e5ab99701cb3a 100644 --- a/ppocr/modeling/heads/rec_nrtr_head.py +++ b/ppocr/modeling/heads/rec_nrtr_head.py @@ -162,7 +162,7 @@ class Transformer(nn.Layer): memory = src dec_seq = paddle.full((bs, 1), 2, dtype=paddle.int64) dec_prob = paddle.full((bs, 1), 1., dtype=paddle.float32) - for len_dec_seq in range(1, self.max_len): + for len_dec_seq in range(1, paddle.to_tensor(self.max_len)): dec_seq_embed = self.embedding(dec_seq) dec_seq_embed = self.positional_encoding(dec_seq_embed) tgt_mask = self.generate_square_subsequent_mask( @@ -304,7 +304,7 @@ class Transformer(nn.Layer): inst_idx_to_position_map = get_inst_idx_to_tensor_position_map( active_inst_idx_list) # Decode - for len_dec_seq in range(1, self.max_len): + for len_dec_seq in range(1, paddle.to_tensor(self.max_len)): src_enc_copy = src_enc.clone() active_inst_idx_list = beam_decode_step( inst_dec_beams, len_dec_seq, src_enc_copy, diff --git a/ppocr/modeling/heads/rec_robustscanner_head.py b/ppocr/modeling/heads/rec_robustscanner_head.py index 7956059ecfe01f27db364d3d748d6af24dad0aac..550836bd401b0b8799e2afb9b185de8ed6b3d5b1 100644 --- a/ppocr/modeling/heads/rec_robustscanner_head.py +++ b/ppocr/modeling/heads/rec_robustscanner_head.py @@ -99,10 +99,11 @@ class DotProductAttentionLayer(nn.Layer): logits = paddle.reshape(logits, [n, c, h, w]) if valid_ratios is not None: # cal mask of attention weight - for i, valid_ratio in enumerate(valid_ratios): - valid_width = min(w, int(w * valid_ratio + 0.5)) - if valid_width < w: - logits[i, :, :, valid_width:] = float('-inf') + with paddle.fluid.framework._stride_in_no_check_dy2st_diff(): + for i, valid_ratio in enumerate(valid_ratios): + valid_width = min(w, int(w * valid_ratio + 0.5)) + if valid_width < w: + logits[i, :, :, valid_width:] = float('-inf') # reshape to (n, c, h, w) logits = paddle.reshape(logits, [n, c, t]) diff --git a/ppocr/modeling/heads/rec_sar_head.py b/ppocr/modeling/heads/rec_sar_head.py index 5e64cae85afafc555f2519ed6dd3f05eafff7ea2..11fe253b67f82ec321bcd3b51c39de318a7aec2f 100644 --- a/ppocr/modeling/heads/rec_sar_head.py +++ b/ppocr/modeling/heads/rec_sar_head.py @@ -276,7 +276,9 @@ class ParallelSARDecoder(BaseDecoder): hf_c = holistic_feat.shape[-1] holistic_feat = paddle.expand( holistic_feat, shape=[bsz, seq_len, hf_c]) - y = self.prediction(paddle.concat((y, attn_feat, holistic_feat), 2)) + y = self.prediction( + paddle.concat((y, attn_feat.astype(y.dtype), + holistic_feat.astype(y.dtype)), 2)) else: y = self.prediction(attn_feat) # bsz * (seq_len + 1) * num_classes @@ -298,7 +300,7 @@ class ParallelSARDecoder(BaseDecoder): lab_embedding = self.embedding(label) # bsz * seq_len * emb_dim - out_enc = out_enc.unsqueeze(1) + out_enc = out_enc.unsqueeze(1).astype(lab_embedding.dtype) # bsz * 1 * emb_dim in_dec = paddle.concat((out_enc, lab_embedding), axis=1) # bsz * (seq_len + 1) * C diff --git a/ppocr/modeling/heads/rec_satrn_head.py b/ppocr/modeling/heads/rec_satrn_head.py new file mode 100644 index 0000000000000000000000000000000000000000..b969c89693b677489b7191a9120f16d02c322802 --- 
/dev/null +++ b/ppocr/modeling/heads/rec_satrn_head.py @@ -0,0 +1,568 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/1.x/mmocr/models/textrecog/encoders/satrn_encoder.py +https://github.com/open-mmlab/mmocr/blob/1.x/mmocr/models/textrecog/decoders/nrtr_decoder.py +""" + +import math +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr, reshape, transpose +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import KaimingNormal, Uniform, Constant + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + filter_size, + num_filters, + stride, + padding, + num_groups=1): + super(ConvBNLayer, self).__init__() + + self.conv = nn.Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + bias_attr=False) + + self.bn = nn.BatchNorm2D( + num_filters, + weight_attr=ParamAttr(initializer=Constant(1)), + bias_attr=ParamAttr(initializer=Constant(0))) + self.relu = nn.ReLU() + + def forward(self, inputs): + y = self.conv(inputs) + y = self.bn(y) + y = self.relu(y) + return y + + +class SATRNEncoderLayer(nn.Layer): + def __init__(self, + d_model=512, + d_inner=512, + n_head=8, + d_k=64, + d_v=64, + dropout=0.1, + qkv_bias=False): + super().__init__() + self.norm1 = nn.LayerNorm(d_model) + self.attn = MultiHeadAttention( + n_head, d_model, d_k, d_v, qkv_bias=qkv_bias, dropout=dropout) + self.norm2 = nn.LayerNorm(d_model) + self.feed_forward = LocalityAwareFeedforward( + d_model, d_inner, dropout=dropout) + + def forward(self, x, h, w, mask=None): + n, hw, c = x.shape + residual = x + x = self.norm1(x) + x = residual + self.attn(x, x, x, mask) + residual = x + x = self.norm2(x) + x = x.transpose([0, 2, 1]).reshape([n, c, h, w]) + x = self.feed_forward(x) + x = x.reshape([n, c, hw]).transpose([0, 2, 1]) + x = residual + x + return x + + +class LocalityAwareFeedforward(nn.Layer): + def __init__( + self, + d_in, + d_hid, + dropout=0.1, ): + super().__init__() + self.conv1 = ConvBNLayer(d_in, 1, d_hid, stride=1, padding=0) + + self.depthwise_conv = ConvBNLayer( + d_hid, 3, d_hid, stride=1, padding=1, num_groups=d_hid) + + self.conv2 = ConvBNLayer(d_hid, 1, d_in, stride=1, padding=0) + + def forward(self, x): + x = self.conv1(x) + x = self.depthwise_conv(x) + x = self.conv2(x) + + return x + + +class Adaptive2DPositionalEncoding(nn.Layer): + def __init__(self, d_hid=512, n_height=100, n_width=100, dropout=0.1): + super().__init__() + + h_position_encoder = self._get_sinusoid_encoding_table(n_height, d_hid) + h_position_encoder = h_position_encoder.transpose([1, 0]) + h_position_encoder = h_position_encoder.reshape([1, d_hid, n_height, 1]) + + w_position_encoder = self._get_sinusoid_encoding_table(n_width, 
d_hid) + w_position_encoder = w_position_encoder.transpose([1, 0]) + w_position_encoder = w_position_encoder.reshape([1, d_hid, 1, n_width]) + + self.register_buffer('h_position_encoder', h_position_encoder) + self.register_buffer('w_position_encoder', w_position_encoder) + + self.h_scale = self.scale_factor_generate(d_hid) + self.w_scale = self.scale_factor_generate(d_hid) + self.pool = nn.AdaptiveAvgPool2D(1) + self.dropout = nn.Dropout(p=dropout) + + def _get_sinusoid_encoding_table(self, n_position, d_hid): + """Sinusoid position encoding table.""" + denominator = paddle.to_tensor([ + 1.0 / np.power(10000, 2 * (hid_j // 2) / d_hid) + for hid_j in range(d_hid) + ]) + denominator = denominator.reshape([1, -1]) + pos_tensor = paddle.cast( + paddle.arange(n_position).unsqueeze(-1), 'float32') + sinusoid_table = pos_tensor * denominator + sinusoid_table[:, 0::2] = paddle.sin(sinusoid_table[:, 0::2]) + sinusoid_table[:, 1::2] = paddle.cos(sinusoid_table[:, 1::2]) + + return sinusoid_table + + def scale_factor_generate(self, d_hid): + scale_factor = nn.Sequential( + nn.Conv2D(d_hid, d_hid, 1), + nn.ReLU(), nn.Conv2D(d_hid, d_hid, 1), nn.Sigmoid()) + + return scale_factor + + def forward(self, x): + b, c, h, w = x.shape + + avg_pool = self.pool(x) + + h_pos_encoding = \ + self.h_scale(avg_pool) * self.h_position_encoder[:, :, :h, :] + w_pos_encoding = \ + self.w_scale(avg_pool) * self.w_position_encoder[:, :, :, :w] + + out = x + h_pos_encoding + w_pos_encoding + + out = self.dropout(out) + + return out + + +class ScaledDotProductAttention(nn.Layer): + def __init__(self, temperature, attn_dropout=0.1): + super().__init__() + self.temperature = temperature + self.dropout = nn.Dropout(attn_dropout) + + def forward(self, q, k, v, mask=None): + def masked_fill(x, mask, value): + y = paddle.full(x.shape, value, x.dtype) + return paddle.where(mask, y, x) + + attn = paddle.matmul(q / self.temperature, k.transpose([0, 1, 3, 2])) + if mask is not None: + attn = masked_fill(attn, mask == 0, -1e9) + # attn = attn.masked_fill(mask == 0, float('-inf')) + # attn += mask + + attn = self.dropout(F.softmax(attn, axis=-1)) + output = paddle.matmul(attn, v) + + return output, attn + + +class MultiHeadAttention(nn.Layer): + def __init__(self, + n_head=8, + d_model=512, + d_k=64, + d_v=64, + dropout=0.1, + qkv_bias=False): + super().__init__() + self.n_head = n_head + self.d_k = d_k + self.d_v = d_v + + self.dim_k = n_head * d_k + self.dim_v = n_head * d_v + + self.linear_q = nn.Linear(self.dim_k, self.dim_k, bias_attr=qkv_bias) + self.linear_k = nn.Linear(self.dim_k, self.dim_k, bias_attr=qkv_bias) + self.linear_v = nn.Linear(self.dim_v, self.dim_v, bias_attr=qkv_bias) + + self.attention = ScaledDotProductAttention(d_k**0.5, dropout) + + self.fc = nn.Linear(self.dim_v, d_model, bias_attr=qkv_bias) + self.proj_drop = nn.Dropout(dropout) + + def forward(self, q, k, v, mask=None): + batch_size, len_q, _ = q.shape + _, len_k, _ = k.shape + + q = self.linear_q(q).reshape([batch_size, len_q, self.n_head, self.d_k]) + k = self.linear_k(k).reshape([batch_size, len_k, self.n_head, self.d_k]) + v = self.linear_v(v).reshape([batch_size, len_k, self.n_head, self.d_v]) + + q, k, v = q.transpose([0, 2, 1, 3]), k.transpose( + [0, 2, 1, 3]), v.transpose([0, 2, 1, 3]) + + if mask is not None: + if mask.dim() == 3: + mask = mask.unsqueeze(1) + elif mask.dim() == 2: + mask = mask.unsqueeze(1).unsqueeze(1) + + attn_out, _ = self.attention(q, k, v, mask=mask) + + attn_out = attn_out.transpose([0, 2, 1, 3]).reshape( + [batch_size, 
len_q, self.dim_v]) + + attn_out = self.fc(attn_out) + attn_out = self.proj_drop(attn_out) + + return attn_out + + +class SATRNEncoder(nn.Layer): + def __init__(self, + n_layers=12, + n_head=8, + d_k=64, + d_v=64, + d_model=512, + n_position=100, + d_inner=256, + dropout=0.1): + super().__init__() + self.d_model = d_model + self.position_enc = Adaptive2DPositionalEncoding( + d_hid=d_model, + n_height=n_position, + n_width=n_position, + dropout=dropout) + self.layer_stack = nn.LayerList([ + SATRNEncoderLayer( + d_model, d_inner, n_head, d_k, d_v, dropout=dropout) + for _ in range(n_layers) + ]) + self.layer_norm = nn.LayerNorm(d_model) + + def forward(self, feat, valid_ratios=None): + """ + Args: + feat (Tensor): Feature tensor of shape :math:`(N, D_m, H, W)`. + img_metas (dict): A dict that contains meta information of input + images. Preferably with the key ``valid_ratio``. + + Returns: + Tensor: A tensor of shape :math:`(N, T, D_m)`. + """ + if valid_ratios is None: + valid_ratios = [1.0 for _ in range(feat.shape[0])] + feat = self.position_enc(feat) + n, c, h, w = feat.shape + + mask = paddle.zeros((n, h, w)) + for i, valid_ratio in enumerate(valid_ratios): + valid_width = min(w, math.ceil(w * valid_ratio)) + mask[i, :, :valid_width] = 1 + + mask = mask.reshape([n, h * w]) + feat = feat.reshape([n, c, h * w]) + + output = feat.transpose([0, 2, 1]) + for enc_layer in self.layer_stack: + output = enc_layer(output, h, w, mask) + output = self.layer_norm(output) + + return output + + +class PositionwiseFeedForward(nn.Layer): + def __init__(self, d_in, d_hid, dropout=0.1): + super().__init__() + self.w_1 = nn.Linear(d_in, d_hid) + self.w_2 = nn.Linear(d_hid, d_in) + self.act = nn.GELU() + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + x = self.w_1(x) + x = self.act(x) + x = self.w_2(x) + x = self.dropout(x) + + return x + + +class PositionalEncoding(nn.Layer): + def __init__(self, d_hid=512, n_position=200, dropout=0): + super().__init__() + self.dropout = nn.Dropout(p=dropout) + + # Not a parameter + # Position table of shape (1, n_position, d_hid) + self.register_buffer( + 'position_table', + self._get_sinusoid_encoding_table(n_position, d_hid)) + + def _get_sinusoid_encoding_table(self, n_position, d_hid): + """Sinusoid position encoding table.""" + denominator = paddle.to_tensor([ + 1.0 / np.power(10000, 2 * (hid_j // 2) / d_hid) + for hid_j in range(d_hid) + ]) + denominator = denominator.reshape([1, -1]) + pos_tensor = paddle.cast( + paddle.arange(n_position).unsqueeze(-1), 'float32') + sinusoid_table = pos_tensor * denominator + sinusoid_table[:, 0::2] = paddle.sin(sinusoid_table[:, 0::2]) + sinusoid_table[:, 1::2] = paddle.cos(sinusoid_table[:, 1::2]) + + return sinusoid_table.unsqueeze(0) + + def forward(self, x): + + x = x + self.position_table[:, :x.shape[1]].clone().detach() + return self.dropout(x) + + +class TFDecoderLayer(nn.Layer): + def __init__(self, + d_model=512, + d_inner=256, + n_head=8, + d_k=64, + d_v=64, + dropout=0.1, + qkv_bias=False, + operation_order=None): + super().__init__() + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + + self.self_attn = MultiHeadAttention( + n_head, d_model, d_k, d_v, dropout=dropout, qkv_bias=qkv_bias) + + self.enc_attn = MultiHeadAttention( + n_head, d_model, d_k, d_v, dropout=dropout, qkv_bias=qkv_bias) + + self.mlp = PositionwiseFeedForward(d_model, d_inner, dropout=dropout) + + self.operation_order = operation_order + if self.operation_order is None: + 
self.operation_order = ('norm', 'self_attn', 'norm', 'enc_dec_attn', + 'norm', 'ffn') + assert self.operation_order in [ + ('norm', 'self_attn', 'norm', 'enc_dec_attn', 'norm', 'ffn'), + ('self_attn', 'norm', 'enc_dec_attn', 'norm', 'ffn', 'norm') + ] + + def forward(self, + dec_input, + enc_output, + self_attn_mask=None, + dec_enc_attn_mask=None): + if self.operation_order == ('self_attn', 'norm', 'enc_dec_attn', 'norm', + 'ffn', 'norm'): + dec_attn_out = self.self_attn(dec_input, dec_input, dec_input, + self_attn_mask) + dec_attn_out += dec_input + dec_attn_out = self.norm1(dec_attn_out) + + enc_dec_attn_out = self.enc_attn(dec_attn_out, enc_output, + enc_output, dec_enc_attn_mask) + enc_dec_attn_out += dec_attn_out + enc_dec_attn_out = self.norm2(enc_dec_attn_out) + + mlp_out = self.mlp(enc_dec_attn_out) + mlp_out += enc_dec_attn_out + mlp_out = self.norm3(mlp_out) + elif self.operation_order == ('norm', 'self_attn', 'norm', + 'enc_dec_attn', 'norm', 'ffn'): + dec_input_norm = self.norm1(dec_input) + dec_attn_out = self.self_attn(dec_input_norm, dec_input_norm, + dec_input_norm, self_attn_mask) + dec_attn_out += dec_input + + enc_dec_attn_in = self.norm2(dec_attn_out) + enc_dec_attn_out = self.enc_attn(enc_dec_attn_in, enc_output, + enc_output, dec_enc_attn_mask) + enc_dec_attn_out += dec_attn_out + + mlp_out = self.mlp(self.norm3(enc_dec_attn_out)) + mlp_out += enc_dec_attn_out + + return mlp_out + + +class SATRNDecoder(nn.Layer): + def __init__(self, + n_layers=6, + d_embedding=512, + n_head=8, + d_k=64, + d_v=64, + d_model=512, + d_inner=256, + n_position=200, + dropout=0.1, + num_classes=93, + max_seq_len=40, + start_idx=1, + padding_idx=92): + super().__init__() + + self.padding_idx = padding_idx + self.start_idx = start_idx + self.max_seq_len = max_seq_len + + self.trg_word_emb = nn.Embedding( + num_classes, d_embedding, padding_idx=padding_idx) + + self.position_enc = PositionalEncoding( + d_embedding, n_position=n_position) + self.dropout = nn.Dropout(p=dropout) + + self.layer_stack = nn.LayerList([ + TFDecoderLayer( + d_model, d_inner, n_head, d_k, d_v, dropout=dropout) + for _ in range(n_layers) + ]) + self.layer_norm = nn.LayerNorm(d_model, epsilon=1e-6) + + pred_num_class = num_classes - 1 # ignore padding_idx + self.classifier = nn.Linear(d_model, pred_num_class) + + @staticmethod + def get_pad_mask(seq, pad_idx): + + return (seq != pad_idx).unsqueeze(-2) + + @staticmethod + def get_subsequent_mask(seq): + """For masking out the subsequent info.""" + len_s = seq.shape[1] + subsequent_mask = 1 - paddle.triu( + paddle.ones((len_s, len_s)), diagonal=1) + subsequent_mask = paddle.cast(subsequent_mask.unsqueeze(0), 'bool') + + return subsequent_mask + + def _attention(self, trg_seq, src, src_mask=None): + trg_embedding = self.trg_word_emb(trg_seq) + trg_pos_encoded = self.position_enc(trg_embedding) + tgt = self.dropout(trg_pos_encoded) + + trg_mask = self.get_pad_mask( + trg_seq, + pad_idx=self.padding_idx) & self.get_subsequent_mask(trg_seq) + output = tgt + for dec_layer in self.layer_stack: + output = dec_layer( + output, + src, + self_attn_mask=trg_mask, + dec_enc_attn_mask=src_mask) + output = self.layer_norm(output) + + return output + + def _get_mask(self, logit, valid_ratios): + N, T, _ = logit.shape + mask = None + if valid_ratios is not None: + mask = paddle.zeros((N, T)) + for i, valid_ratio in enumerate(valid_ratios): + valid_width = min(T, math.ceil(T * valid_ratio)) + mask[i, :valid_width] = 1 + + return mask + + def forward_train(self, feat, out_enc, targets, 
valid_ratio): + src_mask = self._get_mask(out_enc, valid_ratio) + attn_output = self._attention(targets, out_enc, src_mask=src_mask) + outputs = self.classifier(attn_output) + + return outputs + + def forward_test(self, feat, out_enc, valid_ratio): + + src_mask = self._get_mask(out_enc, valid_ratio) + N = out_enc.shape[0] + init_target_seq = paddle.full( + (N, self.max_seq_len + 1), self.padding_idx, dtype='int64') + # bsz * seq_len + init_target_seq[:, 0] = self.start_idx + + outputs = [] + for step in range(0, paddle.to_tensor(self.max_seq_len)): + decoder_output = self._attention( + init_target_seq, out_enc, src_mask=src_mask) + # bsz * seq_len * C + step_result = F.softmax( + self.classifier(decoder_output[:, step, :]), axis=-1) + # bsz * num_classes + outputs.append(step_result) + step_max_index = paddle.argmax(step_result, axis=-1) + init_target_seq[:, step + 1] = step_max_index + + outputs = paddle.stack(outputs, axis=1) + + return outputs + + def forward(self, feat, out_enc, targets=None, valid_ratio=None): + if self.training: + return self.forward_train(feat, out_enc, targets, valid_ratio) + else: + return self.forward_test(feat, out_enc, valid_ratio) + + +class SATRNHead(nn.Layer): + def __init__(self, enc_cfg, dec_cfg, **kwargs): + super(SATRNHead, self).__init__() + + # encoder module + self.encoder = SATRNEncoder(**enc_cfg) + + # decoder module + self.decoder = SATRNDecoder(**dec_cfg) + + def forward(self, feat, targets=None): + + if targets is not None: + targets, valid_ratio = targets + else: + targets, valid_ratio = None, None + holistic_feat = self.encoder(feat, valid_ratio) # bsz c + + final_out = self.decoder(feat, holistic_feat, targets, valid_ratio) + + return final_out diff --git a/ppocr/modeling/heads/sr_rensnet_transformer.py b/ppocr/modeling/heads/sr_rensnet_transformer.py index 654f3fca5486229c176246237708c4cf6a8da9ec..df0d0c9299170993fb881714c1f07b618cee9612 100644 --- a/ppocr/modeling/heads/sr_rensnet_transformer.py +++ b/ppocr/modeling/heads/sr_rensnet_transformer.py @@ -78,7 +78,7 @@ class MultiHeadedAttention(nn.Layer): def forward(self, query, key, value, mask=None, attention_map=None): if mask is not None: mask = mask.unsqueeze(1) - nbatches = query.shape[0] + nbatches = paddle.shape(query)[0] query, key, value = \ [paddle.transpose(l(x).reshape([nbatches, -1, self.h, self.d_k]), [0,2,1,3]) diff --git a/ppocr/modeling/necks/db_fpn.py b/ppocr/modeling/necks/db_fpn.py index 8c3f52a331db5daafab2a38c0a441edd44eb141d..0f5b826bfb023895d6216605e2b2faf82023fa80 100644 --- a/ppocr/modeling/necks/db_fpn.py +++ b/ppocr/modeling/necks/db_fpn.py @@ -22,6 +22,7 @@ import paddle.nn.functional as F from paddle import ParamAttr import os import sys +from ppocr.modeling.necks.intracl import IntraCLBlock __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(__dir__) @@ -228,6 +229,13 @@ class RSEFPN(nn.Layer): self.out_channels = out_channels self.ins_conv = nn.LayerList() self.inp_conv = nn.LayerList() + self.intracl = False + if 'intracl' in kwargs.keys() and kwargs['intracl'] is True: + self.intracl = kwargs['intracl'] + self.incl1 = IntraCLBlock(self.out_channels // 4, reduce_factor=2) + self.incl2 = IntraCLBlock(self.out_channels // 4, reduce_factor=2) + self.incl3 = IntraCLBlock(self.out_channels // 4, reduce_factor=2) + self.incl4 = IntraCLBlock(self.out_channels // 4, reduce_factor=2) for i in range(len(in_channels)): self.ins_conv.append( @@ -263,6 +271,12 @@ class RSEFPN(nn.Layer): p3 = self.inp_conv[1](out3) p2 = self.inp_conv[0](out2) + if 
self.intracl is True: + p5 = self.incl4(p5) + p4 = self.incl3(p4) + p3 = self.incl2(p3) + p2 = self.incl1(p2) + p5 = F.upsample(p5, scale_factor=8, mode="nearest", align_mode=1) p4 = F.upsample(p4, scale_factor=4, mode="nearest", align_mode=1) p3 = F.upsample(p3, scale_factor=2, mode="nearest", align_mode=1) @@ -329,6 +343,14 @@ class LKPAN(nn.Layer): weight_attr=ParamAttr(initializer=weight_attr), bias_attr=False)) + self.intracl = False + if 'intracl' in kwargs.keys() and kwargs['intracl'] is True: + self.intracl = kwargs['intracl'] + self.incl1 = IntraCLBlock(self.out_channels // 4, reduce_factor=2) + self.incl2 = IntraCLBlock(self.out_channels // 4, reduce_factor=2) + self.incl3 = IntraCLBlock(self.out_channels // 4, reduce_factor=2) + self.incl4 = IntraCLBlock(self.out_channels // 4, reduce_factor=2) + def forward(self, x): c2, c3, c4, c5 = x @@ -358,6 +380,12 @@ class LKPAN(nn.Layer): p4 = self.pan_lat_conv[2](pan4) p5 = self.pan_lat_conv[3](pan5) + if self.intracl is True: + p5 = self.incl4(p5) + p4 = self.incl3(p4) + p3 = self.incl2(p3) + p2 = self.incl1(p2) + p5 = F.upsample(p5, scale_factor=8, mode="nearest", align_mode=1) p4 = F.upsample(p4, scale_factor=4, mode="nearest", align_mode=1) p3 = F.upsample(p3, scale_factor=2, mode="nearest", align_mode=1) @@ -424,4 +452,4 @@ class ASFBlock(nn.Layer): out_list = [] for i in range(self.out_features_num): out_list.append(attention_scores[:, i:i + 1] * features_list[i]) - return paddle.concat(out_list, axis=1) + return paddle.concat(out_list, axis=1) \ No newline at end of file diff --git a/ppocr/modeling/necks/intracl.py b/ppocr/modeling/necks/intracl.py new file mode 100644 index 0000000000000000000000000000000000000000..205b52e35f04e59d35ae6a89bfe1b920a3890d5f --- /dev/null +++ b/ppocr/modeling/necks/intracl.py @@ -0,0 +1,118 @@ +import paddle +from paddle import nn + +# refer from: https://github.com/ViTAE-Transformer/I3CL/blob/736c80237f66d352d488e83b05f3e33c55201317/mmdet/models/detectors/intra_cl_module.py + + +class IntraCLBlock(nn.Layer): + def __init__(self, in_channels=96, reduce_factor=4): + super(IntraCLBlock, self).__init__() + self.channels = in_channels + self.rf = reduce_factor + weight_attr = paddle.nn.initializer.KaimingUniform() + self.conv1x1_reduce_channel = nn.Conv2D( + self.channels, + self.channels // self.rf, + kernel_size=1, + stride=1, + padding=0) + self.conv1x1_return_channel = nn.Conv2D( + self.channels // self.rf, + self.channels, + kernel_size=1, + stride=1, + padding=0) + + self.v_layer_7x1 = nn.Conv2D( + self.channels // self.rf, + self.channels // self.rf, + kernel_size=(7, 1), + stride=(1, 1), + padding=(3, 0)) + self.v_layer_5x1 = nn.Conv2D( + self.channels // self.rf, + self.channels // self.rf, + kernel_size=(5, 1), + stride=(1, 1), + padding=(2, 0)) + self.v_layer_3x1 = nn.Conv2D( + self.channels // self.rf, + self.channels // self.rf, + kernel_size=(3, 1), + stride=(1, 1), + padding=(1, 0)) + + self.q_layer_1x7 = nn.Conv2D( + self.channels // self.rf, + self.channels // self.rf, + kernel_size=(1, 7), + stride=(1, 1), + padding=(0, 3)) + self.q_layer_1x5 = nn.Conv2D( + self.channels // self.rf, + self.channels // self.rf, + kernel_size=(1, 5), + stride=(1, 1), + padding=(0, 2)) + self.q_layer_1x3 = nn.Conv2D( + self.channels // self.rf, + self.channels // self.rf, + kernel_size=(1, 3), + stride=(1, 1), + padding=(0, 1)) + + # base + self.c_layer_7x7 = nn.Conv2D( + self.channels // self.rf, + self.channels // self.rf, + kernel_size=(7, 7), + stride=(1, 1), + padding=(3, 3)) + self.c_layer_5x5 = 
nn.Conv2D( + self.channels // self.rf, + self.channels // self.rf, + kernel_size=(5, 5), + stride=(1, 1), + padding=(2, 2)) + self.c_layer_3x3 = nn.Conv2D( + self.channels // self.rf, + self.channels // self.rf, + kernel_size=(3, 3), + stride=(1, 1), + padding=(1, 1)) + + self.bn = nn.BatchNorm2D(self.channels) + self.relu = nn.ReLU() + + def forward(self, x): + x_new = self.conv1x1_reduce_channel(x) + + x_7_c = self.c_layer_7x7(x_new) + x_7_v = self.v_layer_7x1(x_new) + x_7_q = self.q_layer_1x7(x_new) + x_7 = x_7_c + x_7_v + x_7_q + + x_5_c = self.c_layer_5x5(x_7) + x_5_v = self.v_layer_5x1(x_7) + x_5_q = self.q_layer_1x5(x_7) + x_5 = x_5_c + x_5_v + x_5_q + + x_3_c = self.c_layer_3x3(x_5) + x_3_v = self.v_layer_3x1(x_5) + x_3_q = self.q_layer_1x3(x_5) + x_3 = x_3_c + x_3_v + x_3_q + + x_relation = self.conv1x1_return_channel(x_3) + + x_relation = self.bn(x_relation) + x_relation = self.relu(x_relation) + + return x + x_relation + + +def build_intraclblock_list(num_block): + IntraCLBlock_list = nn.LayerList() + for i in range(num_block): + IntraCLBlock_list.append(IntraCLBlock()) + + return IntraCLBlock_list \ No newline at end of file diff --git a/ppocr/modeling/necks/rnn.py b/ppocr/modeling/necks/rnn.py index 33be9400b34cb535d260881748e179c3df106caa..a195a6217ae1246316ef441d7f4772ca296914c9 100644 --- a/ppocr/modeling/necks/rnn.py +++ b/ppocr/modeling/necks/rnn.py @@ -47,8 +47,10 @@ class EncoderWithRNN(nn.Layer): x, _ = self.lstm(x) return x + class BidirectionalLSTM(nn.Layer): - def __init__(self, input_size, + def __init__(self, + input_size, hidden_size, output_size=None, num_layers=1, @@ -58,39 +60,46 @@ class BidirectionalLSTM(nn.Layer): with_linear=False): super(BidirectionalLSTM, self).__init__() self.with_linear = with_linear - self.rnn = nn.LSTM(input_size, - hidden_size, - num_layers=num_layers, - dropout=dropout, - direction=direction, - time_major=time_major) + self.rnn = nn.LSTM( + input_size, + hidden_size, + num_layers=num_layers, + dropout=dropout, + direction=direction, + time_major=time_major) # text recognition the specified structure LSTM with linear if self.with_linear: self.linear = nn.Linear(hidden_size * 2, output_size) def forward(self, input_feature): - recurrent, _ = self.rnn(input_feature) # batch_size x T x input_size -> batch_size x T x (2*hidden_size) + recurrent, _ = self.rnn( + input_feature + ) # batch_size x T x input_size -> batch_size x T x (2*hidden_size) if self.with_linear: - output = self.linear(recurrent) # batch_size x T x output_size + output = self.linear(recurrent) # batch_size x T x output_size return output return recurrent + class EncoderWithCascadeRNN(nn.Layer): - def __init__(self, in_channels, hidden_size, out_channels, num_layers=2, with_linear=False): + def __init__(self, + in_channels, + hidden_size, + out_channels, + num_layers=2, + with_linear=False): super(EncoderWithCascadeRNN, self).__init__() self.out_channels = out_channels[-1] - self.encoder = nn.LayerList( - [BidirectionalLSTM( - in_channels if i == 0 else out_channels[i - 1], - hidden_size, - output_size=out_channels[i], - num_layers=1, - direction='bidirectional', - with_linear=with_linear) - for i in range(num_layers)] - ) - + self.encoder = nn.LayerList([ + BidirectionalLSTM( + in_channels if i == 0 else out_channels[i - 1], + hidden_size, + output_size=out_channels[i], + num_layers=1, + direction='bidirectional', + with_linear=with_linear) for i in range(num_layers) + ]) def forward(self, x): for i, l in enumerate(self.encoder): @@ -130,12 +139,17 @@ class 
EncoderWithSVTR(nn.Layer): drop_rate=0.1, attn_drop_rate=0.1, drop_path=0., + kernel_size=[3, 3], qk_scale=None): super(EncoderWithSVTR, self).__init__() self.depth = depth self.use_guide = use_guide self.conv1 = ConvBNLayer( - in_channels, in_channels // 8, padding=1, act=nn.Swish) + in_channels, + in_channels // 8, + kernel_size=kernel_size, + padding=[kernel_size[0] // 2, kernel_size[1] // 2], + act=nn.Swish) self.conv2 = ConvBNLayer( in_channels // 8, hidden_dims, kernel_size=1, act=nn.Swish) @@ -161,7 +175,11 @@ class EncoderWithSVTR(nn.Layer): hidden_dims, in_channels, kernel_size=1, act=nn.Swish) # last conv-nxn, the input is concat of input tensor and conv3 output tensor self.conv4 = ConvBNLayer( - 2 * in_channels, in_channels // 8, padding=1, act=nn.Swish) + 2 * in_channels, + in_channels // 8, + kernel_size=kernel_size, + padding=[kernel_size[0] // 2, kernel_size[1] // 2], + act=nn.Swish) self.conv1x1 = ConvBNLayer( in_channels // 8, dims, kernel_size=1, act=nn.Swish) diff --git a/ppocr/modeling/transforms/gaspin_transformer.py b/ppocr/modeling/transforms/gaspin_transformer.py index f4719eb2162a02141620586bcb6a849ae16f3b62..7afa21609336c6914c92b2d7b2b291f7e0fbffdd 100644 --- a/ppocr/modeling/transforms/gaspin_transformer.py +++ b/ppocr/modeling/transforms/gaspin_transformer.py @@ -280,5 +280,13 @@ class GA_SPIN_Transformer(nn.Layer): x = self.sp_net(x, sp_weight, offsets, lambda_color) if self.stn: + is_fp16 = False + if build_P_prime_reshape.dtype != paddle.float32: + data_type = build_P_prime_reshape.dtype + x = x.cast(paddle.float32) + build_P_prime_reshape = build_P_prime_reshape.cast(paddle.float32) + is_fp16 = True x = F.grid_sample(x=x, grid=build_P_prime_reshape, padding_mode='border') + if is_fp16: + x = x.cast(data_type) return x diff --git a/ppocr/modeling/transforms/tbsrn.py b/ppocr/modeling/transforms/tbsrn.py index ee119003600b0515feb6fd1049e2c91565528b7d..e3e77bd36aa1047812a0e41de30a1541c984b2a6 100644 --- a/ppocr/modeling/transforms/tbsrn.py +++ b/ppocr/modeling/transforms/tbsrn.py @@ -45,21 +45,24 @@ def positionalencoding2d(d_model, height, width): pe = paddle.zeros([d_model, height, width]) # Each dimension use half of d_model d_model = int(d_model / 2) - div_term = paddle.exp(paddle.arange(0., d_model, 2) * - -(math.log(10000.0) / d_model)) + div_term = paddle.exp( + paddle.arange(0., d_model, 2, dtype='int64') * -(math.log(10000.0) / d_model)) pos_w = paddle.arange(0., width, dtype='float32').unsqueeze(1) pos_h = paddle.arange(0., height, dtype='float32').unsqueeze(1) - pe[0:d_model:2, :, :] = paddle.sin(pos_w * div_term).transpose([1, 0]).unsqueeze(1).tile([1, height, 1]) - pe[1:d_model:2, :, :] = paddle.cos(pos_w * div_term).transpose([1, 0]).unsqueeze(1).tile([1, height, 1]) - pe[d_model::2, :, :] = paddle.sin(pos_h * div_term).transpose([1, 0]).unsqueeze(2).tile([1, 1, width]) - pe[d_model + 1::2, :, :] = paddle.cos(pos_h * div_term).transpose([1, 0]).unsqueeze(2).tile([1, 1, width]) + pe[0:d_model:2, :, :] = paddle.sin(pos_w * div_term).transpose( + [1, 0]).unsqueeze(1).tile([1, height, 1]) + pe[1:d_model:2, :, :] = paddle.cos(pos_w * div_term).transpose( + [1, 0]).unsqueeze(1).tile([1, height, 1]) + pe[d_model::2, :, :] = paddle.sin(pos_h * div_term).transpose( + [1, 0]).unsqueeze(2).tile([1, 1, width]) + pe[d_model + 1::2, :, :] = paddle.cos(pos_h * div_term).transpose( + [1, 0]).unsqueeze(2).tile([1, 1, width]) return pe class FeatureEnhancer(nn.Layer): - def __init__(self): super(FeatureEnhancer, self).__init__() @@ -77,13 +80,16 @@ class 
FeatureEnhancer(nn.Layer): global_info: (batch, embedding_size, 1, 1) conv_feature: (batch, channel, H, W) ''' - batch = conv_feature.shape[0] - position2d = positionalencoding2d(64, 16, 64).cast('float32').unsqueeze(0).reshape([1, 64, 1024]) + batch = paddle.shape(conv_feature)[0] + position2d = positionalencoding2d( + 64, 16, 64).cast('float32').unsqueeze(0).reshape([1, 64, 1024]) position2d = position2d.tile([batch, 1, 1]) - conv_feature = paddle.concat([conv_feature, position2d], 1) # batch, 128(64+64), 32, 128 + conv_feature = paddle.concat([conv_feature, position2d], + 1) # batch, 128(64+64), 32, 128 result = conv_feature.transpose([0, 2, 1]) origin_result = result - result = self.mul_layernorm1(origin_result + self.multihead(result, result, result, mask=None)[0]) + result = self.mul_layernorm1(origin_result + self.multihead( + result, result, result, mask=None)[0]) origin_result = result result = self.mul_layernorm3(origin_result + self.pff(result)) result = self.linear(result) @@ -124,23 +130,35 @@ class TBSRN(nn.Layer): assert math.log(scale_factor, 2) % 1 == 0 upsample_block_num = int(math.log(scale_factor, 2)) self.block1 = nn.Sequential( - nn.Conv2D(in_planes, 2 * hidden_units, kernel_size=9, padding=4), + nn.Conv2D( + in_planes, 2 * hidden_units, kernel_size=9, padding=4), nn.PReLU() # nn.ReLU() ) self.srb_nums = srb_nums for i in range(srb_nums): - setattr(self, 'block%d' % (i + 2), RecurrentResidualBlock(2 * hidden_units)) - - setattr(self, 'block%d' % (srb_nums + 2), - nn.Sequential( - nn.Conv2D(2 * hidden_units, 2 * hidden_units, kernel_size=3, padding=1), - nn.BatchNorm2D(2 * hidden_units) - )) + setattr(self, 'block%d' % (i + 2), + RecurrentResidualBlock(2 * hidden_units)) + + setattr( + self, + 'block%d' % (srb_nums + 2), + nn.Sequential( + nn.Conv2D( + 2 * hidden_units, + 2 * hidden_units, + kernel_size=3, + padding=1), + nn.BatchNorm2D(2 * hidden_units))) # self.non_local = NonLocalBlock2D(64, 64) - block_ = [UpsampleBLock(2 * hidden_units, 2) for _ in range(upsample_block_num)] - block_.append(nn.Conv2D(2 * hidden_units, in_planes, kernel_size=9, padding=4)) + block_ = [ + UpsampleBLock(2 * hidden_units, 2) + for _ in range(upsample_block_num) + ] + block_.append( + nn.Conv2D( + 2 * hidden_units, in_planes, kernel_size=9, padding=4)) setattr(self, 'block%d' % (srb_nums + 3), nn.Sequential(*block_)) self.tps_inputsize = [height // scale_factor, width // scale_factor] tps_outputsize = [height // scale_factor, width // scale_factor] @@ -164,7 +182,8 @@ class TBSRN(nn.Layer): self.english_dict = {} for index in range(len(self.english_alphabet)): self.english_dict[self.english_alphabet[index]] = index - transformer = Transformer(alphabet='-0123456789abcdefghijklmnopqrstuvwxyz') + transformer = Transformer( + alphabet='-0123456789abcdefghijklmnopqrstuvwxyz') self.transformer = transformer for param in self.transformer.parameters(): param.trainable = False @@ -219,10 +238,10 @@ class TBSRN(nn.Layer): # add transformer label = [str_filt(i, 'lower') + '-' for i in x[2]] length_tensor, input_tensor, text_gt = self.label_encoder(label) - hr_pred, word_attention_map_gt, hr_correct_list = self.transformer(hr_img, length_tensor, - input_tensor) - sr_pred, word_attention_map_pred, sr_correct_list = self.transformer(sr_img, length_tensor, - input_tensor) + hr_pred, word_attention_map_gt, hr_correct_list = self.transformer( + hr_img, length_tensor, input_tensor) + sr_pred, word_attention_map_pred, sr_correct_list = self.transformer( + sr_img, length_tensor, input_tensor) 
output["hr_img"] = hr_img output["hr_pred"] = hr_pred output["text_gt"] = text_gt @@ -257,8 +276,8 @@ class RecurrentResidualBlock(nn.Layer): residual = self.conv2(residual) residual = self.bn2(residual) - size = residual.shape + size = paddle.shape(residual) residual = residual.reshape([size[0], size[1], -1]) residual = self.feature_enhancer(residual) residual = residual.reshape([size[0], size[1], size[2], size[3]]) - return x + residual \ No newline at end of file + return x + residual diff --git a/ppocr/modeling/transforms/tps.py b/ppocr/modeling/transforms/tps.py index 9bdab0f85112b90d8da959dce4e258188a812052..ac5ce998b00c92042517f48b3bed81756c230b51 100644 --- a/ppocr/modeling/transforms/tps.py +++ b/ppocr/modeling/transforms/tps.py @@ -304,5 +304,14 @@ class TPS(nn.Layer): batch_P_prime = self.grid_generator(batch_C_prime, image.shape[2:]) batch_P_prime = batch_P_prime.reshape( [-1, image.shape[2], image.shape[3], 2]) + is_fp16 = False + if batch_P_prime.dtype != paddle.float32: + data_type = batch_P_prime.dtype + image = image.cast(paddle.float32) + batch_P_prime = batch_P_prime.cast(paddle.float32) + is_fp16 = True batch_I_r = F.grid_sample(x=image, grid=batch_P_prime) + if is_fp16: + batch_I_r = batch_I_r.cast(data_type) + return batch_I_r diff --git a/ppocr/modeling/transforms/tps_spatial_transformer.py b/ppocr/modeling/transforms/tps_spatial_transformer.py index e7ec2c848f192d766722f824962a7f8d0fed41f9..35b1d8bf7164a1a0dd6e5905dc0c91112008c3e0 100644 --- a/ppocr/modeling/transforms/tps_spatial_transformer.py +++ b/ppocr/modeling/transforms/tps_spatial_transformer.py @@ -29,12 +29,28 @@ import itertools def grid_sample(input, grid, canvas=None): input.stop_gradient = False + + is_fp16 = False + if grid.dtype != paddle.float32: + data_type = grid.dtype + input = input.cast(paddle.float32) + grid = grid.cast(paddle.float32) + is_fp16 = True output = F.grid_sample(input, grid) + if is_fp16: + output = output.cast(data_type) + grid = grid.cast(data_type) + if canvas is None: return output else: input_mask = paddle.ones(shape=input.shape) + if is_fp16: + input_mask = input_mask.cast(paddle.float32) + grid = grid.cast(paddle.float32) output_mask = F.grid_sample(input_mask, grid) + if is_fp16: + output_mask = output_mask.cast(data_type) padded_output = output * output_mask + canvas * (1 - output_mask) return padded_output @@ -140,7 +156,9 @@ class TPSSpatialTransformer(nn.Layer): padding_matrix = paddle.expand( self.padding_matrix, shape=[batch_size, 3, 2]) - Y = paddle.concat([source_control_points, padding_matrix], 1) + Y = paddle.concat([ + source_control_points.astype(padding_matrix.dtype), padding_matrix + ], 1) mapping_matrix = paddle.matmul(self.inverse_kernel, Y) source_coordinate = paddle.matmul(self.target_coordinate_repr, mapping_matrix) @@ -153,4 +171,4 @@ class TPSSpatialTransformer(nn.Layer): # the input to grid_sample is normalized [-1, 1], but what we get is [0, 1] grid = 2.0 * grid - 1.0 output_maps = grid_sample(input, grid, canvas=None) - return output_maps, source_coordinate \ No newline at end of file + return output_maps, source_coordinate diff --git a/ppocr/optimizer/optimizer.py b/ppocr/optimizer/optimizer.py index 144f011c79ec2303b7fbc73ac078afe3ce92c255..ffe72d7db309ab832a258dcc73916f9fa4485c2b 100644 --- a/ppocr/optimizer/optimizer.py +++ b/ppocr/optimizer/optimizer.py @@ -84,8 +84,7 @@ class Adam(object): if self.group_lr: if self.training_step == 'LF_2': import paddle - if isinstance(model, paddle.fluid.dygraph.parallel. 
- DataParallel): # multi gpu + if isinstance(model, paddle.DataParallel): # multi gpu mlm = model._layers.head.MLM_VRM.MLM.parameters() pre_mlm_pp = model._layers.head.MLM_VRM.Prediction.pp_share.parameters( ) diff --git a/ppocr/postprocess/__init__.py b/ppocr/postprocess/__init__.py index 36a3152f2f2d68ed0884bd415844d209d850f5ca..c89345e70b3dcf22b292ebf1250bf3f258a3355c 100644 --- a/ppocr/postprocess/__init__.py +++ b/ppocr/postprocess/__init__.py @@ -28,7 +28,7 @@ from .fce_postprocess import FCEPostProcess from .rec_postprocess import CTCLabelDecode, AttnLabelDecode, SRNLabelDecode, \ DistillationCTCLabelDecode, NRTRLabelDecode, SARLabelDecode, \ SEEDLabelDecode, PRENLabelDecode, ViTSTRLabelDecode, ABINetLabelDecode, \ - SPINLabelDecode, VLLabelDecode, RFLLabelDecode + SPINLabelDecode, VLLabelDecode, RFLLabelDecode, SATRNLabelDecode from .cls_postprocess import ClsPostProcess from .pg_postprocess import PGPostProcess from .vqa_token_ser_layoutlm_postprocess import VQASerTokenLayoutLMPostProcess, DistillationSerPostProcess @@ -52,7 +52,8 @@ def build_post_process(config, global_config=None): 'TableMasterLabelDecode', 'SPINLabelDecode', 'DistillationSerPostProcess', 'DistillationRePostProcess', 'VLLabelDecode', 'PicoDetPostProcess', 'CTPostProcess', - 'RFLLabelDecode', 'DRRGPostprocess', 'CANLabelDecode' + 'RFLLabelDecode', 'DRRGPostprocess', 'CANLabelDecode', + 'SATRNLabelDecode' ] if config['name'] == 'PSEPostProcess': diff --git a/ppocr/postprocess/db_postprocess.py b/ppocr/postprocess/db_postprocess.py index dfe107816c195b36bf06568843b008bf66ff24c7..244825b76a47162419b4ae68103b182331be1791 100755 --- a/ppocr/postprocess/db_postprocess.py +++ b/ppocr/postprocess/db_postprocess.py @@ -144,9 +144,9 @@ class DBPostProcess(object): np.round(box[:, 0] / width * dest_width), 0, dest_width) box[:, 1] = np.clip( np.round(box[:, 1] / height * dest_height), 0, dest_height) - boxes.append(box.astype(np.int16)) + boxes.append(box.astype("int32")) scores.append(score) - return np.array(boxes, dtype=np.int16), scores + return np.array(boxes, dtype="int32"), scores def unclip(self, box, unclip_ratio): poly = Polygon(box) @@ -185,15 +185,15 @@ class DBPostProcess(object): ''' h, w = bitmap.shape[:2] box = _box.copy() - xmin = np.clip(np.floor(box[:, 0].min()).astype(np.int), 0, w - 1) - xmax = np.clip(np.ceil(box[:, 0].max()).astype(np.int), 0, w - 1) - ymin = np.clip(np.floor(box[:, 1].min()).astype(np.int), 0, h - 1) - ymax = np.clip(np.ceil(box[:, 1].max()).astype(np.int), 0, h - 1) + xmin = np.clip(np.floor(box[:, 0].min()).astype("int32"), 0, w - 1) + xmax = np.clip(np.ceil(box[:, 0].max()).astype("int32"), 0, w - 1) + ymin = np.clip(np.floor(box[:, 1].min()).astype("int32"), 0, h - 1) + ymax = np.clip(np.ceil(box[:, 1].max()).astype("int32"), 0, h - 1) mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8) box[:, 0] = box[:, 0] - xmin box[:, 1] = box[:, 1] - ymin - cv2.fillPoly(mask, box.reshape(1, -1, 2).astype(np.int32), 1) + cv2.fillPoly(mask, box.reshape(1, -1, 2).astype("int32"), 1) return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0] def box_score_slow(self, bitmap, contour): @@ -214,7 +214,7 @@ class DBPostProcess(object): contour[:, 0] = contour[:, 0] - xmin contour[:, 1] = contour[:, 1] - ymin - cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype(np.int32), 1) + cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype("int32"), 1) return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0] def __call__(self, outs_dict, shape_list): diff --git 
a/ppocr/postprocess/drrg_postprocess.py b/ppocr/postprocess/drrg_postprocess.py index 353081c9d4d0fa1d04d995c84445445767276cc8..56fd034f7c18ee31e8af6c5c72461bcd98021809 100644 --- a/ppocr/postprocess/drrg_postprocess.py +++ b/ppocr/postprocess/drrg_postprocess.py @@ -68,7 +68,7 @@ def graph_propagation(edges, scores, text_comps, edge_len_thr=50.): score_dict[edge[0], edge[1]] = scores[i] nodes = np.sort(np.unique(edges.flatten())) - mapping = -1 * np.ones((np.max(nodes) + 1), dtype=np.int) + mapping = -1 * np.ones((np.max(nodes) + 1), dtype=np.int32) mapping[nodes] = np.arange(nodes.shape[0]) order_inds = mapping[edges] vertices = [Node(node) for node in nodes] @@ -93,9 +93,8 @@ def connected_components(nodes, score_dict, link_thr): while node_queue: node = node_queue.pop(0) neighbors = set([ - neighbor for neighbor in node.links - if score_dict[tuple(sorted([node.ind, neighbor.ind]))] >= - link_thr + neighbor for neighbor in node.links if + score_dict[tuple(sorted([node.ind, neighbor.ind]))] >= link_thr ]) neighbors.difference_update(cluster) nodes.difference_update(neighbors) diff --git a/ppocr/postprocess/east_postprocess.py b/ppocr/postprocess/east_postprocess.py index c194c81c6911aac0f9210109c37b76b44532e9c4..c1af3eccef84d0044c7962094b85ad5f4399e09e 100755 --- a/ppocr/postprocess/east_postprocess.py +++ b/ppocr/postprocess/east_postprocess.py @@ -22,6 +22,7 @@ import cv2 import paddle import os +from ppocr.utils.utility import check_install import sys @@ -78,11 +79,12 @@ class EASTPostProcess(object): boxes[:, 8] = score_map[xy_text[:, 0], xy_text[:, 1]] try: + check_install('lanms', 'lanms-nova') import lanms boxes = lanms.merge_quadrangle_n9(boxes, nms_thresh) except: print( - 'you should install lanms by pip3 install lanms-nova to speed up nms_locality' + 'You should install lanms by pip3 install lanms-nova to speed up nms_locality' ) boxes = nms_locality(boxes.astype(np.float64), nms_thresh) if boxes.shape[0] == 0: diff --git a/ppocr/postprocess/fce_postprocess.py b/ppocr/postprocess/fce_postprocess.py index 8e0716f9f2f3a7cb585fa40a2e2a27aecb606a9b..959f86efa4c3180a1fe4e6e2115bbf32966a7f09 100755 --- a/ppocr/postprocess/fce_postprocess.py +++ b/ppocr/postprocess/fce_postprocess.py @@ -31,7 +31,7 @@ def fill_hole(input_mask): mask = np.zeros((h + 4, w + 4), np.uint8) cv2.floodFill(canvas, mask, (0, 0), 1) - canvas = canvas[1:h + 1, 1:w + 1].astype(np.bool) + canvas = canvas[1:h + 1, 1:w + 1].astype(np.bool_) return ~canvas | input_mask @@ -234,7 +234,7 @@ class FCEPostProcess(object): poly = np.array(boundary[:-1]).reshape(-1, 2).astype(np.float32) score = boundary[-1] points = cv2.boxPoints(cv2.minAreaRect(poly)) - points = np.int0(points) + points = np.int64(points) new_boundaries.append(points.reshape(-1).tolist() + [score]) boundaries = new_boundaries diff --git a/ppocr/postprocess/rec_postprocess.py b/ppocr/postprocess/rec_postprocess.py index fbf8b93e3d11121c99ce5b2dcbf2149e15453d4a..ce2e9f8b579f2e2fb6d25390db71eb4e45ddeef3 100644 --- a/ppocr/postprocess/rec_postprocess.py +++ b/ppocr/postprocess/rec_postprocess.py @@ -67,7 +67,66 @@ class BaseRecLabelDecode(object): def add_special_char(self, dict_character): return dict_character - def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + def get_word_info(self, text, selection): + """ + Group the decoded characters and record the corresponding decoded positions. 
+
+        Args:
+            text: the decoded text
+            selection: the bool array that identifies which columns of features are decoded as non-separated characters
+        Returns:
+            word_list: list of the grouped words
+            word_col_list: list of decoding positions corresponding to each character in the grouped word
+            state_list: list of markers to identify the type of grouped words, including two types of grouped words:
+                        - 'cn': continuous Chinese characters (e.g., 你好啊)
+                        - 'en&num': continuous English characters (e.g., hello), numbers (e.g., 123, 1.123), or a mix of them connected by '-' (e.g., VGG-16)
+                        The remaining characters in text are treated as separators between groups (e.g., space, '(', ')', etc.).
+        """
+        state = None
+        word_content = []
+        word_col_content = []
+        word_list = []
+        word_col_list = []
+        state_list = []
+        valid_col = np.where(selection==True)[0]
+
+        for c_i, char in enumerate(text):
+            if '\u4e00' <= char <= '\u9fff':
+                c_state = 'cn'
+            elif bool(re.search('[a-zA-Z0-9]', char)):
+                c_state = 'en&num'
+            else:
+                c_state = 'splitter'
+
+            if char == '.' and state == 'en&num' and c_i + 1 < len(text) and bool(re.search('[0-9]', text[c_i+1])):  # grouping floating number
+                c_state = 'en&num'
+            if char == '-' and state == "en&num":  # grouping word with '-', such as 'state-of-the-art'
+                c_state = 'en&num'
+
+            if state == None:
+                state = c_state
+
+            if state != c_state:
+                if len(word_content) != 0:
+                    word_list.append(word_content)
+                    word_col_list.append(word_col_content)
+                    state_list.append(state)
+                    word_content = []
+                    word_col_content = []
+                state = c_state
+
+            if state != "splitter":
+                word_content.append(char)
+                word_col_content.append(valid_col[c_i])
+
+        if len(word_content) != 0:
+            word_list.append(word_content)
+            word_col_list.append(word_col_content)
+            state_list.append(state)
+
+        return word_list, word_col_list, state_list
+
+    def decode(self, text_index, text_prob=None, is_remove_duplicate=False, return_word_box=False):
        """ convert text-index into text-label.
""" result_list = [] ignored_tokens = self.get_ignored_tokens() @@ -95,8 +154,12 @@ class BaseRecLabelDecode(object): if self.reverse: # for arabic rec text = self.pred_reverse(text) - - result_list.append((text, np.mean(conf_list).tolist())) + + if return_word_box: + word_list, word_col_list, state_list = self.get_word_info(text, selection) + result_list.append((text, np.mean(conf_list).tolist(), [len(text_index[batch_idx]), word_list, word_col_list, state_list])) + else: + result_list.append((text, np.mean(conf_list).tolist())) return result_list def get_ignored_tokens(self): @@ -111,14 +174,19 @@ class CTCLabelDecode(BaseRecLabelDecode): super(CTCLabelDecode, self).__init__(character_dict_path, use_space_char) - def __call__(self, preds, label=None, *args, **kwargs): + def __call__(self, preds, label=None, return_word_box=False, *args, **kwargs): if isinstance(preds, tuple) or isinstance(preds, list): preds = preds[-1] if isinstance(preds, paddle.Tensor): preds = preds.numpy() preds_idx = preds.argmax(axis=2) preds_prob = preds.max(axis=2) - text = self.decode(preds_idx, preds_prob, is_remove_duplicate=True) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=True, return_word_box=return_word_box) + if return_word_box: + for rec_idx, rec in enumerate(text): + wh_ratio = kwargs['wh_ratio_list'][rec_idx] + max_wh_ratio = kwargs['max_wh_ratio'] + rec[2][0] = rec[2][0]*(wh_ratio/max_wh_ratio) if label is None: return text label = self.decode(label) @@ -568,6 +636,82 @@ class SARLabelDecode(BaseRecLabelDecode): return [self.padding_idx] +class SATRNLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(SATRNLabelDecode, self).__init__(character_dict_path, + use_space_char) + + self.rm_symbol = kwargs.get('rm_symbol', False) + + def add_special_char(self, dict_character): + beg_end_str = "" + unknown_str = "" + padding_str = "" + dict_character = dict_character + [unknown_str] + self.unknown_idx = len(dict_character) - 1 + dict_character = dict_character + [beg_end_str] + self.start_idx = len(dict_character) - 1 + self.end_idx = len(dict_character) - 1 + dict_character = dict_character + [padding_str] + self.padding_idx = len(dict_character) - 1 + return dict_character + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. 
""" + result_list = [] + ignored_tokens = self.get_ignored_tokens() + + batch_size = len(text_index) + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if text_index[batch_idx][idx] in ignored_tokens: + continue + if int(text_index[batch_idx][idx]) == int(self.end_idx): + if text_prob is None and idx == 0: + continue + else: + break + if is_remove_duplicate: + # only for predict + if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ + batch_idx][idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + text = ''.join(char_list) + if self.rm_symbol: + comp = re.compile('[^A-Z^a-z^0-9^\u4e00-\u9fa5]') + text = text.lower() + text = comp.sub('', text) + result_list.append((text, np.mean(conf_list).tolist())) + return result_list + + def __call__(self, preds, label=None, *args, **kwargs): + if isinstance(preds, paddle.Tensor): + preds = preds.numpy() + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + + if label is None: + return text + label = self.decode(label, is_remove_duplicate=False) + return text, label + + def get_ignored_tokens(self): + return [self.padding_idx] + + class DistillationSARLabelDecode(SARLabelDecode): """ Convert @@ -723,7 +867,7 @@ class NRTRLabelDecode(BaseRecLabelDecode): else: conf_list.append(1) text = ''.join(char_list) - result_list.append((text.lower(), np.mean(conf_list).tolist())) + result_list.append((text, np.mean(conf_list).tolist())) return result_list @@ -891,7 +1035,7 @@ class VLLabelDecode(BaseRecLabelDecode): ) + length[i])].topk(1)[0][:, 0] preds_prob = paddle.exp( paddle.log(preds_prob).sum() / (preds_prob.shape[0] + 1e-6)) - text.append((preds_text, preds_prob.numpy()[0])) + text.append((preds_text, float(preds_prob))) if label is None: return text label = self.decode(label) diff --git a/ppocr/postprocess/sast_postprocess.py b/ppocr/postprocess/sast_postprocess.py index bee75c05b1a3ea59193d566f91378c96797f533b..594bf17d6a0db2ebee17e7476834ce7b6b4289e6 100755 --- a/ppocr/postprocess/sast_postprocess.py +++ b/ppocr/postprocess/sast_postprocess.py @@ -141,6 +141,8 @@ class SASTPostProcess(object): def nms(self, dets): if self.is_python35: + from ppocr.utils.utility import check_install + check_install('lanms', 'lanms-nova') import lanms dets = lanms.merge_quadrangle_n9(dets, self.nms_thresh) else: diff --git a/ppocr/utils/e2e_metric/Deteval.py b/ppocr/utils/e2e_metric/Deteval.py index 6ce56eda2aa9f38fdc712d49ae64945c558b418d..c2a4383eed38acc4e4c7effea2aa688007a0c24a 100755 --- a/ppocr/utils/e2e_metric/Deteval.py +++ b/ppocr/utils/e2e_metric/Deteval.py @@ -15,7 +15,9 @@ import json import numpy as np import scipy.io as io -import Polygon as plg + +from ppocr.utils.utility import check_install + from ppocr.utils.e2e_metric.polygon_fast import iod, area_of_intersection, area @@ -275,6 +277,8 @@ def get_score_C(gt_label, text, pred_bboxes): """ get score for CentripetalText (CT) prediction. 
""" + check_install("Polygon", "Polygon3") + import Polygon as plg def gt_reading_mod(gt_label, text): """This helper reads groundtruths from mat files""" diff --git a/ppocr/utils/gen_label.py b/ppocr/utils/gen_label.py index fb78bd38bcfc1a59cac48a28bbb655ecb83bcb3f..56d75544dbee596a87343c90320b0ea3178e6b28 100644 --- a/ppocr/utils/gen_label.py +++ b/ppocr/utils/gen_label.py @@ -29,7 +29,7 @@ def gen_rec_label(input_path, out_label): def gen_det_label(root_path, input_dir, out_label): with open(out_label, 'w') as out_file: for label_file in os.listdir(input_dir): - img_path = root_path + label_file[3:-4] + ".jpg" + img_path = os.path.join(root_path, label_file[3:-4] + ".jpg") label = [] with open( os.path.join(input_dir, label_file), 'r', diff --git a/ppocr/utils/network.py b/ppocr/utils/network.py index 080a5d160116cfdd3b255a883525281d97ee9cc9..f2cd690e12fd06f2749320f1319fde9de8ebe18d 100644 --- a/ppocr/utils/network.py +++ b/ppocr/utils/network.py @@ -20,6 +20,8 @@ from tqdm import tqdm from ppocr.utils.logging import get_logger +MODELS_DIR = os.path.expanduser("~/.paddleocr/models/") + def download_with_progressbar(url, save_path): logger = get_logger() @@ -67,6 +69,18 @@ def maybe_download(model_storage_directory, url): os.remove(tmp_path) +def maybe_download_params(model_path): + if os.path.exists(model_path) or not is_link(model_path): + return model_path + else: + url = model_path + tmp_path = os.path.join(MODELS_DIR, url.split('/')[-1]) + print('download {} to {}'.format(url, tmp_path)) + os.makedirs(MODELS_DIR, exist_ok=True) + download_with_progressbar(url, tmp_path) + return tmp_path + + def is_link(s): return s is not None and s.startswith('http') diff --git a/ppocr/utils/profiler.py b/ppocr/utils/profiler.py index c4e28bc6bea9ca912a0786d879a48ec0349e7698..629ef4ef054a050afd1bc0ce819cb664b9503e9f 100644 --- a/ppocr/utils/profiler.py +++ b/ppocr/utils/profiler.py @@ -13,7 +13,7 @@ # limitations under the License. import sys -import paddle +import paddle.profiler as profiler # A global variable to record the number of calling times for profiler # functions. It is used to specify the tracing range of training steps. @@ -21,7 +21,7 @@ _profiler_step_id = 0 # A global variable to avoid parsing from string every time. _profiler_options = None - +_prof = None class ProfilerOptions(object): ''' @@ -31,6 +31,7 @@ class ProfilerOptions(object): "profile_path=model.profile" "batch_range=[50, 60]; profile_path=model.profile" "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile" + ProfilerOptions supports following key-value pair: batch_range - a integer list, e.g. [100, 110]. state - a string, the optional values are 'CPU', 'GPU' or 'All'. @@ -52,7 +53,8 @@ class ProfilerOptions(object): 'sorted_key': 'total', 'tracer_option': 'Default', 'profile_path': '/tmp/profile', - 'exit_on_finished': True + 'exit_on_finished': True, + 'timer_only': True } self._parse_from_string(options_str) @@ -71,6 +73,8 @@ class ProfilerOptions(object): 'state', 'sorted_key', 'tracer_option', 'profile_path' ]: self._options[key] = value + elif key == 'timer_only': + self._options[key] = value def __getitem__(self, name): if self._options.get(name, None) is None: @@ -84,7 +88,6 @@ def add_profiler_step(options_str=None): Enable the operator-level timing using PaddlePaddle's profiler. The profiler uses a independent variable to count the profiler steps. One call of this function is treated as a profiler step. - Args: profiler_options - a string to initialize the ProfilerOptions. 
Default is None, and the profiler is disabled. @@ -92,18 +95,33 @@ def add_profiler_step(options_str=None): if options_str is None: return + global _prof global _profiler_step_id global _profiler_options if _profiler_options is None: _profiler_options = ProfilerOptions(options_str) - - if _profiler_step_id == _profiler_options['batch_range'][0]: - paddle.utils.profiler.start_profiler( - _profiler_options['state'], _profiler_options['tracer_option']) - elif _profiler_step_id == _profiler_options['batch_range'][1]: - paddle.utils.profiler.stop_profiler(_profiler_options['sorted_key'], - _profiler_options['profile_path']) + # profile : https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/performance_improving/profiling_model.html#chakanxingnengshujudetongjibiaodan + # timer_only = True only the model's throughput and time overhead are displayed + # timer_only = False calling summary can print a statistical form that presents performance data from different perspectives. + # timer_only = False the output Timeline information can be found in the profiler_log directory + if _prof is None: + _timer_only = str(_profiler_options['timer_only']) == str(True) + _prof = profiler.Profiler( + scheduler = (_profiler_options['batch_range'][0], _profiler_options['batch_range'][1]), + on_trace_ready = profiler.export_chrome_tracing('./profiler_log'), + timer_only = _timer_only) + _prof.start() + else: + _prof.step() + + if _profiler_step_id == _profiler_options['batch_range'][1]: + _prof.stop() + _prof.summary( + op_detail=True, + thread_sep=False, + time_unit='ms') + _prof = None if _profiler_options['exit_on_finished']: sys.exit(0) diff --git a/ppocr/utils/save_load.py b/ppocr/utils/save_load.py index aa65f290c0a5f4f13b3103fb4404815e2ae74a88..e6a81c48dfd43245f13f69e1a5679d08838ca603 100644 --- a/ppocr/utils/save_load.py +++ b/ppocr/utils/save_load.py @@ -24,6 +24,7 @@ import six import paddle from ppocr.utils.logging import get_logger +from ppocr.utils.network import maybe_download_params __all__ = ['load_model'] @@ -145,6 +146,7 @@ def load_model(config, model, optimizer=None, model_type='det'): def load_pretrained_params(model, path): logger = get_logger() + path = maybe_download_params(path) if path.endswith('.pdparams'): path = path.replace('.pdparams', '') assert os.path.exists(path + ".pdparams"), \ @@ -195,13 +197,26 @@ def save_model(model, """ _mkdir_if_not_exist(model_path, logger) model_prefix = os.path.join(model_path, prefix) + + if prefix == 'best_accuracy': + best_model_path = os.path.join(model_path, 'best_model') + _mkdir_if_not_exist(best_model_path, logger) + paddle.save(optimizer.state_dict(), model_prefix + '.pdopt') + if prefix == 'best_accuracy': + paddle.save(optimizer.state_dict(), + os.path.join(best_model_path, 'model.pdopt')) is_nlp_model = config['Architecture']["model_type"] == 'kie' and config[ "Architecture"]["algorithm"] not in ["SDMGR"] if is_nlp_model is not True: paddle.save(model.state_dict(), model_prefix + '.pdparams') metric_prefix = model_prefix + + if prefix == 'best_accuracy': + paddle.save(model.state_dict(), + os.path.join(best_model_path, 'model.pdparams')) + else: # for kie system, we follow the save/load rules in NLP if config['Global']['distributed']: arch = model._layers @@ -211,6 +226,10 @@ def save_model(model, arch = arch.Student arch.backbone.model.save_pretrained(model_prefix) metric_prefix = os.path.join(model_prefix, 'metric') + + if prefix == 'best_accuracy': + arch.backbone.model.save_pretrained(best_model_path) + # save metric and config 
with open(metric_prefix + '.states', 'wb') as f: pickle.dump(kwargs, f, protocol=2) diff --git a/ppocr/utils/utility.py b/ppocr/utils/utility.py index 18357c8e97bcea8ee321856a87146a4a7b901469..f788e79cd53a24d4d7f979f359cacd0532a1ff05 100755 --- a/ppocr/utils/utility.py +++ b/ppocr/utils/utility.py @@ -19,6 +19,9 @@ import cv2 import random import numpy as np import paddle +import importlib.util +import sys +import subprocess def print_dict(d, logger, delimiter=0): @@ -72,6 +75,25 @@ def get_image_file_list(img_file): imgs_lists = sorted(imgs_lists) return imgs_lists +def binarize_img(img): + if len(img.shape) == 3 and img.shape[2] == 3: + gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # conversion to grayscale image + # use cv2 threshold binarization + _, gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) + img = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR) + return img + +def alpha_to_color(img, alpha_color=(255, 255, 255)): + if len(img.shape) == 3 and img.shape[2] == 4: + B, G, R, A = cv2.split(img) + alpha = A / 255 + + R = (alpha_color[0] * (1 - alpha) + R * alpha).astype(np.uint8) + G = (alpha_color[1] * (1 - alpha) + G * alpha).astype(np.uint8) + B = (alpha_color[2] * (1 - alpha) + B * alpha).astype(np.uint8) + + img = cv2.merge((B, G, R)) + return img def check_and_read(img_path): if os.path.basename(img_path)[-3:] in ['gif', 'GIF']: @@ -131,6 +153,26 @@ def set_seed(seed=1024): paddle.seed(seed) +def check_install(module_name, install_name): + spec = importlib.util.find_spec(module_name) + if spec is None: + print(f'Warnning! The {module_name} module is NOT installed') + print( + f'Try install {module_name} module automatically. You can also try to install manually by pip install {install_name}.' + ) + python = sys.executable + try: + subprocess.check_call( + [python, '-m', 'pip', 'install', install_name], + stdout=subprocess.DEVNULL) + print(f'The {module_name} module is now installed') + except subprocess.CalledProcessError as exc: + raise Exception( + f"Install {module_name} failed, please install manually") + else: + print(f"{module_name} has been installed.") + + class AverageMeter: def __init__(self): self.reset() diff --git a/ppocr/utils/visual.py b/ppocr/utils/visual.py index b6de446593984788bea5c03026f4a5b8c0187909..9108a3728143e0ef0a0d6705e4cc701ab9588394 100644 --- a/ppocr/utils/visual.py +++ b/ppocr/utils/visual.py @@ -14,6 +14,7 @@ import cv2 import os import numpy as np +import PIL from PIL import Image, ImageDraw, ImageFont @@ -62,8 +63,13 @@ def draw_box_txt(bbox, text, draw, font, font_size, color): draw.rectangle(bbox, fill=color) # draw ocr results - tw = font.getsize(text)[0] - th = font.getsize(text)[1] + if int(PIL.__version__.split('.')[0]) < 10: + tw = font.getsize(text)[0] + th = font.getsize(text)[1] + else: + left, top, right, bottom = font.getbbox(text) + tw, th = right - left, bottom - top + start_y = max(0, bbox[0][1] - th) draw.rectangle( [(bbox[0][0] + 1, start_y), (bbox[0][0] + tw + 1, start_y + th)], diff --git a/ppstructure/docs/quickstart_en.md b/ppstructure/docs/quickstart_en.md index 9229a79de1f14ea738a4ca2b93cf44d48508ff40..bbaac342fdfdb797c9f0f6b8b343713c5afb970f 100644 --- a/ppstructure/docs/quickstart_en.md +++ b/ppstructure/docs/quickstart_en.md @@ -311,7 +311,7 @@ Please refer to: [Key Information Extraction](../kie/README.md) . 
| save_pdf | Whether to convert docx to pdf when recovery| False |
| structure_version | Structure version, optional PP-structure and PP-structurev2 | PP-structure |
-Most of the parameters are consistent with the PaddleOCR whl package, see [whl package documentation](../../doc/doc_en/whl.md)
+Most of the parameters are consistent with the PaddleOCR whl package, see [whl package documentation](../../doc/doc_en/whl_en.md)
## 3. Summary
diff --git a/ppstructure/kie/README.md b/ppstructure/kie/README.md
index 872edb959276e22b22e4b733df44bdb6a6819c98..6717aa0c8c349da717b3fd01d9ae209b15eae026 100644
--- a/ppstructure/kie/README.md
+++ b/ppstructure/kie/README.md
@@ -89,7 +89,7 @@ Boxes of different colors in the image represent different categories. The invoice and application form images have three categories: `request`, `answer` and `header`.
The `question` and 'answer' can be used to extract the relationship.
-For the ID card image, the mdoel can be directly identify the key information such as `name`, `gender`, `nationality`, so that the subsequent relationship extraction process is not required, and the key information extraction task can be completed using only on model.
+For the ID card image, the model can directly identify the key information such as `name`, `gender`, `nationality`, so that the subsequent relationship extraction process is not required, and the key information extraction task can be completed using only one model.
### 3.2 RE
@@ -186,6 +186,10 @@ python3 ./tools/infer_kie_token_ser_re.py \
The visual result images and the predicted text file will be saved in the `Global.save_res_path` directory.
+If you want to use a custom ocr model, you can set it through the following fields
+- `Global.kie_det_model_dir`: the detection inference model path
+- `Global.kie_rec_model_dir`: the recognition inference model path
+
If you want to load the text detection and recognition results collected before, you can use the following command to predict.
@@ -257,6 +261,9 @@ python3 kie/predict_kie_token_ser_re.py \
The visual results and text file will be saved in directory `output`.
+If you want to use a custom ocr model, you can set it through the following fields +- `--det_model_dir`: the detection inference model path +- `--rec_model_dir`: the recognition inference model path ### 4.3 More diff --git a/ppstructure/kie/README_ch.md b/ppstructure/kie/README_ch.md index 7a8b1942b1849834f8843c8f272ce08e95f4b993..2efb49fd9622e767bbb8696e946bb05dcf72781f 100644 --- a/ppstructure/kie/README_ch.md +++ b/ppstructure/kie/README_ch.md @@ -170,6 +170,10 @@ python3 ./tools/infer_kie_token_ser_re.py \ `Global.save_res_path`目录中会保存可视化的结果图像以及预测的文本文件。 +如果想使用自定义OCR模型,可通过如下字段进行设置 +- `Global.kie_det_model_dir`: 设置检测inference模型地址 +- `Global.kie_rec_model_dir`: 设置识别inference模型地址 + 如果希望加载标注好的文本检测与识别结果,仅预测可以使用下面的命令进行预测。 @@ -239,6 +243,9 @@ python3 kie/predict_kie_token_ser_re.py \ 可视化结果保存在`output`目录下。 +如果想使用自定义OCR模型,可通过如下字段进行设置 +- `--det_model_dir`: 设置检测inference模型地址 +- `--rec_model_dir`: 设置识别inference模型地址 ### 4.3 更多 diff --git a/ppstructure/kie/requirements.txt b/ppstructure/kie/requirements.txt index 6cfcba764190fd46f98b76c27e93db6f4fa36c45..61c230d3ed5bedc093c40af8228d3ea685382f54 100644 --- a/ppstructure/kie/requirements.txt +++ b/ppstructure/kie/requirements.txt @@ -2,6 +2,6 @@ sentencepiece yacs seqeval pypandoc -attrdict +attrdict3 python_docx paddlenlp>=2.4.1 diff --git a/ppstructure/predict_system.py b/ppstructure/predict_system.py index bb061c998f6f8b16c06f9ee94299af0f59c53eb2..b8b871689c919097e480f726a402da1c54873df0 100644 --- a/ppstructure/predict_system.py +++ b/ppstructure/predict_system.py @@ -34,7 +34,7 @@ from ppocr.utils.visual import draw_ser_results, draw_re_results from tools.infer.predict_system import TextSystem from ppstructure.layout.predict_layout import LayoutPredictor from ppstructure.table.predict_table import TableSystem, to_excel -from ppstructure.utility import parse_args, draw_structure_result +from ppstructure.utility import parse_args, draw_structure_result, cal_ocr_word_box logger = get_logger() @@ -79,6 +79,8 @@ class StructureSystem(object): from ppstructure.kie.predict_kie_token_ser_re import SerRePredictor self.kie_predictor = SerRePredictor(args) + self.return_word_box = args.return_word_box + def __call__(self, img, return_ocr_result_in_table=False, img_idx=0): time_dict = { 'image_orientation': 0, @@ -156,17 +158,27 @@ class StructureSystem(object): ] res = [] for box, rec_res in zip(filter_boxes, filter_rec_res): - rec_str, rec_conf = rec_res + rec_str, rec_conf = rec_res[0], rec_res[1] for token in style_token: if token in rec_str: rec_str = rec_str.replace(token, '') if not self.recovery: box += [x1, y1] - res.append({ - 'text': rec_str, - 'confidence': float(rec_conf), - 'text_region': box.tolist() - }) + if self.return_word_box: + word_box_content_list, word_box_list = cal_ocr_word_box(rec_str, box, rec_res[2]) + res.append({ + 'text': rec_str, + 'confidence': float(rec_conf), + 'text_region': box.tolist(), + 'text_word': word_box_content_list, + 'text_word_region': word_box_list + }) + else: + res.append({ + 'text': rec_str, + 'confidence': float(rec_conf), + 'text_region': box.tolist() + }) res_list.append({ 'type': region['label'].lower(), 'bbox': [x1, y1, x2, y2], @@ -229,7 +241,9 @@ def main(args): if args.recovery and args.use_pdf2docx_api and flag_pdf: from pdf2docx.converter import Converter - docx_file = os.path.join(args.output, '{}.docx'.format(img_name)) + os.makedirs(args.output, exist_ok=True) + docx_file = os.path.join(args.output, + '{}_api.docx'.format(img_name)) cv = Converter(image_file) cv.convert(docx_file) cv.close() 
diff --git a/ppstructure/recovery/recovery_to_doc.py b/ppstructure/recovery/recovery_to_doc.py index 1d8f8d9d4babca7410d6625dbeac4c41668f58a7..cd1728b6668577266c10ab71667e630c21a5703b 100644 --- a/ppstructure/recovery/recovery_to_doc.py +++ b/ppstructure/recovery/recovery_to_doc.py @@ -36,6 +36,8 @@ def convert_info_docx(img, res, save_folder, img_name): flag = 1 for i, region in enumerate(res): + if len(region['res']) == 0: + continue img_idx = region['img_idx'] if flag == 2 and region['layout'] == 'single': section = doc.add_section(WD_SECTION.CONTINUOUS) @@ -73,7 +75,7 @@ def convert_info_docx(img, res, save_folder, img_name): text_run.font.size = shared.Pt(10) # save to docx - docx_path = os.path.join(save_folder, '{}.docx'.format(img_name)) + docx_path = os.path.join(save_folder, '{}_ocr.docx'.format(img_name)) doc.save(docx_path) logger.info('docx save to {}'.format(docx_path)) diff --git a/ppstructure/recovery/requirements.txt b/ppstructure/recovery/requirements.txt index ec08f9d0a28b54e3e082db4d32799f8384250c1d..761b9d7c3e34cedb335e2c93707619593ebede63 100644 --- a/ppstructure/recovery/requirements.txt +++ b/ppstructure/recovery/requirements.txt @@ -1,5 +1,4 @@ python-docx -PyMuPDF==1.19.0 beautifulsoup4 fonttools>=4.24.0 fire>=0.3.0 diff --git a/ppstructure/recovery/table_process.py b/ppstructure/recovery/table_process.py index 982e6b760f9291628d0514728dc8f684f183aa2c..77a6ef7659666ebcbe54dd0c107cb2d62e4c7273 100644 --- a/ppstructure/recovery/table_process.py +++ b/ppstructure/recovery/table_process.py @@ -278,8 +278,6 @@ class HtmlToDocx(HTMLParser): cell_col += colspan cell_row += 1 - doc.save('1.docx') - def handle_data(self, data): if self.skip: return diff --git a/ppstructure/table/predict_table.py b/ppstructure/table/predict_table.py index 354baf6ddf5e73b2e933a9b9e8a568bda80340e5..76bd42dc003cdbd1037cdfe4d50b480f777b41c0 100644 --- a/ppstructure/table/predict_table.py +++ b/ppstructure/table/predict_table.py @@ -93,7 +93,7 @@ class TableSystem(object): time_dict['rec'] = rec_elapse if return_ocr_result_in_table: - result['boxes'] = dt_boxes #[x.tolist() for x in dt_boxes] + result['boxes'] = [x.tolist() for x in dt_boxes] result['rec_res'] = rec_res tic = time.time() diff --git a/ppstructure/utility.py b/ppstructure/utility.py index d909f1a8a165745a5c0df78cc3d89960ec4469e7..4ab4b88b9bc073287ec33b29eea9fca471da8470 100644 --- a/ppstructure/utility.py +++ b/ppstructure/utility.py @@ -13,9 +13,11 @@ # limitations under the License. 
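One small but easy-to-miss change above is in `TableSystem`: the raw detection boxes are now converted with `tolist()` before being returned, presumably so the table OCR result contains plain Python lists rather than numpy arrays. A minimal sketch of why that matters for callers that serialize the result (the box values are made up for illustration):

```python
import json

import numpy as np

# A detection box as produced by the text detector: a numpy array of corner points.
dt_boxes = [np.array([[10., 10.], [80., 10.], [80., 40.], [10., 40.]], dtype=np.float32)]

# json.dumps() raises TypeError on numpy arrays, so the boxes are converted first.
serializable = {'boxes': [box.tolist() for box in dt_boxes]}
print(json.dumps(serializable))
```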
import random import ast +import PIL from PIL import Image, ImageDraw, ImageFont import numpy as np -from tools.infer.utility import draw_ocr_box_txt, str2bool, init_args as infer_args +from tools.infer.utility import draw_ocr_box_txt, str2bool, str2int_tuple, init_args as infer_args +import math def init_args(): @@ -98,6 +100,21 @@ def init_args(): type=str2bool, default=False, help='Whether to use pdf2docx api') + parser.add_argument( + "--invert", + type=str2bool, + default=False, + help='Whether to invert image before processing') + parser.add_argument( + "--binarize", + type=str2bool, + default=False, + help='Whether to threshold binarize image before processing') + parser.add_argument( + "--alphacolor", + type=str2int_tuple, + default=(255, 255, 255), + help='Replacement color for the alpha channel, if the latter is present; R,G,B integers') return parser @@ -132,7 +149,13 @@ def draw_structure_result(image, result, font_path): [(box_layout[0], box_layout[1]), (box_layout[2], box_layout[3])], outline=box_color, width=3) - text_w, text_h = font.getsize(region['type']) + + if int(PIL.__version__.split('.')[0]) < 10: + text_w, text_h = font.getsize(region['type']) + else: + left, top, right, bottom = font.getbbox(region['type']) + text_w, text_h = right - left, bottom - top + draw_layout.rectangle( [(box_layout[0], box_layout[1]), (box_layout[0] + text_w, box_layout[1] + text_h)], @@ -151,6 +174,71 @@ def draw_structure_result(image, result, font_path): txts.append(text_result['text']) scores.append(text_result['confidence']) + if 'text_word_region' in text_result: + for word_region in text_result['text_word_region']: + char_box = word_region + box_height = int( + math.sqrt((char_box[0][0] - char_box[3][0])**2 + ( + char_box[0][1] - char_box[3][1])**2)) + box_width = int( + math.sqrt((char_box[0][0] - char_box[1][0])**2 + ( + char_box[0][1] - char_box[1][1])**2)) + if box_height == 0 or box_width == 0: + continue + boxes.append(word_region) + txts.append("") + scores.append(1.0) + im_show = draw_ocr_box_txt( img_layout, boxes, txts, scores, font_path=font_path, drop_score=0) return im_show + + +def cal_ocr_word_box(rec_str, box, rec_word_info): + ''' Calculate the detection frame for each word based on the results of recognition and detection of ocr''' + + col_num, word_list, word_col_list, state_list = rec_word_info + box = box.tolist() + bbox_x_start = box[0][0] + bbox_x_end = box[1][0] + bbox_y_start = box[0][1] + bbox_y_end = box[2][1] + + cell_width = (bbox_x_end - bbox_x_start) / col_num + + word_box_list = [] + word_box_content_list = [] + cn_width_list = [] + cn_col_list = [] + for word, word_col, state in zip(word_list, word_col_list, state_list): + if state == 'cn': + if len(word_col) != 1: + char_seq_length = (word_col[-1] - word_col[0] + 1) * cell_width + char_width = char_seq_length / (len(word_col) - 1) + cn_width_list.append(char_width) + cn_col_list += word_col + word_box_content_list += word + else: + cell_x_start = bbox_x_start + int(word_col[0] * cell_width) + cell_x_end = bbox_x_start + int((word_col[-1] + 1) * cell_width) + cell = ((cell_x_start, bbox_y_start), (cell_x_end, bbox_y_start), + (cell_x_end, bbox_y_end), (cell_x_start, bbox_y_end)) + word_box_list.append(cell) + word_box_content_list.append("".join(word)) + if len(cn_col_list) != 0: + if len(cn_width_list) != 0: + avg_char_width = np.mean(cn_width_list) + else: + avg_char_width = (bbox_x_end - bbox_x_start) / len(rec_str) + for center_idx in cn_col_list: + center_x = (center_idx + 0.5) * cell_width + 
cell_x_start = max(int(center_x - avg_char_width / 2), + 0) + bbox_x_start + cell_x_end = min( + int(center_x + avg_char_width / 2), bbox_x_end - + bbox_x_start) + bbox_x_start + cell = ((cell_x_start, bbox_y_start), (cell_x_end, bbox_y_start), + (cell_x_end, bbox_y_end), (cell_x_start, bbox_y_end)) + word_box_list.append(cell) + + return word_box_content_list, word_box_list diff --git a/requirements.txt b/requirements.txt index 8c5b12f831dfcb2a8854ec46b82ff1fa5b84029e..a5a022738c5fe4c7430099a7a1e41c1671b4ed15 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,13 +7,12 @@ tqdm numpy visualdl rapidfuzz -opencv-python -opencv-contrib-python +opencv-python<=4.6.0.66 +opencv-contrib-python<=4.6.0.66 cython lxml premailer openpyxl attrdict -Polygon3 -lanms-neo==1.0.2 -PyMuPDF==1.19.0 \ No newline at end of file +PyMuPDF<1.21.0 +Pillow diff --git a/test_tipc/benchmark_train.sh b/test_tipc/benchmark_train.sh index 25fda8f97f0bfdefbd6922b13a0ffef3f40c3de9..1668e41a9a95272e38f9f4b5960400718772ec34 100644 --- a/test_tipc/benchmark_train.sh +++ b/test_tipc/benchmark_train.sh @@ -72,6 +72,19 @@ FILENAME=$new_filename # MODE must be one of ['benchmark_train'] MODE=$2 PARAMS=$3 + +to_static="" +# parse "to_static" options and modify trainer into "to_static_trainer" +if [[ $PARAMS =~ "dynamicTostatic" ]] ;then + to_static="d2sT_" + sed -i 's/trainer:norm_train/trainer:to_static_train/g' $FILENAME + # clear PARAM contents + if [ $PARAMS = "to_static" ] ;then + PARAMS="" + fi +fi +# bash test_tipc/benchmark_train.sh test_tipc/configs/det_mv3_db_v2_0/train_benchmark.txt benchmark_train dynamic_bs8_fp32_DP_N1C8 +# bash test_tipc/benchmark_train.sh test_tipc/configs/det_mv3_db_v2_0/train_benchmark.txt benchmark_train dynamicTostatic_bs8_fp32_DP_N1C8 # bash test_tipc/benchmark_train.sh test_tipc/configs/det_mv3_db_v2_0/train_benchmark.txt benchmark_train dynamic_bs8_null_DP_N1C1 IFS=$'\n' # parser params from train_benchmark.txt @@ -83,13 +96,13 @@ model_name=$(func_parser_value "${lines[1]}") python_name=$(func_parser_value "${lines[2]}") # set env -python=${python_name} +python=python export str_tmp=$(echo `pip list|grep paddlepaddle-gpu|awk -F ' ' '{print $2}'`) export frame_version=${str_tmp%%.post*} export frame_commit=$(echo `${python} -c "import paddle;print(paddle.version.commit)"`) # 获取benchmark_params所在的行数 -line_num=`grep -n "train_benchmark_params" $FILENAME | cut -d ":" -f 1` +line_num=`grep -n -w "train_benchmark_params" $FILENAME | cut -d ":" -f 1` # for train log parser batch_size=$(func_parser_value "${lines[line_num]}") line_num=`expr $line_num + 1` @@ -117,7 +130,8 @@ repo_name=$(get_repo_name ) SAVE_LOG=${BENCHMARK_LOG_DIR:-$(pwd)} # */benchmark_log mkdir -p "${SAVE_LOG}/benchmark_log/" status_log="${SAVE_LOG}/benchmark_log/results.log" - +# get benchmark profiling params : PROFILING_TIMER_ONLY=no|True|False +PROFILING_TIMER_ONLY=${PROFILING_TIMER_ONLY:-"True"} # The number of lines in which train params can be replaced. line_python=3 line_gpuid=4 @@ -140,6 +154,13 @@ if [ ! 
-n "$PARAMS" ] ;then fp_items_list=(${fp_items}) device_num_list=(N1C4) run_mode="DP" +elif [[ ${PARAMS} = "dynamicTostatic" ]];then + IFS="|" + model_type=$PARAMS + batch_size_list=(${batch_size}) + fp_items_list=(${fp_items}) + device_num_list=(N1C4) + run_mode="DP" else # parser params from input: modeltype_bs${bs_item}_${fp_item}_${run_mode}_${device_num} IFS="_" @@ -179,26 +200,32 @@ for batch_size in ${batch_size_list[*]}; do gpu_id=$(set_gpu_id $device_num) if [ ${#gpu_id} -le 1 ];then - log_path="$SAVE_LOG/profiling_log" - mkdir -p $log_path - log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_profiling" func_sed_params "$FILENAME" "${line_gpuid}" "0" # sed used gpu_id - # set profile_option params - tmp=`sed -i "${line_profile}s/.*/${profile_option}/" "${FILENAME}"` - - # run test_train_inference_python.sh - cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 " - echo $cmd - eval $cmd - eval "cat ${log_path}/${log_name}" - + if [[ ${PROFILING_TIMER_ONLY} != "no" ]];then + echo "run profile" + # The default value of profile_option's timer_only parameter is True + if [[ ${PROFILING_TIMER_ONLY} = "False" ]];then + profile_option="${profile_option};timer_only=False" + fi + log_path="$SAVE_LOG/profiling_log" + mkdir -p $log_path + log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}profiling" + # set profile_option params + tmp=`sed -i "${line_profile}s/.*/\"${profile_option}\"/" "${FILENAME}"` + # run test_train_inference_python.sh + cmd="timeout 5m bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 " + echo $cmd + eval ${cmd} + eval "cat ${log_path}/${log_name}" + fi + echo "run without profile" # without profile log_path="$SAVE_LOG/train_log" speed_log_path="$SAVE_LOG/index" mkdir -p $log_path mkdir -p $speed_log_path - log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_log" - speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_speed" + log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}log" + speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}speed" func_sed_params "$FILENAME" "${line_profile}" "null" # sed profile_id as null cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 " echo $cmd @@ -232,8 +259,8 @@ for batch_size in ${batch_size_list[*]}; do speed_log_path="$SAVE_LOG/index" mkdir -p $log_path mkdir -p $speed_log_path - log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_log" - speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_speed" + log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}log" + speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}speed" func_sed_params "$FILENAME" "${line_gpuid}" "$gpu_id" # sed used gpu_id func_sed_params "$FILENAME" "${line_profile}" "null" # sed --profile_option as null cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 " diff --git a/test_tipc/configs/ch_PP-OCRv2_rec/ch_PP-OCRv2_rec_distillation.yml 
b/test_tipc/configs/ch_PP-OCRv2_rec/ch_PP-OCRv2_rec_distillation.yml index 3eb82d42bc3f2b3ca7420d999865977bbad09e31..43e14b84d77a216ef949e2af14a01b65bb350b54 100644 --- a/test_tipc/configs/ch_PP-OCRv2_rec/ch_PP-OCRv2_rec_distillation.yml +++ b/test_tipc/configs/ch_PP-OCRv2_rec/ch_PP-OCRv2_rec_distillation.yml @@ -27,7 +27,7 @@ Optimizer: beta2: 0.999 lr: name: Piecewise - decay_epochs : [700, 800] + decay_epochs : [700] values : [0.001, 0.0001] warmup_epoch: 5 regularizer: diff --git a/test_tipc/configs/ch_PP-OCRv3_det/train_infer_python.txt b/test_tipc/configs/ch_PP-OCRv3_det/train_infer_python.txt index bf10aebe3e9aa67e30ce7a20cb07f376825e39ae..8daab48a4dc08aae888d7b784605b3986e220821 100644 --- a/test_tipc/configs/ch_PP-OCRv3_det/train_infer_python.txt +++ b/test_tipc/configs/ch_PP-OCRv3_det/train_infer_python.txt @@ -17,7 +17,7 @@ norm_train:tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o pact_train:null fpgm_train:null distill_train:null -null:null +to_static_train:Global.to_static=true null:null ## ===========================eval_params=========================== @@ -57,3 +57,5 @@ fp_items:fp32|fp16 epoch:2 --profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096 +===========================to_static_train_benchmark_params=========================== +to_static_train:Global.to_static=true \ No newline at end of file diff --git a/test_tipc/configs/ch_PP-OCRv3_rec/ch_PP-OCRv3_rec_distillation.yml b/test_tipc/configs/ch_PP-OCRv3_rec/ch_PP-OCRv3_rec_distillation.yml index 4c8ba0a6fa4a355e9bad1665a8de82399f919740..63362135737f1665fecb16d5b7d6a19c8cd1b8da 100644 --- a/test_tipc/configs/ch_PP-OCRv3_rec/ch_PP-OCRv3_rec_distillation.yml +++ b/test_tipc/configs/ch_PP-OCRv3_rec/ch_PP-OCRv3_rec_distillation.yml @@ -19,6 +19,7 @@ Global: use_space_char: true distributed: true save_res_path: ./output/rec/predicts_ppocrv3_distillation.txt + d2s_train_image_shape: [3, 48, -1] Optimizer: @@ -27,7 +28,7 @@ Optimizer: beta2: 0.999 lr: name: Piecewise - decay_epochs : [700, 800] + decay_epochs : [700] values : [0.0005, 0.00005] warmup_epoch: 5 regularizer: @@ -45,7 +46,7 @@ Architecture: freeze_params: false return_all_feats: true model_type: *model_type - algorithm: SVTR + algorithm: SVTR_LCNet Transform: Backbone: name: MobileNetV1Enhance @@ -72,7 +73,7 @@ Architecture: freeze_params: false return_all_feats: true model_type: *model_type - algorithm: SVTR + algorithm: SVTR_LCNet Transform: Backbone: name: MobileNetV1Enhance diff --git a/test_tipc/configs/ch_PP-OCRv3_rec/train_infer_python.txt b/test_tipc/configs/ch_PP-OCRv3_rec/train_infer_python.txt index fee08b08ede0f61ae4f57fd42dba303301798a3e..13480ec49acc3896920219bee369bb4bfc97b6ff 100644 --- a/test_tipc/configs/ch_PP-OCRv3_rec/train_infer_python.txt +++ b/test_tipc/configs/ch_PP-OCRv3_rec/train_infer_python.txt @@ -17,7 +17,7 @@ norm_train:tools/train.py -c test_tipc/configs/ch_PP-OCRv3_rec/ch_PP-OCRv3_rec_d pact_train:null fpgm_train:null distill_train:null -null:null +to_static_train:Global.to_static=true null:null ## ===========================eval_params=========================== @@ -57,4 +57,5 @@ fp_items:fp32|fp16 epoch:1 --profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096 - 
+===========================to_static_train_benchmark_params=========================== +to_static_train:Global.to_static=true diff --git a/test_tipc/configs/ch_PP-OCRv4_mobile_det/train_infer_python.txt b/test_tipc/configs/ch_PP-OCRv4_mobile_det/train_infer_python.txt new file mode 100644 index 0000000000000000000000000000000000000000..3635c0c6f06c73909527874f54a8fc0402e4c61d --- /dev/null +++ b/test_tipc/configs/ch_PP-OCRv4_mobile_det/train_infer_python.txt @@ -0,0 +1,61 @@ +===========================train_params=========================== +model_name:ch_PP-OCRv4_mobile_det +python:python +gpu_list:0|0,1 +Global.use_gpu:True|True +Global.auto_cast:fp32 +Global.epoch_num:lite_train_lite_infer=1|whole_train_whole_infer=50 +Global.save_model_dir:./output/ +Train.loader.batch_size_per_card:lite_train_lite_infer=2|whole_train_whole_infer=4 +Global.pretrained_model:pretrain_models/PPLCNetV3_x0_75_ocr_det.pdparams +train_model_name:latest +train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/ +null:null +## +trainer:norm_train +norm_train:tools/train.py -c configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det_student.yml -o Global.print_batch_step=1 Train.loader.shuffle=false Global.eval_batch_step=[4000,400] +pact_train:null +fpgm_train:null +distill_train:null +to_static_train:Global.to_static=true +null:null +## +===========================eval_params=========================== +eval:null +null:null +## +===========================infer_params=========================== +Global.save_inference_dir:./output/ +Global.checkpoints: +norm_export:tools/export_model.py -c configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det_student.yml -o +quant_export:null +fpgm_export: +distill_export:null +export1:null +export2:null +inference_dir:Student +infer_model:./inference/ch_PP-OCRv3_det_infer/ +infer_export:null +infer_quant:False +inference:tools/infer/predict_det.py +--use_gpu:True|False +--enable_mkldnn:False +--cpu_threads:6 +--rec_batch_num:1 +--use_tensorrt:False +--precision:fp32 +--det_model_dir: +--image_dir:./inference/ch_det_data_50/all-sum-510/ +null:null +--benchmark:True +null:null +===========================infer_benchmark_params========================== +random_infer_input:[{float32,[3,640,640]}];[{float32,[3,960,960]}] +===========================train_benchmark_params========================== +batch_size:8 +fp_items:fp32|fp16 +epoch:2 +--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile +flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096 +===========================to_static_train_benchmark_params=========================== +to_static_train:Global.to_static=true \ No newline at end of file diff --git a/test_tipc/configs/ch_PP-OCRv4_mobile_rec/train_infer_python.txt b/test_tipc/configs/ch_PP-OCRv4_mobile_rec/train_infer_python.txt new file mode 100644 index 0000000000000000000000000000000000000000..5796deb010d4d58162d1d93b56dca4568c14b849 --- /dev/null +++ b/test_tipc/configs/ch_PP-OCRv4_mobile_rec/train_infer_python.txt @@ -0,0 +1,59 @@ +===========================train_params=========================== +model_name:ch_PP-OCRv4_mobile_rec +python:python +gpu_list:0 +Global.use_gpu:True|True +Global.auto_cast:fp32 +Global.epoch_num:lite_train_lite_infer=3|whole_train_whole_infer=50 +Global.save_model_dir:./output/ +Train.loader.batch_size_per_card:lite_train_lite_infer=16|whole_train_whole_infer=128 +Global.pretrained_model:null +train_model_name:latest 
+train_infer_img_dir:./inference/rec_inference +null:null +## +trainer:norm_train +norm_train:tools/train.py -c configs/rec/PP-OCRv4/ch_PP-OCRv4_rec.yml -o Global.cal_metric_during_train=False Global.print_batch_step=1 Train.loader.shuffle=false Train.dataset.data_dir=./train_data/ic15_data Train.dataset.label_file_list=[./train_data/ic15_data/rec_gt_train.txt] Eval.dataset.data_dir=./train_data/ic15_data Eval.dataset.label_file_list=[./train_data/ic15_data/rec_gt_test.txt] Train.loader.num_workers=16 Eval.loader.num_workers=16 +pact_train:null +fpgm_train:null +distill_train:null +to_static_train:Global.to_static=true +null:null +## +===========================eval_params=========================== +eval:null +null:null +## +===========================infer_params=========================== +Global.save_inference_dir:./output/ +Global.checkpoints: +norm_export:tools/export_model.py -c configs/rec/PP-OCRv4/ch_PP-OCRv4_rec.yml -o +quant_export: +fpgm_export: +distill_export:null +export1:null +export2:null +## +infer_model:./inference/ch_PP-OCRv4_rec_infer +infer_export:null +infer_quant:False +inference:tools/infer/predict_rec.py --rec_image_shape="3,48,320" +--use_gpu:True|False +--enable_mkldnn:False +--cpu_threads:6 +--rec_batch_num:1 +--use_tensorrt:False +--precision:fp32 +--rec_model_dir: +--image_dir:./inference/rec_inference +null:null +--benchmark:True +null:null +===========================infer_benchmark_params========================== +random_infer_input:[{float32,[3,48,320]}] +===========================train_benchmark_params========================== +batch_size:128 +fp_items:fp32|fp16 +epoch:1 +--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile +flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096 diff --git a/test_tipc/configs/ch_PP-OCRv4_mobile_rec_ampO2_ultra/train_infer_python.txt b/test_tipc/configs/ch_PP-OCRv4_mobile_rec_ampO2_ultra/train_infer_python.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc2884be97f1d483a9d33a0674c8b4bdbfd5ef87 --- /dev/null +++ b/test_tipc/configs/ch_PP-OCRv4_mobile_rec_ampO2_ultra/train_infer_python.txt @@ -0,0 +1,61 @@ +===========================train_params=========================== +model_name:ch_PP-OCRv4_mobile_rec +python:python +gpu_list:0 +Global.use_gpu:True|True +Global.auto_cast:fp32 +Global.epoch_num:lite_train_lite_infer=3|whole_train_whole_infer=50 +Global.save_model_dir:./output/ +Train.loader.batch_size_per_card:lite_train_lite_infer=16|whole_train_whole_infer=128 +Global.pretrained_model:null +train_model_name:latest +train_infer_img_dir:./inference/rec_inference +null:null +## +trainer:norm_train +norm_train:tools/train.py -c configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_ampO2_ultra.yml -o Global.cal_metric_during_train=False Global.print_batch_step=1 Train.loader.shuffle=false Train.dataset.data_dir=./train_data/ic15_data Train.dataset.label_file_list=[./train_data/ic15_data/rec_gt_train.txt] Eval.dataset.data_dir=./train_data/ic15_data Eval.dataset.label_file_list=[./train_data/ic15_data/rec_gt_test.txt] +pact_train:null +fpgm_train:null +distill_train:null +to_static_train:Global.to_static=true +null:null +## +===========================eval_params=========================== +eval:null +null:null +## +===========================infer_params=========================== +Global.save_inference_dir:./output/ +Global.checkpoints: +norm_export:tools/export_model.py -c 
configs/rec/PP-OCRv4/ch_PP-OCRv4_rec.yml -o +quant_export: +fpgm_export: +distill_export:null +export1:null +export2:null +## +infer_model:./inference/ch_PP-OCRv4_rec_infer +infer_export:null +infer_quant:False +inference:tools/infer/predict_rec.py --rec_image_shape="3,48,320" +--use_gpu:True|False +--enable_mkldnn:False +--cpu_threads:6 +--rec_batch_num:1 +--use_tensorrt:False +--precision:fp32 +--rec_model_dir: +--image_dir:./inference/rec_inference +null:null +--benchmark:True +null:null +===========================infer_benchmark_params========================== +random_infer_input:[{float32,[3,48,320]}] +===========================train_benchmark_params========================== +batch_size:384 +fp_items:fp16 +epoch:1 +--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile +flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096 +===========================disable_to_static_train_benchmark=========================== +to_static_train:Global.to_static=False diff --git a/test_tipc/configs/ch_PP-OCRv4_mobile_rec_fp32_ultra/train_infer_python.txt b/test_tipc/configs/ch_PP-OCRv4_mobile_rec_fp32_ultra/train_infer_python.txt new file mode 100644 index 0000000000000000000000000000000000000000..0465cfc5d78f477ce2b3a7f7b87ceebcef45b5b5 --- /dev/null +++ b/test_tipc/configs/ch_PP-OCRv4_mobile_rec_fp32_ultra/train_infer_python.txt @@ -0,0 +1,61 @@ +===========================train_params=========================== +model_name:ch_PP-OCRv4_mobile_rec +python:python +gpu_list:0 +Global.use_gpu:True|True +Global.auto_cast:fp32 +Global.epoch_num:lite_train_lite_infer=3|whole_train_whole_infer=50 +Global.save_model_dir:./output/ +Train.loader.batch_size_per_card:lite_train_lite_infer=16|whole_train_whole_infer=128 +Global.pretrained_model:null +train_model_name:latest +train_infer_img_dir:./inference/rec_inference +null:null +## +trainer:norm_train +norm_train:tools/train.py -c configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_fp32_ultra.yml -o Global.cal_metric_during_train=False Global.print_batch_step=1 Train.loader.shuffle=false Train.dataset.data_dir=./train_data/ic15_data Train.dataset.label_file_list=[./train_data/ic15_data/rec_gt_train.txt] Eval.dataset.data_dir=./train_data/ic15_data Eval.dataset.label_file_list=[./train_data/ic15_data/rec_gt_test.txt] +pact_train:null +fpgm_train:null +distill_train:null +to_static_train:Global.to_static=true +null:null +## +===========================eval_params=========================== +eval:null +null:null +## +===========================infer_params=========================== +Global.save_inference_dir:./output/ +Global.checkpoints: +norm_export:tools/export_model.py -c configs/rec/PP-OCRv4/ch_PP-OCRv4_rec.yml -o +quant_export: +fpgm_export: +distill_export:null +export1:null +export2:null +## +infer_model:./inference/ch_PP-OCRv4_rec_infer +infer_export:null +infer_quant:False +inference:tools/infer/predict_rec.py --rec_image_shape="3,48,320" +--use_gpu:True|False +--enable_mkldnn:False +--cpu_threads:6 +--rec_batch_num:1 +--use_tensorrt:False +--precision:fp32 +--rec_model_dir: +--image_dir:./inference/rec_inference +null:null +--benchmark:True +null:null +===========================infer_benchmark_params========================== +random_infer_input:[{float32,[3,48,320]}] +===========================train_benchmark_params========================== +batch_size:192 +fp_items:fp32 +epoch:1 
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile +flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096 +===========================disable_to_static_train_benchmark=========================== +to_static_train:Global.to_static=False diff --git a/test_tipc/configs/ch_PP-OCRv4_server_det/train_infer_python.txt b/test_tipc/configs/ch_PP-OCRv4_server_det/train_infer_python.txt new file mode 100644 index 0000000000000000000000000000000000000000..315fac9829be3a6443958bdaa2d69a5f84b26c26 --- /dev/null +++ b/test_tipc/configs/ch_PP-OCRv4_server_det/train_infer_python.txt @@ -0,0 +1,61 @@ +===========================train_params=========================== +model_name:ch_PP-OCRv4_server_det +python:python +gpu_list:0|0,1 +Global.use_gpu:True|True +Global.auto_cast:fp32 +Global.epoch_num:lite_train_lite_infer=1|whole_train_whole_infer=50 +Global.save_model_dir:./output/ +Train.loader.batch_size_per_card:lite_train_lite_infer=2|whole_train_whole_infer=4 +Global.pretrained_model:pretrain_models/PPHGNet_small_ocr_det.pdparams +train_model_name:latest +train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/ +null:null +## +trainer:norm_train +norm_train:tools/train.py -c configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det_teacher.yml -o Global.print_batch_step=1 Train.loader.shuffle=false Global.eval_batch_step=[4000,400] +pact_train:null +fpgm_train:null +distill_train:null +to_static_train:Global.to_static=true +null:null +## +===========================eval_params=========================== +eval:null +null:null +## +===========================infer_params=========================== +Global.save_inference_dir:./output/ +Global.checkpoints: +norm_export:tools/export_model.py -c configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det_teacher.yml -o +quant_export:null +fpgm_export: +distill_export:null +export1:null +export2:null +inference_dir:Student +infer_model:./inference/ch_PP-OCRv3_det_infer/ +infer_export:null +infer_quant:False +inference:tools/infer/predict_det.py +--use_gpu:True|False +--enable_mkldnn:False +--cpu_threads:6 +--rec_batch_num:1 +--use_tensorrt:False +--precision:fp32 +--det_model_dir: +--image_dir:./inference/ch_det_data_50/all-sum-510/ +null:null +--benchmark:True +null:null +===========================infer_benchmark_params========================== +random_infer_input:[{float32,[3,640,640]}];[{float32,[3,960,960]}] +===========================train_benchmark_params========================== +batch_size:4 +fp_items:fp32|fp16 +epoch:2 +--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile +flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096 +===========================to_static_train_benchmark_params=========================== +to_static_train:Global.to_static=true \ No newline at end of file diff --git a/test_tipc/configs/ch_PP-OCRv4_server_rec/train_infer_python.txt b/test_tipc/configs/ch_PP-OCRv4_server_rec/train_infer_python.txt new file mode 100644 index 0000000000000000000000000000000000000000..c7fa48f196b92939687d34c7c310e15a57a6d9b4 --- /dev/null +++ b/test_tipc/configs/ch_PP-OCRv4_server_rec/train_infer_python.txt @@ -0,0 +1,59 @@ +===========================train_params=========================== +model_name:ch_PP-OCRv4_server_rec +python:python +gpu_list:0 +Global.use_gpu:True|True +Global.auto_cast:fp32 
+Global.epoch_num:lite_train_lite_infer=3|whole_train_whole_infer=50 +Global.save_model_dir:./output/ +Train.loader.batch_size_per_card:lite_train_lite_infer=16|whole_train_whole_infer=128 +Global.pretrained_model:null +train_model_name:latest +train_infer_img_dir:./inference/rec_inference +null:null +## +trainer:norm_train +norm_train:tools/train.py -c configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_hgnet.yml -o Global.cal_metric_during_train=False Global.print_batch_step=1 Train.loader.shuffle=false Train.dataset.data_dir=./train_data/ic15_data Train.dataset.label_file_list=[./train_data/ic15_data/rec_gt_train.txt] Eval.dataset.data_dir=./train_data/ic15_data Eval.dataset.label_file_list=[./train_data/ic15_data/rec_gt_test.txt] Train.loader.num_workers=16 Eval.loader.num_workers=16 +pact_train:null +fpgm_train:null +distill_train:null +to_static_train:Global.to_static=true +null:null +## +===========================eval_params=========================== +eval:null +null:null +## +===========================infer_params=========================== +Global.save_inference_dir:./output/ +Global.checkpoints: +norm_export:tools/export_model.py -c configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_hgnet.yml -o +quant_export: +fpgm_export: +distill_export:null +export1:null +export2:null +## +infer_model:./inference/ch_PP-OCRv4_rec_infer +infer_export:null +infer_quant:False +inference:tools/infer/predict_rec.py --rec_image_shape="3,48,320" +--use_gpu:True|False +--enable_mkldnn:False +--cpu_threads:6 +--rec_batch_num:1 +--use_tensorrt:False +--precision:fp32 +--rec_model_dir: +--image_dir:./inference/rec_inference +null:null +--benchmark:True +null:null +===========================infer_benchmark_params========================== +random_infer_input:[{float32,[3,48,320]}] +===========================train_benchmark_params========================== +batch_size:128 +fp_items:fp32|fp16 +epoch:1 +--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile +flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096 diff --git a/test_tipc/configs/ch_PP-OCRv4_server_rec_ampO2_ultra/train_infer_python.txt b/test_tipc/configs/ch_PP-OCRv4_server_rec_ampO2_ultra/train_infer_python.txt new file mode 100644 index 0000000000000000000000000000000000000000..679d2aa37179ba85cad2d0b65c3d06a6c2eb5af9 --- /dev/null +++ b/test_tipc/configs/ch_PP-OCRv4_server_rec_ampO2_ultra/train_infer_python.txt @@ -0,0 +1,60 @@ +===========================train_params=========================== +model_name:ch_PP-OCRv4_server_rec +python:python +gpu_list:0 +Global.use_gpu:True|True +Global.auto_cast:fp32 +Global.epoch_num:lite_train_lite_infer=3|whole_train_whole_infer=50 +Global.save_model_dir:./output/ +Train.loader.batch_size_per_card:lite_train_lite_infer=16|whole_train_whole_infer=128 +Global.pretrained_model:null +train_model_name:latest +train_infer_img_dir:./inference/rec_inference +null:null +## +trainer:norm_train +norm_train:tools/train.py -c configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_hgnet_ampO2_ultra.yml -o Global.cal_metric_during_train=False Global.print_batch_step=1 Train.loader.shuffle=false Train.dataset.data_dir=./train_data/ic15_data Train.dataset.label_file_list=[./train_data/ic15_data/rec_gt_train.txt] Eval.dataset.data_dir=./train_data/ic15_data Eval.dataset.label_file_list=[./train_data/ic15_data/rec_gt_test.txt] +fpgm_train:null +distill_train:null +to_static_train:Global.to_static=true +null:null +## 
+===========================eval_params=========================== +eval:null +null:null +## +===========================infer_params=========================== +Global.save_inference_dir:./output/ +Global.checkpoints: +norm_export:tools/export_model.py -c configs/rec/PP-OCRv4/ch_PP-OCRv4_rec.yml -o +quant_export: +fpgm_export: +distill_export:null +export1:null +export2:null +## +infer_model:./inference/ch_PP-OCRv4_rec_infer +infer_export:null +infer_quant:False +inference:tools/infer/predict_rec.py --rec_image_shape="3,48,320" +--use_gpu:True|False +--enable_mkldnn:False +--cpu_threads:6 +--rec_batch_num:1 +--use_tensorrt:False +--precision:fp32 +--rec_model_dir: +--image_dir:./inference/rec_inference +null:null +--benchmark:True +null:null +===========================infer_benchmark_params========================== +random_infer_input:[{float32,[3,48,320]}] +===========================train_benchmark_params========================== +batch_size:256 +fp_items:fp16 +epoch:1 +--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile +flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096 +===========================disable_to_static_train_benchmark=========================== +to_static_train:Global.to_static=False diff --git a/test_tipc/configs/ch_PP-OCRv4_server_rec_fp32_ultra/train_infer_python.txt b/test_tipc/configs/ch_PP-OCRv4_server_rec_fp32_ultra/train_infer_python.txt new file mode 100644 index 0000000000000000000000000000000000000000..368bc7c64c1d05ee8d6670638fff4d62f4d08796 --- /dev/null +++ b/test_tipc/configs/ch_PP-OCRv4_server_rec_fp32_ultra/train_infer_python.txt @@ -0,0 +1,61 @@ +===========================train_params=========================== +model_name:ch_PP-OCRv4_server_rec +python:python +gpu_list:0 +Global.use_gpu:True|True +Global.auto_cast:fp32 +Global.epoch_num:lite_train_lite_infer=3|whole_train_whole_infer=50 +Global.save_model_dir:./output/ +Train.loader.batch_size_per_card:lite_train_lite_infer=16|whole_train_whole_infer=128 +Global.pretrained_model:null +train_model_name:latest +train_infer_img_dir:./inference/rec_inference +null:null +## +trainer:norm_train +norm_train:tools/train.py -c configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_hgnet_fp32_ultra.yml -o Global.cal_metric_during_train=False Global.print_batch_step=1 Train.loader.shuffle=false Train.dataset.data_dir=./train_data/ic15_data Train.dataset.label_file_list=[./train_data/ic15_data/rec_gt_train.txt] Eval.dataset.data_dir=./train_data/ic15_data Eval.dataset.label_file_list=[./train_data/ic15_data/rec_gt_test.txt] +pact_train:null +fpgm_train:null +distill_train:null +to_static_train:Global.to_static=true +null:null +## +===========================eval_params=========================== +eval:null +null:null +## +===========================infer_params=========================== +Global.save_inference_dir:./output/ +Global.checkpoints: +norm_export:tools/export_model.py -c configs/rec/PP-OCRv4/ch_PP-OCRv4_rec.yml -o +quant_export: +fpgm_export: +distill_export:null +export1:null +export2:null +## +infer_model:./inference/ch_PP-OCRv4_rec_infer +infer_export:null +infer_quant:False +inference:tools/infer/predict_rec.py --rec_image_shape="3,48,320" +--use_gpu:True|False +--enable_mkldnn:False +--cpu_threads:6 +--rec_batch_num:1 +--use_tensorrt:False +--precision:fp32 +--rec_model_dir: +--image_dir:./inference/rec_inference +null:null +--benchmark:True +null:null 
+===========================infer_benchmark_params========================== +random_infer_input:[{float32,[3,48,320]}] +===========================train_benchmark_params========================== +batch_size:256 +fp_items:fp32 +epoch:1 +--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile +flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096 +===========================disable_to_static_train_benchmark=========================== +to_static_train:Global.to_static=False diff --git a/test_tipc/configs/en_table_structure/table_mv3.yml b/test_tipc/configs/en_table_structure/table_mv3.yml deleted file mode 100755 index edcbe2c3b00e8d8a56ad8dd9f208e283b511b86e..0000000000000000000000000000000000000000 --- a/test_tipc/configs/en_table_structure/table_mv3.yml +++ /dev/null @@ -1,129 +0,0 @@ -Global: - use_gpu: true - epoch_num: 10 - log_smooth_window: 20 - print_batch_step: 5 - save_model_dir: ./output/table_mv3/ - save_epoch_step: 400 - # evaluation is run every 400 iterations after the 0th iteration - eval_batch_step: [0, 40000] - cal_metric_during_train: True - pretrained_model: - checkpoints: - save_inference_dir: - use_visualdl: False - infer_img: ppstructure/docs/table/table.jpg - save_res_path: output/table_mv3 - # for data or label process - character_dict_path: ppocr/utils/dict/table_structure_dict.txt - character_type: en - max_text_length: &max_text_length 500 - box_format: &box_format 'xyxy' # 'xywh', 'xyxy', 'xyxyxyxy' - infer_mode: False - -Optimizer: - name: Adam - beta1: 0.9 - beta2: 0.999 - clip_norm: 5.0 - lr: - learning_rate: 0.001 - regularizer: - name: 'L2' - factor: 0.00000 - -Architecture: - model_type: table - algorithm: TableAttn - Backbone: - name: MobileNetV3 - scale: 1.0 - model_name: small - disable_se: true - Head: - name: TableAttentionHead - hidden_size: 256 - loc_type: 2 - max_text_length: *max_text_length - loc_reg_num: &loc_reg_num 4 - -Loss: - name: TableAttentionLoss - structure_weight: 100.0 - loc_weight: 10000.0 - -PostProcess: - name: TableLabelDecode - -Metric: - name: TableMetric - main_indicator: acc - compute_bbox_metric: false # cost many time, set False for training - -Train: - dataset: - name: PubTabDataSet - data_dir: ./train_data/pubtabnet/train - label_file_list: [./train_data/pubtabnet/train.jsonl] - transforms: - - DecodeImage: # load image - img_mode: BGR - channel_first: False - - TableLabelEncode: - learn_empty_box: False - merge_no_span_structure: False - replace_empty_cell_token: False - loc_reg_num: *loc_reg_num - max_text_length: *max_text_length - - TableBoxEncode: - - ResizeTableImage: - max_len: 488 - - NormalizeImage: - scale: 1./255. - mean: [0.485, 0.456, 0.406] - std: [0.229, 0.224, 0.225] - order: 'hwc' - - PaddingTableImage: - size: [488, 488] - - ToCHWImage: - - KeepKeys: - keep_keys: [ 'image', 'structure', 'bboxes', 'bbox_masks', 'shape' ] - loader: - shuffle: True - batch_size_per_card: 32 - drop_last: True - num_workers: 1 - -Eval: - dataset: - name: PubTabDataSet - data_dir: ./train_data/pubtabnet/test/ - label_file_list: [./train_data/pubtabnet/test.jsonl] - transforms: - - DecodeImage: # load image - img_mode: BGR - channel_first: False - - TableLabelEncode: - learn_empty_box: False - merge_no_span_structure: False - replace_empty_cell_token: False - loc_reg_num: *loc_reg_num - max_text_length: *max_text_length - - TableBoxEncode: - - ResizeTableImage: - max_len: 488 - - NormalizeImage: - scale: 1./255. 
- mean: [0.485, 0.456, 0.406] - std: [0.229, 0.224, 0.225] - order: 'hwc' - - PaddingTableImage: - size: [488, 488] - - ToCHWImage: - - KeepKeys: - keep_keys: [ 'image', 'structure', 'bboxes', 'bbox_masks', 'shape' ] - loader: - shuffle: False - drop_last: False - batch_size_per_card: 16 - num_workers: 1 diff --git a/test_tipc/configs/en_table_structure/train_infer_python.txt b/test_tipc/configs/en_table_structure/train_infer_python.txt index 3fd5dc9f60a9621026d488e5654cd7e1421e8b65..8861ea8cc134a94dfa7b9b233ea66bc341a5a666 100644 --- a/test_tipc/configs/en_table_structure/train_infer_python.txt +++ b/test_tipc/configs/en_table_structure/train_infer_python.txt @@ -13,7 +13,7 @@ train_infer_img_dir:./ppstructure/docs/table/table.jpg null:null ## trainer:norm_train -norm_train:tools/train.py -c test_tipc/configs/en_table_structure/table_mv3.yml -o Global.print_batch_step=1 Train.loader.shuffle=false +norm_train:tools/train.py -c configs/table/table_mv3.yml -o Global.print_batch_step=1 Train.loader.shuffle=false Train.dataset.data_dir=./train_data/pubtabnet/train Train.dataset.label_file_list=[./train_data/pubtabnet/train.jsonl] Eval.dataset.data_dir=./train_data/pubtabnet/test Eval.dataset.label_file_list=[./train_data/pubtabnet/test.jsonl] pact_train:null fpgm_train:null distill_train:null @@ -27,7 +27,7 @@ null:null ===========================infer_params=========================== Global.save_inference_dir:./output/ Global.checkpoints: -norm_export:tools/export_model.py -c test_tipc/configs/en_table_structure/table_mv3.yml -o +norm_export:tools/export_model.py -c configs/table/table_mv3.yml -o quant_export: fpgm_export: distill_export:null diff --git a/test_tipc/configs/en_table_structure/train_linux_gpu_fleet_normal_infer_python_linux_gpu_cpu.txt b/test_tipc/configs/en_table_structure/train_linux_gpu_fleet_normal_infer_python_linux_gpu_cpu.txt index 41d236c3765fbf6a711c6739d8dee4f41a147039..8e25b9d4ef7abbde7986545ec7245cc92ae25710 100644 --- a/test_tipc/configs/en_table_structure/train_linux_gpu_fleet_normal_infer_python_linux_gpu_cpu.txt +++ b/test_tipc/configs/en_table_structure/train_linux_gpu_fleet_normal_infer_python_linux_gpu_cpu.txt @@ -13,7 +13,7 @@ train_infer_img_dir:./ppstructure/docs/table/table.jpg null:null ## trainer:norm_train -norm_train:tools/train.py -c test_tipc/configs/en_table_structure/table_mv3.yml -o +norm_train:tools/train.py -c configs/table/table_mv3.yml -o Train.dataset.data_dir=./train_data/pubtabnet/train Train.dataset.label_file_list=[./train_data/pubtabnet/train.jsonl] Eval.dataset.data_dir=./train_data/pubtabnet/test Eval.dataset.label_file_list=[./train_data/pubtabnet/test.jsonl] pact_train:null fpgm_train:null distill_train:null @@ -27,7 +27,7 @@ null:null ===========================infer_params=========================== Global.save_inference_dir:./output/ Global.checkpoints: -norm_export:tools/export_model.py -c test_tipc/configs/en_table_structure/table_mv3.yml -o +norm_export:tools/export_model.py -c configs/table/table_mv3.yml -o quant_export: fpgm_export: distill_export:null diff --git a/test_tipc/configs/en_table_structure/train_linux_gpu_normal_amp_infer_python_linux_gpu_cpu.txt b/test_tipc/configs/en_table_structure/train_linux_gpu_normal_amp_infer_python_linux_gpu_cpu.txt index 31ac1ed53f2adc9810bc4fd2cf4f874d89d49606..a399e35d453745f323ec4c4e18fe428fe8150d85 100644 --- a/test_tipc/configs/en_table_structure/train_linux_gpu_normal_amp_infer_python_linux_gpu_cpu.txt +++ 
b/test_tipc/configs/en_table_structure/train_linux_gpu_normal_amp_infer_python_linux_gpu_cpu.txt @@ -13,7 +13,7 @@ train_infer_img_dir:./ppstructure/docs/table/table.jpg null:null ## trainer:norm_train -norm_train:tools/train.py -c test_tipc/configs/en_table_structure/table_mv3.yml -o +norm_train:tools/train.py -c configs/table/table_mv3.yml -o Train.dataset.data_dir=./train_data/pubtabnet/train Train.dataset.label_file_list=[./train_data/pubtabnet/train.jsonl] Eval.dataset.data_dir=./train_data/pubtabnet/test Eval.dataset.label_file_list=[./train_data/pubtabnet/test.jsonl] pact_train:null fpgm_train:null distill_train:null @@ -27,7 +27,7 @@ null:null ===========================infer_params=========================== Global.save_inference_dir:./output/ Global.checkpoints: -norm_export:tools/export_model.py -c test_tipc/configs/en_table_structure/table_mv3.yml -o +norm_export:tools/export_model.py -c configs/table/table_mv3.yml -o quant_export: fpgm_export: distill_export:null diff --git a/test_tipc/configs/en_table_structure/train_pact_infer_python.txt b/test_tipc/configs/en_table_structure/train_pact_infer_python.txt index 9890b906a1d3b1127352af567dca0d7186f94694..0bb04c4c929a53ebb44db0ce5c3e98b28c179ff9 100644 --- a/test_tipc/configs/en_table_structure/train_pact_infer_python.txt +++ b/test_tipc/configs/en_table_structure/train_pact_infer_python.txt @@ -14,7 +14,7 @@ null:null ## trainer:pact_train norm_train:null -pact_train:deploy/slim/quantization/quant.py -c test_tipc/configs/en_table_structure/table_mv3.yml -o +pact_train:deploy/slim/quantization/quant.py -c configs/table/table_mv3.yml -o Train.dataset.data_dir=./train_data/pubtabnet/train Train.dataset.label_file_list=[./train_data/pubtabnet/train.jsonl] Eval.dataset.data_dir=./train_data/pubtabnet/test Eval.dataset.label_file_list=[./train_data/pubtabnet/test.jsonl] fpgm_train:null distill_train:null null:null @@ -28,7 +28,7 @@ null:null Global.save_inference_dir:./output/ Global.checkpoints: norm_export:null -quant_export:deploy/slim/quantization/export_model.py -c test_tipc/configs/en_table_structure/table_mv3.yml -o +quant_export:deploy/slim/quantization/export_model.py -c configs/table/table_mv3.yml -o fpgm_export: distill_export:null export1:null diff --git a/test_tipc/configs/en_table_structure/train_ptq_infer_python.txt b/test_tipc/configs/en_table_structure/train_ptq_infer_python.txt index e8f7bbaa50417b97f79596634677fff0a95cb47f..aae0895e6469e6913673e5e5dad2f75702f6c921 100644 --- a/test_tipc/configs/en_table_structure/train_ptq_infer_python.txt +++ b/test_tipc/configs/en_table_structure/train_ptq_infer_python.txt @@ -4,7 +4,7 @@ python:python3.7 Global.pretrained_model: Global.save_inference_dir:null infer_model:./inference/en_ppocr_mobile_v2.0_table_structure_infer/ -infer_export:deploy/slim/quantization/quant_kl.py -c test_tipc/configs/en_table_structure/table_mv3.yml -o +infer_export:deploy/slim/quantization/quant_kl.py -c configs/table/table_mv3.yml -o infer_quant:True inference:ppstructure/table/predict_table.py --det_model_dir=./inference/en_ppocr_mobile_v2.0_table_det_infer --rec_model_dir=./inference/en_ppocr_mobile_v2.0_table_rec_infer --rec_char_dict_path=./ppocr/utils/dict/table_dict.txt --table_char_dict_path=./ppocr/utils/dict/table_structure_dict.txt --image_dir=./ppstructure/docs/table/table.jpg --det_limit_side_len=736 --det_limit_type=min --output ./output/table --use_gpu:True|False diff --git a/test_tipc/configs/layoutxlm_ser/ser_layoutxlm_xfund_zh.yml 
b/test_tipc/configs/layoutxlm_ser/ser_layoutxlm_xfund_zh.yml index d2be152f0bae7d87129904d87c56c6d777a1f338..31e0ed4918e25c9408b0a6f77ae94d3d8f734cc1 100644 --- a/test_tipc/configs/layoutxlm_ser/ser_layoutxlm_xfund_zh.yml +++ b/test_tipc/configs/layoutxlm_ser/ser_layoutxlm_xfund_zh.yml @@ -84,7 +84,7 @@ Train: shuffle: True drop_last: False batch_size_per_card: 8 - num_workers: 4 + num_workers: 16 Eval: dataset: diff --git a/test_tipc/configs/rec_svtrnet/rec_svtrnet.yml b/test_tipc/configs/rec_svtrnet/rec_svtrnet.yml index 140b17e0e79f9895167e9c51d86ced173e44a541..6e22bc7832c292b59e060f0564d77c1e93d785af 100644 --- a/test_tipc/configs/rec_svtrnet/rec_svtrnet.yml +++ b/test_tipc/configs/rec_svtrnet/rec_svtrnet.yml @@ -20,6 +20,7 @@ Global: infer_mode: False use_space_char: False save_res_path: ./output/rec/predicts_svtr_tiny.txt + d2s_train_image_shape: [3, 64, 256] Optimizer: diff --git a/test_tipc/configs/rec_svtrnet/train_infer_python.txt b/test_tipc/configs/rec_svtrnet/train_infer_python.txt index 5508c0411cfdc7102ccec7a00c59c2a5e1a54998..63e6b908a35c061f0979d0548f73e73b4265505d 100644 --- a/test_tipc/configs/rec_svtrnet/train_infer_python.txt +++ b/test_tipc/configs/rec_svtrnet/train_infer_python.txt @@ -51,3 +51,11 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ic15_dic null:null ===========================infer_benchmark_params========================== random_infer_input:[{float32,[3,64,256]}] +===========================train_benchmark_params========================== +batch_size:512 +fp_items:fp32|fp16 +epoch:2 +--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile +flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096 +===========================to_static_train_benchmark_params=========================== +to_static_train:Global.to_static=true \ No newline at end of file diff --git a/test_tipc/configs/slanet/SLANet.yml b/test_tipc/configs/slanet/SLANet.yml deleted file mode 100644 index 0d55d70d64e29716e942517e9c0d4909e6f70f9b..0000000000000000000000000000000000000000 --- a/test_tipc/configs/slanet/SLANet.yml +++ /dev/null @@ -1,143 +0,0 @@ -Global: - use_gpu: true - epoch_num: 100 - log_smooth_window: 20 - print_batch_step: 20 - save_model_dir: ./output/SLANet - save_epoch_step: 400 - # evaluation is run every 1000 iterations after the 0th iteration - eval_batch_step: [0, 1000] - cal_metric_during_train: True - pretrained_model: - checkpoints: - save_inference_dir: ./output/SLANet/infer - use_visualdl: False - infer_img: ppstructure/docs/table/table.jpg - # for data or label process - character_dict_path: ppocr/utils/dict/table_structure_dict.txt - character_type: en - max_text_length: &max_text_length 500 - box_format: &box_format 'xyxy' # 'xywh', 'xyxy', 'xyxyxyxy' - infer_mode: False - use_sync_bn: True - save_res_path: 'output/infer' - -Optimizer: - name: Adam - beta1: 0.9 - beta2: 0.999 - clip_norm: 5.0 - lr: - name: Piecewise - learning_rate: 0.001 - decay_epochs : [40, 50] - values : [0.001, 0.0001, 0.00005] - regularizer: - name: 'L2' - factor: 0.00000 - -Architecture: - model_type: table - algorithm: SLANet - Backbone: - name: PPLCNet - scale: 1.0 - pretrained: true - use_ssld: true - Neck: - name: CSPPAN - out_channels: 96 - Head: - name: SLAHead - hidden_size: 256 - max_text_length: *max_text_length - loc_reg_num: &loc_reg_num 4 - -Loss: - name: SLALoss - structure_weight: 1.0 - loc_weight: 2.0 - loc_loss: smooth_l1 - -PostProcess: - 
name: TableLabelDecode - merge_no_span_structure: &merge_no_span_structure True - -Metric: - name: TableMetric - main_indicator: acc - compute_bbox_metric: False - loc_reg_num: *loc_reg_num - box_format: *box_format - -Train: - dataset: - name: PubTabDataSet - data_dir: ./train_data/pubtabnet/train/ - label_file_list: [./train_data/pubtabnet/train.jsonl] - transforms: - - DecodeImage: # load image - img_mode: BGR - channel_first: False - - TableLabelEncode: - learn_empty_box: False - merge_no_span_structure: *merge_no_span_structure - replace_empty_cell_token: False - loc_reg_num: *loc_reg_num - max_text_length: *max_text_length - - TableBoxEncode: - in_box_format: *box_format - out_box_format: *box_format - - ResizeTableImage: - max_len: 488 - - NormalizeImage: - scale: 1./255. - mean: [0.485, 0.456, 0.406] - std: [0.229, 0.224, 0.225] - order: 'hwc' - - PaddingTableImage: - size: [488, 488] - - ToCHWImage: - - KeepKeys: - keep_keys: [ 'image', 'structure', 'bboxes', 'bbox_masks', 'shape' ] - loader: - shuffle: True - batch_size_per_card: 48 - drop_last: True - num_workers: 1 - -Eval: - dataset: - name: PubTabDataSet - data_dir: ./train_data/pubtabnet/test/ - label_file_list: [./train_data/pubtabnet/test.jsonl] - transforms: - - DecodeImage: # load image - img_mode: BGR - channel_first: False - - TableLabelEncode: - learn_empty_box: False - merge_no_span_structure: *merge_no_span_structure - replace_empty_cell_token: False - loc_reg_num: *loc_reg_num - max_text_length: *max_text_length - - TableBoxEncode: - in_box_format: *box_format - out_box_format: *box_format - - ResizeTableImage: - max_len: 488 - - NormalizeImage: - scale: 1./255. - mean: [0.485, 0.456, 0.406] - std: [0.229, 0.224, 0.225] - order: 'hwc' - - PaddingTableImage: - size: [488, 488] - - ToCHWImage: - - KeepKeys: - keep_keys: [ 'image', 'structure', 'bboxes', 'bbox_masks', 'shape' ] - loader: - shuffle: False - drop_last: False - batch_size_per_card: 48 - num_workers: 1 diff --git a/test_tipc/configs/slanet/train_infer_python.txt b/test_tipc/configs/slanet/train_infer_python.txt index 05264360ac95d08ba11157372a9badef23afdc70..0beebc04d63f74d6d099f19b516a4702b43bd39f 100644 --- a/test_tipc/configs/slanet/train_infer_python.txt +++ b/test_tipc/configs/slanet/train_infer_python.txt @@ -1,6 +1,6 @@ ===========================train_params=========================== model_name:slanet -python:python3.7 +python:python gpu_list:0|0,1 Global.use_gpu:True|True Global.auto_cast:fp32 @@ -13,11 +13,11 @@ train_infer_img_dir:./ppstructure/docs/table/table.jpg null:null ## trainer:norm_train -norm_train:tools/train.py -c test_tipc/configs/slanet/SLANet.yml -o Global.print_batch_step=1 Train.loader.shuffle=false +norm_train:tools/train.py -c configs/table/SLANet.yml -o Global.cal_metric_during_train=False Global.print_batch_step=1 Train.loader.shuffle=false Train.dataset.data_dir=./train_data/pubtabnet/train Train.dataset.label_file_list=[./train_data/pubtabnet/train.jsonl] Eval.dataset.data_dir=./train_data/pubtabnet/test Eval.dataset.label_file_list=[./train_data/pubtabnet/test.jsonl] pact_train:null fpgm_train:null distill_train:null -null:null +to_static_train:Global.to_static=true null:null ## ===========================eval_params=========================== @@ -27,7 +27,7 @@ null:null ===========================infer_params=========================== Global.save_inference_dir:./output/ Global.checkpoints: -norm_export:tools/export_model.py -c test_tipc/configs/slanet/SLANet.yml -o +norm_export:tools/export_model.py -c 
configs/table/SLANet.yml -o quant_export: fpgm_export: distill_export:null @@ -52,8 +52,10 @@ null:null ===========================infer_benchmark_params========================== random_infer_input:[{float32,[3,488,488]}] ===========================train_benchmark_params========================== -batch_size:32 +batch_size:64 fp_items:fp32|fp16 epoch:2 --profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096 +===========================to_static_train_benchmark_params=========================== +to_static_train:Global.to_static=true \ No newline at end of file diff --git a/test_tipc/configs/table_master/table_master.yml b/test_tipc/configs/table_master/table_master.yml index 27f81683b9b7e9475bdfa4ad4862166f4cf9c14d..b27bdae542bf85d8f2932372d9002c2de8d6c652 100644 --- a/test_tipc/configs/table_master/table_master.yml +++ b/test_tipc/configs/table_master/table_master.yml @@ -6,7 +6,7 @@ Global: save_model_dir: ./output/table_master/ save_epoch_step: 17 eval_batch_step: [0, 6259] - cal_metric_during_train: true + cal_metric_during_train: false pretrained_model: null checkpoints: save_inference_dir: output/table_master/infer @@ -16,6 +16,7 @@ Global: character_dict_path: ppocr/utils/dict/table_master_structure_dict.txt infer_mode: false max_text_length: 500 + d2s_train_image_shape: [3, 480, 480] Optimizer: @@ -67,16 +68,15 @@ Metric: Train: dataset: - name: PubTabDataSet - data_dir: ./train_data/pubtabnet/train - label_file_list: [./train_data/pubtabnet/train.jsonl] + name: LMDBDataSetTableMaster + data_dir: train_data/StructureLabel_val_500/ transforms: - DecodeImage: img_mode: BGR channel_first: False - TableMasterLabelEncode: learn_empty_box: False - merge_no_span_structure: True + merge_no_span_structure: False replace_empty_cell_token: True - ResizeTableImage: max_len: 480 @@ -101,16 +101,15 @@ Train: Eval: dataset: - name: PubTabDataSet - data_dir: ./train_data/pubtabnet/test/ - label_file_list: [./train_data/pubtabnet/test.jsonl] + name: LMDBDataSetTableMaster + data_dir: train_data/StructureLabel_val_500/ transforms: - DecodeImage: img_mode: BGR channel_first: False - TableMasterLabelEncode: learn_empty_box: False - merge_no_span_structure: True + merge_no_span_structure: False replace_empty_cell_token: True - ResizeTableImage: max_len: 480 diff --git a/test_tipc/configs/table_master/train_infer_python.txt b/test_tipc/configs/table_master/train_infer_python.txt index c3a871731a36fb5434db111cfd68b6eab7ba3f99..a248cd8227a22babb29f2fad1b4eb8b30051711f 100644 --- a/test_tipc/configs/table_master/train_infer_python.txt +++ b/test_tipc/configs/table_master/train_infer_python.txt @@ -13,7 +13,7 @@ train_infer_img_dir:./ppstructure/docs/table/table.jpg null:null ## trainer:norm_train -norm_train:tools/train.py -c test_tipc/configs/table_master/table_master.yml -o Global.print_batch_step=10 +norm_train:tools/train.py -c test_tipc/configs/table_master/table_master.yml -o Global.print_batch_step=1 pact_train:null fpgm_train:null distill_train:null @@ -51,3 +51,11 @@ null:null null:null ===========================infer_benchmark_params========================== random_infer_input:[{float32,[3,480,480]}] +===========================train_benchmark_params========================== +batch_size:10 +fp_items:fp32|fp16 +epoch:2 +--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile 
+flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096 +===========================to_static_train_benchmark_params=========================== +to_static_train:Global.to_static=true \ No newline at end of file diff --git a/test_tipc/configs/vi_layoutxlm_ser/train_infer_python.txt b/test_tipc/configs/vi_layoutxlm_ser/train_infer_python.txt index adad78bb76e34635a632ef7c1b55e212bc4b636a..e304519c719f21deed52c5f33aa9ce3a8fd8251d 100644 --- a/test_tipc/configs/vi_layoutxlm_ser/train_infer_python.txt +++ b/test_tipc/configs/vi_layoutxlm_ser/train_infer_python.txt @@ -1,6 +1,6 @@ ===========================train_params=========================== model_name:vi_layoutxlm_ser -python:python3.7 +python:python gpu_list:0|0,1 Global.use_gpu:True|True Global.auto_cast:fp32 @@ -13,11 +13,11 @@ train_infer_img_dir:ppstructure/docs/kie/input/zh_val_42.jpg null:null ## trainer:norm_train -norm_train:tools/train.py -c ./configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o Global.print_batch_step=1 Global.eval_batch_step=[1000,1000] Train.loader.shuffle=false +norm_train:tools/train.py -c ./configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o Global.print_batch_step=1 Global.eval_batch_step=[1000,1000] Train.loader.shuffle=false Train.loader.num_workers=32 Eval.loader.num_workers=32 pact_train:null fpgm_train:null distill_train:null -null:null +to_static_train:Global.to_static=true null:null ## ===========================eval_params=========================== @@ -52,8 +52,10 @@ null:null ===========================infer_benchmark_params========================== random_infer_input:[{float32,[3,224,224]}] ===========================train_benchmark_params========================== -batch_size:4 +batch_size:8 fp_items:fp32|fp16 epoch:3 --profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98 +===========================to_static_train_benchmark_params=========================== +to_static_train:Global.to_static=true diff --git a/test_tipc/prepare.sh b/test_tipc/prepare.sh index 02ee8a24d241195d1330ea42fc05ed35dd7a87b7..a2e5332745a704ca8bf0770823be36ae8c475802 100644 --- a/test_tipc/prepare.sh +++ b/test_tipc/prepare.sh @@ -23,7 +23,7 @@ trainer_list=$(func_parser_value "${lines[14]}") if [ ${MODE} = "benchmark_train" ];then python_name_list=$(func_parser_value "${lines[2]}") array=(${python_name_list}) - python_name=${array[0]} + python_name=python ${python_name} -m pip install -r requirements.txt if [[ ${model_name} =~ "ch_ppocr_mobile_v2_0_det" || ${model_name} =~ "det_mv3_db_v2_0" ]];then wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/MobileNetV3_large_x0_5_pretrained.pdparams --no-check-certificate @@ -40,6 +40,42 @@ if [ ${MODE} = "benchmark_train" ];then cd ../../../ fi fi + if [[ ${model_name} =~ "ch_PP-OCRv4_mobile_det" ]];then + wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/PPLCNetV3_x0_75_ocr_det.pdparams --no-check-certificate + rm -rf ./train_data/icdar2015 + wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dataset/icdar2015_benckmark.tar --no-check-certificate + cd ./train_data/ && tar xf icdar2015_benckmark.tar + ln -s ./icdar2015_benckmark ./icdar2015 + cd ../ + fi + if [[ ${model_name} =~ "ch_PP-OCRv4_server_det" ]];then + wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/PPHGNet_small_ocr_det.pdparams 
--no-check-certificate + rm -rf ./train_data/icdar2015 + wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dataset/icdar2015_benckmark.tar --no-check-certificate + cd ./train_data/ && tar xf icdar2015_benckmark.tar + ln -s ./icdar2015_benckmark ./icdar2015 + cd ../ + fi + if [[ ${model_name} =~ "ch_PP-OCRv4_mobile_rec" ]];then + rm -rf ./train_data/ic15_data + wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dataset/ic15_data_benckmark.tar --no-check-certificate + cd ./train_data/ && tar xf ic15_data_benckmark.tar + ln -s ./ic15_data_benckmark ./ic15_data + cd ic15_data + mv rec_gt_train4w.txt rec_gt_train.txt + cd ../ + cd ../ + fi + if [[ ${model_name} =~ "ch_PP-OCRv4_server_rec" ]];then + rm -rf ./train_data/ic15_data + wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dataset/ic15_data_benckmark.tar --no-check-certificate + cd ./train_data/ && tar xf ic15_data_benckmark.tar + ln -s ./ic15_data_benckmark ./ic15_data + cd ic15_data + mv rec_gt_train4w.txt rec_gt_train.txt + cd ../ + cd ../ + fi if [[ ${model_name} =~ "ch_ppocr_server_v2_0_det" || ${model_name} =~ "ch_PP-OCRv3_det" ]];then rm -rf ./train_data/icdar2015 wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dataset/icdar2015_benckmark.tar --no-check-certificate @@ -88,7 +124,7 @@ if [ ${MODE} = "benchmark_train" ];then ln -s ./ic15_data_benckmark ./ic15_data cd ../ fi - if [[ ${model_name} =~ "ch_PP-OCRv2_rec" || ${model_name} =~ "ch_PP-OCRv3_rec" ]];then + if [[ ${model_name} =~ "ch_PP-OCRv2_rec" || ${model_name} =~ "ch_PP-OCRv3_rec" || ${model_name} =~ "ch_PP-OCRv4_mobile_rec" || ${model_name} =~ "ch_PP-OCRv4_server_rec" ]];then rm -rf ./train_data/ic15_data wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dataset/ic15_data_benckmark.tar --no-check-certificate cd ./train_data/ && tar xf ic15_data_benckmark.tar @@ -138,6 +174,26 @@ if [ ${MODE} = "benchmark_train" ];then cd ../ fi + if [ ${model_name} == "table_master" ];then + wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/ppstructure/models/tablemaster/table_structure_tablemaster_train.tar --no-check-certificate + cd ./pretrain_models/ && tar xf table_structure_tablemaster_train.tar && cd ../ + wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dataset/StructureLabel_val_500.tar --no-check-certificate + cd ./train_data/ && tar xf StructureLabel_val_500.tar + cd ../ + fi + if [ ${model_name} == "rec_svtrnet" ]; then + wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dataset/ic15_data_benckmark.tar --no-check-certificate + cd ./train_data/ && tar xf ic15_data_benckmark.tar + ln -s ./ic15_data_benckmark ./ic15_data + cd ic15_data + mv rec_gt_train4w.txt rec_gt_train.txt + + for i in `seq 10`;do cp rec_gt_train.txt dup$i.txt;done + cat dup* > rec_gt_train.txt && rm -rf dup* + + cd ../ + cd ../ + fi fi if [ ${MODE} = "lite_train_lite_infer" ];then @@ -150,7 +206,9 @@ if [ ${MODE} = "lite_train_lite_infer" ];then # pretrain lite train data wget -nc -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV3_large_x0_5_pretrained.pdparams --no-check-certificate wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_db_v2.0_train.tar --no-check-certificate - cd ./pretrain_models/ && tar xf det_mv3_db_v2.0_train.tar && cd ../ + cd ./pretrain_models/ + tar xf det_mv3_db_v2.0_train.tar + cd ../ if [[ ${model_name} =~ "ch_PP-OCRv2_det" ]];then wget -nc -P ./pretrain_models/ 
https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar --no-check-certificate cd ./pretrain_models/ && tar xf ch_PP-OCRv2_det_distill_train.tar && cd ../ @@ -159,6 +217,12 @@ if [ ${MODE} = "lite_train_lite_infer" ];then wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar --no-check-certificate cd ./pretrain_models/ && tar xf ch_PP-OCRv3_det_distill_train.tar && cd ../ fi + if [[ ${model_name} =~ "ch_PP-OCRv4_mobile_det" ]];then + wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/PPLCNetV3_x0_75_ocr_det.pdparams --no-check-certificate + fi + if [[ ${model_name} =~ "ch_PP-OCRv4_server_det" ]];then + wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/PPHGNet_small_ocr_det.pdparams --no-check-certificate + fi if [ ${model_name} == "en_table_structure" ] || [ ${model_name} == "en_table_structure_PACT" ];then wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_structure_train.tar --no-check-certificate cd ./pretrain_models/ && tar xf en_ppocr_mobile_v2.0_table_structure_train.tar && cd ../ @@ -179,6 +243,8 @@ if [ ${MODE} = "lite_train_lite_infer" ];then if [ ${model_name} == "table_master" ];then wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/ppstructure/models/tablemaster/table_structure_tablemaster_train.tar --no-check-certificate cd ./pretrain_models/ && tar xf table_structure_tablemaster_train.tar && cd ../ + wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dataset/StructureLabel_val_500.tar --no-check-certificate + cd ./train_data/ && tar xf StructureLabel_val_500.tar && cd ../ fi rm -rf ./train_data/icdar2015 rm -rf ./train_data/ic15_data @@ -366,7 +432,7 @@ elif [ ${MODE} = "whole_infer" ];then python_name_list=$(func_parser_value "${lines[2]}") array=(${python_name_list}) python_name=${array[0]} - ${python_name} -m pip install paddleslim --force-reinstall + ${python_name} -m pip install paddleslim ${python_name} -m pip install -r requirements.txt wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/ch_det_data_50.tar --no-check-certificate wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/rec_inference.tar --no-check-certificate diff --git a/test_tipc/supplementary/data_loader.py b/test_tipc/supplementary/data_loader.py index 049e7b2d36306d4bb7264d1c45a072ed84bbba60..f0245dd27cc5bb5d7272d6950f27b4ae0ba899f2 100644 --- a/test_tipc/supplementary/data_loader.py +++ b/test_tipc/supplementary/data_loader.py @@ -1,7 +1,6 @@ import numpy as np from paddle.vision.datasets import Cifar100 from paddle.vision.transforms import Normalize -from paddle.fluid.dataloader.collate import default_collate_fn import signal import os from paddle.io import Dataset, DataLoader, DistributedBatchSampler diff --git a/test_tipc/supplementary/train.py b/test_tipc/supplementary/train.py index e632d1d1803a85144bc750c3ff6ff51b1eb65973..f582123407956b335aac8a0845cae50769dae829 100644 --- a/test_tipc/supplementary/train.py +++ b/test_tipc/supplementary/train.py @@ -71,7 +71,7 @@ def amp_scaler(config): 'FLAGS_cudnn_batchnorm_spatial_persistent': 1, 'FLAGS_max_inplace_grad_add': 8, } - paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING) + paddle.set_flags(AMP_RELATED_FLAGS_SETTING) scale_loss = config["AMP"].get("scale_loss", 1.0) use_dynamic_loss_scaling = config["AMP"].get("use_dynamic_loss_scaling", False) @@ -168,22 +168,22 @@ def train(config, scaler=None): 
if idx % 10 == 0: et = time.time() strs = f"epoch: [{epoch}/{EPOCH}], iter: [{idx}/{data_num}], " - strs += f"loss: {avg_loss.numpy()[0]}" - strs += f", acc_topk1: {acc['top1'].numpy()[0]}, acc_top5: {acc['top5'].numpy()[0]}" + strs += f"loss: {float(avg_loss)}" + strs += f", acc_topk1: {float(acc['top1'])}, acc_top5: {float(acc['top5'])}" strs += f", batch_time: {round(et-st, 4)} s" logger.info(strs) st = time.time() if epoch % 10 == 0: acc = eval(config, model) - if len(best_acc) < 1 or acc['top5'].numpy()[0] > best_acc['top5']: + if len(best_acc) < 1 or float(acc['top5']) > best_acc['top5']: best_acc = acc best_acc['epoch'] = epoch is_best = True else: is_best = False logger.info( - f"The best acc: acc_topk1: {best_acc['top1'].numpy()[0]}, acc_top5: {best_acc['top5'].numpy()[0]}, best_epoch: {best_acc['epoch']}" + f"The best acc: acc_topk1: {float(best_acc['top1'])}, acc_top5: {float(best_acc['top5'])}, best_epoch: {best_acc['epoch']}" ) save_model( model, @@ -276,22 +276,22 @@ def train_distill(config, scaler=None): if idx % 10 == 0: et = time.time() strs = f"epoch: [{epoch}/{EPOCH}], iter: [{idx}/{data_num}], " - strs += f"loss: {avg_loss.numpy()[0]}" - strs += f", acc_topk1: {acc['top1'].numpy()[0]}, acc_top5: {acc['top5'].numpy()[0]}" + strs += f"loss: {float(avg_loss)}" + strs += f", acc_topk1: {float(acc['top1'])}, acc_top5: {float(acc['top5'])}" strs += f", batch_time: {round(et-st, 4)} s" logger.info(strs) st = time.time() if epoch % 10 == 0: acc = eval(config, model._layers.student) - if len(best_acc) < 1 or acc['top5'].numpy()[0] > best_acc['top5']: + if len(best_acc) < 1 or float(acc['top5']) > best_acc['top5']: best_acc = acc best_acc['epoch'] = epoch is_best = True else: is_best = False logger.info( - f"The best acc: acc_topk1: {best_acc['top1'].numpy()[0]}, acc_top5: {best_acc['top5'].numpy()[0]}, best_epoch: {best_acc['epoch']}" + f"The best acc: acc_topk1: {float(best_acc['top1'])}, acc_top5: {float(best_acc['top5'])}, best_epoch: {best_acc['epoch']}" ) save_model( @@ -401,22 +401,22 @@ def train_distill_multiopt(config, scaler=None): if idx % 10 == 0: et = time.time() strs = f"epoch: [{epoch}/{EPOCH}], iter: [{idx}/{data_num}], " - strs += f"loss: {avg_loss.numpy()[0]}, loss1: {avg_loss1.numpy()[0]}" - strs += f", acc_topk1: {acc['top1'].numpy()[0]}, acc_top5: {acc['top5'].numpy()[0]}" + strs += f"loss: {float(avg_loss)}, loss1: {float(avg_loss1)}" + strs += f", acc_topk1: {float(acc['top1'])}, acc_top5: {float(acc['top5'])}" strs += f", batch_time: {round(et-st, 4)} s" logger.info(strs) st = time.time() if epoch % 10 == 0: acc = eval(config, model._layers.student) - if len(best_acc) < 1 or acc['top5'].numpy()[0] > best_acc['top5']: + if len(best_acc) < 1 or float(acc['top5']) > best_acc['top5']: best_acc = acc best_acc['epoch'] = epoch is_best = True else: is_best = False logger.info( - f"The best acc: acc_topk1: {best_acc['top1'].numpy()[0]}, acc_top5: {best_acc['top5'].numpy()[0]}, best_epoch: {best_acc['epoch']}" + f"The best acc: acc_topk1: {float(best_acc['top1'])}, acc_top5: {float(best_acc['top5'])}, best_epoch: {best_acc['epoch']}" ) save_model( model, [optimizer, optimizer1], @@ -450,7 +450,7 @@ def eval(config, model): labels = paddle.concat(labels, axis=0) acc = metric_func(outs, labels) - strs = f"The metric are as follows: acc_topk1: {acc['top1'].numpy()[0]}, acc_top5: {acc['top5'].numpy()[0]}" + strs = f"The metric are as follows: acc_topk1: {float(acc['top1'])}, acc_top5: {float(acc['top5'])}" logger.info(strs) return acc diff --git 
a/test_tipc/test_serving_infer_cpp.sh b/test_tipc/test_serving_infer_cpp.sh index 10ddecf3fa26805fef7bc6ae10d78ee5e741cd27..6de685682a20acda0f97e64abfa20e61284f9b1b 100644 --- a/test_tipc/test_serving_infer_cpp.sh +++ b/test_tipc/test_serving_infer_cpp.sh @@ -103,7 +103,9 @@ function func_serving(){ last_status=${PIPESTATUS[0]} eval "cat ${_save_log_path}" status_check $last_status "${cpp_client_cmd}" "${status_log}" "${model_name}" "${_save_log_path}" - ps ux | grep -i ${port_value} | awk '{print $2}' | xargs kill -s 9 + #ps ux | grep -i ${port_value} | awk '{print $2}' | xargs kill -s 9 + ${python_list[0]} ${web_service_py} stop + sleep 5s else server_log_path="${LOG_PATH}/cpp_server_gpu.log" web_service_cpp_cmd="nohup ${python_list[0]} ${web_service_py} --model ${det_server_value} ${rec_server_value} ${op_key} ${op_value} ${port_key} ${port_value} ${gpu_key} ${gpu_id} > ${server_log_path} 2>&1 &" @@ -115,7 +117,8 @@ function func_serving(){ last_status=${PIPESTATUS[0]} eval "cat ${_save_log_path}" status_check $last_status "${cpp_client_cmd}" "${status_log}" "${model_name}" "${_save_log_path}" - ps ux | grep -i ${port_value} | awk '{print $2}' | xargs kill -s 9 + #ps ux | grep -i ${port_value} | awk '{print $2}' | xargs kill -s 9 + ${python_list[0]} ${web_service_py} stop fi done } diff --git a/test_tipc/test_train_inference_python.sh b/test_tipc/test_train_inference_python.sh index e182fa57f060c81af012a5da89b892bde02b4a2b..9a94db858cb44355745ebb0399a227fe24e2dc73 100644 --- a/test_tipc/test_train_inference_python.sh +++ b/test_tipc/test_train_inference_python.sh @@ -5,7 +5,7 @@ FILENAME=$1 # MODE be one of ['lite_train_lite_infer' 'lite_train_whole_infer' 'whole_train_whole_infer', 'whole_infer'] MODE=$2 -dataline=$(awk 'NR==1, NR==51{print}' $FILENAME) +dataline=$(awk 'NR>=1{print}' $FILENAME) # parser params IFS=$'\n' @@ -88,11 +88,14 @@ benchmark_value=$(func_parser_value "${lines[49]}") infer_key1=$(func_parser_key "${lines[50]}") infer_value1=$(func_parser_value "${lines[50]}") +line_num=`grep -n -w "to_static_train_benchmark_params" $FILENAME | cut -d ":" -f 1` +to_static_key=$(func_parser_key "${lines[line_num]}") +to_static_trainer=$(func_parser_value "${lines[line_num]}") + LOG_PATH="./test_tipc/output/${model_name}/${MODE}" mkdir -p ${LOG_PATH} status_log="${LOG_PATH}/results_python.log" - function func_inference(){ IFS='|' _python=$1 @@ -253,9 +256,9 @@ else elif [ ${trainer} = "${distill_key}" ]; then run_train=${distill_trainer} run_export=${distill_export} - elif [ ${trainer} = ${trainer_key1} ]; then - run_train=${trainer_value1} - run_export=${export_value1} + elif [ ${trainer} = "${to_static_key}" ]; then + run_train="${norm_trainer} ${to_static_trainer}" + run_export=${norm_export} elif [[ ${trainer} = ${trainer_key2} ]]; then run_train=${trainer_value2} run_export=${export_value2} @@ -289,11 +292,11 @@ else set_save_model=$(func_set_params "${save_model_key}" "${save_log}") if [ ${#gpu} -le 2 ];then # train with cpu or single gpu - cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_batchsize} ${set_train_params1} ${set_amp_config} " + cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_batchsize} ${set_amp_config} ${set_train_params1}" elif [ ${#ips} -le 15 ];then # train with multi-gpu - cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_batchsize} ${set_train_params1} ${set_amp_config}" + 
cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_batchsize} ${set_amp_config} ${set_train_params1}" else # train with multi-machine - cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_batchsize} ${set_train_params1} ${set_amp_config}" + cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_batchsize} ${set_amp_config} ${set_train_params1}" fi # run train eval $cmd @@ -337,5 +340,4 @@ else done # done with: for trainer in ${trainer_list[*]}; do done # done with: for autocast in ${autocast_list[*]}; do done # done with: for gpu in ${gpu_list[*]}; do -fi # end if [ ${MODE} = "infer" ]; then - +fi # end if [ ${MODE} = "infer" ]; then \ No newline at end of file diff --git a/test_tipc/test_train_inference_python_npu.sh b/test_tipc/test_train_inference_python_npu.sh index bab70fc78ee902515c0fccb57d9215d86f2a6589..4341ceeaebfdbef2b29e40e9b2361e6ab6ab7a61 100644 --- a/test_tipc/test_train_inference_python_npu.sh +++ b/test_tipc/test_train_inference_python_npu.sh @@ -29,18 +29,28 @@ fi sed -i 's/use_gpu/use_npu/g' $FILENAME # disable benchmark as AutoLog required nvidia-smi command sed -i 's/--benchmark:True/--benchmark:False/g' $FILENAME +# python has been updated to version 3.9 for npu backend +sed -i "s/python3.7/python3.9/g" $FILENAME dataline=`cat $FILENAME` # parser params IFS=$'\n' lines=(${dataline}) +modelname=$(echo ${lines[1]} | cut -d ":" -f2) +if [ $modelname == "rec_r31_sar" ] || [ $modelname == "rec_mtb_nrtr" ]; then + sed -i "s/Global.epoch_num:lite_train_lite_infer=2/Global.epoch_num:lite_train_lite_infer=1/g" $FILENAME + sed -i "s/gpu_list:0|0,1/gpu_list:0,1/g" $FILENAME + sed -i "s/Global.use_npu:True|True/Global.use_npu:True/g" $FILENAME +fi + # replace training config file grep -n 'tools/.*yml' $FILENAME | cut -d ":" -f 1 \ | while read line_num ; do train_cmd=$(func_parser_value "${lines[line_num-1]}") trainer_config=$(func_parser_config ${train_cmd}) sed -i 's/use_gpu/use_npu/g' "$REPO_ROOT_PATH/$trainer_config" + sed -i 's/use_sync_bn: True/use_sync_bn: False/g' "$REPO_ROOT_PATH/$trainer_config" done # change gpu to npu in execution script diff --git a/tools/eval.py b/tools/eval.py index 21f4d94d5e4ed560b8775c8827ffdbbd00355218..b4c69b6d37532103f1316eb3b7a14b472d741ed3 100755 --- a/tools/eval.py +++ b/tools/eval.py @@ -24,7 +24,7 @@ sys.path.insert(0, __dir__) sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '..'))) import paddle -from ppocr.data import build_dataloader +from ppocr.data import build_dataloader, set_signal_handlers from ppocr.modeling.architectures import build_model from ppocr.postprocess import build_post_process from ppocr.metrics import build_metric @@ -35,6 +35,7 @@ import tools.program as program def main(): global_config = config['Global'] # build dataloader + set_signal_handlers() valid_dataloader = build_dataloader(config, 'Eval', device, logger) # build post process @@ -54,8 +55,12 @@ def main(): if config['PostProcess'][ 'name'] == 'DistillationSARLabelDecode': char_num = char_num - 2 + if config['PostProcess'][ + 'name'] == 'DistillationNRTRLabelDecode': + char_num = char_num - 3 out_channels_list['CTCLabelDecode'] = char_num out_channels_list['SARLabelDecode'] = char_num + 2 + out_channels_list['NRTRLabelDecode'] = char_num + 3 
config['Architecture']['Models'][key]['Head'][ 'out_channels_list'] = out_channels_list else: @@ -66,8 +71,11 @@ def main(): out_channels_list = {} if config['PostProcess']['name'] == 'SARLabelDecode': char_num = char_num - 2 + if config['PostProcess']['name'] == 'NRTRLabelDecode': + char_num = char_num - 3 out_channels_list['CTCLabelDecode'] = char_num out_channels_list['SARLabelDecode'] = char_num + 2 + out_channels_list['NRTRLabelDecode'] = char_num + 3 config['Architecture']['Head'][ 'out_channels_list'] = out_channels_list else: # base rec model @@ -75,7 +83,8 @@ def main(): model = build_model(config['Architecture']) extra_input_models = [ - "SRN", "NRTR", "SAR", "SEED", "SVTR", "VisionLAN", "RobustScanner" + "SRN", "NRTR", "SAR", "SEED", "SVTR", "SVTR_LCNet", "VisionLAN", + "RobustScanner", "SVTR_HGNet" ] extra_input = False if config['Architecture']['algorithm'] == 'Distillation': @@ -103,7 +112,7 @@ def main(): 'FLAGS_cudnn_batchnorm_spatial_persistent': 1, 'FLAGS_max_inplace_grad_add': 8, } - paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING) + paddle.set_flags(AMP_RELATED_FLAGS_SETTING) scale_loss = config["Global"].get("scale_loss", 1.0) use_dynamic_loss_scaling = config["Global"].get( "use_dynamic_loss_scaling", False) diff --git a/tools/export_center.py b/tools/export_center.py index 30b9c33499b8d0c8044682c6a078e00f683c1d7c..3f7a883528525bebe037de7f78847fd77a059142 100644 --- a/tools/export_center.py +++ b/tools/export_center.py @@ -24,7 +24,7 @@ __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(__dir__) sys.path.append(os.path.abspath(os.path.join(__dir__, '..'))) -from ppocr.data import build_dataloader +from ppocr.data import build_dataloader, set_signal_handlers from ppocr.modeling.architectures import build_model from ppocr.postprocess import build_post_process from ppocr.utils.save_load import load_model @@ -40,6 +40,7 @@ def main(): 'data_dir'] config['Eval']['dataset']['label_file_list'] = config['Train']['dataset'][ 'label_file_list'] + set_signal_handlers() eval_dataloader = build_dataloader(config, 'Eval', device, logger) # build post process diff --git a/tools/export_model.py b/tools/export_model.py index 4b90fcae435619a53a3def8cc4dc46b4e2963bff..cc515164bf64f0038856a3b97975562335eb1dc2 100755 --- a/tools/export_model.py +++ b/tools/export_model.py @@ -62,17 +62,17 @@ def export_single_model(model, shape=[None], dtype="float32")] ] model = to_static(model, input_spec=other_shape) + elif arch_config["algorithm"] in ["SVTR_LCNet", "SVTR_HGNet"]: + other_shape = [ + paddle.static.InputSpec( + shape=[None, 3, 48, -1], dtype="float32"), + ] + model = to_static(model, input_spec=other_shape) elif arch_config["algorithm"] == "SVTR": - if arch_config["Head"]["name"] == 'MultiHead': - other_shape = [ - paddle.static.InputSpec( - shape=[None, 3, 48, -1], dtype="float32"), - ] - else: - other_shape = [ - paddle.static.InputSpec( - shape=[None] + input_shape, dtype="float32"), - ] + other_shape = [ + paddle.static.InputSpec( + shape=[None] + input_shape, dtype="float32"), + ] model = to_static(model, input_spec=other_shape) elif arch_config["algorithm"] == "PREN": other_shape = [ @@ -105,6 +105,12 @@ def export_single_model(model, shape=[None, 1, 32, 100], dtype="float32"), ] model = to_static(model, input_spec=other_shape) + elif arch_config["algorithm"] == 'SATRN': + other_shape = [ + paddle.static.InputSpec( + shape=[None, 3, 32, 100], dtype="float32"), + ] + model = to_static(model, input_spec=other_shape) elif arch_config["algorithm"] == "VisionLAN": 
other_shape = [ paddle.static.InputSpec( @@ -181,6 +187,13 @@ def export_single_model(model, shape=[None] + infer_shape, dtype="float32") ]) + if arch_config["model_type"] != "sr" and arch_config["Backbone"][ + "name"] == "PPLCNetV3": + # for rep lcnetv3 + for layer in model.sublayers(): + if hasattr(layer, "rep") and not getattr(layer, "is_repped"): + layer.rep() + if quanter is None: paddle.jit.save(model, save_path) else: @@ -212,8 +225,12 @@ def main(): if config['PostProcess'][ 'name'] == 'DistillationSARLabelDecode': char_num = char_num - 2 + if config['PostProcess'][ + 'name'] == 'DistillationNRTRLabelDecode': + char_num = char_num - 3 out_channels_list['CTCLabelDecode'] = char_num out_channels_list['SARLabelDecode'] = char_num + 2 + out_channels_list['NRTRLabelDecode'] = char_num + 3 config['Architecture']['Models'][key]['Head'][ 'out_channels_list'] = out_channels_list else: @@ -228,8 +245,11 @@ def main(): char_num = len(getattr(post_process_class, 'character')) if config['PostProcess']['name'] == 'SARLabelDecode': char_num = char_num - 2 + if config['PostProcess']['name'] == 'NRTRLabelDecode': + char_num = char_num - 3 out_channels_list['CTCLabelDecode'] = char_num out_channels_list['SARLabelDecode'] = char_num + 2 + out_channels_list['NRTRLabelDecode'] = char_num + 3 config['Architecture']['Head'][ 'out_channels_list'] = out_channels_list else: # base rec model diff --git a/tools/infer/predict_det.py b/tools/infer/predict_det.py index 1b4446a6717bccdc5b3de4ba70e058885479be84..6c5c36cf86febef406609bf5022cfd2ee776756f 100755 --- a/tools/infer/predict_det.py +++ b/tools/infer/predict_det.py @@ -143,7 +143,9 @@ class TextDetector(object): if self.use_onnx: img_h, img_w = self.input_tensor.shape[2:] - if img_h is not None and img_w is not None and img_h > 0 and img_w > 0: + if isinstance(img_h, str) or isinstance(img_w, str): + pass + elif img_h is not None and img_w is not None and img_h > 0 and img_w > 0: pre_process_list[0] = { 'DetResizeForTest': { 'image_shape': [img_h, img_w] diff --git a/tools/infer/predict_rec.py b/tools/infer/predict_rec.py index b3ef557c09fb74990b65c266afa5d5c77960b7ed..9dd33dc7b68e05cc218a9a0746cb58ccb5a8ebb2 100755 --- a/tools/infer/predict_rec.py +++ b/tools/infer/predict_rec.py @@ -106,6 +106,13 @@ class TextRecognizer(object): "character_dict_path": None, "use_space_char": args.use_space_char } + elif self.rec_algorithm == "SATRN": + postprocess_params = { + 'name': 'SATRNLabelDecode', + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char, + "rm_symbol": True + } elif self.rec_algorithm == "PREN": postprocess_params = {'name': 'PRENLabelDecode'} elif self.rec_algorithm == "CAN": @@ -116,6 +123,7 @@ class TextRecognizer(object): "use_space_char": args.use_space_char } self.postprocess_op = build_post_process(postprocess_params) + self.postprocess_params = postprocess_params self.predictor, self.input_tensor, self.output_tensors, self.config = \ utility.create_predictor(args, 'rec', logger) self.benchmark = args.benchmark @@ -139,6 +147,7 @@ class TextRecognizer(object): ], warmup=0, logger=logger) + self.return_word_box = args.return_word_box def resize_norm_img(self, img, max_wh_ratio): imgC, imgH, imgW = self.rec_image_shape @@ -149,7 +158,7 @@ class TextRecognizer(object): if self.rec_algorithm == 'ViTSTR': img = image_pil.resize([imgW, imgH], Image.BICUBIC) else: - img = image_pil.resize([imgW, imgH], Image.ANTIALIAS) + img = image_pil.resize([imgW, imgH], Image.Resampling.LANCZOS) img = np.array(img) norm_img = 
np.expand_dims(img, -1) norm_img = norm_img.transpose((2, 0, 1)) @@ -173,9 +182,10 @@ class TextRecognizer(object): imgW = int((imgH * max_wh_ratio)) if self.use_onnx: w = self.input_tensor.shape[3:][0] - if w is not None and w > 0: + if isinstance(w, str): + pass + elif w is not None and w > 0: imgW = w - h, w = img.shape[:2] ratio = w / float(h) if math.ceil(imgH * ratio) > imgW: @@ -407,11 +417,12 @@ class TextRecognizer(object): valid_ratios = [] imgC, imgH, imgW = self.rec_image_shape[:3] max_wh_ratio = imgW / imgH - # max_wh_ratio = 0 + wh_ratio_list = [] for ino in range(beg_img_no, end_img_no): h, w = img_list[indices[ino]].shape[0:2] wh_ratio = w * 1.0 / h max_wh_ratio = max(max_wh_ratio, wh_ratio) + wh_ratio_list.append(wh_ratio) for ino in range(beg_img_no, end_img_no): if self.rec_algorithm == "SAR": norm_img, _, _, valid_ratio = self.resize_norm_img_sar( @@ -428,7 +439,7 @@ class TextRecognizer(object): gsrm_slf_attn_bias1_list.append(norm_img[3]) gsrm_slf_attn_bias2_list.append(norm_img[4]) norm_img_batch.append(norm_img[0]) - elif self.rec_algorithm == "SVTR": + elif self.rec_algorithm in ["SVTR", "SATRN"]: norm_img = self.resize_norm_img_svtr(img_list[indices[ino]], self.rec_image_shape) norm_img = norm_img[np.newaxis, :] @@ -616,7 +627,10 @@ class TextRecognizer(object): preds = outputs else: preds = outputs[0] - rec_result = self.postprocess_op(preds) + if self.postprocess_params['name'] == 'CTCLabelDecode': + rec_result = self.postprocess_op(preds, return_word_box=self.return_word_box, wh_ratio_list=wh_ratio_list, max_wh_ratio=max_wh_ratio) + else: + rec_result = self.postprocess_op(preds) for rno in range(len(rec_result)): rec_res[indices[beg_img_no + rno]] = rec_result[rno] if self.benchmark: diff --git a/tools/infer/predict_system.py b/tools/infer/predict_system.py index affd0d1bcd1283be02ead3cd61c01c375b49bdf9..8af45b4cf52eb6355c9d4e08bc609e6ea91dfb43 100755 --- a/tools/infer/predict_system.py +++ b/tools/infer/predict_system.py @@ -34,7 +34,7 @@ import tools.infer.predict_det as predict_det import tools.infer.predict_cls as predict_cls from ppocr.utils.utility import get_image_file_list, check_and_read from ppocr.utils.logging import get_logger -from tools.infer.utility import draw_ocr_box_txt, get_rotate_crop_image +from tools.infer.utility import draw_ocr_box_txt, get_rotate_crop_image, get_minarea_rect_crop logger = get_logger() @@ -65,40 +65,53 @@ class TextSystem(object): self.crop_image_res_index += bbox_num def __call__(self, img, cls=True): - time_dict = {'det': 0, 'rec': 0, 'csl': 0, 'all': 0} + time_dict = {'det': 0, 'rec': 0, 'cls': 0, 'all': 0} + + if img is None: + logger.debug("no valid image provided") + return None, None, time_dict + start = time.time() ori_im = img.copy() dt_boxes, elapse = self.text_detector(img) time_dict['det'] = elapse - logger.debug("dt_boxes num : {}, elapse : {}".format( - len(dt_boxes), elapse)) + if dt_boxes is None: - return None, None + logger.debug("no dt_boxes found, elapsed : {}".format(elapse)) + end = time.time() + time_dict['all'] = end - start + return None, None, time_dict + else: + logger.debug("dt_boxes num : {}, elapsed : {}".format( + len(dt_boxes), elapse)) img_crop_list = [] dt_boxes = sorted_boxes(dt_boxes) for bno in range(len(dt_boxes)): tmp_box = copy.deepcopy(dt_boxes[bno]) - img_crop = get_rotate_crop_image(ori_im, tmp_box) + if self.args.det_box_type == "quad": + img_crop = get_rotate_crop_image(ori_im, tmp_box) + else: + img_crop = get_minarea_rect_crop(ori_im, tmp_box) img_crop_list.append(img_crop) 
if self.use_angle_cls and cls: img_crop_list, angle_list, elapse = self.text_classifier( img_crop_list) time_dict['cls'] = elapse - logger.debug("cls num : {}, elapse : {}".format( + logger.debug("cls num : {}, elapsed : {}".format( len(img_crop_list), elapse)) rec_res, elapse = self.text_recognizer(img_crop_list) time_dict['rec'] = elapse - logger.debug("rec_res num : {}, elapse : {}".format( + logger.debug("rec_res num : {}, elapsed : {}".format( len(rec_res), elapse)) if self.args.save_crop_res: self.draw_crop_rec_res(self.args.crop_res_save_dir, img_crop_list, rec_res) filter_boxes, filter_rec_res = [], [] for box, rec_result in zip(dt_boxes, rec_res): - text, score = rec_result + text, score = rec_result[0], rec_result[1] if score >= self.drop_score: filter_boxes.append(box) filter_rec_res.append(rec_result) @@ -120,7 +133,7 @@ def sorted_boxes(dt_boxes): _boxes = list(sorted_boxes) for i in range(num_boxes - 1): - for j in range(i, 0, -1): + for j in range(i, -1, -1): if abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10 and \ (_boxes[j + 1][0][0] < _boxes[j][0][0]): tmp = _boxes[j] diff --git a/tools/infer/utility.py b/tools/infer/utility.py index 34cad2590f2904f79709530acf841033c89088e0..b064cbf18941a40bdca57e4d7e4ec68dc42e6fc6 100644 --- a/tools/infer/utility.py +++ b/tools/infer/utility.py @@ -19,6 +19,7 @@ import platform import cv2 import numpy as np import paddle +import PIL from PIL import Image, ImageDraw, ImageFont import math from paddle import inference @@ -28,8 +29,10 @@ from ppocr.utils.logging import get_logger def str2bool(v): - return v.lower() in ("true", "t", "1") + return v.lower() in ("true", "yes", "t", "y", "1") +def str2int_tuple(v): + return tuple([int(i.strip()) for i in v.split(",")]) def init_args(): parser = argparse.ArgumentParser() @@ -42,6 +45,7 @@ def init_args(): parser.add_argument("--min_subgraph_size", type=int, default=15) parser.add_argument("--precision", type=str, default="fp32") parser.add_argument("--gpu_mem", type=int, default=500) + parser.add_argument("--gpu_id", type=int, default=0) # params for text detector parser.add_argument("--image_dir", type=str) @@ -144,6 +148,10 @@ def init_args(): parser.add_argument("--show_log", type=str2bool, default=True) parser.add_argument("--use_onnx", type=str2bool, default=False) + + # extended function + parser.add_argument("--return_word_box", type=str2bool, default=False, help='Whether return the bbox of each word (split by space) or chinese character. Only used in ppstructure for layout recovery') + return parser @@ -181,7 +189,10 @@ def create_predictor(args, mode, logger): if not os.path.exists(model_file_path): raise ValueError("not find model file path {}".format( model_file_path)) - sess = ort.InferenceSession(model_file_path) + if args.use_gpu: + sess = ort.InferenceSession(model_file_path, providers=['CUDAExecutionProvider']) + else: + sess = ort.InferenceSession(model_file_path) return sess, sess.get_inputs()[0], None, None else: @@ -219,7 +230,7 @@ def create_predictor(args, mode, logger): logger.warning( "GPU is not found in current device by nvidia-smi. Please check your device or ignore it if run on jetson." 
) - config.enable_use_gpu(args.gpu_mem, 0) + config.enable_use_gpu(args.gpu_mem, args.gpu_id) if args.use_tensorrt: config.enable_tensorrt_engine( workspace_size=1 << 30, @@ -245,7 +256,7 @@ def create_predictor(args, mode, logger): logger.info("Please keep your paddlepaddle-gpu >= 2.3.0!") elif args.use_npu: - config.enable_npu() + config.enable_custom_device("npu") elif args.use_xpu: config.enable_xpu(10 * 1024 * 1024) else: @@ -290,7 +301,9 @@ def create_predictor(args, mode, logger): def get_output_tensors(args, mode, predictor): output_names = predictor.get_output_names() output_tensors = [] - if mode == "rec" and args.rec_algorithm in ["CRNN", "SVTR_LCNet"]: + if mode == "rec" and args.rec_algorithm in [ + "CRNN", "SVTR_LCNet", "SVTR_HGNet" + ]: output_name = 'softmax_0.tmp_0' if output_name in output_names: return [predictor.get_output_handle(output_name)] @@ -310,7 +323,7 @@ def get_infer_gpuid(): if sysstr == "Windows": return 0 - if not paddle.fluid.core.is_compiled_with_rocm(): + if not paddle.device.is_compiled_with_rocm: cmd = "env | grep CUDA_VISIBLE_DEVICES" else: cmd = "env | grep HIP_VISIBLE_DEVICES" @@ -468,7 +481,11 @@ def draw_box_txt_fine(img_size, box, txt, font_path="./doc/fonts/simfang.ttf"): def create_font(txt, sz, font_path="./doc/fonts/simfang.ttf"): font_size = int(sz[1] * 0.99) font = ImageFont.truetype(font_path, font_size, encoding="utf-8") - length = font.getsize(txt)[0] + if int(PIL.__version__.split('.')[0]) < 10: + length = font.getsize(txt)[0] + else: + length = font.getlength(txt) + if length > sz[0]: font_size = int(font_size * sz[0] / length) font = ImageFont.truetype(font_path, font_size, encoding="utf-8") @@ -629,6 +646,29 @@ def get_rotate_crop_image(img, points): return dst_img +def get_minarea_rect_crop(img, points): + bounding_box = cv2.minAreaRect(np.array(points).astype(np.int32)) + points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0]) + + index_a, index_b, index_c, index_d = 0, 1, 2, 3 + if points[1][1] > points[0][1]: + index_a = 0 + index_d = 1 + else: + index_a = 1 + index_d = 0 + if points[3][1] > points[2][1]: + index_b = 2 + index_c = 3 + else: + index_b = 3 + index_c = 2 + + box = [points[index_a], points[index_b], points[index_c], points[index_d]] + crop_img = get_rotate_crop_image(img, np.array(box)) + return crop_img + + def check_gpu(use_gpu): if use_gpu and not paddle.is_compiled_with_cuda(): use_gpu = False diff --git a/tools/infer_det.py b/tools/infer_det.py index f253e8f2876a5942538f18e93dfdada4391875b2..097d032b99c7d25a3e9e3b1d781bbdbe4dde62fa 100755 --- a/tools/infer_det.py +++ b/tools/infer_det.py @@ -40,17 +40,16 @@ import tools.program as program def draw_det_res(dt_boxes, config, img, img_name, save_path): - if len(dt_boxes) > 0: - import cv2 - src_im = img - for box in dt_boxes: - box = np.array(box).astype(np.int32).reshape((-1, 1, 2)) - cv2.polylines(src_im, [box], True, color=(255, 255, 0), thickness=2) - if not os.path.exists(save_path): - os.makedirs(save_path) - save_path = os.path.join(save_path, os.path.basename(img_name)) - cv2.imwrite(save_path, src_im) - logger.info("The detected Image saved in {}".format(save_path)) + import cv2 + src_im = img + for box in dt_boxes: + box = np.array(box).astype(np.int32).reshape((-1, 1, 2)) + cv2.polylines(src_im, [box], True, color=(255, 255, 0), thickness=2) + if not os.path.exists(save_path): + os.makedirs(save_path) + save_path = os.path.join(save_path, os.path.basename(img_name)) + cv2.imwrite(save_path, src_im) + logger.info("The detected Image saved 
in {}".format(save_path)) @paddle.no_grad() diff --git a/tools/infer_kie_token_ser_re.py b/tools/infer_kie_token_ser_re.py index c4fa2c927ab93cfa9082e51f08f8d6e1c35fe29e..76120a913f36c815cbd3b9314523ea91e3290065 100755 --- a/tools/infer_kie_token_ser_re.py +++ b/tools/infer_kie_token_ser_re.py @@ -81,7 +81,7 @@ def make_input(ser_inputs, ser_results): end.append(entity['end']) label.append(entities_labels[res['pred']]) - entities = np.full([max_seq_len + 1, 3], fill_value=-1) + entities = np.full([max_seq_len + 1, 3], fill_value=-1, dtype=np.int64) entities[0, 0] = len(start) entities[1:len(start) + 1, 0] = start entities[0, 1] = len(end) @@ -98,7 +98,7 @@ def make_input(ser_inputs, ser_results): head.append(i) tail.append(j) - relations = np.full([len(head) + 1, 2], fill_value=-1) + relations = np.full([len(head) + 1, 2], fill_value=-1, dtype=np.int64) relations[0, 0] = len(head) relations[1:len(head) + 1, 0] = head relations[0, 1] = len(tail) diff --git a/tools/infer_rec.py b/tools/infer_rec.py index 29aab9b57853b16bf615c893c30351a403270b57..80986ccdebb5b0e91cb843933c1a0ee6914ca671 100755 --- a/tools/infer_rec.py +++ b/tools/infer_rec.py @@ -48,34 +48,41 @@ def main(): # build model if hasattr(post_process_class, 'character'): char_num = len(getattr(post_process_class, 'character')) - if config['Architecture']["algorithm"] in ["Distillation", + if config["Architecture"]["algorithm"] in ["Distillation", ]: # distillation model - for key in config['Architecture']["Models"]: - if config['Architecture']['Models'][key]['Head'][ - 'name'] == 'MultiHead': # for multi head + for key in config["Architecture"]["Models"]: + if config["Architecture"]["Models"][key]["Head"][ + "name"] == 'MultiHead': # multi head out_channels_list = {} if config['PostProcess'][ 'name'] == 'DistillationSARLabelDecode': char_num = char_num - 2 + if config['PostProcess'][ + 'name'] == 'DistillationNRTRLabelDecode': + char_num = char_num - 3 out_channels_list['CTCLabelDecode'] = char_num out_channels_list['SARLabelDecode'] = char_num + 2 + out_channels_list['NRTRLabelDecode'] = char_num + 3 config['Architecture']['Models'][key]['Head'][ 'out_channels_list'] = out_channels_list else: - config['Architecture']["Models"][key]["Head"][ - 'out_channels'] = char_num + config["Architecture"]["Models"][key]["Head"][ + "out_channels"] = char_num elif config['Architecture']['Head'][ - 'name'] == 'MultiHead': # for multi head loss + 'name'] == 'MultiHead': # multi head out_channels_list = {} + char_num = len(getattr(post_process_class, 'character')) if config['PostProcess']['name'] == 'SARLabelDecode': char_num = char_num - 2 + if config['PostProcess']['name'] == 'NRTRLabelDecode': + char_num = char_num - 3 out_channels_list['CTCLabelDecode'] = char_num out_channels_list['SARLabelDecode'] = char_num + 2 + out_channels_list['NRTRLabelDecode'] = char_num + 3 config['Architecture']['Head'][ 'out_channels_list'] = out_channels_list else: # base rec model - config['Architecture']["Head"]['out_channels'] = char_num - + config["Architecture"]["Head"]["out_channels"] = char_num model = build_model(config['Architecture']) load_model(config, model) diff --git a/tools/program.py b/tools/program.py index afb8a47254b9847e4a4d432b7f17902c3ee78725..511ee9dd1f12273eb773b6f2e29a3955940721ee 100755 --- a/tools/program.py +++ b/tools/program.py @@ -134,9 +134,18 @@ def check_device(use_gpu, use_xpu=False, use_npu=False, use_mlu=False): if use_xpu and not paddle.device.is_compiled_with_xpu(): print(err.format("use_xpu", "xpu", "xpu", "use_xpu")) 
sys.exit(1) - if use_npu and not paddle.device.is_compiled_with_npu(): - print(err.format("use_npu", "npu", "npu", "use_npu")) - sys.exit(1) + if use_npu: + if int(paddle.version.major) != 0 and int( + paddle.version.major) <= 2 and int( + paddle.version.minor) <= 4: + if not paddle.device.is_compiled_with_npu(): + print(err.format("use_npu", "npu", "npu", "use_npu")) + sys.exit(1) + # is_compiled_with_npu() has been updated after paddle-2.4 + else: + if not paddle.device.is_compiled_with_custom_device("npu"): + print(err.format("use_npu", "npu", "npu", "use_npu")) + sys.exit(1) if use_mlu and not paddle.device.is_compiled_with_mlu(): print(err.format("use_mlu", "mlu", "mlu", "use_mlu")) sys.exit(1) @@ -179,7 +188,9 @@ def train(config, log_writer=None, scaler=None, amp_level='O2', - amp_custom_black_list=[]): + amp_custom_black_list=[], + amp_custom_white_list=[], + amp_dtype='float16'): cal_metric_during_train = config['Global'].get('cal_metric_during_train', False) calc_epoch_interval = config['Global'].get('calc_epoch_interval', 1) @@ -219,8 +230,8 @@ def train(config, use_srn = config['Architecture']['algorithm'] == "SRN" extra_input_models = [ - "SRN", "NRTR", "SAR", "SEED", "SVTR", "SPIN", "VisionLAN", - "RobustScanner", "RFL", 'DRRG' + "SRN", "NRTR", "SAR", "SEED", "SVTR", "SVTR_LCNet", "SPIN", "VisionLAN", + "RobustScanner", "RFL", 'DRRG', 'SATRN', 'SVTR_HGNet' ] extra_input = False if config['Architecture']['algorithm'] == 'Distillation': @@ -268,7 +279,9 @@ def train(config, if scaler: with paddle.amp.auto_cast( level=amp_level, - custom_black_list=amp_custom_black_list): + custom_black_list=amp_custom_black_list, + custom_white_list=amp_custom_white_list, + dtype=amp_dtype): if model_type == 'table' or extra_input: preds = model(images, data=batch[1:]) elif model_type in ["kie"]: @@ -333,7 +346,10 @@ def train(config, lr_scheduler.step() # logger and visualdl - stats = {k: v.numpy().mean() for k, v in loss.items()} + stats = { + k: float(v) if v.shape == [] else v.numpy().mean() + for k, v in loss.items() + } stats['lr'] = lr train_stats.update(stats) @@ -382,7 +398,9 @@ def train(config, extra_input=extra_input, scaler=scaler, amp_level=amp_level, - amp_custom_black_list=amp_custom_black_list) + amp_custom_black_list=amp_custom_black_list, + amp_custom_white_list=amp_custom_white_list, + amp_dtype=amp_dtype) cur_metric_str = 'cur metric, {}'.format(', '.join( ['{}: {}'.format(k, v) for k, v in cur_metric.items()])) logger.info(cur_metric_str) @@ -475,7 +493,9 @@ def eval(model, extra_input=False, scaler=None, amp_level='O2', - amp_custom_black_list=[]): + amp_custom_black_list=[], + amp_custom_white_list=[], + amp_dtype='float16'): model.eval() with paddle.no_grad(): total_frame = 0.0 @@ -498,7 +518,8 @@ def eval(model, if scaler: with paddle.amp.auto_cast( level=amp_level, - custom_black_list=amp_custom_black_list): + custom_black_list=amp_custom_black_list, + dtype=amp_dtype): if model_type == 'table' or extra_input: preds = model(images, data=batch[1:]) elif model_type in ["kie"]: @@ -641,9 +662,9 @@ def preprocess(is_train=False): 'EAST', 'DB', 'SAST', 'Rosetta', 'CRNN', 'STARNet', 'RARE', 'SRN', 'CLS', 'PGNet', 'Distillation', 'NRTR', 'TableAttn', 'SAR', 'PSE', 'SEED', 'SDMGR', 'LayoutXLM', 'LayoutLM', 'LayoutLMv2', 'PREN', 'FCE', - 'SVTR', 'ViTSTR', 'ABINet', 'DB++', 'TableMaster', 'SPIN', 'VisionLAN', - 'Gestalt', 'SLANet', 'RobustScanner', 'CT', 'RFL', 'DRRG', 'CAN', - 'Telescope' + 'SVTR', 'SVTR_LCNet', 'ViTSTR', 'ABINet', 'DB++', 'TableMaster', 'SPIN', + 'VisionLAN', 
'Gestalt', 'SLANet', 'RobustScanner', 'CT', 'RFL', 'DRRG', + 'CAN', 'Telescope', 'SATRN', 'SVTR_HGNet' ] if use_xpu: @@ -665,7 +686,7 @@ def preprocess(is_train=False): if 'use_visualdl' in config['Global'] and config['Global']['use_visualdl']: save_model_dir = config['Global']['save_model_dir'] - vdl_writer_path = '{}/vdl/'.format(save_model_dir) + vdl_writer_path = save_model_dir log_writer = VDLLogger(vdl_writer_path) loggers.append(log_writer) if ('use_wandb' in config['Global'] and diff --git a/tools/train.py b/tools/train.py index ff261e85fec10ec974ff763d6c3747faaa47c8d9..85c98eaddfe69c08e0e29921edcb1d26539b871f 100755 --- a/tools/train.py +++ b/tools/train.py @@ -27,7 +27,7 @@ import yaml import paddle import paddle.distributed as dist -from ppocr.data import build_dataloader +from ppocr.data import build_dataloader, set_signal_handlers from ppocr.modeling.architectures import build_model from ppocr.losses import build_loss from ppocr.optimizer import build_optimizer @@ -41,7 +41,7 @@ import tools.program as program dist.get_world_size() -def main(config, device, logger, vdl_writer): +def main(config, device, logger, vdl_writer, seed): # init dist environment if config['Global']['distributed']: dist.init_parallel_env() @@ -49,7 +49,8 @@ def main(config, device, logger, vdl_writer): global_config = config['Global'] # build dataloader - train_dataloader = build_dataloader(config, 'Train', device, logger) + set_signal_handlers() + train_dataloader = build_dataloader(config, 'Train', device, logger, seed) if len(train_dataloader) == 0: logger.error( "No Images in train dataset, please ensure\n" + @@ -60,7 +61,7 @@ def main(config, device, logger, vdl_writer): return if config['Eval']: - valid_dataloader = build_dataloader(config, 'Eval', device, logger) + valid_dataloader = build_dataloader(config, 'Eval', device, logger, seed) else: valid_dataloader = None @@ -80,14 +81,22 @@ def main(config, device, logger, vdl_writer): if config['PostProcess'][ 'name'] == 'DistillationSARLabelDecode': char_num = char_num - 2 - # update SARLoss params - assert list(config['Loss']['loss_config_list'][-1].keys())[ - 0] == 'DistillationSARLoss' - config['Loss']['loss_config_list'][-1][ - 'DistillationSARLoss']['ignore_index'] = char_num + 1 + if config['PostProcess'][ + 'name'] == 'DistillationNRTRLabelDecode': + char_num = char_num - 3 out_channels_list = {} out_channels_list['CTCLabelDecode'] = char_num - out_channels_list['SARLabelDecode'] = char_num + 2 + # update SARLoss params + if list(config['Loss']['loss_config_list'][-1].keys())[ + 0] == 'DistillationSARLoss': + config['Loss']['loss_config_list'][-1][ + 'DistillationSARLoss'][ + 'ignore_index'] = char_num + 1 + out_channels_list['SARLabelDecode'] = char_num + 2 + elif list(config['Loss']['loss_config_list'][-1].keys())[ + 0] == 'DistillationNRTRLoss': + out_channels_list['NRTRLabelDecode'] = char_num + 3 + config['Architecture']['Models'][key]['Head'][ 'out_channels_list'] = out_channels_list else: @@ -97,19 +106,24 @@ def main(config, device, logger, vdl_writer): 'name'] == 'MultiHead': # for multi head if config['PostProcess']['name'] == 'SARLabelDecode': char_num = char_num - 2 - # update SARLoss params - assert list(config['Loss']['loss_config_list'][1].keys())[ - 0] == 'SARLoss' - if config['Loss']['loss_config_list'][1]['SARLoss'] is None: - config['Loss']['loss_config_list'][1]['SARLoss'] = { - 'ignore_index': char_num + 1 - } - else: - config['Loss']['loss_config_list'][1]['SARLoss'][ - 'ignore_index'] = char_num + 1 + if 
config['PostProcess']['name'] == 'NRTRLabelDecode': + char_num = char_num - 3 out_channels_list = {} out_channels_list['CTCLabelDecode'] = char_num - out_channels_list['SARLabelDecode'] = char_num + 2 + # update SARLoss params + if list(config['Loss']['loss_config_list'][1].keys())[ + 0] == 'SARLoss': + if config['Loss']['loss_config_list'][1]['SARLoss'] is None: + config['Loss']['loss_config_list'][1]['SARLoss'] = { + 'ignore_index': char_num + 1 + } + else: + config['Loss']['loss_config_list'][1]['SARLoss'][ + 'ignore_index'] = char_num + 1 + out_channels_list['SARLabelDecode'] = char_num + 2 + elif list(config['Loss']['loss_config_list'][1].keys())[ + 0] == 'NRTRLoss': + out_channels_list['NRTRLabelDecode'] = char_num + 3 config['Architecture']['Head'][ 'out_channels_list'] = out_channels_list else: # base rec model @@ -147,14 +161,17 @@ def main(config, device, logger, vdl_writer): use_amp = config["Global"].get("use_amp", False) amp_level = config["Global"].get("amp_level", 'O2') + amp_dtype = config["Global"].get("amp_dtype", 'float16') amp_custom_black_list = config['Global'].get('amp_custom_black_list', []) + amp_custom_white_list = config['Global'].get('amp_custom_white_list', []) if use_amp: AMP_RELATED_FLAGS_SETTING = {'FLAGS_max_inplace_grad_add': 8, } if paddle.is_compiled_with_cuda(): AMP_RELATED_FLAGS_SETTING.update({ - 'FLAGS_cudnn_batchnorm_spatial_persistent': 1 + 'FLAGS_cudnn_batchnorm_spatial_persistent': 1, + 'FLAGS_gemm_use_half_precision_compute_type': 0, }) - paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING) + paddle.set_flags(AMP_RELATED_FLAGS_SETTING) scale_loss = config["Global"].get("scale_loss", 1.0) use_dynamic_loss_scaling = config["Global"].get( "use_dynamic_loss_scaling", False) @@ -166,7 +183,8 @@ def main(config, device, logger, vdl_writer): models=model, optimizers=optimizer, level=amp_level, - master_weight=True) + master_weight=True, + dtype=amp_dtype) else: scaler = None @@ -180,7 +198,8 @@ def main(config, device, logger, vdl_writer): program.train(config, train_dataloader, valid_dataloader, device, model, loss_class, optimizer, lr_scheduler, post_process_class, eval_class, pre_best_model_dict, logger, vdl_writer, scaler, - amp_level, amp_custom_black_list) + amp_level, amp_custom_black_list, amp_custom_white_list, + amp_dtype) def test_reader(config, device, logger): @@ -205,5 +224,5 @@ if __name__ == '__main__': config, device, logger, vdl_writer = program.preprocess(is_train=True) seed = config['Global']['seed'] if 'seed' in config['Global'] else 1024 set_seed(seed) - main(config, device, logger, vdl_writer) + main(config, device, logger, vdl_writer, seed) # test_reader(config, device, logger)
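
As a side note on the Pillow compatibility branch this patch adds to create_font() in tools/infer/utility.py (font.getsize for Pillow < 10, font.getlength otherwise), the following is a minimal standalone sketch of that version-dependent text-width measurement; it is illustrative only and not part of the patch, and the default font path (taken from the patch's own defaults) must exist locally for it to run:

import PIL
from PIL import ImageFont

def text_width(txt, font_path="./doc/fonts/simfang.ttf", font_size=20):
    # Pillow removed ImageFont.FreeTypeFont.getsize() in version 10;
    # getlength() is the replacement and has been available since Pillow 8.0.
    font = ImageFont.truetype(font_path, font_size, encoding="utf-8")
    if int(PIL.__version__.split('.')[0]) < 10:
        return font.getsize(txt)[0]
    return font.getlength(txt)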