diff --git a/docs/lite/api/_custom/graph b/docs/lite/api/_custom/graph index adf0c0f4e00a030075105bb8a75cef1d8803a1b6..fe0e2282bbd4fd9ff328c61ddc82820ae7e25d12 100644 --- a/docs/lite/api/_custom/graph +++ b/docs/lite/api/_custom/graph @@ -2120,6 +2120,8 @@ class ExhaleRoot(object): # old_name = str(file) # new_name = str(file).replace('_runtime', '').replace('_converter', '') # os.rename(os.path.join(dirA, old_name), os.path.join(dirA, new_name)) + + # 拷贝c++目录文件至临时文件夹并改名 for i in os.listdir(dirA): if i.startswith('namespace_mindspore') and i != 'namespace_mindspore__dataset.rst': new_name = i.replace("__", "_").split("_", 1)[-1] @@ -2132,6 +2134,7 @@ class ExhaleRoot(object): if not re.findall(f'\n.*\n[=-]+\n(?:.|\n|)+?\w', generate_doc) and '.. toctree::' not in generate_doc: os.remove(os.path.join(dirA, i)) del_list_en.append(i) + # 特殊rst文件处理,删除toctree if os.path.exists(os.path.join(dirA, 'unabridged_api.rst')): with open(os.path.join(dirA, 'unabridged_api.rst'), 'r+', encoding='utf-8') as h: content = h.read() @@ -2147,6 +2150,7 @@ class ExhaleRoot(object): if "mindspore_utils" in file: os.remove(dirB+file) + # 调整目录文件中的链接方式 title = re.compile(r'[\s\n\S]*=====') content = re.compile(r'(Classes|Functions|Enums|Type Definition|Structs)([\s\n\S]*)') for file in os.listdir(dirB): @@ -2165,6 +2169,7 @@ class ExhaleRoot(object): shutil.rmtree(dirB) + # 遍历generate文件夹找出c相关的rst并生成目录文件 dirE = './source_en/api_c/' if not os.path.exists(dirE): os.makedirs(dirE) @@ -2186,6 +2191,7 @@ class ExhaleRoot(object): p.write("\n\n"+"".join(mycapi)) mycapi = [] + # index.rst 目录生成 cppresult = [] javaresult = [] cresult = [] diff --git a/docs/lite/api/source_en/conf.py b/docs/lite/api/source_en/conf.py index 761b49102147e755b4a2115890f639c7950d8c50..e7f2c31f35be6908014ab0d1a1925665f599feb9 100644 --- a/docs/lite/api/source_en/conf.py +++ b/docs/lite/api/source_en/conf.py @@ -428,6 +428,7 @@ save_path = "../" extract_tar_gz(lite_package_path, save_path) +# 拷贝需要的部分到include里 source_path = "../" + header_path + "/" source_runtime_include = os.path.join(source_path, "runtime/include") target_runtime_include = "../include/runtime/include" diff --git a/docs/lite/docs/source_en/infer/runtime_cpp.md b/docs/lite/docs/source_en/infer/runtime_cpp.md index 4bfbfae48863d35160b66d2ac4a5509e6c5eb752..f5499118fe9106b29f3d9cd42efdae3ce5495850 100644 --- a/docs/lite/docs/source_en/infer/runtime_cpp.md +++ b/docs/lite/docs/source_en/infer/runtime_cpp.md @@ -726,7 +726,7 @@ int RunEncryptedInfer(const char *model_path, const std::string dec_key_str, If the command for using the converter_lite is: ```bash -./converter_lite --fmk=MINDIR --modelFile=./lenet.mindir --outputFile=lenet_enc --encryptKey=30313233343536373839414243444546 --encryption=true +./converter_lite --fmk=MINDIR --modelFile=./lenet.mindir --outputFile=lenet_enc --encryptKey="your encrypt key" --encryption=true ``` Compile the source code in the mindspore-lite/examples/runtime_cpp directory, and generate build/runtime_cpp: @@ -740,7 +740,7 @@ cd build Run MindSpore Lite inference on the encrypted model file: ```bash -./runtime_cpp --modelFile=./lenet_enc.ms 6 30313233343536373839414243444546 ${your_openssl_path} +./runtime_cpp --modelFile=./lenet_enc.ms 6 "your decrypt key" ${your_openssl_path} ``` ### Viewing Logs diff --git a/docs/lite/docs/source_en/mindir/benchmark_tool.md b/docs/lite/docs/source_en/mindir/benchmark_tool.md index 0ab57251143fa84f2ca4757fbdc8d3db3a1e82ed..d58381e84a76601eeaa897dff66bdf481fd1ad22 100644 --- 
a/docs/lite/docs/source_en/mindir/benchmark_tool.md +++ b/docs/lite/docs/source_en/mindir/benchmark_tool.md @@ -115,5 +115,5 @@ If you need to specify the dimension of the input data (e.g. input dimension is If the model is encryption model, inference is performed after both `decryptKey` and `cryptoLibPath` are configured to decrypt the model. For example: ```bash -./benchmark --modelFile=/path/to/encry_model.mindir --decryptKey=30313233343536373839414243444546 --cryptoLibPath=/root/anaconda3/bin/openssl +./benchmark --modelFile=/path/to/encry_model.mindir --decryptKey="your decrypt key" --cryptoLibPath=/root/anaconda3/bin/openssl ``` \ No newline at end of file diff --git a/docs/lite/docs/source_zh_cn/infer/runtime_cpp.md b/docs/lite/docs/source_zh_cn/infer/runtime_cpp.md index b83a8db4a12835f47e31159469376e7cc7c1cbea..fb4f0fd90015b6227eb3d09074b09a13d236ec1f 100644 --- a/docs/lite/docs/source_zh_cn/infer/runtime_cpp.md +++ b/docs/lite/docs/source_zh_cn/infer/runtime_cpp.md @@ -725,7 +725,7 @@ int RunEncryptedInfer(const char *model_path, const std::string dec_key_str, 使用converter_lite工具的命令为: ```bash -./converter_lite --fmk=MINDIR --modelFile=./lenet.mindir --outputFile=lenet_enc --encryptKey=30313233343536373839414243444546 --encryption=true +./converter_lite --fmk=MINDIR --modelFile=./lenet.mindir --outputFile=lenet_enc --encryptKey="your encrypt key" --encryption=true ``` 在mindspore-lite/examples/runtime_cpp目录下编译源码生成build/runtime_cpp文件: @@ -739,7 +739,7 @@ cd build 运行Lite端侧使用加密后的模型进行推理: ```bash -./runtime_cpp --modelFile=./lenet_enc.ms 6 30313233343536373839414243444546 ${your_openssl_path} +./runtime_cpp --modelFile=./lenet_enc.ms 6 "your decrypt key" ${your_openssl_path} ``` ### 查看日志 diff --git a/docs/lite/docs/source_zh_cn/mindir/benchmark_tool.md b/docs/lite/docs/source_zh_cn/mindir/benchmark_tool.md index 5b993712b140f594aa0cf3b71420c0dc3f1fad1f..c8d66087cd562d54f7024284be800fa02c452f3d 100644 --- a/docs/lite/docs/source_zh_cn/mindir/benchmark_tool.md +++ b/docs/lite/docs/source_zh_cn/mindir/benchmark_tool.md @@ -115,5 +115,5 @@ Mean bias of all nodes: 0% 如果输入的模型是加密模型,需要同时配置`decryptKey`和`cryptoLibPath`对模型解密后进行推理,使用如下命令: ```bash -./benchmark --modelFile=/path/to/encry_model.mindir --decryptKey=30313233343536373839414243444546 --cryptoLibPath=/root/anaconda3/bin/openssl +./benchmark --modelFile=/path/to/encry_model.mindir --decryptKey="your decrypt key" --cryptoLibPath=/root/anaconda3/bin/openssl ``` \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/env_variables.md b/docs/mindformers/docs/source_en/env_variables.md index 1d2da642cb39ed750c4f8516d092e6cd4bbb346a..5ab90136b29214fa667fe06961b92ef173388e7a 100644 --- a/docs/mindformers/docs/source_en/env_variables.md +++ b/docs/mindformers/docs/source_en/env_variables.md @@ -40,4 +40,4 @@ The following environment variables are supported by MindSpore Transformers. | **MS_ENABLE_FA_FLATTEN** | on | Controls whether support FlashAttention flatten optimization. | `on`: Enable FlashAttention flatten optimization;
`off`: Disable FlashAttention flatten optimization. | Provide a fallback mechanism for models that have not yet been adapted to FlashAttention flatten optimization. | | **EXPERIMENTAL_KERNEL_LAUNCH_GROUP** | NA | Control whether to support the batch parallel submission of operators. If supported, enable the parallel submission and configure the number of parallel submissions. | `thread_num`: The number of concurrent threads is not recommended to be increased. The default value is 2;
`kernel_group_num`: Total number of operator groups, 'kernel_group_num/thread_num' groups per thread, default is' 8 '. | This feature will continue to evolve in the future, and the subsequent behavior may change. Currently, only the `deepseek` reasoning scenario is supported, with certain performance optimization, but other models using this feature may deteriorate, and users need to use it with caution, as follows:`export EXPERIMENTAL_KERNEL_LAUNCH_GROUP="thread_num:2,kernel_group_num:8"`. | | **ENFORCE_EAGER** | False | Control whether to disable jit mode. | `False`: Enable jit mode;
`True`: Do not enable jit mode. | Jit compiles functions into a callable MindSpore graph, sets ENFORCE_EAGER to False to enable jit mode, which can generate performance benefits. Currently, only inference mode is supported. | -| **MS_ENABLE_TFT** | NA | Enable [MindIO TFT](https://www.hiascend.com/document/detail/zh/mindx-dl/600/clusterscheduling/ref/mindiottp/mindiotft001.html) feature. Turn on TTP, UCE, HCCE, ARF, TRE or TSP feature. | The value of the environment variable can be:"{TTP:1,UCE:1,HCCE:1,ARF:1,TRE:1,TSP:1}", when using a certain feature, the corresponding field can be configured as "1". | Usage can refer to [High Availability](https://www.mindspore.cn/mindformers/docs/en/master/feature/high_availability.html). | +| **MS_ENABLE_TFT** | NA | Enable the Training Fault Tolerance function. Most functions rely on [MindIO TFT](https://www.hiascend.com/document/detail/zh/mindx-dl/600/clusterscheduling/ref/mindiottp/mindiotft001.html). | The value of the environment variable can be: "{TTP:1,UCE:1,HCCE:1,ARF:1,TRE:1,TSP:1,RSC:1}". When a certain feature is used, the corresponding field can be configured as "1". | Usage can refer to [High Availability](https://www.mindspore.cn/mindformers/docs/en/master/feature/high_availability.html). | diff --git a/docs/mindformers/docs/source_en/feature/high_availability.md b/docs/mindformers/docs/source_en/feature/high_availability.md index 493089b3d5b569966093b0568acfd3906024b18b..56bdeef698e313ea22f3149121dcf425607b9e28 100644 --- a/docs/mindformers/docs/source_en/feature/high_availability.md +++ b/docs/mindformers/docs/source_en/feature/high_availability.md @@ -4,7 +4,7 @@ ## Overview -MindSpore Transformers high availability provides the following six functions: +MindSpore Transformers high availability provides the following functions: - **End-of-life CKPT**: It is mainly aimed at accelerating the fault recovery in the training process of large models. This feature verifies the integrity and consistency of the intermediate state data after a fault occurs during the training process and generates an end-of-life CheckPoint data, which can be used to recover the training and reduce the loss of training iterations caused by the fault. - **UCE Fault-tolerant Recovery**: It mainly focuses on the detection of UCE faults in on-chip memory during the training process of large models, and accomplishes online repair to reach Step-level recomputation. - **TRE Training Result Exception Recovery**: It mainly focuses on the detection of value exceptions of loss, global-norm, etc. during the training process of large models, and accomplishes online repair to reach Step-level recomputation. - **ARF Process-Level Rescheduling Recovery**: Instead of pulling up the entire cluster again after an anomaly in training occurs, simply restart or replace it on a node-by-node basis to complete the repair and continue training. - **TSP Training Step Pause Function**: After each training step is completed, enter the train pause interface, pause or resume training according to the needs of upper level operations. For example, pause training to perform communication network track switching, and resume training after successful switching. +- **RSC POD-Level Rescheduling Function**: Primarily serves as a fallback solution when other fast recovery features fail.
It kills the faulty process and other normal processes (the pods where the normal processes reside will not be terminated), removes the faulty pod from the current cluster, reschedules a new pod to join the cluster, and resumes training (the current version must rely on MindX). Constraints and dependencies of the high availability functions: -| | End-of-life CKPT | UCE | HCCE | ARF | TRE | TSP | -| - | - | - | - | - | - | - | -| Depending on MindIO | Yes | Yes | Yes | Yes | No | Yes | -| Replica relationship between between cards | Yes | Yes | No | Yes | No | No | -| Sink Size is 1 | Yes | Yes | Yes | Yes | No | No | +| | End-of-life CKPT | UCE | HCCE | ARF | TRE | TSP | RSC | +| - | - | - | - | - | - | - | - | +| Depending on MindIO | Yes | Yes | Yes | Yes | No | Yes | No | +| Replica relationship between cards | Yes | Yes | No | Yes | No | No | No | +| Sink Size is 1 | Yes | Yes | Yes | Yes | No | No | No | -These six high availability functions are currently only supported in the MindSpore Ascend back-end graph schema to support Step-level recovery. +These high availability functions currently support Step-level recovery only in graph mode on the MindSpore Ascend backend. The replica relationship between cards is used to make sure when one of the cards fails, it can be recovered from the other card. It requires that there must be at least two copies of redundancy in both the weights and the optimizer. To ensure this redundancy relationship, data parallelism must be turned on to ensure that there are two cards with the same weights, and also if optimizer parallelism is turned on, it must be ensured that there are two cards with the same optimizer state. @@ -52,13 +53,15 @@ export MS_TFT_PORT=30051 ``` - `MINDIO_FOR_MINDSPORE`: Enabling MindIO TFT SDK to support MindSpore -- `MS_ENABLE_TFT`: Indicates that the TTP, UCE, ARF, TRE and TSP functions are enabled. If you want to enable only one of these functions, set the corresponding value to 1. +- `MS_ENABLE_TFT`: Indicates that the Training Fault Tolerance functions are enabled. If you want to enable only one of these functions, set the corresponding value to 1. - **TTP (Try To Persist)**: End-of-life CKPT function - **UCE (Uncorrectable Memory Error)**: UCE fault tolerance recovery - **HCCE (Huawei Collective Communication Error)**: HCCL recompute error recovery - **ARF (Air Refuelling)**: Process-level rescheduling recovery function - **TRE (Training Result Error)**: Training result exception recovery - **TSP (Training Step Pause)**:Training step pause function + - **RSC (Register Stop/Start Controller)**: POD-level rescheduling function + - POD-level rescheduling only hands over the training processes to a third-party component (such as MindX) for management. When only RSC:1 is enabled (the current version must rely on MindX), the other training fault tolerance features do not take effect. - When UCE or ARF is enabled, TTP is enabled by default. - Enabling both TRE and asynchronous CKPT features at the same time cannot guarantee that the loss before and after resuming training is exactly the same. - TRE does not depend on MindIO.
It is not necessary to configure the MindIO-related environment variables MINDIO_FOR_MINDSPORE, MS_TFT_IP, and MS_TFT_PORT to enable only the TRE feature diff --git a/docs/mindformers/docs/source_en/feature/resume_training.md b/docs/mindformers/docs/source_en/feature/resume_training.md index 1dbd2632a8b33f334dd75ee39811c05e2aa6843d..984efd3310a657ed71339e2ed039b5377e6bfd30 100644 --- a/docs/mindformers/docs/source_en/feature/resume_training.md +++ b/docs/mindformers/docs/source_en/feature/resume_training.md @@ -12,20 +12,20 @@ MindSpore Transformers supports **step-level resumable training**, which allows You can modify the configuration file to control resumable training. The main parameters are as follows. For details about other parameters, see the description of CheckpointMonitor. -| Parameter | Description | -|------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| load_checkpoint | Weight path loaded during resumable training. The path can be a folder path (used to load distributed weights) or a specific weight file path. The default value is an empty string, indicating that no weight is loaded (required for resumable training). | -| resume_training | Specifies whether to enable resumable training. You can set it to `True` or specify a weight file name. If the value is `True`, the system automatically resumes the training from the last interruption. The default value is `False`. | -| load_ckpt_async | Determines whether to load model weights and compile in parallel (this configuration does not take effect when auto_trans_ckpt is set to true). The default value is False (serial execution).
When it is `True`, the parallel capability of loading ckpt weights and building model is enabled to reduce the overall time resume training. | +| Parameter | Description | +|------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| load_checkpoint | Weight path loaded during resumable training. The path can be a folder path (used to load distributed weights) or a specific weight file path. The default value is an empty string, indicating that no weight is loaded (required for resumable training). When the configured path is an empty directory, the system will fall back to pretraining with randomly initialized weights. | +| resume_training | Specifies whether to enable resumable training. You can set it to `True` or specify a weight file name. If the value is `True`, the system automatically resumes the training from the last interruption. The default value is `False`. | +| load_ckpt_async | Determines whether to load model weights and compile in parallel (this configuration does not take effect when auto_trans_ckpt is set to true). The default value is False (serial execution).
When it is `True`, the parallel capability of loading ckpt weights and building model is enabled to reduce the overall time resume training. | Based on the input parameters, there are four cases. -| load_checkpoint | resume_training | Description | Recommended or Not | -|---------------------|-------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------| -| Weight file path | True | Resumes a training based on the weights specified by load_checkpoint. | √ | -| Weight file path | Weight file name | The file name specified by resume_training is invalid. A training is resumed based on the weights specified by load_checkpoint. | × | -| Weight folder path | True | **Scenario 1: Single-node system, multi-node system+shared directory, or ModelArts**
1. Resumes the training based on the weights recorded in meta.json files and supports fault recovery.
2. Resumes the training based on the latest weight of all ranks if the meta.json file of any rank is missing.
**Scenario 2: Multi-node+non-shared directory**
Resumes the training based on the latest weight of all ranks. | √ | -| Weight folder path | Weight file name | Resumes the training based on the weights specified by resume_training. | √ | +| load_checkpoint | resume_training | Description | Recommended or Not | +|---------------------|-------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------| +| Weight file path | True | Resumes a training based on the weights specified by load_checkpoint. | √ | +| Weight file path | Weight file name | The file name specified by resume_training is invalid. A training is resumed based on the weights specified by load_checkpoint. | × | +| Weight folder path | True | **Scenario 1: Single-node system, multi-node system+shared directory, or ModelArts**
1. Resumes the training based on the weights recorded in meta.json files and supports fault recovery.
2. Resumes the training based on the latest weight of all ranks if the meta.json file of any rank is missing.
**Scenario 2: Multi-node+non-shared directory**
Resumes the training based on the latest weight of all ranks.
**Scenario 3: Automatic training recovery**
To facilitate using the automatic training recovery feature, configure `load_checkpoint` as the save path for weight checkpoints, eliminating the need to manually modify this setting when resuming training. If the directory is empty during initial training, the weights will be initialized randomly as usual; when resuming, training will recover from the checkpoints saved in this directory. | √ | +| Weight folder path | Weight file name | Resumes the training based on the weights specified by resume_training. | √ | In addition, you can modify the following parameters in the configuration file to use related functions. @@ -49,6 +49,15 @@ For related configuration files, see [research/llama3_1/llama3_1_8b/finetune_lla 1. Modify `research/llama3_1/llama3_1_8b/finetune_llama3_1_8b.yaml`. + For initial training with randomly initialized weights followed by resume training without changing the configuration file, set `resume_training` to `True` and `load_checkpoint` to the directory where checkpoints will be saved: + + ```yaml + load_checkpoint: './output/checkpoint' + resume_training: True + ``` + + > Use an empty directory for `load_checkpoint` only if it is intended for saving checkpoints; otherwise, the next run will start from scratch instead of resuming. + Configure the parallelism as required. ```yaml @@ -95,7 +104,7 @@ For related configuration files, see [research/llama3_1/llama3_1_8b/finetune_lla ### Resumable Training -1. Modify the configuration and specify the resumable training weight file. +1. If `resume_training` is set to `False` in the pre-training configuration, update the configuration to specify the resumable training weight file. ```yaml load_checkpoint: './output/checkpoint' diff --git a/docs/mindformers/docs/source_en/installation.md b/docs/mindformers/docs/source_en/installation.md index 88280676200996515c3a467a31a01afb9494209b..498acf0f51fd97e05b1e2ea62d263c9cc4008559 100644 --- a/docs/mindformers/docs/source_en/installation.md +++ b/docs/mindformers/docs/source_en/installation.md @@ -4,22 +4,25 @@ ## Confirming Version Matching Relationship -The currently supported hardware is the [Atlas 800T A2](https://www.hiascend.com/hardware/ai-server?tag=900A2) training server. +The currently supported hardware is the Atlas 800T A2, Atlas 800I A2, and Atlas 900 A3 SuperPoD. The current recommended Python version for the suite is 3.11.4.
-| MindSpore Transformers | MindSpore | CANN | Firmware & Drivers | Mirror Links | -|:----------------------:|:----------------------:|:----------------------:|:----------------------:|:--------------:| -| In-Development Version | In-Development Version | In-Development Version | In-Development Version | Not applicable | +| MindSpore Transformers | MindSpore | CANN | Firmware & Drivers | +|:----------------------:|:----------------------:|:----------------------:|:----------------------:| +| In-Development Version | In-Development Version | In-Development Version | In-Development Version | **Currently MindSpore Transformers recommends using a software package relationship as above.** Historical version matching relationship: -| MindSpore Transformers | MindSpore | CANN | Firmware & Drivers | -|:----------------------------------------------------:|:-------------------------:|:----------------------------------------------------------------------------------------------------------------------------------------------------:|:-----------------------------------------------------------------------------------------------------------------------------------------------------:| -| [1.3.2](https://pypi.org/project/mindformers/1.3.2/) | [2.4.10](https://www.mindspore.cn/install/) | [8.0.0](https://www.hiascend.com/document/detail/zh/canncommercial/800/softwareinst/instg/instg_0000.html?Mode=PmIns&OS=Ubuntu&Software=cannToolKit) | [24.1.0](https://www.hiascend.com/document/detail/zh/canncommercial/800/softwareinst/instg/instg_0000.html?Mode=PmIns&OS=Ubuntu&Software=cannToolKit) | -| [1.2.0](https://pypi.org/project/mindformers/1.2.0/) | [2.3.0](https://www.mindspore.cn/install/) | [8.0.RC2.beta1](https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.0.RC2.beta1) | [24.1.RC2](https://www.hiascend.com/hardware/firmware-drivers/community) | +| MindSpore Transformers | MindSpore | CANN | Firmware & Drivers | +|:----------------------:|:---------------------------------------------:|:--------------------------------------------------------------------------------------------------------------:|:---------------------------------------------------------------------------------------------------------------:| +| 1.6.0 | [2.7.0](https://www.mindspore.cn/install) | [8.2.RC1](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/softwareinst/instg/instg_0000.html) | [25.2.0](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/softwareinst/instg/instg_0000.html) | +| 1.5.0 | [2.6.0-rc1](https://www.mindspore.cn/install) | [8.1.RC1](https://www.hiascend.com/document/detail/zh/canncommercial/81RC1/softwareinst/instg/instg_0000.html) | [25.0.RC1](https://www.hiascend.com/document/detail/zh/canncommercial/81RC1/softwareinst/instg/instg_0000.html) | +| 1.3.2 | [2.4.10](https://www.mindspore.cn/versions) | [8.0.0](https://www.hiascend.com/document/detail/zh/canncommercial/800/softwareinst/instg/instg_0000.html) | [24.1.0](https://www.hiascend.com/document/detail/zh/canncommercial/800/softwareinst/instg/instg_0000.html) | +| 1.3.0 | [2.4.0](https://www.mindspore.cn/versions) | [8.0.RC3](https://www.hiascend.com/document/detail/zh/canncommercial/80RC3/softwareinst/instg/instg_0000.html) | [24.1.RC3](https://www.hiascend.com/document/detail/zh/canncommercial/80RC3/softwareinst/instg/instg_0000.html) | +| 1.2.0 | [2.3.0](https://www.mindspore.cn/versions) | [8.0.RC2](https://www.hiascend.com/document/detail/zh/canncommercial/80RC2/softwareinst/instg/instg_0000.html) | 
[24.1.RC2](https://www.hiascend.com/document/detail/zh/canncommercial/80RC2/softwareinst/instg/instg_0000.html) | ## Installing Dependent Software diff --git a/docs/mindformers/docs/source_zh_cn/env_variables.md b/docs/mindformers/docs/source_zh_cn/env_variables.md index a4df9519477f082a83d6fd6b241e763102c15e78..a29d54306e294436c9532d3c39674d8c9ef8305b 100644 --- a/docs/mindformers/docs/source_zh_cn/env_variables.md +++ b/docs/mindformers/docs/source_zh_cn/env_variables.md @@ -40,4 +40,4 @@ | **MS_ENABLE_FA_FLATTEN** | on | 控制 是否支持 FlashAttention flatten 优化。 | `on`:启用 FlashAttention flatten 优化;
`off`: 禁用 FlashAttention flatten 优化。 | 对于还未适配FlashAttention flatten 优化的模型提供回退机制。 | | **EXPERIMENTAL_KERNEL_LAUNCH_GROUP** | NA | 控制是否支持算子批量并行下发,支持开启并行下发,并配置并行数 | `thread_num`: 并发线程数,一般不建议增加,默认值为`2`;
`kernel_group_num`: 算子分组总数量,每线程`kernel_group_num/thread_num`个组,默认值为`8`。 | 该特性后续还会继续演进,后续行为可能会有变更,当前仅支持`deepseek`推理场景,有一定的性能优化,但是其他模型使用该特性可能会有劣化,用户需要谨慎使用,使用方法如下:`export EXPERIMENTAL_KERNEL_LAUNCH_GROUP="thread_num:2,kernel_group_num:8"`。 | | **ENFORCE_EAGER** | False | 控制是否**不开启**jit模式。 | `False`: 开启jit模式;
`True`: 不开启jit模式。 | Jit将函数编译成一张可调用的MindSpore图,设置ENFORCE_EAGER为False开启jit模式,可以获取性能收益,当前仅支持推理模式。 | -| **MS_ENABLE_TFT** | NA | 使能 [MindIO TFT](https://www.hiascend.com/document/detail/zh/mindx-dl/600/clusterscheduling/ref/mindiottp/mindiotft001.html) 特性,表示启用 TTP、UCE、HCCE、ARF、TRE 或 TSP 功能。 | 取值为"{TTP:1,UCE:1,HCCE:1,ARF:1,TRE:1,TSP:1}",使用某一功能时,可将对应字段配置为"1"。 | 使用方式可以参考[高可用特性](https://www.mindspore.cn/mindformers/docs/zh-CN/master/feature/high_availability.html)。 | +| **MS_ENABLE_TFT** | NA | 使能训练故障容错(Training Fault Tolerance)功能,大多数功能依赖 [MindIO TFT](https://www.hiascend.com/document/detail/zh/mindx-dl/600/clusterscheduling/ref/mindiottp/mindiotft001.html)。 | 取值为"{TTP:1,UCE:1,HCCE:1,ARF:1,TRE:1,TSP:1,RSC:1}",使用某一功能时,可将对应字段配置为"1"。 | 使用方式可以参考[高可用特性](https://www.mindspore.cn/mindformers/docs/zh-CN/master/feature/high_availability.html)。 | diff --git a/docs/mindformers/docs/source_zh_cn/feature/high_availability.md b/docs/mindformers/docs/source_zh_cn/feature/high_availability.md index e4187d1af41c660503fc65ab67ff0b0f40b14668..77a831708323599fd553eba4828202c1e18c8b91 100644 --- a/docs/mindformers/docs/source_zh_cn/feature/high_availability.md +++ b/docs/mindformers/docs/source_zh_cn/feature/high_availability.md @@ -4,7 +4,7 @@ ## 概述 -MindSpore Transformers 高可用特性提供了如下六个功能: +MindSpore Transformers 高可用特性提供了如下几个功能: - **临终 CKPT 功能**:主要针对大模型训练过程中的故障恢复加速,该特性在训练过程中发生故障后,校验中间状态数据的完整性和一致性,生成一次临终 CheckPoint 数据,恢复训练时能够通过该 CheckPoint 数据恢复,减少故障造成的训练迭代损失。 - **UCE 故障容错恢复功能**:主要是针对大模型训练过程中片上内存的 UCE 故障检测,并完成在线修复,达到 Step 级重计算。 @@ -12,16 +12,17 @@ MindSpore Transformers 高可用特性提供了如下六个功能: - **TRE 训练结果异常恢复功能**:主要是针对大模型训练过程中出现loss或global norm等值异常检测,并完成在线修复,达到 Step 级重计算。 - **ARF 进程级重调度恢复功能**:训练发生异常后,不需要重新拉起整个集群,只需以节点为单位进行重启或替换,完成修复并继续训练。 - **TSP 训练迭代暂停功能**:在每个训练step结束后,进入训练暂停接口,根据上层运维需要进行训练暂停和继续,例如,暂停训练执行通信网络轨道切换,切换成功后继续训练。 +- **RSC POD级重调度功能**:主要是其他快恢特性执行失败之后的兜底方案,kill故障进程以及其他正常进程(正常进程所在pod不会被kill),将故障pod从当前集群中隔离,同时调度新的pod加入集群,并恢复训练(当前版本必须依赖MindX)。 这几个高可用特性的**约束**和**依赖**如下: -| | 临终 CKPT | UCE | HCCE | ARF | TRE | TSP | -| - | - | - | - | - | - | - | -| 依赖MindIO组件 | Yes | Yes | Yes | Yes | No | Yes | -| 卡间存在副本关系 | Yes | Yes | No | Yes | No | No | -| Sink Size 为 1 | Yes | Yes | Yes | Yes | No | No | +| | 临终 CKPT | UCE | HCCE | ARF | TRE | TSP | RSC | +| - | - | - | - | - | - | - | - | +| 依赖MindIO组件 | Yes | Yes | Yes | Yes | No | Yes | No | +| 卡间存在副本关系 | Yes | Yes | No | Yes | No | No | No | +| Sink Size 为 1 | Yes | Yes | Yes | Yes | No | No | No | -目前这六个高可用特性只支持Ascend后端上图模式的Step级别恢复。 +目前这几个高可用特性只支持Ascend后端上图模式的Step级别恢复。 卡间存在副本关系的目的是当其中一张卡发生故障时,可从另外一张卡恢复,要求权重和优化器状态都会存在至少两份冗余。为保证这种冗余关系,必须开启数据并行,保证有两张卡权重一致,同时如果开启了优化器并行,也必须确保存在两张卡的优化器状态一致。 @@ -52,13 +53,15 @@ export MS_TFT_PORT=30051 ``` - `MINDIO_FOR_MINDSPORE`:使能 MindIO TFT SDK 支持 MindSpore -- `MS_ENABLE_TFT`:表示启用 TTP、UCE、ARF、TRE、TSP功能,如果只想启用其中的某一个功能,则将对应的值设置为 1 即可。 +- `MS_ENABLE_TFT`:表示启用训练故障容错(Training Fault Tolerance)功能,如果只想启用其中的某一个功能,则将对应的值设置为 1 即可。 - **TTP (Try To Persist)**:临终 CKPT 功能 - **UCE (Uncorrectable Memory Error)**:UCE 故障容错恢复功能 - **HCCE (Huawei Collective Communication Error)**:HCCL 重计算失败恢复功能 - **ARF (Air Refuelling)**:进程级重调度恢复功能 - **TRE (Training Result Error)**:TRE 训练结果异常恢复功能 - **TSP (Training Step Pause)**:TSP 训练迭代暂停功能 + - **RSC (Register Stop/Start Controller)**:POD级重调度功能 + - POD级重调度只把训练进程交给第三方组件(如 MindX)管控,仅开启RSC(当前版本必须依赖MindX)时,其他训练故障容错功能不生效 - 开启 UCE 或者 ARF 功能时,默认开启 TTP 功能 - 同时开启 TRE 和异步 CKPT 特性,无法保证续训前后的 loss 完全一致 - TRE 功能不依赖 MindIO 组件,若只使能TRE特性,无需配置 MindIO 相关的环境变量 MINDIO_FOR_MINDSPORE、MS_TFT_IP 和 MS_TFT_PORT diff --git 
a/docs/mindformers/docs/source_zh_cn/feature/resume_training.md b/docs/mindformers/docs/source_zh_cn/feature/resume_training.md index 32445bab4f26c4fa342275fca3e1d6917ea6ffce..33655c5b17a6077e72a2445085e043cd2aa2a245 100644 --- a/docs/mindformers/docs/source_zh_cn/feature/resume_training.md +++ b/docs/mindformers/docs/source_zh_cn/feature/resume_training.md @@ -12,20 +12,20 @@ MindSpore Transformers支持**step级断点续训**功能,允许在训练中 用户可通过修改配置文件来控制断点续训的行为。以下是主要参数,其他参数可参考CheckpointMonitor介绍: -| 参数 | 描述 | -| --------------- |--------------------------------------------------------------------------------------------------------------| -| load_checkpoint | 断点续训时加载的权重路径。路径可以是文件夹路径(用于加载分布式权重),也可以是具体权重文件的路径。默认为空字符串,即不加载权重(断点续训时必填) | -| resume_training | 断点续训开关,可设置为`True`或指定特定的权重文件名。为`True`时,系统会自动从上次中断处恢复训练。默认为`False` | -| load_ckpt_async | 是否将加载权重与模型编译的操作并行执行,不支持在线自动切分权重场景(auto_trans_ckpt=True),该场景下不生效。默认为False串行执行。
为`True`时,并行执行,减少总体拉起续训的耗时 | +| 参数 | 描述 | +| --------------- |---------------------------------------------------------------------------------------------------------------------------------------------------------------| +| load_checkpoint | 断点续训时加载的权重路径。路径可以是文件夹路径(用于加载分布式权重),也可以是具体权重文件的路径。默认为空字符串,即不加载权重(断点续训时必填)。当配置的路径为空目录时,会退化为使用随机初始化权重进行预训练。| +| resume_training | 断点续训开关,可设置为`True`或指定特定的权重文件名。为`True`时,系统会自动从上次中断处恢复训练。默认为`False`。 | +| load_ckpt_async | 是否将加载权重与模型编译的操作并行执行,不支持在线自动切分权重场景(auto_trans_ckpt=True),该场景下不生效。默认为False串行执行。
为`True`时,并行执行,减少总体拉起续训的耗时。 | 根据传入参数不同,可分为如下四种情况: -| load_checkpoint | resume_training | 功能描述 | 是否为推荐使用方式 | -|-----------------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------| -| 权重文件路径 | True | 基于load_checkpoint指代的权重续训 | √ | -| 权重文件路径 | 权重文件名 | resume_training指代的文件名无效,基于load_checkpoint指代的权重续训 | × | -| 权重文件夹路径 | True | **场景1:"单机"或"多机+共享目录"或"ModelArts"**
① 基于meta.json记录的权重续训,支持故障恢复。
② 若任一rank文件夹下缺少meta.json,所有rank基于最后时间戳的权重续训。
**场景2:"多机+非共享目录"**
所有rank基于最后时间戳的权重续训。 | √ | -| 权重文件夹路径 | 权重文件名 | 基于resume_training指代的权重续训 | √ | +| load_checkpoint | resume_training | 功能描述 | 是否为推荐使用方式 | +|-----------------|-----------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------| +| 权重文件路径 | True | 基于load_checkpoint指代的权重续训 | √ | +| 权重文件路径 | 权重文件名 | resume_training指代的文件名无效,基于load_checkpoint指代的权重续训 | × | +| 权重文件夹路径 | True | **场景1:"单机"或"多机+共享目录"或"ModelArts"**
① 基于meta.json记录的权重续训,支持故障恢复。
② 若任一rank文件夹下缺少meta.json,所有rank基于最后时间戳的权重续训。
**场景2:"多机+非共享目录"**
所有rank基于最后时间戳的权重续训。
**场景3:"自动恢复训练"**
为方便自动恢复训练功能的使用,可以将load_checkpoint配置为权重checkpoint的保存路径,这样在续训时不需要对配置项load_checkpoint做手动修改。首次开始训练时,该目录为空,会正常随机初始化权重;续训时,会从该目录下保存的checkpoint恢复训练。 | √ | +| 权重文件夹路径 | 权重文件名 | 基于resume_training指代的权重续训 | √ | 此外,用户还可通过增改配置文件的如下参数来使用相关功能。 @@ -48,6 +48,15 @@ MindSpore Transformers支持**step级断点续训**功能,允许在训练中 1. 修改`research/llama3_1/llama3_1_8b/finetune_llama3_1_8b.yaml`: + 如果想首次运行随机初始化训练,并且后续断点续训不改配置文件,可在此时将`resume_training`设置为`True`,并将`load_checkpoint`设为即将保存权重的目录: + + ```yaml + load_checkpoint: './output/checkpoint' + resume_training: True + ``` + + > 一旦目录为空目录,模型权重即会自动随机初始化。因此如果误设了一个非即将保存权重的空目录,会导致第二次拉起任务时训练从头开始。 + 根据需要设置并行配置: ```yaml @@ -94,7 +103,7 @@ MindSpore Transformers支持**step级断点续训**功能,允许在训练中 ### 断点续训 -1. 修改配置,指定断点续训权重文件: +1. 如果在前置训练的配置中,`resume_training`为`False`,此时需修改配置,指定断点续训权重文件: ```yaml load_checkpoint: './output/checkpoint' diff --git a/docs/mindformers/docs/source_zh_cn/installation.md b/docs/mindformers/docs/source_zh_cn/installation.md index 224c3395cebdcb12f4ae0bad514ff0473902609f..55da1a2db420ac55c5ba5186f066760b0429ba2a 100644 --- a/docs/mindformers/docs/source_zh_cn/installation.md +++ b/docs/mindformers/docs/source_zh_cn/installation.md @@ -4,7 +4,7 @@ ## 确认版本匹配关系 -当前支持的硬件为[Atlas 800T A2](https://www.hiascend.com/hardware/ai-server?tag=900A2)训练服务器。 +当前支持的硬件为Atlas 800T A2、Atlas 800I A2、Atlas 900 A3 SuperPoD。 当前套件建议使用的Python版本为3.11.4。 @@ -16,10 +16,13 @@ 历史版本配套关系: -| MindSpore Transformers | MindSpore | CANN | 固件与驱动 | -|:----------------------------------------------------:|:-------------------------------------------:|:----------------------------------------------------------------------------------------------------------------------------------------------------:|:-----------------------------------------------------------------------------------------------------------------------------------------------------:| -| [1.3.2](https://pypi.org/project/mindformers/1.3.2/) | [2.4.10](https://www.mindspore.cn/install/) | [8.0.0](https://www.hiascend.com/document/detail/zh/canncommercial/800/softwareinst/instg/instg_0000.html?Mode=PmIns&OS=Ubuntu&Software=cannToolKit) | [24.1.0](https://www.hiascend.com/document/detail/zh/canncommercial/800/softwareinst/instg/instg_0000.html?Mode=PmIns&OS=Ubuntu&Software=cannToolKit) | -| [1.2.0](https://pypi.org/project/mindformers/1.2.0/) | [2.3.0](https://www.mindspore.cn/install/) | [8.0.RC2.beta1](https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.0.RC2.beta1) | [24.1.RC2](https://www.hiascend.com/hardware/firmware-drivers/community) | +| MindSpore Transformers | MindSpore | CANN | 固件与驱动 | +|:----------------------:|:---------------------------------------------:|:--------------------------------------------------------------------------------------------------------------:|:---------------------------------------------------------------------------------------------------------------:| +| 1.6.0 | [2.7.0](https://www.mindspore.cn/install) | [8.2.RC1](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/softwareinst/instg/instg_0000.html) | [25.2.0](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/softwareinst/instg/instg_0000.html) | +| 1.5.0 | [2.6.0-rc1](https://www.mindspore.cn/install) | [8.1.RC1](https://www.hiascend.com/document/detail/zh/canncommercial/81RC1/softwareinst/instg/instg_0000.html) | [25.0.RC1](https://www.hiascend.com/document/detail/zh/canncommercial/81RC1/softwareinst/instg/instg_0000.html) | +| 1.3.2 | [2.4.10](https://www.mindspore.cn/versions) | 
[8.0.0](https://www.hiascend.com/document/detail/zh/canncommercial/800/softwareinst/instg/instg_0000.html) | [24.1.0](https://www.hiascend.com/document/detail/zh/canncommercial/800/softwareinst/instg/instg_0000.html) | +| 1.3.0 | [2.4.0](https://www.mindspore.cn/versions) | [8.0.RC3](https://www.hiascend.com/document/detail/zh/canncommercial/80RC3/softwareinst/instg/instg_0000.html) | [24.1.RC3](https://www.hiascend.com/document/detail/zh/canncommercial/80RC3/softwareinst/instg/instg_0000.html) | +| 1.2.0 | [2.3.0](https://www.mindspore.cn/versions) | [8.0.RC2](https://www.hiascend.com/document/detail/zh/canncommercial/80RC2/softwareinst/instg/instg_0000.html) | [24.1.RC2](https://www.hiascend.com/document/detail/zh/canncommercial/80RC2/softwareinst/instg/instg_0000.html) | ## 安装依赖软件 diff --git a/docs/mindspore/_ext/overwriteautosummary_generate.txt b/docs/mindspore/_ext/overwriteautosummary_generate.txt index 0221725120470fb4ce3ad9a4f5ccaaae73b85d38..b034baf0fdb440e0d945a7ced40df96ef0eccfe5 100644 --- a/docs/mindspore/_ext/overwriteautosummary_generate.txt +++ b/docs/mindspore/_ext/overwriteautosummary_generate.txt @@ -471,92 +471,91 @@ def generate_autosummary_docs(sources: List[str], output_dir: str = None, imported_members, app, entry.recursive, context, modname, qualname) - # 获取对象所在文件的部分路径 - try: - py_source_rel = get_full_modname(modname, qualname).replace('.', '/') + '.py' - except: - py_source_rel = '' - - # 自动生成的ops模块单独处理 - if 'mindspore/ops/auto_generate/' in py_source_rel: - name1 = name - spec_tp = [('mint.nn.functional.dense', 'mint.nn.functional.linear'), - ('mint.select_ext_view', 'mint.select'), - ('mint.transpose_ext_view', 'mint.transpose'), - ] - for i in spec_tp: - if name.endswith(i[1]): - name1 = name.replace(i[1], i[0]) - # 根据接口名内大写字母个数分类处理primitive,得到yaml文件名 - if name1 not in app.config.primi_auto: - if len(re.findall('[A-Z]', name1)) == 1: - name1 = name1.lower() - elif len(re.findall('[A-Z]', name1)) > 1: - name1 = 'mindspore.ops.' + '_'.join(re.split('(?=[A-Z])', name1)[1:]).lower() - if name1.split('.')[-1] + '_doc.yaml' not in app.config.ops_yaml_list: - if name.split('.')[-1].lower() + '_doc.yaml' in app.config.ops_yaml_list: - name1 = name.lower() - # 根据yaml文件名查询文件是否存在,分别再处理 - if name1.split('.')[-1] + '_doc.yaml' not in app.config.ops_yaml_list: - # 新增查找_ext后缀文件 - if name1.split('.')[-1] + '_ext_doc.yaml' in app.config.ops_yaml_list: - py_source_rel = app.config.ops_yaml + name1.split('.')[-1] + '_ext_doc.yaml' - else: - for f_yaml in app.config.ops_yaml_list: - # 对文件名中存在v[0-9]的特殊处理 - if re.findall(f"{name1.split('.')[-1]}_v[0-9]+_doc.yaml", f_yaml): - py_source_rel = app.config.ops_yaml + re.findall(f"{name1.split('.')[-1]}_v[0-9]+_doc.yaml", f_yaml)[0] - break + if not app.config.branch.startswith('v'): + # 获取对象所在文件的部分路径 + try: + py_source_rel = get_full_modname(modname, qualname).replace('.', '/') + '.py' + except: + py_source_rel = '' + + # 自动生成的ops模块单独处理 + if 'mindspore/ops/auto_generate/' in py_source_rel: + name1 = name + spec_tp = [('mint.nn.functional.dense', 'mint.nn.functional.linear'), + ('mint.select_ext_view', 'mint.select'), + ('mint.transpose_ext_view', 'mint.transpose'), + ] + for i in spec_tp: + if name.endswith(i[1]): + name1 = name.replace(i[1], i[0]) + # 根据接口名内大写字母个数分类处理primitive,得到yaml文件名 + if name1 not in app.config.primi_auto: + if len(re.findall('[A-Z]', name1)) == 1: + name1 = name1.lower() + elif len(re.findall('[A-Z]', name1)) > 1: + name1 = 'mindspore.ops.' 
+ '_'.join(re.split('(?=[A-Z])', name1)[1:]).lower() + if name1.split('.')[-1] + '_doc.yaml' not in app.config.ops_yaml_list: + if name.split('.')[-1].lower() + '_doc.yaml' in app.config.ops_yaml_list: + name1 = name.lower() + # 根据yaml文件名查询文件是否存在,分别再处理 + if name1.split('.')[-1] + '_doc.yaml' not in app.config.ops_yaml_list: + # 新增查找_ext后缀文件 + if name1.split('.')[-1] + '_ext_doc.yaml' in app.config.ops_yaml_list: + py_source_rel = app.config.ops_yaml + name1.split('.')[-1] + '_ext_doc.yaml' else: - py_source_rel = '' - else: - py_source_rel = app.config.ops_yaml + name1.split('.')[-1] + '_doc.yaml' - - if name1.split('.')[-1] in app.config.func_name_dict and not py_source_rel: - py_source_rel = app.config.ops_yaml + app.config.func_name_dict[name1.split('.')[-1]] + '_doc.yaml' - - # 拼接源文件链接格式文档 - re_view = f"\n.. image:: https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/{app.config.docs_branch}/" + \ - f"resource/_static/logo_source_en.svg\n :target: " + app.config.giturl + \ - f"{app.config.copy_repo}/blob/{app.config.branch}/" + \ - py_source_rel + '\n :alt: View Source On Gitee\n\n' - # 写入源文件链接文档 - if re_view not in content and py_source_rel: - content = re.sub('([=]{5,})\n', r'\1\n' + re_view, content, 1) - - elif '_c_expression' in py_source_rel and '.Tensor.' in name: - py_source_rel = app.config.tensor_yaml + name.split('.')[-1] + '_doc.yaml' - re_view = f"\n.. image:: https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/{app.config.docs_branch}/" + \ - f"resource/_static/logo_source_en.svg\n :target: " + app.config.giturl + \ - f"{app.config.copy_repo}/blob/{app.config.branch}/" + \ - py_source_rel + '\n :alt: View Source On Gitee\n\n' - - if re_view not in content: - content = re.sub('([=]{5,})\n', r'\1\n' + re_view, content, 1) - - elif 'ops/functional_overload' in py_source_rel: - py_source_rel = app.config.func_yaml + name.split('.')[-1] + '_doc.yaml' - re_view = f"\n.. image:: https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/{app.config.docs_branch}/" + \ - f"resource/_static/logo_source_en.svg\n :target: " + app.config.giturl + \ - f"{app.config.copy_repo}/blob/{app.config.branch}/" + \ - py_source_rel + '\n :alt: View Source On Gitee\n\n' - - if re_view not in content: - content = re.sub('([=]{5,})\n', r'\1\n' + re_view, content, 1) - - elif py_source_rel: - if not os.path.exists(os.path.join(app.config.repo_path, app.config.repo_whl, py_source_rel)): - py_source_rel = py_source_rel.replace('.py', '/__init__.py') - - re_view = f"\n.. 
image:: https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/{app.config.docs_branch}/" + \ - f"resource/_static/logo_source_en.svg\n :target: " + app.config.giturl + \ - f"{app.config.copy_repo}/blob/{app.config.branch}/" + app.config.repo_whl + \ - py_source_rel + '\n :alt: View Source On Gitee\n\n' - - if re_view not in content: - content = re.sub('([=]{5,})\n', r'\1\n' + re_view, content, 1) - - + for f_yaml in app.config.ops_yaml_list: + # 对文件名中存在v[0-9]的特殊处理 + if re.findall(f"{name1.split('.')[-1]}_v[0-9]+_doc.yaml", f_yaml): + py_source_rel = app.config.ops_yaml + re.findall(f"{name1.split('.')[-1]}_v[0-9]+_doc.yaml", f_yaml)[0] + break + else: + py_source_rel = '' + else: + py_source_rel = app.config.ops_yaml + name1.split('.')[-1] + '_doc.yaml' + + if name1.split('.')[-1] in app.config.func_name_dict and not py_source_rel: + py_source_rel = app.config.ops_yaml + app.config.func_name_dict[name1.split('.')[-1]] + '_doc.yaml' + + # 拼接源文件链接格式文档 + re_view = f"\n.. image:: https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/{app.config.docs_branch}/" + \ + f"resource/_static/logo_source_en.svg\n :target: " + app.config.giturl + \ + f"{app.config.copy_repo}/blob/{app.config.branch}/" + \ + py_source_rel + '\n :alt: View Source On Gitee\n\n' + # 写入源文件链接文档 + if re_view not in content and py_source_rel: + content = re.sub('([=]{5,})\n', r'\1\n' + re_view, content, 1) + + elif '_c_expression' in py_source_rel and '.Tensor.' in name: + py_source_rel = app.config.tensor_yaml + name.split('.')[-1] + '_doc.yaml' + re_view = f"\n.. image:: https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/{app.config.docs_branch}/" + \ + f"resource/_static/logo_source_en.svg\n :target: " + app.config.giturl + \ + f"{app.config.copy_repo}/blob/{app.config.branch}/" + \ + py_source_rel + '\n :alt: View Source On Gitee\n\n' + + if re_view not in content: + content = re.sub('([=]{5,})\n', r'\1\n' + re_view, content, 1) + + elif 'ops/functional_overload' in py_source_rel: + py_source_rel = app.config.func_yaml + name.split('.')[-1] + '_doc.yaml' + re_view = f"\n.. image:: https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/{app.config.docs_branch}/" + \ + f"resource/_static/logo_source_en.svg\n :target: " + app.config.giturl + \ + f"{app.config.copy_repo}/blob/{app.config.branch}/" + \ + py_source_rel + '\n :alt: View Source On Gitee\n\n' + + if re_view not in content: + content = re.sub('([=]{5,})\n', r'\1\n' + re_view, content, 1) + + elif py_source_rel: + if not os.path.exists(os.path.join(app.config.repo_path, app.config.repo_whl, py_source_rel)): + py_source_rel = py_source_rel.replace('.py', '/__init__.py') + + re_view = f"\n.. 
image:: https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/{app.config.docs_branch}/" + \ + f"resource/_static/logo_source_en.svg\n :target: " + app.config.giturl + \ + f"{app.config.copy_repo}/blob/{app.config.branch}/" + app.config.repo_whl + \ + py_source_rel + '\n :alt: View Source On Gitee\n\n' + + if re_view not in content: + content = re.sub('([=]{5,})\n', r'\1\n' + re_view, content, 1) filename = os.path.join(path, filename_map.get(name, name) + suffix) if os.path.isfile(filename): with open(filename, encoding=encoding) as f: diff --git a/docs/mindspore/source_en/api_python/env_var_list.rst b/docs/mindspore/source_en/api_python/env_var_list.rst index 84cc33b0de34142e7cc3b272382729251dc6cdff..70d6d931cff02690b1006791033477e306682ce3 100644 --- a/docs/mindspore/source_en/api_python/env_var_list.rst +++ b/docs/mindspore/source_en/api_python/env_var_list.rst @@ -103,7 +103,7 @@ Graph Compilation and Execution - Specify which modules are treated as third-party libraries in static graph mode without JIT static compilation. Their functions and methods will be interpreted and executed. - String - The module name, corresponding to the name of the imported top-level module. If there are more than one, separate them with commas. For example, `export MS_JIT_IGNORE_MODULES=numpy,scipy`. - - Static graph mode can automatically recognize third-party libraries, and generally there is no need to set this environment variable for recognizable third-party libraries such as NumPy and Scipy. If `MS_JIT_IGNORE_MODULES` and `MS_JIT_MODULES` specify the same module name at the same time, the former takes effect and the latter does not. + - Static graph mode can automatically recognize third-party libraries, and generally there is no need to set this environment variable for recognizable third-party libraries such as NumPy and SciPy. If `MS_JIT_IGNORE_MODULES` and `MS_JIT_MODULES` specify the same module name at the same time, the former takes effect and the latter does not. * - MS_DEV_FALLBACK_DUMP_NODE - Print syntax expressions supported by Static Graph Syntax Enhancement in the code. - Integer @@ -279,7 +279,7 @@ Graph Compilation and Execution compile_statistics: Whether to enable compile statistics, with a default value of false. - backend_compile_cache: Whether to enable backend cache in O0/O1 mode, only effective when enable complie cache(MS_COMPILER_CACHE_ENABLE), with a default value of true. + backend_compile_cache: Whether to enable backend cache in O0/O1 mode, only effective when enable compilation cache(MS_COMPILER_CACHE_ENABLE), with a default value of true. view: Whether to enable view kernels, only effective in O0 or O1 mode, with a default value of true. - @@ -307,6 +307,8 @@ Graph Compilation and Execution acl_allocator: Whether to enable ACL memory allocator, with a default value of true. somas_whole_block: Whether to use the entire Somas for memory allocation, with a default value of false. + + enable_small_pool: Whether to enable small pool, with a default value of false. When enabled, memory allocations smaller than 1MB are managed by the small memory pool. - * - MS_DEV_GRAPH_KERNEL_FLAGS @@ -352,7 +354,7 @@ Graph Compilation and Execution - * - MS_DEV_LAUNCH_BLOCKING - - Control whether the operator is synchronously launched. When enabled, the operator will be launced in a single thread and will synchronize the stream. + - Control whether the operator is synchronously launched. 
When enabled, the operator will be launched in a single thread and will synchronize the stream. - Integer - 1: Enable operator synchronization launch. @@ -392,7 +394,7 @@ Graph Compilation and Execution * - MS_SUPPORT_BINARY - Control whether support run pyc or so in graph mode. - Integer - - 1:Support run pyc or so in graph mode. + - 1: Support run pyc or so in graph mode. No setting or other value: Not support. - @@ -872,11 +874,11 @@ Silent Data Corruption Detection - Integer - 0: Disable feature value detection function - 1: Enable feature value detection function, when error was detected, just print log, not thow exception + 1: Enable feature value detection function, when error was detected, just print log, not throw exception - 2: Enable feature value detection function, when error was detected, thow exception + 2: Enable feature value detection function, when error was detected, throw exception - 3: Enable feature value detection function, when error was detected, thow exception, but at the same time write value detection info of each time to log file (this requires set ascend log level to info or debug) + 3: Enable feature value detection function, when error was detected, throw exception, but at the same time write value detection info of each time to log file (this requires set ascend log level to info or debug) - Currently, this feature only supports Atlas A2 training series products, and only detects abnormal feature value that occur during the training of Transformer class models with bfloat16 data type Considering that the feature value range can not be known ahead, setting NPU_ASD_ENABLE to 1 is recommended to enable silent check, which prevents training interruption caused by false detection @@ -922,7 +924,7 @@ Third-party Library - Value Range - Description * - OPTION_PROTO_LIB_PATH - - Specifies the RPOTO dependent library path. + - Specifies the PROTO dependent library path. - String - File path, which can be a relative path or an absolute path. - @@ -961,9 +963,9 @@ Third-party Library - Absolute path for CUDA package installation - Required for GPU environment only, generally no need to set. If multiple versions of CUDA are installed in the GPU environment, it is recommended to configure this environment variable in order to avoid confusion. * - MS_ENABLE_TFT - - Enable `MindIO TFT `_ feature. Turn on TTP, UCE, TRE or ARF feature. + - Enable Training Fault Tolerance. Most functions rely on `MindIO TFT `_ . - String - - "{TTP:1,UCE:1,TRE:1,ARF:1,TSP:1}". TTP (Try To Persist): End of life CKPT, UCE (Uncorrectable Memory Error): Fault tolerance and recovery, TRE(Training Result Error): Restoring training result exceptions, ARF (Air Refuelling), TSP(Training step pause): Process level rescheduling and recovery feature. The four features can be enabled separately. If you only want to enable one of them, set the corresponding value to 1. Other values: MindIO TFT not turned on. (When using UCE or ARF, TTP is enabled by default. TRE can not be used with UCE or ARF feature.) + - "{TTP:1,UCE:1,TRE:1,ARF:1,TSP:1,RSC:1}". TTP (Try To Persist): End of life CKPT, UCE (Uncorrectable Memory Error): Fault tolerance and recovery, TRE(Training Result Error): Restoring training result exceptions, ARF (Air Refuelling), TSP(Training step pause): Process level rescheduling and recovery feature, RSC (Register Stop/Start Controller): POD-level rescheduling function. Above features can be enabled separately. If you only want to enable one of them, set the corresponding value to 1. 
(When using UCE or ARF, TTP is enabled by default. TRE can not be used with UCE or ARF feature. When only RSC:1 is enabled (the current version must rely on MindX), other training fault tolerance features are not effective.) - Graph mode can only be enabled on the Ascend backend and jit_level is set to "O0" or "O1". * - MS_TFT_IP - The IP address where the MindIO controller thread is located for processor connections. diff --git a/docs/mindspore/source_en/api_python/operator_list_parallel.md b/docs/mindspore/source_en/api_python/operator_list_parallel.md index 030afe951272532261c80e54acbbeda3e2ba1c39..ca1aa66f3c242811ada47994bd5b5c649929c9b3 100644 --- a/docs/mindspore/source_en/api_python/operator_list_parallel.md +++ b/docs/mindspore/source_en/api_python/operator_list_parallel.md @@ -47,7 +47,7 @@ | [mindspore.ops.DivNoNan](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.DivNoNan.html) | None | Not support config layout | | [mindspore.ops.Dropout](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.Dropout.html) | None | Not support config layout | | [mindspore.ops.Elu](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.Elu.html) | None | Not support config layout | -| [mindspore.ops.embedding](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.embedding.html) | 1. padding_idx, max_norm, norm_type, and scale_gradid_by_freq only support default values.
2. The first input does not support splitting.
3. The second input does not support scenarios where it cannot be cut off. | Layout configuration is supported. | +| [mindspore.ops.embedding](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.embedding.html) | 1. padding_idx, max_norm, norm_type, and scale_grad_by_freq only support default values.
2. The first input does not support splitting.
3. The second input does not support scenarios where it cannot be cut off. | Layout configuration is supported. | | [mindspore.ops.EmbeddingLookup](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.EmbeddingLookup.html) | The same as Gather. | Not support config layout | | [mindspore.ops.Equal](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.Equal.html) | None | Not support config layout | | [mindspore.ops.Erf](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.Erf.html) | None | Not support config layout | @@ -72,7 +72,7 @@ | [mindspore.ops.InplaceSub](https://mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.InplaceSub.html) | The same as InplaceAdd. | Not support config layout | | [mindspore.ops.InplaceUpdate](https://mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.InplaceUpdate.html) | The same as InplaceAdd. | Not support config layout | | [mindspore.ops.Inv](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.Inv.html) | None | Not support config layout | -| [mindspore.ops.IOU](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.IOU.html) | The first dimension of the `anchor_boxes` and `gt_boxes` can be spilt. | Not support config layout | +| [mindspore.ops.IOU](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.IOU.html) | The first dimension of the `anchor_boxes` and `gt_boxes` can be split. | Not support config layout | | [mindspore.ops.IsFinite](https://mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.IsFinite.html) | None | Not support config layout | | [mindspore.ops.KLDivLoss](https://mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.KLDivLoss.html) | None | Not support config layout | | [mindspore.ops.LayerNorm](https://mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.LayerNorm.html) | The strategy for the second input gamma and the third input beta needs to be equal to the input_x_strategy[begin_params.axis:], input_x_strategy is the strategy for the first input. | Support config layout. The layout configuration for the second input gamma and the third input beta needs to be equal to the input_x_layout_tuple[begin_params.axis:], input_x_layout_tuple is the layout configuration for the first input. | @@ -107,13 +107,13 @@ | [mindspore.ops.RandomChoiceWithMask](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.RandomChoiceWithMask.html) | Only the all-1 strategy is supported. | Not support config layout | | [mindspore.ops.RealDiv](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.RealDiv.html) | None | Not support config layout | | [mindspore.ops.Reciprocal](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.Reciprocal.html) | None | Not support config layout | -| [mindspore.ops.ReduceMax](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.ReduceMax.html) | When the input_x is splited on the axis dimension, the distributed result may be inconsistent with that on the single machine. | Not support config layout | -| [mindspore.ops.ReduceMin](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.ReduceMin.html) | When the input_x is splited on the axis dimension, the distributed result may be inconsistent with that on the single machine. 
| Not support config layout | +| [mindspore.ops.ReduceMax](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.ReduceMax.html) | When the input_x is split on the axis dimension, the distributed result may be inconsistent with that on the single machine. | Not support config layout | +| [mindspore.ops.ReduceMin](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.ReduceMin.html) | When the input_x is split on the axis dimension, the distributed result may be inconsistent with that on the single machine. | Not support config layout | | [mindspore.ops.ReduceSum](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.ReduceSum.html) | None | Not support config layout | | [mindspore.ops.ReduceMean](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.ReduceMean.html) | None | Not support config layout | | [mindspore.ops.ReLU](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.ReLU.html) | None | Support config input layout. Output layout is not open for configuration. | | [mindspore.ops.ReLU6](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.ReLU6.html) | None | Not support config layout | -| [mindspore.ops.Reshape](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.Reshape.html) | Configuring sharding strategy is not supported. In auto parallel mode, if multiple operators are followed by the reshape operator, different shard strategys are not allowed to be configured for these operators. | Not support config layout | +| [mindspore.ops.Reshape](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.Reshape.html) | Configuring sharding strategy is not supported. In auto parallel mode, if multiple operators are followed by the reshape operator, different shard strategies are not allowed to be configured for these operators. | Not support config layout | | [mindspore.ops.Rint](https://mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.Rint.html) | None | Not support config layout | | [mindspore.ops.ResizeNearestNeighbor](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.ResizeNearestNeighbor.html) | When `align_corners=True` is set, only the first dimension and the second dimension can be split. | Not support config layout | | [mindspore.ops.ROIAlign](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.ROIAlign.html) | Sharding the H/W dimension of the input(features) and the second dimension of input(rois) is not supported. | Not support config layout | @@ -144,7 +144,7 @@ | [mindspore.ops.Sin](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.Sin.html) | None | Not support config layout | | [mindspore.ops.Sinh](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.Sinh.html) | None | Not support config layout | | [mindspore.ops.Softmax](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.Softmax.html) | The logits can't be split into the dimension of axis, otherwise it's inconsistent with the single machine in the mathematical logic. | Support config input layout. Output layout is not open for configuration, and can't config layout on the dimension of axis. | -| [mindspore.ops.SoftmaxCrossEntropyWithLogits](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.SoftmaxCrossEntropyWithLogits.html) | The last dimension of logits and labels can't be splited; Only supports using output[0]. 
| Not support config layout | +| [mindspore.ops.SoftmaxCrossEntropyWithLogits](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.SoftmaxCrossEntropyWithLogits.html) | The last dimension of logits and labels can't be split; Only supports using output[0]. | Not support config layout | | [mindspore.ops.Softplus](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.Softplus.html) | None | Not support config layout | | [mindspore.ops.Softsign](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.Softsign.html) | None | Not support config layout | | [mindspore.ops.SoftShrink](https://mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.SoftShrink.html) | None | Not support config layout | @@ -167,8 +167,8 @@ | [mindspore.ops.TruncateMod](https://mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.TruncateMod.html) | None | Not support config layout | | [mindspore.ops.Unique](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.Unique.html) | Only support the repeat calculate shard strategy (1,). | Not support config layout | | [mindspore.ops.UnsortedSegmentSum](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.UnsortedSegmentSum.html) | The shard of input_x and segment_ids must be the same as the dimension of segment_ids. | Not support config layout | -| [mindspore.ops.UnsortedSegmentMin](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.UnsortedSegmentMin.html) | The shard of input_x and segment_ids must be the same as the dimension of segment_ids. Note that if the segment id i is missing, then the output[i] will be filled with the maximum of the input type. The user needs to mask the maximum value to avoid value overflow. The communication operation such as AllReudce will raise an Run Task Error due to overflow. | Not support config layout | -| [mindspore.ops.UnsortedSegmentMax](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.UnsortedSegmentMax.html) | The shard of input_x and segment_ids must be the same as the dimension of segment_ids. Note that if the segment id i is missing, then the output[i] will be filled with the minimum of the input type. The user needs to mask the minimum value to avoid value overflow. The communication operation such as AllReudce will raise an Run Task Error due to overflow. | Not support config layout | +| [mindspore.ops.UnsortedSegmentMin](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.UnsortedSegmentMin.html) | The shard of input_x and segment_ids must be the same as the dimension of segment_ids. Note that if the segment id i is missing, then the output[i] will be filled with the maximum of the input type. The user needs to mask the maximum value to avoid value overflow. The communication operation such as AllReduce will raise an Run Task Error due to overflow. | Not support config layout | +| [mindspore.ops.UnsortedSegmentMax](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.UnsortedSegmentMax.html) | The shard of input_x and segment_ids must be the same as the dimension of segment_ids. Note that if the segment id i is missing, then the output[i] will be filled with the minimum of the input type. The user needs to mask the minimum value to avoid value overflow. The communication operation such as AllReduce will raise an Run Task Error due to overflow. 
| Not support config layout | | [mindspore.ops.Xdivy](https://mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.Xdivy.html) | None | Not support config layout | | [mindspore.ops.Xlogy](https://mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.Xlogy.html) | None | Not support config layout | | [mindspore.ops.ZerosLike](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.ZerosLike.html) | None | Not support config layout | diff --git a/docs/mindspore/source_en/features/data_engine.md b/docs/mindspore/source_en/features/data_engine.md index 256403c7b0b70fac8e1d1d0fab10ebb251c357f4..311116c43009f977d52ffcd64300b39919c65c8c 100644 --- a/docs/mindspore/source_en/features/data_engine.md +++ b/docs/mindspore/source_en/features/data_engine.md @@ -61,7 +61,7 @@ In addition, due to the limited resources of device-side scenarios, MindSpore pr The data processing pipeline continuously processes the data and sends the processed data to the Device-side cache, and after the execution of one Step, the data of the next Step is read directly from the Device's cache. - - datat processing: for processing the dataset into the input needed by the network and passing it to the sending queue to ensure efficient data processing. + - data processing: for processing the dataset into the input needed by the network and passing it to the sending queue to ensure efficient data processing. - sending Queue: maintaining data queues to ensure that data processing and network computing processes do not interfere with each other, acting as a bridge. diff --git a/docs/mindspore/source_en/features/images/arch_en.png b/docs/mindspore/source_en/features/images/arch_en.png index 7e6194c7c2b75724afa39b2b64e663ef97e290c0..2c6aeefcdb720e28e1671ce51b729d00541a5881 100644 Binary files a/docs/mindspore/source_en/features/images/arch_en.png and b/docs/mindspore/source_en/features/images/arch_en.png differ diff --git a/docs/mindspore/source_en/features/parallel/operator_parallel.md b/docs/mindspore/source_en/features/parallel/operator_parallel.md index b01560c0f9f00156652f74b8783aed01d7ba32f1..7f383350f0ecc8d1850696c6ecb540bede3d5f06 100644 --- a/docs/mindspore/source_en/features/parallel/operator_parallel.md +++ b/docs/mindspore/source_en/features/parallel/operator_parallel.md @@ -6,7 +6,7 @@ With the development of deep learning, network models are becoming larger and larger, such as trillions of parametric models have emerged in the field of NLP, and the model capacity far exceeds the memory capacity of a single device, making it impossible to train on a single card or data parallel. -Operator-level parallelism is achieved by slicing the tensor involved in each operator in the network model. Logical data parallelism is used when only the data dimension is sliced, while logical model parallelism is used when only the model dimension is silced. The training of large models is enabled by reducing the memory consumption of a single device. +Operator-level parallelism is achieved by slicing the tensor involved in each operator in the network model. Logical data parallelism is used when only the data dimension is sliced, while logical model parallelism is used when only the model dimension is sliced. The training of large models is enabled by reducing the memory consumption of a single device. MindSpore provides two operator-level parallelism capabilities: [Operator-level Parallelism](#basic-principle) and [Higher-order Operator-level Parallelism](#higher-order-operator-level-parallelism). 
Operator-level Parallelism uses simple tensor dimension splitting strategies to describe tensor distribution, meeting the requirements of most common scenarios. Higher-order Operator-level Parallelism enables complex partitioning scenarios by opening device arrangement descriptions, supporting: Non-contiguous device allocation, Multi-dimensional hybrid partitioning and so on. Both ops and mint operators are supported for the operator-level parallel capability of the two granularities. This chapter only introduces the operator-level parallelism and high-order operator-level parallelism based on ops operators. For the configuration method of operator-level parallelism based on mint operators, please refer to the mint Operator Parallel Practice and Higher-Order mint Operator Parallel Practice in the [Operator-level Parallelism Tutorial](https://www.mindspore.cn/tutorials/en/master/parallel/operator_parallel.html). @@ -25,7 +25,7 @@ Related interfaces: - `ops.Gather().add_prim_attr("manual_split", split_tuple)`: This interface configures the first input of the Gather operator to be non-uniformly sliced, which is only valid for axis=0. `split_tuple` is a tuple with elements of type int, the sum of the elements must be equal to the length of the 0th dimension of the first input in the Gather operator, and the number of tuples must be equal to the number of 0th dimensional slices of the first input in the Gather operator. - `ops.Gather().add_prim_attr("primitive_target", "CPU")`: This interface configures the Gather operator to execute on the CPU for heterogeneous scenarios. - `ops.Reshape().add_prim_attr("skip_redistribution")`: Do not apply tensor redistribution (For tensor redistribution, see [Basic Principle](#basic-principle)) before and after ops.Reshape. - - `ops.ReduceSum().add_prim_attr("cross_batch")`: This interface only supports Reduce operators. When cross_batch is configurated, if the sliced axis is same as the calculated axis of reduce ops, the synchronization will not be added to each cards, which causes different result that is different from that of single card. + - `ops.ReduceSum().add_prim_attr("cross_batch")`: This interface only supports Reduce operators. When cross_batch is configured, if the sliced axis is same as the calculated axis of reduce ops, the synchronization will not be added to each cards, which causes different result that is different from that of single card. - `ops.TensorScatterUpdate().add_prim_attr("self_define_shard", True)`: When set `self_define_shard` to an operator, input/output layout can config to this operator (whatever this operator supports sharding). However, user needs to ensure the correctness of input/output layout and accuracy of operator. ## Basic Principle @@ -40,7 +40,7 @@ Tensor Layout is used to describe the distribution information about the Tensor If the two-dimensional matrix is sliced to four nodes, there are four types of slices: simultaneously slices both row and column, replication, row slicing + replication, and column slicing + replication, as shown below: -Tensor Redistribution is used to handle the conversion between different Tensor Layout, which can convert the Tensor from one layout to another in the cluster. All redistribution operations are decomposed into combinations of operators such as "set communication+split+concat". The following two figures illustrate several Tensor Redistribution operations. 
+Tensor Redistribution is used to handle the conversion between different Tensor Layouts, which can convert the Tensor from one layout to another in the cluster. All redistribution operations are decomposed into combinations of operators such as "set communication+split+concat". The following two figures illustrate several Tensor Redistribution operations. *Figure: Tensor is sliced to redistribution of two nodes* @@ -64,7 +64,7 @@ class DenseMatMulNet(nn.Cell): return z net = DenseMatMulNet() -paralell_net = AutoParallel(net, parallel_mode='semi_auto') +parallel_net = AutoParallel(net, parallel_mode='semi_auto') ``` In the above example, the user computes two consecutive two-dimensional matrix multiplications on 4 cards: `Z = (X * W) * V` . For the first matrix multiplication `Y = X * W`, the user wants to slice X by rows in 4 parts (i.e. data parallelism), while for the second matrix multiplication `Z = Y * V`, the user wants to slice V by columns in 4 parts (i.e. model parallelism): @@ -142,5 +142,5 @@ class DenseMatMulNet(nn.Cell): return y net = DenseMatMulNet() -paralell_net = AutoParallel(net, parallel_mode='semi_auto') +parallel_net = AutoParallel(net, parallel_mode='semi_auto') ``` \ No newline at end of file diff --git a/docs/mindspore/source_en/features/parallel/optimizer_parallel.md b/docs/mindspore/source_en/features/parallel/optimizer_parallel.md index d5d069cdde61bf3ad155585cbca383cf8817cd0f..3d1fa968481fc683752f5ccdc8f57408104c73f4 100644 --- a/docs/mindspore/source_en/features/parallel/optimizer_parallel.md +++ b/docs/mindspore/source_en/features/parallel/optimizer_parallel.md @@ -52,5 +52,5 @@ Combining the above characteristics, the implementation scheme of parameter slic In the test validation of the actual network training, we found that the memory gain from parameter slicing is significant. In particular, for large-scale network models, the popular Adaptive Moment estimation (Adam) and Layer-wise Adaptive Moments optimizer for Batching training (LAMB) are usually chosen to train the network, and the number of parameters and computations of the optimizer itself should not be neglected. After parameter grouping, the weight parameters in the network and the two copies of state parameters in the optimizer are reduced by a factor of N-1/N, which greatly saves the static memory. This provides the possibility to increase the number of samples in a single iteration and improve the overall training throughput, which effectively solves the memory pressure of large-scale network training. -Optimizer parameter slicing implemented by MindSpore also has the advantage of being mixed with operator-level parallelism. When the number of sliced parts in the operator-level model parallel parameters are smaller than the number of dimensions, the optimizer parameters can continue to be sliced in the dimension of data parallelism, increasing the utilization of machine resources and thus improving the end-to-end performance. +Optimizer parameter slicing implemented by MindSpore also has the advantage of being mixed with operator-level parallelism. When the number of sliced parts in the operator-level model parallel parameters is smaller than the number of dimensions, the optimizer parameters can continue to be sliced in the dimension of data parallelism, increasing the utilization of machine resources and thus improving the end-to-end performance. 
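To make the combination of operator-level sharding and optimizer parameter slicing concrete, a minimal sketch is shown below. It uses the context-based `mindspore.set_auto_parallel_context` interface rather than the `AutoParallel` wrapper that appears in the examples above, and the 8-device layout, tensor shapes, and strategy tuples are illustrative assumptions, not values taken from these documents.

```python
# Minimal sketch: operator-level sharding combined with optimizer parameter
# slicing. Assumes an 8-device job already launched with msrun or mpirun;
# the shapes and sharding strategies below are illustrative only.
import mindspore as ms
import mindspore.nn as nn
from mindspore import ops
from mindspore.communication import init

init()  # initialize the distributed communication backend
ms.set_auto_parallel_context(parallel_mode="semi_auto_parallel",
                             enable_parallel_optimizer=True)  # slice optimizer states across data-parallel ranks

class ShardedMatMulNet(nn.Cell):
    def __init__(self):
        super().__init__()
        # 2-way split on the batch dimension (data parallel) and 4-way split on
        # the weight's output dimension (model parallel): 2 * 4 = 8 devices.
        self.matmul = ops.MatMul().shard(((2, 1), (1, 4)))
        self.weight = ms.Parameter(ops.ones((1024, 4096), ms.float16), name="w")

    def construct(self, x):
        return self.matmul(x, self.weight)

net = ShardedMatMulNet()
# When an optimizer is built over the trainable parameters, its moment tensors
# for `w` are partitioned along the data-parallel dimension as described above.
optimizer = nn.AdamWeightDecay(net.trainable_params(), learning_rate=1e-3)
```

With `enable_parallel_optimizer=True`, each data-parallel rank keeps only its slice of the optimizer states, which is the static-memory saving described in the paragraph above.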
diff --git a/docs/mindspore/source_en/features/parallel/pipeline_parallel.md b/docs/mindspore/source_en/features/parallel/pipeline_parallel.md index e42b37461b361d014afee176d0315ec2b3d856a0..e70617c43cf40f30423a85799ebbdbcaae6dd25d 100644 --- a/docs/mindspore/source_en/features/parallel/pipeline_parallel.md +++ b/docs/mindspore/source_en/features/parallel/pipeline_parallel.md @@ -16,7 +16,7 @@ Related interfaces: 3. [mindspore.parallel.Pipeline(network, micro_size=1, stage_config={"cell1":0, "cell2":1})](https://www.mindspore.cn/docs/en/master/api_python/parallel/mindspore.parallel.nn.Pipeline.html): Pipeline parallelism requires wrapping the `network` with an additional layer of `Pipeline`, `micro_size` specifies the number of MicroBatch, which are finer-grained splits of a MiniBatch to improve hardware utilization. If using `WithLossCell` to encapsulate `network`, the name of the `Cell` will be changed and the `_backbone` prefix will be added. The final loss is the accumulation of losses from all MicroBatches. `stage_config` indicates the stage assignment for each Cell in the network. `micro_size` must be greater than or equal to the number of `stages`. -4. [mindspore.parallel.PipelineGradReducer(parameters, scale_sense=1.0, opt_shard=None)](https://www.mindspore.cn/docs/en/master/api_python/parallel/mindspore.parallel.nn.PipelineGradReducer.html): pipeline parallelism requires using `PipelineGradReducer` for gradient reduction. Because the output of pipeline parallelism is derived by the addition of several micro-batch outputs, as the gradient do. +4. [mindspore.parallel.PipelineGradReducer(parameters, scale_sense=1.0, opt_shard=None)](https://www.mindspore.cn/docs/en/master/api_python/parallel/mindspore.parallel.nn.PipelineGradReducer.html): pipeline parallelism requires using `PipelineGradReducer` for gradient reduction. Because the output of pipeline parallelism is derived by the addition of several micro-batch outputs, as the gradient does. 5. [mindspore.parallel.sync_pipeline_shared_parameters(net)](https://www.mindspore.cn/docs/en/master/api_python/parallel/mindspore.parallel.sync_pipeline_shared_parameters.html): Synchronize pipeline parallel stage shared parameters. diff --git a/docs/mindspore/source_en/note/api_mapping/pytorch_api_mapping.md b/docs/mindspore/source_en/note/api_mapping/pytorch_api_mapping.md index 242ccf5b105c7f3bf9fba7a8fefa3bc411e1efed..628ab9a20c3660e46cdc28d5b47777b8956e8e09 100644 --- a/docs/mindspore/source_en/note/api_mapping/pytorch_api_mapping.md +++ b/docs/mindspore/source_en/note/api_mapping/pytorch_api_mapping.md @@ -18,7 +18,7 @@ The API mapping is also consistent in the following exception scenarios: (2) MindSpore API does not support passing parameters of plural type. -**Exception Scenario 2**: Compared to MindSpore APIss, the extra parameters of PyTorch API are [general difference parameters](https://www.mindspore.cn/docs/en/master/note/api_mapping/pytorch_api_mapping.html#general-difference-parameter-table). General difference parameters exist because PyTorch has some parameters that are added for non-functionality such as performance optimization, and the performance optimization mechanism of MindSpore is different from that of PyTorch. +**Exception Scenario 2**: Compared to MindSpore APIs, the extra parameters of PyTorch API are [general difference parameters](https://www.mindspore.cn/docs/en/master/note/api_mapping/pytorch_api_mapping.html#general-difference-parameter-table). 
General difference parameters exist because PyTorch has some parameters that are added for non-functional purposes such as performance optimization, and the performance optimization mechanism of MindSpore is different from that of PyTorch. **Exception Scenario 3**: If it can be guaranteed that MindSpore API uses the default configuration (or that the user does not configure it), MindSpore API can implement the same functionality as the PyTorch API, and MindSpore API has more parameters than PyTorch API. The functionality is not considered a difference. @@ -37,7 +37,7 @@ Because of the framework mechanism, MindSpore does not provide the following par | :-------------: | :----------------------------------------------------------: |:--:| | out | Indicates the output Tensor |Assign the operation result to the out parameter, not supported in MindSpore.| | layout | Indicates the memory distribution strategy |PyTorch supports torch.strided and torch.sparse_coo, not supported in MindSpore.| -| device | Indicates the Tensor storage location |Including device type and optional device number. MindSpore supports the following options:
1. After creating a Tensor, it is created on the CPU by default. When executing operators, it will be automatically copied to the corresponding device_target according to [set_device](https://www.mindspore.cn/docs/en/master/api_python/mindspore/mindspore.set_device.html).
2. If you want to manually copy the Tensor after creation, you can also call the [Tensor.move_to](https://www.mindspore.cn/docs/en/master/api_python/mindspore/Tensor/mindspore.Tensor.move_to.html). | +| device | Indicates the Tensor storage location |Including device type and optional device number. MindSpore supports the following options:
1. A newly created Tensor resides on the CPU by default. When operators are executed, it is automatically copied to the corresponding device_target according to [mindspore.set_device](https://www.mindspore.cn/docs/en/master/api_python/mindspore/mindspore.set_device.html).
2. If you want to manually copy the Tensor after creation, you can also call the [mindspore.Tensor.move_to](https://www.mindspore.cn/docs/en/master/api_python/mindspore/Tensor/mindspore.Tensor.move_to.html). | | requires_grad | Indicates whether to update the gradient |MindSpore can be accessed through the `Parameter.requires_grad` attribute to control.| | pin_memory | Indicates whether to use locking page memory |Not supported in MindSpore.| | memory_format | Indicates the memory format of the Tensor |Not supported in MindSpore.| @@ -370,7 +370,7 @@ Because of the framework mechanism, MindSpore does not provide the following par | [torch.nn.functional.hardsigmoid](https://pytorch.org/docs/2.1/nn.functional.html#torch.nn.functional.hardsigmoid) | [mindspore.mint.nn.functional.hardsigmoid](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.functional.hardsigmoid.html) |Consistent functions, MindSpore has no parameter inplace. | | [torch.nn.functional.hardswish](https://pytorch.org/docs/2.1/nn.functional.html#torch.nn.functional.hardswish) | [mindspore.mint.nn.functional.hardswish](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.functional.hardswish.html) |Consistent functions, MindSpore has no parameter inplace. | | [torch.nn.functional.interpolate](https://pytorch.org/docs/2.1/nn.functional.html#torch.nn.functional.interpolate) | [mindspore.mint.nn.functional.interpolate](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.functional.interpolate.html) | Consistent functions, MindSpore has no parameter antialias. | -| [torch.nn.functional.l1_loss](https://pytorch.org/docs/2.1/nn.functional.html#l1-loss) | [mindspore.mint.nn.functional.l1_loss](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.functional.l1_loss.html) | Consistent](https://www.mindspore.cn/docs/en/master/note/api_mapping/pytorch_api_mapping.html#api-mapping-consistency-criteria-and-exceptions) | +| [torch.nn.functional.l1_loss](https://pytorch.org/docs/2.1/nn.functional.html#l1-loss) | [mindspore.mint.nn.functional.l1_loss](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.functional.l1_loss.html) | [Consistent](https://www.mindspore.cn/docs/en/master/note/api_mapping/pytorch_api_mapping.html#api-mapping-consistency-criteria-and-exceptions) | | [torch.nn.functional.leaky_relu](https://pytorch.org/docs/2.1/nn.functional.html#leaky-relu) | [mindspore.mint.nn.functional.leaky_relu](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.functional.leaky_relu.html) |Consistent functions, MindSpore has no parameter inplace. 
| | [torch.nn.functional.linear](https://pytorch.org/docs/2.1/nn.functional.html#torch.nn.functional.linear) | [mindspore.mint.nn.functional.linear](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.functional.linear.html) | [Consistent](https://www.mindspore.cn/docs/en/master/note/api_mapping/pytorch_api_mapping.html#api-mapping-consistency-criteria-and-exceptions) | | [torch.nn.functional.log_softmax](https://pytorch.org/docs/2.1/nn.functional.html#torch.nn.functional.log_softmax) | [mindspore.mint.nn.functional.log_softmax](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.functional.log_softmax.html) | [Consistent](https://www.mindspore.cn/docs/en/master/note/api_mapping/pytorch_api_mapping.html#api-mapping-consistency-criteria-and-exceptions) | diff --git a/docs/mindspore/source_zh_cn/api_python/env_var_list.rst b/docs/mindspore/source_zh_cn/api_python/env_var_list.rst index 8459888597db9e48358bd0eb7c62093ebc831d9d..b9233a829ba1d595027e1567e8fe0f9355daefad 100644 --- a/docs/mindspore/source_zh_cn/api_python/env_var_list.rst +++ b/docs/mindspore/source_zh_cn/api_python/env_var_list.rst @@ -307,6 +307,8 @@ acl_allocator: 是否使用ACL内存分配器,默认值为true。 somas_whole_block: 是否使用SOMAS整块内存分配,默认值为false。 + + enable_small_pool: 是否开启小内存池,默认值为false。开启后小于1MB的内存申请使用小内存池单独管理。 - * - MS_DEV_GRAPH_KERNEL_FLAGS @@ -918,7 +920,7 @@ Dump调试 - 取值 - 说明 * - OPTION_PROTO_LIB_PATH - - RPOTO依赖库库路径 + - PROTO依赖库库路径 - String - 目录路径,支持相对路径与绝对路径 - @@ -957,9 +959,9 @@ Dump调试 - CUDA包安装的绝对路径 - 仅限GPU环境需要,一般无需设置,如在GPU环境中安装了多种版本的CUDA,为了避免混淆,建议配置此环境变量。 * - MS_ENABLE_TFT - - 使能 `MindIO TFT `_ 特性,表示启用 TTP、UCE、TRE 或 ARF 功能。 + - 使能训练故障容错(Training Fault Tolerance)功能,大多数功能依赖 `MindIO TFT `_ 组件。 - String - - "{TTP:1,UCE:1,ARF:1, TSP:1}"。TTP (Try To Persist):临终 CKPT 功能、UCE (Uncorrectable Memory Error):UCE 故障容错恢复功能、TRE (Training Result Error):训练结果异常恢复功能、ARF (Air Refuelling):进程级重调度恢复功能. TSP(Training Step Pause):训练迭代暂停。五个特性可以分开使能,如果只想启用其中的某一个功能,则将对应的值设置为 1 即可。其他值:未开启MindIO TFT。(开启 UCE 或者 ARF 功能时,默认开启 TTP 功能。TRE 功能不可以与 UCE 或 ARF 功能同时使用。) + - "{TTP:1,UCE:1,ARF:1,TSP:1}"。TTP (Try To Persist):临终 CKPT 功能、UCE (Uncorrectable Memory Error):UCE 故障容错恢复功能、TRE (Training Result Error):训练结果异常恢复功能、ARF (Air Refuelling):进程级重调度恢复功能、TSP(Training Step Pause):训练迭代暂停、RSC (Register Stop/Start Controller): POD级重调度功能。上述特性可以分开使能,如果只想启用其中的某一个功能,则将对应的值设置为 1 即可。(开启 UCE 或者 ARF 功能时,默认开启 TTP 功能。TRE 功能不可以与 UCE 或 ARF 功能同时使用。仅开启 RSC (当前版本必须依赖MindX)时,其他训练故障容错功能不生效。) - 仅限在 Ascend 后端开启图模式,且 jit_level 设置为 "O0" 或 "O1"。 * - MS_TFT_IP - MindIO的controller线程所在IP,供processor链接。 diff --git a/docs/mindspore/source_zh_cn/api_python/operator_list_parallel.md b/docs/mindspore/source_zh_cn/api_python/operator_list_parallel.md index 309870af9559fcf48d4f3f72e6d06fa4b135761f..4bc1b37d4ab6b4e6ec0f058aae29a53c8065eec7 100644 --- a/docs/mindspore/source_zh_cn/api_python/operator_list_parallel.md +++ b/docs/mindspore/source_zh_cn/api_python/operator_list_parallel.md @@ -47,7 +47,7 @@ | [mindspore.ops.DivNoNan](https://www.mindspore.cn/docs/zh-CN/master/api_python/ops/mindspore.ops.DivNoNan.html) | 无 | 不支持配置Layout | | [mindspore.ops.Dropout](https://www.mindspore.cn/docs/zh-CN/master/api_python/ops/mindspore.ops.Dropout.html) | 无 | 不支持配置Layout | | [mindspore.ops.Elu](https://www.mindspore.cn/docs/zh-CN/master/api_python/ops/mindspore.ops.Elu.html) | 无 | 不支持配置Layout | -| [mindspore.ops.embedding](https://www.mindspore.cn/docs/zh-CN/master/api_python/ops/mindspore.ops.embedding.html) | 1. padding_idx、max_norm、norm_type和scale_gradid_by_freq仅支持默认值;
2. 第一个输入不支持切分;
3. 第二个输入不支持切不满的情况。 | 支持配置Layout | +| [mindspore.ops.embedding](https://www.mindspore.cn/docs/zh-CN/master/api_python/ops/mindspore.ops.embedding.html) | 1. padding_idx、max_norm、norm_type和scale_grad_by_freq仅支持默认值;
2. 第一个输入不支持切分;
3. 第二个输入不支持切不满的情况。 | 支持配置Layout | | [mindspore.ops.EmbeddingLookup](https://www.mindspore.cn/docs/zh-CN/master/api_python/ops/mindspore.ops.EmbeddingLookup.html) | 同Gather | 不支持配置Layout | | [mindspore.ops.Equal](https://www.mindspore.cn/docs/zh-CN/master/api_python/ops/mindspore.ops.Equal.html) | 无 | 不支持配置Layout | | [mindspore.ops.Erf](https://www.mindspore.cn/docs/zh-CN/master/api_python/ops/mindspore.ops.Erf.html) | 无 | 不支持配置Layout | diff --git a/docs/mindspore/source_zh_cn/features/images/arch_zh.png b/docs/mindspore/source_zh_cn/features/images/arch_zh.png index 689144efc2cd717e4c5b119e174f8b2efcaefbb9..f15a8ed4c1af970a4f94dea3cdbb10f85a92884e 100644 Binary files a/docs/mindspore/source_zh_cn/features/images/arch_zh.png and b/docs/mindspore/source_zh_cn/features/images/arch_zh.png differ diff --git a/docs/mindspore/source_zh_cn/features/parallel/operator_parallel.md b/docs/mindspore/source_zh_cn/features/parallel/operator_parallel.md index 4bed8a24d96298ebd91eb199cc1a5c8256363ab8..825632e1b2bafb5b9f708b1ff524060543e4241e 100644 --- a/docs/mindspore/source_zh_cn/features/parallel/operator_parallel.md +++ b/docs/mindspore/source_zh_cn/features/parallel/operator_parallel.md @@ -72,7 +72,7 @@ class DenseMatMulNet(nn.Cell): return z net = DenseMatMulNet() -paralell_net = AutoParallel(net, parallel_mode='semi_auto') +parallel_net = AutoParallel(net, parallel_mode='semi_auto') ``` 在以上例子中,用户在4个卡上计算两个连续的二维矩阵乘:`Z = (X * W) * V` 。第一个矩阵乘`Y = X * W`,用户想把X按行切4份(即数据并行);而第二个矩阵乘`Z = Y * V`,用户想把V按列切4份(即模型并行): @@ -152,5 +152,5 @@ class DenseMatMulNet(nn.Cell): return y net = DenseMatMulNet() -paralell_net = AutoParallel(net, parallel_mode='semi_auto') +parallel_net = AutoParallel(net, parallel_mode='semi_auto') ``` \ No newline at end of file diff --git a/docs/mindspore/source_zh_cn/note/api_mapping/pytorch_api_mapping.md b/docs/mindspore/source_zh_cn/note/api_mapping/pytorch_api_mapping.md index abcbaca7d79fbfdd33720cc6c445c155bba23974..9bb0aa45fc89e7f3900ad9be7a1389d56999acdd 100644 --- a/docs/mindspore/source_zh_cn/note/api_mapping/pytorch_api_mapping.md +++ b/docs/mindspore/source_zh_cn/note/api_mapping/pytorch_api_mapping.md @@ -37,7 +37,7 @@ mindspore.mint.argmax只有一种API形式,即mindspore.mint.argmax(input, dim |:-------------:| :------------------------------------------------: |:------------------------------------------------------------:| | out | 表示输出的Tensor | 把运算结果赋值给out参数,MindSpore目前无此机制 | | layout | 表示内存分布策略 | PyTorch支持torch.strided和torch.sparse_coo两种模式, MindSpore目前无此机制 | -| device | 表示Tensor存放位置 | 包含设备类型及可选设备号。MindSpore的支持方案如下:
1. 创建Tensor后,默认在CPU上,在执行算子的时候,会根据[set_device](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore/mindspore.set_device.html)自动拷贝到对应的device_target。
2. 如果想在创建了Tensor之后手动拷贝,也可以调用[Tensor.move_to](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore/Tensor/mindspore.Tensor.move_to.html)接口。 | +| device | 表示Tensor存放位置 | 包含设备类型及可选设备号。MindSpore的支持方案如下:
1. 创建Tensor后,默认在CPU上,在执行算子的时候,会根据[mindspore.set_device](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore/mindspore.set_device.html)自动拷贝到对应的device_target。
2. 如果想在创建了Tensor之后手动拷贝,也可以调用[mindspore.Tensor.move_to](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore/Tensor/mindspore.Tensor.move_to.html)接口。 | | requires_grad | 表示是否更新梯度 | MindSpore中可以通过`Parameter.requires_grad`控制 | | pin_memory | 表示是否使用锁页内存 | MindSpore目前无此机制 | | memory_format | 表示Tensor的内存格式 | MindSpore目前无此机制 | diff --git a/docs/msadapter/docs/source_zh_cn/note/pytorch_api_supporting_nn_functional.md b/docs/msadapter/docs/source_zh_cn/note/pytorch_api_supporting_nn_functional.md index 314c0334d15b6d8f76939d8a9615c29490e02b7d..3f2d6ae0e71ba428c77b68a29e1a487f78cf8793 100644 --- a/docs/msadapter/docs/source_zh_cn/note/pytorch_api_supporting_nn_functional.md +++ b/docs/msadapter/docs/source_zh_cn/note/pytorch_api_supporting_nn_functional.md @@ -10,7 +10,7 @@ |[conv2d](https://pytorch.org/docs/2.1/generated/torch.nn.functional.conv2d.html)|Beta|支持数据类型:bf16、fp16、fp32| |[conv3d](https://pytorch.org/docs/2.1/generated/torch.nn.functional.conv3d.html)|Not Support|N/A| |[conv_transpose1d](https://pytorch.org/docs/2.1/generated/torch.nn.functional.conv_transpose1d.html)|Not Support|N/A| -|[conv_transpose2d](https://pytorch.org/docs/2.1/generated/torch.nn.functional.conv_transpose2d.html)|Not Support|N/A| +|[conv_transpose2d](https://pytorch.org/docs/2.1/generated/torch.nn.functional.conv_transpose2d.html)|Demo|不支持CPU平台| |[conv_transpose3d](https://pytorch.org/docs/2.1/generated/torch.nn.functional.conv_transpose3d.html)|Not Support|N/A| |[unfold](https://pytorch.org/docs/2.1/generated/torch.nn.functional.unfold.html)|Stable|N/A| |[fold](https://pytorch.org/docs/2.1/generated/torch.nn.functional.fold.html)|Stable|N/A| @@ -103,7 +103,7 @@ |API名称|API状态|限制与说明| |-------|-------|---------| -|[dropout](https://pytorch.org/docs/2.1/generated/torch.nn.functional.dropout.html)|Beta|入参不支持inplace;支持数据类型:bf16、fp16、fp32| +|[dropout](https://pytorch.org/docs/2.1/generated/torch.nn.functional.dropout.html)|Stable|支持数据类型:bf16、fp16、fp32| |[alpha_dropout](https://pytorch.org/docs/2.1/generated/torch.nn.functional.alpha_dropout.html)|Not Support|N/A| |[feature_alpha_dropout](https://pytorch.org/docs/2.1/generated/torch.nn.functional.feature_alpha_dropout.html)|Not Support|N/A| |[dropout1d](https://pytorch.org/docs/2.1/generated/torch.nn.functional.dropout1d.html)|Not Support|N/A| diff --git a/docs/msadapter/docs/source_zh_cn/note/pytorch_api_supporting_tensor.md b/docs/msadapter/docs/source_zh_cn/note/pytorch_api_supporting_tensor.md index 882d477a7e40cf1259cc13bf1ed378a154bdab37..44c473b9d6039e7d9f6def6b64396b5e5d2e4ccd 100644 --- a/docs/msadapter/docs/source_zh_cn/note/pytorch_api_supporting_tensor.md +++ b/docs/msadapter/docs/source_zh_cn/note/pytorch_api_supporting_tensor.md @@ -145,7 +145,7 @@ |[Tensor.det](https://pytorch.org/docs/2.1/generated/torch.Tensor.det.html)|Not Support|N/A| |[Tensor.dense_dim](https://pytorch.org/docs/2.1/generated/torch.Tensor.dense_dim.html)|Not Support|N/A| |[Tensor.detach](https://pytorch.org/docs/2.1/generated/torch.Tensor.detach.html)|Not Support|N/A| -|[Tensor.detach_](https://pytorch.org/docs/2.1/generated/torch.Tensor.detach_.html)|Not Support|N/A| +|[Tensor.detach_](https://pytorch.org/docs/2.1/generated/torch.Tensor.detach_.html)|Demo|N/A| |[Tensor.diag](https://pytorch.org/docs/2.1/generated/torch.Tensor.diag.html)|Not Support|N/A| |[Tensor.diag_embed](https://pytorch.org/docs/2.1/generated/torch.Tensor.diag_embed.html)|Not Support|N/A| |[Tensor.diagflat](https://pytorch.org/docs/2.1/generated/torch.Tensor.diagflat.html)|Not Support|N/A| @@ -506,7 +506,7 
@@ |[Tensor.tril](https://pytorch.org/docs/2.1/generated/torch.Tensor.tril.html)|Not Support|N/A| |[Tensor.tril_](https://pytorch.org/docs/2.1/generated/torch.Tensor.tril_.html)|Not Support|N/A| |[Tensor.triu](https://pytorch.org/docs/2.1/generated/torch.Tensor.triu.html)|Not Support|N/A| -|[Tensor.triu_](https://pytorch.org/docs/2.1/generated/torch.Tensor.triu_.html)|Not Support|N/A| +|[Tensor.triu_](https://pytorch.org/docs/2.1/generated/torch.Tensor.triu_.html)|Demo|N/A| |[Tensor.true_divide](https://pytorch.org/docs/2.1/generated/torch.Tensor.true_divide.html)|Not Support|N/A| |[Tensor.true_divide_](https://pytorch.org/docs/2.1/generated/torch.Tensor.true_divide_.html)|Not Support|N/A| |[Tensor.trunc](https://pytorch.org/docs/2.1/generated/torch.Tensor.trunc.html)|Stable|支持数据类型:fp16、fp32| diff --git a/docs/msadapter/docs/source_zh_cn/note/pytorch_api_supporting_torch.md b/docs/msadapter/docs/source_zh_cn/note/pytorch_api_supporting_torch.md index 327cae0dfa56083f3a1b1f86e2459ccfa9d36787..57ebb37c18a2e5255f47d2db9d30e92f4e319931 100644 --- a/docs/msadapter/docs/source_zh_cn/note/pytorch_api_supporting_torch.md +++ b/docs/msadapter/docs/source_zh_cn/note/pytorch_api_supporting_torch.md @@ -58,6 +58,8 @@ |[torch.complex](https://pytorch.org/docs/2.1/generated/torch.complex.html)|Not Support|N/A| |[torch.polar](https://pytorch.org/docs/2.1/generated/torch.polar.html)|Beta|不支持out出参;支持数据类型:fp32| |[torch.heaviside](https://pytorch.org/docs/2.1/generated/torch.heaviside.html)|Beta|不支持out出参| +|[torch.linalg.norm](https://docs.pytorch.org/docs/2.1/generated/torch.linalg.norm.html)|Demo|N/A| +|[torch.linalg.vector_norm](https://docs.pytorch.org/docs/2.1/generated/torch.linalg.vector_norm.html)|Demo|不支持CPU平台| ### Indexing, Slicing, Joining, Mutation Ops @@ -256,7 +258,7 @@ |[torch.lerp](https://pytorch.org/docs/2.1/generated/torch.lerp.html)|Beta|不支持out出参;支持数据类型:支持fp16、fp32| |[torch.lgamma](https://pytorch.org/docs/2.1/generated/torch.lgamma.html)|Beta|不支持out出参| |[torch.log](https://pytorch.org/docs/2.1/generated/torch.log.html)|Stable|支持数据类型:bf16、fp16、fp32、fp64、uint8、int8、int16、int32、int64、bool| -|[torch.log10](https://pytorch.org/docs/2.1/generated/torch.log10.html)|Beta|不支持out出参;支持数据类型:bf16、fp16、fp32、fp64、uint8、int8、int16、int32、int64、bool| +|[torch.log10](https://pytorch.org/docs/2.1/generated/torch.log10.html)|Beta|支持数据类型:bf16、fp16、fp32、fp64、uint8、int8、int16、int32、int64、bool| |[torch.log1p](https://pytorch.org/docs/2.1/generated/torch.log1p.html)|Stable|支持数据类型:fp16、fp32、uint8、int8、int16、int32、int64、bool| |[torch.log2](https://pytorch.org/docs/2.1/generated/torch.log2.html)|Stable|支持数据类型:bf16、fp32、int64、bool、fp16| |[torch.logaddexp](https://pytorch.org/docs/2.1/generated/torch.logaddexp.html)|Not Support|N/A| @@ -274,6 +276,7 @@ |[torch.multiply](https://pytorch.org/docs/2.1/generated/torch.multiply.html)|Not Support|N/A| |[torch.mvlgamma](https://pytorch.org/docs/2.1/generated/torch.mvlgamma.html)|Beta|不支持out出参| |[torch.nan_to_num](https://pytorch.org/docs/2.1/generated/torch.nan_to_num.html)|Stable|支持数据类型:bf16、fp16、fp32、uint8、int8、int16、int32、int64、bool| +|[torch.nan_to_num_](https://pytorch.org/docs/2.1/generated/torch.nan_to_num.html)|Demo|支持数据类型:bf16、fp16、fp32、uint8、int8、int16、int32、int64、bool| |[torch.neg](https://pytorch.org/docs/2.1/generated/torch.neg.html)|Stable|支持数据类型:bf16、fp16、fp32、int8、int32、int64| |[torch.negative](https://pytorch.org/docs/2.1/generated/torch.negative.html)|Beta|不支持out出参;支持数据类型:bf16、fp16、fp32、int8、int32、int64| 
|[torch.nextafter](https://pytorch.org/docs/2.1/generated/torch.nextafter.html)|Beta|不支持out出参| @@ -401,7 +404,7 @@ |[torch.broadcast_to](https://pytorch.org/docs/2.1/generated/torch.broadcast_to.html)|Stable|支持数据类型:bf16、fp16、fp32、fp64、uint8、int8、int16、int32、int64、bool| |[torch.broadcast_shapes](https://pytorch.org/docs/2.1/generated/torch.broadcast_shapes.html)|Beta|N/A| |[torch.bucketize](https://pytorch.org/docs/2.1/generated/torch.bucketize.html)|Not Support|N/A| -|[torch.cartesian_prod](https://pytorch.org/docs/2.1/generated/torch.cartesian_prod.html)|Not Support|N/A| +|[torch.cartesian_prod](https://pytorch.org/docs/2.1/generated/torch.cartesian_prod.html)|Demo|N/A| |[torch.cdist](https://pytorch.org/docs/2.1/generated/torch.cdist.html)|Beta|N/A| |[torch.clone](https://pytorch.org/docs/2.1/generated/torch.clone.html)|Beta|入参不支持memory_format;支持数据类型:bf16、fp16、fp32、fp64、uint8、int8、int16、int32、int64、bool| |[torch.combinations](https://pytorch.org/docs/2.1/generated/torch.combinations.html)|Not Support|N/A| @@ -410,7 +413,7 @@ |[torch.cross](https://pytorch.org/docs/2.1/generated/torch.cross.html)|Not Support|N/A| |[torch.cummax](https://pytorch.org/docs/2.1/generated/torch.cummax.html)|Not Support|N/A| |[torch.cummin](https://pytorch.org/docs/2.1/generated/torch.cummin.html)|Not Support|N/A| -|[torch.cumprod](https://pytorch.org/docs/2.1/generated/torch.cumprod.html)|Not Support|N/A| +|[torch.cumprod](https://pytorch.org/docs/2.1/generated/torch.cumprod.html)|Stable|不支持bfloat16输入| |[torch.cumsum](https://pytorch.org/docs/2.1/generated/torch.cumsum.html)|Stable|支持数据类型:fp16、fp32、uint8、int8、int16、int32、int64、bool| |[torch.diag](https://pytorch.org/docs/2.1/generated/torch.diag.html)|Beta|入参不支持diagnoal、out;支持数据类型:fp16、fp32、uint8、int8、int16、int32、int64、bool| |[torch.diag_embed](https://pytorch.org/docs/2.1/generated/torch.diag_embed.html)|Not Support|N/A| diff --git a/docs/msadapter/docs/source_zh_cn/note/pytorch_api_supporting_torch_nn.md b/docs/msadapter/docs/source_zh_cn/note/pytorch_api_supporting_torch_nn.md index b1e6fcc8d6e7f1d9abf92ea37c7f5733ca1c8f82..17eb1a5a012fbefdc425a8ea97ab8190e6073e39 100644 --- a/docs/msadapter/docs/source_zh_cn/note/pytorch_api_supporting_torch_nn.md +++ b/docs/msadapter/docs/source_zh_cn/note/pytorch_api_supporting_torch_nn.md @@ -6,9 +6,9 @@ |API名称|API状态|限制与说明| |-------|-------|---------| -|[nn.Conv1d](https://pytorch.org/docs/2.1/generated/torch.nn.Conv1d.html)|Beta|入参不支持device;支持数据类型:fp16、fp32| -|[nn.Conv2d](https://pytorch.org/docs/2.1/generated/torch.nn.Conv2d.html)|Beta|入参不支持device;支持数据类型:bf16、fp16、fp32| -|[nn.Conv3d](https://pytorch.org/docs/2.1/generated/torch.nn.Conv3d.html)|Beta|入参不支持device| +|[nn.Conv1d](https://pytorch.org/docs/2.1/generated/torch.nn.Conv1d.html)|Beta|入参不支持device;不支持circular模式;replicate模式不支持bfloat16输入;支持数据类型:fp16、fp32| +|[nn.Conv2d](https://pytorch.org/docs/2.1/generated/torch.nn.Conv2d.html)|Beta|入参不支持device;不支持circular模式;replicate模式不支持bfloat16输入;支持数据类型:bf16、fp16、fp32| +|[nn.Conv3d](https://pytorch.org/docs/2.1/generated/torch.nn.Conv3d.html)|Beta|入参不支持device;不支持circular模式;replicate模式不支持bfloat16输入;| |[nn.ConvTranspose1d](https://pytorch.org/docs/2.1/generated/torch.nn.ConvTranspose1d.html)|Beta|不支持out出参、入参不支持device;支持数据类型:fp32| |[nn.ConvTranspose2d](https://pytorch.org/docs/2.1/generated/torch.nn.ConvTranspose2d.html)|Beta|不支持out出参、入参不支持device;支持数据类型:fp16、fp32| |[nn.ConvTranspose3d](https://pytorch.org/docs/2.1/generated/torch.nn.ConvTranspose3d.html)|Beta|不支持out出参、入参不支持device| @@ -26,12 +26,12 @@ |API名称|API状态|限制与说明| 
|-------|-------|---------| |[nn.MaxPool1d](https://pytorch.org/docs/2.1/generated/torch.nn.MaxPool1d.html)|Beta|N/A| -|[nn.MaxPool2d](https://pytorch.org/docs/2.1/generated/torch.nn.MaxPool2d.html)|Beta|N/A| +|[nn.MaxPool2d](https://pytorch.org/docs/2.1/generated/torch.nn.MaxPool2d.html)|Beta|只支持4维输入;不支持bfloat16类型输入| |[nn.MaxPool3d](https://pytorch.org/docs/2.1/generated/torch.nn.MaxPool3d.html)|Beta|N/A| |[nn.MaxUnpool1d](https://pytorch.org/docs/2.1/generated/torch.nn.MaxUnpool1d.html)|Not Support|N/A| |[nn.MaxUnpool2d](https://pytorch.org/docs/2.1/generated/torch.nn.MaxUnpool2d.html)|Not Support|N/A| |[nn.MaxUnpool3d](https://pytorch.org/docs/2.1/generated/torch.nn.MaxUnpool3d.html)|Not Support|N/A| -|[nn.AvgPool1d](https://pytorch.org/docs/2.1/generated/torch.nn.AvgPool1d.html)|Beta|支持数据类型:fp16、fp32| +|[nn.AvgPool1d](https://pytorch.org/docs/2.1/generated/torch.nn.AvgPool1d.html)|Stable|不支持CPU;支持数据类型:fp16、fp32| |[nn.AvgPool2d](https://pytorch.org/docs/2.1/generated/torch.nn.AvgPool2d.html)|Beta|支持数据类型:fp16、fp32| |[nn.AvgPool3d](https://pytorch.org/docs/2.1/generated/torch.nn.AvgPool3d.html)|Beta|N/A| |[nn.FractionalMaxPool2d](https://pytorch.org/docs/2.1/generated/torch.nn.FractionalMaxPool2d.html)|Not Support|N/A| @@ -88,7 +88,7 @@ |[nn.Sigmoid](https://pytorch.org/docs/2.1/generated/torch.nn.Sigmoid.html)|Beta|支持数据类型:bf16、fp16、fp32、uint8、int8、int16、int32、int64、bool| |[nn.SiLU](https://pytorch.org/docs/2.1/generated/torch.nn.SiLU.html)|Stable|支持数据类型:bf16、fp16、fp32| |[nn.Mish](https://pytorch.org/docs/2.1/generated/torch.nn.Mish.html)|Beta|支持数据类型:fp16、fp32| -|[nn.Softplus](https://pytorch.org/docs/2.1/generated/torch.nn.Softplus.html)|Beta|支持数据类型:fp16、fp32| +|[nn.Softplus](https://pytorch.org/docs/2.1/generated/torch.nn.Softplus.html)|Stable|支持数据类型:fp16、fp32| |[nn.Softshrink](https://pytorch.org/docs/2.1/generated/torch.nn.Softshrink.html)|Not Support|N/A| |[nn.Softsign](https://pytorch.org/docs/2.1/generated/torch.nn.Softsign.html)|Not Support|N/A| |[nn.Tanh](https://pytorch.org/docs/2.1/generated/torch.nn.Tanh.html)|Beta|支持数据类型:bf16、fp16、fp32、bool| @@ -101,7 +101,7 @@ |API名称|API状态|限制与说明| |-------|-------|---------| |[nn.Softmin](https://pytorch.org/docs/2.1/generated/torch.nn.Softmin.html)|Not Support|N/A| -|[nn.Softmax](https://pytorch.org/docs/2.1/generated/torch.nn.Softmax.html)|Beta|支持数据类型:bf16、fp16、fp32| +|[nn.Softmax](https://pytorch.org/docs/2.1/generated/torch.nn.Softmax.html)|Stable|支持数据类型:bf16、fp16、fp32| |[nn.Softmax2d](https://pytorch.org/docs/2.1/generated/torch.nn.Softmax2d.html)|Not Support|N/A| |[nn.LogSoftmax](https://pytorch.org/docs/2.1/generated/torch.nn.LogSoftmax.html)|Beta|支持数据类型:bf16、fp16、fp32| |[nn.AdaptiveLogSoftmaxWithLoss](https://pytorch.org/docs/2.1/generated/torch.nn.AdaptiveLogSoftmaxWithLoss.html)|Beta|入参不支持device| @@ -112,11 +112,11 @@ |-------|-------|---------| |[nn.BatchNorm1d](https://pytorch.org/docs/2.1/generated/torch.nn.BatchNorm1d.html)|Beta|入参不支持device;支持数据类型:fp16、fp32| |[nn.BatchNorm2d](https://pytorch.org/docs/2.1/generated/torch.nn.BatchNorm2d.html)|Beta|入参不支持device;支持数据类型:fp16、fp32| -|[nn.BatchNorm3d](https://pytorch.org/docs/2.1/generated/torch.nn.BatchNorm3d.html)|Beta|入参不支持device;支持数据类型:fp16、fp32| +|[nn.BatchNorm3d](https://pytorch.org/docs/2.1/generated/torch.nn.BatchNorm3d.html)|Stable|入参不支持device;支持数据类型:fp16、fp32| |[nn.LazyBatchNorm1d](https://pytorch.org/docs/2.1/generated/torch.nn.LazyBatchNorm1d.html)|Not Support|N/A| |[nn.LazyBatchNorm2d](https://pytorch.org/docs/2.1/generated/torch.nn.LazyBatchNorm2d.html)|Not Support|N/A| 
|[nn.LazyBatchNorm3d](https://pytorch.org/docs/2.1/generated/torch.nn.LazyBatchNorm3d.html)|Not Support|N/A| -|[nn.GroupNorm](https://pytorch.org/docs/2.1/generated/torch.nn.GroupNorm.html)|Not Support|N/A| +|[nn.GroupNorm](https://pytorch.org/docs/2.1/generated/torch.nn.GroupNorm.html)|Stable|入参不支持device| |[nn.SyncBatchNorm](https://pytorch.org/docs/2.1/generated/torch.nn.SyncBatchNorm.html)|Not Support|N/A| |[nn.InstanceNorm1d](https://pytorch.org/docs/2.1/generated/torch.nn.InstanceNorm1d.html)|Beta|N/A| |[nn.InstanceNorm2d](https://pytorch.org/docs/2.1/generated/torch.nn.InstanceNorm2d.html)|Beta|N/A| @@ -124,7 +124,7 @@ |[nn.LazyInstanceNorm1d](https://pytorch.org/docs/2.1/generated/torch.nn.LazyInstanceNorm1d.html)|Not Support|N/A| |[nn.LazyInstanceNorm2d](https://pytorch.org/docs/2.1/generated/torch.nn.LazyInstanceNorm2d.html)|Not Support|N/A| |[nn.LazyInstanceNorm3d](https://pytorch.org/docs/2.1/generated/torch.nn.LazyInstanceNorm3d.html)|Not Support|N/A| -|[nn.LayerNorm](https://pytorch.org/docs/2.1/generated/torch.nn.LayerNorm.html)|Stable|支持数据类型:bf16、fp16、fp32| +|[nn.LayerNorm](https://pytorch.org/docs/2.1/generated/torch.nn.LayerNorm.html)|Stable|入参不支持device;支持数据类型:bf16、fp16、fp32| |[nn.LocalResponseNorm](https://pytorch.org/docs/2.1/generated/torch.nn.LocalResponseNorm.html)|Not Support|N/A| |[nn.RMSNorm](https://pytorch.org/docs/2.1/generated/torch.nn.RMSNorm.html)|Not Support|N/A| @@ -154,8 +154,8 @@ |API名称|API状态|限制与说明| |-------|-------|---------| -|[nn.Identity](https://pytorch.org/docs/2.1/generated/torch.nn.Identity.html)|Stable|支持数据类型:fp32| -|[nn.Linear](https://pytorch.org/docs/2.1/generated/torch.nn.Linear.html)|Stable|支持数据类型:fp16、fp32| +|[nn.Identity](https://pytorch.org/docs/2.1/generated/torch.nn.Identity.html)|Stable|入参不支持device;支持数据类型:fp32| +|[nn.Linear](https://pytorch.org/docs/2.1/generated/torch.nn.Linear.html)|Stable|入参不支持device;支持数据类型:fp16、fp32| |[nn.Bilinear](https://pytorch.org/docs/2.1/generated/torch.nn.Bilinear.html)|Not Support|N/A| |[nn.LazyLinear](https://pytorch.org/docs/2.1/generated/torch.nn.LazyLinear.html)|Not Support|N/A| @@ -174,7 +174,7 @@ |API名称|API状态|限制与说明| |-------|-------|---------| -|[nn.Embedding](https://pytorch.org/docs/2.1/generated/torch.nn.Embedding.html)|Beta|支持数据类型:int32、int64| +|[nn.Embedding](https://pytorch.org/docs/2.1/generated/torch.nn.Embedding.html)|Beta|入参不支持device;支持数据类型:int32、int64| |[nn.EmbeddingBag](https://pytorch.org/docs/2.1/generated/torch.nn.EmbeddingBag.html)|Not Support|N/A| ## Distance Functions @@ -189,7 +189,7 @@ |API名称|API状态|限制与说明| |-------|-------|---------| |[nn.L1Loss](https://pytorch.org/docs/2.1/generated/torch.nn.L1Loss.html)|Beta|支持数据类型:fp16、fp32、int64| -|[nn.MSELoss](https://pytorch.org/docs/2.1/generated/torch.nn.MSELoss.html)|Beta|支持数据类型:fp16、fp32| +|[nn.MSELoss](https://pytorch.org/docs/2.1/generated/torch.nn.MSELoss.html)|Stable|支持数据类型:fp16、fp32| |[nn.CrossEntropyLoss](https://pytorch.org/docs/2.1/generated/torch.nn.CrossEntropyLoss.html)|Beta|支持数据类型:fp16、fp32| |[nn.CTCLoss](https://pytorch.org/docs/2.1/generated/torch.nn.CTCLoss.html)|Beta|支持数据类型:fp32| |[nn.NLLLoss](https://pytorch.org/docs/2.1/generated/torch.nn.NLLLoss.html)|Beta|支持数据类型:fp16、fp32| diff --git a/docs/vllm_mindspore/docs/source_en/getting_started/installation/installation.md b/docs/vllm_mindspore/docs/source_en/getting_started/installation/installation.md index 08f21d571859fcd30ee819916809b98483eb9cea..dd8fab4ee02cd942db14ba27514dccdc1a985204 100644 --- 
a/docs/vllm_mindspore/docs/source_en/getting_started/installation/installation.md +++ b/docs/vllm_mindspore/docs/source_en/getting_started/installation/installation.md @@ -4,8 +4,7 @@ This document describes the steps to install the vLLM MindSpore environment. Three installation methods are provided: -- [Docker Installation](#docker-installation): Suitable for quick deployment scenarios. -- [Pip Installation](#pip-installation): Suitable for scenarios requiring specific versions. +- [Docker Installation](#docker-installation): Suitable for quick deployment scenarios. - [Source Code Installation](#source-code-installation): Suitable for incremental development of vLLM MindSpore. ## Version Compatibility @@ -14,19 +13,19 @@ This document describes the steps to install the vLLM MindSpore environment. Thr - Python: 3.9 / 3.10 / 3.11 - Software version compatibility - | Software | Version | Corresponding Branch | - | -------- | ------- | -------------------- | - | [CANN](https://www.hiascend.com/developer/download/community/result?module=cann) | 8.1 | - | - | [MindSpore](https://www.mindspore.cn/install/) | 2.7 | master | - | [MSAdapter](https://git.openi.org.cn/OpenI/MSAdapter) | 0.2 | master | - | [MindSpore Transformers](https://gitee.com/mindspore/mindformers) | 1.6 | dev | - | [Golden Stick](https://gitee.com/mindspore/golden-stick) | 1.1.0 | r1.1.0 | - | [vLLM](https://github.com/vllm-project/vllm) | 0.8.3 | v0.8.3 | - | [vLLM MindSpore](https://gitee.com/mindspore/vllm-mindspore) | 0.2 | master | + | Software | Version And Links | + | ----- | ----- | + |[CANN](https://www.hiascend.com/developer/download/community/result?module=cann) | [8.1.RC1](https://www.hiascend.com/document/detail/zh/canncommercial/81RC1/softwareinst/instg/instg_0000.html?Mode=PmIns&InstallType=local&OS=Debian&Software=cannToolKit) | + |[MindSpore](https://www.mindspore.cn/install/) | [2.7.0](https://repo.mindspore.cn/mindspore/mindspore/version/202508/20250814/master_20250814091143_7548abc43af03319bfa528fc96d0ccd3917fcc9c_newest/unified/) | + |[MSAdapter](https://git.openi.org.cn/OpenI/MSAdapter)| [0.5.0](https://repo.mindspore.cn/mindspore/msadapter/version/202508/20250814/master_20250814010018_4615051c43eef898b6bbdc69768656493b5932f8_newest/any/) | + |[MindSpore Transformers](https://gitee.com/mindspore/mindformers)| [1.6.0](https://gitee.com/mindspore/mindformers) | + |[Golden Stick](https://gitee.com/mindspore/golden-stick)| [1.2.0](https://repo.mindspore.cn/mindspore/golden-stick/version/202508/20250814/master_20250814010017_2713821db982330b3bcd6d84d85a3b337d555f27_newest/any/) | + |[vLLM](https://github.com/vllm-project/vllm) | [0.9.1](https://repo.mindspore.cn/mirrors/vllm/version/202505/20250514/v0.8.4.dev0_newest/any/) | + |[vLLM MindSpore](https://gitee.com/mindspore/vllm-mindspore) | [0.3.0](https://gitee.com/mindspore/vllm-mindspore/) | ## Environment Setup -This section introduces three installation methods: [Docker Installation](#docker-installation), [Pip Installation](#pip-installation), [Source Code Installation](#source-code-installation), and [Quick Verification](#quick-verification) example to check the installation. +This section introduces two installation methods: [Docker Installation](#docker-installation), [Source Code Installation](#source-code-installation), and [Quick Verification](#quick-verification) example to check the installation. 
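Before choosing an installation method, it can help to confirm which of the components in the compatibility table are already present in the current Python environment. The snippet below is a small sketch; the distribution names are assumptions based on common pip package names and may differ for locally built wheels.

```python
# Quick check of installed component versions against the compatibility table.
# The distribution names below are assumptions; adjust them to match the wheels
# actually installed in your environment.
from importlib import metadata

for name in ["mindspore", "mindformers", "vllm", "vllm-mindspore", "msadapter", "mindspore_gs"]:
    try:
        print(f"{name}: {metadata.version(name)}")
    except metadata.PackageNotFoundError:
        print(f"{name}: not installed")
```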
### Docker Installation @@ -106,56 +105,109 @@ docker exec -it $DOCKER_NAME bash ### Source Code Installation -- **CANN Installation** +#### CANN Installation - For CANN installation methods and environment configuration, please refer to [CANN Community Edition Installation Guide](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha002/softwareinst/instg/instg_0001.html?Mode=PmIns&OS=openEuler&Software=cannToolKit). If you encounter any issues during CANN installation, please consult the [Ascend FAQ](https://www.hiascend.com/document/detail/zh/AscendFAQ/ProduTech/CANNFAQ/cannfaq_000.html) for troubleshooting. +For CANN installation methods and environment configuration, please refer to [CANN Community Edition Installation Guide](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha002/softwareinst/instg/instg_0001.html?Mode=PmIns&OS=openEuler&Software=cannToolKit). If you encounter any issues during CANN installation, please consult the [Ascend FAQ](https://www.hiascend.com/document/detail/zh/AscendFAQ/ProduTech/CANNFAQ/cannfaq_000.html) for troubleshooting. - The default installation path for CANN is `/usr/local/Ascend`. After completing CANN installation, configure the environment variables with the following commands: +The default installation path for CANN is `/usr/local/Ascend`. After completing CANN installation, configure the environment variables with the following commands: - ```bash - LOCAL_ASCEND=/usr/local/Ascend # the root directory of run package - source ${LOCAL_ASCEND}/ascend-toolkit/set_env.sh - export ASCEND_CUSTOM_PATH=${LOCAL_ASCEND}/ascend-toolkit - ``` +```bash +LOCAL_ASCEND=/usr/local/Ascend # the root directory of run package +source ${LOCAL_ASCEND}/ascend-toolkit/set_env.sh +export ASCEND_CUSTOM_PATH=${LOCAL_ASCEND}/ascend-toolkit +``` + +#### vLLM Prerequisites Installation + +For vLLM environment configuration and installation methods, please refer to the [vLLM Installation Guide](https://docs.vllm.ai/en/v0.9.1/getting_started/installation/cpu.html). In vllM installation, `gcc/g++ >= 12.3.0` is required, and it could be installed by the following command: + +```bash +yum install -y gcc gcc-c++ +``` + +#### vLLM MindSpore Installation + +vLLM MindSpore can be installed in the following two ways. **vLLM MindSpore One-click Installation** is suitable for scenarios where users need quick deployment and usage. **vLLM MindSpore Manual Installation** is suitable for scenarios where users require custom modifications to the components. + +- **vLLM MindSpore One-click Installation** + + To install vLLM MindSpore, user needs to pull the vLLM MindSpore source code and then runs the following command to install the dependencies: + + ```bash + git clone https://gitee.com/mindspore/vllm-mindspore.git + cd vllm-mindspore + bash install_depend_pkgs.sh + ``` + + Compile and install vLLM MindSpore: + + ```bash + pip install . + ``` + + After executing the above commands, `mindformers` folder will be generated in the `vllm-mindspore/install_depend_pkgs` directory. Add this folder to the environment variables: + + ```bash + export PYTHONPATH=$MF_PATH:$PYTHONPATH + ``` + +- **vLLM MindSpore Manual Installation** + + If user need to modify the components or use other versions, components need to be manually installed in a specific order. Version compatibility of vLLM MindSpore can be found [Version Compatibility](#version-compatibility), abd vLLM MindSpore requires the following installation sequence: + + 1. 
Install vLLM + + ```bash + pip install /path/to/vllm-*.whl + ``` + + 2. Uninstall Torch-related components + + ```bash + pip uninstall torch torch-npu torchvision torchaudio -y + ``` + + 3. Install MindSpore -- **vLLM Prerequisites Installation** + ```bash + pip install /path/to/mindspore-*.whl + ``` - For vLLM environment configuration and installation methods, please refer to the [vLLM Installation Guide](https://docs.vllm.ai/en/v0.8.3/getting_started/installation/cpu.html). In vllM installation, `gcc/g++ >= 12.3.0` is required, and it could be installed by the following command: + 4. Clone the MindSpore Transformers repository and add it to `PYTHONPATH` - ```bash - yum install -y gcc gcc-c++ - ``` + ```bash + git clone https://gitee.com/mindspore/mindformers.git + export PYTHONPATH=$MF_PATH:$PYTHONPATH + ``` -- **vLLM MindSpore Installation** + 5. Install Golden Stick - To install vLLM MindSpore, user needs to pull the vLLM MindSpore source code and then runs the following command to install the dependencies: + ```bash + pip install /path/to/mindspore_gs-*.whl + ``` - ```bash - git clone https://gitee.com/mindspore/vllm-mindspore.git - cd vllm-mindspore - bash install_depend_pkgs.sh - ``` + 6. Install MSAdapter - Compile and install vLLM MindSpore: + ```bash + pip install /path/to/msadapter-*.whl + ``` - ```bash - pip install . - ``` + 7. Install vLLM MindSpore - After executing the above commands, `mindformers` folder will be generated in the `vllm-mindspore/install_depend_pkgs` directory. Add this folder to the environment variables: + User needs to pull source of vLLM MindSpore, and run installation. - ```bash - export PYTHONPATH=$MF_PATH:$PYTHONPATH - ``` + ```bash + git clone https://gitee.com/mindspore/vllm-mindspore.git + cd vllm-mindspore + pip install . + ``` -### Quick Verification +## Quick Verification User can verify the installation with a simple offline inference test. First, user need to configure the environment variables with the following command: ```bash -export ASCEND_TOTAL_MEMORY_GB=64 # Please use `npu-smi info` to check the memory. export vLLM_MODEL_BACKEND=MindFormers # use MindSpore Transformers as model backend. -export vLLM_MODEL_MEMORY_USE_GB=32 # Memory reserved for model execution. Set according to the model's maximum usage, with the remaining environment used for kvcache allocation export MINDFORMERS_MODEL_CONFIG=$YAML_PATH # Set the corresponding MindSpore Transformers model's YAML file. ``` diff --git a/docs/vllm_mindspore/docs/source_en/getting_started/quick_start/quick_start.md b/docs/vllm_mindspore/docs/source_en/getting_started/quick_start/quick_start.md index c0eaf16c347299abc75285ffc1d57c50403d5fee..91a88e814de3e6d37ed6edb5b4a37615dc58ffac 100644 --- a/docs/vllm_mindspore/docs/source_en/getting_started/quick_start/quick_start.md +++ b/docs/vllm_mindspore/docs/source_en/getting_started/quick_start/quick_start.md @@ -131,18 +131,14 @@ git clone https://huggingface.co/Qwen/Qwen2.5-7B-Instruct Before launching the model, user need to set the following environment variables: ```bash -export ASCEND_TOTAL_MEMORY_GB=64 # Please use `npu-smi info` to check the memory. export vLLM_MODEL_BACKEND=MindFormers # use MindSpore Transformers as model backend. -export vLLM_MODEL_MEMORY_USE_GB=32 # Memory reserved for model execution. Set according to the model's maximum usage, with the remaining environment used for kvcache allocation. export MINDFORMERS_MODEL_CONFIG=$YAML_PATH # Set the corresponding MindSpore Transformers model's YAML file. 
``` Here is an explanation of these environment variables: -- `ASCEND_TOTAL_MEMORY_GB`: The memory size of each card. User can check the memory by using `npu-smi info`, where the value corresponds to `HBM-Usage(MB)` in the query results. - `vLLM_MODEL_BACKEND`: The backend of the model to run. User could find supported models and backends for vLLM MindSpore in the [Model Support List](../../user_guide/supported_models/models_list/models_list.md). -- `vLLM_MODEL_MEMORY_USE_GB`: The memory reserved for model loading. Adjust this value if insufficient memory error occurs during model loading. -- `MINDFORMERS_MODEL_CONFIG`: The model configuration file. +- `MINDFORMERS_MODEL_CONFIG`: The model configuration file. Users can find the corresponding YAML file in the [MindSpore Transformers repository](https://gitee.com/mindspore/mindformers/tree/master/research/qwen2_5). For Qwen2.5-7B, the YAML file is [predict_qwen2_5_7b_instruct.yaml](https://gitee.com/mindspore/mindformers/blob/master/research/qwen2_5/predict_qwen2_5_7b_instruct.yaml). Additionally, users need to ensure that MindSpore Transformers is installed. Users can add it by running the following command: @@ -202,7 +198,7 @@ Use the model `Qwen/Qwen2.5-7B-Instruct` and start the vLLM service with the fol python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "Qwen/Qwen2.5-7B-Instruct" ``` -If the service starts successfully, similar output will be obtained: +Users can also specify a local model path via the `--model` argument. If the service starts successfully, similar output will be obtained: ```text INFO: Started server process [6363] @@ -224,6 +220,8 @@ Use the following command to send a request, where `prompt` is the model input: curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "Qwen/Qwen2.5-7B-Instruct", "prompt": "I am", "max_tokens": 15, "temperature": 0}' ``` +Users need to ensure that the `"model"` field matches the `--model` argument used when starting the service, so that the request can be matched to the model. + If the request is processed successfully, the following inference result will be returned: ```text diff --git a/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/deepseek_parallel/deepseek_r1_671b_w8a8_dp4_tp4_ep4.md b/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/deepseek_parallel/deepseek_r1_671b_w8a8_dp4_tp4_ep4.md index 7a3a9fb4a83667a8d74de50b205d55a4a8a7a928..9c369446a24fd0f0cd955eeb0a369d4293266e07 100644 --- a/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/deepseek_parallel/deepseek_r1_671b_w8a8_dp4_tp4_ep4.md +++ b/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/deepseek_parallel/deepseek_r1_671b_w8a8_dp4_tp4_ep4.md @@ -52,8 +52,8 @@ Execute the following Python script to download the MindSpore-compatible DeepSee ```python from openmind_hub import snapshot_download -snapshot_download(repo_id="MindSpore-Lab/DeepSeek-R1-W8A8", - local_dir="/path/to/save/deepseek_r1_w8a8", +snapshot_download(repo_id="MindSpore-Lab/DeepSeek-R1-0528-A8W8", + local_dir="/path/to/save/deepseek_r1_0528_a8w8", local_dir_use_symlinks=False) ``` @@ -78,7 +78,7 @@ If the tool is unavailable, install [git-lfs](https://git-lfs.com) first. 
Refer Once confirmed, download the weights by executing the following command: ```shell -git clone https://modelers.cn/MindSpore-Lab/DeepSeek-R1-W8A8.git +git clone https://modelers.cn/models/MindSpore-Lab/DeepSeek-R1-0528-A8W8.git ``` ## TP16 Tensor Parallel Inference @@ -241,18 +241,20 @@ Execution example: ```bash # Master node: -vllm-mindspore serve --model="/path/to/save/deepseek_r1_w8a8" --trust-remote-code --max-num-seqs=256 --max_model_len=32768 --max-num-batched-tokens=4096 --block-size=128 --gpu-memory-utilization=0.9 --tensor-parallel-size 16 --distributed-executor-backend=ray +vllm-mindspore serve --model="MindSpore-Lab/DeepSeek-R1-0528-A8W8" --trust-remote-code --max-num-seqs=256 --max_model_len=32768 --max-num-batched-tokens=4096 --block-size=128 --gpu-memory-utilization=0.9 --tensor-parallel-size 16 --distributed-executor-backend=ray ``` -In tensor parallel scenarios, the `--tensor-parallel-size` parameter overrides the `model_parallel` configuration in the model YAML file. +In tensor parallel scenarios, the `--tensor-parallel-size` parameter overrides the `model_parallel` configuration in the model YAML file. User can also set the local model path by `--model` argument. #### Sending Requests Use the following command to send requests, where `prompt` is the model input: ```bash -curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "/path/to/save/deepseek_r1_w8a8", "prompt": "I am", "max_tokens": 20, "temperature": 0, "top_p": 1.0, "top_k": 1, "repetition_penalty": 1.0}' -``` +curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "MindSpore-Lab/DeepSeek-R1-0528-A8W8", "prompt": "I am", "max_tokens": 20, "temperature": 0, "top_p": 1.0, "top_k": 1, "repetition_penalty": 1.0}' +``` + +User needs to ensure that the `"model"` field matches the `--model` in the service startup, and the request can successfully match the model. ## Hybrid Parallel Inference @@ -301,6 +303,8 @@ parallel_config: ### Online Inference +#### Starting the Service + `vllm-mindspore` can deploy online inference using the OpenAI API protocol. Below is the workflow for launching the service: ```bash @@ -321,22 +325,24 @@ vllm-mindspore serve --data-parallel-address [Master node communication IP] --data-parallel-rpc-port [Master node communication port] --enable-expert-parallel # Enable expert parallelism -``` +``` -Execution example: +User can also set the local model path by `--model` argument. 
The following is an execution example: ```bash # Master node: -vllm-mindspore serve --model="/path/to/save/deepseek_r1_w8a8" --trust-remote-code --max-num-seqs=256 --max-model-len=32768 --max-num-batched-tokens=4096 --block-size=128 --gpu-memory-utilization=0.9 --tensor-parallel-size 4 --data-parallel-size 4 --data-parallel-size-local 2 --data-parallel-start-rank 0 --data-parallel-address 192.10.10.10 --data-parallel-rpc-port 12370 --enable-expert-parallel +vllm-mindspore serve --model="MindSpore-Lab/DeepSeek-R1-0528-A8W8" --trust-remote-code --max-num-seqs=256 --max-model-len=32768 --max-num-batched-tokens=4096 --block-size=128 --gpu-memory-utilization=0.9 --tensor-parallel-size 4 --data-parallel-size 4 --data-parallel-size-local 2 --data-parallel-start-rank 0 --data-parallel-address 192.10.10.10 --data-parallel-rpc-port 12370 --enable-expert-parallel # Worker node: -vllm-mindspore serve --headless --model="/path/to/save/deepseek_r1_w8a8" --trust-remote-code --max-num-seqs=256 --max-model-len=32768 --max-num-batched-tokens=4096 --block-size=128 --gpu-memory-utilization=0.9 --tensor-parallel-size 4 --data-parallel-size 4 --data-parallel-size-local 2 --data-parallel-start-rank 2 --data-parallel-address 192.10.10.10 --data-parallel-rpc-port 12370 --enable-expert-parallel -``` +vllm-mindspore serve --headless --model="MindSpore-Lab/DeepSeek-R1-0528-A8W8" --trust-remote-code --max-num-seqs=256 --max-model-len=32768 --max-num-batched-tokens=4096 --block-size=128 --gpu-memory-utilization=0.9 --tensor-parallel-size 4 --data-parallel-size 4 --data-parallel-size-local 2 --data-parallel-start-rank 2 --data-parallel-address 192.10.10.10 --data-parallel-rpc-port 12370 --enable-expert-parallel +``` -## Sending Requests +#### Sending Requests Use the following command to send requests, where `prompt` is the model input: ```bash -curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "/path/to/save/deepseek_r1_w8a8", "prompt": "I am", "max_tokens": 20, "temperature": 0}' +curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "MindSpore-Lab/DeepSeek-R1-0528-A8W8", "prompt": "I am", "max_tokens": 20, "temperature": 0}' ``` + +User needs to ensure that the `"model"` field matches the `--model` in the service startup, and the request can successfully match the model. diff --git a/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/qwen2.5_32b_multiNPU/qwen2.5_32b_multiNPU.md b/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/qwen2.5_32b_multiNPU/qwen2.5_32b_multiNPU.md index 24d4d4a2cac790dc5e9e8f5d5145266b896d32f5..40ad4a1597b1a469f47fbdcd06d036219c27b71a 100644 --- a/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/qwen2.5_32b_multiNPU/qwen2.5_32b_multiNPU.md +++ b/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/qwen2.5_32b_multiNPU/qwen2.5_32b_multiNPU.md @@ -127,18 +127,14 @@ For [Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct), the followi ```bash #set environment variables -export ASCEND_TOTAL_MEMORY_GB=64 # Use `npu-smi info` to check the memory. export vLLM_MODEL_BACKEND=MindFormers # Use MindSpore TransFormers as the model backend. -export vLLM_MODEL_MEMORY_USE_GB=32 # Memory reserved for model execution. Adjust based on the model's maximum usage, with the remaining allocated for KV cache. export MINDFORMERS_MODEL_CONFIG=$YAML_PATH # Set the corresponding MindSpore Transformers model YAML file. 
``` Here is an explanation of these environment variables: -- `ASCEND_TOTAL_MEMORY_GB`: The memory size of each compute card. Query using `npu-smi info`, corresponding to `HBM-Usage(MB)` in the results. - `vLLM_MODEL_BACKEND`: The model backend. Currently supported models and backends are listed in the [Model Support List](../../../user_guide/supported_models/models_list/models_list.md). -- `vLLM_MODEL_MEMORY_USE_GB`: Memory reserved for model loading. Adjust this if encountering insufficient memory. -- `MINDFORMERS_MODEL_CONFIG`: Model configuration file. User can find the corresponding YAML file in the [MindSpore Transformers repository](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/qwen2_5). For Qwen2.5-32B, the YAML file is [predict_qwen2_5_32b_instruct.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/qwen2_5/predict_qwen2_5_32b_instruct.yaml). +- `MINDFORMERS_MODEL_CONFIG`: Model configuration file. User can find the corresponding YAML file in the [MindSpore Transformers repository](https://gitee.com/mindspore/mindformers/tree/master/research/qwen2_5). For Qwen2.5-32B, the YAML file is [predict_qwen2_5_32b_instruct.yaml](https://gitee.com/mindspore/mindformers/blob/master/research/qwen2_5/predict_qwen2_5_32b_instruct.yaml). Users can check memory usage with `npu-smi info` and set the NPU cards for inference using the following example (assuming cards 4,5,6,7 are used): @@ -160,7 +156,7 @@ export MAX_MODEL_LEN=1024 python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "Qwen/Qwen2.5-32B-Instruct" --trust_remote_code --tensor-parallel-size $TENSOR_PARALLEL_SIZE --max-model-len $MAX_MODEL_LEN ``` -Here, `TENSOR_PARALLEL_SIZE` specifies the number of NPU cards, and `MAX_MODEL_LEN` sets the maximum output token length. +Here, `TENSOR_PARALLEL_SIZE` specifies the number of NPU cards, and `MAX_MODEL_LEN` sets the maximum output token length. User can also set the local model path by `--model` argument. If the service starts successfully, similar output will be obtained: @@ -181,9 +177,11 @@ Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 Use the following command to send a request, where `prompt` is the model input: ```bash -curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "Qwen2.5-32B-Instruct", "prompt": "I am", "max_tokens": 20, "temperature": 0}' +curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "Qwen/Qwen2.5-32B-Instruct", "prompt": "I am", "max_tokens": 20, "temperature": 0}' ``` +User needs to ensure that the `"model"` field matches the `--model` in the service startup, and the request can successfully match the model. 
+ If processed successfully, the inference result will be: ```text diff --git a/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md b/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md index 2c360e28f8010720b10792648cd758d2fc54acab..79ed73f81296beb24f42b95e2723377dcd116040 100644 --- a/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md +++ b/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md @@ -127,17 +127,13 @@ For [Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct), the following ```bash #set environment variables -export ASCEND_TOTAL_MEMORY_GB=64 # Please use `npu-smi info` to check the memory. export vLLM_MODEL_BACKEND=MindFormers # use MindSpore TransFormers as model backend. -export vLLM_MODEL_MEMORY_USE_GB=32 # Memory reserved for model execution. Set according to the model's maximum usage, with the remaining environment used for kvcache allocation export MINDFORMERS_MODEL_CONFIG=$YAML_PATH # Set the corresponding MindSpore Transformers model's YAML file. ``` Here is an explanation of these variables: -- `ASCEND_TOTAL_MEMORY_GB`: The memory size of each compute card. Query using `npu-smi info`, corresponding to `HBM-Usage(MB)` in the results. - `vLLM_MODEL_BACKEND`: The model backend. Currently supported models and backends are listed in the [Model Support List](../../../user_guide/supported_models/models_list/models_list.md). -- `vLLM_MODEL_MEMORY_USE_GB`: Memory reserved for model loading. Adjust this if encountering insufficient memory. - `MINDFORMERS_MODEL_CONFIG`: Model configuration file. User can find the corresponding YAML file in the [MindSpore Transformers repository](https://gitee.com/mindspore/mindformers/tree/master/research/qwen2_5). For Qwen2.5-7B, the YAML file is [predict_qwen2_5_7b_instruct.yaml](https://gitee.com/mindspore/mindformers/blob/master/research/qwen2_5/predict_qwen2_5_7b_instruct.yaml). User can check memory usage with `npu-smi info` and set the compute card for inference using: @@ -196,7 +192,7 @@ Use the model `Qwen/Qwen2.5-7B-Instruct` and start the vLLM service with the fol python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "Qwen/Qwen2.5-7B-Instruct" ``` -If the service starts successfully, similar output will be obtained: +User can also set the local model path by `--model` argument. If the service starts successfully, similar output will be obtained: ```text INFO: Started server process [6363] @@ -218,6 +214,8 @@ Use the following command to send a request, where `prompt` is the model input: curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "Qwen/Qwen2.5-7B-Instruct", "prompt": "I am", "max_tokens": 15, "temperature": 0}' ``` +User needs to ensure that the `"model"` field matches the `--model` in the service startup, and the request can successfully match the model. 
+ If the request is processed successfully, the following inference result will be returned: ```text diff --git a/docs/vllm_mindspore/docs/source_en/index.rst b/docs/vllm_mindspore/docs/source_en/index.rst index 5b4871aebf3664ec158da60e4779c8c016749431..3163af72f89ad7a5ef6f231b6240e1b7b6a63450 100644 --- a/docs/vllm_mindspore/docs/source_en/index.rst +++ b/docs/vllm_mindspore/docs/source_en/index.rst @@ -58,8 +58,8 @@ Branch ----------------------------------------------------- The vllm-mindspore repository contains the main branch, development branch, and version branches: -- **main**: the main branch, compatible with Mindspore master branch and vLLM v0.7.3 version, is continuously monitored for quality through Ascend-MindSpore CI. -- **develop**: the development branch for adapting vLLM features, which is forked from the main branch when a new vLLM version is released. Once the adapted features is stable, it will be merged into the main branch. The current development branch is adapting vLLM v0.8.3 version. +- **main**: the main branch, compatible with the MindSpore master branch and vLLM v0.8.3, is continuously monitored for quality through Ascend-MindSpore CI. +- **develop**: the development branch for adapting vLLM features, which is forked from the main branch when a new vLLM version is released. Once the adapted features are stable, they will be merged into the main branch. The current development branch is adapting vLLM v0.9.1. - **rX.Y.Z**: version branches used for archiving version release, which is forked from the main branch after the adaptation of a certain vLLM version is completed. The following are the version branches: @@ -72,10 +72,10 @@ The following are the version branches: - Notes * - master - Maintained - - Compatible with vLLM v0.7.3, and CI commitment for MindSpore master branch + - Compatible with vLLM v0.8.3, and CI commitment for MindSpore master branch * - develop - Maintained - - Compatible with vLLM v0.8.3 + - Compatible with vLLM v0.9.1 * - r0.1 - Unmaintained - Only doc fixed is allowed diff --git a/docs/vllm_mindspore/docs/source_en/release_notes/release_notes.md b/docs/vllm_mindspore/docs/source_en/release_notes/release_notes.md index e993a140143732e258dfdb7a73d07d61e226870b..4b22bc8bf9a260819deaeba8bf11defa4da00fd0 100644 --- a/docs/vllm_mindspore/docs/source_en/release_notes/release_notes.md +++ b/docs/vllm_mindspore/docs/source_en/release_notes/release_notes.md @@ -8,13 +8,14 @@ The following are the key new features and models supported in the vLLM MindSpor ### New Features -- 0.8.3 V1 Architecture Basic Features, including chunked prefill and automatic prefix caching; +- 0.9.1 V1 Architecture Basic Features, including chunked prefill and automatic prefix caching; - V0 Multi-step Scheduling; - V0 Chunked Prefill; - V0 Automatic Prefix Caching; - V0 DeepSeek MTP (Multi-Task Processing); - GPTQ Quantization; -- SmoothQuant Quantization. +- SmoothQuant Quantization; +- V1 Sampling Enhancements. 
### New Models diff --git a/docs/vllm_mindspore/docs/source_en/user_guide/environment_variables/environment_variables.md b/docs/vllm_mindspore/docs/source_en/user_guide/environment_variables/environment_variables.md index c1b71616260505fa7705916b0598ded2aa098674..036834798eb4e55ba82c5ff34667ff7e83b54e5f 100644 --- a/docs/vllm_mindspore/docs/source_en/user_guide/environment_variables/environment_variables.md +++ b/docs/vllm_mindspore/docs/source_en/user_guide/environment_variables/environment_variables.md @@ -11,6 +11,13 @@ | `HCCL_SOCKET_IFNAME` | Specifies the network interface name for inter-machine communication using HCCL. | String | Interface name (e.g., `enp189s0f0`). | Used in multi-machine scenarios. The interface name can be found via `ifconfig` by matching the IP address. | | `ASCEND_RT_VISIBLE_DEVICES` | Specifies which devices are visible to the current process, supporting one or multiple Device IDs. | String | Device IDs as a comma-separated string (e.g., `"0,1,2,3,4,5,6,7"`). | Recommended for Ray usage scenarios. | | `HCCL_BUFFSIZE` | Controls the buffer size for data sharing between two NPUs. | int | Buffer size in MB (e.g., `2048`). | Usage reference: [HCCL_BUFFSIZE](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/81RC1beta1/maintenref/envvar/envref_07_0080.html). Example: For DeepSeek hybrid parallelism (Data Parallel: 32, Expert Parallel: 32) with `max-num-batched-tokens=256`, set `export HCCL_BUFFSIZE=2048`. | -| MS_MEMPOOL_BLOCK_SIZE | Set the size of the memory pool block in PyNative mode for devices | String | String of positive number, and the unit is GB. | | -| vLLM_USE_NPU_ADV_STEP_FLASH_OP | Whether to use Ascend operation `adv_step_flash` | String | `on`: Use;`off`:Not use | If the variable is set to `off`, model will use the implement of small operations. | -| VLLM_TORCH_PROFILER_DIR | Enables profiling data collection and takes effect when a data save path is configured. | String | The path to save profiling data. | | +| `MS_MEMPOOL_BLOCK_SIZE` | Sets the size of the device memory pool block in PyNative mode. | String | A string of a positive number, in GB. | | +| `vLLM_USE_NPU_ADV_STEP_FLASH_OP` | Whether to use the Ascend operator `adv_step_flash`. | String | `on`: use; `off`: do not use. | If the variable is set to `off`, the model will fall back to an implementation built from smaller operators. | +| `VLLM_TORCH_PROFILER_DIR` | Enables profiling data collection and takes effect when a data save path is configured. | String | The path to save profiling data. 
| | + +More environment variable information can be found at the following links: + +- [CANN Environment Variable List](https://www.hiascend.com/document/detail/en/CANNCommunityEdition/81RC1beta1/index/index.html) +- [MindSpore Environment Variable List](https://www.mindspore.cn/docs/en/master/api_python/env_var_list.html) +- [MindSpore Transformers Environment Variable List](https://www.mindspore.cn/mindformers/docs/en/master/index.html) +- [vLLM Environment Variable List](https://docs.vllm.ai/en/v0.8.4/serving/env_vars.html) diff --git a/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/benchmark/benchmark.md b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/benchmark/benchmark.md index c486016bad58e965b06b9fe3cd4950322b72ae8a..9ab03ff8ac88bd2612a18421d204a026c63597f5 100644 --- a/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/benchmark/benchmark.md +++ b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/benchmark/benchmark.md @@ -9,9 +9,7 @@ The benchmark tool of vLLM MindSpore is inherited from vLLM. You can refer to th For single-card inference, we take [Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) as an example. You can prepare the environment by following the guide [Single-Card Inference (Qwen2.5-7B)](../../../getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md#online-inference), set the environment variables: ```bash -export ASCEND_TOTAL_MEMORY_GB=64 # Please use `npu-smi info` to check the memory. export vLLM_MODEL_BACKEND=MindFormers # use MindSpore Transformers as model backend. -export vLLM_MODEL_MEMORY_USE_GB=32 # Memory reserved for model execution. Set according to the model's maximum usage, with the remaining environment used for kvcache allocation export MINDFORMERS_MODEL_CONFIG=$YAML_PATH # Set the corresponding MindSpore Transformers model's YAML file. ``` @@ -40,7 +38,7 @@ INFO: Application startup complete. Clone the vLLM repository and import the vLLM MindSpore plugin to reuse the benchmark tools: ```bash -export VLLM_BRANCH=v0.8.3 +export VLLM_BRANCH=v0.9.1 git clone https://github.com/vllm-project/vllm.git -b ${VLLM_BRANCH} cd vllm sed -i '1i import vllm_mindspore' benchmarks/benchmark_serving.py @@ -104,16 +102,14 @@ P99 ITL (ms): .... For offline performance benchmark, take [Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) as an example. Prepare the environment by following the guide [Single-Card Inference (Qwen2.5-7B)](../../../getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md#offline-inference). User need to set the environment variables: ```bash -export ASCEND_TOTAL_MEMORY_GB=64 # Please use `npu-smi info` to check the memory. export vLLM_MODEL_BACKEND=MindFormers # use MindSpore Transformers as model backend. -export vLLM_MODEL_MEMORY_USE_GB=32 # Memory reserved for model execution. Set according to the model's maximum usage, with the remaining environment used for kvcache allocation export MINDFORMERS_MODEL_CONFIG=$YAML_PATH # Set the corresponding MindSpore Transformers model's YAML file. 
``` Clone the vLLM repository and import the vLLM-MindSpore plugin to reuse the benchmark tools: ```bash -export VLLM_BRANCH=v0.8.3 +export VLLM_BRANCH=v0.9.1 git clone https://github.com/vllm-project/vllm.git -b ${VLLM_BRANCH} cd vllm sed -i '1i import vllm_mindspore' benchmarks/benchmark_throughput.py diff --git a/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/features_list/features_list.md b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/features_list/features_list.md index a9225c4599e95855bbaf051aa41e73d3be694d7a..67530546706d489f1c2723054b5f2e40b6949558 100644 --- a/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/features_list/features_list.md +++ b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/features_list/features_list.md @@ -23,7 +23,7 @@ The following is the features supported in vLLM MindSpore. | Multi Modality | WIP | WIP | | Prompt adapter | × | WIP | | Speculative decoding | × | WIP | -| LogProbs | × | WIP | +| LogProbs | × | √ | | Prompt logProbs | × | WIP | | Best of | × | × | | Beam search | × | WIP | @@ -31,7 +31,7 @@ The following is the features supported in vLLM MindSpore. | Pooling | × | × | | Enc-dec | × | × | | Reasoning Outputs | √ | √ | -| Tool Calling | WIP | WIP | +| Tool Calling | WIP | √ | - √:Feature aligned with the community version of vLLM. - ×:Currently unsupported; alternative solutions are recommended. diff --git a/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/profiling/profiling.md b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/profiling/profiling.md index b24b541e59a4a4184de1e3619164b7951fcddd11..897f1ec0c848a9d7608a0100a6e4411989ffe5b8 100644 --- a/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/profiling/profiling.md +++ b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/profiling/profiling.md @@ -40,7 +40,7 @@ curl -X POST http://127.0.0.1:8000/start_profile curl http://localhost:8000/v1/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "/home/DeepSeekV3", + "model": "Qwen/Qwen2.5-32B-Instruct", "prompt": "San Francisco is a", "max_tokens": 7, "temperature": 0 diff --git a/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/quantization/quantization.md b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/quantization/quantization.md index 401768ca1af91c1442bee6a3b3d8960921fcea92..fa1b8f89c339d94737f7a9a5329e25c587691e8d 100644 --- a/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/quantization/quantization.md +++ b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/quantization/quantization.md @@ -16,7 +16,7 @@ We employ [MindSpore Golden Stick's PTQ algorithm](https://gitee.com/mindspore/g ### Downloading Quantized Weights -We have uploaded the quantized DeepSeek-R1 to [ModelArts Community](https://modelers.cn): [MindSpore-Lab/DeepSeek-R1-W8A8](https://modelers.cn/models/MindSpore-Lab/DeepSeek-R1-W8A8). Refer to the [ModelArts Community documentation](https://modelers.cn/docs/en/openmind-hub-client/0.9/basic_tutorial/download.html) to download the weights locally. +We have uploaded the quantized DeepSeek-R1 to [ModelArts Community](https://modelers.cn): [MindSpore-Lab/DeepSeek-R1-0528-A8W8](https://modelers.cn/models/MindSpore-Lab/DeepSeek-R1-0528-A8W8). Refer to the [ModelArts Community documentation](https://modelers.cn/docs/en/openmind-hub-client/0.9/basic_tutorial/download.html) to download the weights locally. 
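For a concrete reference, the weights can be fetched locally with the same Git LFS clone flow used in the DeepSeek parallel deployment tutorial; the target directory below is only a placeholder:

```bash
# Clone the quantized weights from the community repository (requires git-lfs).
git lfs install
git clone https://modelers.cn/models/MindSpore-Lab/DeepSeek-R1-0528-A8W8.git /path/to/save/deepseek_r1_0528_a8w8
```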
## Quantized Model Inference @@ -27,9 +27,7 @@ After obtaining the DeepSeek-R1 W8A8 weights, ensure they are stored in the rela Refer to the [Installation Guide](../../../getting_started/installation/installation.md) to set up the vLLM MindSpore environment. User need to set the following environment variables: ```bash -export ASCEND_TOTAL_MEMORY_GB=64 # Please use `npu-smi info` to check the memory. export vLLM_MODEL_BACKEND=MindFormers # use MindSpore Transformers as model backend. -export vLLM_MODEL_MEMORY_USE_GB=32 # Memory reserved for model execution. Set according to the model's maximum usage, with the remaining environment used for kvcache allocation export MINDFORMERS_MODEL_CONFIG=$YAML_PATH # Set the corresponding MindSpore Transformers model's YAML file. ``` diff --git a/docs/vllm_mindspore/docs/source_en/user_guide/supported_models/models_list/models_list.md b/docs/vllm_mindspore/docs/source_en/user_guide/supported_models/models_list/models_list.md index ba825bcae5a769b9217aae4ba8fb808e819b4f85..3d9b49bd6e3f3bd9a9f9bb1cd614cc201c8d1fa9 100644 --- a/docs/vllm_mindspore/docs/source_en/user_guide/supported_models/models_list/models_list.md +++ b/docs/vllm_mindspore/docs/source_en/user_guide/supported_models/models_list/models_list.md @@ -6,7 +6,7 @@ |-------| --------- | ---- | | DeepSeek-V3 | Supported | [DeepSeek-V3](https://modelers.cn/models/MindSpore-Lab/DeepSeek-V3) | | DeepSeek-R1 | Supported | [DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-V3) | -| DeepSeek-R1 W8A8 | Supported | [Deepseek-R1-W8A8](https://modelers.cn/models/MindSpore-Lab/DeepSeek-r1-w8a8) | +| DeepSeek-R1 W8A8 | Supported | [Deepseek-R1-W8A8](https://modelers.cn/models/MindSpore-Lab/DeepSeek-R1-0528-A8W8) | | Qwen2.5 | Supported | [Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct), [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct), [Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct), [Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct), [Qwen2.5-14B-Instruct](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct), [Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct), [Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) | | Qwen3-32B | Supported | [Qwen3-32B](https://modelers.cn/models/MindSpore-Lab/Qwen3-32B) | | Qwen3-235B-A22B | Supported | [Qwen3-235B-A22B](https://huggingface.co/Qwen/Qwen3-235B-A22B) | diff --git a/docs/vllm_mindspore/docs/source_zh_cn/getting_started/installation/installation.md b/docs/vllm_mindspore/docs/source_zh_cn/getting_started/installation/installation.md index cc164aecc15040173411f72cd0f826ae819b0eb7..1a442ef00ca09ab9b1095850bcd6b63e26147596 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/getting_started/installation/installation.md +++ b/docs/vllm_mindspore/docs/source_zh_cn/getting_started/installation/installation.md @@ -13,19 +13,19 @@ - Python:3.9 / 3.10 / 3.11 - 软件版本配套 - | 软件 | 版本 | 对应分支 | - | ----- | ----- | ----- | - |[CANN](https://www.hiascend.com/developer/download/community/result?module=cann) | 8.1 | - | - |[MindSpore](https://www.mindspore.cn/install/) | 2.7 | master | - |[MSAdapter](https://git.openi.org.cn/OpenI/MSAdapter)| 0.2 | master | - |[MindSpore Transformers](https://gitee.com/mindspore/mindformers)|1.6 | dev | - |[Golden Stick](https://gitee.com/mindspore/golden-stick)|1.1.0 | r1.1.0 | - |[vLLM](https://github.com/vllm-project/vllm) | 0.8.3 | v0.8.3 | - |[vLLM MindSpore](https://gitee.com/mindspore/vllm-mindspore) | 0.2 
| master | + | 软件 | 配套版本与下载链接 | + | ----- | ----- | + |[CANN](https://www.hiascend.com/developer/download/community/result?module=cann) | [8.1.RC1](https://www.hiascend.com/document/detail/zh/canncommercial/81RC1/softwareinst/instg/instg_0000.html?Mode=PmIns&InstallType=local&OS=Debian&Software=cannToolKit) | + |[MindSpore](https://www.mindspore.cn/install/) | [2.7.0](https://repo.mindspore.cn/mindspore/mindspore/version/202508/20250814/master_20250814091143_7548abc43af03319bfa528fc96d0ccd3917fcc9c_newest/unified/) | + |[MSAdapter](https://git.openi.org.cn/OpenI/MSAdapter)| [0.5.0](https://repo.mindspore.cn/mindspore/msadapter/version/202508/20250814/master_20250814010018_4615051c43eef898b6bbdc69768656493b5932f8_newest/any/) | + |[MindSpore Transformers](https://gitee.com/mindspore/mindformers)| [1.6.0](https://gitee.com/mindspore/mindformers) | + |[Golden Stick](https://gitee.com/mindspore/golden-stick)| [1.2.0](https://repo.mindspore.cn/mindspore/golden-stick/version/202508/20250814/master_20250814010017_2713821db982330b3bcd6d84d85a3b337d555f27_newest/any/) | + |[vLLM](https://github.com/vllm-project/vllm) | [0.9.1](https://repo.mindspore.cn/mirrors/vllm/version/202505/20250514/v0.8.4.dev0_newest/any/) | + |[vLLM MindSpore](https://gitee.com/mindspore/vllm-mindspore) | [0.3.0](https://gitee.com/mindspore/vllm-mindspore/) | ## 配置环境 -在本章节中,我们将介绍[docker安装](#docker安装)、[pip安装](#pip安装)、[源码安装](#源码安装)三种安装方式,以及[快速验证](#快速验证)用例,用于验证安装是否成功。 +在本章节中,我们将介绍[docker安装](#docker安装)、[源码安装](#源码安装)两种安装方式,以及[快速验证](#快速验证)用例,用于验证安装是否成功。 ### docker安装 @@ -105,29 +105,33 @@ docker exec -it $DOCKER_NAME bash ### 源码安装 -- **CANN安装** +#### CANN安装 - CANN安装方法与环境配套,请参考[CANN社区版软件安装](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha002/softwareinst/instg/instg_0001.html?Mode=PmIns&OS=openEuler&Software=cannToolKit),若用户在安装CANN过程中遇到问题,可参考[昇腾常见问题](https://www.hiascend.com/document/detail/zh/AscendFAQ/ProduTech/CANNFAQ/cannfaq_000.html)进行解决。 +CANN安装方法与环境配套,请参考[CANN社区版软件安装](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha002/softwareinst/instg/instg_0001.html?Mode=PmIns&OS=openEuler&Software=cannToolKit),若用户在安装CANN过程中遇到问题,可参考[昇腾常见问题](https://www.hiascend.com/document/detail/zh/AscendFAQ/ProduTech/CANNFAQ/cannfaq_000.html)进行解决。 - CANN默认安装路径为`/usr/local/Ascend`。用户在安装CANN完毕后,使用如下命令,为CANN配置环境变量: +CANN默认安装路径为`/usr/local/Ascend`。用户在安装CANN完毕后,使用如下命令,为CANN配置环境变量: - ```bash - LOCAL_ASCEND=/usr/local/Ascend # the root directory of run package - source ${LOCAL_ASCEND}/ascend-toolkit/set_env.sh - export ASCEND_CUSTOM_PATH=${LOCAL_ASCEND}/ascend-toolkit - ``` +```bash +LOCAL_ASCEND=/usr/local/Ascend # the root directory of run package +source ${LOCAL_ASCEND}/ascend-toolkit/set_env.sh +export ASCEND_CUSTOM_PATH=${LOCAL_ASCEND}/ascend-toolkit +``` -- **vLLM前置依赖安装** +#### vLLM前置依赖安装 - vLLM的环境配置与安装方法,请参考[vLLM安装教程](https://docs.vllm.ai/en/v0.8.3/getting_started/installation/cpu.html)。其依赖`gcc/g++ >= 12.3.0`版本,可通过以下命令完成安装: +vLLM的环境配置与安装方法,请参考[vLLM安装教程](https://docs.vllm.ai/en/v0.9.1/getting_started/installation/cpu.html)。其依赖`gcc/g++ >= 12.3.0`版本,可通过以下命令完成安装: - ```bash - yum install -y gcc gcc-c++ - ``` +```bash +yum install -y gcc gcc-c++ +``` + +#### vLLM MindSpore安装 -- **vLLM MindSpore安装** +vLLM MindSpore有以下两种安装方式。**vLLM MindSpore一键式安装**适用于用户快速使用与部署的场景。**vLLM MindSpore手动安装**适用于用户对组件有自定义修改的场景。 - 安装vLLM MindSpore,需要在拉取vLLM MindSpore源码后,执行以下命令,安装依赖包: +- **vLLM MindSpore一键式安装** + + 采用一键式安装脚本来安装vLLM MindSpore,需要在拉取vLLM MindSpore源码后,执行以下命令,安装依赖包: ```bash git clone 
https://gitee.com/mindspore/vllm-mindspore.git @@ -147,14 +151,63 @@ docker exec -it $DOCKER_NAME bash export PYTHONPATH=$MF_PATH:$PYTHONPATH ``` -### 快速验证 +- **vLLM MindSpore手动安装** + + 若用户对组件有修改,或者需使用其他版本,则用户需要按照特定顺序,手动安装组件。vLLM MindSpore软件配套下载地址可以参考[版本配套](#版本配套),且对组件的安装顺序要求如下: + + 1. 安装vLLM + + ```bash + pip install /path/to/vllm-*.whl + ``` + + 2. 卸载torch相关组件 + + ```bash + pip uninstall torch torch-npu torchvision torchaudio -y + ``` + + 3. 安装MindSpore + + ```bash + pip install /path/to/mindspore-*.whl + ``` + + 4. 引入MindSpore Transformers仓,加入到`PYTHONPATH`中 + + ```bash + git clone https://gitee.com/mindspore/mindformers.git + export PYTHONPATH=$MF_PATH:$PYTHONPATH + ``` + + 5. 安装Golden Stick + + ```bash + pip install /path/to/mindspore_gs-*.whl + ``` + + 6. 安装MSAdapter + + ```bash + pip install /path/to/msadapter-*.whl + ``` + + 7. 安装vLLM MindSpore + + 需要先拉取vLLM MindSpore源码,再执行安装 + + ```bash + git clone https://gitee.com/mindspore/vllm-mindspore.git + cd vllm-mindspore + pip install . + ``` + +## 快速验证 用户可以创建一个简单的离线推理场景,验证安装是否成功。下面以[Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) 为例。首先用户需要执行以下命令,设置环境变量: ```bash -export ASCEND_TOTAL_MEMORY_GB=64 # Please use `npu-smi info` to check the memory. export vLLM_MODEL_BACKEND=MindFormers # use MindSpore Transformers as model backend. -export vLLM_MODEL_MEMORY_USE_GB=32 # Memory reserved for model execution. Set according to the model's maximum usage, with the remaining environment used for kvcache allocation export MINDFORMERS_MODEL_CONFIG=$YAML_PATH # Set the corresponding MindSpore Transformers model's YAML file. ``` diff --git a/docs/vllm_mindspore/docs/source_zh_cn/getting_started/quick_start/quick_start.md b/docs/vllm_mindspore/docs/source_zh_cn/getting_started/quick_start/quick_start.md index 2bf629908a752b2c386f752e33355339b9ec6069..addd3951d0d120bf3d7a2e3e237b20ddad6a32d4 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/getting_started/quick_start/quick_start.md +++ b/docs/vllm_mindspore/docs/source_zh_cn/getting_started/quick_start/quick_start.md @@ -131,18 +131,14 @@ git clone https://huggingface.co/Qwen/Qwen2.5-7B-Instruct 用户在拉起模型前,需设置以下环境变量: ```bash -export ASCEND_TOTAL_MEMORY_GB=64 # Please use `npu-smi info` to check the memory. export vLLM_MODEL_BACKEND=MindFormers # use MindSpore Transformers as model backend. -export vLLM_MODEL_MEMORY_USE_GB=32 # Memory reserved for model execution. Set according to the model's maximum usage, with the remaining environment used for kvcache allocation export MINDFORMERS_MODEL_CONFIG=$YAML_PATH # Set the corresponding MindSpore Transformers model's YAML file. 
``` 以下是对上述环境变量的解释: -- `ASCEND_TOTAL_MEMORY_GB`: 每一张计算卡的显存大小。用户可使用`npu-smi info`命令进行查询,该值对应查询结果中的`HBM-Usage(MB)`; - `vLLM_MODEL_BACKEND`:所运行的模型后端。目前vLLM MindSpore所支持的模型与模型后端,可在[模型支持列表](../../user_guide/supported_models/models_list/models_list.md)中进行查询; -- `vLLM_MODEL_MEMORY_USE_GB`:模型加载时所用空间,根据用户所使用的模型进行设置。若用户在模型加载过程中遇到显存不足时,可适当增大该值并重试; -- `MINDFORMERS_MODEL_CONFIG`:模型配置文件。 +- `MINDFORMERS_MODEL_CONFIG`:模型配置文件。用户可以在[MindSpore Transformers工程](https://gitee.com/mindspore/mindformers/tree/master/research/qwen2_5)中,找到对应模型的yaml文件。以Qwen2.5-7B为例,则其yaml文件为[predict_qwen2_5_7b_instruct.yaml](https://gitee.com/mindspore/mindformers/blob/master/research/qwen2_5/predict_qwen2_5_7b_instruct.yaml)。 另外,用户需要确保MindSpore Transformers已安装。用户可通过 @@ -202,7 +198,7 @@ vLLM MindSpore可使用OpenAI的API协议,进行在线推理部署。以下是 python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "Qwen/Qwen2.5-7B-Instruct" ``` -若服务成功拉起,则可以获得类似的执行结果: +用户可以通过`--model`参数,指定模型保存的本地路径。若服务成功拉起,则可以获得类似的执行结果: ```text INFO: Started server process [6363] @@ -224,7 +220,7 @@ Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg gereration throughput: 0.0 curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "Qwen/Qwen2.5-7B-Instruct", "prompt": "I am", "max_tokens": 20, "temperature": 0}' ``` -若请求处理成功,将获得以下的推理结果: +其中,用户需确认`"model"`字段与启动服务中`--model`一致,请求才能成功匹配到模型。若请求处理成功,将获得以下推理结果: ```text { diff --git a/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/deepseek_parallel/deepseek_r1_671b_w8a8_dp4_tp4_ep4.md b/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/deepseek_parallel/deepseek_r1_671b_w8a8_dp4_tp4_ep4.md index 047e7b4aad2f9239abc18c111f8f576867c3a8be..813a0dd588763cd4c46b31331bec7e88edd158c5 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/deepseek_parallel/deepseek_r1_671b_w8a8_dp4_tp4_ep4.md +++ b/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/deepseek_parallel/deepseek_r1_671b_w8a8_dp4_tp4_ep4.md @@ -94,8 +94,8 @@ docker exec -it $DOCKER_NAME bash ```python from openmind_hub import snapshot_download -snapshot_download(repo_id="MindSpore-Lab/DeepSeek-R1-W8A8", - local_dir="/path/to/save/deepseek_r1_w8a8", +snapshot_download(repo_id="MindSpore-Lab/DeepSeek-R1-0528-A8W8", + local_dir="/path/to/save/deepseek_r1_0528_a8w8", local_dir_use_symlinks=False) ``` @@ -120,7 +120,7 @@ Git LFS initialized. 
工具确认可用后,执行以下命令,下载权重: ```shell -git clone https://modelers.cn/MindSpore-Lab/DeepSeek-R1-W8A8.git +git clone https://modelers.cn/models/MindSpore-Lab/DeepSeek-R1-0528-A8W8.git ``` ## TP16 张量并行推理 @@ -284,19 +284,21 @@ vllm-mindspore serve ```bash # 主节点: -vllm-mindspore serve --model="/path/to/save/deepseek_r1_w8a8" --trust-remote-code --max-num-seqs=256 --max_model_len=32768 --max-num-batched-tokens=4096 --block-size=128 --gpu-memory-utilization=0.9 --tensor-parallel-size 16 --distributed-executor-backend=ray +vllm-mindspore serve --model="MindSpore-Lab/DeepSeek-R1-0528-A8W8" --trust-remote-code --max-num-seqs=256 --max_model_len=32768 --max-num-batched-tokens=4096 --block-size=128 --gpu-memory-utilization=0.9 --tensor-parallel-size 16 --distributed-executor-backend=ray ``` -张量并行场景下,`--tensor-parallel-size`参数会覆盖模型yaml文件中`parallel_config`的`model_parallel`配置。 +张量并行场景下,`--tensor-parallel-size`参数会覆盖模型yaml文件中`parallel_config`的`model_parallel`配置。用户可以通过`--model`参数,指定模型保存的本地路径。 #### 发起请求 使用如下命令发送请求。其中`prompt`字段为模型输入: ```bash -curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "/path/to/save/deepseek_r1_w8a8", "prompt": "I am", "max_tokens": 20, "temperature": 0, "top_p": 1.0, "top_k": 1, "repetition_penalty": 1.0}' +curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "MindSpore-Lab/DeepSeek-R1-0528-A8W8", "prompt": "I am", "max_tokens": 20, "temperature": 0, "top_p": 1.0, "top_k": 1, "repetition_penalty": 1.0}' ``` +用户需确认`"model"`字段与启动服务中`--model`一致,请求才能成功匹配到模型。 + ## 混合并行推理 vLLM 通过 Ray 对多个节点资源进行管理和运行。该样例对应以下并行策略场景: @@ -344,6 +346,8 @@ parallel_config: ### 在线推理 +#### 启动服务 + `vllm-mindspore`可使用OpenAI的API协议部署在线推理。以下是在线推理的拉起流程: ```bash @@ -366,20 +370,22 @@ vllm-mindspore serve --enable-expert-parallel # 使能专家并行 ``` -执行示例: +用户可以通过`--model`参数,指定模型保存的本地路径。以下为执行示例: ```bash # 主节点: -vllm-mindspore serve --model="/path/to/save/deepseek_r1_w8a8" --trust-remote-code --max-num-seqs=256 --max-model-len=32768 --max-num-batched-tokens=4096 --block-size=128 --gpu-memory-utilization=0.9 --tensor-parallel-size 4 --data-parallel-size 4 --data-parallel-size-local 2 --data-parallel-start-rank 0 --data-parallel-address 192.10.10.10 --data-parallel-rpc-port 12370 --enable-expert-parallel +vllm-mindspore serve --model="MindSpore-Lab/DeepSeek-R1-0528-A8W8" --trust-remote-code --max-num-seqs=256 --max-model-len=32768 --max-num-batched-tokens=4096 --block-size=128 --gpu-memory-utilization=0.9 --tensor-parallel-size 4 --data-parallel-size 4 --data-parallel-size-local 2 --data-parallel-start-rank 0 --data-parallel-address 192.10.10.10 --data-parallel-rpc-port 12370 --enable-expert-parallel # 从节点: -vllm-mindspore serve --headless --model="/path/to/save/deepseek_r1_w8a8" --trust-remote-code --max-num-seqs=256 --max-model-len=32768 --max-num-batched-tokens=4096 --block-size=128 --gpu-memory-utilization=0.9 --tensor-parallel-size 4 --data-parallel-size 4 --data-parallel-size-local 2 --data-parallel-start-rank 2 --data-parallel-address 192.10.10.10 --data-parallel-rpc-port 12370 --enable-expert-parallel +vllm-mindspore serve --headless --model="MindSpore-Lab/DeepSeek-R1-0528-A8W8" --trust-remote-code --max-num-seqs=256 --max-model-len=32768 --max-num-batched-tokens=4096 --block-size=128 --gpu-memory-utilization=0.9 --tensor-parallel-size 4 --data-parallel-size 4 --data-parallel-size-local 2 --data-parallel-start-rank 2 --data-parallel-address 192.10.10.10 --data-parallel-rpc-port 12370 --enable-expert-parallel ``` -## 发送请求 +#### 发送请求 
使用如下命令发送请求。其中`prompt`字段为模型输入: ```bash -curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "/path/to/save/deepseek_r1_w8a8", "prompt": "I am, "max_tokens": 120, "temperature": 0}' +curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "MindSpore-Lab/DeepSeek-R1-0528-A8W8", "prompt": "I am", "max_tokens": 120, "temperature": 0}' ``` + +用户需确认`"model"`字段与启动服务中`--model`一致,请求才能成功匹配到模型。 diff --git a/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/qwen2.5_32b_multiNPU/qwen2.5_32b_multiNPU.md b/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/qwen2.5_32b_multiNPU/qwen2.5_32b_multiNPU.md index 811009f6471bea2dad284efdfad79184b3fed3a8..f0901e2e0f9c461a047645f91b508d6278605e03 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/qwen2.5_32b_multiNPU/qwen2.5_32b_multiNPU.md +++ b/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/qwen2.5_32b_multiNPU/qwen2.5_32b_multiNPU.md @@ -128,18 +128,14 @@ git clone https://huggingface.co/Qwen/Qwen2.5-32B-Instruct ```bash #set environment variables -export ASCEND_TOTAL_MEMORY_GB=64 # Please use `npu-smi info` to check the memory. export vLLM_MODEL_BACKEND=MindFormers # use MindSpore TransFormers as model backend. -export vLLM_MODEL_MEMORY_USE_GB=32 # Memory reserved for model execution. Set according to the model's maximum usage, with the remaining environment used for kvcache allocation export MINDFORMERS_MODEL_CONFIG=$YAML_PATH # Set the corresponding MindSpore Transformers model's YAML file. ``` 以下是对上述环境变量的解释: -- `ASCEND_TOTAL_MEMORY_GB`: 每一张计算卡的显存大小。用户可使用`npu-smi info`命令进行查询,该值对应查询结果中的`HBM-Usage(MB)`。 - `vLLM_MODEL_BACKEND`:所运行的模型后端。目前vLLM MindSpore所支持的模型与模型后端,可在[模型支持列表](../../../user_guide/supported_models/models_list/models_list.md)中进行查询。 -- `vLLM_MODEL_MEMORY_USE_GB`:模型加载时所用空间,根据用户所使用的模型进行设置。若用户在模型加载过程中遇到显存不足时,可适当增大该值并重试。 -- `MINDFORMERS_MODEL_CONFIG`:模型配置文件。用户可以在[MindSpore Transformers工程](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/qwen2_5)中,找到对应模型的yaml文件。以Qwen2.5-32B为例,则其yaml文件为[predict_qwen2_5_32b_instruct.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/qwen2_5/predict_qwen2_5_32b_instruct.yaml) 。 +- `MINDFORMERS_MODEL_CONFIG`:模型配置文件。用户可以在[MindSpore Transformers工程](https://gitee.com/mindspore/mindformers/tree/master/research/qwen2_5)中,找到对应模型的yaml文件。以Qwen2.5-32B为例,则其yaml文件为[predict_qwen2_5_32b_instruct.yaml](https://gitee.com/mindspore/mindformers/blob/master/research/qwen2_5/predict_qwen2_5_32b_instruct.yaml) 。 用户可通过`npu-smi info`查看显存占用情况,并可以使用如下环境变量,设置用于推理的计算卡。以下例子为假设用户使用4,5,6,7卡进行推理: @@ -163,7 +159,7 @@ python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model 其中,`TENSOR_PARALLEL_SIZE`为用户指定的卡数,`MAX_MODEL_LEN`为模型最大输出token数。 -若服务成功拉起,则可以获得类似的执行结果: +用户可以通过`--model`参数,指定模型保存的本地路径。若服务成功拉起,则可以获得类似的执行结果: ```text INFO: Started server process [6363] @@ -182,10 +178,10 @@ Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg gereration throughput: 0.0 使用如下命令发送请求。其中`prompt`字段为模型输入: ```bash -curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "Qwen2.5-32B-Instruct", "prompt": "I am", "max_tokens": 20, "temperature": 0}' +curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "Qwen/Qwen2.5-32B-Instruct", "prompt": "I am", "max_tokens": 20, "temperature": 0}' ``` -若请求处理成功,将获得以下的推理结果: +其中,用户需确认`"model"`字段与启动服务中`--model`一致,请求才能成功匹配到模型。若请求处理成功,将获得以下推理结果: ```text { diff --git 
a/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md b/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md index ffc82071b2e9a98e315268756ad2a48ec9e12246..c7d8426f6dc0c258addf291672a440d3fe93af07 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md +++ b/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md @@ -128,18 +128,14 @@ git clone https://huggingface.co/Qwen/Qwen2.5-7B-Instruct ```bash #set environment variables -export ASCEND_TOTAL_MEMORY_GB=64 # Please use `npu-smi info` to check the memory. export vLLM_MODEL_BACKEND=MindFormers # use MindSpore TransFormers as model backend. -export vLLM_MODEL_MEMORY_USE_GB=32 # Memory reserved for model execution. Set according to the model's maximum usage, with the remaining environment used for kvcache allocation export MINDFORMERS_MODEL_CONFIG=$YAML_PATH # Set the corresponding MindSpore Transformers model's YAML file. ``` 以下是对上述环境变量的解释: -- `ASCEND_TOTAL_MEMORY_GB`: 每一张计算卡的显存大小。用户可使用`npu-smi info`命令进行查询,该值对应查询结果中的`HBM-Usage(MB)`; - `vLLM_MODEL_BACKEND`:所运行的模型后端。目前vLLM MindSpore所支持的模型与模型后端,可在[模型支持列表](../../../user_guide/supported_models/models_list/models_list.md)中进行查询; -- `vLLM_MODEL_MEMORY_USE_GB`:模型加载时所用空间,根据用户所使用的模型进行设置。若用户在模型加载过程中遇到显存不足时,可适当增大该值并重试; -- `MINDFORMERS_MODEL_CONFIG`:模型配置文件。用户可以在[MindSpore Transformers工程](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/qwen2_5)中,找到对应模型的yaml文件。以Qwen2.5-7B为例,则其yaml文件为[predict_qwen2_5_7b_instruct.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/qwen2_5/predict_qwen2_5_7b_instruct.yaml) 。 +- `MINDFORMERS_MODEL_CONFIG`:模型配置文件。用户可以在[MindSpore Transformers工程](https://gitee.com/mindspore/mindformers/tree/master/research/qwen2_5)中,找到对应模型的yaml文件。以Qwen2.5-7B为例,则其yaml文件为[predict_qwen2_5_7b_instruct.yaml](https://gitee.com/mindspore/mindformers/blob/master/research/qwen2_5/predict_qwen2_5_7b_instruct.yaml) 。 用户可通过`npu-smi info`查看显存占用情况,并可以使用如下环境变量,设置用于推理的计算卡: @@ -198,7 +194,7 @@ vLLM MindSpore可使用OpenAI的API协议,部署为在线推理。以下是以 python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "Qwen/Qwen2.5-7B-Instruct" ``` -若服务成功拉起,则可以获得类似的执行结果: +用户可以通过`--model`参数,指定模型保存的本地路径。若服务成功拉起,则可以获得类似的执行结果: ```text INFO: Started server process [6363] @@ -220,7 +216,7 @@ Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg gereration throughput: 0.0 curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "Qwen/Qwen2.5-7B-Instruct", "prompt": "I am", "max_tokens": 20, "temperature": 0}' ``` -若请求处理成功,将获得以下的推理结果: +其中,用户需确认`"model"`字段与启动服务中`--model`一致,请求才能成功匹配到模型。若请求处理成功,将获得以下推理结果: ```text { diff --git a/docs/vllm_mindspore/docs/source_zh_cn/index.rst b/docs/vllm_mindspore/docs/source_zh_cn/index.rst index 3f8b00d7d1e61977358d664cb37c62280abc95b8..f465f8c121915e72eb11ae20eb3b4da8d0745a2f 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/index.rst +++ b/docs/vllm_mindspore/docs/source_zh_cn/index.rst @@ -58,8 +58,8 @@ vLLM MindSpore采用vLLM社区推荐的插件机制,实现能力注册。未 ----------------------------------------------------- vLLM MindSpore代码仓包含主干分支、开发分支、版本分支: -- **main**: 主干分支,与MindSpore master分支和vLLM v0.7.3版本配套,并通过昇腾+昇思CI持续进行质量看护; -- **develop**: 开发分支,在vLLM部分新版本发布时从主干分支拉出,用于开发适配vLLM的新功能特性。待特性适配稳定后合入主干分支。当前开发分支正在适配vLLM v0.8.3版本; +- **main**: 主干分支,与MindSpore master分支和vLLM v0.8.3版本配套,并通过昇腾+昇思CI持续进行质量看护; +- **develop**: 
开发分支,在vLLM部分新版本发布时从主干分支拉出,用于开发适配vLLM的新功能特性。待特性适配稳定后合入主干分支。当前开发分支正在适配vLLM v0.9.1版本; - **rX.Y.Z**: 版本分支,在完成vLLM某版本适配后,从主干分支拉出,用于正式版本发布归档。 下面是维护中的版本分支: @@ -72,10 +72,10 @@ vLLM MindSpore代码仓包含主干分支、开发分支、版本分支: - 备注 * - master - Maintained - - 基于vLLM v0.7.3版本和MindSpore master分支CI看护 + - 基于vLLM v0.8.3版本和MindSpore master分支CI看护 * - develop - Maintained - - 基于vLLM v0.8.3版本 + - 基于vLLM v0.9.1版本 * - r0.1 - Unmaintained - 仅允许文档修复 diff --git a/docs/vllm_mindspore/docs/source_zh_cn/release_notes/release_notes.md b/docs/vllm_mindspore/docs/source_zh_cn/release_notes/release_notes.md index 19fa32c0460881a7f5b6a5bd263b9cc189d6c4f5..f8bd01f5a0644758a34bf307f9bbc4c4413fa21c 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/release_notes/release_notes.md +++ b/docs/vllm_mindspore/docs/source_zh_cn/release_notes/release_notes.md @@ -8,13 +8,14 @@ ### 新特性 -- 0.8.3 V1架构基础功能, 包含分块预填充和自动前缀缓存功能; +- 0.9.1 V1架构基础功能, 包含分块预填充和自动前缀缓存功能; - V0 多步调度功能; - V0 分块预填充功能; - V0 自动前缀缓存功; - V0 DeepSeek MTP功能; - GPTQ量化; -- SmoothQuant量化。 +- SmoothQuant量化; +- V1 后处理增强。 ### 新模型 diff --git a/docs/vllm_mindspore/docs/source_zh_cn/user_guide/environment_variables/environment_variables.md b/docs/vllm_mindspore/docs/source_zh_cn/user_guide/environment_variables/environment_variables.md index 7fd53b3ff3ee7ca8084c66a5942184e2a1fdff73..b5e2aefd2d27953a0ec22dc8cd05a96117ae8bdd 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/user_guide/environment_variables/environment_variables.md +++ b/docs/vllm_mindspore/docs/source_zh_cn/user_guide/environment_variables/environment_variables.md @@ -4,13 +4,20 @@ | 环境变量 | 功能 | 类型 | 取值 | 说明 | | ------ | ------- | ------ | ------ | ------ | -| vLLM_MODEL_BACKEND | 用于指定模型后端。使用vLLM MindSpore原生模型后端时无需指定;使用模型为vLLM MindSpore外部后端时则需要指定。 | String | `MindFormers`: 模型后端为MindSpore Transformers。 | 原生模型后端当前支持Qwen2.5系列;MindSpore Transformers模型后端支持Qwen系列、DeepSeek、Llama系列模型,使用时需配置环境变量:`export PYTHONPATH=/path/to/mindformers/:$PYTHONPATH`。 | -| MINDFORMERS_MODEL_CONFIG | MindSpore Transformers模型的配置文件。使用Qwen2.5系列、DeepSeek系列模型时,需要配置文件路径。 | String | 模型配置文件路径。 | **该环境变量在后续版本会被移除。** 样例:`export MINDFORMERS_MODEL_CONFIG=/path/to/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b_w8a8.yaml`。 | -| GLOO_SOCKET_IFNAME | 用于多机之间使用gloo通信时的网口名称。 | String | 网口名称,例如enp189s0f0。 | 多机场景使用,可通过`ifconfig`查找ip对应网卡的网卡名。 | -| TP_SOCKET_IFNAME | 用于多机之间使用TP通信时的网口名称。 | String | 网口名称,例如enp189s0f0。 | 多机场景使用,可通过`ifconfig`查找ip对应网卡的网卡名。 | -| HCCL_SOCKET_IFNAME | 用于多机之间使用HCCL通信时的网口名称。 | String | 网口名称,例如enp189s0f0。 | 多机场景使用,可通过`ifconfig`查找ip对应网卡的网卡名。 | -| ASCEND_RT_VISIBLE_DEVICES | 指定哪些Device对当前进程可见,支持一次指定一个或多个Device ID。 | String | 为Device ID,逗号分割的字符串,例如"0,1,2,3,4,5,6,7"。 | ray使用场景建议使用。 | -| HCCL_BUFFSIZE | 此环境变量用于控制两个NPU之间共享数据的缓存区大小。 | int | 缓存区大小,大小为MB。例如:`2048`。 | 使用方法参考:[HCCL_BUFFSIZE](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/81RC1beta1/maintenref/envvar/envref_07_0080.html)。例如DeepSeek 混合并行(数据并行数为32,专家并行数为32),且`max-num-batched-tokens`为256时,则`export HCCL_BUFFSIZE=2048`。 | -| MS_MEMPOOL_BLOCK_SIZE | 设置PyNative模式下设备内存池的块大小。 | String | 正整数string,单位为GB。 | | -| vLLM_USE_NPU_ADV_STEP_FLASH_OP | 是否使用昇腾`adv_step_flash`算子。 | String | `on`: 使用;`off`:不使用 | 取值为`off`时,将使用小算子实现替代`adv_step_flash`算子。 | -| VLLM_TORCH_PROFILER_DIR | 开启profiling采集数据,当配置了采集数据保存路径后生效 | String | Profiling数据保存路径。| | +| `vLLM_MODEL_BACKEND` | 用于指定模型后端。使用vLLM MindSpore原生模型后端时无需指定;使用模型为vLLM MindSpore外部后端时则需要指定。 | String | `MindFormers`: 模型后端为MindSpore Transformers。 | 原生模型后端当前支持Qwen2.5系列;MindSpore Transformers模型后端支持Qwen系列、DeepSeek、Llama系列模型,使用时需配置环境变量:`export 
PYTHONPATH=/path/to/mindformers/:$PYTHONPATH`。 | +| `MINDFORMERS_MODEL_CONFIG` | MindSpore Transformers模型的配置文件。使用Qwen2.5系列、DeepSeek系列模型时,需要配置文件路径。 | String | 模型配置文件路径。 | **该环境变量在后续版本会被移除。** 样例:`export MINDFORMERS_MODEL_CONFIG=/path/to/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b_w8a8.yaml`。 | +| `GLOO_SOCKET_IFNAME` | 用于多机之间使用gloo通信时的网口名称。 | String | 网口名称,例如enp189s0f0。 | 多机场景使用,可通过`ifconfig`查找ip对应网卡的网卡名。 | +| `TP_SOCKET_IFNAME` | 用于多机之间使用TP通信时的网口名称。 | String | 网口名称,例如enp189s0f0。 | 多机场景使用,可通过`ifconfig`查找ip对应网卡的网卡名。 | +| `HCCL_SOCKET_IFNAME` | 用于多机之间使用HCCL通信时的网口名称。 | String | 网口名称,例如enp189s0f0。 | 多机场景使用,可通过`ifconfig`查找ip对应网卡的网卡名。 | +| `ASCEND_RT_VISIBLE_DEVICES` | 指定哪些Device对当前进程可见,支持一次指定一个或多个Device ID。 | String | 为Device ID,逗号分割的字符串,例如"0,1,2,3,4,5,6,7"。 | ray使用场景建议使用。 | +| `HCCL_BUFFSIZE` | 此环境变量用于控制两个NPU之间共享数据的缓存区大小。 | Integer | 缓存区大小,大小为MB。例如:`2048`。 | 使用方法参考:[HCCL_BUFFSIZE](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/81RC1beta1/maintenref/envvar/envref_07_0080.html)。例如DeepSeek 混合并行(数据并行数为32,专家并行数为32),且`max-num-batched-tokens`为256时,则`export HCCL_BUFFSIZE=2048`。 | +| `MS_MEMPOOL_BLOCK_SIZE` | 设置PyNative模式下设备内存池的块大小。 | String | 正整数string,单位为GB。 | | +| `vLLM_USE_NPU_ADV_STEP_FLASH_OP` | 是否使用昇腾`adv_step_flash`算子。 | String | `on`: 使用;`off`:不使用 | 取值为`off`时,将使用小算子实现替代`adv_step_flash`算子。 | +| `VLLM_TORCH_PROFILER_DIR` | 开启profiling采集数据,当配置了采集数据保存路径后生效 | String | Profiling数据保存路径。| | + +更多的环境变量信息,请查看: + +- [CANN 环境变量列表](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/81RC1beta1/index/index.html) +- [MindSpore 环境变量列表](https://www.mindspore.cn/docs/zh-CN/master/api_python/env_var_list.html) +- [MindSpore Transformers 环境变量列表](https://www.mindspore.cn/mindformers/docs/zh-CN/master/index.html) +- [vLLM 环境变量列表](https://docs.vllm.ai/en/v0.8.4/serving/env_vars.html) diff --git a/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/benchmark/benchmark.md b/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/benchmark/benchmark.md index 661dba5878f40992338580d93896393c5d418383..6d28a6ff7c60cb8c1677aeb7fe116534da81802a 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/benchmark/benchmark.md +++ b/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/benchmark/benchmark.md @@ -9,9 +9,7 @@ vLLM MindSpore的性能测试能力,继承自vLLM所提供的性能测试能 若用户使用单卡推理,以[Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct)为例,可按照文档[单卡推理(Qwen2.5-7B)](../../../getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md#在线推理)进行环境准备,设置以下环境变量: ```bash -export ASCEND_TOTAL_MEMORY_GB=64 # Please use `npu-smi info` to check the memory. export vLLM_MODEL_BACKEND=MindFormers # use MindSpore Transformers as model backend. -export vLLM_MODEL_MEMORY_USE_GB=32 # Memory reserved for model execution. Set according to the model's maximum usage, with the remaining environment used for kvcache allocation export MINDFORMERS_MODEL_CONFIG=$YAML_PATH # Set the corresponding MindSpore Transformers model's YAML file. ``` @@ -40,7 +38,7 @@ INFO: Application startup complete. 拉取vLLM代码仓,导入vLLM MindSpore插件,复用其中benchmark功能: ```bash -export VLLM_BRANCH=v0.8.3 +export VLLM_BRANCH=v0.9.1 git clone https://github.com/vllm-project/vllm.git -b ${VLLM_BRANCH} cd vllm sed -i '1i import vllm_mindspore' benchmarks/benchmark_serving.py @@ -104,16 +102,14 @@ P99 ITL (ms): .... 
用户使用离线性能测试时,以[Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct)为例,可按照文档[单卡推理(Qwen2.5-7B)](../../../getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md#离线推理)进行环境准备,设置以下环境变量: ```bash -export ASCEND_TOTAL_MEMORY_GB=64 # Please use `npu-smi info` to check the memory. export vLLM_MODEL_BACKEND=MindFormers # use MindSpore Transformers as model backend. -export vLLM_MODEL_MEMORY_USE_GB=32 # Memory reserved for model execution. Set according to the model's maximum usage, with the remaining environment used for kvcache allocation export MINDFORMERS_MODEL_CONFIG=$YAML_PATH # Set the corresponding MindSpore Transformers model's YAML file. ``` 并拉取vLLM代码仓,导入vLLM MindSpore插件,复用其中benchmark功能: ```bash -export VLLM_BRANCH=v0.8.3 +export VLLM_BRANCH=v0.9.1 git clone https://github.com/vllm-project/vllm.git -b ${VLLM_BRANCH} cd vllm sed -i '1i import vllm_mindspore' benchmarks/benchmark_throughput.py diff --git a/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/features_list/features_list.md b/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/features_list/features_list.md index 986e0f0822d61089de78600285970e69aa50dca4..910550afccb3eadebc898debd958050347363435 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/features_list/features_list.md +++ b/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/features_list/features_list.md @@ -23,7 +23,7 @@ vLLM MindSpore支持的特性功能与vLLM社区版本保持一致,特性描 | Multi Modality | WIP | WIP | | Prompt adapter | × | WIP | | Speculative decoding | × | WIP | -| LogProbs | × | WIP | +| LogProbs | × | √ | | Prompt logProbs | × | WIP | | Best of | × | × | | Beam search | × | WIP | @@ -31,7 +31,7 @@ vLLM MindSpore支持的特性功能与vLLM社区版本保持一致,特性描 | Pooling | × | × | | Enc-dec | × | × | | Reasoning Outputs | √ | √ | -| Tool Calling | WIP | WIP | +| Tool Calling | WIP | √ | - √:功能已与vLLM社区版本能力对齐。 - ×:暂无支持计划,建议使用其他方案代替。 diff --git a/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/profiling/profiling.md b/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/profiling/profiling.md index 4dc4f2ccee29c5c3d3842ba62e5381c4e01834d4..eb90283188cdae258793d3614d88431cd2063f0b 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/profiling/profiling.md +++ b/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/profiling/profiling.md @@ -40,7 +40,7 @@ curl -X POST http://127.0.0.1:8000/start_profile curl http://localhost:8000/v1/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "/home/DeepSeekV3", + "model": "Qwen/Qwen2.5-32B-Instruct", "prompt": "San Francisco is a", "max_tokens": 7, "temperature": 0 diff --git a/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/quantization/quantization.md b/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/quantization/quantization.md index 54ad35032dbdcb594ef4ae2b847beb5d41f701fe..22a83475eff16c8a311a7d7c6a12cc62dcb8f483 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/quantization/quantization.md +++ b/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/quantization/quantization.md @@ -16,7 +16,7 @@ ### 直接下载量化权重 -我们已经将量化好的DeepSeek-R1上传到[魔乐社区](https://modelers.cn):[MindSpore-Lab/DeepSeek-R1-W8A8](https://modelers.cn/models/MindSpore-Lab/DeepSeek-R1-W8A8),可以参考[魔乐社区文档](https://modelers.cn/docs/zh/openmind-hub-client/0.9/basic_tutorial/download.html)将权重下载到本地。 
+我们已经将量化好的DeepSeek-R1上传到[魔乐社区](https://modelers.cn):[MindSpore-Lab/DeepSeek-R1-0528-A8W8](https://modelers.cn/models/MindSpore-Lab/DeepSeek-R1-0528-A8W8),可以参考[魔乐社区文档](https://modelers.cn/docs/zh/openmind-hub-client/0.9/basic_tutorial/download.html)将权重下载到本地。 ## 量化模型推理 @@ -27,9 +27,7 @@ 用户可以参考[安装指南](../../../getting_started/installation/installation.md),进行vLLM MindSpore的环境搭建。用户需设置以下环境变量: ```bash -export ASCEND_TOTAL_MEMORY_GB=64 # Please use `npu-smi info` to check the memory. export vLLM_MODEL_BACKEND=MindFormers # use MindSpore Transformers as model backend. -export vLLM_MODEL_MEMORY_USE_GB=32 # Memory reserved for model execution. Set according to the model's maximum usage, with the remaining environment used for kvcache allocation export MINDFORMERS_MODEL_CONFIG=$YAML_PATH # Set the corresponding MindSpore Transformers model's YAML file. ``` diff --git a/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_models/models_list/models_list.md b/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_models/models_list/models_list.md index 2e504c0fecd265842d02a1474e0f77fdfe151eac..c64725c9e1e448999d189ec22aeab0942f012299 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_models/models_list/models_list.md +++ b/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_models/models_list/models_list.md @@ -6,7 +6,7 @@ |-------| --------- | ---- | | DeepSeek-V3 | 已支持 | [DeepSeek-V3](https://modelers.cn/models/MindSpore-Lab/DeepSeek-V3) | | DeepSeek-R1 | 已支持 | [DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-V3) | -| DeepSeek-R1 W8A8 | 已支持 | [Deepseek-R1-W8A8](https://modelers.cn/models/MindSpore-Lab/DeepSeek-r1-w8a8) | +| DeepSeek-R1 W8A8 | 已支持 | [Deepseek-R1-W8A8](https://modelers.cn/models/MindSpore-Lab/DeepSeek-R1-0528-A8W8) | | Qwen2.5 | 已支持 | [Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct)、[Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct)、[Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct)、 [Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct)、[Qwen2.5-14B-Instruct](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct)、[Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct)、[Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) | | Qwen3-32B | 已支持 | [Qwen3-32B](https://modelers.cn/models/MindSpore-Lab/Qwen3-32B) | | Qwen3-235B-A22B | 已支持 | [Qwen3-235B-A22B](https://huggingface.co/Qwen/Qwen3-235B-A22B) | diff --git a/install/mindspore_ascend_install_docker.md b/install/mindspore_ascend_install_docker.md index c3fd8493467f568b615483527c31990dafe47a5a..90f655628c3fe106b6e80301424d494043072337 100644 --- a/install/mindspore_ascend_install_docker.md +++ b/install/mindspore_ascend_install_docker.md @@ -166,7 +166,7 @@ print(ops.add(x, y)) ## 注意事项 -- 在非root用户模式下创建容器时,必须确保目标NPU设备未被其他非root容器占用。启动后可以执行 `npu-smi info` 命令验证设备状态,若目标NPU设备已被其他非root容器占用,则会出现以下报错,可以在创建容器时加上 `-u root`。 +- 在非root用户模式下创建容器时,必须确保目标NPU设备未被其他非root容器占用。启动后可以执行 `npu-smi info` 命令验证设备状态,若目标NPU设备已被其他非root容器占用,则会出现以下报错,可以在创建容器时加上 `-u root --privileged`。 ```text DrvMngGetConsoleLogLevel failed. 
(g_conLogLevel=3) diff --git a/install/mindspore_ascend_install_docker_en.md b/install/mindspore_ascend_install_docker_en.md index 9b6f6aae91a6da6a75ed407a0ec999e4946c0fc9..8b283309d1902dc8b365b085405a7108710f5a8c 100644 --- a/install/mindspore_ascend_install_docker_en.md +++ b/install/mindspore_ascend_install_docker_en.md @@ -165,7 +165,7 @@ When you need to update the MindSpore version: ## Notes -- When deploying containers in non-root user mode, it is essential to verify that the target NPU device is not occupied by other unprivileged containers. After startup, execute the `npu-smi` info command to check device status. If the target NPU device is already allocated to another non-root container, the following error will occur, You can add `-u root` when creating the container. +- When deploying containers in non-root user mode, it is essential to verify that the target NPU device is not occupied by other unprivileged containers. After startup, execute the `npu-smi` info command to check device status. If the target NPU device is already allocated to another non-root container, the following error will occur, You can add `-u root --privileged` when creating the container. ```text DrvMngGetConsoleLogLevel failed. (g_conLogLevel=3) diff --git a/install/mindspore_ascend_install_pip.md b/install/mindspore_ascend_install_pip.md index 64190af8c57ac7c1f30f2cd2e5c0909bf42eface..b2984b8263c70a7a55b5c558bcf539ec2c90d1ac 100644 --- a/install/mindspore_ascend_install_pip.md +++ b/install/mindspore_ascend_install_pip.md @@ -126,21 +126,10 @@ pip install /usr/local/Ascend/ascend-toolkit/latest/lib64/hccl-*-py3-none-any.wh export MS_VERSION=2.7.0 ``` -然后根据系统架构及Python版本,执行以下命令安装MindSpore。 +然后执行以下命令安装MindSpore。 ```bash -# x86_64 + Python3.9 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unified/x86_64/mindspore-${MS_VERSION/-/}-cp39-cp39-linux_x86_64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# x86_64 + Python3.10 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unified/x86_64/mindspore-${MS_VERSION/-/}-cp310-cp310-linux_x86_64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# x86_64 + Python3.11 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unified/x86_64/mindspore-${MS_VERSION/-/}-cp311-cp311-linux_x86_64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# aarch64 + Python3.9 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unified/aarch64/mindspore-${MS_VERSION/-/}-cp39-cp39-linux_aarch64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# aarch64 + Python3.10 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unified/aarch64/mindspore-${MS_VERSION/-/}-cp310-cp310-linux_aarch64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# aarch64 + Python3.11 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unified/aarch64/mindspore-${MS_VERSION/-/}-cp311-cp311-linux_aarch64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ +pip install 
mindspore==${MS_VERSION} -i https://repo.mindspore.cn/pypi/simple --trusted-host repo.mindspore.cn --extra-index-url https://repo.huaweicloud.com/repository/pypi/simple/ ``` 在联网状态下,安装whl包时会自动下载MindSpore安装包的依赖项(依赖项详情参见[setup.py](https://gitee.com/mindspore/mindspore/blob/master/setup.py)中的required_package),其余情况需自行安装依赖。 diff --git a/install/mindspore_ascend_install_pip_en.md b/install/mindspore_ascend_install_pip_en.md index c781d475c04a0c5cc3ae09ff9723af32777b9045..7cfa65c7220402c5ae32f68113a79ab26995dd8c 100644 --- a/install/mindspore_ascend_install_pip_en.md +++ b/install/mindspore_ascend_install_pip_en.md @@ -126,21 +126,10 @@ First, refer to [Version List](https://www.mindspore.cn/versions) to select the export MS_VERSION=2.7.0 ``` -Then run the following commands to install MindSpore according to the system architecture and Python version. +Then run the following command to install MindSpore. ```bash -# x86_64 + Python3.9 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unified/x86_64/mindspore-${MS_VERSION/-/}-cp39-cp39-linux_x86_64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# x86_64 + Python3.10 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unified/x86_64/mindspore-${MS_VERSION/-/}-cp310-cp310-linux_x86_64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# x86_64 + Python3.11 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unified/x86_64/mindspore-${MS_VERSION/-/}-cp311-cp311-linux_x86_64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# aarch64 + Python3.9 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unified/aarch64/mindspore-${MS_VERSION/-/}-cp39-cp39-linux_aarch64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# aarch64 + Python3.10 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unified/aarch64/mindspore-${MS_VERSION/-/}-cp310-cp310-linux_aarch64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# aarch64 + Python3.11 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unified/aarch64/mindspore-${MS_VERSION/-/}-cp311-cp311-linux_aarch64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ +pip install mindspore==${MS_VERSION} -i https://repo.mindspore.cn/pypi/simple --trusted-host repo.mindspore.cn --extra-index-url https://repo.huaweicloud.com/repository/pypi/simple/ ``` When the network is connected, dependencies of MindSpore are automatically downloaded during the .whl package installation. (For details about the dependency, see required_package in [setup.py](https://gitee.com/mindspore/mindspore/blob/master/setup.py)). In other cases, you need to install dependencies by yourself. 
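A quick functional check right after the simplified pip command above can catch a broken Ascend environment early. The following is a minimal illustrative sketch, assuming the `mindspore.run_check()` and `mindspore.set_device()` helpers available in recent MindSpore releases:

```python
# Minimal post-install smoke test for an Ascend environment (illustrative sketch).
import mindspore

# Select the Ascend backend; on older releases,
# mindspore.set_context(device_target="Ascend") serves the same purpose.
mindspore.set_device("Ascend")

# Prints the installed MindSpore version and runs a small computation
# to confirm that the package imports and executes correctly.
mindspore.run_check()
```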
diff --git a/install/mindspore_cpu_install_pip.md b/install/mindspore_cpu_install_pip.md index 59256cb36695c434b43f79339d75d4c944feee90..051922baa5c924c9087d37b37ad417bff5bcfe5b 100644 --- a/install/mindspore_cpu_install_pip.md +++ b/install/mindspore_cpu_install_pip.md @@ -89,21 +89,10 @@ sudo apt-get install gcc-9 -y export MS_VERSION=2.7.0 ``` -然后根据系统架构及Python版本,执行以下命令安装MindSpore。 +然后执行以下命令安装MindSpore。 ```bash -# x86_64 + Python3.9 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unified/x86_64/mindspore-${MS_VERSION/-/}-cp39-cp39-linux_x86_64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# x86_64 + Python3.10 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unified/x86_64/mindspore-${MS_VERSION/-/}-cp310-cp310-linux_x86_64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# x86_64 + Python3.11 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unified/x86_64/mindspore-${MS_VERSION/-/}-cp311-cp311-linux_x86_64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# aarch64 + Python3.9 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unified/aarch64/mindspore-${MS_VERSION/-/}-cp39-cp39-linux_aarch64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# aarch64 + Python3.10 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unified/aarch64/mindspore-${MS_VERSION/-/}-cp310-cp310-linux_aarch64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# aarch64 + Python3.11 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unified/aarch64/mindspore-${MS_VERSION/-/}-cp311-cp311-linux_aarch64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ +pip install mindspore==${MS_VERSION} -i https://repo.mindspore.cn/pypi/simple --trusted-host repo.mindspore.cn --extra-index-url https://repo.huaweicloud.com/repository/pypi/simple/ ``` 在联网状态下,安装whl包时会自动下载mindspore安装包的依赖项(依赖项详情参见[setup.py](https://gitee.com/mindspore/mindspore/blob/master/setup.py)中的required_package),其余情况需自行安装依赖。 diff --git a/install/mindspore_cpu_install_pip_en.md b/install/mindspore_cpu_install_pip_en.md index 32d43b5fedde9925923565232bfde782f4975e12..edbbfd0ba47a85d2037d55b68377f0509db221e5 100644 --- a/install/mindspore_cpu_install_pip_en.md +++ b/install/mindspore_cpu_install_pip_en.md @@ -89,21 +89,10 @@ First, refer to [Version List](https://www.mindspore.cn/versions) to select the export MS_VERSION=2.7.0 ``` -Then run the following commands to install MindSpore according to the system architecture and Python version. +Then run the following command to install MindSpore. 
```bash -# x86_64 + Python3.9 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unified/x86_64/mindspore-${MS_VERSION/-/}-cp39-cp39-linux_x86_64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# x86_64 + Python3.10 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unified/x86_64/mindspore-${MS_VERSION/-/}-cp310-cp310-linux_x86_64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# x86_64 + Python3.11 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unified/x86_64/mindspore-${MS_VERSION/-/}-cp311-cp311-linux_x86_64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# aarch64 + Python3.9 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unified/aarch64/mindspore-${MS_VERSION/-/}-cp39-cp39-linux_aarch64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# aarch64 + Python3.10 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unified/aarch64/mindspore-${MS_VERSION/-/}-cp310-cp310-linux_aarch64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# aarch64 + Python3.11 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unified/aarch64/mindspore-${MS_VERSION/-/}-cp311-cp311-linux_aarch64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ +pip install mindspore==${MS_VERSION} -i https://repo.mindspore.cn/pypi/simple --trusted-host repo.mindspore.cn --extra-index-url https://repo.huaweicloud.com/repository/pypi/simple/ ``` When the network is connected, dependency items are automatically downloaded during .whl package installation. (For details about the dependency, see required_package in [setup.py](https://gitee.com/mindspore/mindspore/blob/master/setup.py)). In other cases, you need to install dependency by yourself. 
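Because the simplified command pins the release through `MS_VERSION`, it can be worth confirming that pip actually resolved that version rather than a cached or pre-existing installation. A minimal sketch using only the Python standard library (assumes Python 3.8 or later):

```python
# Confirm that the installed MindSpore distribution matches the pinned MS_VERSION.
from importlib.metadata import version

installed = version("mindspore")
print(f"Installed mindspore version: {installed}")  # expected to match MS_VERSION, e.g. 2.7.0
```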
diff --git a/install/mindspore_cpu_mac_install_pip.md b/install/mindspore_cpu_mac_install_pip.md index 6670ae351c5542ac7d6173bd215bc38a07ef4e9e..a1692cf1e28f28858cefc44959e7a18d8daa0afe 100644 --- a/install/mindspore_cpu_mac_install_pip.md +++ b/install/mindspore_cpu_mac_install_pip.md @@ -50,24 +50,13 @@ export MS_VERSION=2.7.0 ``` -然后根据系统架构及Python版本,执行以下命令安装MindSpore。 +然后执行以下命令安装MindSpore。 ```bash # install prerequisites conda install scipy -c conda-forge -# x86_64 + Python3.9 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/cpu/x86_64/mindspore-${MS_VERSION/-/}-cp39-cp39-macosx_10_15_x86_64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# x86_64 + Python3.10 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/cpu/x86_64/mindspore-${MS_VERSION/-/}-cp310-cp310-macosx_10_15_x86_64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# x86_64 + Python3.11 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/cpu/x86_64/mindspore-${MS_VERSION/-/}-cp311-cp311-macosx_10_15_x86_64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# aarch64 + Python3.9 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/cpu/aarch64/mindspore-${MS_VERSION/-/}-cp39-cp39-macosx_11_0_arm64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# aarch64 + Python3.10 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/cpu/aarch64/mindspore-${MS_VERSION/-/}-cp310-cp310-macosx_11_0_arm64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# aarch64 + Python3.11 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/cpu/aarch64/mindspore-${MS_VERSION/-/}-cp311-cp311-macosx_11_0_arm64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ +pip install mindspore==${MS_VERSION} -i https://repo.mindspore.cn/pypi/simple --trusted-host repo.mindspore.cn --extra-index-url https://repo.huaweicloud.com/repository/pypi/simple/ ``` 在联网状态下,安装whl包时会自动下载mindspore安装包的依赖项(依赖项详情参见[setup.py](https://gitee.com/mindspore/mindspore/blob/master/setup.py)中的required_package),其余情况需自行安装依赖。 diff --git a/install/mindspore_cpu_mac_install_pip_en.md b/install/mindspore_cpu_mac_install_pip_en.md index 1c20b381e4649833722ee84e468609c318f41527..d3a8140d9d012e4900fc5525e1c75a750f1341c8 100644 --- a/install/mindspore_cpu_mac_install_pip_en.md +++ b/install/mindspore_cpu_mac_install_pip_en.md @@ -50,24 +50,13 @@ First, refer to [Version List](https://www.mindspore.cn/versions) to select the export MS_VERSION=2.7.0 ``` -Then run the following commands to install MindSpore according to the system architecture and Python version. +Then run the following command to install MindSpore. 
```bash # install prerequisites conda install scipy -c conda-forge -# x86_64 + Python3.9 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/cpu/x86_64/mindspore-${MS_VERSION/-/}-cp39-cp39-macosx_10_15_x86_64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# x86_64 + Python3.10 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/cpu/x86_64/mindspore-${MS_VERSION/-/}-cp310-cp310-macosx_10_15_x86_64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# x86_64 + Python3.11 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/cpu/x86_64/mindspore-${MS_VERSION/-/}-cp311-cp311-macosx_10_15_x86_64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# aarch64 + Python3.9 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/cpu/aarch64/mindspore-${MS_VERSION/-/}-cp39-cp39-macosx_11_0_arm64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# aarch64 + Python3.10 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/cpu/aarch64/mindspore-${MS_VERSION/-/}-cp310-cp310-macosx_11_0_arm64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# aarch64 + Python3.11 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/cpu/aarch64/mindspore-${MS_VERSION/-/}-cp311-cp311-macosx_11_0_arm64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ +pip install mindspore==${MS_VERSION} -i https://repo.mindspore.cn/pypi/simple --trusted-host repo.mindspore.cn --extra-index-url https://repo.huaweicloud.com/repository/pypi/simple/ ``` When the network is connected, dependencies of MindSpore are automatically downloaded during the .whl package installation. For details about dependencies, see required_package in the [setup.py](https://gitee.com/mindspore/mindspore/blob/master/setup.py). In other cases, install the dependencies by yourself. 
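On macOS, both x86_64 and arm64 (Apple silicon) wheels exist, and the unified `pip install mindspore==${MS_VERSION}` command leaves wheel selection to pip, which resolves it from the interpreter's platform and version tags. The sketch below is purely illustrative and only inspects what pip will see in the current environment:

```python
# Inspect the interpreter attributes that determine which macOS wheel
# (x86_64 vs arm64, cp39/cp310/cp311) pip selects.
import platform

print("machine:", platform.machine())        # e.g. 'arm64' or 'x86_64'
print("python:", platform.python_version())  # e.g. '3.10.14'
print("macOS:", platform.mac_ver()[0])       # e.g. '12.6'
```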
diff --git a/install/mindspore_cpu_win_install_pip.md b/install/mindspore_cpu_win_install_pip.md index d77dcd4282a35199f0a89512d21da3b834893e20..59995d256b980385da4a51a20fbe4944cd8eeddf 100644 --- a/install/mindspore_cpu_win_install_pip.md +++ b/install/mindspore_cpu_win_install_pip.md @@ -27,15 +27,10 @@ set MS_VERSION=2.7.0 ``` -然后根据Python版本执行以下命令安装MindSpore。 +然后执行以下命令安装MindSpore。 ```bash -# Python3.9 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/%MS_VERSION%/MindSpore/cpu/x86_64/mindspore-%MS_VERSION:-=%-cp39-cp39-win_amd64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# Python3.10 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/%MS_VERSION%/MindSpore/cpu/x86_64/mindspore-%MS_VERSION:-=%-cp310-cp310-win_amd64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# Python3.11 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/%MS_VERSION%/MindSpore/cpu/x86_64/mindspore-%MS_VERSION:-=%-cp311-cp311-win_amd64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ +pip install mindspore==%MS_VERSION% -i https://repo.mindspore.cn/pypi/simple --trusted-host repo.mindspore.cn --extra-index-url https://repo.huaweicloud.com/repository/pypi/simple/ ``` 在联网状态下,安装whl包时会自动下载mindspore安装包的依赖项(依赖项详情参见[setup.py](https://gitee.com/mindspore/mindspore/blob/master/setup.py)中的required_package),其余情况需自行安装依赖。 diff --git a/install/mindspore_cpu_win_install_pip_en.md b/install/mindspore_cpu_win_install_pip_en.md index 57b08d447360b3a2eb692e74e2c78f34b5ab9ffd..90a50eb3e6a488e78e982a1ef35725a85455c134 100644 --- a/install/mindspore_cpu_win_install_pip_en.md +++ b/install/mindspore_cpu_win_install_pip_en.md @@ -27,15 +27,10 @@ First, refer to [Version List](https://www.mindspore.cn/versions) to select the set MS_VERSION=2.7.0 ``` -Then run the following commands to install MindSpore according to Python version. +Then run the following command to install MindSpore. ```bash -# Python3.9 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/%MS_VERSION%/MindSpore/cpu/x86_64/mindspore-%MS_VERSION:-=%-cp39-cp39-win_amd64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# Python3.10 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/%MS_VERSION%/MindSpore/cpu/x86_64/mindspore-%MS_VERSION:-=%-cp310-cp310-win_amd64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# Python3.11 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/%MS_VERSION%/MindSpore/cpu/x86_64/mindspore-%MS_VERSION:-=%-cp311-cp311-win_amd64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ +pip install mindspore==%MS_VERSION% -i https://repo.mindspore.cn/pypi/simple --trusted-host repo.mindspore.cn --extra-index-url https://repo.huaweicloud.com/repository/pypi/simple/ ``` When the network is connected, dependency items are automatically downloaded during .whl package installation. (For details about the dependency, see required_package in [setup.py](https://gitee.com/mindspore/mindspore/blob/master/setup.py)). In other cases, you need to install dependency by yourself. 
diff --git a/install/mindspore_gpu_install_pip.md b/install/mindspore_gpu_install_pip.md index 999451f03d3ed105b0f8c95e84fd7147166795bf..1a9b9ac5408d3fdb9d1b8d0631919a6977f9a3cf 100644 --- a/install/mindspore_gpu_install_pip.md +++ b/install/mindspore_gpu_install_pip.md @@ -175,15 +175,10 @@ cd - export MS_VERSION=2.7.0 ``` -然后根据CUDA版本及Python版本执行如下命令安装最新版本的MindSpore。 +然后执行以下命令安装MindSpore。 ```bash -# Python3.9 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unified/x86_64/mindspore-${MS_VERSION/-/}-cp39-cp39-linux_x86_64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# Python3.10 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unified/x86_64/mindspore-${MS_VERSION/-/}-cp310-cp310-linux_x86_64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# Python3.11 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unified/x86_64/mindspore-${MS_VERSION/-/}-cp311-cp311-linux_x86_64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ +pip install mindspore==${MS_VERSION} -i https://repo.mindspore.cn/pypi/simple --trusted-host repo.mindspore.cn --extra-index-url https://repo.huaweicloud.com/repository/pypi/simple/ ``` 在联网状态下,安装MindSpore时会自动下载MindSpore安装包的依赖项(依赖项详情参见[setup.py](https://gitee.com/mindspore/mindspore/blob/master/setup.py)中的required_package),其余情况需自行安装依赖。 diff --git a/install/mindspore_gpu_install_pip_en.md b/install/mindspore_gpu_install_pip_en.md index 34377b1d1d7f56a8468a3a64911641d843212096..5109a3795e9f5bdeed83942f6f2644c4d1bd3e33 100644 --- a/install/mindspore_gpu_install_pip_en.md +++ b/install/mindspore_gpu_install_pip_en.md @@ -175,15 +175,10 @@ First, refer to [Version List](https://www.mindspore.cn/versions) to select the export MS_VERSION=2.7.0 ``` -Then install the latest version of MindSpore according to the CUDA version and Python version by following the following command. +Then run the following command to install MindSpore. ```bash -# Python3.9 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unified/x86_64/mindspore-${MS_VERSION/-/}-cp39-cp39-linux_x86_64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# Python3.10 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unified/x86_64/mindspore-${MS_VERSION/-/}-cp310-cp310-linux_x86_64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ -# Python3.11 -pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unified/x86_64/mindspore-${MS_VERSION/-/}-cp311-cp311-linux_x86_64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple/ +pip install mindspore==${MS_VERSION} -i https://repo.mindspore.cn/pypi/simple --trusted-host repo.mindspore.cn --extra-index-url https://repo.huaweicloud.com/repository/pypi/simple/ ``` When the network is connected, dependency items are automatically downloaded during MindSpore installation. (For details about the dependency, see required_package in [setup.py](https://gitee.com/mindspore/mindspore/blob/master/setup.py)). 
In other cases, you need to install dependency by yourself. diff --git a/tools/ci_pipeline_gate_APIView/generate_pr_html.py b/tools/ci_pipeline_gate_APIView/generate_pr_html.py index 94bde7aa84612a4172a77c84fa004a59d5c19c65..d7210c6f26c836aa8ec898976acd7d4093b66c51 100644 --- a/tools/ci_pipeline_gate_APIView/generate_pr_html.py +++ b/tools/ci_pipeline_gate_APIView/generate_pr_html.py @@ -545,7 +545,7 @@ def supplement_pr_file_cn(pr_cn, repo_path, samedfn_rst, pr_need, base_raw_url, samedfn_cn.append(rel_filename) raw_rst_list[rel_filename] = f'{base_raw_url}/{rel_filename}' break - elif j == ori_p.replace('.func_', '.'): + elif j.replace('.func_', '.') == ori_p: samedfn_cn.append(rel_filename) raw_rst_list[rel_filename] = f'{base_raw_url}/{rel_filename}' break diff --git a/tutorials/source_en/beginner/accelerate_with_static_graph.md b/tutorials/source_en/beginner/accelerate_with_static_graph.md index cecbf71c56e9f5df5684819515f5cf308120d695..72317d7bc7ba48f0b1072d677bcef43989941aaa 100644 --- a/tutorials/source_en/beginner/accelerate_with_static_graph.md +++ b/tutorials/source_en/beginner/accelerate_with_static_graph.md @@ -131,7 +131,7 @@ For an example of using static graphs for network compilation, see [Network Buil ## Static Graph Mode Startup Method -Usually, due to the flexibility of dynamic graphs, we choose to use PyNative mode for free neural network construction for model innovation and optimization. But when performance acceleration is needed, we need to accelerate the neural network partially or as a whole. MindSpore provides two ways of switching to graph mode, the decorator-based startup method and the global context-based startup method. +Usually, due to the flexibility of dynamic graphs, we choose to use PyNative mode for free neural network construction for model innovation and optimization. But when performance acceleration is needed, we need to accelerate the neural network partially or as a whole. MindSpore provides two ways of switching to static graph mode, the decorator-based startup method and the global context-based startup method. ### Decorator-based Startup Method diff --git a/tutorials/source_en/custom_program/op_custom.rst b/tutorials/source_en/custom_program/op_custom.rst index 00bff33121e1dab6c61f3b824ec87e87e5b589d8..f237861d2282475e32f226cc999c3939ec40c789 100644 --- a/tutorials/source_en/custom_program/op_custom.rst +++ b/tutorials/source_en/custom_program/op_custom.rst @@ -16,6 +16,7 @@ Custom Operators operation/op_customopbuilder operation/cpp_api_for_custom_ops operation/op_customopbuilder_atb + operation/op_customopbuilder_asdsip When built-in operators cannot meet requirements during network development, you can use MindSpore's custom operator functionality to integrate your operators. Currently, MindSpore provides two approaches for integrating custom operators: diff --git a/tutorials/source_en/custom_program/operation/cpp_api_for_custom_ops.md b/tutorials/source_en/custom_program/operation/cpp_api_for_custom_ops.md index 833893c2770684531b35aa5ef704ba0a5e63dd4d..64559b22efa18f2b4d97243fbd756a145388a2b1 100644 --- a/tutorials/source_en/custom_program/operation/cpp_api_for_custom_ops.md +++ b/tutorials/source_en/custom_program/operation/cpp_api_for_custom_ops.md @@ -517,3 +517,50 @@ void RunAtbOp(const std::string &op_name, const ParamType ¶m, const std::vec - `param`: Parameters required to initialize the ATB operator. - `inputs`: A list of input tensors for the operator. - `outputs`: A list of output tensors for the operator. 
+ +### class AsdSipFFTOpRunner + +The `AsdSipFFTOpRunner` class is a runner for executing Ascend Sip Boost (ASDSIP) operators, defined in the [asdsip_common.h](https://gitee.com/mindspore/mindspore/blob/master/mindspore/ccsrc/ms_extension/ascend/asdsip/asdsip_common.h) header file. + +This class inherits from `PyboostRunner` and encapsulates the process of invoking ASDSIP FFT operators, including initialization, running the ASDSIP FFT operator, managing input/output tensors, memory allocation, and kernel scheduling. + +Refer to the tutorial [CustomOpBuilder Integrates the ASDSIP FFT Operators through AsdSipFFTOpRunner](https://www.mindspore.cn/tutorials/en/master/custom_program/operation/op_customopbuilder_asdsip.html) for usage methods. + +#### Constructor + +- **AsdSipFFTOpRunner** + + ```cpp + explicit AsdSipFFTOpRunner(std::string op_name) : PyboostRunner(op_name) {} + ``` + + Constructor inherited from `PyboostRunner`. + +#### Public Methods + +- **Init(const FFTParam &param);** + + ```cpp + void Init(const FFTParam &param); + ``` + + - **Description**: [API] Initializes the ASDSIP FFT operator with the given parameters. This method creates a corresponding `asdFftHandle` instance for the operator via `AsdFftCreate` and places it in the cache. Only one `asdFftHandle` instance is created for operators with the same `param`. + - **Parameters**: + - `param`: Parameters used to configure the ASDSIP FFT operator. + +### function RunAsdSipFFTOp + +The interface for executing ASDSIP FFT operators in dynamic graphs, defined in the [asdsip_common.h](https://gitee.com/mindspore/mindspore/blob/master/mindspore/ccsrc/ms_extension/ascend/asdsip/asdsip_common.h) header file. + +```cpp +inline void RunAsdSipFFTOp(const std::string &op_name, const FFTParam &fft_param, const ms::Tensor &input, + const ms::Tensor &output) +``` + +[API] Executes an ASDSIP FFT operator using the provided parameters, input, and output. This function is a wrapper around `AsdSipFFTOpRunner`. + +- **Parameters** + - `op_name`: The name of the ASDSIP FFT operator to execute. + - `fft_param`: Parameters required to initialize the ASDSIP FFT operator. + - `input`: The input tensor for the operator. + - `output`: The output tensor for the operator. \ No newline at end of file diff --git a/tutorials/source_en/custom_program/operation/op_customopbuilder.md b/tutorials/source_en/custom_program/operation/op_customopbuilder.md index c9223241fb8dd36b1e2b0bc60081a3d011306f8a..1f7bd0887e823f45ffd8288d404301b466bc0953 100644 --- a/tutorials/source_en/custom_program/operation/op_customopbuilder.md +++ b/tutorials/source_en/custom_program/operation/op_customopbuilder.md @@ -215,3 +215,4 @@ Running the above script produces the following result: ## More Usage Scenarios - [Integrating ATB Operators Using AtbOpRunner](https://www.mindspore.cn/tutorials/en/master/custom_program/operation/op_customopbuilder_atb.html): Introduces methods for quickly integrating ATB operators as custom operators. +- [Integrating ASDSIP FFT Operators Using AsdSipFFTOpRunner](https://www.mindspore.cn/tutorials/en/master/custom_program/operation/op_customopbuilder_asdsip.html): Introduces methods for quickly integrating ASDSIP FFT operators as custom operators. 
diff --git a/tutorials/source_en/custom_program/operation/op_customopbuilder_asdsip.md b/tutorials/source_en/custom_program/operation/op_customopbuilder_asdsip.md new file mode 100644 index 0000000000000000000000000000000000000000..2d14959aab98a0f50a766ebad69ad17204d85b1e --- /dev/null +++ b/tutorials/source_en/custom_program/operation/op_customopbuilder_asdsip.md @@ -0,0 +1,128 @@ +# CustomOpBuilder: Integrating ASDSIP FFT Operators Using AsdSipFFTOpRunner + +[![View Source File](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/tutorials/source_en/custom_program/operation/op_customopbuilder_asdsip.md) + +## Overview + +[Ascend Sip Boost (ASDSIP) Operator Acceleration Library](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/acce/SiP/SIP_0000.html) is an operator library specifically designed for signal processing, based on Huawei's Ascend AI processors. + +When users need to use operators from the ASDSIP acceleration library that are not provided by MindSpore, they can quickly integrate and use them through custom operators. + +In [Custom Operators Based on CustomOpBuilder](https://www.mindspore.cn/tutorials/en/master/custom_program/operation/op_customopbuilder.html), MindSpore provides the `PyboostRunner` tool to allow users to integrate custom operators in dynamic graphs. Now, for ASDSIP FFT operators, MindSpore additionally provides the `AsdSipFFTOpRunner` tool to encapsulate the ASDSIP FFT operator's workflow and the dynamic graph's multi-stage pipeline. + +When integrating ASDSIP FFT operators using the [AsdSipFFTOpRunner class](https://www.mindspore.cn/tutorials/en/master/custom_program/operation/cpp_api_for_custom_ops.html#class-asdsipfftoprunner), users only need to provide a `Param` (used as the key for caching the `asdFftHandle`) and call the `Init` interface for initialization (constructing the `asdFftHandle`), followed by the `Run` interface to execute the ASDSIP FFT operator. Additionally, users can directly call the [RunAsdSipFFTOp](https://www.mindspore.cn/tutorials/en/master/custom_program/operation/cpp_api_for_custom_ops.html#function-runasdsipfftop) function for one-click execution (the function internally includes calls to both `Init` and `Run` interfaces). + +This guide uses `FftC2C` as an example to demonstrate the ASDSIP FFT operator integration process. The complete code can be found in the [code repository](https://gitee.com/mindspore/mindspore/blob/master/tests/st/graph_kernel/custom/jit_test_files/asdsip_fftc2c.cpp). 
+ +## Installing the ASDSIP Acceleration Library + +[Click here for installation tutorial](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/acce/SiP/SIP_0001.html) + +After successful installation, users need to activate the environment variable for the ASDSIP acceleration library: + +```sh +source /usr/local/Ascend/nnal/asdsip/set_env.sh &> /dev/null +``` + +## Integrating the FftC2C Operator + +Here we use `ms::pynative::RunAsdSipFFTOp` to integrate the operator and call the function interface through `ms::pynative::PyboostRunner::Call`: + +```cpp +#include "ms_extension/api.h" + +ms::Tensor InferFFTForward(const ms::Tensor &input) { + ShapeVector out_tensor_shape(input.shape()); + return ms::Tensor(input.data_type(), out_tensor_shape); +} + +ms::Tensor npu_fft(const ms::Tensor &input, int64_t n, int64_t batch_size) { + ms::pynative::FFTParam param; + param.fftXSize = n; + param.fftYSize = 0; + param.fftType = ms::pynative::asdFftType::ASCEND_FFT_C2C; + param.direction = ms::pynative::asdFftDirection::ASCEND_FFT_FORWARD; + param.batchSize = batch_size; + param.dimType = ms::pynative::asdFft1dDimType::ASCEND_FFT_HORIZONTAL; + auto output = InferFFTForward(input); + ms::pynative::RunAsdSipFFTOp("asdFftExecC2C", param, input, output); + return output; +} + +auto pyboost_npu_fft(const ms::Tensor &input, int64_t n, int64_t batch_size) { + return ms::pynative::PyboostRunner::Call<1>(npu_fft, input, n, batch_size); +} + +PYBIND11_MODULE(MS_EXTENSION_NAME, m) { + m.def("fft", &pyboost_npu_fft, "FFT C2C", pybind11::arg("input"), pybind11::arg("n"), + pybind11::arg("batch_size")); +} +``` + +### 1. Infer the Output Information of the Operator + +```cpp +ms::Tensor InferFFTForward(const ms::Tensor &input) { + ShapeVector out_tensor_shape(input.shape()); + return ms::Tensor(input.data_type(), out_tensor_shape); +} +``` + +For the `FftC2C` operator, the output tensor has the same data type and shape as the input tensor, so an empty output tensor is constructed using the `ms::Tensor` constructor. + +### 2. Create and Set the Operator Attribute Structure + +```cpp +ms::pynative::FFTParam param; +param.fftXSize = n; +param.fftYSize = 0; +param.fftType = ms::pynative::asdFftType::ASCEND_FFT_C2C; +param.direction = ms::pynative::asdFftDirection::ASCEND_FFT_FORWARD; +param.batchSize = batch_size; +param.dimType = ms::pynative::asdFft1dDimType::ASCEND_FFT_HORIZONTAL; +``` + +### 3. Execute the Operator via the RunAsdSipFFTOp Interface + +```cpp +ms::pynative::RunAsdSipFFTOp("asdFftExecC2C", param, input, output); +``` + +This interface is equivalent to: + +```cpp +auto runner = std::make_shared<AsdSipFFTOpRunner>("asdFftExecC2C"); +runner->Init(fft_param); +runner->Run({input}, {output}); +``` + +By passing in the operator name, attributes, input tensor, and output tensor, the corresponding ASDSIP FFT operator can be invoked. This interface supports multi-stage pipeline execution in dynamic graphs. + +### 4. Bind the C++ Function to a Python Function via pybind11 + +```cpp +auto pyboost_npu_fft(const ms::Tensor &input, int64_t n, int64_t batch_size) { + return ms::pynative::PyboostRunner::Call<1>(npu_fft, input, n, batch_size); +} + +PYBIND11_MODULE(MS_EXTENSION_NAME, m) { + m.def("fft", &pyboost_npu_fft, "FFT C2C", pybind11::arg("input"), pybind11::arg("n"), + pybind11::arg("batch_size")); +} +``` + +### 5. Compile the Custom Operator Using CustomOpBuilder + +Save the above C++ code as a file named `asdsip_fftc2c.cpp`, and then compile it using the Python interface `CustomOpBuilder`. 
+ +```python +input_np = np.random.rand(2, 16) +real_np = input_np.astype(np.float32) +imag_np = input_np.astype(np.float32) +complex_np = real_np + 1j * imag_np +my_ops = CustomOpBuilder("asdsip_fftc2c", "jit_test_files/asdsip_fftc2c.cpp", enable_asdsip=True).load() +output_tensor = my_ops.fft(input_tensor, 16, 2) +print(output_tensor) +``` + +Here, the parameter `enable_asdsip=True` is passed into `CustomOpBuilder`, and MindSpore will automatically add compilation and linking options related to the ASDSIP acceleration library. Users only need to ensure that the `set_env.sh` script for the ASDSIP library has been correctly executed, and the environment contains the `ASDSIP_HOME_PATH` variable. \ No newline at end of file diff --git a/tutorials/source_en/debug/dump.md b/tutorials/source_en/debug/dump.md index cabe663f95d17f776c63c3fb8dda16dd488fa83e..109361a70888b43294608f2371ba3b949b19a32f 100644 --- a/tutorials/source_en/debug/dump.md +++ b/tutorials/source_en/debug/dump.md @@ -155,7 +155,7 @@ MindSpore supports different Dump functionalities under various modes, as shown - "negative inf count": represents the number of `-Inf` elements in the tensor; - "positive inf count": represents the number of `+Inf` elements in the tensor; - "zero count": represents the number of zero elements in the tensor; - - "md5": represents the MD5 value of the tensor; + - "hash": represents the hash feature value of the tensor, currently using the SHA1 algorithm by default, so it can also be written as "hash:sha1", SHA1 algorithm is recommended for its faster;"hash:md5" represents the MD5 value of the tensor, which yields the same result as the "md5" configuration item in previous versions; - "l2norm": represents L2Norm value of the tensor, supporting both device and host statistics. Except for those marked as supporting device statistics, other statistics can be collected only on the host. @@ -168,7 +168,7 @@ MindSpore supports different Dump functionalities under various modes, as shown - `enable`: When set to `true`, enable Synchronous Dump. When set to false or not set, Asynchronous Dump will be used on Ascend. The main difference between the two is that Asynchronous Dump has less impact on the original code execution order. - `trans_flag`: Enable trans flag. Transform the device data format into NCHW. If it is `true`, the data will be saved in the 4D format (NCHW) format on the Host side; if it is `false`, the data format on the Device side will be retained. Default: `true`. - - `stat_calc_mode`: Select the backend for statistical calculations. Options are "host" and "device". Choosing "device" enables device computation of statistics, currently only effective on Ascend, and supports only min/max/avg/l2norm statistics. When `op_debug_mode` is set to 3, only `stat_calc_mode` set to "host" is supported. + - `stat_calc_mode`: Select the backend for statistical calculations. Options are "host" and "device". Choosing "device" enables device computation of statistics, currently only effective on Ascend, and supports only min/max/avg/l2norm statistics. When `op_debug_mode` is set to 3, only `stat_calc_mode` set to "host" is supported. Default: "host". - `device_stat_precision_mode`(Optional): Precision mode of device statistics, and the value can be "high" or "low". 
When "high" is selected, avg/l2norm statistics will be calculated using float32, which will increase device memory usage and have higher precision; when "low" is selected, the same type as the original data will be used for calculation, which will occupy less device memory, but statistics overflow may be caused when processing large values. The default value is "high". - `sample_mode`(Optional): Setting it to 0 means the sample dump function is not enabled. Enable the sampling dump feature during graph compilation using the ms_backend backend. This field is effective only when "op_debug_mode" is set to `0`, sample dump cannot be enabled in other scene. - `sample_num`(Optional): Used to control the size of sample in sample dump. The default value is 100. diff --git a/tutorials/source_en/index.rst b/tutorials/source_en/index.rst index b8ff888cb223e6aeb225994fb20477661a794134..7345006734f40d704e5881bcb3f17fc79ed71a12 100644 --- a/tutorials/source_en/index.rst +++ b/tutorials/source_en/index.rst @@ -90,11 +90,8 @@ MindSpore Tutorial :caption: Infer :hidden: + model_infer/introduction model_infer/ms_infer/ms_infer_model_infer - model_infer/ms_infer/ms_infer_network_develop - model_infer/ms_infer/ms_infer_parallel_infer - model_infer/ms_infer/ms_infer_quantization - model_infer/ms_infer/ms_infer_model_serving_infer model_infer/lite_infer/overview .. toctree:: @@ -151,7 +148,7 @@ MindSpore Tutorial
- +
Data Processing @@ -237,7 +234,7 @@ MindSpore Tutorial
- +
Inference diff --git a/tutorials/source_en/model_infer/images/model_infer_case_select.png b/tutorials/source_en/model_infer/images/model_infer_case_select.png new file mode 100644 index 0000000000000000000000000000000000000000..0eaa4803ed22d939e178c60571fdc6a5d36a058b Binary files /dev/null and b/tutorials/source_en/model_infer/images/model_infer_case_select.png differ diff --git a/tutorials/source_en/model_infer/images/model_infer_stack.png b/tutorials/source_en/model_infer/images/model_infer_stack.png new file mode 100644 index 0000000000000000000000000000000000000000..a902e358cf176ec5b420aa99ff338fd1c34f92b5 Binary files /dev/null and b/tutorials/source_en/model_infer/images/model_infer_stack.png differ diff --git a/tutorials/source_en/model_infer/introduction.md b/tutorials/source_en/model_infer/introduction.md new file mode 100644 index 0000000000000000000000000000000000000000..f02a2a2cc765e40a949e4d1f3ee5c91bd1911e13 --- /dev/null +++ b/tutorials/source_en/model_infer/introduction.md @@ -0,0 +1,77 @@ +# MindSpore Inference Overview + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/tutorials/source_en/model_infer/introduction.md) + +## Context + +MindSpore provides efficient model inference capabilities. From the perspective of AI functions, inference is actually a forward computing of a model using real service data of users. Therefore, the forward computing graph of the MindSpore model can complete the basic functions of inference. However, in actual applications, the purposes of model inference and training are different, and the technologies used for model inference and training are also different. + +- Although model training also requires forward computing, the core purpose of training computing is to compute the inference result based on the existing data set, obtain the intermediate result, and update the weight parameters of the model to optimize the model. + +- Model inference is to use data in the actual production environment to perform inference and prediction under the condition that the model weight parameters are fixed, and obtain the results required by actual services. + +To maximize the model prediction efficiency, model inference needs to provide the following core capabilities: + +- **Cost-effectiveness**: In the production environment, the computing cost of AI models is high. Therefore, the model inference capability needs to complete more computing tasks with fewer computing resources. The AI framework needs to provide lower response latency and higher system throughput to reduce the model inference cost. + +- **Efficient deployment**: In the actual production environment, AI model deployment is complex, involving model weight preparation, model script, and backend adaptation. The ability to quickly deploy AI models to the production environment is one of the important indicators of the inference capability of the AI framework. + +The inference capability required by a model varies with scenarios. Based on common application scenarios in the actual production environment, the inference types are classified as follows: + +- **By computing resource** + + - **Cloud inference**: With the development of cloud computing, computing resources in DCs are becoming increasingly abundant. In the cloud environment, computing resources are usually sufficient. 
Therefore, cloud inference usually indicates a scenario with abundant computing resources. The AI framework can be completely deployed in the production environment. In addition, the framework has high requirements on distributed parallel capabilities and focuses on the system throughput of AI model inference. + + - **Device inference**: On edges and devices, the AI framework cannot be directly deployed in the production environment due to insufficient computing resources. Therefore, a more lightweight model runtime is required. The number of concurrent tasks is not particularly high, but the response latency of AI model inference is the concern. + +- **By inference model format** + + - **Inference with a framework**: The model network structure and model weight file are saved separately. You can build a network and separately load the model weight to build AI model inference. The advantage of this type of inference is that the inference weight does not need to be updated, regardless of whether the model is fine-tuned, optimized, or developed and debugged, this inference solution has obvious advantages when the weight of the current LLM reaches hundreds of GB. + + - **Inference with a model file**: The model network and weight are packed into a file (ProtoBuf or FlatBed file). You only need to manage one file to execute the model. This inference solution is convenient for deployment management when there are a large number of models and the model size is not large. + +- **By inference backend** + + - **Online inference**: After a model is loaded, the model receives inference requests and calls the model backend for inference. The service backend is also supported. This is a common application deployment mode of AI model services. + + - **Offline inference**: Requests and data are loaded using scripts. Inference is performed for a specific number of times, mostly during model debugging, or integrated as model inference of other service backends. + +## MindSpore Inference Solution + +The MindSpore framework provides multiple model inference modes so that users can select the optimal inference mode as required in different scenarios. The following lists the inference modes supported by MindSpore: + +- **MindSpore inference on the cloud**: This mode is mainly used in scenarios where cloud computing resources are abundant. The MindSpore framework depends on complete components (including dependency libraries such as Python and NumPy). Users can use all capabilities of the MindSpore framework for inference. + + - **Inference with a framework**: The model weight file (CKPT or Safetensor file) and MindSpore network script are used for inference. The model structure can be flexibly adjusted according to requirements. In addition, both dynamic and static graph modes are supported. This inference mode is the main model development and debugging mode for LLMs. + + - **MindIR model inference**: The MindIR file (official MindSpore file) is used for inference, which contains the network structure and weights. Model loading and inference are simpler, but the model cannot be adjusted and the MindIR file needs to be regenerated each time. This mode is not suitable for inference of models with large weights. + + - **vLLM service-based inference**: The vLLM provides the service-based backend model inference capability, which can quickly deploy inference services. This mode is suitable for users who do not have their own service backends and can quickly implement inference services. 
+ +- **Lite inference**: This mode is mainly used in scenarios where the computing resources on the device side are limited. The lightweight runtime reduces the resource requirements for model inference. The model file is in FlatBuffer format, implementing KB-level resource consumption for model inference and enabling the AI capability of devices such as mobile phones. + +The following figure shows the selection routes of common model inference scenarios. + +![LLAMA network structure](images/model_infer_case_select.png) + +You can select the most suitable MindSpore inference solution based on your application scenario. + +The following figure shows the key technology stack of MindSpore inference. + +![LLAMA network structure](images/model_infer_stack.png) + +- **Inference with a framework**: In scenarios with abundant computing resources, only Python APIs are provided. You need to use Python scripts to build models and perform inference. Service-oriented components are not mandatory. + + - **vLLM&vLLM-MindSpore**: The service-oriented capability of the inference solution with a framework is provided. The popular vLLM service-oriented inference capability in the open-source community is used to seamlessly connect the service-oriented capability of the community to the MindSpore inference ecosystem. + + - **Python API**: MindSpore provides Python APIs, including mint operator APIs (consistent with PyTorch semantics), nn APIs, and parallel APIs. + + - **Graph Mode**: It indicates the static graph mode. The graph compilation technology is used to optimize models, and the inference computing performance is high. However, model debugging is not intuitive. You are advised to enable this mode only if the model script is fixed. + + - **PyNative Mode**: It indicates the dynamic graph mode. The Python interpreter is used to execute Python code in the model script one by one, which facilitates model debugging. However, the execution performance is lower than that of the static graph mode due to the Python calling overhead each time. + + - **Runtime**: It indicates the core runtime of the MindSpore framework. The runtime provides parallel execution of the Actor mechanism, computing device management, and memory allocation. + + - **Operator library**: MindSpore has built-in operator libraries for various computations. In inference scenarios, MindSpore also contains various fusion operators to improve inference performance. + +- **Lite inference**: It is oriented to devices with limited resources. The core is C++ runtime, and the resource consumption is less than 1 MB. Lite inference is suitable for devices such as mobile phones. In addition, Lite inference also provides Python APIs to meet different user requirements. diff --git a/tutorials/source_en/model_infer/ms_infer/ms_infer_model_infer.rst b/tutorials/source_en/model_infer/ms_infer/ms_infer_model_infer.rst index 3ec0aed33ef3640da7930c717e4c2d45d4105b96..b7f210325b54d93531bafbf5a853bcb095caee8f 100644 --- a/tutorials/source_en/model_infer/ms_infer/ms_infer_model_infer.rst +++ b/tutorials/source_en/model_infer/ms_infer/ms_infer_model_infer.rst @@ -94,9 +94,9 @@ To achieve the optimal cost-effectiveness, MindSpore LLM has undergone multiple - **Attention optimization**: The primary computation in the LLM's network involves the computation of attention. Since the attention size in mainstream models is often large (typically 4096 x 4096 or more), the performance of the entire inference process heavily relies on the efficiency of attention computation. 
Many studies focus on optimizing the performance of attention computation, with notable techniques such as flash attention and page attention. - - **Flash attention**: During attention computation, two large matrices (4096 x 4096) are multiplied. This computation breaks the large matrix into smaller matrices that can be processed on multiple chips. Subject to the minimum cache size of chips, data must continuously be moved between the cache and main memory. As a result, compute resources cannot be fully used. Consequently, attention computation is often bandwidth-bound. Flash attention addresses this by dividing attention into blocks, allowing each block to be computed independently on a chip, avoiding multiple data movements during the computation of KVs and enhancing attention computation performance. For details, see `Flash Attention `_. + - **Flash Attention**: During attention computation, two large matrices (4096 x 4096) are multiplied. This computation breaks the large matrix into smaller matrices that can be processed on multiple chips. Subject to the minimum cache size of chips, data must continuously be moved between the cache and main memory. As a result, compute resources cannot be fully used. Consequently, attention computation is often bandwidth-bound. Flash attention addresses this by dividing attention into blocks, allowing each block to be computed independently on a chip, avoiding multiple data movements during the computation of KVs and enhancing attention computation performance. For details, see `Flash Attention `_. - - **Page attention graphics memory optimization**: Standard flash attention reads and saves the entire input KV data each time. This method is simple but wastes many resources. For example, "China's capital" and "China's national flag" share "China's", leading to identical KVs for their attention. Standard flash attention needs to store two copies of KVs, wasting the graphics memory. Page attention optimizes KVCache based on the page table principle of the Linux OS. It stores KVs in blocks of a specific size. In the preceding example, "China", "'s", "capital", and "national flag" are stored as four pieces of KV data. Compared with the original six pieces of data, this method effectively saves graphics memory resources. In the service-oriented scenario, more idle graphics memory allows for a larger batch size for model inference, thereby achieving higher throughput. For details, see `Page Attention `_. + - **Paged Attention**: Standard Flash Attention reads and saves the entire input Key and Value data each time. Although this method is relatively simple, it wastes a significant amount of memory: when the requests in a batch have different sequence lengths, Flash Attention has to allocate Key and Value memory sized for the longest sequence. For example, for "The capital of China is Beijing" and "The national flag of China is the Five-Star Red Flag", assuming character-level tokenization, 10 * 2 = 20 KVCache memory units are required. Paged Attention optimizes KVCache based on the page table principle of the Linux OS by storing Key and Value data in blocks of a specific size. With a block size of 2, KVCache can be allocated block by block, so only 4 * 2 + 5 * 2 = 18 KVCache memory units are required. Because Paged Attention stores blocks discretely, it can also be combined with techniques such as Prefix Cache to further reduce the memory occupied by the shared prefix "of China", so that only 3 * 2 + 5 * 2 = 16 KVCache units are ultimately required.
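The block accounting above can be reproduced with a short back-of-the-envelope sketch (plain illustrative Python, not a MindSpore API; the sequence lengths 8 and 10 and the block size 2 follow the character-level example above):

```python
import math

def kv_units(seq_lens, block_size=None):
    """KVCache memory units needed for a batch of sequences."""
    if block_size is None:
        # Flash Attention style: every sequence reserves memory for the longest one.
        return len(seq_lens) * max(seq_lens)
    # Paged Attention style: each sequence only occupies the whole blocks it needs.
    return sum(math.ceil(length / block_size) * block_size for length in seq_lens)

seq_lens = [8, 10]                        # "...is Beijing" and "...Five-Star Red Flag"
print(kv_units(seq_lens))                 # 20 units when padded to the longest sequence
print(kv_units(seq_lens, block_size=2))   # 4 * 2 + 5 * 2 = 18 units with paged blocks
shared_prefix_blocks = 3 // 2             # full blocks covered by the shared prefix
print(kv_units(seq_lens, block_size=2) - shared_prefix_blocks * 2)  # 16 with Prefix Cache
```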
In the service-oriented scenario, more idle graphics memory allows for a larger batch size for model inference, thereby achieving higher throughput. For details, see `Page Attention `_. - **Model quantization**: MindSpore LLM inference supports quantization to reduce the model size. It provides technologies such as A16W8, A16W4, A8W8, and KVCache quantizations to reduce model resource usage and improve the inference throughput. diff --git a/tutorials/source_en/parallel/dataset_slice.md b/tutorials/source_en/parallel/dataset_slice.md index cecf49f7508f8e7a0bfc18c576a38089634a3980..a878165f18e02ec4b15c759a1609a763368ac46b 100644 --- a/tutorials/source_en/parallel/dataset_slice.md +++ b/tutorials/source_en/parallel/dataset_slice.md @@ -4,7 +4,7 @@ ## Overview -When performing distributed training, taking image data as an example, when the size of a single image is too large, such as large-format images of remote sensing satellites, when an image is too large, it is necessary to slice the image and read a portion of each card to perform distributed training. Scenarios that deal with dataset slicing need to be combined with model parallelism to achieve the desired effect of reducing video memory, so this feature is provided based on automatic parallelism. The sample used in this tutorial is not a large-format network, and is intended as an example only. Real-life applications to large-format networks often require detailed design of parallel strategies. +When performing distributed training, taking image data as an example, when the size of a single image is too large, such as large-format images of remote sensing satellites, it is necessary to slice the image and read a portion of each card to perform distributed training. Scenarios that deal with dataset slicing need to be combined with model parallelism to achieve the desired effect of reducing video memory, so this feature is provided based on automatic parallelism. The sample used in this tutorial is not a large-format network, and is intended as an example only. Real-life applications to large-format networks often require detailed design of parallel strategies. > Dataset sharding is not involved in data parallel mode. diff --git a/tutorials/source_en/parallel/host_device_training.md b/tutorials/source_en/parallel/host_device_training.md index 1f34264cb7a636941bab8047cb666f8f6a62c221..997d8643c1134b776248970dfcb1369ddad05ca1 100644 --- a/tutorials/source_en/parallel/host_device_training.md +++ b/tutorials/source_en/parallel/host_device_training.md @@ -10,7 +10,7 @@ In MindSpore, users can easily implement hybrid training by configuring trainabl ### Basic Principle -Pipeline parallel and operator-level parallel are suitable for the model to have a large number of operators, and the parameters are more evenly distributed among the operators. What if the number of operators in the model is small, and the parameters are concentrated in only a few operators? Wide & Deep is an example of this, as shown in the image below. The Embedding table in Wide & Deep can be trained as a parameter of hundreds of GIGabytes or even a few terabytes. If it is executed on an accelerator (device), the number of accelerators required is huge, and the training cost is expensive. On the other hand, if you use accelerator computing, the training acceleration obtained is limited, and it will also trigger cross-server traffic, and the end-to-end training efficiency will not be very high. 
+Pipeline parallel and operator-level parallel are suitable for scenarios where there are a large number of model operators and parameters are distributed evenly across the operators. If there are fewer model operators and parameters are concentrated in a small number of operators, a different strategy is required. Wide & Deep is an example of this, as shown in the image below. The Embedding table in Wide & Deep can be trained as a parameter of hundreds of GIGabytes or even a few terabytes. If it is executed on an accelerator (device), the number of accelerators required is huge, and the training cost is expensive. On the other hand, if you use accelerator computing, the training acceleration obtained is limited, and it will also trigger cross-server traffic, and the end-to-end training efficiency will not be very high. ![image](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/tutorials/source_zh_cn/parallel/images/host_device_image_0_zh.png) @@ -69,6 +69,7 @@ The dataset is loaded and the data is parallelized consistently with the followi import os import mindspore as ms import mindspore.dataset as ds +from mindspore.communication import get_rank, get_group_size ms.set_seed(1) diff --git a/tutorials/source_zh_cn/beginner/accelerate_with_static_graph.ipynb b/tutorials/source_zh_cn/beginner/accelerate_with_static_graph.ipynb index 4f80abe32f06a91e830e7653ce5c9de0c907b58c..44f1e78f73008e7c6c3ad0ece6635185dfebbef2 100644 --- a/tutorials/source_zh_cn/beginner/accelerate_with_static_graph.ipynb +++ b/tutorials/source_zh_cn/beginner/accelerate_with_static_graph.ipynb @@ -171,7 +171,7 @@ "\n", "## 静态图模式开启方式\n", "\n", - "通常情况下,由于动态图的灵活性,我们会选择使用PyNative模式来进行自由的神经网络构建,以实现模型的创新和优化。但是当需要进行性能加速时,可以对神经网络部分或整体进行加速。MindSpore提供了两种切换为图模式的方式:基于装饰器的开启方式以及基于全局context的开启方式。\n", + "通常情况下,由于动态图的灵活性,我们会选择使用PyNative模式来进行自由的神经网络构建,以实现模型的创新和优化。但是当需要进行性能加速时,可以对神经网络部分或整体进行加速。MindSpore提供了两种切换为静态图模式的方式:基于装饰器的开启方式以及基于全局context的开启方式。\n", "\n", "### 基于装饰器的开启方式\n", "\n", diff --git a/tutorials/source_zh_cn/conf.py b/tutorials/source_zh_cn/conf.py index d79551a49489cfb1cada667671e5a13e466a5567..fa082f61b9ee44773ea8cfe082e3d8007825ba4a 100644 --- a/tutorials/source_zh_cn/conf.py +++ b/tutorials/source_zh_cn/conf.py @@ -48,7 +48,7 @@ html_title = author + ' ' + release + ' ' + project # ones. 
myst_enable_extensions = ["dollarmath", "amsmath"] - +# 允许生成几级及以上的锚点 myst_heading_anchors = 5 extensions = [ 'myst_parser', diff --git a/tutorials/source_zh_cn/custom_program/op_custom.rst b/tutorials/source_zh_cn/custom_program/op_custom.rst index 13f5a13fff31a0fb9e343a7f8e147281b8b95bb6..f30ac98b4cd9d8ab72137ff559e489bae5c9e0e0 100644 --- a/tutorials/source_zh_cn/custom_program/op_custom.rst +++ b/tutorials/source_zh_cn/custom_program/op_custom.rst @@ -16,6 +16,7 @@ operation/op_customopbuilder operation/cpp_api_for_custom_ops operation/op_customopbuilder_atb + operation/op_customopbuilder_asdsip 当开发网络遇到内置算子不足以满足需求时,你可以利用MindSpore的自定义算子功能接入你的算子。当前MindSpore提供了两种方式接入自定义算子,分别是 `基于Custom原语接入 `_ 和 `基于CustomOpBuilder接入 `_ 。 diff --git a/tutorials/source_zh_cn/custom_program/operation/cpp_api_for_custom_ops.md b/tutorials/source_zh_cn/custom_program/operation/cpp_api_for_custom_ops.md index abca0f67105a0ab79d4cf5e4a076b8b5ef32b008..4c77055275a284309526c474f570341c6ff43c2f 100644 --- a/tutorials/source_zh_cn/custom_program/operation/cpp_api_for_custom_ops.md +++ b/tutorials/source_zh_cn/custom_program/operation/cpp_api_for_custom_ops.md @@ -517,3 +517,50 @@ void RunAtbOp(const std::string &op_name, const ParamType ¶m, const std::vec - `param`:初始化 ATB 算子所需的参数。 - `inputs`:算子的输入 Tensor 列表。 - `outputs`:算子的输出 Tensor 列表。 + +### class AsdSipFFTOpRunner + +用于执行 Ascend Sip Boost (ASDSIP) 算子的运行器类,定义在[asdsip_common.h](https://gitee.com/mindspore/mindspore/blob/master/mindspore/ccsrc/ms_extension/ascend/asdsip/asdsip_common.h)头文件中。 + +此类继承自 `PyboostRunner`,并封装了 ASDSIP FFT 算子的调用流程,包括初始化和运行 ASDSIP FFT 算子、管理输入输出 Tensor、内存分配及内核调度。 + +可以查看教程 [CustomOpBuilder通过AsdSipFFTOpRunner接入ASDSIP FFT算子](https://www.mindspore.cn/tutorials/zh-CN/master/custom_program/operation/op_customopbuilder_asdsip.html) 获取使用方法。 + +#### 构造函数 + +- **AsdSipFFTOpRunner** + + ```cpp + explicit AsdSipFFTOpRunner(std::string op_name) : PyboostRunner(op_name) {} + ``` + + 继承自 `PyboostRunner` 的构造函数。 + +#### 公共方法 + +- **Init(const FFTParam ¶m);** + + ```cpp + void Init(const FFTParam ¶m); + ``` + + - **描述**: 【API】 使用给定参数初始化 ASDSIP FFT 算子。此方法通过 `AsdFftCreate` 创建对应算子的 `asdFftHandle` 实例,并将其放入缓存中。对于`param`相同的算子,只会创建一份 `asdFftHandle` 实例。 + - **参数** + - `param`:用于配置 ASDSIP FFT 算子的参数。 + +### function RunAsdSipFFTOp + +动态图执行ASDSIP FFT算子的接口,定义在[asdsip_common.h](https://gitee.com/mindspore/mindspore/blob/master/mindspore/ccsrc/ms_extension/ascend/asdsip/asdsip_common.h)头文件中。 + +```cpp +inline void RunAsdSipFFTOp(const std::string &op_name, const FFTParam &fft_param, const ms::Tensor &input, + const ms::Tensor &output) +``` + +【API】 使用提供的参数、输入和输出执行一个 ASDSIP FFT 算子。此函数是对 `AsdSipFFTOpRunner` 的一层封装。 + +- **参数** + - `op_name`:要执行的 ASDSIP FFT 算子名称。 + - `fft_param`:初始化 ASDSIP FFT 算子所需的参数。 + - `inputs`:算子的输入 Tensor。 + - `outputs`:算子的输出 Tensor。 diff --git a/tutorials/source_zh_cn/custom_program/operation/op_customopbuilder.md b/tutorials/source_zh_cn/custom_program/operation/op_customopbuilder.md index e74bb4b21de413b6814cd3b25ccf34af71e0e472..1a41e5ffa7e3b31a5e559f0095373806bf0758eb 100644 --- a/tutorials/source_zh_cn/custom_program/operation/op_customopbuilder.md +++ b/tutorials/source_zh_cn/custom_program/operation/op_customopbuilder.md @@ -215,3 +215,4 @@ print(out) ## 更多场景示例 - [通过AtbOpRunner接入ATB算子](https://www.mindspore.cn/tutorials/zh-CN/master/custom_program/operation/op_customopbuilder_atb.html):介绍通过自定义算子快速对接ATB算子的方法。 +- [通过AsdSipFFTOpRunner接入ASDSIP 
FFT算子](https://www.mindspore.cn/tutorials/zh-CN/master/custom_program/operation/op_customopbuilder_asdsip.html):介绍通过自定义算子快速对接ASDSIP FFT算子的方法。 diff --git a/tutorials/source_zh_cn/custom_program/operation/op_customopbuilder_asdsip.md b/tutorials/source_zh_cn/custom_program/operation/op_customopbuilder_asdsip.md new file mode 100644 index 0000000000000000000000000000000000000000..781f45663409bd3ef614f9d445e0afb0b5bba2c6 --- /dev/null +++ b/tutorials/source_zh_cn/custom_program/operation/op_customopbuilder_asdsip.md @@ -0,0 +1,128 @@ +# CustomOpBuilder通过AsdSipFFTOpRunner接入ASDSIP FFT算子 + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/master/tutorials/source_zh_cn/custom_program/operation/op_customopbuilder_asdsip.md) + +## 概述 + +[Ascend Sip Boost (ASDSIP) 算子加速库](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/acce/SiP/SIP_0000.html) 是基于华为Ascend AI处理器,专门为信号处理领域而设计的算子库。 + +当用户需要使用ASDSIP加速库的FFT类算子,而MindSpore未提供相应算子接口时,用户可以使用自定义算子的方法快速接入使用。 + +在 [基于CustomOpBuilder的自定义算子](https://www.mindspore.cn/tutorials/zh-CN/master/custom_program/operation/op_customopbuilder.html) 中,MindSpore提供了 `PyboostRunner` 方便用户在动态图接入自定义算子。现在针对ASDSIP算子,MindSpore又额外提供了一套`AsdSipFFTOpRunner`用于把ASDSIP FFT算子的调用流程和动态图多级流水封装到一起。 + +用户基于 [AsdSipFFTOpRunner类](https://www.mindspore.cn/tutorials/zh-CN/master/custom_program/operation/cpp_api_for_custom_ops.html#class-asdsipfftoprunner) 对接ASDSIP FFT算子时,仅需要提供`Param`,并调用`Init`接口初始化(即构造`Operation`),再调用`Run`接口即可执行ASDSIP算子。还可以直接调用 [RunAsdSipFFTOp](https://www.mindspore.cn/tutorials/zh-CN/master/custom_program/operation/cpp_api_for_custom_ops.html#function-launchasdsipfft)函数一键执行(函数内包含了`Init`和`Run`接口的调用)。 + +本指南以一个`FftC2C`为例,展示ASDSIP FFT算子的接入流程。完整代码请参阅[代码仓库](https://gitee.com/mindspore/mindspore/blob/master/tests/st/graph_kernel/custom/jit_test_files/asdsip_fftc2c.cpp)。 + +## 安装ASDSIP加速库 + +[点这里查看安装教程](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/acce/SiP/SIP_0001.html) + +安装成功之后,需要激活ASDSIP加速库的环境变量: + +```sh +source /usr/local/Ascend/nnal/asdsip/set_env.sh &> /dev/null +``` + +## FftC2C算子接入 + +这里使用`ms::pynative::RunAsdSipFFTOp`接入算子,并通过`ms::pynative::PyboostRunner::Call`调用函数接口: + +```cpp +#include "ms_extension/api.h" + +ms::Tensor InferFFTForward(const ms::Tensor &input) { + ShapeVector out_tensor_shape(input.shape()); + return ms::Tensor(input.data_type(), out_tensor_shape); +} + +ms::Tensor npu_fft(const ms::Tensor &input, int64_t n, int64_t batch_size) { + ms::pynative::FFTParam param; + param.fftXSize = n; + param.fftYSize = 0; + param.fftType = ms::pynative::asdFftType::ASCEND_FFT_C2C; + param.direction = ms::pynative::asdFftDirection::ASCEND_FFT_FORWARD; + param.batchSize = batch_size; + param.dimType = ms::pynative::asdFft1dDimType::ASCEND_FFT_HORIZONTAL; + auto output = InferFFTForward(input); + ms::pynative::RunAsdSipFFTOp("asdFftExecC2C", param, input, output); + return output; +} + +auto pyboost_npu_fft(const ms::Tensor &input, int64_t n, int64_t batch_size) { + return ms::pynative::PyboostRunner::Call<1>(npu_fft, input, n, batch_size); +} + +PYBIND11_MODULE(MS_EXTENSION_NAME, m) { + m.def("fft", &pyboost_npu_fft, "FFT C2C", pybind11::arg("input"), pybind11::arg("n"), + pybind11::arg("batch_size")); +} +``` + +### 1. 
推导算子的输出信息 + +```cpp +ms::Tensor InferFFT1DForward(const ms::Tensor &input) { + ShapeVector out_tensor_shape(input.shape()); + return ms::Tensor(input.data_type(), out_tensor_shape); +} +``` + +对于`FftC2C`算子,输出张量的数据类型和输入的一样。推导出输出形状之后,通过`ms::Tensor`构造函数构造一个空的张量。 + +### 2. 创建并设置算子属性结构体 + +```cpp +ms::pynative::FFTParam param; +param.fftXSize = n; +param.fftYSize = 0; +param.fftType = ms::pynative::asdFftType::ASCEND_FFT_C2C; +param.direction = ms::pynative::asdFftDirection::ASCEND_FFT_FORWARD; +param.batchSize = batch_size; +param.dimType = ms::pynative::asdFft1dDimType::ASCEND_FFT_HORIZONTAL; +``` + +### 3. 调用RunAtbOp接口执行算子 + +```cpp +ms::pynative::PyboostRunner::Call<1>(npu_fft, input, n, batch_size); +``` + +这是一个模板接口,其等效于: + +```cpp +auto runner = std::make_shared("FFTExecC2C"); +runner->Init(fft_param); +runner->Run({input}, {output}); +``` + +传入算子名、属性、输入张量、输出张量几个信息,即可调用相应的ASDSIP算子。此接口支持了动态图的多级流水执行流程。 + +### 4. 通过pybind11将C++函数绑定一个Python函数 + +```cpp +auto pyboost_npu_fft(const ms::Tensor &input, int64_t n, int64_t batch_size) { + return ms::pynative::PyboostRunner::Call<1>(npu_fft, input, n, batch_size); +} + +PYBIND11_MODULE(MS_EXTENSION_NAME, m) { + m.def("fft", &pyboost_npu_fft, "FFT C2C", pybind11::arg("input"), pybind11::arg("n"), + pybind11::arg("batch_size")); +} +``` + +### 5. 使用CustomOpBuilder编译自定义算子 + +将上述C++代码保存成文件`asdsip_fftc2c.cpp`,然后使用Python接口`CustomOpBuilder`编译。 + +```python +input_np = np.random.rand(2, 16) +real_np = input_np.astype(np.float32) +imag_np = input_np.astype(np.float32) +complex_np = real_np + 1j * imag_np +my_ops = CustomOpBuilder("asdsip_fftc2c", "jit_test_files/asdsip_fftc2c.cpp", enable_asdsip=True).load() +output_tensor = my_ops.fft(input_tensor, 16, 2) +print(output_tensor) +``` + +这里向`CustomOpBuilder`传入了`enable_asdsip=True`的参数,MindSpore会自动添加与ASDSIP加速库有关的编译和链接选项。用户需保证正确执行了ASDSIP库的`set_env.sh`脚本,环境中已配置`ASDSIP_HOME_PATH`环境变量。 diff --git a/tutorials/source_zh_cn/debug/dump.md b/tutorials/source_zh_cn/debug/dump.md index 444a145e262989c41b3d6a681b3c581062725eb6..1077c97419d2c7735e3de20738a275e61758102d 100644 --- a/tutorials/source_zh_cn/debug/dump.md +++ b/tutorials/source_zh_cn/debug/dump.md @@ -155,7 +155,7 @@ MindSpore在不同后端下支持的Dump功能如下表所示: - "negative inf count": 表示Tensor中`-Inf`元素的个数; - "positive inf count": 表示Tensor中`+Inf`元素的个数; - "zero count": 表示Tensor中元素`0`的个数; - - "md5": 表示Tensor的MD5值; + - "hash": 表示Tensor的哈希特征值,默认使用SHA1算法,也可写作"hash:sha1",该算法计算更快,推荐优先使用;"hash:md5"表示Tensor的MD5值,与先前版本的"md5"配置项结果相同。 - "l2norm": 表示Tensor的L2Norm值,支持在device统计和在host统计。 以上除了标记了支持device统计的,其他都仅支持在host统计。 @@ -168,7 +168,7 @@ MindSpore在不同后端下支持的Dump功能如下表所示: - `enable`:设置成true,表示开启同步Dump;设置成false时,采用异步Dump。不设置该字段时默认值为false,开启异步Dump。两者的区别是异步Dump对原本代码执行过程的影响更小。 - `trans_flag`:开启格式转换,将设备上的数据格式转换成NCHW格式。若为`true`,则数据会以Host侧的4D格式(NCHW)格式保存;若为`false`,则保留Device侧的数据格式。该配置参数在CPU上无效,因为CPU上没有format转换。默认值:true。 - - `stat_calc_mode`:选择统计信息计算后端,可选"host"和"device"。选择"device"后可以使能device计算统计信息,当前只在Ascend生效,只支持`min/max/avg/l2norm`统计量。在op_debug_mode设置为3时,仅支持将`stat_calc_mode`设置为"host"。 + - `stat_calc_mode`:选择统计信息计算后端,可选"host"和"device"。选择"device"后可以使能device计算统计信息,当前只在Ascend生效,只支持`min/max/avg/l2norm`统计量。在op_debug_mode设置为3时,仅支持将`stat_calc_mode`设置为"host"。默认值:"host"。 - `device_stat_precision_mode`(可选):device统计信息精度模式,可选"high"和"low"。选择"high"时,`avg/l2norm`统计量使用float32进行计算,会增加device内存占用,精度更高;为"low"时使用与原始数据相同的类型进行计算,device内存占用较少,但在处理较大数值时可能会导致统计量溢出。默认值为"high"。 - `sample_mode`(可选):设置成0,表示不开启切片dump功能;设置成1时,在图编译后端为ms_backend的情况下开启切片dump功能。仅在op_debug_mode设置为0时生效,其他场景不会开启切片dump功能。 - 
`sample_num`(可选):用于控制切片dump中切片的大小。默认值为100。 diff --git a/tutorials/source_zh_cn/index.rst b/tutorials/source_zh_cn/index.rst index 4fe672c75125f04f3e7a0318d58b8e6f981dcd55..0fe6549850dad434913ce6b84faeb9e88efcee20 100644 --- a/tutorials/source_zh_cn/index.rst +++ b/tutorials/source_zh_cn/index.rst @@ -148,7 +148,7 @@ MindSpore教程
- +
数据处理 @@ -234,7 +234,7 @@ MindSpore教程
- +
推理 diff --git a/tutorials/source_zh_cn/model_infer/ms_infer/ms_infer_model_infer.rst b/tutorials/source_zh_cn/model_infer/ms_infer/ms_infer_model_infer.rst index cca47ae04ffb1a0c8c4fe8ac7a13bdb1fa6a52c4..edad80d5899e999f72211d38595267a042ef9a22 100644 --- a/tutorials/source_zh_cn/model_infer/ms_infer/ms_infer_model_infer.rst +++ b/tutorials/source_zh_cn/model_infer/ms_infer/ms_infer_model_infer.rst @@ -96,7 +96,7 @@ MindSpore大语言模型为了能够实现最优的性价比,针对大语言 - **Flash Attention**:Attention计算中会存在两个大矩阵相乘(4K大小),实际计算会将大矩阵分解为多个芯片能够计算的小矩阵单元进行计算,由于芯片的最小级的缓存大小限制,需要不断地将待计算数据在缓存和主存间搬入搬出,导致计算资源实际无法充分利用,因此当前主流芯片下,Attention计算实际上是带宽bound。Flash Attention技术将原本Attention进行分块,使得每一块计算都能够在芯片上独立计算完成,避免了在计算Key和Value时多次数据的搬入和搬出,从而提升Attention计算性能,具体可以参考 `Flash Attention `_。 - - **Page Attention显存优化**:标准的Flash Attention每次会读取和保存整个输入的Key和Value数据,这种方式虽然比较简单,但是会造成较多的资源浪费,如“中国的首都”和“中国的国旗”,都有共同的“中国的”作为公共前缀,其Attention对应的Key和Value值实际上是一样的,标准Flash Attention就需要存两份Key和Value,导致显存浪费。Page Attention基于Linux操作系统页表原理对KVCache进行优化,按照特定大小的块来存储Key和Value的数据,将上面例子中的Key和Value存储为“中国”、“的”、“首都”、“国旗”一共四份Key和Value数据,相比原来的六份数据,有效地节省了显存资源。在服务化的场景下,更多空闲显存可以让模型推理的batch更大,从而获得更高的吞吐量,具体可以参考 `Page Attention `_。 + - **Paged Attention**:标准的Flash Attention每次会读取和保存整个输入的Key和Value数据,这种方式虽然比较简单,但是会造成较多的资源浪费,当一个batch中多个请求序列长度不一致时,Flash Attention需要key和value用最长的序列的显存,如“中国的首都是北京“和“中国的国旗是五星红旗”,假设分词按字分词,则需要10 * 2 = 20个KVCache显存单元。Paged Attention基于Linux操作系统页表原理对KVCache进行优化,按照特定大小的块来存储Key和Value的数据,如块大小为2时,可以按照块使用KVCache,只需要4 * 2 + 5 * 2 = 18个KVCache的显存单元,由于Paged Attention离散的特性,也可以结合Prefix Cache这类技术进一步节省“中国的”所占用的显存,最终只需要3 * 2 + 5 * 2 = 16个KVCache单元。在服务化的场景下,更多空闲显存可以让模型推理的batch更大,从而获得更高的吞吐量,具体可以参考 `Page Attention `_。 - **模型量化**:MindSpore大语言模型推理支持通过量化技术减小模型体积,提供了A16W8、A16W4、A8W8量化以及KVCache量化等技术,减少模型资源占用,提升推理吞吐量。 diff --git a/tutorials/source_zh_cn/parallel/comm_fusion.md b/tutorials/source_zh_cn/parallel/comm_fusion.md index f5bd3c2bfc9d3d36656c7501a588402ea9d461b0..dca3d1344eb452f5cde67bd1ca9753dc9ecd5b8f 100644 --- a/tutorials/source_zh_cn/parallel/comm_fusion.md +++ b/tutorials/source_zh_cn/parallel/comm_fusion.md @@ -4,7 +4,7 @@ ## 简介 -在分布式并行训练场景下训练大规模参数量的模型(如GPT-3, Pangu-$\alpha$),跨设备甚至跨节点的数据传输是制约扩展性以及算力利用率的瓶颈[1]。通信融合是一种提升网络资源利用率、加速数据传输效率的重要方法,其将相同源节点和目的节点的通信算子打包同时执行,以避免多个单算子执行带来的额外开销。 +在分布式并行训练场景下训练大规模参数量的模型(如GPT-3、Pangu-$\alpha$),跨设备甚至跨节点的数据传输是制约扩展性以及算力利用率的瓶颈[1]。通信融合是一种提升网络资源利用率、加速数据传输效率的重要方法,其将相同源节点和目的节点的通信算子打包同时执行,以避免多个单算子执行带来的额外开销。 MindSpore支持对分布式训练中三种常用通信算子([AllReduce](https://www.mindspore.cn/docs/zh-CN/master/api_python/ops/mindspore.ops.AllReduce.html)、[AllGather](https://www.mindspore.cn/docs/zh-CN/master/api_python/ops/mindspore.ops.AllGather.html)、[ReduceScatter](https://www.mindspore.cn/docs/zh-CN/master/api_python/ops/mindspore.ops.ReduceScatter.html))的融合,并提供简洁易用的接口方便用户自行配置。在长稳训练任务支撑中,通信融合特性发挥了重要作用。 @@ -54,7 +54,7 @@ MindSpore提供两种接口来使能通信融合,下面分别进行介绍: 2. 
利用`Cell`提供的接口 - 无论在哪种并行模式场景下,用户都可以通过[Cell.set_comm_fusion](https://www.mindspore.cn/docs/zh-CN/master/api_python/nn/mindspore.nn.Cell.html#mindspore.nn.Cell.set_comm_fusion)接口为模型某layer的参数设置index,MindSpore将融合相同index的参数所对应的通信算子。 + 无论在哪种并行模式场景下,用户都可以通过[Cell.set_comm_fusion](https://www.mindspore.cn/docs/zh-CN/master/api_python/nn/mindspore.nn.Cell.html#mindspore.nn.Cell.set_comm_fusion)接口为模型某个layer的参数设置index,MindSpore将融合相同index的参数所对应的通信算子。 ## 操作实践 @@ -91,7 +91,7 @@ net.comm_fusion(config={"allreduce": {"mode": "auto", "config": None}}) init() ``` -若将所有的同类通信算子融合成一个算子,在当前训练迭代中,传输需要等待计算完全结束后才能执行,这样会造成设备的等待。 +若将所有的同类通信算子融合成一个算子,在当前训练迭代中,需要等待计算完全结束后才能执行传输,这样会造成设备的等待。 为了避免上述问题,可以将网络参数进行分组融合:在下一组参数进行的计算的同时,进行上组参数的通信,使得计算和通信能够互相隐藏,可以通过限定fusion buffer的大小,或者index分区的方法进行分组融合。 diff --git a/tutorials/source_zh_cn/parallel/dataset_slice.md b/tutorials/source_zh_cn/parallel/dataset_slice.md index fcedd8e4146913739ddbfc66e6db17528ae6263f..709c7135dd3d32135f26e3e58282fba5bb670b25 100644 --- a/tutorials/source_zh_cn/parallel/dataset_slice.md +++ b/tutorials/source_zh_cn/parallel/dataset_slice.md @@ -4,7 +4,7 @@ ## 简介 -在进行分布式训练时,以图片数据为例,当单张图片的大小过大时,如遥感卫星等大幅面图片,当单张图片过大时,需要对图片进行切分,每张卡读取一部分图片,进行分布式训练。处理数据集切分的场景,需要配合模型并行一起才能达到预期的降低显存的效果,因此,基于自动并行提供了该项功能。本教程使用的样例不是大幅面的网络,仅作示例。真实应用到大幅面的网络时,往往需要详细设计并行策略。 +在进行分布式训练时,以图片数据为例,当单张图片的大小过大时,如遥感卫星等大幅面图片,需要对图片进行切分,每张卡读取一部分图片,进行分布式训练。处理数据集切分的场景,需要配合模型并行一起才能达到预期的降低显存的效果,因此,基于自动并行提供了该项功能。本教程使用的样例不是大幅面的网络,仅作示例。真实应用到大幅面的网络时,往往需要详细设计并行策略。 > 数据集切分在数据并行模式下不涉及。 diff --git a/tutorials/source_zh_cn/parallel/high_dimension_tensor_parallel.md b/tutorials/source_zh_cn/parallel/high_dimension_tensor_parallel.md index 104c534b22c1fc49c05ac3b98889b46f42e6265a..3781617b1ddd6b3eb5a4281d8b2670a5350b1f69 100644 --- a/tutorials/source_zh_cn/parallel/high_dimension_tensor_parallel.md +++ b/tutorials/source_zh_cn/parallel/high_dimension_tensor_parallel.md @@ -6,11 +6,11 @@ 大模型训练中,模型并行能够有效减少内存负荷,但其引入的通信是一个显著的性能瓶颈。因此需要优化整网模型切分策略以期引入最小的通信量。 -张量并行(Tensor Parallel,简称TP)训练是将一个张量沿特定维度分成 `N` 块,每个设备只持有整个张量的 `1/N`,进行MatMul/BatchMatMul等算子计算,并引入额外通信保证最终结果正确。而高维张量并行则允许灵活控制对张量的切分次数和切分轴,支持1D、2D、3D切分。2D/3D切分相对与1D切分,在合适的切分策略下,通信量随着TP设备数增长更慢,在TP设备数较大时有着更低的额外通信量,达到提高训练速度的目的。 +张量并行(Tensor Parallel,简称TP)训练是将一个张量沿特定维度分成 `N` 块,每个设备只持有整个张量的 `1/N`,进行MatMul/BatchMatMul等算子计算,并引入额外通信保证最终结果正确。而高维张量并行则允许灵活控制对张量的切分次数和切分轴,支持1D、2D、3D切分。2D/3D切分相对于1D切分,在合适的切分策略下,通信量随着TP设备数增长更慢,在TP设备数较大时有着更低的额外通信量,达到提高训练速度的目的。 > 本特性支持的硬件平台为Ascend,需要在Graph模式、半自动并行下运行。 -使用场景:在半自动模式下,网络中存在张量并行,且训练卡数较多时(一般不少于8卡)时,对MatMul/BatchMatMul进行2D/3D张量并行策略配置,并适配上下游算子的切分策略,可获得训练性能提升。 +使用场景:在半自动模式下,网络中存在张量并行,且训练卡数较多时(一般不少于8卡),对MatMul/BatchMatMul进行2D/3D张量并行策略配置,并适配上下游算子的切分策略,可获得训练性能提升。 ### 原理 diff --git a/tutorials/source_zh_cn/parallel/host_device_training.md b/tutorials/source_zh_cn/parallel/host_device_training.md index 63386c8a0885c7a820439be8d7aa9ff2ab2574cb..96207571219cd7771f9b5dcec357eafe745bab43 100644 --- a/tutorials/source_zh_cn/parallel/host_device_training.md +++ b/tutorials/source_zh_cn/parallel/host_device_training.md @@ -10,13 +10,13 @@ ### 基本原理 -流水线并行和算子级并行适用于模型的算子数量较大,同时参数较均匀的分布在各个算子中。如果模型中的算子数量较少,同时参数只集中在几个算子中呢?Wide&Deep就是这样的例子,如下图所示。Wide&Deep中的Embedding table作为需训练的参数可达几百GB甚至几TB,若放在加速器(device)上执行,那么所需的加速器数量巨大,训练费用昂贵。另一方面,若使用加速器计算,其获得的训练加速有限,同时会引发跨服务器的通信量,端到端的训练效率不会很高。 +流水线并行和算子级并行适用于模型算子数量较多,且参数较均匀地分布在各算子中的场景。若模型算子较少而参数集中在少数算子中,则需要采用不同策略。Wide & Deep 是典型例子,如下图所示。Wide&Deep中的Embedding table作为需训练的参数可达几百GB甚至几TB,若放在加速器(device)上执行,那么所需的加速器数量巨大,训练费用昂贵。另一方面,若使用加速器计算,其获得的训练加速有限,同时会引发跨服务器的通信量,端到端的训练效率不会很高。 ![image](./images/host_device_image_0_zh.png) 
*图:Wide&Deep模型的部分结构* -仔细分析Wide&Deep模型的特殊结构后可得:Embedding table虽然参数量巨大,但其参与的计算量很少,可以将Embedding table和其对应的算子EmbeddingLookup算子放置在Host端计算,其余算子放置在加速器端。这样做能够同时发挥Host端内存量大、加速器端计算快的特性,同时利用了同一台服务器的Host到加速器高带宽的特性。下图展示了Wide&Deep异构切分的方式: +仔细分析Wide&Deep模型的特殊结构后可得:Embedding table虽然参数量巨大,但其参与的计算量很少,可以将Embedding table和其对应的EmbeddingLookup算子放置在Host端计算,其余算子放置在加速器端。这样做能够同时发挥Host端内存量大、加速器端计算快的特性,同时利用了同一台服务器的Host到加速器高带宽的特性。下图展示了Wide&Deep异构切分的方式: ![image](./images/host_device_image_1_zh.png) @@ -69,6 +69,7 @@ init() import os import mindspore as ms import mindspore.dataset as ds +from mindspore.communication import get_rank, get_group_size ms.set_seed(1) @@ -93,7 +94,7 @@ data_set = create_dataset(32) ### 网络定义 -网络定义与单卡网络区别在于,配置[ops.Add()](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.Add.html)算子在主机端运行,代码如下: +网络定义与单卡网络区别在于,配置[ops.Add()](https://www.mindspore.cn/docs/zh-CN/master/api_python/ops/mindspore.ops.Add.html)算子在主机端运行,代码如下: ```python import mindspore as ms @@ -180,7 +181,7 @@ for epoch in range(5): bash run.sh ``` -训练完后,关于Loss部分结果保存在`log_output/worker_*.log`中,示例如下: +训练完成后,关于Loss部分结果保存在`log_output/worker_*.log`中,示例如下: ```text epoch: 0, step: 0, loss is 2.302936 diff --git a/tutorials/source_zh_cn/parallel/msrun_launcher.md b/tutorials/source_zh_cn/parallel/msrun_launcher.md index c0e48a22916c6e1a57106540fb8e217f822a43cd..56a8e1fa7a169417307035d7883905e6e554dbc7 100644 --- a/tutorials/source_zh_cn/parallel/msrun_launcher.md +++ b/tutorials/source_zh_cn/parallel/msrun_launcher.md @@ -4,7 +4,7 @@ ## 概述 -`msrun`是[动态组网](https://www.mindspore.cn/tutorials/zh-CN/master/parallel/dynamic_cluster.html)启动方式的封装,用户可使用`msrun`,以单个命令行指令的方式在各节点拉起多进程分布式任务,并且无需手动设置[动态组网环境变量](https://www.mindspore.cn/tutorials/zh-CN/master/parallel/dynamic_cluster.html)。`msrun`同时支持`Ascend`,`GPU`和`CPU`后端。与`动态组网`启动方式一样,`msrun`无需依赖第三方库以及配置文件。 +`msrun`是[动态组网](https://www.mindspore.cn/tutorials/zh-CN/master/parallel/dynamic_cluster.html)启动方式的封装,用户可使用`msrun`,以单个命令行指令的方式在各节点拉起多进程分布式任务,并且无需手动设置[动态组网环境变量](https://www.mindspore.cn/tutorials/zh-CN/master/parallel/dynamic_cluster.html)。`msrun`同时支持`Ascend`、`GPU`和`CPU`后端。与`动态组网`启动方式一样,`msrun`无需依赖第三方库以及配置文件。 > - `msrun`在用户安装MindSpore后即可使用,可使用指令`msrun --help`查看支持参数。 > - `msrun`支持`图模式`以及`PyNative模式`。 diff --git a/tutorials/source_zh_cn/parallel/optimize_technique.rst b/tutorials/source_zh_cn/parallel/optimize_technique.rst index 4a4df80d3f93bd596f480bbd8ebce011244be8a3..2a1bfe268eab6dfcf457612ea346e4136e2a91c5 100644 --- a/tutorials/source_zh_cn/parallel/optimize_technique.rst +++ b/tutorials/source_zh_cn/parallel/optimize_technique.rst @@ -21,7 +21,7 @@ 考虑到实际并行训练中,可能会对训练性能、吞吐量或规模有要求,可以从三个方面考虑优化:并行策略优化、内存优化和通信优化 -- 并行策略优化:并行策略优化主要包括并行策略的选择、算子级并行下的切分技巧以及多副本技巧。 +- 并行策略优化:主要包括并行策略的选择、算子级并行下的切分技巧以及多副本技巧。 - `策略选择 `_:根据模型规模和数据量大小,可以选择不同的并行策略,以提高训练效率和资源利用率。 - `切分技巧 `_:切分技巧是指通过手动配置某些关键算子的切分策略,减少张量重排布来提升训练效率。 diff --git a/tutorials/source_zh_cn/parallel/overview.md b/tutorials/source_zh_cn/parallel/overview.md index 3da1bc241819a4971bf00d48d8899053f28daf8f..b5a157c5814ae8706ccca7f7306f75e6771dab02 100644 --- a/tutorials/source_zh_cn/parallel/overview.md +++ b/tutorials/source_zh_cn/parallel/overview.md @@ -39,7 +39,7 @@ MindSpore提供两种粒度的算子级并行能力:算子级并行和高阶 ## 流水线并行 -近年来,神经网络的规模几乎是呈指数型增长。受单卡内存的限制,训练这些大模型用到的设备数量也在不断增加。受server间通信带宽低的影响,传统数据并行叠加模型并行的这种混合并行模式的性能表现欠佳,需要引入流水线并行。流水线并行能够将模型在空间上按阶段(Stage)进行切分,每个Stage只需执行网络的一部分,大大节省了内存开销,同时缩小了通信域,缩短了通信时间。MindSpore能够根据用户的配置,将单机模型自动地转换成流水线并行模式去执行。 
+近年来,神经网络的规模几乎呈指数型增长。受单卡内存的限制,训练这些大模型用到的设备数量也在不断增加。受server间通信带宽低的影响,传统数据并行叠加模型并行的这种混合并行模式的性能表现欠佳,需要引入流水线并行。流水线并行能够将模型在空间上按阶段(Stage)进行切分,每个Stage只需执行网络的一部分,大大节省了内存开销,同时缩小了通信域,缩短了通信时间。MindSpore能够根据用户的配置,将单机模型自动地转换成流水线并行模式去执行。 详细可参考[流水线并行](https://www.mindspore.cn/tutorials/zh-CN/master/parallel/pipeline_parallel.html)章节。 diff --git a/tutorials/source_zh_cn/parallel/pipeline_parallel.md b/tutorials/source_zh_cn/parallel/pipeline_parallel.md index e38b87ae28d475f347e6b69f21f734f5b4fe9c04..3a8eaa4165b3d6fb8165419489eef6c4860e7ae3 100644 --- a/tutorials/source_zh_cn/parallel/pipeline_parallel.md +++ b/tutorials/source_zh_cn/parallel/pipeline_parallel.md @@ -4,7 +4,7 @@ ## 简介 -近年来,神经网络的规模几乎是呈指数型增长。受单卡内存的限制,训练这些大模型用到的设备数量也在不断增加。受server间通信带宽低的影响,传统数据并行叠加模型并行的这种混合并行模式的性能表现欠佳,需要引入流水线并行。流水线并行能够将模型在空间上按阶段(Stage)进行切分,每个Stage只需执行网络的一部分,大大节省了内存开销,同时缩小了通信域,缩短了通信时间。MindSpore能够根据用户的配置,将单机模型自动地转换成流水线并行模式去执行。 +近年来,神经网络的规模几乎呈指数型增长。受单卡内存的限制,训练这些大模型用到的设备数量也在不断增加。受server间通信带宽低的影响,传统数据并行叠加模型并行的这种混合并行模式的性能表现欠佳,需要引入流水线并行。流水线并行能够将模型在空间上按阶段(Stage)进行切分,每个Stage只需执行网络的一部分,大大节省了内存开销,同时缩小了通信域,缩短了通信时间。MindSpore能够根据用户的配置,将单机模型自动转换成流水线并行模式去执行。 ## 训练操作实践 diff --git a/tutorials/source_zh_cn/parallel/split_technique.md b/tutorials/source_zh_cn/parallel/split_technique.md index a6f1ef019c1fa2938acaecbdf7b4e323b7648155..9a23e6a058ef4a40a32b2e557dc27118921853cd 100644 --- a/tutorials/source_zh_cn/parallel/split_technique.md +++ b/tutorials/source_zh_cn/parallel/split_technique.md @@ -118,7 +118,7 @@ class CoreAttention(nn.Cell): -再看[FlashAttention](https://gitee.com/mindspore/mindformers/blob/master/mindformers/modules/flash_attention.py)的例子: +再看[FlashAttention](https://gitee.com/mindspore/mindformers/blob/master/mindformers/modules/flash_attention.py)的例子:
@@ -193,4 +193,4 @@ class LlamaForCausalLM(LlamaPretrainedModel):
-**用户无法确认是否需要对算子配置策略时,可以不配置,由算法传播找寻最优策略,但是可能无法获得最佳的并行效果;若用户能够确认该算子需要配置什么策略,则可以进行配置帮助算法获得预期效果。** +**用户无法确认是否需要对算子配置策略时,可以不配置,由算法传播找寻最优策略,但是可能无法获得最佳的并行效果;若用户能够确认该算子需要配置什么策略,则可以进行配置,帮助算法获得预期效果。**
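To make the recommendation above concrete, a minimal sketch of mixing an explicit strategy with propagation might look as follows (an assumption-laden example: it presumes an 8-device setup and uses the long-standing `set_auto_parallel_context`/`Primitive.shard` interfaces; the layer and the strategy values are illustrative only):

```python
import mindspore as ms
from mindspore import nn, ops

# Sharding propagation derives strategies for operators left unconfigured.
ms.set_auto_parallel_context(parallel_mode="auto_parallel",
                             search_mode="sharding_propagation",
                             device_num=8)

class FeedForward(nn.Cell):
    def __init__(self):
        super().__init__()
        self.matmul1 = ops.MatMul()
        self.matmul2 = ops.MatMul()
        # The user is confident about this operator: split the reduction dimension by 2
        # and the second weight dimension by 4 (2 * 4 = 8 devices in total).
        self.matmul1.shard(((1, 2), (2, 4)))
        # matmul2 is intentionally left without a strategy; propagation fills it in.

    def construct(self, x, w1, w2):
        return self.matmul2(self.matmul1(x, w1), w2)
```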