From 4811ba62e0982f9750cfd6ccd998249f78fdb52e Mon Sep 17 00:00:00 2001
From: SILONG ZENG <2609716663@qq.com>
Date: Thu, 15 Jan 2026 09:06:01 +0800
Subject: [PATCH] [Lint]Style: reformat markdown files via markdownlint (#5884)

### What this PR does / why we need it?

reformat markdown files via markdownlint

- vLLM version: v0.13.0
- vLLM main: https://github.com/vllm-project/vllm/commit/bde38c11df0ea066a740efe9b77fff5418be45df

---------

Signed-off-by: root
Signed-off-by: MrZ20 <2609716663@qq.com>
Co-authored-by: root
---
 .github/workflows/_pre_commit.yml | 2 +
 .github/workflows/matchers/markdownlint.json | 17 ++
 .markdownlint.yaml | 15 ++
 .pre-commit-config.yaml | 9 +-
 README.md | 15 +-
 README.zh.md | 16 +-
 benchmarks/README.md | 69 +++---
 benchmarks/scripts/perf_result_template.md | 1 +
 docs/README.md | 7 +-
 docs/source/community/governance.md | 3 +
 docs/source/community/versioning_policy.md | 9 +-
 .../developer_guide/contribution/index.md | 1 +
 .../contribution/multi_node_test.md | 6 +-
 .../developer_guide/contribution/testing.md | 2 +-
 .../evaluation/using_ais_bench.md | 9 +-
 .../evaluation/using_evalscope.md | 8 +-
 .../evaluation/using_lm_eval.md | 19 +-
 .../evaluation/using_opencompass.md | 10 +-
 .../feature_guide/ACL_Graph.md | 5 +-
 .../feature_guide/KV_Cache_Pool_Guide.md | 12 +-
 .../ModelRunner_prepare_inputs.md | 53 ++++-
 .../feature_guide/context_parallel.md | 1 +
 .../feature_guide/disaggregated_prefill.md | 6 +-
 .../feature_guide/eplb_swift_balancer.md | 39 ++-
 .../developer_guide/feature_guide/patch.md | 11 +-
 .../performance_and_debug/msprobe_guide.md | 10 +-
 .../optimization_and_tuning.md | 7 +
 .../performance_benchmark.md | 5 +-
 .../profile_execute_duration.md | 5 +-
 .../service_profiling_guide.md | 7 +
 docs/source/faqs.md | 24 +-
 docs/source/installation.md | 11 +-
 docs/source/quick_start.md | 5 +-
 docs/source/tutorials/DeepSeek-R1.md | 4 +
 docs/source/tutorials/DeepSeek-V3.1.md | 8 +-
 docs/source/tutorials/DeepSeek-V3.2.md | 20 +-
 docs/source/tutorials/GLM4.x.md | 4 +
 docs/source/tutorials/Kimi-K2-Thinking.md | 1 +
 docs/source/tutorials/Qwen-VL-Dense.md | 7 +-
 docs/source/tutorials/Qwen2.5-7B.md | 2 +
 docs/source/tutorials/Qwen2.5-Omni.md | 1 +
 docs/source/tutorials/Qwen3-235B-A22B.md | 22 +-
 docs/source/tutorials/Qwen3-32B-W4A4.md | 2 +-
 docs/source/tutorials/Qwen3-8B-W4A8.md | 5 +-
 docs/source/tutorials/Qwen3-Dense.md | 21 +-
 docs/source/tutorials/Qwen3-Next.md | 1 +
 .../tutorials/Qwen3-Omni-30B-A3B-Thinking.md | 10 +-
 .../tutorials/Qwen3-VL-235B-A22B-Instruct.md | 13 +-
 docs/source/tutorials/Qwen3_embedding.md | 5 +
 docs/source/tutorials/Qwen3_reranker.md | 4 +
 ...ng_sequence_context_parallel_multi_node.md | 9 +-
 ...g_sequence_context_parallel_single_node.md | 11 +-
 .../pd_disaggregation_mooncake_multi_node.md | 5 +-
 .../pd_disaggregation_mooncake_single_node.md | 3 +-
 docs/source/tutorials/ray.md | 13 +-
 .../configuration/additional_config.md | 2 +-
 .../deployment_guide/using_volcano_kthena.md | 10 +-
 .../feature_guide/Fine_grained_TP.md | 16 +-
 .../feature_guide/Multi_Token_Prediction.md | 16 +-
 .../feature_guide/context_parallel.md | 18 +-
 .../user_guide/feature_guide/dynamic_batch.md | 2 +
 .../feature_guide/eplb_swift_balancer.md | 10 +-
 .../user_guide/feature_guide/external_dp.md | 17 +-
 .../user_guide/feature_guide/graph_mode.md | 2 +
 .../user_guide/feature_guide/kv_pool.md | 49 ++--
 .../feature_guide/large_scale_ep.md | 4 +-
 .../feature_guide/layer_sharding.md | 5 +-
 docs/source/user_guide/feature_guide/lora.md | 6 +-
 .../feature_guide/speculative_decoding.md | 1 +
 .../feature_guide/ucm_deployment.md | 11 +-
 docs/source/user_guide/release_notes.md | 223 ++++++++++++------
 .../support_matrix/supported_features.md | 2 +-
 .../support_matrix/supported_models.md | 2 +-
 .../mooncake_connector_deployment_guide.md | 30 +--
 examples/external_online_dp/README.md | 3 +
 75 files changed, 711 insertions(+), 308 deletions(-)
 create mode 100644 .github/workflows/matchers/markdownlint.json
 create mode 100644 .markdownlint.yaml

diff --git a/.github/workflows/_pre_commit.yml b/.github/workflows/_pre_commit.yml
index bc13d86d..e4f89e0c 100644
--- a/.github/workflows/_pre_commit.yml
+++ b/.github/workflows/_pre_commit.yml
@@ -28,9 +28,11 @@ jobs:
       - name: cp problem matchers
         run: |
           cp .github/workflows/matchers/actionlint.json "$RUNNER_TEMP/actionlint.json"
+          cp .github/workflows/matchers/markdownlint.json "$RUNNER_TEMP/markdownlint.json"
           cp .github/workflows/matchers/mypy.json "$RUNNER_TEMP/mypy.json"
       - run: echo "::add-matcher::$RUNNER_TEMP/actionlint.json"
+      - run: echo "::add-matcher::$RUNNER_TEMP/markdownlint.json"
+      - run: echo "::add-matcher::$RUNNER_TEMP/mypy.json"
 
       - name: Checkout vllm-project/vllm repo
diff --git a/.github/workflows/matchers/markdownlint.json b/.github/workflows/matchers/markdownlint.json
new file mode 100644
index 00000000..28ad47eb
--- /dev/null
+++ b/.github/workflows/matchers/markdownlint.json
@@ -0,0 +1,17 @@
+{
+  "problemMatcher": [
+    {
+      "owner": "markdownlint",
+      "pattern": [
+        {
+          "regexp": "^([^:]*):(\\d+):?(\\d+)?\\s([\\w-\\/]*)\\s(.*)$",
+          "file": 1,
+          "line": 2,
+          "column": 3,
+          "code": 4,
+          "message": 5
+        }
+      ]
+    }
+  ]
+}
diff --git a/.markdownlint.yaml b/.markdownlint.yaml
new file mode 100644
index 00000000..d0a8c7f4
--- /dev/null
+++ b/.markdownlint.yaml
@@ -0,0 +1,15 @@
+MD007:
+  indent: 4
+MD013: false
+MD024:
+  siblings_only: true
+MD031:
+  list_items: false
+MD029: false
+MD036: false
+MD033: false
+MD041: false
+MD046: false
+MD052: false
+MD053: false
+MD059: false
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 2041e01f..fd2dc626 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -26,11 +26,12 @@ repos:
 #     files: ^csrc/.*\.(cpp|hpp|cc|hh|cxx|hxx)$
 #     types_or: [c++]
 #     args: [--style=google, --verbose]
-- repo: https://github.com/jackdewinter/pymarkdown
-  rev: v0.9.29
+- repo: https://github.com/igorshubovych/markdownlint-cli
+  rev: v0.45.0
   hooks:
-  - id: pymarkdown
-    args: [fix]
+  - id: markdownlint
+    exclude: '.*\.inc\.md$|.*report_template\.md$|.*contributors\.md$|.*PULL_REQUEST_TEMPLATE\.md$'
+    stages: [manual] # Only run in CI
 - repo: https://github.com/rhysd/actionlint
   rev: v1.7.7
   hooks:
diff --git a/README.md b/README.md
index a3217665..5b22596d 100644
--- a/README.md
+++ b/README.md
@@ -19,6 +19,7 @@ vLLM Ascend Plugin
 
 ---
 *Latest News* 🔥
+
 - [2025/12] We released the new official version [v0.11.0](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.11.0)! Please follow the [official guide](https://docs.vllm.ai/projects/ascend/en/v0.11.0/) to start using vLLM Ascend Plugin on Ascend.
 - [2025/09] We released the new official version [v0.9.1](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.9.1)! Please follow the [official guide](https://docs.vllm.ai/projects/ascend/en/v0.9.1/tutorials/large_scale_ep.html) to start deploy large scale Expert Parallelism (EP) on Ascend.
 - [2025/08] We hosted the [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/7n8OYNrCC_I9SJaybHA_-Q) with vLLM and Tencent! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF).
@@ -28,7 +29,9 @@ vLLM Ascend Plugin
 - [2025/03] We hosted the [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/VtxO9WXa5fC-mKqlxNUJUQ) with vLLM team! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF).
 - [2025/02] vLLM community officially created [vllm-project/vllm-ascend](https://github.com/vllm-project/vllm-ascend) repo for running vLLM seamlessly on the Ascend NPU.
 - [2024/12] We are working with the vLLM community to support [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162).
+
 ---
+
 ## Overview
 
 vLLM Ascend (`vllm-ascend`) is a community maintained hardware plugin for running vLLM seamlessly on the Ascend NPU.
@@ -42,10 +45,10 @@ By using vLLM Ascend plugin, popular open-source models, including Transformer-l
 - Hardware: Atlas 800I A2 Inference series, Atlas A2 Training series, Atlas 800I A3 Inference series, Atlas A3 Training series, Atlas 300I Duo (Experimental)
 - OS: Linux
 - Software:
-  * Python >= 3.10, < 3.12
-  * CANN == 8.3.rc2 (Ascend HDK version refers to [here](https://www.hiascend.com/document/detail/zh/canncommercial/83RC2/releasenote/releasenote_0000.html))
-  * PyTorch == 2.8.0, torch-npu == 2.8.0
-  * vLLM (the same version as vllm-ascend)
+  - Python >= 3.10, < 3.12
+  - CANN == 8.3.rc2 (Ascend HDK version refers to [here](https://www.hiascend.com/document/detail/zh/canncommercial/83RC2/releasenote/releasenote_0000.html))
+  - PyTorch == 2.8.0, torch-npu == 2.8.0
+  - vLLM (the same version as vllm-ascend)
 
 ## Getting Started
 
@@ -57,9 +60,11 @@ Please use the following recommended versions to get started quickly:
 |v0.11.0|Latest stable version|[QuickStart](https://docs.vllm.ai/projects/ascend/en/v0.11.0/quick_start.html) and [Installation](https://docs.vllm.ai/projects/ascend/en/v0.11.0/installation.html) for more details|
 
 ## Contributing
+
 See [CONTRIBUTING](https://docs.vllm.ai/projects/ascend/en/latest/developer_guide/contribution/index.html) for more details, which is a step-by-step guide to help you set up development environment, build and test.
 
 We welcome and value any contributions and collaborations:
+
 - Please let us know if you encounter a bug by [filing an issue](https://github.com/vllm-project/vllm-ascend/issues)
 - Please use [User forum](https://discuss.vllm.ai/c/hardware-support/vllm-ascend-support) for usage questions and help.
 
@@ -86,7 +91,7 @@ Please refer to [Versioning policy](https://docs.vllm.ai/projects/ascend/en/late
 
 ## Weekly Meeting
 
-- vLLM Ascend Weekly Meeting: https://tinyurl.com/vllm-ascend-meeting
+- vLLM Ascend Weekly Meeting: <https://tinyurl.com/vllm-ascend-meeting>
 - Wednesday, 15:00 - 16:00 (UTC+8, [Convert to your timezone](https://dateful.com/convert/gmt8?t=15))
 
 ## License
diff --git a/README.zh.md b/README.zh.md
index 3005fd61..c8c4c017 100644
--- a/README.zh.md
+++ b/README.zh.md
@@ -29,7 +29,9 @@ vLLM Ascend Plugin
 - [2025/03] 我们和vLLM团队举办了[vLLM Beijing Meetup](https://mp.weixin.qq.com/s/CGDuMoB301Uytnrkc2oyjg)! 你可以在[这里](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF)找到演讲材料.
 - [2025/02] vLLM社区正式创建了[vllm-project/vllm-ascend](https://github.com/vllm-project/vllm-ascend)仓库，让vLLM可以无缝运行在Ascend NPU。
 - [2024/12] 我们正在与 vLLM 社区合作，以支持 [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162).
+ --- + ## 总览 vLLM 昇腾插件 (`vllm-ascend`) 是一个由社区维护的让vLLM在Ascend NPU无缝运行的后端插件。 @@ -43,10 +45,10 @@ vLLM 昇腾插件 (`vllm-ascend`) 是一个由社区维护的让vLLM在Ascend NP - 硬件:Atlas 800I A2 Inference系列、Atlas A2 Training系列、Atlas 800I A3 Inference系列、Atlas A3 Training系列、Atlas 300I Duo(实验性支持) - 操作系统:Linux - 软件: - * Python >= 3.10, < 3.12 - * CANN == 8.3.rc2 (Ascend HDK 版本参考[这里](https://www.hiascend.com/document/detail/zh/canncommercial/83RC2/releasenote/releasenote_0000.html)) - * PyTorch == 2.8.0, torch-npu == 2.8.0 - * vLLM (与vllm-ascend版本一致) + - Python >= 3.10, < 3.12 + - CANN == 8.3.rc2 (Ascend HDK 版本参考[这里](https://www.hiascend.com/document/detail/zh/canncommercial/83RC2/releasenote/releasenote_0000.html)) + - PyTorch == 2.8.0, torch-npu == 2.8.0 + - vLLM (与vllm-ascend版本一致) ## 开始使用 @@ -58,13 +60,16 @@ vLLM 昇腾插件 (`vllm-ascend`) 是一个由社区维护的让vLLM在Ascend NP |v0.11.0| 最新正式/稳定版本 |[快速开始](https://docs.vllm.ai/projects/ascend/en/v0.11.0/quick_start.html) and [安装指南](https://docs.vllm.ai/projects/ascend/en/v0.11.0/installation.html)了解更多| ## 贡献 + 请参考 [CONTRIBUTING]((https://docs.vllm.ai/projects/ascend/en/latest/developer_guide/contribution/index.html)) 文档了解更多关于开发环境搭建、功能测试以及 PR 提交规范的信息。 我们欢迎并重视任何形式的贡献与合作: + - 请通过[Issue](https://github.com/vllm-project/vllm-ascend/issues)来告知我们您遇到的任何Bug。 - 请通过[用户论坛](https://discuss.vllm.ai/c/hardware-support/vllm-ascend-support)来交流使用问题和寻求帮助。 ## 分支策略 + vllm-ascend有主干分支和开发分支。 - **main**: 主干分支,与vLLM的主干分支对应,并通过昇腾CI持续进行质量看护。 @@ -86,8 +91,9 @@ vllm-ascend有主干分支和开发分支。 ## 社区例会 -- vLLM Ascend 每周社区例会: https://tinyurl.com/vllm-ascend-meeting +- vLLM Ascend 每周社区例会: - 每周三下午,15:00 - 16:00 (UTC+8, [查看您的时区](https://dateful.com/convert/gmt8?t=15)) ## 许可证 + Apache 许可证 2.0,如 [LICENSE](./LICENSE) 文件中所示。 diff --git a/benchmarks/README.md b/benchmarks/README.md index 64a55cc6..e1d8a018 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -1,8 +1,13 @@ -# Introduction +# vLLM Ascend Benchmarks + +## Introduction + This document outlines the benchmarking methodology for vllm-ascend, aimed at evaluating the performance under a variety of workloads. The primary goal is to help developers assess whether their pull requests improve or degrade vllm-ascend's performance. -# Overview +## Overview + **Benchmarking Coverage**: We measure latency, throughput, and fixed-QPS serving on the Atlas800I A2 (see [quick_start](../docs/source/quick_start.md) to learn more supported devices list), with different models(coming soon). + - Latency tests - Input length: 32 tokens. - Output length: 128 tokens. @@ -26,8 +31,10 @@ This document outlines the benchmarking methodology for vllm-ascend, aimed at ev **Benchmarking Duration**: about 800 senond for single model. -# Quick Use -## Prerequisites +## Quick Use + +### Prerequisites + Before running the benchmarks, ensure the following: - vllm and vllm-ascend are installed and properly set up in an NPU environment, as these scripts are specifically designed for NPU devices. @@ -41,7 +48,7 @@ Before running the benchmarks, ensure the following: - For performance benchmark, it is recommended to set the [load-format](https://github.com/vllm-project/vllm-ascend/blob/5897dc5bbe321ca90c26225d0d70bff24061d04b/benchmarks/tests/latency-tests.json#L7) as `dummy`, It will construct random weights based on the passed model without downloading the weights from internet, which can greatly reduce the benchmark time. 
- If you want to run benchmark customized, feel free to add your own models and parameters in the [JSON](https://github.com/vllm-project/vllm-ascend/tree/main/benchmarks/tests), let's take `Qwen2.5-VL-7B-Instruct`as an example: - ```shell + ```json [ { "test_name": "serving_qwen2_5vl_7B_tp1", @@ -75,45 +82,46 @@ Before running the benchmarks, ensure the following: this Json will be structured and parsed into server parameters and client parameters by the benchmark script. This configuration defines a test case named `serving_qwen2_5vl_7B_tp1`, designed to evaluate the performance of the `Qwen/Qwen2.5-VL-7B-Instruct` model under different request rates. The test includes both server and client parameters, for more parameters details, see vllm benchmark [cli](https://github.com/vllm-project/vllm/tree/main/vllm/benchmarks). - - **Test Overview** - - Test Name: serving_qwen2_5vl_7B_tp1 +- **Test Overview** + - Test Name: serving_qwen2_5vl_7B_tp1 - - Queries Per Second (QPS): The test is run at four different QPS levels: 1, 4, 16, and inf (infinite load, typically used for stress testing). + - Queries Per Second (QPS): The test is run at four different QPS levels: 1, 4, 16, and inf (infinite load, typically used for stress testing). - - Server Parameters - - Model: Qwen/Qwen2.5-VL-7B-Instruct +- Server Parameters + - Model: Qwen/Qwen2.5-VL-7B-Instruct - - Tensor Parallelism: 1 (no model parallelism is used; the model runs on a single device or node) + - Tensor Parallelism: 1 (no model parallelism is used; the model runs on a single device or node) - - Swap Space: 16 GB (used to handle memory overflow by swapping to disk) + - Swap Space: 16 GB (used to handle memory overflow by swapping to disk) - - disable_log_stats: disables logging of performance statistics. + - disable_log_stats: disables logging of performance statistics. - - disable_log_requests: disables logging of individual requests. + - disable_log_requests: disables logging of individual requests. 
- - Trust Remote Code: enabled (allows execution of model-specific custom code) + - Trust Remote Code: enabled (allows execution of model-specific custom code) - - Max Model Length: 16,384 tokens (maximum context length supported by the model) + - Max Model Length: 16,384 tokens (maximum context length supported by the model) - - Client Parameters +- Client Parameters - - Model: Qwen/Qwen2.5-VL-7B-Instruct (same as the server) + - Model: Qwen/Qwen2.5-VL-7B-Instruct (same as the server) - - Backend: openai-chat (suggests the client uses the OpenAI-compatible chat API format) + - Backend: openai-chat (suggests the client uses the OpenAI-compatible chat API format) - - Dataset Source: Hugging Face (hf) + - Dataset Source: Hugging Face (hf) - - Dataset Split: train + - Dataset Split: train - - Endpoint: /v1/chat/completions (the REST API endpoint to which chat requests are sent) + - Endpoint: /v1/chat/completions (the REST API endpoint to which chat requests are sent) - - Dataset Path: lmarena-ai/vision-arena-bench-v0.1 (the benchmark dataset used for evaluation, hosted on Hugging Face) + - Dataset Path: lmarena-ai/vision-arena-bench-v0.1 (the benchmark dataset used for evaluation, hosted on Hugging Face) - - Number of Prompts: 200 (the total number of prompts used during the test) + - Number of Prompts: 200 (the total number of prompts used during the test) -## Run benchmarks +### Run benchmarks + +#### Use benchmark script -### Use benchmark script The provided scripts automatically execute performance tests for serving, throughput, and latency. To start the benchmarking process, run command in the vllm-ascend root directory: ```shell @@ -134,11 +142,13 @@ Once the script completes, you can find the results in the benchmarks/results fo These files contain detailed benchmarking results for further analysis. -### Use benchmark cli +#### Use benchmark cli For more flexible and customized use, benchmark cli is also provided to run online/offline benchmarks Similarly, let’s take `Qwen2.5-VL-7B-Instruct` benchmark as an example: -#### Online serving + +##### Online serving + 1. Launch the server: ```shell @@ -156,7 +166,8 @@ Similarly, let’s take `Qwen2.5-VL-7B-Instruct` benchmark as an example: --request-rate 16 ``` -#### Offline +##### Offline + - **Throughput** ```shell diff --git a/benchmarks/scripts/perf_result_template.md b/benchmarks/scripts/perf_result_template.md index cb6a2e64..17cf98ad 100644 --- a/benchmarks/scripts/perf_result_template.md +++ b/benchmarks/scripts/perf_result_template.md @@ -10,6 +10,7 @@ {serving_tests_markdown_table} ## Offline tests + ### Latency tests - Input length: 32 tokens. 
diff --git a/docs/README.md b/docs/README.md index 739d442c..cb4dcd5d 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,6 +1,6 @@ # vLLM Ascend Plugin documents -Live doc: https://docs.vllm.ai/projects/ascend +Live doc: ## Build the docs @@ -20,5 +20,6 @@ python -m http.server -d _build/html/ ``` Launch your browser and open: -- English version: http://localhost:8000 -- Chinese version: http://localhost:8000/zh_CN + +- English version: +- Chinese version: diff --git a/docs/source/community/governance.md b/docs/source/community/governance.md index f5a852f0..63c0c3fe 100644 --- a/docs/source/community/governance.md +++ b/docs/source/community/governance.md @@ -1,12 +1,15 @@ # Governance ## Mission + As a vital component of vLLM, the vLLM Ascend project is dedicated to providing an easy, fast, and cheap LLM Serving for everyone on Ascend NPUs and to actively contributing to the enrichment of vLLM. ## Principles + vLLM Ascend follows the vLLM community's code of conduct: [vLLM - CODE OF CONDUCT](https://github.com/vllm-project/vllm/blob/main/CODE_OF_CONDUCT.md) ## Governance - Mechanics + vLLM Ascend is an open-source project under the vLLM community, where the authority to appoint roles is ultimately determined by the vLLM community. It adopts a hierarchical technical governance structure. - Contributor: diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md index 891b155f..797eee12 100644 --- a/docs/source/community/versioning_policy.md +++ b/docs/source/community/versioning_policy.md @@ -12,6 +12,7 @@ Each vLLM Ascend release is versioned as `v[major].[minor].[micro][rcN][.postN]` - **Post releases**: Typically issued **on demand** to address minor errors in a final release. Different from [PEP-440 post release note](https://peps.python.org/pep-0440/#post-releases) convention, these versions include actual bug fixes, as the final release version must strictly align with the vLLM final release format (`v[major].[minor].[micro]`). Any post version must be published as a patch version of the final release. For example: + - `v0.7.x`: first final release to match the vLLM `v0.7.x` version. - `v0.7.3rc1`: first pre version of vLLM Ascend. - `v0.7.3.post1`: post release for the `v0.7.3` release if it has some minor errors. @@ -49,6 +50,7 @@ If you're using v0.7.3, don't forget to install [mindie-turbo](https://pypi.org/ ::: For main branch of vLLM Ascend, we usually make it compatible with the latest vLLM release and a newer commit hash of vLLM. Please note that this table is usually updated. Please check it regularly. + | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | |-------------|--------------|------------------|-------------|--------------------| | main | bde38c11df0ea066a740efe9b77fff5418be45df, v0.13.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 | @@ -94,6 +96,7 @@ vLLM Ascend includes two branches: main and dev. Commits should typically be merged into the main branch first, and only then backported to the dev branch, to reduce maintenance costs as much as possible. ### Maintenance branch and EOL + The table below lists branch states. 
| Branch | Time Frame | Summary | @@ -121,7 +124,8 @@ Usually, each minor version of vLLM (such as 0.7) corresponds to a vLLM Ascend v | Branch | State | RFC Link | Scheduled Merge Time | Mentor | |------------|--------------|---------------------------------------|------------|--------| -|rfc/long_seq_optimization|Maintained|https://github.com/vllm-project/vllm/issues/22693|930|wangxiyuan| +|rfc/long_seq_optimization|Maintained||930|wangxiyuan| + - Branch: The feature branch should be created with a prefix `rfc/` followed by the feature name, such as `rfc/feature-name`. - State: The state of the feature branch is `Maintained` until it is merged into the main branch or deleted. - RFC Link: The feature branch should be created with a corresponding RFC issue. The creation of a feature branch requires an RFC and approval from at least two maintainers. @@ -131,11 +135,13 @@ Usually, each minor version of vLLM (such as 0.7) corresponds to a vLLM Ascend v ### Backward compatibility For main branch, vLLM Ascend should works with vLLM main branch and latest 1 or 2 releases. To ensure backward compatibility, do as follows: + - Both main branch and target vLLM release, such as the vLLM main branch and vLLM 0.8.4, are tested by Ascend E2E CI. - To make sure that code changes are compatible with the latest 1 or 2 vLLM releases, vLLM Ascend introduces a version check mechanism inside the code. It checks the version of the installed vLLM package first to decide which code logic to use. If users hit the `InvalidVersion` error, it may indicate that they have installed a dev or editable version of vLLM package. In this case, we provide the env variable `VLLM_VERSION` to let users specify the version of vLLM package to use. - Document changes should be compatible with the latest 1 or 2 vLLM releases. Notes should be added if there are any breaking changes. ## Document branch policy + To reduce maintenance costs, **all branch documentation content should remain consistent, and version differences can be controlled via variables in [docs/source/conf.py](https://github.com/vllm-project/vllm-ascend/blob/main/docs/source/conf.py)**. While this is not a simple task, it is a principle we should strive to follow. | Version | Purpose | Code Branch | @@ -151,6 +157,7 @@ Notes: - `version` documentation: keep updating the `releases/vX.Y.Z` branch documentation to fix doc bugs. ## Software dependency management + - `torch-npu`: Ascend Extension for PyTorch (torch-npu) releases a stable version to [PyPi](https://pypi.org/project/torch-npu) every 3 months, a development version (aka the POC version) every month, and a nightly version every day. The PyPi stable version **CAN** be used in vLLM Ascend final version, the monthly dev version **ONLY CAN** be used in diff --git a/docs/source/developer_guide/contribution/index.md b/docs/source/developer_guide/contribution/index.md index 256c8aff..0ee79b87 100644 --- a/docs/source/developer_guide/contribution/index.md +++ b/docs/source/developer_guide/contribution/index.md @@ -1,6 +1,7 @@ # Contributing ## Building and Testing + It's recommended to set up a local development environment to build vllm-ascend and run tests before you submit a PR. 
diff --git a/docs/source/developer_guide/contribution/multi_node_test.md b/docs/source/developer_guide/contribution/multi_node_test.md index fd972c51..fd4e3cda 100644 --- a/docs/source/developer_guide/contribution/multi_node_test.md +++ b/docs/source/developer_guide/contribution/multi_node_test.md @@ -116,7 +116,7 @@ This section assumes that you already have a [Kubernetes](https://kubernetes.io/ - Step 1. Install LWS CRD resources - See https://lws.sigs.k8s.io/docs/installation/ Which can be used as a reference + See Which can be used as a reference - Step 2. Deploy the following yaml file `lws.yaml` as what you want @@ -318,14 +318,14 @@ Since our script is Kubernetes-friendly, we need to actively pass in some cluste `cluster_hosts: ["xxx.xxx.xxx.188", "xxx.xxx.xxx.212"]` - Step 2. Install develop environment - - Install vllm-ascend develop packages on every cluster host + - Install vllm-ascend develop packages on every cluster host ``` bash cd /vllm-workspace/vllm-ascend python3 -m pip install -r requirements-dev.txt ``` - - Install AISBench on the first host(leader node) in cluster_hosts + - Install AISBench on the first host(leader node) in cluster_hosts ``` bash export AIS_BENCH_TAG="v3.0-20250930-master" diff --git a/docs/source/developer_guide/contribution/testing.md b/docs/source/developer_guide/contribution/testing.md index 28d29e7c..958c008d 100644 --- a/docs/source/developer_guide/contribution/testing.md +++ b/docs/source/developer_guide/contribution/testing.md @@ -248,7 +248,7 @@ This will reproduce the E2E test. See [vllm_ascend_test.yaml](https://github.com Run nightly multi-node test cases locally refer to section of `Running Locally` of [Multi Node Test](./multi_node_test.md). -#### E2E test example: +#### E2E test example - Offline test example: [`tests/e2e/singlecard/test_offline_inference.py`](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/singlecard/test_offline_inference.py) - Online test examples: [`tests/e2e/singlecard/test_prompt_embedding.py`](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/singlecard/test_prompt_embedding.py) diff --git a/docs/source/developer_guide/evaluation/using_ais_bench.md b/docs/source/developer_guide/evaluation/using_ais_bench.md index 25811aa7..cf31bc50 100644 --- a/docs/source/developer_guide/evaluation/using_ais_bench.md +++ b/docs/source/developer_guide/evaluation/using_ais_bench.md @@ -1,8 +1,11 @@ # Using AISBench + This document guides you to conduct accuracy testing using [AISBench](https://gitee.com/aisbench/benchmark/tree/master). AISBench provides accuracy and performance evaluation for many datasets. ## Online Server + ### 1. Start the vLLM server + You can run docker container to start the vLLM server on a single NPU: ```{code-block} bash @@ -44,7 +47,7 @@ vllm serve Qwen/Qwen2.5-0.5B-Instruct --max_model_len 35000 & The vLLM server is started successfully, if you see logs as below: -``` +```shell INFO: Started server process [9446] INFO: Waiting for application startup. INFO: Application startup complete. 
@@ -220,7 +223,7 @@ ais_bench --models vllm_api_general_chat --datasets aime2024_gen_0_shot_chat_pro After each dataset execution, you can get the result from saved files such as `outputs/default/20250628_151326`, there is an example as follows: -``` +```shell 20250628_151326/ ├── configs # Combined configuration file for model tasks, dataset tasks, and result presentation tasks │ └── 20250628_151326_29317.py @@ -276,7 +279,7 @@ ais_bench --models vllm_api_stream_chat --datasets textvqa_gen_base64 --summariz After execution, you can get the result from saved files, there is an example as follows: -``` +```shell 20251031_070226/ |-- configs # Combined configuration file for model tasks, dataset tasks, and result presentation tasks | `-- 20251031_070226_122485.py diff --git a/docs/source/developer_guide/evaluation/using_evalscope.md b/docs/source/developer_guide/evaluation/using_evalscope.md index a10f5f33..99d0b783 100644 --- a/docs/source/developer_guide/evaluation/using_evalscope.md +++ b/docs/source/developer_guide/evaluation/using_evalscope.md @@ -34,7 +34,7 @@ vllm serve Qwen/Qwen2.5-7B-Instruct --max_model_len 26240 If the vLLM server is started successfully, you can see information shown below: -``` +```shell INFO: Started server process [6873] INFO: Waiting for application startup. INFO: Application startup complete. @@ -42,7 +42,7 @@ INFO: Application startup complete. Once your server is started, you can query the model with input prompts in a new terminal: -``` +```shell curl http://localhost:8000/v1/completions \ -H "Content-Type: application/json" \ -d '{ @@ -67,7 +67,7 @@ pip install gradio plotly evalscope You can use `evalscope eval` to run GSM8K for accuracy testing: -``` +```shell evalscope eval \ --model Qwen/Qwen2.5-7B-Instruct \ --api-url http://localhost:8000/v1 \ @@ -101,7 +101,7 @@ pip install evalscope[perf] -U You can use `evalscope perf` to run perf testing: -``` +```shell evalscope perf \ --url "http://localhost:8000/v1/chat/completions" \ --parallel 5 \ diff --git a/docs/source/developer_guide/evaluation/using_lm_eval.md b/docs/source/developer_guide/evaluation/using_lm_eval.md index 11670aa3..961531a7 100644 --- a/docs/source/developer_guide/evaluation/using_lm_eval.md +++ b/docs/source/developer_guide/evaluation/using_lm_eval.md @@ -1,8 +1,11 @@ # Using lm-eval + This document guides you to conduct accuracy testing using [lm-eval][1]. ## Online Server + ### 1. Start the vLLM server + You can run docker container to start the vLLM server on a single NPU: ```{code-block} bash @@ -34,7 +37,7 @@ vllm serve Qwen/Qwen2.5-0.5B-Instruct --max_model_len 4096 & The vLLM server is started successfully, if you see logs as below: -``` +```shell INFO: Started server process [9446] INFO: Waiting for application startup. INFO: Application startup complete. @@ -44,7 +47,7 @@ INFO: Application startup complete. 
You can query the result with input prompts: -``` +```shell curl http://localhost:8000/v1/completions \ -H "Content-Type: application/json" \ -d '{ @@ -71,7 +74,7 @@ curl http://localhost:8000/v1/completions \ The output format matches the following: -``` +```json { "id": "cmpl-2f678e8bdf5a4b209a3f2c1fa5832e25", "object": "text_completion", @@ -108,7 +111,7 @@ pip install lm-eval[api] Run the following command: -``` +```shell # Only test gsm8k dataset in this demo lm_eval \ --model local-completions \ @@ -119,7 +122,7 @@ lm_eval \ After 30 minutes, the output is as shown below: -``` +```shell The markdown format results is as below: |Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr| @@ -130,6 +133,7 @@ The markdown format results is as below: ``` ## Offline Server + ### 1. Run docker container You can run docker container on a single NPU: @@ -161,6 +165,7 @@ docker run --rm \ ``` ### 2. Run GSM8K using lm-eval for accuracy testing + Install lm-eval in the container: ```bash @@ -170,7 +175,7 @@ pip install lm-eval Run the following command: -``` +```shell # Only test gsm8k dataset in this demo lm_eval \ --model vllm \ @@ -181,7 +186,7 @@ lm_eval \ After 1 to 2 minutes, the output is shown below: -``` +```shell The markdown format results is as below: Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr| diff --git a/docs/source/developer_guide/evaluation/using_opencompass.md b/docs/source/developer_guide/evaluation/using_opencompass.md index fbd7c1c3..9d21c66c 100644 --- a/docs/source/developer_guide/evaluation/using_opencompass.md +++ b/docs/source/developer_guide/evaluation/using_opencompass.md @@ -1,4 +1,5 @@ # Using OpenCompass + This document guides you to conduct accuracy testing using [OpenCompass](https://github.com/open-compass/opencompass). ## 1. Online Server @@ -33,7 +34,7 @@ vllm serve Qwen/Qwen2.5-7B-Instruct --max_model_len 26240 The vLLM server is started successfully, if you see information as below: -``` +```shell INFO: Started server process [6873] INFO: Waiting for application startup. INFO: Application startup complete. @@ -41,7 +42,7 @@ INFO: Application startup complete. Once your server is started, you can query the model with input prompts in a new terminal. -``` +```shell curl http://localhost:8000/v1/completions \ -H "Content-Type: application/json" \ -d '{ @@ -53,6 +54,7 @@ curl http://localhost:8000/v1/completions \ ``` ## 2. Run C-Eval using OpenCompass for accuracy testing + Install OpenCompass and configure the environment variables in the container: ```bash @@ -107,13 +109,13 @@ models = [ Run the following command: -``` +```shell python3 run.py opencompass/configs/eval_vllm_ascend_demo.py --debug ``` After 1 to 2 minutes, the output is shown below: -``` +```shell The markdown format results is as below: | dataset | version | metric | mode | Qwen2.5-7B-Instruct-vLLM-API | diff --git a/docs/source/developer_guide/feature_guide/ACL_Graph.md b/docs/source/developer_guide/feature_guide/ACL_Graph.md index f9c76d50..fab751cf 100644 --- a/docs/source/developer_guide/feature_guide/ACL_Graph.md +++ b/docs/source/developer_guide/feature_guide/ACL_Graph.md @@ -4,7 +4,7 @@ When in LLM inference, each token requires nearly thousand operator executions, and when host launching operators are slower than device, it will cause host bound. In severe cases, the device will be idle for more than half of the time. To solve this problem, we use graph in LLM inference. 
-``` +```shell eager mode: host: | launch op1 | launch op2 | launch op3 | launch op4 | launch op5 | @@ -38,11 +38,12 @@ But in reality, graph mode is not that simple. Due to graph can only replay the ops captured before, without doing tiling and checking graph input, we need to ensure the consistency of the graph input, but we know that model input's shape depends on the request scheduled by Scheduler, we can't ensure the consistency. Obviously, we can solve this problem by capturing the biggest shape and padding all of the model input to it. But it will bring a lot of redundant computing and make performance worse. So we can capture multiple graphs with different shape, and pad the model input to the nearest graph, which will greatly reduce redundant computing. But when `max_num_batched_tokens` is very large, the number of graphs that need to be captured will also become very large. But we know that when intensor's shape is large, the computing time will be very long, and graph mode is not necessary in this case. So all of things we need to do is: + 1. Set a threshold; 2. When `num_scheduled_tokens` is bigger than the threshold, use `eager_mode`; 3. Capture multiple graphs within a range below the threshold; -``` +```shell | graph1 | | graph2 | | graph3 | diff --git a/docs/source/developer_guide/feature_guide/KV_Cache_Pool_Guide.md b/docs/source/developer_guide/feature_guide/KV_Cache_Pool_Guide.md index ef67f627..843c7b56 100644 --- a/docs/source/developer_guide/feature_guide/KV_Cache_Pool_Guide.md +++ b/docs/source/developer_guide/feature_guide/KV_Cache_Pool_Guide.md @@ -21,6 +21,7 @@ vLLM Ascend Currently supports Mooncake Store for KV Cache Pool. To enable Moonc For step-by-step deployment and configuration, please refer to the [KV Pool User Guide](https://docs.vllm.ai/projects/ascend/en/latest/user_guide/feature_guide/kv_pool.html). ## How it works? + The KV Cache Pool integrates multiple memory tiers (HBM, DRAM, SSD, etc.) through a connector-based architecture. Each connector implements a unified interface for storing, retrieving, and transferring KV blocks between tiers, depending on access frequency and hardware bandwidth. @@ -28,6 +29,7 @@ Each connector implements a unified interface for storing, retrieving, and trans When combined with vLLM’s Prefix Caching mechanism, the pool enables efficient caching both locally (in HBM) and globally (via Mooncake), ensuring that frequently used prefixes remain hot while less frequently accessed KV data can spill over to lower-cost memory. ### 1. Combining KV Cache Pool with HBM Prefix Caching + Prefix Caching with HBM is already supported by the vLLM V1 Engine. By introducing KV Connector V1, users can seamlessly combine HBM-based Prefix Caching with Mooncake-backed KV Pool. @@ -54,17 +56,22 @@ To Enable this feature, we need to setup both Mooncake Connector and Mooncake St For details, please also refer to the Mooncake Connector Store Deployment Guide. ## How is MooncakestoreConnectorV1 Implemented? + **MooncakestoreConnectorV1** inhereits the KV Connector V1 class in vLLM V1: through implementing the required methods defined in the KV connector V1 base class, one can integrate a thrid-party KV cache transfer/storage backend into the vLLM framework. MooncakeStoreConnectorV1 is also largly inspried by LMCacheConnectorV1 in term of the `Lookup Engine`/`Lookup Client` design for looking up KV cache keys, and the `ChunkedTokenDatabase` class for processing tokens into prefix-aware hashes as well as other hashing related designs. 
On top of this, we have also added our own design including `KVTransferThread` that allows async `get` and `put` of KV caches with multi-threading, and NPU-related data transfer optimization such as removing the `LocalBuffer` in LMCache to remove redundant data transfer. The KV Connector methods that need to be implemented can be categorized into scheduler-side methods that are called in V1 scheduler and worker-side methods that are called in V1 worker, namely: -### KV Connector Scheduler-Side Methods: + +### KV Connector Scheduler-Side Methods + `get_num_new_matched_tokens`: Get prefix cache hit in number of tokens through looking up into the KV pool. `update_states_after_alloc`: Update KVConnector state after temporary buffer alloc. `build_connector_meta`: Attach the connector metadata to the request object. `request_finished`: Once a request is finished, determine whether request blocks should be freed now or will be sent asynchronously and freed later. -### Connector Worker-Side Methods: + +### Connector Worker-Side Methods + `register_kv_caches`: Register KV cache buffers needed for KV cache transfer. `start_load_kv`: Perform KV cache load operation that transfers KV cache from storage to device. `wait_for_layer_load`: Optional; Wait for layer load in layerwise + async KV load scenario. @@ -73,6 +80,7 @@ The KV Connector methods that need to be implemented can be categorized into sch `get_finished` Get request that finished KV transfer, `done_sending` if `put` finished, `done_reciving` if `get` finished. ## DFX + 1. When looking up a key in KV Pool, if we cannot find the key, there is no Cache Hit for this specific block; we return no hit for this block and do not look up further blocks for current request. 2. Similaly, when we are trying to put a block into KV Pool and failed, we do not put further blocks (subject to change). diff --git a/docs/source/developer_guide/feature_guide/ModelRunner_prepare_inputs.md b/docs/source/developer_guide/feature_guide/ModelRunner_prepare_inputs.md index 5eb8f051..09647156 100644 --- a/docs/source/developer_guide/feature_guide/ModelRunner_prepare_inputs.md +++ b/docs/source/developer_guide/feature_guide/ModelRunner_prepare_inputs.md @@ -1,13 +1,15 @@ # Prepare inputs for model forwarding ## Purpose + Information required to perform model forward pass: - - the inputs - - the corresponding attention metadata of the inputs + +- the inputs +- the corresponding attention metadata of the inputs The following diagram shows what we should prepare for model inference. -``` +```shell +---------------+ inputs --> | | | model | --> output @@ -20,8 +22,11 @@ Therefore, as long as we have these two pieces of information mentioned above, w This document will explain **how we obtain the inputs and their corresponding attention metadata**. ## Overview + ### 1. Obtain inputs + The workflow of obtaining inputs: + 1. Get `token positions`: relative position of each token within its request sequence. 2. Get `token indices`: index of each scheduled token in the token table. @@ -33,7 +38,9 @@ At last, these `Token IDs` are required to be fed into a model, and also, `posit **Note**: The `Token IDs` are the inputs of a model, so we also call them `Inputs IDs`. ### 2. Build inputs attention metadata + A model requires these attention metadata during the forward pass: + - `query start location`: start and end location of each request corresponding to the scheduled tokens. - `sequence length`: length of each request including both computed tokens and newly scheduled tokens. 
- `number of computed tokens`: number of computed tokens for each request. @@ -45,7 +52,9 @@ A model requires these attention metadata during the forward pass: - `attention mask`: mask matrix applied to attention scores before softmax to control which tokens can attend to each other (usually a causal attention). ## Before start + There are mainly three types of variables. + - token level: represents one attribute corresponding to each scheduled token, so the length of this variable is the number of scheduled tokens - request level: represents one attribute of each scheduled request, whose length usually is the number of scheduled requests. (`query start location` is a special case, which has one more element) - system level: @@ -55,10 +64,11 @@ There are mainly three types of variables. **Note**: Both of these two tables are come from the `_update_states` method before **preparing inputs**. You can take a look if you need more inspiration. ### Tips + Simply put, a `token ID` is an **integer** (usually `int32`), which represents a token. Example of `Token ID`: -``` +```shell | Token ID | Token | |--------------|---------------| | 0 | [PAD] | @@ -76,19 +86,24 @@ Example of `Token ID`: ``` ## Go through details + Assumptions: + - maximum number of tokens can be scheduled at once: 10 - `block size`: 2 - Totally schedule 3 requests. Their prompt lengths are 3, 2, and 8 respectively. - `max model length`: 12 (the maximum token count can be handled at one request sequence in a model). These assumptions are configured in the beginning when starting vLLM. They are not fixed, so you can manually set them. + ### Step 1: All requests in the prefill phase #### Obtain inputs + As the maximum number of tokens that can be schedules is 10, the scheduled tokens of each request can be represented as `{'0': 3, '1': 2, '2': 5}`. Note that`request_2` uses chunked prefill, leaving 3 prompt tokens unscheduled. -##### 1. Get token positions: +##### 1. Get token positions + First, determine which request each token belongs to: tokens 0–2 are assigned to **request_0**, tokens 3–4 to **request_1**, and tokens 5–9 to **request_2**. To represent this mapping, we use `request indices`, for example, `request indices`: `[0, 0, 0, 1, 1, 2, 2, 2, 2, 2]`. For each request, use **the number of computed tokens** + **the relative position of current scheduled tokens** (`request_0: [0 + 0, 0 + 1, 0 + 2]`, `request_1: [0 + 0, 0 + 1]`, `request_2: [0 + 0, 0 + 1,..., 0 + 4]`) and then concatenate them together (`[0, 1, 2, 0, 1, 0, 1, 2, 3, 4]`). @@ -97,13 +112,15 @@ Note: there is more efficient way (using `request indices`) to create positions Finally, `token positions` can be obtained as `[0, 1, 2, 0, 1, 0, 1, 2, 3, 4]`. This variable is **token level**. -##### 2. Get token indices: +##### 2. Get token indices + The shape of the current **Token IDs table** is `(max num request, max model len)`. Why these `T_3_5`, `T_3_6`, `T_3_7` are in this table without being scheduled? + - We fill all Token IDs in one request sequence to this table at once, but we only retrieve the tokens we scheduled this time. Then we retrieve the remain Token IDs next time. -``` +```shell | T_0_0 | T_0_1 | T_0_2 | ? | ? | ? | ? | ? | ? | ? | ? | ? | | T_1_0 | T_1_1 | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | | T_2_0 | T_2_1 | T_3_2 | T_3_3 | T_3_4 | T_3_5 | T_3_6 | T_3_7 | ? | ? | ? | ? | @@ -120,19 +137,22 @@ Let's say `M = max model len`. 
Then we can use `token positions` together with ` So `token indices` = `[0 + 0 * M, 1 + 0 * M, 2 + 0 * M, 0 + 1 * M, 1 + 1 * M, 0 + 2 * M, 1 + 2 * M, 2 + 2 * M, 3 + 2 * M, 4 + 2 * M]` = `[0, 1, 2, 12, 13, 24, 25, 26, 27, 28]` ##### 3. Retrieve the Token IDs + We use `token indices` to select out the corresponding `Input IDs` from the token table. The pseudocode is as follows: -``` +```shell input_ids = token_table[token_indices] ``` As mentioned before, we refer to these `Token IDs` as `Input IDs`. + - `Input IDs` = `[T_0_0, T_0_1, T_0_2, T_1_0, T_1_1, T_2_0, T_2_1, T_3_2, T_3_3, T_3_4]` #### Build inputs attention metadata + In the current **Block Table**, we use the first block (i.e. block_0) to mark the unused block. The shape of the block is `(max num request, max model len / block size)`, where `max model len / block size = 12 / 2 = 6`. -``` +```shell | 1 | 2 | 0 | 0 | 0 | 0 | | 3 | 0 | 0 | 0 | 0 | 0 | | 4 | 5 | 6 | 0 | 0 | 0 | @@ -144,13 +164,14 @@ In the current **Block Table**, we use the first block (i.e. block_0) to mark th The KV cache block in the device memory is like: -``` +```shell | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ...... ``` Let's say `K = max model len / block size = 6`, and we can get token `device block number`. The workflow of achieving slot mapping: + 1. Get `block table indices` using `K`, `positions` and `request indices`. Purpose: For each token, it could be used to select `device block number` from `block table`. @@ -168,6 +189,7 @@ The workflow of achieving slot mapping: Purpose: we can use `slot mapping` to store Token IDs into token slots. Details: + 1. (**Token level**) Use a simple formula to calculate `block table indices`: `request indices * K + positions / block size`. So it equal to `[0 * 6 + 0 / 2, 0 * 6 + 1 / 2, 0 * 6 + 2 / 2, 1 * 6 + 0 / 2, 1 * 6 + 1 / 2, 2 * 6 + 0 / 2, 2 * 6 + 1 / 2, 2 * 6 + 2 / 2, 2 * 6 + 3 / 2, 2 * 6 + 4 / 2] = [0, 0, 1, 6, 6, 12, 12, 13, 13, 14]`. This could be used to select `device block number` from `block table`. 2. (**Token level**) Use `block table indices` to select out `device block number` for each scheduled token. The Pseudocode is `block_numbers = block_table[block_table_indices]`. So `device block number=[1, 1, 2, 3, 3, 4, 4, 5, 5, 6]` 3. (**Token level**) `block offsets` could be computed by `block offsets = positions % block size = [0, 1, 0, 0, 1, 0, 1, 0, 1, 0]`. @@ -185,9 +207,11 @@ Details: - `attention mask`: For all requests that initiate a prefill process, we simply create only one mask matrix for reuse across different requests. The shape of this mask matrix is `5 * 5`: ### Step 2: Chunked prefill + In Step 2, we no longer provide explanations or perform calculations; instead, we directly present the final result. #### Obtain inputs + Scheduled token of each request: `{'0': 1, '1': 1, '2': 3}` 1. `request indices`: `[0, 1, 2, 2, 2]` @@ -195,7 +219,7 @@ Scheduled token of each request: `{'0': 1, '1': 1, '2': 3}` Current **Token IDs table**: -``` +```shell | T_0_0 | T_0_1 | T_0_2 | T_0_3 | ? | ? | ? | ? | ? | ? | ? | ? | | T_1_0 | T_1_1 | T_1_2 | ? | ? | ? | ? | ? | ? | ? | ? | ? | | T_2_0 | T_2_1 | T_3_2 | T_3_3 | T_3_4 | T_3_5 | T_3_6 | T_3_7 | ? | ? | ? | ? | @@ -211,11 +235,12 @@ Current **Token IDs table**: 4. `Input IDs`: `[T_0_3, T_1_2, T_3_5, T_3_6, T_3_7]` #### Build inputs attention metadata + We allocate the blocks `7` and `8` to `request_1` and `request_2` respectively, as they need more space in device to store KV cache following token generation or chunked prefill. 
Current **Block Table**: -``` +```shell | 1 | 2 | 0 | 0 | 0 | 0 | | 3 | 7 | 0 | 0 | 0 | 0 | | 4 | 5 | 6 | 8 | 0 | 0 | @@ -227,7 +252,7 @@ Current **Block Table**: KV cache block in the device memory: -``` +```shell | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ...... ``` @@ -237,6 +262,7 @@ KV cache block in the device memory: 4. (**Token level**) `slot mapping`: `[5, 14, 13, 16, 17]` Scheduled token count:`[1, 1, 3]` + - `query start location`: `[0, 1, 2, 5]` - `sequence length`: `[4, 3, 8]` @@ -254,6 +280,7 @@ Scheduled token count:`[1, 1, 3]` Each token has a `1 * 8` vector, and there are 5 scheduled tokens. ## At last + If you understand the step_1 and step_2, you will know the all following steps. Hope this document can help you better understand how vLLM prepares inputs for model forwarding. If you have any good idea, welcome to contribute to us. diff --git a/docs/source/developer_guide/feature_guide/context_parallel.md b/docs/source/developer_guide/feature_guide/context_parallel.md index c36c9699..75d7900b 100644 --- a/docs/source/developer_guide/feature_guide/context_parallel.md +++ b/docs/source/developer_guide/feature_guide/context_parallel.md @@ -20,6 +20,7 @@ Its main objective is to eliminate duplicated storage of the KV cache by shardin DCP primarily influences the Decode logic, as well as the logic for chunked prefill and cached prefill. ## How to Use CP? + Please refer to the [context parallel user guide](../../user_guide/feature_guide/context_parallel.md) for detailed information. ## How It Works? diff --git a/docs/source/developer_guide/feature_guide/disaggregated_prefill.md b/docs/source/developer_guide/feature_guide/disaggregated_prefill.md index 87c50c68..9e358862 100644 --- a/docs/source/developer_guide/feature_guide/disaggregated_prefill.md +++ b/docs/source/developer_guide/feature_guide/disaggregated_prefill.md @@ -15,6 +15,7 @@ This feature addresses the need to optimize the **Time Per Output Token (TPOT)** ## Usage vLLM Ascend currently supports two types of connectors for handling KV cache management: + - **MooncakeConnector**: D nodes pull KV cache from P nodes. - **MooncakeLayerwiseConnector**: P nodes push KV cache to D nodes in a layered manner. @@ -35,7 +36,7 @@ Our design diagram is shown below, illustrating the pull and push schemes respec ![alt text](../../assets/disaggregated_prefill_pull.png) ![alt text](../../assets/disaggregated_prefill_push.png) -#### Mooncake Connector: +#### Mooncake Connector 1. The request is sent to the Proxy’s `_handle_completions` endpoint. 2. The Proxy calls `select_prefiller` to choose a P node and forwards the request, configuring `kv_transfer_params` with `do_remote_decode=True`, `max_tokens=1`, and `min_tokens=1`. @@ -43,7 +44,7 @@ Our design diagram is shown below, illustrating the pull and push schemes respec 4. The Proxy calls `select_decoder` to choose a D node and forwards the request. 5. On the D node, the scheduler marks the request as `RequestStatus.WAITING_FOR_REMOTE_KVS`, pre-allocates KV cache, calls `kv_connector_no_forward` to pull the remote KV cache, then notifies the P node to release KV cache and proceeds with decoding to return the result. -#### Mooncake Layerwise Connector: +#### Mooncake Layerwise Connector 1. The request is sent to the Proxy’s `_handle_completions` endpoint. 2. The Proxy calls `select_decoder` to choose a D node and forwards the request, configuring `kv_transfer_params` with `do_remote_prefill=True` and setting the `metaserver` endpoint. 
@@ -55,6 +56,7 @@ Our design diagram is shown below, illustrating the pull and push schemes respec ### 3. Interface Design Taking MooncakeConnector as an example, the system is organized into three primary classes: + - **MooncakeConnector**: Base class that provides core interfaces. - **MooncakeConnectorScheduler**: Interface for scheduling the connectors within the engine core, responsible for managing KV cache transfer requirements and completion. - **MooncakeConnectorWorker**: Interface for managing KV cache registration and transfer in worker processes. diff --git a/docs/source/developer_guide/feature_guide/eplb_swift_balancer.md b/docs/source/developer_guide/feature_guide/eplb_swift_balancer.md index af6e90db..999dc6e1 100644 --- a/docs/source/developer_guide/feature_guide/eplb_swift_balancer.md +++ b/docs/source/developer_guide/feature_guide/eplb_swift_balancer.md @@ -1,18 +1,22 @@ # Expert Parallelism Load Balancer (EPLB) ## Why We Need EPLB? + When using Expert Parallelism (EP), different experts are assigned to different NPUs. Given that the load of various experts may vary depending on the current workload, it is crucial to maintain balanced loads across different NPUs. We adopt a redundant experts strategy by duplicating heavily-loaded experts. Then, we heuristically pack these duplicated experts onto NPUs to ensure load balancing across them. Moreover, thanks to the group-limited expert routing used in MoE models, we also attempt to place experts of the same group on the same node to reduce inter-node data traffic, whenever possible. To facilitate reproduction and deployment, Vllm Ascend supported deployed EP load balancing algorithm in `vllm_ascend/eplb/core/policy`. The algorithm computes a balanced expert replication and placement plan based on the estimated expert loads. Note that the exact method for predicting expert loads is outside the scope of this repository. A common method is to use a moving average of historical statistics. ![eplb](../../assets/eplb.png) + ## How to Use EPLB? + Please refer to the EPLB section of the user guide for detailed information: [How to Use EPLB](../../user_guide/feature_guide/eplb_swift_balancer.md) ## How It Works? + **EPLB Module Architecture** -``` +```shell vllm_ascend ├── eplb │ ├── adaptor @@ -35,6 +39,7 @@ vllm_ascend **1. Adaptor Module** *Handles registration and adaptation for different MoE model types* + - `abstract_adaptor.py` Abstract base class defining unified registration interfaces for EPLB adapters - `vllm_adaptor.py` @@ -42,17 +47,18 @@ vllm_ascend **2. Core Module** *Implements core algorithms, updates, and asynchronous processing* + - **Policy Submodule** *Load balancing algorithms with factory pattern instantiation* - - `policy_abstract.py` + - `policy_abstract.py` Abstract class for load balancing strategy interfaces - - `policy_dynamic_ep.py` + - `policy_dynamic_ep.py` Default implementation of open-source EPLB paper algorithm - - `policy_dynamic_ep_v2.py` + - `policy_dynamic_ep_v2.py` Enhanced version optimizing expert swaps for low-bandwidth devices (e.g., A2) - - `policy_flashlb.py` + - `policy_flashlb.py` Threshold-based adjustment reducing operational costs through layer-wise fluctuation detection - - `policy_factory.py` + - `policy_factory.py` Strategy factory for automatic algorithm instantiation - `eplb_device_transfer_loader.py` @@ -63,12 +69,14 @@ vllm_ascend Asynchronous algorithm orchestration and result processing **3. 
System Components** + - `eplb_updator.py` Central coordinator for load balancing during inference workflows - `utils.py` General utilities for EPLB interface registration *Key Optimizations:* + 1. Maintained original structure while improving technical clarity 2. Standardized terminology 3. Enhanced algorithm differentiation through concise descriptors @@ -76,14 +84,19 @@ vllm_ascend 5. Preserved file/class relationships while optimizing readability ### Default Algorithm + #### Hierarchical Load Balancing + When the number of server nodes evenly divides the number of expert groups, we use the hierarchical load balancing policy to leverage group-limited expert routing. We first pack the expert groups onto nodes evenly, ensuring balanced loads across different nodes. Then, we replicate the experts within each node. Finally, we pack the replicated experts onto individual NPUs to ensure load balancing across them. The hierarchical load balancing policy can be used in the prefilling stage with a smaller expert-parallel size. #### Global Load Balancing + In other cases, we use the global load balancing policy, which replicates experts globally regardless of expert groups, and packs the replicated experts onto individual NPUs. This policy can be adopted in the decoding stage with a larger expert-parallel size. ### Add a New EPLB Policy + If you want to add a new eplb policy to vllm_ascend, you must follow these steps: + 1. Inherit the `EplbPolicy` abstract class of `policy_abstract.py` and override the `rebalance_experts` interface, ensuring consistent input parameters `current_expert_table`, `expert_workload` and return types `newplacement`. For example: @@ -113,6 +126,7 @@ class RandomLoadBalance(EplbPolicy): 2. To add a new EPLB algorithm, include the policy type and its corresponding implementation class in the `PolicyFactory` of `policy_factory.py`. ### Add a New MoE Model + **Implementation Guide for Model Integration** 1. **Adapter File Modification** @@ -154,12 +168,17 @@ class RandomLoadBalance(EplbPolicy): - Benchmark against baseline implementations (e.g., Qwen3-MoE) *Key Implementation Notes:* + - Preserve existing interface contracts in abstract classes - Use decorators for non-intrusive patch integration - Leverage `eplb_utils.py` for shared expert mapping operations + ## DFX + ### Parameter Validation + #### Integer Parameters + All integer input parameters must explicitly specify their maximum and minimum values and be subject to valid value validation. For example, `num_iterations_eplb_update` must be greater than 0: ```python @@ -176,6 +195,7 @@ All integer input parameters must explicitly specify their maximum and minimum v ``` #### File Path + The file path for EPLB must be checked for legality, such as whether the file path is valid and whether it has appropriate read and write permissions. For example: ```python @@ -203,20 +223,27 @@ The file path for EPLB must be checked for legality, such as whether the file pa ``` ### Function Specifications + #### Initialization Function + All EPLB parameters must be initialized by default during initialization, with specified parameter types and default values for proper handling. #### General Functions + All method arguments must specify parameter types and default values, and functions must include default return value handling for default arguments. 
It is recommended to use `try-except` blocks to handle the function body, specifying the type of exception captured and the failure handling (e.g., logging exceptions or returning a failure status). ### Consistency + #### Expert Map + The expert map must be globally unique during initialization and update. In a multi-node scenario during initialization, distributed communication should be used to verify the consistency of expert maps across each rank. If they are inconsistent, the user should be notified which ranks have inconsistent maps. During the update process, if only a few layers or the expert table of a certain rank has been changed, the updated expert table must be synchronized with the EPLB's context to ensure global consistency. #### Expert Weight + When updating expert weights, ensure that the memory allocated for the expert weights has been released, or that the expert (referring to the old version) is no longer in use. ## Limitation + Before using EPLB, start the script and add `export DYNAMIC_EPLB="true"`. Before performing load data collection (or performance data collection), start the script and add `export EXPERT_MAP_RECORD="true"`. diff --git a/docs/source/developer_guide/feature_guide/patch.md b/docs/source/developer_guide/feature_guide/patch.md index 56d5f0ee..8a983217 100644 --- a/docs/source/developer_guide/feature_guide/patch.md +++ b/docs/source/developer_guide/feature_guide/patch.md @@ -16,7 +16,7 @@ We should keep in mind that Patch is not the best way to make vLLM Ascend compat In `vllm_ascend/patch`, you can see the code structure as follows: -``` +```shell vllm_ascend ├── patch │ ├── platform @@ -27,10 +27,10 @@ vllm_ascend ``` - **platform**: The patch code in this directory is for patching the code in vLLM main process. It's called by `vllm_ascend/platform::NPUPlatform::pre_register_and_update` very early when vLLM is initialized. - - For online mode, vLLM process calls the platform patch in `vllm/vllm/engine/arg_utils.py::AsyncEngineArgs.add_cli_args` when parsing the cli args. - - For offline mode, vLLM process calls the platform patch in `vllm/vllm/engine/arg_utils.py::EngineArgs.create_engine_config` when parsing the input parameters. + - For online mode, vLLM process calls the platform patch in `vllm/vllm/engine/arg_utils.py::AsyncEngineArgs.add_cli_args` when parsing the cli args. + - For offline mode, vLLM process calls the platform patch in `vllm/vllm/engine/arg_utils.py::EngineArgs.create_engine_config` when parsing the input parameters. - **worker**: The patch code in this directory is for patching the code in vLLM worker process. It's called by `vllm_ascend/worker/worker::NPUWorker::__init__` when the vLLM worker process is initialized. - - For both online and offline mode, vLLM engine core process calls the worker patch in `vllm/vllm/worker/worker_base.py::WorkerWrapperBase.init_worker` when initializing the worker process. + - For both online and offline mode, vLLM engine core process calls the worker patch in `vllm/vllm/worker/worker_base.py::WorkerWrapperBase.init_worker` when initializing the worker process. ## How to write a patch @@ -54,7 +54,7 @@ Before writing a patch, following the principle above, we should patch the least 5. Import the patch file in `__init__.py`. In this example, add `import vllm_ascend.patch.platform.patch_distributed` into `vllm_ascend/patch/platform/__init__.py`. 6. Add the description of the patch in `vllm_ascend/patch/__init__.py`. 
The description format is as follows: - ``` + ```python # ** File: ** # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `` @@ -71,5 +71,6 @@ Before writing a patch, following the principle above, we should patch the least 7. Add the Unit Test and E2E Test. Any newly added code in vLLM Ascend should contain the Unit Test and E2E Test as well. You can find more details in [test guide](../contribution/testing.md) ## Limitation + 1. In V1 Engine, vLLM starts three kinds of process: Main process, EngineCore process and Worker process. Now vLLM Ascend only can patch the code in Main process and Worker process by default. If you want to patch the code running in EngineCore process, you should patch EngineCore process entirely during setup. Find the entire code in `vllm.v1.engine.core`. Please override `EngineCoreProc` and `DPEngineCoreProc` entirely. 2. If you are running edited vLLM code, the version of vLLM may be changed automatically. For example, if you run the edited vLLM based on v0.9.n, the version of vLLM may be changed to v0.9.nxxx. In this case, the patch for v0.9.n in vLLM Ascend would not work as expected, because vLLM Ascend can't distinguish the version of the vLLM you're using. In this case, you can set the environment variable `VLLM_VERSION` to specify the version of the vLLM you're using, and then the patch for v0.10.0 should work. diff --git a/docs/source/developer_guide/performance_and_debug/msprobe_guide.md b/docs/source/developer_guide/performance_and_debug/msprobe_guide.md index b16d2918..b18e2a3d 100644 --- a/docs/source/developer_guide/performance_and_debug/msprobe_guide.md +++ b/docs/source/developer_guide/performance_and_debug/msprobe_guide.md @@ -53,7 +53,7 @@ To restrict the operators that are captured, configure the `list` block: - `scope` (list[str]): In PyTorch pynative scenarios this field restricts the dump range. Provide two module or API names that follow the tool's naming convention to lock a range; only data between the two names will be dumped. Examples: - ``` + ```json "scope": ["Module.conv1.Conv2d.forward.0", "Module.fc2.Linear.forward.0"] "scope": ["Cell.conv1.Conv2d.forward.0", "Cell.fc2.Dense.backward.0"] "scope": ["Tensor.add.0.forward", "Functional.square.2.forward"] @@ -62,9 +62,9 @@ To restrict the operators that are captured, configure the `list` block: The `level` setting determines what can be provided—modules when `level=L0`, APIs when `level=L1`, and either modules or APIs when `level=mix`. - `list` (list[str]): Custom operator list. Options include: - - Supply the full names of specific APIs in PyTorch pynative scenarios to only dump those APIs. Example: `"list": ["Tensor.permute.1.forward", "Tensor.transpose.2.forward", "Torch.relu.3.backward"]`. - - When `level=mix`, you can provide module names so that the dump expands to everything produced while the module is running. Example: `"list": ["Module.module.language_model.encoder.layers.0.mlp.ParallelMlp.forward.0"]`. - - Provide a substring such as `"list": ["relu"]` to dump every API whose name contains the substring. When `level=mix`, modules whose names contain the substring are also expanded. + - Supply the full names of specific APIs in PyTorch pynative scenarios to only dump those APIs. Example: `"list": ["Tensor.permute.1.forward", "Tensor.transpose.2.forward", "Torch.relu.3.backward"]`. + - When `level=mix`, you can provide module names so that the dump expands to everything produced while the module is running. 
Example: `"list": ["Module.module.language_model.encoder.layers.0.mlp.ParallelMlp.forward.0"]`. + - Provide a substring such as `"list": ["relu"]` to dump every API whose name contains the substring. When `level=mix`, modules whose names contain the substring are also expanded. Example configuration: @@ -188,7 +188,7 @@ Use `msprobe graph_visualize` to generate results that can be opened inside `tb_ Replace the paths with your dump directories before invoking `msprobe graph_visualize`. **If you only need to build a single graph**, omit `bench_path` to visualize one dump. Multi-rank scenarios (single rank, multi-rank, or multi-step multi-rank) are also supported. `npu_path` or `bench_path` must contain folders named `rank+number`, and every rank folder must contain a non-empty `construct.json` together with `dump.json` and `stack.json`. If any `construct.json` is empty, verify that the dump level includes `L0` or `mix`. When comparing graphs, both `npu_path` and `bench_path` must contain the same set of rank folders so they can be paired one-to-one. - ``` + ```shell ├── npu_path or bench_path | ├── rank0 | | ├── dump_tensor_data (only when the `tensor` option is enabled) diff --git a/docs/source/developer_guide/performance_and_debug/optimization_and_tuning.md b/docs/source/developer_guide/performance_and_debug/optimization_and_tuning.md index 15562908..0ba3acd7 100644 --- a/docs/source/developer_guide/performance_and_debug/optimization_and_tuning.md +++ b/docs/source/developer_guide/performance_and_debug/optimization_and_tuning.md @@ -200,10 +200,12 @@ echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor ``` Purpose + - Forces all CPU cores to run under the `performance` governor - Disables dynamic frequency scaling (e.g., `ondemand`, `powersave`) Benefits + - Keeps CPU cores at maximum frequency - Reduces latency jitter - Improves predictability for inference workloads @@ -224,6 +226,7 @@ Benefits - Improves stability for large in-memory models Notes + - For inference workloads, swap can introduce second-level latency - Recommended values are `0` or `1` @@ -244,6 +247,7 @@ Benefits - Improves performance stability on NUMA systems Recommended For + - Multi-socket servers - Ascend / NPU deployments with explicit NUMA binding - Systems with manually managed CPU and memory affinity @@ -255,14 +259,17 @@ sysctl -w kernel.sched_migration_cost_ns=50000 ``` Purpose + - Increases the cost for the scheduler to migrate tasks between CPU cores Benefits + - Reduces frequent thread migration - Improves CPU cache locality - Lowers latency jitter for inference workloads Parameter Details + - Unit: nanoseconds (ns) - Typical recommended range: 50000–100000 - Higher values encourage threads to stay on the same CPU core diff --git a/docs/source/developer_guide/performance_and_debug/performance_benchmark.md b/docs/source/developer_guide/performance_and_debug/performance_benchmark.md index 9a421393..47aacb86 100644 --- a/docs/source/developer_guide/performance_and_debug/performance_benchmark.md +++ b/docs/source/developer_guide/performance_and_debug/performance_benchmark.md @@ -1,4 +1,5 @@ # Performance Benchmark + This document details the benchmark methodology for vllm-ascend, aimed at evaluating the performance under a variety of workloads. To maintain alignment with vLLM, we use the [benchmark](https://github.com/vllm-project/vllm/tree/main/benchmarks) script provided by the vllm project. 
**Benchmark Coverage**: We measure offline E2E latency and throughput, and fixed-QPS online serving benchmarks. For more details, see [vllm-ascend benchmark scripts](https://github.com/vllm-project/vllm-ascend/tree/main/benchmarks).
@@ -38,10 +39,12 @@ pip install -r benchmarks/requirements-bench.txt
 ```

 ## 3. Run basic benchmarks
+
 This section introduces how to perform performance testing using the benchmark suite built into vLLM.

 ### 3.1 Dataset
-VLLM supports a variety of (datasets)[https://github.com/vllm-project/vllm/blob/main/vllm/benchmarks/datasets.py].
+
+vLLM supports a variety of [datasets](https://github.com/vllm-project/vllm/blob/main/vllm/benchmarks/datasets.py).
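For example, a dataset is typically selected with the `--dataset-name` flag when launching a benchmark run. The sketch below is illustrative only: the model name and flag values are placeholders, and the exact options depend on your vLLM version.

```shell
# Illustrative throughput run with the built-in random dataset.
# Flags are examples only; check `vllm bench throughput --help` for the
# options available in your vLLM version.
vllm bench throughput \
    --model Qwen/Qwen2.5-7B-Instruct \
    --dataset-name random \
    --num-prompts 200 \
    --input-len 128 \
    --output-len 128
```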