diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md index 2f88e52c..03cbc5a8 100644 --- a/docs/source/community/versioning_policy.md +++ b/docs/source/community/versioning_policy.md @@ -21,29 +21,30 @@ For example: The table below is the release compatibility matrix for vLLM Ascend release. -| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | -|-------------|-------------------|-----------------|-------------|---------------------------------| -| v0.13.0rc1 | v0.13.0 | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 | -| v0.11.0 | v0.11.0 | >= 3.9 , < 3.12 | 8.3.RC2 | 2.7.1 / 2.7.1.post1 | -| v0.12.0rc1 | v0.12.0 | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 | -| v0.11.0rc3 | v0.11.0 | >= 3.9, < 3.12 | 8.3.RC2 | 2.7.1 / 2.7.1.post1 | -| v0.11.0rc2 | v0.11.0 | >= 3.9, < 3.12 | 8.3.RC2 | 2.7.1 / 2.7.1 | -| v0.11.0rc1 | v0.11.0 | >= 3.9, < 3.12 | 8.3.RC1 | 2.7.1 / 2.7.1 | -| v0.11.0rc0 | v0.11.0rc3 | >= 3.9, < 3.12 | 8.2.RC1 | 2.7.1 / 2.7.1.dev20250724 | -| v0.10.2rc1 | v0.10.2 | >= 3.9, < 3.12 | 8.2.RC1 | 2.7.1 / 2.7.1.dev20250724 | -| v0.10.1rc1 | v0.10.1/v0.10.1.1 | >= 3.9, < 3.12 | 8.2.RC1 | 2.7.1 / 2.7.1.dev20250724 | -| v0.10.0rc1 | v0.10.0 | >= 3.9, < 3.12 | 8.2.RC1 | 2.7.1 / 2.7.1.dev20250724 | -| v0.9.2rc1 | v0.9.2 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1.post1.dev20250619 | -| v0.9.1 | v0.9.1 | >= 3.9, < 3.12 | 8.2.RC1 | 2.5.1 / 2.5.1.post1 | -| v0.9.1rc3 | v0.9.1 | >= 3.9, < 3.12 | 8.2.RC1 | 2.5.1 / 2.5.1.post1 | -| v0.9.1rc2 | v0.9.1 | >= 3.9, < 3.12 | 8.2.RC1 | 2.5.1 / 2.5.1.post1 | -| v0.9.1rc1 | v0.9.1 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1.post1.dev20250528 | -| v0.9.0rc2 | v0.9.0 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | -| v0.9.0rc1 | v0.9.0 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | -| v0.8.5rc1 | v0.8.5.post1 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | -| v0.8.4rc2 | v0.8.4 | >= 3.9, < 3.12 | 8.0.0 | 2.5.1 / 2.5.1 | -| v0.7.3.post1| v0.7.3 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 
| -| v0.7.3 | v0.7.3 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | +| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | Triton Ascend | +|-------------|-------------------|-----------------|-------------|---------------------------------|---------------| +| v0.13.0rc2 | v0.13.0 | >= 3.10, < 3.12 | 8.5.0 | 2.8.0 / 2.8.0.post1 | 3.2.0 | +| v0.13.0rc1 | v0.13.0 | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 | | +| v0.11.0 | v0.11.0 | >= 3.9 , < 3.12 | 8.3.RC2 | 2.7.1 / 2.7.1.post1 | | +| v0.12.0rc1 | v0.12.0 | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 | | +| v0.11.0rc3 | v0.11.0 | >= 3.9, < 3.12 | 8.3.RC2 | 2.7.1 / 2.7.1.post1 | | +| v0.11.0rc2 | v0.11.0 | >= 3.9, < 3.12 | 8.3.RC2 | 2.7.1 / 2.7.1 | | +| v0.11.0rc1 | v0.11.0 | >= 3.9, < 3.12 | 8.3.RC1 | 2.7.1 / 2.7.1 | | +| v0.11.0rc0 | v0.11.0rc3 | >= 3.9, < 3.12 | 8.2.RC1 | 2.7.1 / 2.7.1.dev20250724 | | +| v0.10.2rc1 | v0.10.2 | >= 3.9, < 3.12 | 8.2.RC1 | 2.7.1 / 2.7.1.dev20250724 | | +| v0.10.1rc1 | v0.10.1/v0.10.1.1 | >= 3.9, < 3.12 | 8.2.RC1 | 2.7.1 / 2.7.1.dev20250724 | | +| v0.10.0rc1 | v0.10.0 | >= 3.9, < 3.12 | 8.2.RC1 | 2.7.1 / 2.7.1.dev20250724 | | +| v0.9.2rc1 | v0.9.2 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1.post1.dev20250619 | | +| v0.9.1 | v0.9.1 | >= 3.9, < 3.12 | 8.2.RC1 | 2.5.1 / 2.5.1.post1 | | +| v0.9.1rc3 | v0.9.1 | >= 3.9, < 3.12 | 8.2.RC1 | 2.5.1 / 2.5.1.post1 | | +| v0.9.1rc2 | v0.9.1 | >= 3.9, < 3.12 | 8.2.RC1 | 2.5.1 / 2.5.1.post1 | | +| v0.9.1rc1 | v0.9.1 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1.post1.dev20250528 | | +| v0.9.0rc2 | v0.9.0 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | | +| v0.9.0rc1 | v0.9.0 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | | +| v0.8.5rc1 | v0.8.5.post1 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | | +| v0.8.4rc2 | v0.8.4 | >= 3.9, < 3.12 | 8.0.0 | 2.5.1 / 2.5.1 | | +| v0.7.3.post1| v0.7.3 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | | +| v0.7.3 | v0.7.3 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | | :::{note} If you're using v0.7.3, don't forget to 
install [mindie-turbo](https://pypi.org/project/mindie-turbo) as well. @@ -61,6 +62,8 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL | Date | Event | |------------|-------------------------------------------| +| 2026.01.24 | Release candidates, v0.13.0rc2 | +| 2025.12.27 | Release candidates, v0.13.0rc1 | | 2025.12.16 | v0.11.0 Final release, v0.11.0 | | 2025.12.13 | Release candidates, v0.12.0rc1 | | 2025.12.03 | Release candidates, v0.11.0rc3 | diff --git a/docs/source/faqs.md b/docs/source/faqs.md index 8c6bce21..14ac8620 100644 --- a/docs/source/faqs.md +++ b/docs/source/faqs.md @@ -3,7 +3,7 @@ ## Version Specific FAQs - [[v0.11.0] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/4808) -- [[v0.13.0rc1] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/5333) +- [[v0.13.0rc2] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/6186) ## General FAQs diff --git a/docs/source/user_guide/release_notes.md b/docs/source/user_guide/release_notes.md index 1559cf48..5a5cf6c2 100644 --- a/docs/source/user_guide/release_notes.md +++ b/docs/source/user_guide/release_notes.md @@ -1,5 +1,60 @@ # Release Notes +## v0.13.0rc2 - 2026.01.24 + +This is the second release candidate of v0.13.0 for vLLM Ascend. In this rc release, we fixed lots of bugs and improved the performance of many models. Please follow the [official doc](https://docs.vllm.ai/projects/ascend/en/v0.13.0/) to get started. Any feedback is welcome to help us to improve the final version of v0.13.0. + +### Highlights + +We mainly focus on quality and performance improvement in this release. The spec decode, graph mode, context parallel and EPLB have been improved significantly. A lot of bugs have been fixed and the performance has been improved for DeepSeek3.1/3.2, Qwen3 Dense/MOE models. 
+ +### Features + +- Implement basic framework for batch invariant [#5517](https://github.com/vllm-project/vllm-ascend/pull/5517) +- Eagle spec decode feature now works with full graph mode. [#5118](https://github.com/vllm-project/vllm-ascend/pull/5118) +- Context Parallel(PCP&DCP) feature is more stable now. And it works for most cases. Please try it out. +- MTP and eagle spec decode features now work in most cases. And it's suggested to use them in most cases. +- EPLB feature is more stable now. Many bugs have been fixed. Mix placement works now [#6086](https://github.com/vllm-project/vllm-ascend/pull/6086) +- Support kv nz feature for DeepSeek decode node in disagg-prefill scenario [#3072](https://github.com/vllm-project/vllm-ascend/pull/3072) + +### Model Support + +- LongCat-Flash is supported now. [#3833](https://github.com/vllm-project/vllm-ascend/pull/3833) +- minimax_m2 is supported now. [#5624](https://github.com/vllm-project/vllm-ascend/pull/5624) +- Support for cross-attention and whisper models [#5592](https://github.com/vllm-project/vllm-ascend/pull/5592) + +### Performance + +- Many custom ops and triton kernels are added in this release to speed up the performance of models. Such as `RejectSampler`, `MoeInitRoutingCustom`, `DispatchFFNCombine` and so on. +- Improved the performance of Layerwise Connector [#5303](https://github.com/vllm-project/vllm-ascend/pull/5303) + +### Others + +- Basic support for Model Runner V2. Model Runner V2 is the next generation of vLLM. It will be used by default in a future release. 
[#5210](https://github.com/vllm-project/vllm-ascend/pull/5210) +- Fixed a bug that the zmq send/receive may fail [#5503](https://github.com/vllm-project/vllm-ascend/pull/5503) +- Supported to use full-graph with Qwen3-Next-MTP [#5477](https://github.com/vllm-project/vllm-ascend/pull/5477) +- Fix weight transpose in RL scenarios [#5567](https://github.com/vllm-project/vllm-ascend/pull/5567) +- Adapted SP to eagle3 [#5562](https://github.com/vllm-project/vllm-ascend/pull/5562) +- Context Parallel(PCP&DCP) supports mlapo [#5672](https://github.com/vllm-project/vllm-ascend/pull/5672) +- GLM4.6 supports mtp with fullgraph [#5460](https://github.com/vllm-project/vllm-ascend/pull/5460) +- Flashcomm2 now works with oshard generalized feature [#4723](https://github.com/vllm-project/vllm-ascend/pull/4723) +- Support setting tp=1 for the Eagle draft model [#5804](https://github.com/vllm-project/vllm-ascend/pull/5804) +- Flashcomm1 feature now works with qwen3-vl [#5848](https://github.com/vllm-project/vllm-ascend/pull/5848) +- Support fine-grained shared expert overlap [#5962](https://github.com/vllm-project/vllm-ascend/pull/5962) + +### Dependencies + +- CANN is upgraded to 8.5.0 +- torch-npu is upgraded to 2.8.0.post1. Please note that the post version will not be installed by default. Please install it by hand from [pypi mirror](https://mirrors.huaweicloud.com/ascend/repos/pypi/torch-npu/). +- triton-ascend is upgraded to 3.2.0 + +### Deprecation & Breaking Changes + +- `CPUOffloadingConnector` is deprecated. We'll remove it in the next release. It'll be replaced by CPUOffload feature from vLLM in the future. +- eplb config options are moved to `eplb_config` in [additional config](https://docs.vllm.ai/projects/ascend/en/latest/user_guide/configuration/additional_config.html). The old ones will be removed in the next release. 
+- `ProfileExecuteDuration` [feature](https://docs.vllm.ai/projects/ascend/en/latest/developer_guide/performance_and_debug/profile_execute_duration.html) is deprecated. It's replaced by `ObservabilityConfig` from vLLM. +- The value of `VLLM_ASCEND_ENABLE_MLAPO` env will be set to True by default in the next release. It'll be enabled in decode node by default. Please note that this feature will cost more memory. If you are memory sensitive, please set it to False. + ## v0.13.0rc1 - 2025.12.27 This is the first release candidate of v0.13.0 for vLLM Ascend. We landed lots of bug fix, performance improvement and feature support in this release. Any feedback is welcome to help us to improve vLLM Ascend. Please follow the [official doc](https://docs.vllm.ai/projects/ascend/en/latest) to get started. diff --git a/tools/install_flash_infer_attention_score_ops_a2.sh b/tools/install_flash_infer_attention_score_ops_a2.sh index 2e4589a8..7f1fe4cd 100644 --- a/tools/install_flash_infer_attention_score_ops_a2.sh +++ b/tools/install_flash_infer_attention_score_ops_a2.sh @@ -23,7 +23,7 @@ trap 'echo "Error on line $LINENO: command \`$BASH_COMMAND\` failed with exit co cd /vllm-workspace # download fused_infer_attention_score related source files -wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/fused_infer_attention_score_a2_$(uname -i).tar.gz +wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/cann-8.5.0/fused_infer_attention_score_a2_$(uname -i).tar.gz tar -zxvf ./fused_infer_attention_score_a2_$(uname -i).tar.gz # replace fused_infer_attention_score operation files diff --git a/tools/install_flash_infer_attention_score_ops_a3.sh b/tools/install_flash_infer_attention_score_ops_a3.sh index 21819c16..b833bf2d 100644 --- a/tools/install_flash_infer_attention_score_ops_a3.sh +++ b/tools/install_flash_infer_attention_score_ops_a3.sh @@ -22,7 +22,7 @@ trap 'echo "Error on line $LINENO: command \`$BASH_COMMAND\` failed with exit co cd 
/vllm-workspace # download fused_infer_attention_score related source files -wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/fused_infer_attention_score_a3_$(uname -i).tar.gz +wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/cann-8.5.0/fused_infer_attention_score_a3_$(uname -i).tar.gz tar -zxvf ./fused_infer_attention_score_a3_$(uname -i).tar.gz # replace fused_infer_attention_score operation files