From e18643f8a4d5bd9990727654318ad069ea0b56e2 Mon Sep 17 00:00:00 2001 From: zzzzwwjj <34335947+zzzzwwjj@users.noreply.github.com> Date: Thu, 30 Apr 2026 18:44:08 +0800 Subject: [PATCH] [doc][0.18.0] v0.18.0 release note (#8383) ### What this PR does / why we need it? Added v0.18.0 release note. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? --------- Signed-off-by: zzzzwwjj <1183291235@qq.com> --- docs/source/community/versioning_policy.md | 60 +++++----- docs/source/user_guide/release_notes.md | 121 +++++++++++++++++++++ 2 files changed, 152 insertions(+), 29 deletions(-) diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md index 690656d3..584cbcb5 100644 --- a/docs/source/community/versioning_policy.md +++ b/docs/source/community/versioning_policy.md @@ -21,35 +21,37 @@ For example: The table below is the release compatibility matrix for vLLM Ascend release. -| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | Triton Ascend | -|-------------|-------------------|-----------------|-------------|---------------------------------|---------------| -| v0.17.0rc1 | v0.17.0 | >= 3.10, < 3.12 | 8.5.1 | 2.9.0 / 2.9.0 | 3.2.0 | -| v0.16.0rc1 | v0.16.0 | >= 3.10, < 3.12 | 8.5.1 | 2.9.0 / 2.9.0 | 3.2.0 | -| v0.15.0rc1 | v0.15.0 | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 | 3.2.0 | -| v0.14.0rc1 | v0.14.1 | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 | 3.2.0 | -| v0.13.0 | v0.13.0 | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.8.0.post2 | 3.2.0 | -| v0.13.0rc2 | v0.13.0 | >= 3.10, < 3.12 | 8.5.0 | 2.8.0 / 2.8.0.post1 | 3.2.0 | -| v0.13.0rc1 | v0.13.0 | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 | | -| v0.12.0rc1 | v0.12.0 | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 | | -| v0.11.0 | v0.11.0 | >= 3.9, < 3.12 | 8.3.RC2 | 2.7.1 / 2.7.1.post1 | | -| v0.11.0rc3 | v0.11.0 | >= 3.9, < 3.12 | 8.3.RC2 | 2.7.1 / 2.7.1.post1 | | -| v0.11.0rc2 | v0.11.0 | >= 3.9, < 3.12 | 8.3.RC2 | 2.7.1 / 2.7.1 | | -| 
v0.11.0rc1 | v0.11.0 | >= 3.9, < 3.12 | 8.3.RC1 | 2.7.1 / 2.7.1 | | -| v0.11.0rc0 | v0.11.0rc3 | >= 3.9, < 3.12 | 8.2.RC1 | 2.7.1 / 2.7.1.dev20250724 | | -| v0.10.2rc1 | v0.10.2 | >= 3.9, < 3.12 | 8.2.RC1 | 2.7.1 / 2.7.1.dev20250724 | | -| v0.10.1rc1 | v0.10.1/v0.10.1.1 | >= 3.9, < 3.12 | 8.2.RC1 | 2.7.1 / 2.7.1.dev20250724 | | -| v0.10.0rc1 | v0.10.0 | >= 3.9, < 3.12 | 8.2.RC1 | 2.7.1 / 2.7.1.dev20250724 | | -| v0.9.2rc1 | v0.9.2 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1.post1.dev20250619 | | -| v0.9.1 | v0.9.1 | >= 3.9, < 3.12 | 8.2.RC1 | 2.5.1 / 2.5.1.post1 | | -| v0.9.1rc3 | v0.9.1 | >= 3.9, < 3.12 | 8.2.RC1 | 2.5.1 / 2.5.1.post1 | | -| v0.9.1rc2 | v0.9.1 | >= 3.9, < 3.12 | 8.2.RC1 | 2.5.1 / 2.5.1.post1 | | -| v0.9.1rc1 | v0.9.1 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1.post1.dev20250528 | | -| v0.9.0rc2 | v0.9.0 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | | -| v0.9.0rc1 | v0.9.0 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | | -| v0.8.5rc1 | v0.8.5.post1 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | | -| v0.8.4rc2 | v0.8.4 | >= 3.9, < 3.12 | 8.0.0 | 2.5.1 / 2.5.1 | | -| v0.7.3.post1| v0.7.3 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | | -| v0.7.3 | v0.7.3 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | | +| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | Triton Ascend | Mooncake | +|-------------|-------------------|-----------------|-------------|---------------------------------|-------------------|----------| +| v0.18.0 | v0.18.0 | >= 3.10, < 3.12 | 8.5.1 | 2.9.0 / 2.9.0.post1+git4c901a4 | 3.2.0.dev20260322 | 3.9.0 | +| v0.18.0rc1 | v0.18.0 | >= 3.10, < 3.12 | 8.5.1 | 2.9.0 / 2.9.0.post1+git4c901a4 | 3.2.0.dev20260322 | 3.8.9 | +| v0.17.0rc1 | v0.17.0 | >= 3.10, < 3.12 | 8.5.1 | 2.9.0 / 2.9.0 | 3.2.0 | | +| v0.16.0rc1 | v0.16.0 | >= 3.10, < 3.12 | 8.5.1 | 2.9.0 / 2.9.0 | 3.2.0 | | +| v0.15.0rc1 | v0.15.0 | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 | 3.2.0 | | +| v0.14.0rc1 | v0.14.1 | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 | 3.2.0 | | +| v0.13.0 
| v0.13.0 | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.8.0.post2 | 3.2.0 | | +| v0.13.0rc2 | v0.13.0 | >= 3.10, < 3.12 | 8.5.0 | 2.8.0 / 2.8.0.post1 | 3.2.0 | | +| v0.13.0rc1 | v0.13.0 | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 | | | +| v0.12.0rc1 | v0.12.0 | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 | | | +| v0.11.0 | v0.11.0 | >= 3.9, < 3.12 | 8.3.RC2 | 2.7.1 / 2.7.1.post1 | | | +| v0.11.0rc3 | v0.11.0 | >= 3.9, < 3.12 | 8.3.RC2 | 2.7.1 / 2.7.1.post1 | | | +| v0.11.0rc2 | v0.11.0 | >= 3.9, < 3.12 | 8.3.RC2 | 2.7.1 / 2.7.1 | | | +| v0.11.0rc1 | v0.11.0 | >= 3.9, < 3.12 | 8.3.RC1 | 2.7.1 / 2.7.1 | | | +| v0.11.0rc0 | v0.11.0rc3 | >= 3.9, < 3.12 | 8.2.RC1 | 2.7.1 / 2.7.1.dev20250724 | | | +| v0.10.2rc1 | v0.10.2 | >= 3.9, < 3.12 | 8.2.RC1 | 2.7.1 / 2.7.1.dev20250724 | | | +| v0.10.1rc1 | v0.10.1/v0.10.1.1 | >= 3.9, < 3.12 | 8.2.RC1 | 2.7.1 / 2.7.1.dev20250724 | | | +| v0.10.0rc1 | v0.10.0 | >= 3.9, < 3.12 | 8.2.RC1 | 2.7.1 / 2.7.1.dev20250724 | | | +| v0.9.2rc1 | v0.9.2 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1.post1.dev20250619 | | | +| v0.9.1 | v0.9.1 | >= 3.9, < 3.12 | 8.2.RC1 | 2.5.1 / 2.5.1.post1 | | | +| v0.9.1rc3 | v0.9.1 | >= 3.9, < 3.12 | 8.2.RC1 | 2.5.1 / 2.5.1.post1 | | | +| v0.9.1rc2 | v0.9.1 | >= 3.9, < 3.12 | 8.2.RC1 | 2.5.1 / 2.5.1.post1 | | | +| v0.9.1rc1 | v0.9.1 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1.post1.dev20250528 | | | +| v0.9.0rc2 | v0.9.0 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | | | +| v0.9.0rc1 | v0.9.0 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | | | +| v0.8.5rc1 | v0.8.5.post1 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | | | +| v0.8.4rc2 | v0.8.4 | >= 3.9, < 3.12 | 8.0.0 | 2.5.1 / 2.5.1 | | | +| v0.7.3.post1| v0.7.3 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | | | +| v0.7.3 | v0.7.3 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | | | :::{note} If you're using v0.7.3, don't forget to install [mindie-turbo](https://pypi.org/project/mindie-turbo) as well. 
diff --git a/docs/source/user_guide/release_notes.md b/docs/source/user_guide/release_notes.md index 90ee25ef..70a63a5a 100644 --- a/docs/source/user_guide/release_notes.md +++ b/docs/source/user_guide/release_notes.md @@ -1,5 +1,126 @@ # Release Notes +## v0.18.0 - 2026.04.22 + +We're excited to announce the release of v0.18.0 for vLLM Ascend. This is the official release for v0.18.0. Please follow the [official doc](https://docs.vllm.ai/projects/ascend/en/v0.18.0) to get started. + +### Highlights + +**Model Support** + +- **Kimi-K2.x Model Support**: [Experimental]Added support for Kimi-K2.x models. @aipaes @dragondream-chen @SparrowMu @LoganJane [#6755](https://github.com/vllm-project/vllm-ascend/pull/6755) +- **Minimax-m2.x Model Support**: [Experimental]Added support for Minimax-m2.x models with eagle3. @SparrowMu @GDzhu01 [#7105](https://github.com/vllm-project/vllm-ascend/pull/7105) [#7714](https://github.com/vllm-project/vllm-ascend/pull/7714) +- **GLM5 Support**: [Experimental]Added support for GLM5 models without any code modification! +- **Qwen3.x Support**: [Experimental]Added support for Qwen3.x models without any code modification! +- **DeepseekOCR Support**: [Experimental]Added support for DeepseekOCR model and optimize `RelPosAttention` and `CustomQwen2Decoder`. @Wangbei25 [#7737](https://github.com/vllm-project/vllm-ascend/pull/7737) + +**Core Features** + +- **EPLB (Expert Parallelism Load Balance)**: EPLB is more stable with many bug fixes, and has better performance now. EPLB now works in most cases and is recommended for use. [#6528](https://github.com/vllm-project/vllm-ascend/pull/6528) [#7344](https://github.com/vllm-project/vllm-ascend/pull/7344) [#7890](https://github.com/vllm-project/vllm-ascend/pull/7890) [#6477](https://github.com/vllm-project/vllm-ascend/pull/6477) +- **ACLGraph Enhancement**: ACLGraph now support capturing a single merged graph for multi-step drafts, which greatly reduce host bound in multi-step spec decoding case! 
[#5553](https://github.com/vllm-project/vllm-ascend/pull/5553) [#5940](https://github.com/vllm-project/vllm-ascend/pull/5940) +- **KV Pooling**: The enhanced KV pool with the Mooncake connector now supports sparse attention; LMCacheAscendConnector is added as a new KV cache pooling solution for Ascend; FabricMem mode is supported for HIXL interconnect; yuanrong is supported as a backend for AscendStoreConnector; and MooncakeLayerwiseConnector can now be activated together with KV Pooling. Compared with previous versions, KV Pooling has a huge performance optimization on TTFT! [#6339](https://github.com/vllm-project/vllm-ascend/pull/6339) [#6882](https://github.com/vllm-project/vllm-ascend/pull/6882) [#6806](https://github.com/vllm-project/vllm-ascend/pull/6806) [#6869](https://github.com/vllm-project/vllm-ascend/pull/6869) [#7032](https://github.com/vllm-project/vllm-ascend/pull/7032) +- **PD disaggregation**: The Mooncake layerwise connector now supports the hybrid attention manager and the PCP feature. [#7022](https://github.com/vllm-project/vllm-ascend/pull/7022) [#6627](https://github.com/vllm-project/vllm-ascend/pull/6627) +- **NPU Graph EX (npugraph_ex) Enabled by Default**: The npugraph_ex feature is now enabled by default, providing better graph optimization with an integrated inductor pass and MatmulAllReduceAddRMSNorm fusion. [#6354](https://github.com/vllm-project/vllm-ascend/pull/6354) [#6664](https://github.com/vllm-project/vllm-ascend/pull/6664) [#6006](https://github.com/vllm-project/vllm-ascend/pull/6006) +- **RL(Reinforcement learning)**: [Experimental]RL is enhanced with a batch-invariant feature implemented with AscendC and Triton ops, and a routing replay feature is added. [#6590](https://github.com/vllm-project/vllm-ascend/pull/6590) [#6696](https://github.com/vllm-project/vllm-ascend/pull/6696) +- **CPU Binding Enabled by Default**: Enabled ARM-only CPU binding with the global-slicing A3 policy, improving inference throughput in hostbound scenarios. 
[#6686](https://github.com/vllm-project/vllm-ascend/pull/6686) + +### Features + +- Prefix cache is now supported in hybrid model. [#7103](https://github.com/vllm-project/vllm-ascend/pull/7103) +- Flash Comm V1 now supports VL models with MLA, removing a previous limitation for multimodal serving. [#7390](https://github.com/vllm-project/vllm-ascend/pull/7390) +- VL MoE models now support SP, and `sp_threshold` is removed in favor of `sp_min_token_num` from vLLM. [#7044](https://github.com/vllm-project/vllm-ascend/pull/7044) +- [Experimental]Pipeline Parallel now supports async scheduling, improving throughput for PP deployments. [#7136](https://github.com/vllm-project/vllm-ascend/pull/7136) +- Eagle3 now supports QuaRot quantization without embedding. [#7038](https://github.com/vllm-project/vllm-ascend/pull/7038) +- Refactoring eagle3/mtp, eagle3 and mtp are now using the same proposer. [#6349](https://github.com/vllm-project/vllm-ascend/pull/6349) [#7033](https://github.com/vllm-project/vllm-ascend/pull/7033) + +### Hardware and Operator Support + +- **First time support 310P, with huge performance optimization!**: + - support W8A8 quantization. [#6641](https://github.com/vllm-project/vllm-ascend/pull/6641) [#6454](https://github.com/vllm-project/vllm-ascend/pull/6454) + - support weightNZ with quant and unquant case. [#6705](https://github.com/vllm-project/vllm-ascend/pull/6705) + - support W8A8SC quantization. [#7075](https://github.com/vllm-project/vllm-ascend/pull/7075) + - fix post-sampling not working in graph mode. [#8077](https://github.com/vllm-project/vllm-ascend/pull/8077) + - Added addrmsnorm support for 300I DUO. [#6704](https://github.com/vllm-project/vllm-ascend/pull/6704) + - Fix ngram graph replay accuracy error on 310P. [#7134](https://github.com/vllm-project/vllm-ascend/pull/7134) +- **Custom Operators**: Added multiple custom operators including: + - Added AscendC casual_conv1d_fn operator for Qwen3-Next. 
[#6661](https://github.com/vllm-project/vllm-ascend/pull/6661) + - Added Ascend Ops recurrent_gated_delta_rule operator. [#6725](https://github.com/vllm-project/vllm-ascend/pull/6725) + - Added GMM custom operator for MoE models. [#7010](https://github.com/vllm-project/vllm-ascend/pull/7010) + - Optimized the split_qkv_rmsnorm_rope operator. [#6827](https://github.com/vllm-project/vllm-ascend/pull/6827) + - Triton rope now supports index_selecting from cos_sin_cache. [#5450](https://github.com/vllm-project/vllm-ascend/pull/5450) + - Added AscendC fused op transpose_kv_cache_by_block to speed up GQA transfer. [#6366](https://github.com/vllm-project/vllm-ascend/pull/6366) + - Optimized `DispatchFFNCombine` kernel performance and resolved a vector error caused by unaligned UB access. [#6468](https://github.com/vllm-project/vllm-ascend/pull/6468) [#6707](https://github.com/vllm-project/vllm-ascend/pull/6707) + - Refactored and optimized CausalConv1d. [#7495](https://github.com/vllm-project/vllm-ascend/pull/7495) + +### Performance + +- **Initialization Performance**: Optimized Triton operator recompilation to reduce redundant rebuilds and unnecessary recompilation triggered by function parameter optimization. [#7647](https://github.com/vllm-project/vllm-ascend/pull/7647) [#7645](https://github.com/vllm-project/vllm-ascend/pull/7645) +- **Qwen3.x Performance**: [Experimental]Optimized Qwen3.x and Qwen3-Next performance by supporting full graph mode, PD disaggregation, mamba prefill prefix-caching and flashcomm1, prebuilding chunk metadata to reduce host-device synchronization overhead, and multiple op performance optimizations including `chunk_gated_delta_rule`, `chunk_fwd_kernel_o`, `solve_tril`, `recompute_w_u_fwd_kernel`, `split_qkv_rmsnorm_mrope`, etc. 
@LoganJane @shaopeng-666 @ppppeng @SunnyLee151064 @hust17yixuan @Toneymiller @linfeng-yuan [#7487](https://github.com/vllm-project/vllm-ascend/pull/7487) [#6830](https://github.com/vllm-project/vllm-ascend/pull/6830) [#7506](https://github.com/vllm-project/vllm-ascend/pull/7506) [#7796](https://github.com/vllm-project/vllm-ascend/pull/7796) [#7527](https://github.com/vllm-project/vllm-ascend/pull/7527) [#7529](https://github.com/vllm-project/vllm-ascend/pull/7529) [#7495](https://github.com/vllm-project/vllm-ascend/pull/7495) [#7368](https://github.com/vllm-project/vllm-ascend/pull/7368) +- **Kimi-K2.x Performance**: [Experimental]Optimized the Kimi-K2.x performance by supporting eagle3 and flashcomm1, and reducing d2h overhead. @aipaes @dragondream-chen @SparrowMu @LoganJane @GDzhu01 @Yaphets24 @hust17yixuan [#7342](https://github.com/vllm-project/vllm-ascend/pull/7342) [#7390](https://github.com/vllm-project/vllm-ascend/pull/7390) [#7521](https://github.com/vllm-project/vllm-ascend/pull/7521) +- **Qwen3-VL Performance**: Qwen3-VL gets stronger multimodal operator enablement with Flash Comm V1 and `qkv_rmsnorm_mrope` support, and enable 2.7x faster for convolution computation with aclnn BatchMatMulV2, support EAGLE speculative decoding. [#7893](https://github.com/vllm-project/vllm-ascend/pull/7893) [#7852](https://github.com/vllm-project/vllm-ascend/pull/7852) [#7017](https://github.com/vllm-project/vllm-ascend/pull/7017) [#6327](https://github.com/vllm-project/vllm-ascend/pull/6327) +- **Qwen3-Omni Performance**: Qwen3-Omni quantization adaptation and optimization is now available. [#6828](https://github.com/vllm-project/vllm-ascend/pull/6828) +- **DeepSeek-V3.2/GLM5 Performance**: Performance optimizations, support W8A8C8 quantization, and optimized KV cache usage. 
@yydyzr @ZYang6263 @rjg-lyh @Nagisa125 [#7029](https://github.com/vllm-project/vllm-ascend/pull/7029) [#6610](https://github.com/vllm-project/vllm-ascend/pull/6610) +- **GLM4.7-Flash Performance**: Added W8A8 quantization support for GLM4.7-Flash. @aipaes [#6492](https://github.com/vllm-project/vllm-ascend/pull/6492) + +### Dependencies + +- **vLLM**: Upgraded to 0.18.0 and dropped 0.17.0 support. +- **CANN**: Upgraded to 8.5.1. **PS:** AscendStoreConnector with FabricMem mode, 310P device support and the Qwen3-Omni model require upgrading CANN to version 9.0.0; if you need these features, please upgrade manually. +- **torch-npu**: Upgraded to 2.9.0.post1+git4c901a4 because of some known issues. This version isn't installed by default, please upgrade manually. You can get the installation package from this link: . **PS:** If CANN has been upgraded to version 9.0.0, please upgrade the torch-npu version to 2.9.0.post2 synchronously. +- **triton-ascend**: Upgraded to 3.2.0.dev20260322 because of some known issues. This version isn't installed by default, please upgrade manually. You can get the installation package from this link: . **PS:** If CANN has been upgraded to version 9.0.0, please upgrade the triton-ascend version to 3.2.1 synchronously. +- **Transformers**: Upgraded to >= 4.57.4. +- **Mooncake**: Upgraded to 3.9.0. + +> \${PYTHON_TAG} is the Python version tag, and \${ARCH} is the CPU architecture. +> +> For example: for python3.11 and aarch64, \${PYTHON_TAG}=cp311 and \${ARCH}=aarch64. + +### Deprecation & Breaking Changes + +- Cleaned up and deprecated the ProfileExecuteDuration feature. [#6461](https://github.com/vllm-project/vllm-ascend/pull/6461) +- Removed the custom rotary_embedding operator. [#6523](https://github.com/vllm-project/vllm-ascend/pull/6523) +- Cleaned up the unused env `USE_OPTIMIZED_MODEL`. [#6618](https://github.com/vllm-project/vllm-ascend/pull/6618) +- The `enable_flash_comm_v1` config option has been renamed back to `enable_sp`. 
[#6883](https://github.com/vllm-project/vllm-ascend/pull/6883) + +### Documentation + +- Add a new introduction for MiniMax-M2.5 and MiniMax-M2.7. [#8169](https://github.com/vllm-project/vllm-ascend/pull/8169) +- Add preemption guidance in FAQs. [#8136](https://github.com/vllm-project/vllm-ascend/pull/8136) +- Update the deployment and support documentation for GLM5, including parameter descriptions, best practices, and FAQs. [#7963](https://github.com/vllm-project/vllm-ascend/pull/7963) [#7909](https://github.com/vllm-project/vllm-ascend/pull/7909) +- Update the Qwen3.5 user guide. [#7934](https://github.com/vllm-project/vllm-ascend/pull/7934) +- Update the document configuration for DeepSeek-V3.2. [#7970](https://github.com/vllm-project/vllm-ascend/pull/7970) +- Clean up documentation wording and grammar. [#8073](https://github.com/vllm-project/vllm-ascend/pull/8073) +- Refreshed deployment and model docs for Kimi-K2.5, GLM-4.7, DeepSeek-V3.2, MiniMax-M2.5, and PD disaggregation guides. [#7371](https://github.com/vllm-project/vllm-ascend/pull/7371) [#7403](https://github.com/vllm-project/vllm-ascend/pull/7403) [#7292](https://github.com/vllm-project/vllm-ascend/pull/7292) [#7296](https://github.com/vllm-project/vllm-ascend/pull/7296) [#7300](https://github.com/vllm-project/vllm-ascend/pull/7300) +- Added user/developer guide for CPU binding. [#7045](https://github.com/vllm-project/vllm-ascend/pull/7045) +- Added Memcache Usage Guide. [#6476](https://github.com/vllm-project/vllm-ascend/pull/6476) +- Added Benchmark Tutorial for Suffix Speculative Decoding. [#6323](https://github.com/vllm-project/vllm-ascend/pull/6323) +- Added npugraph_ex introduction documentation. [#6306](https://github.com/vllm-project/vllm-ascend/pull/6306) + +### Others + +- Add async communication checks for capturing mode. [#8149](https://github.com/vllm-project/vllm-ascend/pull/8149) +- Fix KV Pool conflicts between pooling scenarios and fix missing KV cache placement on vLLM v0.18.0. 
[#8101](https://github.com/vllm-project/vllm-ascend/pull/8101) [#7874](https://github.com/vllm-project/vllm-ascend/pull/7874) +- Fix short-prompt forwarding by correcting attention state handling. [#8088](https://github.com/vllm-project/vllm-ascend/pull/8088) +- Restore `global_bs=0` and `mc2_mask` for uniform-token dispatching, and support inter-node RoCE hierarchical MC2 communication. [#8040](https://github.com/vllm-project/vllm-ascend/pull/8040) +- Fix the weights mapper bug of Qwen3-VL. [#7868](https://github.com/vllm-project/vllm-ascend/pull/7868) +- Fixed quantization config key mapping in `AscendModelSlimConfig` by switching from reverse mapping to forward mapping. [#7716](https://github.com/vllm-project/vllm-ascend/pull/7716) +- Fixed support for ALL D-Nodes in full graph when running MTP in PD deployment. [#5472](https://github.com/vllm-project/vllm-ascend/pull/5472) +- Layerwise connector now supports the recompute scheduler. [#5900](https://github.com/vllm-project/vllm-ascend/pull/5900) +- Fixed pooling code issues and updated the usage guide. [#6126](https://github.com/vllm-project/vllm-ascend/pull/6126) +- NPUWorker Profiler now supports profile_prefix for a better profiling experience. [#6968](https://github.com/vllm-project/vllm-ascend/pull/6968) + +### Known Issue + +- Currently, `VLLM_ASCEND_ENABLE_FUSED_MC2` is not recommended for the multi-DP, large-number-of-tokens case (`kv_producer` or `kv_both`); this case may create a large number of padded tokens across DP ranks, which will be routed to certain experts and overload those ranks with tokens, resulting in accuracy and performance issues. [#8320](https://github.com/vllm-project/vllm-ascend/issues/8320) +- Currently, EPLB cannot support the `minimax_m2` model or W4A8 quantization. 
[#8341](https://github.com/vllm-project/vllm-ascend/issues/8341) +- Overlaying PCP and eagle3 may generate an error when a prefill req's scheduled token number is smaller than `1 + num_speculative_tokens`, which will make this prefill req be treated as a decode req, resulting in an error. [#8402](https://github.com/vllm-project/vllm-ascend/issues/8402) +- NPU soft partitioning + `CUDAGraphMode.PIECEWISE` is not supported. [#8585](https://github.com/vllm-project/vllm-ascend/issues/8585) +- Qwen3.x currently has an accuracy issue in the PD disaggregation case. [#8421](https://github.com/vllm-project/vllm-ascend/issues/8421) +- Currently, there is a known issue on the x86 arch; this issue has been resolved with CANN 9.0.0, so if you want to deploy vllm-ascend on x86, please upgrade the CANN version manually. [#7993](https://github.com/vllm-project/vllm-ascend/issues/7993) +- The P/D proxy may leak resources on recomputed retries and mask metaserver errors afterwards. [#8852](https://github.com/vllm-project/vllm-ascend/issues/8852) +- When deploying GLM5 and Deepseek V3.2 separately via the PD architecture, there is a probabilistic issue of empty output or garbled characters. [#8853](https://github.com/vllm-project/vllm-ascend/issues/8853) +- For GLM 5/5.1 under PD separation where the D node is set up with TP16 DP2 parallelism, the GPQA accuracy fell short of the standard. [#8844](https://github.com/vllm-project/vllm-ascend/issues/8844) + ## v0.17.0rc1 - 2026.03.15 This is the first release candidate of v0.17.0 for vLLM Ascend. Please follow the [official doc](https://docs.vllm.ai/projects/ascend/en/latest) to get started.