diff --git a/docs/source/conf.py b/docs/source/conf.py index b1e33dc..4b8eb0e 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -68,10 +68,10 @@ myst_substitutions = { # the branch of vllm-ascend, used in vllm-ascend clone and image tag # - main branch: 'main' # - vX.Y.Z branch: latest vllm-ascend release tag - 'vllm_ascend_version': 'v0.9.0rc1', + 'vllm_ascend_version': 'v0.9.0rc2', # the newest release version of vllm-ascend and matched vLLM, used in pip install. # This value should be updated when cut down release. - 'pip_vllm_ascend_version': "0.9.0rc1", + 'pip_vllm_ascend_version': "0.9.0rc2", 'pip_vllm_version': "0.9.0", # CANN image tag 'cann_image_tag': "8.1.rc1-910b-ubuntu22.04-py3.10", diff --git a/docs/source/developer_guide/versioning_policy.md b/docs/source/developer_guide/versioning_policy.md index bbb08c3..1533b0a 100644 --- a/docs/source/developer_guide/versioning_policy.md +++ b/docs/source/developer_guide/versioning_policy.md @@ -22,6 +22,7 @@ Following is the Release Compatibility Matrix for vLLM Ascend Plugin: | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | MindIE Turbo | |-------------|--------------|------------------|-------------|--------------------|--------------| +| v0.9.0rc2 | v0.9.0 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | | | v0.9.0rc1 | v0.9.0 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | | | v0.8.5rc1 | v0.8.5.post1 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | | | v0.8.4rc2 | v0.8.4 | >= 3.9, < 3.12 | 8.0.0 | 2.5.1 / 2.5.1 | | @@ -34,6 +35,7 @@ Following is the Release Compatibility Matrix for vLLM Ascend Plugin: | Date | Event | |------------|-------------------------------------------| +| 2025.06.10 | Release candidates, v0.9.0rc2 | | 2025.06.09 | Release candidates, v0.9.0rc1 | | 2025.05.29 | v0.7.x post release, v0.7.3.post1 | | 2025.05.08 | v0.7.x Final release, v0.7.3 | @@ -71,6 +73,7 @@ Usually, each minor version of vLLM (such as 0.7) will correspond to a vLLM Asce | Branch | Status | Note | 
|------------|--------------|--------------------------------------| | main | Maintained | CI commitment for vLLM main branch and vLLM 0.9.x branch | +| v0.9.1-dev | Maintained | CI commitment for vLLM 0.9.0 and 0.9.1 versions | | v0.7.3-dev | Maintained | CI commitment for vLLM 0.7.3 version | | v0.7.1-dev | Unmaintained | Replaced by v0.7.3-dev | diff --git a/docs/source/faqs.md b/docs/source/faqs.md index 8c840be..1d355b5 100644 --- a/docs/source/faqs.md +++ b/docs/source/faqs.md @@ -3,7 +3,7 @@ ## Version Specific FAQs - [[v0.7.3.post1] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/1007) -- [[v0.9.0rc1] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/1115) +- [[v0.9.0rc2] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/1115) ## General FAQs @@ -69,14 +69,14 @@ If all above steps are not working, feel free to submit a GitHub issue. ### 7. How does vllm-ascend perform? -Currently, only some models are improved. Such as `Qwen2 VL`, `Deepseek V3`. Others are not good enough. In the future, we will support graph mode and custom ops to improve the performance of vllm-ascend. And when the official release of vllm-ascend is released, you can install `mindie-turbo` with `vllm-ascend` to speed up the inference as well. +Currently, only some models are improved, such as `Qwen2 VL` and `Deepseek V3`; others are not good enough yet. Starting from 0.9.0rc2, Qwen and Deepseek models work with graph mode to achieve good performance. In addition, you can install `mindie-turbo` with `vllm-ascend v0.7.3` to speed up inference as well. ### 8. How vllm-ascend work with vllm? vllm-ascend is a plugin for vllm. Basically, the version of vllm-ascend is the same as the version of vllm. For example, if you use vllm 0.7.3, you should use vllm-ascend 0.7.3 as well. For main branch, we will make sure `vllm-ascend` and `vllm` are compatible by each commit. ### 9. Does vllm-ascend support Prefill Disaggregation feature? 
-Currently, only 1P1D is supported by vllm. For vllm-ascend, it'll be done by [this PR](https://github.com/vllm-project/vllm-ascend/pull/432). For NPND, vllm is not stable and fully supported yet. We will make it stable and supported by vllm-ascend in the future. +Currently, only 1P1D is supported on V0 Engine. For V1 Engine or NPND support, we will make it stable and supported by vllm-ascend in the future. ### 10. Does vllm-ascend support quantization method? @@ -84,9 +84,7 @@ Currently, w8a8 quantization is already supported by vllm-ascend originally on v ### 11. How to run w8a8 DeepSeek model? -Currently, w8a8 DeepSeek is working in process: [support AscendW8A8 quantization](https://github.com/vllm-project/vllm-ascend/pull/511) - -Please run DeepSeek with BF16 now, following the [Multi-Node DeepSeek inferencing tutorail](https://vllm-ascend.readthedocs.io/en/main/tutorials/multi_node.html) +Please follow the [quantization inference tutorial](https://vllm-ascend.readthedocs.io/en/main/tutorials/multi_npu_quantization.html) and replace the model with DeepSeek. ### 12. There is not output in log when loading models using vllm-ascend, How to solve it? 
diff --git a/docs/source/user_guide/graph_mode.md b/docs/source/user_guide/graph_mode.md index 2bd83ff..126978f 100644 --- a/docs/source/user_guide/graph_mode.md +++ b/docs/source/user_guide/graph_mode.md @@ -54,7 +54,7 @@ outputs = model.generate("Hello, how are you?") online example: ```shell -vllm serve Qwen/Qwen2-7B-Instruct --additional-config='{"torchair_graph_config": {"enable": True}}' +vllm serve Qwen/Qwen2-7B-Instruct --additional-config='{"torchair_graph_config": {"enable": true}}' ``` You can find more detail about additional config [here](./additional_config.md) diff --git a/docs/source/user_guide/release_notes.md b/docs/source/user_guide/release_notes.md index 42a944f..8f72b93 100644 --- a/docs/source/user_guide/release_notes.md +++ b/docs/source/user_guide/release_notes.md @@ -1,5 +1,13 @@ # Release note +## v0.9.0rc2 - 2025.06.10 + +This release contains some quick fixes for v0.9.0rc1. Please use this release instead of v0.9.0rc1. + +### Highlights + +- Fix the import error when vllm-ascend is installed in non-editable mode. [#1152](https://github.com/vllm-project/vllm-ascend/pull/1152) + ## v0.9.0rc1 - 2025.06.09 This is the 1st release candidate of v0.9.0 for vllm-ascend. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/) to start the journey. From this release, V1 Engine is recommended to use. The code of V0 Engine is frozen and will not be maintained any more. Please set environment `VLLM_USE_V1=1` to enable V1 Engine. diff --git a/vllm_ascend/compilation/__init__.py b/vllm_ascend/compilation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 03be38f..2b343d7 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -496,7 +496,11 @@ class NPUModelRunner(LoRAModelRunnerMixin): # Update the block IDs. 
if not req_data.resumed_from_preemption: # Append the new blocks to the existing block IDs. - req_state.block_ids.extend(req_data.new_block_ids) + for block_ids, new_block_ids in zip( # type: ignore[call-overload] + req_state.block_ids, + req_data.new_block_ids, + strict=True): + block_ids.extend(new_block_ids) else: # The request is resumed from preemption. # Replace the existing block IDs with the new ones.