#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2025 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
# Adapted from vllm-project/vllm/vllm/worker/gpu_model_runner.py
#
import math
import sys
from collections import defaultdict
from contextlib import contextmanager, nullcontext
from copy import copy, deepcopy
from dataclasses import dataclass
from multiprocessing import Manager
from typing import (TYPE_CHECKING, Any, Dict, NamedTuple, Optional, Union,
                    TypeAlias, Tuple)

import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
from vllm.attention.layer import Attention, MLAAttention
from vllm.config import (CompilationMode, CUDAGraphMode, VllmConfig,
                         get_layers_from_vllm_config)
from vllm.compilation.cuda_graph import CUDAGraphStat
from vllm.distributed import (get_tensor_model_parallel_world_size,
                              tensor_model_parallel_all_gather)
from vllm.distributed.ec_transfer import get_ec_transfer, has_ec_transfer
from vllm.distributed.kv_transfer import (get_kv_transfer_group,
                                          has_kv_transfer_group)
from vllm.distributed.parallel_state import (get_dcp_group, get_dp_group,
                                             get_pcp_group, get_pp_group,
                                             get_tp_group)
from vllm.forward_context import BatchDescriptor, get_forward_context
from vllm.logger import logger
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
from vllm.model_executor.layers.mamba.abstract import MambaBase
from vllm.model_executor.model_loader import get_model
from vllm.sequence import IntermediateTensors
from vllm.utils.import_utils import LazyLoader
from vllm.utils.math_utils import cdiv, round_up
from vllm.utils.mem_utils import DeviceMemoryProfiler
from vllm.v1.attention.backend import AttentionBackend, AttentionMetadata
from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
from vllm.v1.attention.selector import get_attn_backend  # type: ignore
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.kv_cache_interface import (AttentionSpec,
                                        EncoderOnlyAttentionSpec,
                                        FullAttentionSpec, KVCacheConfig,
                                        KVCacheGroupSpec, KVCacheSpec,
                                        MambaSpec, UniformTypeKVCacheSpecs)
from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
                             ECConnectorOutput, LogprobsLists, LogprobsTensors,
                             ModelRunnerOutput, SamplerOutput,
                             make_empty_encoder_model_runner_output)
from vllm.v1.sample.logits_processor import build_logitsprocs
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.sample.rejection_sampler import RejectionSampler
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
from vllm.v1.spec_decode.suffix_decoding import SuffixDecodingProposer
from vllm.v1.structured_output.utils import apply_grammar_bitmask
from vllm.v1.worker.gpu_model_runner import (AsyncGPUModelRunnerOutput,
                                             GPUModelRunner)
from vllm.v1.worker.kv_connector_model_runner_mixin import KVConnectorOutput
from vllm.v1.worker.utils import AttentionGroup
from vllm.v1.worker.ubatch_utils import (
    UBatchSlices,
    maybe_create_ubatch_slices,
)
from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.attention.attention_v1 import AscendAttentionState
from vllm_ascend.attention.utils import AscendCommonAttentionMetadata, using_paged_attention
# yapf conflicts with isort for this block
# yapf: disable
from vllm_ascend.compilation.acl_graph import (ACLGraphWrapper,
                                               set_draft_graph_params,
                                               set_graph_params,
                                               update_full_graph_params)
# yapf: enable
from vllm_ascend.eplb.adaptor.vllm_adaptor import VllmEplbAdaptor
from vllm_ascend.eplb.core.eplb_device_transfer_loader import \
    D2DExpertWeightLoader
from vllm_ascend.eplb.core.eplb_worker import EplbProcess
from vllm_ascend.eplb.eplb_updator import EplbUpdator
from vllm_ascend.eplb.utils import model_register
from vllm_ascend.ops.rotary_embedding import set_cos_and_sin, update_cos_sin
from vllm_ascend.patch.worker.patch_module import patch_torch_npu_argsort
from vllm_ascend.sample.sampler import AscendSampler
from vllm_ascend.spec_decode import get_spec_decode_method
from vllm_ascend.spec_decode.eagle_proposer import EagleProposer
from vllm_ascend.spec_decode.medusa_proposer import MedusaProposer
from vllm_ascend.spec_decode.mtp_proposer import MtpProposer
from vllm_ascend.utils import (AscendDeviceType, ProfileExecuteDuration,
                               enable_sp, get_ascend_device_type,
                               is_drafter_moe_model, is_moe_model,
                               lmhead_tp_enable, maybe_trans_nz,
                               set_weight_prefetch_method, vllm_version_is)
from vllm_ascend.worker.npu_input_batch import NPUInputBatch
from vllm_ascend.worker.pcp_utils import PCPManager

from vllm_ascend.ascend_forward_context import (  # isort: skip
    MoECommType, get_mc2_tokens_capacity, select_moe_comm_method,
    set_ascend_forward_context, set_mc2_mask, set_mc2_tokens_capacity)
if TYPE_CHECKING:
    import xgrammar as xgr  # type: ignore[import-untyped]
    from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
else:
    xgr = LazyLoader("xgr", globals(), "xgrammar")

import torch_npu

# if true, allow tensor initialization and casting with internal format (e.g., NZ)
torch.npu.config.allow_internal_format = True
AttnMetadataDict: TypeAlias = dict[str, AttentionMetadata]
# list when ubatching is enabled
PerLayerAttnMetadata: TypeAlias = list[AttnMetadataDict] | AttnMetadataDict
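# Illustrative shape (a sketch, not an exhaustive spec): roughly
# {"model.layers.0.self_attn.attn": AttentionMetadata(...), ...}, or a list
# containing one such dict per micro-batch when ubatching is enabled.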
if get_ascend_device_type() == AscendDeviceType._310P:
    torch_npu.npu.set_compile_mode(jit_compile=False)

SEQ_LEN_WITH_MAX_PA_WORKSPACE = 6144
@dataclass
class GraphCaptureContext:
    stream: torch.npu.Stream


@contextmanager
def graph_capture(device: torch.device):
    """
    `graph_capture` is a context manager which should surround the code that
    is capturing the NPU graph. Its main purpose is to ensure that some
    operations are run after the graph is captured, before the graph is
    replayed. It returns a `GraphCaptureContext` object which contains the
    necessary data for the graph capture. Currently, it only contains the
    stream that the graph capture is running on. This stream is set to the
    current NPU stream when the context manager is entered and reset to the
    default stream when the context manager is exited. This is to ensure that
    the graph capture is running on a separate stream from the default stream,
    in order to explicitly distinguish the kernels to capture from other
    kernels possibly launched in the background on the default stream.
    """
    graph_capture_context = GraphCaptureContext(
        torch.npu.Stream(device=device))
    stream = graph_capture_context.stream

    # we use nullcontext now
    maybe_ca_context = nullcontext()

    # ensure all initialization operations complete before attempting to
    # capture the graph on another stream
    curr_stream = torch.npu.current_stream()
    if curr_stream != stream:
        stream.wait_stream(curr_stream)

    with torch.npu.stream(stream), maybe_ca_context:
        yield graph_capture_context
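# Illustrative usage of graph_capture (a minimal sketch; assumes an NPU device
# is available in the current process):
#
#     with graph_capture(torch.device("npu")) as capture_ctx:
#         # kernels launched here run on capture_ctx.stream, separate from the
#         # default stream, so only they are part of the captured graph
#         ...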
def get_tp_context(drafter):
    # Run drafter steps under the draft model's own TP group when it defines
    # one (e.g. draft_tensor_parallel_size differs from the target model's);
    # otherwise fall back to a no-op context.
    return getattr(drafter, "tp_group_context", nullcontext())
class ExecuteModelState(NamedTuple):
    """Ephemeral cached state transferred between execute_model() and
    sample_tokens(), after execute_model() returns None."""

    scheduler_output: "SchedulerOutput"
    logits: torch.Tensor
    spec_decode_metadata: SpecDecodeMetadata | None
    spec_decode_common_attn_metadata: AscendCommonAttentionMetadata | None
    hidden_states: torch.Tensor
    sample_hidden_states: torch.Tensor
    aux_hidden_states: list[torch.Tensor] | None
    attn_metadata: "PerLayerAttnMetadata"
    positions: torch.Tensor
    ec_connector_output: "ECConnectorOutput | None"
    cudagraph_stats: CUDAGraphStat | None
class NPUModelRunner(GPUModelRunner):

    def __init__(self, vllm_config: VllmConfig, device: torch.device):
        # TODO(qcs): These manual pad and unpad for GPUModelRunner are
        # used to expand some buffers, which need to be reverted after
        # the following PR is merged:
        # https://github.com/vllm-project/vllm/pull/28988
        max_pcp_pad_tokens = (
            vllm_config.parallel_config.prefill_context_parallel_size * 2 *
            vllm_config.scheduler_config.max_num_seqs)
        vllm_config.scheduler_config.max_num_batched_tokens += max_pcp_pad_tokens
        with _torch_cuda_wrapper():
            super().__init__(vllm_config, device)
        # NOTE: For FULL mode we change +1 to +2 to reserve extra space for
        # padding. See _pad_query_start_loc_for_fia.
        self.query_start_loc = self._make_buffer(
            self.max_num_reqs + 2, dtype=torch.int32  # type: ignore[has-type]
        )
        vllm_config.scheduler_config.max_num_batched_tokens -= max_pcp_pad_tokens
        self.max_num_tokens = self.scheduler_config.max_num_batched_tokens
        self.max_num_reqs = self.scheduler_config.max_num_seqs

        self.dp_size = vllm_config.parallel_config.data_parallel_size
        self.dp_rank = vllm_config.parallel_config.data_parallel_rank
        try:
            self.dcp_size = get_dcp_group().world_size
            self.dcp_rank = get_dcp_group().rank_in_group
            self.pcp_size = get_pcp_group().world_size
            self.pcp_rank = get_pcp_group(
            ).rank_in_group if self.pcp_size > 1 else 0
        except Exception:
            self.dcp_size = 1
            self.dcp_rank = 0
            self.pcp_size = 1
            self.pcp_rank = 0
        if self.pcp_size > 1:
            self.model_config.max_model_len += 2 * self.pcp_size * self.max_num_reqs

        max_buffer_num_tokens = self.max_num_tokens
        if self.pcp_size * self.dcp_size > 1:
            max_buffer_num_tokens = (self.max_num_tokens +
                                     self.max_num_reqs * 2 * self.pcp_size)
        self.pcp_manager = PCPManager(
            self.pcp_size,
            self.pcp_rank,
            self.dcp_size,
            self.dcp_rank,
            max_buffer_num_tokens,
            self.max_num_reqs,
            self.device,
            self.vllm_config,
            self.use_async_scheduling,
            self.pin_memory,
        )
        # TODO(zhenwenqi): after https://github.com/vllm-project/vllm/pull/28988
        # is merged, we can delete this
        self.input_ids = self._make_buffer(max_buffer_num_tokens,
                                           dtype=torch.int32)
        self.positions = self._make_buffer(max_buffer_num_tokens,
                                           dtype=torch.int64)
        self.sampler = AscendSampler()
        self.attn_state: AscendAttentionState | None = None

        # Ascend-specific configurations
        self.ascend_config = get_ascend_config()
        set_weight_prefetch_method(self.ascend_config.weight_prefetch_config)

        # Dump / PrecisionDebugger configuration now comes from AscendConfig
        dump_cfg = self.ascend_config.dump_config_path
        self.debugger = None
        if dump_cfg is not None:
            if self.model_config.enforce_eager:
                from msprobe.pytorch import PrecisionDebugger
                self.debugger = PrecisionDebugger(dump_cfg)
            else:
                raise RuntimeError(
                    "Dumping/debugging only works in eager mode.")

        # use_hybrid_blocks: whether hybrid blocks are used.
        self.use_hybrid_blocks: bool = False
        self.need_accepted_tokens: bool = False

        self.is_multimodal_model = self.model_config.is_multimodal_model
        self.block_size = vllm_config.cache_config.block_size

        # Set up Attention
        self.use_sparse = hasattr(self.vllm_config.model_config.hf_text_config,
                                  "index_topk")
        self.attn_backend = get_attn_backend(
            0,
            self.dtype,
            None,
            self.block_size,
            use_mla=self.model_config.use_mla,
            use_sparse=self.use_sparse,
            use_mm_prefix=self.model_config is not None
            and self.model_config.is_mm_prefix_lm)

        self._set_up_drafter()
        # kv role
        self.is_kv_producer = False
        self.is_kv_consumer = False
        if vllm_config.kv_transfer_config is not None:
            self.is_kv_producer = vllm_config.kv_transfer_config.is_kv_producer
            self.is_kv_consumer = vllm_config.kv_transfer_config.is_kv_consumer

        set_cos_and_sin(vllm_config, self.max_num_reqs,
                        self.uniform_decode_query_len, self.dtype, self.device)
        set_mc2_tokens_capacity(vllm_config, self.max_num_reqs,
                                self.uniform_decode_query_len)
        set_mc2_mask(vllm_config, self.device)
        self.decode_threshold = 1 + (
            self.speculative_config.num_speculative_tokens
            if self.speculative_config else 0)

        self.use_aclgraph = self._use_aclgraph()

        eplb_config = self.ascend_config.eplb_config
        self.dynamic_eplb = eplb_config.dynamic_eplb
        if self.dynamic_eplb:
            self.is_eplb_warmuped = False
            self.policy_type = eplb_config.eplb_policy_type
            self.eplb_loader = D2DExpertWeightLoader()
            self.manager = Manager()
            self.shared_dict = self.manager.dict({
                "expert_map": None,
                "moe_load": None,
                "expert_maps": None
            })
            self.eplb_process = EplbProcess(shared_dict=self.shared_dict,
                                            policy_type=self.policy_type,
                                            enable_d2d=True)
            self.process = self.eplb_process._launch_process()
            self.eplb_updator = EplbUpdator(eplb_config, self.eplb_loader,
                                            self.eplb_process, self.process)

        # Input Batch
        # NOTE(Chen): Ideally, we should initialize the input batch inside
        # `initialize_kv_cache` based on the kv cache config. However, as in
        # https://github.com/vllm-project/vllm/pull/18298, due to some unknown
        # reasons, we have to initialize the input batch before `load_model`;
        # quantization + weight offloading will fail otherwise. As a temporary
        # solution, we initialize the input batch here, and re-initialize it
        # in `initialize_kv_cache` if the block_sizes here are different from
        # the block_sizes in the kv cache config.
        self.input_batch = NPUInputBatch(
            max_num_reqs=self.max_num_reqs,
            max_model_len=max(self.model_config.max_model_len,
                              self.max_encoder_len),
            max_num_batched_tokens=self.max_num_tokens,
            device=self.device,
            pin_memory=self.pin_memory,
            vocab_size=self.model_config.get_vocab_size(),
            block_sizes=[self.block_size],
            kernel_block_sizes=[[self.cache_config.block_size]],
            is_spec_decode=bool(self.vllm_config.speculative_config),
            logitsprocs=build_logitsprocs(
                self.vllm_config, self.device, self.pin_memory,
                self.is_pooling_model,
                self.vllm_config.model_config.logits_processors),
            is_pooling_model=self.is_pooling_model,
            num_speculative_tokens=(
                self.vllm_config.speculative_config.num_speculative_tokens
                if self.vllm_config.speculative_config else 0),
            cp_kv_cache_interleave_size=self.parallel_config.
            cp_kv_cache_interleave_size,
        )

        self.num_draft_tokens = self._make_buffer(self.max_num_reqs,
                                                  dtype=torch.int32)
        # here we use int32
        self.sampled_token_ids_pinned_cpu = torch.empty(
            (self.max_num_reqs, 1),
            dtype=torch.int32,
            device="cpu",
            pin_memory=self.pin_memory,
        )
        # For clean code: the following three attrs are actually defined in
        # gpu_model_runner.
        self.execute_model_state: ExecuteModelState | None = None
        # None in the first PP rank. The rest are set after load_model.
        self.intermediate_tensors: IntermediateTensors | None = None
        self.reorder_batch_threshold: int | None = None
        self.long_seq_metadata = None
    @property
    def use_cp(self) -> bool:
        return self.pcp_size * self.dcp_size > 1

    def _init_device_properties(self) -> None:
        self.num_sms = None

    def _sync_device(self) -> None:
        torch.npu.synchronize()
    def _set_up_drafter(self):
        # Set up speculative decoding.
        self.drafter: Optional[Union[NgramProposer, EagleProposer, MtpProposer,
                                     SuffixDecodingProposer,
                                     MedusaProposer]] = None
        self.actual_seq_lengths_q: list[int] = []
        self.decode_token_per_req = 1
        if self.speculative_config:
            spec_token_num = self.speculative_config.num_speculative_tokens
            assert spec_token_num > 0
            self.decode_token_per_req = 1 + spec_token_num
            if get_pp_group().is_last_rank:
                self.drafter = self._get_drafter()
                if self.speculative_config.method == "eagle3":
                    assert isinstance(self.drafter, EagleProposer)
                    self.use_aux_hidden_state_outputs = (
                        self.drafter.eagle3_use_aux_hidden_state)
                self.rejection_sampler = RejectionSampler(self.sampler)
            self.actual_seq_lengths_q = list(
                range(self.decode_token_per_req, self.max_num_tokens + 1,
                      self.decode_token_per_req))

        self.discard_request_indices = self._make_buffer(self.max_num_reqs,
                                                         dtype=torch.int64)
        self.num_discarded_requests = 0

    def _get_drafter(self):
        return get_spec_decode_method(self.speculative_config.method,
                                      self.vllm_config, self.device, self)
    def _use_aclgraph(self) -> bool:
        return (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
                and self.compilation_config.mode == CompilationMode.VLLM_COMPILE
                and not self.model_config.enforce_eager)
    def _skip_all_reduce_across_dp_group(self, is_draft_model=False) -> bool:
        """
        Decide whether to skip the all-reduce across the data-parallel (DP) group.

        Skipping is applicable for all dense models, and for MoE models only on
        ranks that act as KV consumers. We skip the DP all-reduce when either:
        - Both the prefill and decode communication methods are MC2 (or FUSED_MC2), or
        - Decode requires MC2 and ascend_config.recompute_scheduler_enable is True.
        """
        # For dense models, since we don't actually need dp communication, we
        # simply skip it. This usually happens when the main model is MoE while
        # the eagle draft model is dense.
        is_context_moe_model = is_drafter_moe_model(self.vllm_config) if is_draft_model \
            else is_moe_model(self.vllm_config)
        if not is_context_moe_model:
            return True
        # Only applicable to MoE models on KV consumer ranks.
        if not self.is_kv_consumer:
            return False

        def needs_mc2(num_tokens: int) -> bool:
            return select_moe_comm_method(num_tokens, self.vllm_config) in {
                MoECommType.MC2, MoECommType.FUSED_MC2
            }
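
        # Illustrative decode-side probe (hypothetical sizes): with no capture
        # sizes configured, max_num_reqs=16 and uniform_decode_query_len=2, the
        # check below evaluates needs_mc2(32); if that probe and the prefill
        # probe both resolve to MC2/FUSED_MC2, the DP all-reduce is skipped.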
        # Determine whether decode must use MC2. Use max cudagraph capture size
        # if available, otherwise use the maximal uniform decode token count.
        if self.compilation_config.cudagraph_capture_sizes:
            potential_max_tokens = self.compilation_config.max_cudagraph_capture_size
        else:
            potential_max_tokens = self.max_num_reqs * self.uniform_decode_query_len
        decode_must_use_mc2 = needs_mc2(potential_max_tokens)

        # For prefill, use the scheduler's max_num_batched_tokens for a single
        # batch.
        prefill_must_use_mc2 = needs_mc2(
            self.vllm_config.scheduler_config.max_num_batched_tokens)

        # Skip all-reduce if decode requires MC2 and either prefill also
        # requires MC2 or the recompute-based scheduler is enabled.
        return decode_must_use_mc2 and (
            prefill_must_use_mc2
            or self.ascend_config.recompute_scheduler_enable)
    def _sync_metadata_across_dp(
        self,
        num_tokens: int,
        with_prefill: bool = False,
        is_draft_model: bool = False
    ) -> tuple[int, Optional[torch.Tensor], bool]:
        # TODO: In vLLM, the only thing that needs to be synced is num_tokens, but in
        # our case, we still need to sync the other two flags as well. So we need to
        # include them in the all_reduce operation, and moreover, we CANNOT skip it
        # even if we are running in eager mode, which harms performance.
        # FIXME: Restore the `or self.vllm_config.model_config.enforce_eager` here
        # immediately once the other two flags are no longer needed.
        if self.dp_size == 1:
            return num_tokens, None, with_prefill
        if self._skip_all_reduce_across_dp_group(is_draft_model):
            num_tokens_after_padding = torch.tensor([num_tokens] *
                                                    self.dp_size,
                                                    device="cpu",
                                                    dtype=torch.int32)
            return num_tokens, num_tokens_after_padding, with_prefill

        # Sync num_tokens, with_prefill across dp ranks
        num_tokens_tensor = torch.tensor([
            num_tokens if i == self.dp_rank else 0 for i in range(self.dp_size)
        ],
                                         dtype=torch.int32,
                                         device="cpu")
        flags_tensor = torch.tensor([int(with_prefill)],
                                    dtype=torch.int32,
                                    device="cpu")
        packed_tensor = torch.cat([num_tokens_tensor, flags_tensor])
        # use cpu_group to avoid cpu synchronization issue.
        # it can be overlapped with main model execution on npu.
        dist.all_reduce(packed_tensor, group=get_dp_group().cpu_group)

        # Unpack the results.
        num_tokens_across_dp = packed_tensor[:-1]
        synced_flags = packed_tensor[-1:]
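
        # Every DP rank pads to the global maximum token count so that all
        # ranks run the forward pass with a consistent padded shape.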
        max_tokens_across_dp = torch.max(num_tokens_across_dp).item()
        global_with_prefill = bool(synced_flags[0])
        # Create a tensor for num_tokens_after_padding.
        num_tokens_after_padding = torch.tensor([max_tokens_across_dp] *
                                                self.dp_size,
                                                device="cpu",
                                                dtype=torch.int32)

        return max_tokens_across_dp, num_tokens_after_padding, global_with_prefill

    def get_model(self) -> nn.Module:
        # Get the raw model out of the aclgraph wrapper.
        if isinstance(self.model, ACLGraphWrapper):
            return self.model.unwrap()
        return self.model

    def _pad_query_start_loc_for_fia(
            self, num_tokens_padded: int, num_reqs_padded: int,
            num_reqs: int) -> int:
        """
        This function is only designed to satisfy the constraint that, when the
        layout is TND, the first dimension of `hidden_states` must equal the
        last element of `actual_seq_lengths_q`.
        """
        if num_tokens_padded == num_reqs_padded * self.uniform_decode_query_len:
            # Uniform-batch case: num_reqs must be no greater than num_reqs_padded.
            assert num_reqs <= num_reqs_padded
            last_loc = self.query_start_loc.np[num_reqs]
            self.query_start_loc.np[num_reqs + 1:num_reqs_padded + 1] = (
                self.arange_np[1:num_reqs_padded + 1 - num_reqs] *
                self.uniform_decode_query_len + last_loc)
        else:
            # Mixed-batch case: num_reqs must equal num_reqs_padded.
            assert num_reqs == num_reqs_padded
            # Insert a dummy request instead of setting
            # query_start_loc[num_reqs] = num_tokens_padded directly.
            self.query_start_loc.np[num_reqs_padded + 1] = num_tokens_padded
            num_reqs_padded = num_reqs_padded + 1
        self.query_start_loc.copy_to_gpu()
        return num_reqs_padded

    def _prepare_inputs(
        self,
        scheduler_output: "SchedulerOutput",
        num_scheduled_tokens: np.ndarray,
    ) -> tuple[torch.Tensor, SpecDecodeMetadata | None]:
        """
        :return: tuple[logits_indices, spec_decode_metadata]
        """
        total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
        assert total_num_scheduled_tokens > 0
        num_reqs = self.input_batch.num_reqs
        assert num_reqs > 0
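
        # The rest of this method: (1) build per-token request indices and
        # positions, (2) compute the slot mapping and query/sequence lengths,
        # (3) copy the prepared input tensors to the NPU, and (4) derive the
        # logits indices (plus spec-decode metadata when draft tokens are
        # scheduled).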

        # OPTIMIZATION: Start copying the block table first.
        # This way, we can overlap the copy with the following CPU operations.
        self.input_batch.block_table.commit_block_table(num_reqs)

        req_indices = np.repeat(self.arange_np[:num_reqs],
                                num_scheduled_tokens)

        # Get the attention state.
        if not scheduler_output.scheduled_spec_decode_tokens:
            num_valid_tokens = num_scheduled_tokens
        else:
            num_valid_tokens = np.array([
                scheduler_output.num_scheduled_tokens[i] -
                len(scheduler_output.scheduled_spec_decode_tokens.get(i, []))
                for i in self.input_batch.req_ids
            ], dtype=np.int32)

        attn_state = self._build_attn_state(num_reqs, num_scheduled_tokens,
                                            num_valid_tokens)
        self.attn_state = attn_state  # type: ignore
        # Determine if it's a splitfuse batch.
        with_prefill = attn_state not in [
            AscendAttentionState.DecodeOnly, AscendAttentionState.SpecDecoding
        ]
        self.with_prefill = with_prefill

# Get positions.
positions_np = self . positions . np [ : total_num_scheduled_tokens ]
cu_num_tokens , arange = self . _get_cumsum_and_arange (
num_scheduled_tokens )
np . add ( self . input_batch . num_computed_tokens_cpu [ req_indices ] ,
arange ,
out = positions_np )
2025-03-20 19:34:44 +08:00
2025-12-31 09:29:57 +08:00
self . input_batch . block_table . compute_slot_mapping (
req_indices , positions_np )
self . input_batch . block_table . commit_slot_mapping (
total_num_scheduled_tokens )
# for pcp, prefill mtp should use origin scheduleroutput ,
if self . speculative_config and self . pcp_size * self . dcp_size > 1 :
self . pcp_manager . generate_pcp_mtp_input (
2026-01-20 15:24:05 +08:00
num_reqs ,
total_num_scheduled_tokens ,
scheduler_output . num_scheduled_tokens ,
with_prefill ,
self . input_batch ,
self . arange_np ,
req_indices ,
positions_np ,
cu_num_tokens ,
self . _draft_token_ids , # type: ignore[has-type]
scheduler_output ,
self . num_spec_tokens )
2025-12-31 09:29:57 +08:00
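        # When PCP is enabled, sequences are split across the PCP ranks, so the
        # scheduled token counts, request indices, and positions are recomputed
        # below for the local split.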
        if self.pcp_size > 1:
            num_scheduled_tokens[:num_reqs], position_pcp = (
                self.pcp_manager.update_tokens_for_pcp(
                    num_scheduled_tokens[:num_reqs],
                    self.arange_np,
                    self.input_batch.num_reqs,
                    self.reorder_batch_threshold,
                ))
            # Re-update these after PCP splits the sequences.
            total_num_scheduled_tokens = sum(num_scheduled_tokens)
            req_indices = np.repeat(self.arange_np[:num_reqs],
                                    num_scheduled_tokens)
            cu_num_tokens, _ = self._get_cumsum_and_arange(
                num_scheduled_tokens)
            positions_np = self.positions.np[:total_num_scheduled_tokens]
            np.add(
                self.input_batch.num_computed_tokens_cpu[req_indices],
                position_pcp[:total_num_scheduled_tokens],
                out=positions_np,
            )

        self.query_lens = torch.from_numpy(num_scheduled_tokens)

# Get token indices.
# E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
# -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2]
# where M is the max_model_len.
token_indices = ( positions_np +
req_indices * self . input_batch . token_ids_cpu . shape [ 1 ] )
2025-10-30 17:15:57 +08:00
token_indices_tensor = torch . from_numpy ( token_indices )
2025-09-08 10:45:23 +08:00
# Prepare input_ids.
# NOTE(woosuk): We use torch.index_select instead of np.take here
# because torch.index_select is much faster than np.take for large
# tensors.
torch . index_select ( self . input_batch . token_ids_cpu_tensor . flatten ( ) ,
0 ,
2025-10-30 17:15:57 +08:00
token_indices_tensor ,
2025-12-12 17:27:09 +08:00
out = self . input_ids . cpu [ : total_num_scheduled_tokens ] )
if self . enable_prompt_embeds :
is_token_ids = self . input_batch . is_token_ids_tensor . flatten ( )
torch . index_select (
is_token_ids ,
0 ,
token_indices_tensor ,
out = self . is_token_ids . cpu [ : total_num_scheduled_tokens ] )
2025-10-30 17:15:57 +08:00

        # Because we did not pre-allocate a massive prompt_embeds CPU tensor on
        # the InputBatch, we need to fill in the prompt embeds into the expected
        # spots in the GpuModelRunner's pre-allocated prompt_embeds tensor.
        if self.input_batch.req_prompt_embeds and (self.is_multimodal_model or
                                                   self.enable_prompt_embeds):
            output_idx = 0
            for req_idx in range(num_reqs):
                num_sched = num_scheduled_tokens[req_idx]
                # Skip if this request doesn't have embeddings.
                if req_idx not in self.input_batch.req_prompt_embeds:
                    output_idx += num_sched
                    continue
                # Skip if no tokens are scheduled.
                if num_sched <= 0:
                    output_idx += num_sched
                    continue
                req_embeds = self.input_batch.req_prompt_embeds[req_idx]
                start_pos = self.input_batch.num_computed_tokens_cpu[req_idx]
                # Skip if trying to read beyond the available embeddings.
                if start_pos >= req_embeds.shape[0]:
                    output_idx += num_sched
                    continue
                # Copy the available embeddings.
                end_pos = start_pos + num_sched
                actual_end = min(end_pos, req_embeds.shape[0])
                actual_num_sched = actual_end - start_pos
                if actual_num_sched > 0:
                    self.inputs_embeds.cpu[output_idx:output_idx +
                                           actual_num_sched].copy_(
                                               req_embeds[start_pos:actual_end])
                output_idx += num_sched
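
        # query_start_loc is the running prefix sum (starting at 0) of the
        # scheduled token counts; seq_lens is each request's already-computed
        # tokens plus its newly scheduled tokens.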
        self.query_start_loc.np[0] = 0
        self.query_start_loc.np[1:num_reqs + 1] = cu_num_tokens
        self.query_start_loc.copy_to_gpu()

        self.seq_lens.np[:num_reqs] = (
            self.input_batch.num_computed_tokens_cpu[:num_reqs] +
            num_scheduled_tokens)
        self.seq_lens.copy_to_gpu()

        self.seq_lens.gpu[num_reqs:].fill_(0)

        # Copy the tensors to the NPU.
        self._prepare_input_ids(scheduler_output, total_num_scheduled_tokens,
                                cu_num_tokens)

# Calculate M-RoPE positions.
# Only relevant for models using M-RoPE (e.g, Qwen2-VL)
if self . uses_mrope :
# Only relevant for models using M-RoPE (e.g, Qwen2-VL)
self . _calc_mrope_positions ( scheduler_output )
self . mrope_positions . gpu [ : , : total_num_scheduled_tokens ] . copy_ (
self . mrope_positions . cpu [ : , : total_num_scheduled_tokens ] ,
non_blocking = True ,
)
elif self . uses_xdrope_dim > 0 :
self . _calc_xdrope_positions ( scheduler_output )
# Only relevant for models using XD-RoPE (e.g, HunYuan-VL)
self . xdrope_positions . gpu [ : , : total_num_scheduled_tokens ] . copy_ (
self . xdrope_positions . cpu [ : , : total_num_scheduled_tokens ] ,
non_blocking = True ,
)
else :
# Common case (1D positions)
self . positions . copy_to_gpu ( total_num_scheduled_tokens )

        # Record the indices of requests that should not be sampled,
        # so that we can clear the sampled tokens before returning.
        num_tokens = [
            self.requests[r].num_tokens for r in self.input_batch.req_ids
        ]
        num_tokens_np = np.array(num_tokens, dtype=np.int32)

        base_num_reqs = self.input_batch.num_reqs
        num_reqs = base_num_reqs
        if self.pcp_size > 1:
            # While pcp > 1, we need the original num_scheduled_tokens (before
            # the PCP split) to calculate discard_requests_mask.
            tokens_original = [
                scheduler_output.num_scheduled_tokens[i]
                for i in self.input_batch.req_ids
            ]
            original_seq_lens_np = (
                self.input_batch.num_computed_tokens_cpu[:num_reqs] +
                np.array(tokens_original, dtype=np.int32))
            discard_requests_mask = original_seq_lens_np < num_tokens_np
        else:
            discard_requests_mask = self.seq_lens.np[:num_reqs] < num_tokens_np

        discard_request_indices = np.nonzero(discard_requests_mask)[0]
        self.num_discarded_requests = len(discard_request_indices)
        self.discard_request_indices.np[:self.num_discarded_requests] = (
            discard_request_indices)
        self.discard_request_indices.copy_to_gpu(self.num_discarded_requests)
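
        # Choose between the plain sampling path and the speculative-decoding
        # path. When draft tokens are scheduled, logits are needed for every
        # draft position plus one bonus position per request, which is what
        # _calc_spec_decode_metadata computes below.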
        use_spec_decode = len(
            scheduler_output.scheduled_spec_decode_tokens) > 0
        if not use_spec_decode:
            # NOTE(woosuk): Due to chunked prefills, the batch may contain
            # partial requests. While we should not sample any token
            # from these partial requests, we do so for simplicity.
            # We will ignore the sampled tokens from the partial requests.
            # TODO: Support prompt logprobs.
            spec_decode_metadata = None
            num_draft_tokens = None
            num_sampled_tokens = np.ones(num_reqs, dtype=np.int32)
            if self.pcp_size * self.dcp_size > 1:
                logits_indices = self.pcp_manager.get_logits_indices(
                    cu_num_tokens, num_reqs)
                logits_indices = logits_indices.pin_memory().to(
                    self.device, non_blocking=True)
            else:
                logits_indices = self.query_start_loc.gpu[1:num_reqs + 1] - 1
        else:
            # Get the number of draft tokens for each request.
            # Iterate over the dictionary rather than all requests since not all
            # requests have draft tokens.
            num_draft_tokens = np.zeros(num_reqs, dtype=np.int32)
            # For chunked prefills, use -1 as the mask rather than 0, as guided
            # decoding may roll back speculative tokens.
            num_decode_draft_tokens = np.full(num_reqs, -1, dtype=np.int32)
            for (
                    req_id,
                    draft_token_ids,
            ) in scheduler_output.scheduled_spec_decode_tokens.items():
                req_idx = self.input_batch.req_id_to_index[req_id]
                num_draft_tokens[req_idx] = len(draft_token_ids)
                num_decode_draft_tokens[req_idx] = (len(draft_token_ids) if (
                    self.input_batch.num_computed_tokens_cpu[req_idx]
                    >= self.input_batch.num_prompt_tokens[req_idx]) else -1)
            spec_decode_metadata = self._calc_spec_decode_metadata(
                num_draft_tokens,
                cu_num_tokens,
                num_pcp_pads=self.pcp_manager.num_pcp_pads_cpu[:num_reqs]
                if self.pcp_size > 1 else None)
            logits_indices = spec_decode_metadata.logits_indices
            num_sampled_tokens = num_draft_tokens + 1
            # For DECODE-only cuda graph of some attention backends (e.g., GDN).
            self.num_decode_draft_tokens.np[:num_reqs] = num_decode_draft_tokens
            self.num_decode_draft_tokens.np[num_reqs:].fill(-1)
            self.num_decode_draft_tokens.copy_to_gpu()
        # Save logits_indices for PCP spec decode usage.
        self.logits_indices = logits_indices

        # Hot-swap the LoRA model.
        if self.lora_config:
            assert (np.sum(num_sampled_tokens) <=
                    self.vllm_config.scheduler_config.max_num_batched_tokens)
            self.set_active_loras(self.input_batch, num_scheduled_tokens,
                                  num_sampled_tokens)

        if lmhead_tp_enable():
            max_num_reqs_across_dp = (self.max_num_reqs *
                                      self.uniform_decode_query_len)
            logits_indices = nn.functional.pad(
                logits_indices,
                (0, max_num_reqs_across_dp - logits_indices.shape[0]))

        return logits_indices, spec_decode_metadata

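    # _build_attn_state classifies the batch into one of the Ascend attention
    # states (PrefillNoCache, PrefillCacheHit, DecodeOnly, SpecDecoding, or
    # ChunkedPrefill); the attention backends select their kernel path based
    # on this state.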
    def _build_attn_state(self, num_reqs, num_scheduled_tokens,
                          num_valid_tokens):
        if np.all(self.input_batch.num_computed_tokens_cpu[:num_reqs] == 0):
            attn_state = AscendAttentionState.PrefillNoCache
        # We assume it is the decode stage, where prefill occurs but only one
        # token per request is not hit in the cache.
        elif np.all(num_scheduled_tokens == 1):
            attn_state = AscendAttentionState.DecodeOnly
            if self.speculative_config and self.speculative_config.method == 'mtp':
                # SpecDecoding now supports seq_len=1 and seq_len=2.
                # In the Prefill-Decode Disaggregation scenario, SpecDecoding
                # needs to support seq_len=1.
                attn_state = AscendAttentionState.SpecDecoding
        # Speculative decoding.
        elif np.all(num_valid_tokens == 1):
            if self.speculative_config and self.speculative_config.method == 'mtp':
                attn_state = AscendAttentionState.SpecDecoding
            else:
                attn_state = AscendAttentionState.ChunkedPrefill
        # splitfuse
        elif self.scheduler_config.enable_chunked_prefill:
            attn_state = AscendAttentionState.ChunkedPrefill
        else:
            attn_state = AscendAttentionState.PrefillCacheHit
        return attn_state

    def _calc_spec_decode_metadata(
        self,
        num_draft_tokens: np.ndarray,
        cu_num_scheduled_tokens: np.ndarray,
        num_pcp_pads: np.ndarray | None,
    ) -> SpecDecodeMetadata:
        # Inputs:
        # cu_num_scheduled_tokens:  [  4, 104, 107, 207, 209]
        # num_draft_tokens:         [  3,   0,   2,   0,   1]
        # Outputs:
        # cu_num_draft_tokens:      [  3,   3,   5,   5,   6]
        # logits_indices:           [  0,   1,   2,   3, 103, 104, 105, 106,
        #                            206, 207, 208]
        # target_logits_indices:    [  0,   1,   2,   5,   6,   9]
        # bonus_logits_indices:     [  3,   4,   7,   8,  10]

        # Compute the logits indices.
        # [4, 1, 3, 1, 2]
        num_sampled_tokens = num_draft_tokens + 1
        # Step 1. [4, 5, 8, 9, 11]
        cu_num_sampled_tokens = np.cumsum(num_sampled_tokens, dtype=np.int32)
        total_num_sampled_tokens = cu_num_sampled_tokens[-1]
        # Step 2. [0, 0, 0, 0, 4, 5, 5, 5, 8, 9, 9]
        cumsums_offsets = np.repeat(cu_num_sampled_tokens - num_sampled_tokens,
                                    num_sampled_tokens)
        # Step 3. [0, 1, 2, 3, 0, 0, 1, 2, 0, 0, 1]
        arange = self.arange_np[:total_num_sampled_tokens] - cumsums_offsets
        # Step 4. [0, 0, 0, 0, 103, 104, 104, 104, 206, 207, 207]
        logits_indices = np.repeat(
            cu_num_scheduled_tokens - num_sampled_tokens, num_sampled_tokens)
        # Step 5. [0, 1, 2, 3, 103, 104, 105, 106, 206, 207, 208]
        logits_indices += arange

        # When pcp > 1, decode results may contain padding (from the PCP
        # all-gather), so build a separate logits_indices_pcp to return after
        # draft_token_ids have been gathered with the original logits_indices.
        if self.pcp_size > 1:
            cu_num_scheduled_tokens = (cu_num_scheduled_tokens * self.pcp_size
                                       - num_pcp_pads)
            logits_indices_pcp = np.repeat(
                cu_num_scheduled_tokens - num_sampled_tokens,
                num_sampled_tokens)
            logits_indices_pcp += arange
            logits_indices_pcp = torch.from_numpy(
                logits_indices_pcp).pin_memory().to(self.device,
                                                    non_blocking=True)

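        # target_logits_indices select, for every draft token, the logits
        # position used to verify it; bonus_logits_indices select the final
        # sampled position of each request (the "bonus" token position).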
        # Compute the bonus logits indices.
        bonus_logits_indices = cu_num_sampled_tokens - 1
        # Compute the draft logits indices.
        # [3, 3, 5, 5, 6]
        cu_num_draft_tokens = np.cumsum(num_draft_tokens, dtype=np.int32)
        total_num_draft_tokens = cu_num_draft_tokens[-1]
        # [0, 0, 0, 3, 3, 5]
        cumsums_offsets = np.repeat(cu_num_draft_tokens - num_draft_tokens,
                                    num_draft_tokens)
        # [0, 1, 2, 0, 1, 0]
        arange = self.arange_np[:total_num_draft_tokens] - cumsums_offsets
        # [0, 0, 0, 5, 5, 9]
        target_logits_indices = np.repeat(
            cu_num_sampled_tokens - num_sampled_tokens, num_draft_tokens)
        # [0, 1, 2, 5, 6, 9]
        target_logits_indices += arange

        # TODO: Optimize the CPU -> NPU copy.
        cu_num_draft_tokens = (
            torch.from_numpy(cu_num_draft_tokens).pin_memory().to(
                self.device, non_blocking=True))
        cu_num_sampled_tokens = (
            torch.from_numpy(cu_num_sampled_tokens).pin_memory().to(
                self.device, non_blocking=True))
        logits_indices = (torch.from_numpy(logits_indices).pin_memory().to(
            self.device, non_blocking=True))
        target_logits_indices = (
            torch.from_numpy(target_logits_indices).pin_memory().to(
                self.device, non_blocking=True))
        bonus_logits_indices = torch.from_numpy(
            bonus_logits_indices).pin_memory().to(self.device,
                                                  non_blocking=True)
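
        # The draft token ids themselves are recovered from input_ids: each
        # draft token is the input token one position after its verification
        # position, hence the `target_logits_indices + 1` gather below.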
        # Compute the draft token ids.
        # draft_token_indices:      [  1,   2,   3, 105, 106, 208]
        draft_token_ids = self.input_ids.gpu[logits_indices]
        draft_token_ids = draft_token_ids[target_logits_indices + 1]

        if self.pcp_size > 1:
            logits_indices = logits_indices_pcp

        return SpecDecodeMetadata(
            draft_token_ids=draft_token_ids,
            num_draft_tokens=num_draft_tokens.tolist(),
            cu_num_draft_tokens=cu_num_draft_tokens,
            cu_num_sampled_tokens=cu_num_sampled_tokens,
            target_logits_indices=target_logits_indices,
            bonus_logits_indices=bonus_logits_indices,
            logits_indices=logits_indices,
        )

    # TODO: Once the PCP features are complete, this will fully inherit the
    # corresponding classes from the vLLM community.
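    # Dispatch to the configured drafter: the "suffix"/"ngram" proposers, the
    # Medusa proposer (which consumes the sampled hidden states), and
    # EAGLE-style proposers (which run a draft model over the target model's
    # hidden states) each take a slightly different set of inputs.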
    def propose_draft_token_ids(
        self,
        valid_sampled_token_ids: torch.Tensor | list[list[int]],
        sampling_metadata: SamplingMetadata,
        scheduler_output: "SchedulerOutput",
        spec_decode_metadata: SpecDecodeMetadata,
        spec_decode_common_attn_metadata: AscendCommonAttentionMetadata,
        positions: torch.Tensor,
        num_scheduled_tokens: int,
        hidden_states: torch.Tensor,
        attn_metadata: list[dict[str, Any]] | dict[str, Any],
        aux_hidden_states: torch.Tensor = None,
        sample_hidden_states: torch.Tensor = None
    ) -> Optional[list[list[int]]]:
        if not self.drafter:
            # Speculative decoding is not enabled.
            draft_token_ids = None
        else:
            if self.speculative_config.method in ("suffix", "ngram"):
                draft_token_ids = self.drafter.generate_token_ids(
                    valid_sampled_token_ids, sampling_metadata,
                    scheduler_output, spec_decode_metadata, positions,
                    num_scheduled_tokens, hidden_states, aux_hidden_states)
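            # Medusa branch: the MedusaProposer drafts tokens directly from the
            # target model's sampled hidden states (sample_hidden_states), so it
            # only needs the sampled token ids, sampling metadata and spec-decode
            # metadata rather than the full attention metadata used by the EAGLE
            # path below.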
            elif isinstance(self.drafter, MedusaProposer):
                draft_token_ids = self.drafter.generate_token_ids(
                    valid_sampled_token_ids, sampling_metadata,
                    spec_decode_metadata, sample_hidden_states)
            elif self.speculative_config.use_eagle():
                common_attn_metadata = spec_decode_common_attn_metadata
                sampled_token_ids = valid_sampled_token_ids
                if self.vllm_config.speculative_config.disable_padded_drafter_batch:
                    # When padded-batch is disabled, the sampled_token_ids should be
                    # the cpu-side list[list[int]] of valid sampled tokens for each
                    # request, with invalid requests having empty lists.
                    assert isinstance(sampled_token_ids, list), \
                        "sampled_token_ids should be a python list when " \
                        "padded-batch is disabled."
                    assert self.drafter is not None
                    next_token_ids = self.drafter.prepare_next_token_ids_cpu(
                        sampled_token_ids, self.requests, self.input_batch,
                        scheduler_output.num_scheduled_tokens)
                else:
                    # When using padded-batch, the sampled_token_ids should be
                    # the gpu tensor of sampled tokens for each request, of shape
                    # (num_reqs, num_spec_tokens + 1) with rejected tokens having
                    # value -1.
                    assert isinstance(sampled_token_ids, torch.Tensor), \
                        "sampled_token_ids should be a torch.Tensor when " \
                        "padded-batch is enabled."
                    assert self.drafter is not None
                    next_token_ids, valid_sampled_tokens_count = \
                        self.drafter.prepare_next_token_ids_padded(
                            common_attn_metadata,
                            sampled_token_ids,
                            self.requests,
                            self.input_batch,
                            self.discard_request_indices.gpu,
                            self.num_discarded_requests
                        )
                    self._copy_valid_sampled_token_count(
                        next_token_ids, valid_sampled_tokens_count)

                req_scheduled_tokens = scheduler_output.num_scheduled_tokens
                if self.pcp_size * self.dcp_size > 1:
                    long_seq_metadata = self.long_seq_metadata  # type: ignore
                    input_ids_pcp_full = self.pcp_manager.input_ids_pcp_full.gpu
                    query_start_loc_pcp_full = self.pcp_manager.query_start_loc_pcp_full.gpu
                    query_start_loc_pcp_full_cpu = self.pcp_manager.query_start_loc_pcp_full.cpu
                    num_reqs = self.input_batch.num_reqs
                    ori_query_lens = query_start_loc_pcp_full_cpu[1:num_reqs + 1] - \
                        query_start_loc_pcp_full_cpu[:num_reqs]
                    num_prefill_reqs = (ori_query_lens
                                        > self.decode_threshold).sum().item()
                    num_decode_reqs = num_reqs - num_prefill_reqs
                else:
                    long_seq_metadata = None  # type: ignore
                    num_prefill_reqs = 0
                    num_decode_reqs = 0

                if spec_decode_metadata is None:
                    # update pcp related params
                    if self.pcp_size > 1:
                        token_indices_to_sample = \
                            query_start_loc_pcp_full[1:num_reqs + 1] - 1
                        target_token_ids = input_ids_pcp_full[:num_scheduled_tokens]
                        target_positions = self._get_positions(num_scheduled_tokens)
                        target_hidden_states = hidden_states
                    else:
                        token_indices_to_sample = None
                        # input_ids can be None for multimodal models.
                        target_token_ids = self.input_ids.gpu[:num_scheduled_tokens]
                        target_positions = self._get_positions(num_scheduled_tokens)
                        if self.use_aux_hidden_state_outputs:
                            target_hidden_states = torch.cat(
                                [h[:num_scheduled_tokens] for h in aux_hidden_states],
                                dim=-1)
                        else:
                            target_hidden_states = hidden_states[:num_scheduled_tokens]
                else:
                    if self.pcp_size > 1:
                        assert common_attn_metadata is not None
                        common_attn_metadata.query_start_loc_cpu[:num_reqs + 1] = \
                            query_start_loc_pcp_full_cpu[:num_reqs + 1]
                        common_attn_metadata.query_start_loc[:num_reqs + 1] = \
                            query_start_loc_pcp_full[:num_reqs + 1]
                    if self.vllm_config.speculative_config.disable_padded_drafter_batch:
                        # NOTE: Currently, MTP-fullgraph is incompatible with pcp
                        token_indices_to_sample = None
                        assert self.drafter is not None
                        common_attn_metadata, token_indices = \
                            self.drafter.prepare_inputs(
                                common_attn_metadata,
                                sampled_token_ids,
                                spec_decode_metadata.num_draft_tokens)
                    else:
                        assert self.drafter is not None
                        common_attn_metadata, token_indices, \
                            token_indices_to_sample = \
                            self.drafter.prepare_inputs_padded(
                                common_attn_metadata,
                                spec_decode_metadata,
                                valid_sampled_tokens_count)
                    if self.pcp_size > 1:
                        target_token_ids = input_ids_pcp_full[token_indices]
                        target_positions = positions
                        target_hidden_states = hidden_states
                    else:
                        target_token_ids = self.input_ids.gpu[token_indices]
                        target_positions = self._get_positions(token_indices)
                        if self.use_aux_hidden_state_outputs:
                            target_hidden_states = torch.cat(
                                [h[token_indices] for h in aux_hidden_states],
                                dim=-1)
                        else:
                            target_hidden_states = hidden_states[token_indices]

                assert self.drafter is not None
                draft_token_ids = self.drafter._propose(
                    target_token_ids=target_token_ids,
                    target_positions=target_positions,
                    target_hidden_states=target_hidden_states,
                    next_token_ids=next_token_ids,
                    last_token_indices=token_indices_to_sample,
                    common_attn_metadata=common_attn_metadata,
                    sampling_metadata=sampling_metadata,
                    req_scheduled_tokens=req_scheduled_tokens,
                    long_seq_metadata=long_seq_metadata,
                    num_prefill_reqs=num_prefill_reqs,
                    num_decode_reqs=num_decode_reqs,
                    scheduler_output=scheduler_output,
                    num_scheduled_tokens=num_scheduled_tokens,
                )
            else:
                raise ValueError("Unknown speculative decoding method: "
                                 f"{self.speculative_config.method}")
        return draft_token_ids
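    # NOTE: With the async-sampling split, one step runs in two phases:
    # execute_model() prepares inputs, runs the forward pass, and stashes the
    # per-step tensors in self.execute_model_state before returning None;
    # sample_tokens() then consumes that state to apply grammar bitmasks,
    # sample, bookkeep, and propose draft tokens. When execute_model() returns
    # early (empty batch, encoder-only producer, non-final PP rank, pooling),
    # no state is stashed. A rough sketch of the expected call order from the
    # worker (illustrative only, `runner` is hypothetical):
    #
    #     out = runner.execute_model(scheduler_output)
    #     if out is None:
    #         out = runner.sample_tokens(grammar_output)
    #
    # Calling execute_model() again before sample_tokens() raises the
    # RuntimeError below.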
    @torch.inference_mode()
    def execute_model(
        self,
        scheduler_output: "SchedulerOutput",
        intermediate_tensors: Optional[IntermediateTensors] = None,
    ) -> ModelRunnerOutput | IntermediateTensors | None:
        if self.execute_model_state is not None:
            raise RuntimeError("State error: sample_tokens() must be called "
                               "after execute_model() returns None.")

        # self._draft_token_ids is None when `input_fits_in_drafter=False` and
        # no draft tokens were scheduled, so with async scheduling we need to
        # update the spec-decoding info in scheduler_output. Use deepcopy so
        # the modification does not affect the scheduler_output held by the
        # engine core process.
        # TODO(Ronald1995): deepcopy is expensive when there is a large
        # number of requests, optimize it later.
        if (
            self.use_async_scheduling
            and self.num_spec_tokens
            and self._draft_token_ids is None  # type: ignore[has-type]
        ):
            scheduler_output = deepcopy(scheduler_output)

        num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
        with ProfileExecuteDuration().capture_async("prepare input"):
            with self.synchronize_input_prep():
                # Update persistent batch states.
                self._update_states(scheduler_output)

                if has_ec_transfer() and get_ec_transfer().is_producer:
                    with self.maybe_get_ec_connector_output(
                            scheduler_output,
                            encoder_cache=self.encoder_cache,
                    ) as ec_connector_output:
                        self._execute_mm_encoder(scheduler_output)
                        return make_empty_encoder_model_runner_output(
                            scheduler_output)

                if not num_scheduled_tokens:
                    if (
                        self.parallel_config.distributed_executor_backend
                        == "external_launcher"
                        and self.parallel_config.data_parallel_size > 1
                    ):
                        # This is a corner case when both external launcher
                        # and DP are enabled: num_scheduled_tokens could be
                        # 0 while has_unfinished_requests in the outer loop
                        # returns True. Before returning early here we call
                        # dummy run to ensure coordinate_batch_across_dp
                        # is called into, to avoid out-of-sync issues.
                        self._dummy_run(1)
                    if not has_kv_transfer_group():
                        # Return empty ModelRunnerOutput if no work to do.
                        return EMPTY_MODEL_RUNNER_OUTPUT
                    return self.kv_connector_no_forward(
                        scheduler_output, self.vllm_config
                    )

                if self.cache_config.kv_sharing_fast_prefill:
                    assert not self.num_prompt_logprobs, (
                        "--kv-sharing-fast-prefill produces incorrect "
                        "logprobs for prompt tokens, please disable "
                        "it when the requests need prompt logprobs"
                    )

                num_reqs = self.input_batch.num_reqs
                req_ids = self.input_batch.req_ids
                tokens = [
                    scheduler_output.num_scheduled_tokens[i] for i in req_ids
                ]
                num_scheduled_tokens_np = np.array(tokens, dtype=np.int32)
                max_num_scheduled_tokens = int(num_scheduled_tokens_np.max())
                (
                    logits_indices,
                    spec_decode_metadata,
                ) = self._prepare_inputs(
                    scheduler_output,
                    num_scheduled_tokens_np,
                )
                num_tokens_unpadded = scheduler_output.total_num_scheduled_tokens
                if self.pcp_size > 1:
                    num_tokens_unpadded = self.pcp_manager.total_num_sampled_tokens_pcp

                cascade_attn_prefix_lens = None
                # Disable cascade attention when using microbatching (DBO)
                if self.cascade_attn_enabled and not self.parallel_config.enable_dbo:
                    # Pre-compute cascade attention prefix lengths
                    cascade_attn_prefix_lens = self._compute_cascade_attn_prefix_lens(
                        num_scheduled_tokens_np,
                        self.input_batch.num_computed_tokens_cpu[:num_reqs],
                        scheduler_output.num_common_prefix_blocks,
                    )

                (
                    cudagraph_mode,
                    batch_desc,
                    should_ubatch,
                    num_tokens_across_dp,
                    cudagraph_stats,
                ) = self._determine_batch_execution_and_padding(
                    num_tokens=num_tokens_unpadded,
                    num_reqs=num_reqs,
                    num_scheduled_tokens_np=num_scheduled_tokens_np,
                    max_num_scheduled_tokens=max_num_scheduled_tokens,
                    use_cascade_attn=cascade_attn_prefix_lens is not None,
                    num_encoder_reqs=len(scheduler_output.scheduled_encoder_inputs),
                )

                logger.debug(
                    "Running batch with cudagraph_mode: %s, batch_descriptor: %s, "
                    "should_ubatch: %s, num_tokens_across_dp: %s",
                    cudagraph_mode,
                    batch_desc,
                    should_ubatch,
                    num_tokens_across_dp,
                )
                num_tokens_padded = batch_desc.num_tokens
                num_reqs_padded = (
                    batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs
                )
                ubatch_slices, ubatch_slices_padded = maybe_create_ubatch_slices(
                    should_ubatch,
                    num_scheduled_tokens_np,
                    num_tokens_padded,
                    num_reqs_padded,
                    self.parallel_config.num_ubatches,
                )
                pad_attn = cudagraph_mode == CUDAGraphMode.FULL
                use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0
                ubatch_slices_attn = ubatch_slices_padded if pad_attn else ubatch_slices

                if cudagraph_mode != CUDAGraphMode.NONE:
                    num_reqs_padded = self._pad_query_start_loc_for_fia(
                        num_tokens_padded, num_reqs_padded, num_reqs
                    )

                (attn_metadata, spec_decode_common_attn_metadata) = (
                    self._build_attention_metadata(
                        num_tokens=num_tokens_unpadded,
                        num_tokens_padded=num_tokens_padded,
                        num_reqs=num_reqs,
                        num_reqs_padded=num_reqs_padded,
                        max_query_len=max_num_scheduled_tokens,
                        ubatch_slices=ubatch_slices_attn,
                        logits_indices=logits_indices,
                        use_spec_decode=use_spec_decode,
                        num_scheduled_tokens=scheduler_output.num_scheduled_tokens,
                        num_scheduled_tokens_np=num_scheduled_tokens_np,
                        cascade_attn_prefix_lens=cascade_attn_prefix_lens,
                    )
                )
            (
                input_ids,
                inputs_embeds,
                positions,
                intermediate_tensors,
                model_kwargs,
                ec_connector_output,
            ) = self._preprocess(
                scheduler_output, num_tokens_padded, intermediate_tensors
            )

            # Update the global cos/sin cache.
            update_cos_sin(positions)

            # Set cudagraph mode to NONE if calculate_kv_scales is true.
            # KV scales calculation involves dynamic operations that are
            # incompatible with CUDA graph capture.
            if self.calculate_kv_scales:  # type: ignore[has-type]
                cudagraph_mode = CUDAGraphMode.NONE
                # Mark KV scales as calculated after the first forward pass
                self.calculate_kv_scales = False  # type: ignore[has-type]

            # Guard against the debugger being None.
            if self.debugger is not None:
                dbg_cfg = getattr(self.debugger, "config", None)
                dump_level = str(
                    getattr(dbg_cfg, "level",
                            "L1")).upper() if dbg_cfg is not None else "L1"
                if dump_level in ("L0", "MIX"):
                    self.debugger.start(model=self.model)
                else:
                    self.debugger.start()

            if self.ascend_config.enable_async_exponential:
                self.sampler.do_async_exponential(
                    b_s=logits_indices.shape[0],
                    head_dim=self.model_config.get_vocab_size(),
                    generators=self.input_batch.sampling_metadata.generators)

            # Encoder-decoder models can only compile the pure decode steps where no
            # encoder inputs are present. Use eager for the first pass.
            num_encoder_reqs = len(scheduler_output.scheduled_encoder_inputs)
            has_encoder_input = (
                self.model_config.is_encoder_decoder and num_encoder_reqs > 0
            )
        # Run forward pass
        with ProfileExecuteDuration().capture_async("forward"):
            with (
                set_ascend_forward_context(
                    attn_metadata,
                    self.vllm_config,
                    num_tokens=num_tokens_padded,
                    num_tokens_across_dp=num_tokens_across_dp,
                    aclgraph_runtime_mode=cudagraph_mode,
                    batch_descriptor=batch_desc,
                    num_actual_tokens=scheduler_output.total_num_scheduled_tokens,
                    model_instance=self.model,
                    skip_compiled=has_encoder_input),
                self.maybe_get_kv_connector_output(
                    scheduler_output) as kv_connector_output,
            ):
                hidden_states = self._model_forward(
                    num_tokens_padded, input_ids, positions,
                    intermediate_tensors, inputs_embeds, **model_kwargs)

        with ProfileExecuteDuration().capture_async("post process"):
            if self.pcp_size > 1:
                # NOTE we must `slice` hidden_states because pcp_allgather_restore_idx
                # ignores the padding from CUDA Graph.
                hidden_states = self.pcp_manager.get_restore_hidden_states(
                    hidden_states
                )
            aux_hidden_states = None
            if self.use_aux_hidden_state_outputs:
                hidden_states, aux_hidden_states = hidden_states

            if not self.broadcast_pp_output:
                # Common case.
                if not get_pp_group().is_last_rank:
                    # Return the intermediate tensors.
                    assert isinstance(hidden_states, IntermediateTensors)
                    hidden_states.kv_connector_output = kv_connector_output
                    self.kv_connector_output = kv_connector_output
                    if self.debugger is not None:
                        self.debugger.stop()
                        self.debugger.step()
                    return hidden_states

                if self.is_pooling_model:
                    # Return the pooling output.
                    output = self._pool(
                        hidden_states, num_scheduled_tokens,
                        num_scheduled_tokens_np, kv_connector_output
                    )
                    output.kv_connector_output = kv_connector_output
                    if self.debugger is not None:
                        self.debugger.stop()
                        self.debugger.step()
                    return output

                sample_hidden_states = hidden_states[logits_indices]
                logits = self.model.compute_logits(sample_hidden_states)
            else:
                # Rare case.
                assert not self.is_pooling_model
                if not get_pp_group().is_last_rank:
                    sample_hidden_states = hidden_states[logits_indices]
                    get_pp_group().send_tensor_dict(
                        hidden_states.tensors, all_gather_group=get_tp_group())
                    logits = None
                else:
                    sample_hidden_states = hidden_states[logits_indices]
                    logits = self.model.compute_logits(sample_hidden_states)
                model_output_broadcast_data: dict[str, Any] = {}
                if logits is not None:
                    model_output_broadcast_data["logits"] = logits.contiguous()
                broadcasted = get_pp_group().broadcast_tensor_dict(
                    model_output_broadcast_data, src=len(get_pp_group().ranks) - 1
                )
                assert broadcasted is not None
                logits = broadcasted["logits"]
        # Stash ephemeral state for sample_tokens(); structured output
        # bitmasks (if any) are applied there before sampling.
        self.execute_model_state = ExecuteModelState(
            scheduler_output,
            logits,
            spec_decode_metadata,
            spec_decode_common_attn_metadata,
            hidden_states,
            sample_hidden_states,
            aux_hidden_states,
            attn_metadata,
            positions,
            ec_connector_output,
            cudagraph_stats,
        )
        self.kv_connector_output = kv_connector_output
        return None
    @torch.inference_mode
    def sample_tokens(
        self, grammar_output: "GrammarOutput | None"
    ) -> ModelRunnerOutput | AsyncModelRunnerOutput | IntermediateTensors:
        kv_connector_output = self.kv_connector_output
        self.kv_connector_output = None
        if self.execute_model_state is None:
            # Nothing to do (PP non-final rank case), output isn't used.
            if not kv_connector_output:
                return None  # noqa
            # In case of PP with kv transfer, we need to pass through the
            # kv_connector_output
            if kv_connector_output.is_empty():
                return EMPTY_MODEL_RUNNER_OUTPUT
            output = copy(EMPTY_MODEL_RUNNER_OUTPUT)
            output.kv_connector_output = kv_connector_output
            return output

        # Unpack ephemeral state.
        (
            scheduler_output,
            logits,
            spec_decode_metadata,
            spec_decode_common_attn_metadata,
            hidden_states,
            sample_hidden_states,
            aux_hidden_states,
            attn_metadata,
            positions,
            ec_connector_output,
            cudagraph_stats,
        ) = self.execute_model_state
        # Clear ephemeral state.
        self.execute_model_state = None

        # Apply structured output bitmasks if present.
        if grammar_output is not None:
            # Unlike gpu_model_runner, which optimizes apply_grammar_bitmask
            # with torch.compile, Ascend does not support that yet, so the
            # bitmask is applied on the CPU in float32.
            logits_dtype = logits.dtype
            logits = logits.to("cpu").float()
            apply_grammar_bitmask(scheduler_output, grammar_output,
                                  self.input_batch, logits)
            logits = logits.to(self.device).to(logits_dtype)
        with ProfileExecuteDuration().capture_async("Sample"):
            sampler_output = self._sample(logits, spec_decode_metadata)

        def propose_draft_token_ids(sampled_token_ids):
            assert spec_decode_common_attn_metadata is not None
            self._draft_token_ids = self.propose_draft_token_ids(
                sampled_token_ids,
                self.input_batch.sampling_metadata,
                scheduler_output,
                spec_decode_metadata,
                spec_decode_common_attn_metadata,
                positions,
                scheduler_output.total_num_scheduled_tokens,
                hidden_states,
                attn_metadata,
                aux_hidden_states,
                sample_hidden_states
            )
            self._copy_draft_token_ids_to_cpu(scheduler_output)

        (
            logprobs_lists,
            valid_sampled_token_ids,
            prompt_logprobs_dict,
            req_ids_output_copy,
            req_id_to_index_output_copy,
            invalid_req_indices,
        ) = self._bookkeeping_sync(
            scheduler_output,
            sampler_output,
            logits,
            hidden_states,
            scheduler_output.total_num_scheduled_tokens,
            spec_decode_metadata,
        )

        with ProfileExecuteDuration().capture_async("Draft"):
            if self.speculative_config:
                use_padded_batch_for_eagle = self.speculative_config and \
                    self.speculative_config.use_eagle() and \
                    not self.speculative_config.disable_padded_drafter_batch
                if use_padded_batch_for_eagle:
                    # EAGLE speculative decoding can use the GPU sampled tokens
                    # as inputs, and does not need to wait for bookkeeping to finish.
                    propose_draft_token_ids(sampler_output.sampled_token_ids)
                if self.speculative_config and not use_padded_batch_for_eagle:
                    # ngram and other speculative decoding methods use the sampled
                    # tokens on the CPU, so they are run after bookkeeping.
                    propose_draft_token_ids(valid_sampled_token_ids)
        if has_kv_transfer_group():
            get_kv_transfer_group().clear_connector_metadata()

        model_runner_output = ModelRunnerOutput(
            req_ids=req_ids_output_copy,
            req_id_to_index=req_id_to_index_output_copy,
            sampled_token_ids=valid_sampled_token_ids,
            logprobs=logprobs_lists,
            prompt_logprobs_dict=prompt_logprobs_dict,
            kv_connector_output=kv_connector_output,
            pooler_output=[],
            ec_connector_output=ec_connector_output
            if self.supports_mm_inputs
            else None,
            cudagraph_stats=cudagraph_stats,
        )
        durations = ProfileExecuteDuration().pop_captured_sync()
        if durations:
            dr_str = [
                f"[{tag}]:{duration:.2f}ms"
                for tag, duration in durations.items()
            ]
            captured_name = "Decode" if self.attn_state == AscendAttentionState.DecodeOnly else "Prefill"
            logger.info("Profile execute duration [%s]:%s", captured_name,
                        " ".join(dr_str))

        if self.dynamic_eplb:
            self.eplb_updator.forward_end()

        if self.debugger is not None:
            self.debugger.stop()
            self.debugger.step()

        if not self.use_async_scheduling:
            return model_runner_output

        return AsyncGPUModelRunnerOutput(
            model_runner_output=model_runner_output,
            sampled_token_ids=sampler_output.sampled_token_ids,
            logprobs_tensors=sampler_output.logprobs_tensors,
            invalid_req_indices=invalid_req_indices,
            async_output_copy_stream=self.async_output_copy_stream,
            vocab_size=self.input_batch.vocab_size,
        )
    # Override _sample for lmhead_tp_enable and need_accepted_tokens.
    def _sample(self, logits, spec_decode_metadata):
        # Sample the next token and get logprobs if needed.
        sampling_metadata = self.input_batch.sampling_metadata
        if spec_decode_metadata is None:
            if lmhead_tp_enable() and logits is not None:
                logits = logits[:self.input_batch.num_reqs]
            return self.sampler(
                logits=logits,
                sampling_metadata=sampling_metadata,
            )

        if lmhead_tp_enable() and logits is not None:
            logits = logits[:len(spec_decode_metadata.logits_indices)]
        sampler_output = self.rejection_sampler(
            spec_decode_metadata,
            None,  # draft_probs
            logits,
            sampling_metadata,
        )
        if self.need_accepted_tokens:  # TODO remove this if
            self._update_states_after_model_execute(
                sampler_output.sampled_token_ids)
        return sampler_output
    # TODO: remove this func after eagle_proposer is refactored and
    # _bookkeeping_sync is moved after propose_draft_token_ids
    def _bookkeeping_sync(
        self,
        scheduler_output: "SchedulerOutput",
        sampler_output: SamplerOutput,
        logits: torch.Tensor | None,
        hidden_states: torch.Tensor,
        num_scheduled_tokens: int,
        spec_decode_metadata: SpecDecodeMetadata | None,
    ) -> tuple[
        LogprobsLists | None,
        list[list[int]],
        dict[str, LogprobsTensors | None],
        list[str],
        dict[str, int],
        list[int],
    ]:
        # TODO: implement PR 28597 from vllm
        discard_sampled_tokens_req_indices = \
            self.discard_request_indices.np[:self.num_discarded_requests]
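        # Roll back the per-request RNG state for requests whose sampled
        # output is being discarded, so the discarded draw does not shift
        # subsequent sampling. The fixed offset step of 4 is inherited from
        # the upstream GPU model runner (assumed to match how far a single
        # sampling draw advances the generator offset).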
        for i in discard_sampled_tokens_req_indices:
            gen = self.input_batch.generators.get(int(i))
            if gen is not None:
                gen.set_offset(gen.get_offset() - 4)

        # Copy some objects so they don't get modified after returning.
        # This is important when using async scheduling.
        req_ids_output_copy = self.input_batch.req_ids.copy()
        req_id_to_index_output_copy = self.input_batch.req_id_to_index.copy()

        num_sampled_tokens = sampler_output.sampled_token_ids.shape[0]
        sampled_token_ids = sampler_output.sampled_token_ids
        logprobs_tensors = sampler_output.logprobs_tensors
        invalid_req_indices = []
        cu_num_tokens: list[int] | None = None
        if not self.use_async_scheduling:
            # Get the valid generated tokens.
            max_gen_len = sampled_token_ids.shape[-1]
            if max_gen_len == 1:
                # No spec decode tokens.
                valid_sampled_token_ids = self._to_list(sampled_token_ids)
                # Mask out the sampled tokens that should not be sampled.
                for i in discard_sampled_tokens_req_indices:
                    valid_sampled_token_ids[int(i)].clear()
            else:
                # Includes spec decode tokens.
                valid_sampled_token_ids, cu_num_tokens = RejectionSampler.parse_output(
                    sampled_token_ids,
                    self.input_batch.vocab_size,
                    discard_sampled_tokens_req_indices,
                    logprobs_tensors=logprobs_tensors,
                )
        else:
            valid_sampled_token_ids = []
            invalid_req_indices = discard_sampled_tokens_req_indices.tolist()
            invalid_req_indices_set = set(invalid_req_indices)
            if self.num_spec_tokens <= 0:
                assert sampled_token_ids.shape[-1] == 1

            # Cache the sampled tokens on the NPU and avoid CPU sync.
            # These will be copied into input_ids in the next step
            # when preparing inputs.
            self.input_batch.prev_sampled_token_ids = sampled_token_ids
            self.input_batch.prev_req_id_to_index = {
                req_id: i
                for i, req_id in enumerate(self.input_batch.req_ids)
                if i not in invalid_req_indices_set
            }

        # Cache the sampled tokens in the model runner, so that the scheduler
        # doesn't need to send them back.
        # NOTE(woosuk): As an exception, when using PP, the scheduler sends
        # the sampled tokens back, because there's no direct communication
        # between the first-stage worker and the last-stage worker.
        req_ids = self.input_batch.req_ids
        for req_idx in range(num_sampled_tokens):
            if self.use_async_scheduling:
                sampled_ids = ([-1] if req_idx not in invalid_req_indices_set
                               else None)
            else:
                sampled_ids = valid_sampled_token_ids[req_idx]
            num_sampled_ids: int = len(sampled_ids) if sampled_ids else 0
            if not sampled_ids:
                continue

            start_idx = self.input_batch.num_tokens_no_spec[req_idx]
            end_idx = start_idx + num_sampled_ids
            assert end_idx <= self.max_model_len, (
                "Sampled token IDs exceed the max model length. "
                f"Total number of tokens: {end_idx} > max_model_len: "
                f"{self.max_model_len}")

            self.input_batch.token_ids_cpu[req_idx,
                                           start_idx:end_idx] = sampled_ids
            self.input_batch.is_token_ids[req_idx, start_idx:end_idx] = True
            self.input_batch.num_tokens_no_spec[req_idx] = end_idx
            self.input_batch.num_tokens[req_idx] = end_idx
            req_id = req_ids[req_idx]
            req_state = self.requests[req_id]
            req_state.output_token_ids.extend(sampled_ids)

        logprobs_lists = (logprobs_tensors.tolists(cu_num_tokens)
                          if not self.use_async_scheduling
                          and logprobs_tensors is not None else None)

        # Compute prompt logprobs if needed.
        prompt_logprobs_dict = self._get_prompt_logprobs_dict(
            hidden_states[:num_scheduled_tokens],
            scheduler_output.num_scheduled_tokens,
        )

        return (
            logprobs_lists,
            valid_sampled_token_ids,
            prompt_logprobs_dict,
            req_ids_output_copy,
            req_id_to_index_output_copy,
            invalid_req_indices,
        )
    # All-gather a single hidden-states tensor in the sequence-parallel (SP) case.
    @staticmethod
    def _all_gather_hidden_states(hidden_states):
        hidden_states = tensor_model_parallel_all_gather(hidden_states, 0)
        pad_size = get_forward_context().pad_size
        if pad_size > 0:
            hidden_states = hidden_states[:-pad_size, :]
        return hidden_states

    # All-gather a list of hidden-states tensors in the SP case.
    @staticmethod
    def _all_gather_hidden_states_list(hidden_states_list):
        return [
            NPUModelRunner._all_gather_hidden_states(hidden_states)
            for hidden_states in hidden_states_list
        ]

    # All-gather the last-layer hidden states together with the aux hidden
    # states in the SP case.
    @staticmethod
    def _all_gather_hidden_states_and_aux(hidden_states):
        if isinstance(hidden_states, tuple):
            return (NPUModelRunner._all_gather_hidden_states(hidden_states[0]),
                    NPUModelRunner._all_gather_hidden_states_list(
                        hidden_states[1]))
        return NPUModelRunner._all_gather_hidden_states(hidden_states)
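    # Illustrative shapes for the SP all-gather helpers above (a sketch,
    # assuming pad_size counts the rows added to round the token count up to a
    # multiple of tensor_parallel_size, as in _pad_for_sequence_parallelism
    # below): with tp_size=4 and 10 real tokens, each rank holds a [3, H] shard
    # of the padded [12, H] activation; all-gather along dim 0 yields [12, H],
    # and dropping the trailing pad_size=2 rows restores the [10, H] hidden
    # states.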
    def _model_forward(
        self,
        num_tokens_padded: int,
        input_ids: torch.Tensor | None = None,
        positions: torch.Tensor | None = None,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,
        **model_kwargs: dict[str, Any],
    ):
        assert self.model is not None
        hidden_states = self.model(
            input_ids=input_ids,
            positions=positions,
            intermediate_tensors=intermediate_tensors,
            inputs_embeds=inputs_embeds,
            **model_kwargs)

        forward_context = get_forward_context()
        assert forward_context is not None
        if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL and \
                not forward_context.capturing and not self.use_sparse:
            assert positions is not None
            update_full_graph_params(self.attn_backend, self.update_stream,
                                     forward_context, num_tokens_padded,
                                     self.vllm_config, self.speculative_config,
                                     positions.shape[0])

        if get_forward_context().sp_enabled and not isinstance(
                hidden_states, IntermediateTensors):
            hidden_states = self._all_gather_hidden_states_and_aux(
                hidden_states)
        return hidden_states

    def _pad_for_sequence_parallelism(self, num_scheduled_tokens: int) -> int:
        # Pad tokens to a multiple of tensor_parallel_size when collective
        # fusion for SP is enabled.
        tp_size = self.vllm_config.parallel_config.tensor_parallel_size
        if enable_sp():
            return round_up(num_scheduled_tokens, tp_size)
        return num_scheduled_tokens
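    # For example (a sketch, assuming enable_sp() is True and
    # tensor_parallel_size=8): 50 scheduled tokens are padded to
    # round_up(50, 8) == 56 so the token dimension splits evenly into
    # 8 slices of 7 tokens; with SP disabled the count is returned unchanged.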
    def _sync_batch_across_dp(
        self,
        num_tokens_padded: int | None = None,
        cudagraph_mode: int = 0,
    ) -> tuple[bool, torch.Tensor | None, int]:
        """Coordinate across all DP ranks on batch padding and cudagraph mode.

        Args:
            num_tokens_padded: Number of tokens including any non-DP padding
                (CUDA graphs, TP, etc.)
            cudagraph_mode: The cudagraph mode for this rank
                (0=NONE, 1=PIECEWISE, 2=FULL)
        Returns: tuple[
            should_ubatch: whether all DP ranks have agreed to microbatch
                (always False on this path),
            num_tokens_after_padding: a tensor containing the number of tokens
                per DP rank including padding, padded up to the max value
                across all DP ranks,
            synced_cudagraph_mode: the synchronized cudagraph mode
                (min across ranks)
        ]
        """
        # TODO: In vLLM, the only thing that needs to be synced is num_tokens, but in
        # our case, we still need to sync the other two flags as well. So we need to
        # include them in the all_reduce operation, and moreover, we CANNOT skip it
        # even if we are running in eager mode, which harms performance.
        # FIXME: Restore the `or self.vllm_config.model_config.enforce_eager` here
        # immediately once the other two flags are no longer needed.
        if self.dp_size == 1:
            return False, None, cudagraph_mode
        if self._skip_all_reduce_across_dp_group():
            num_tokens_after_padding = torch.tensor([num_tokens_padded] *
                                                    self.dp_size,
                                                    device="cpu",
                                                    dtype=torch.int32)
            return False, num_tokens_after_padding, cudagraph_mode

        tensor = torch.zeros(2, self.dp_size, device="cpu", dtype=torch.int32)
        tensor[0][self.dp_rank] = num_tokens_padded
        tensor[1][self.dp_rank] = cudagraph_mode
        dist.all_reduce(tensor, group=get_dp_group().cpu_group)

        num_tokens_across_dp = tensor[0, :]
        max_num_tokens = int(num_tokens_across_dp.max().item())
        num_tokens_after_padding = torch.tensor(
            [max_num_tokens] * len(num_tokens_across_dp),
            device="cpu",
            dtype=torch.int32,
        )
        # Synchronize cudagraph_mode across ranks (take min)
        synced_cudagraph_mode = _post_process_cudagraph_mode(tensor)
        return False, num_tokens_after_padding, synced_cudagraph_mode
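    # A worked example of the sync above (a sketch, assuming dp_size=2 and the
    # numeric modes NONE=0 < PIECEWISE=1 < FULL=2 from the docstring): rank 0
    # contributes (num_tokens_padded=96, mode=FULL) and rank 1 contributes
    # (8, PIECEWISE). After the all_reduce the shared tensor is
    # [[96, 8], [2, 1]], so both ranks pad to 96 tokens and fall back to the
    # smaller graph mode returned by _post_process_cudagraph_mode (the min
    # across ranks, i.e. PIECEWISE).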
    def _determine_batch_execution_and_padding(
        self,
        num_tokens: int,
        num_reqs: int,
        num_scheduled_tokens_np: np.ndarray,
        max_num_scheduled_tokens: int,
        use_cascade_attn: bool,
        allow_microbatching: bool = False,
        force_eager: bool = False,
        # For cudagraph capture TODO(lucas): Refactor how we capture cudagraphs
        # (will be improved in model runner v2)
        force_uniform_decode: bool | None = None,
        force_has_lora: bool | None = None,
        num_encoder_reqs: int = 0,
    ) -> tuple[CUDAGraphMode, BatchDescriptor, bool,
               torch.Tensor | None, CUDAGraphStat | None]:
        num_tokens_padded = self._pad_for_sequence_parallelism(num_tokens)
        uniform_decode = (
            ((max_num_scheduled_tokens == self.uniform_decode_query_len) and
             (num_tokens == max_num_scheduled_tokens * num_reqs))
            if force_uniform_decode is None else force_uniform_decode)
        # Encoder-decoder models only support CG for decoder_step > 0 (no enc_output
        # is present). Also, chunked-prefill is disabled, so batches are uniform.
        has_encoder_output = (self.model_config.is_encoder_decoder
                              and num_encoder_reqs > 0)
        has_lora = (len(self.input_batch.lora_id_to_lora_request) > 0
                    if force_has_lora is None else force_has_lora)

        # ruff: noqa: E731
        dispatch_cudagraph = (
            lambda num_tokens, disable_full: self.cudagraph_dispatcher.dispatch(
                num_tokens=num_tokens,
                has_lora=has_lora,
                uniform_decode=uniform_decode,
                disable_full=disable_full,
            ) if not force_eager else
            (CUDAGraphMode.NONE, BatchDescriptor(num_tokens_padded)))
        cudagraph_mode, batch_descriptor = dispatch_cudagraph(
            num_tokens_padded, use_cascade_attn or has_encoder_output)
        num_tokens_padded = batch_descriptor.num_tokens
        if enable_sp(self.vllm_config):
            assert (batch_descriptor.num_tokens %
                    self.vllm_config.parallel_config.tensor_parallel_size == 0
                    ), ("Sequence parallelism requires num_tokens to be "
                        "a multiple of tensor parallel size")

        # Extra coordination when running data-parallel since we need to
        # coordinate across ranks
        should_ubatch, num_tokens_across_dp = False, None
        if self.vllm_config.parallel_config.data_parallel_size > 1:
            _, num_tokens_across_dp, synced_cudagraph_mode = \
                self._sync_batch_across_dp(
                    num_tokens_padded=num_tokens_padded,
                    cudagraph_mode=cudagraph_mode.value,
                )
            # Extract DP padding if there is any
            if num_tokens_across_dp is not None:
                dp_rank = self.parallel_config.data_parallel_rank
                num_tokens_padded = int(num_tokens_across_dp[dp_rank].item())
            # Re-dispatch with DP padding
            cudagraph_mode, batch_descriptor = dispatch_cudagraph(
                num_tokens_padded,
                disable_full=synced_cudagraph_mode <= CUDAGraphMode.PIECEWISE.value,
            )
            # Assert to make sure the agreed-upon token count is correct, otherwise
            # num_tokens_across_dp will no longer be valid
            assert batch_descriptor.num_tokens == num_tokens_padded

        cudagraph_stats = None
        if self.vllm_config.observability_config.cudagraph_metrics:
            cudagraph_stats = CUDAGraphStat(
                num_unpadded_tokens=num_tokens,
                num_padded_tokens=batch_descriptor.num_tokens,
                num_paddings=batch_descriptor.num_tokens - num_tokens,
                runtime_mode=str(cudagraph_mode),
            )
        return (
            cudagraph_mode,
            batch_descriptor,
            should_ubatch,
            num_tokens_across_dp,
            cudagraph_stats,
        )
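    # Example of the uniform-decode check above (a sketch): with
    # uniform_decode_query_len == 1 and 8 running requests each scheduled for
    # exactly 1 token, max_num_scheduled_tokens == 1 and
    # num_tokens == 8 == 1 * 8, so uniform_decode is True and the dispatcher
    # may select a decode-only full graph; a single chunked prefill in the
    # batch breaks the equality and forces the non-uniform path.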
def _build_attention_metadata (
self ,
num_tokens : int ,
num_reqs : int ,
max_query_len : int ,
num_tokens_padded : int | None = None ,
num_reqs_padded : int | None = None ,
ubatch_slices : UBatchSlices | None = None ,
logits_indices : torch . Tensor | None = None ,
use_spec_decode : bool = False ,
for_cudagraph_capture : bool = False ,
num_scheduled_tokens : dict [ str , int ] | None = None ,
num_scheduled_tokens_np : np . ndarray | None = None ,
cascade_attn_prefix_lens : list [ list [ int ] ] | None = None ,
) - > tuple [ PerLayerAttnMetadata , CommonAttentionMetadata | None ] :
"""
: return : tuple [ attn_metadata , spec_decode_common_attn_metadata ]
"""
# Attention metadata is not needed for attention free models
if len ( self . kv_cache_config . kv_cache_groups ) == 0 :
return { } , None
num_tokens_padded = num_tokens_padded or num_tokens
num_reqs_padded = num_reqs_padded or num_reqs
attn_metadata : PerLayerAttnMetadata = { }
if ubatch_slices is not None :
attn_metadata = [ dict ( ) for _ in range ( len ( ubatch_slices ) ) ]
if for_cudagraph_capture :
# For some attention backends (e.g. FA) with sliding window models we need
# to make sure the backend see a max_seq_len that is larger to the sliding
# window size when capturing to make sure the correct kernel is selected.
max_seq_len = self . max_model_len
2025-08-11 18:03:19 +08:00
else :
2026-01-27 22:27:01 +08:00
max_seq_len = self . seq_lens . np [ : num_reqs ] . max ( ) . item ( )
if use_spec_decode and self . need_accepted_tokens :
self . num_accepted_tokens . np [ : num_reqs ] = (
self . input_batch . num_accepted_tokens_cpu [ : num_reqs ] )
self . num_accepted_tokens . np [ num_reqs : ] . fill ( 1 )
self . num_accepted_tokens . copy_to_gpu ( )
kv_cache_groups = self . kv_cache_config . kv_cache_groups
def _get_pcp_metadata ( num_tokens ) :
if not self . use_cp :
return None
return self . pcp_manager . generate_pcp_metadata ( num_tokens , self . query_lens , self . input_batch , num_scheduled_tokens_np )
def _get_block_table_and_slot_mapping ( kv_cache_gid : int ) :
assert num_reqs_padded is not None and num_tokens_padded is not None
kv_cache_spec = kv_cache_groups [ kv_cache_gid ] . kv_cache_spec
maybe_pcp_full_tokens = (
num_tokens_padded if self . pcp_size == 1 else
num_tokens * self . pcp_size -
sum ( self . pcp_manager . num_pcp_pads_cpu [ : num_reqs ] ) )
if isinstance ( kv_cache_spec , EncoderOnlyAttentionSpec ) :
blk_table_tensor = torch . zeros (
( num_reqs_padded , 1 ) ,
dtype = torch . int32 ,
device = self . device ,
)
slot_mapping = torch . zeros (
( num_tokens_padded , ) ,
dtype = torch . int64 ,
device = self . device ,
)
else :
blk_table = self . input_batch . block_table [ kv_cache_gid ]
slot_mapping = blk_table . slot_mapping . gpu [ : maybe_pcp_full_tokens ]
maybe_num_reqs_padded = num_reqs_padded * self . decode_token_per_req if self . use_cp else num_reqs_padded
blk_table_tensor = blk_table . get_device_tensor ( ) [ : maybe_num_reqs_padded ]
                # Fill the unused slot_mapping entries with -1 (matching mamba's
                # PAD_SLOT_ID); this is needed by reshape_and_cache in full graph
                # mode. Pad the unused block-table rows with 0.
if self . pcp_size == 1 :
slot_mapping [ num_tokens : num_tokens_padded ] . fill_ ( - 1 )
blk_table_tensor [ num_reqs : num_reqs_padded ] . fill_ ( 0 )
if self . pcp_size > 1 :
slot_mapping = self . pcp_manager . get_padded_slot_mapping (
num_tokens ,
num_tokens_padded ,
slot_mapping ,
)
return blk_table_tensor , slot_mapping
        long_seq_metadata = _get_pcp_metadata(num_tokens)
block_table_gid_0 , slot_mapping_gid_0 = _get_block_table_and_slot_mapping ( 0 )
        actual_last_loc = self.query_start_loc.np[num_reqs_padded]
        error_msg = (
            f"Due to FIA kernel constraints, when the layout is TND, "
            f"the first dimension of `hidden_states` ({num_tokens_padded}) "
            f"must equal the last element of `actual_seq_lengths_q` ({actual_last_loc}).")
        assert self.query_start_loc.np[num_reqs_padded] == num_tokens_padded, error_msg
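        # Illustrative, commented-out sketch (hypothetical sizes, not part of the
        # runner): why the assert above holds after padding. The padded request
        # slots repeat the total padded token count at the tail of query_start_loc,
        # so its last element always equals the first dimension of `hidden_states`.
        #
        #     import numpy as np
        #     query_start_loc = np.array([0, 3, 7, 8])   # 3 real requests, 8 tokens
        #     num_tokens_padded, num_reqs_padded = 12, 5
        #     padded = np.concatenate([query_start_loc, [num_tokens_padded] * 2])
        #     assert padded[num_reqs_padded] == num_tokens_padded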
cm_base = AscendCommonAttentionMetadata (
query_start_loc = self . query_start_loc . gpu [ : num_reqs_padded + 1 ] ,
query_start_loc_cpu = self . query_start_loc . cpu [ : num_reqs_padded + 1 ] ,
seq_lens = self . seq_lens . gpu [ : num_reqs_padded ] ,
# TODO
seq_lens_cpu = self . seq_lens . cpu [ : num_reqs_padded ] ,
# TODO
num_computed_tokens_cpu = self . input_batch . num_computed_tokens_cpu_tensor [
: num_reqs_padded
] ,
num_reqs = num_reqs_padded ,
num_actual_tokens = num_tokens ,
max_query_len = max_query_len ,
max_seq_len = max_seq_len ,
block_table_tensor = block_table_gid_0 ,
slot_mapping = slot_mapping_gid_0 ,
causal = True ,
num_input_tokens = num_tokens_padded ,
actual_seq_lengths_q = self . actual_seq_lengths_q ,
positions = self . positions . gpu ,
attn_state = self . attn_state ,
decode_token_per_req = self . decode_token_per_req ,
            prefill_context_parallel_metadata=long_seq_metadata,
)
if logits_indices is not None and self . cache_config . kv_sharing_fast_prefill :
cm_base . num_logits_indices = logits_indices . size ( 0 )
cm_base . logits_indices_padded = self . _prepare_kv_sharing_fast_prefill (
logits_indices
)
def _build_attn_group_metadata (
kv_cache_gid : int ,
attn_gid : int ,
common_attn_metadata : CommonAttentionMetadata ,
ubid : int | None = None ,
) - > None :
attn_group = self . attn_groups [ kv_cache_gid ] [ attn_gid ]
builder = attn_group . get_metadata_builder ( ubid or 0 )
cascade_attn_prefix_len = (
cascade_attn_prefix_lens [ kv_cache_gid ] [ attn_gid ]
if cascade_attn_prefix_lens
else 0
)
extra_attn_metadata_args = { }
if use_spec_decode and isinstance ( builder , GDNAttentionMetadataBuilder ) :
assert ubid is None , " UBatching not supported with GDN yet "
patch_torch_npu_argsort ( )
extra_attn_metadata_args = dict (
num_accepted_tokens = self . num_accepted_tokens . gpu [ : num_reqs_padded ] ,
num_decode_draft_tokens_cpu = self . num_decode_draft_tokens . cpu [
: num_reqs_padded
] ,
)
if for_cudagraph_capture :
attn_metadata_i = builder . build_for_cudagraph_capture (
common_attn_metadata
)
else :
attn_metadata_i = builder . build (
common_prefix_len = cascade_attn_prefix_len ,
common_attn_metadata = common_attn_metadata ,
* * extra_attn_metadata_args ,
)
if ubid is None :
assert isinstance ( attn_metadata , dict )
attn_metadata_dict = attn_metadata
else :
assert isinstance ( attn_metadata , list )
attn_metadata_dict = attn_metadata [ ubid ]
for layer_name in attn_group . layer_names :
attn_metadata_dict [ layer_name ] = attn_metadata_i
# Prepare the attention metadata for each KV cache group and make layers
# in the same group share the same metadata.
spec_decode_common_attn_metadata = None
for kv_cache_gid , kv_cache_group in enumerate (
self . kv_cache_config . kv_cache_groups ) :
cm = copy ( cm_base ) # shallow copy
# Basically only the encoder seq_lens, block_table and slot_mapping change
# for each kv_cache_group.
cm . encoder_seq_lens , cm . encoder_seq_lens_cpu = self . _get_encoder_seq_lens (
num_scheduled_tokens or { } ,
kv_cache_group . kv_cache_spec ,
num_reqs_padded ,
)
if kv_cache_gid > 0 :
cm . block_table_tensor , cm . slot_mapping = (
_get_block_table_and_slot_mapping ( kv_cache_gid )
)
if self . speculative_config and spec_decode_common_attn_metadata is None :
if isinstance ( self . drafter , EagleProposer ) :
if self . drafter . attn_layer_names [ 0 ] in kv_cache_group . layer_names :
spec_decode_common_attn_metadata = cm
else :
spec_decode_common_attn_metadata = cm
for attn_gid in range ( len ( self . attn_groups [ kv_cache_gid ] ) ) :
_build_attn_group_metadata ( kv_cache_gid , attn_gid , cm )
if self . is_mm_prefix_lm :
req_doc_ranges = { }
for req_id in self . input_batch . req_ids :
image_doc_ranges = [ ]
req_state = self . requests [ req_id ]
for mm_feature in req_state . mm_features :
pos_info = mm_feature . mm_position
img_doc_range = pos_info . extract_embeds_range ( )
image_doc_ranges . extend ( img_doc_range )
req_idx = self . input_batch . req_id_to_index [ req_id ]
req_doc_ranges [ req_idx ] = image_doc_ranges
if isinstance ( attn_metadata , list ) :
for ub_metadata in attn_metadata :
for _metadata in ub_metadata . values ( ) :
_metadata . mm_prefix_range = req_doc_ranges # type: ignore[attr-defined]
else :
for _metadata in attn_metadata . values ( ) :
_metadata . mm_prefix_range = req_doc_ranges # type: ignore[attr-defined]
if spec_decode_common_attn_metadata is not None and (
num_reqs != num_reqs_padded or num_tokens != num_tokens_padded
) :
            # Currently the drafter still only uses piecewise cudagraphs (and
            # modifies the attention metadata in place directly), and therefore
            # does not want to use padded attention metadata.
spec_decode_common_attn_metadata = (
spec_decode_common_attn_metadata . unpadded ( num_tokens , num_reqs )
)
return attn_metadata , spec_decode_common_attn_metadata
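    # Illustrative, commented-out sketch (hypothetical layer names, not part of the
    # runner): layers that belong to the same attention group share one metadata
    # object, so building it once makes it visible to every layer in the group.
    #
    #     attn_metadata = {}
    #     metadata_for_group = object()  # stands in for the builder's output
    #     for layer_name in ("model.layers.0.self_attn", "model.layers.1.self_attn"):
    #         attn_metadata[layer_name] = metadata_for_group
    #     assert (attn_metadata["model.layers.0.self_attn"]
    #             is attn_metadata["model.layers.1.self_attn"])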
    @torch.inference_mode()
    def _dummy_run(
        self,
        num_tokens: int,
        with_prefill: bool = False,
        cudagraph_runtime_mode: Optional[CUDAGraphMode] = None,
        force_attention: bool = False,
        uniform_decode: bool = False,
        is_profile: bool = False,
        create_mixed_batch: bool = False,
        allow_microbatching: bool = True,
        skip_eplb: bool = False,
        remove_lora: bool = True,
        activate_lora: bool = False,
        is_graph_capturing: bool = False,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        # Only eager mode and piecewise graph are supported for now.
        assert cudagraph_runtime_mode is None or cudagraph_runtime_mode.valid_runtime_modes()

        # If cudagraph_mode.decode_mode() == FULL and
        # cudagraph_mode.separate_routine(), we are using different graphs and/or
        # modes for mixed prefill-decode batches vs. uniform decode batches.
        # A uniform decode batch means that all requests have an identical query
        # length, except for a potentially shorter virtual request in the batch
        # that accounts for padding.
        # A uniform decode batch can either be a common pure-decode batch, where
        # max_query_len == 1, or a speculative-decode batch, where
        # max_query_len == 1 + num_spec_decode_tokens.
        # When setting max_query_len = 1, we switch to and capture the optimized
        # routine of FA2 for pure decode, i.e., FlashDecode plus an optimization
        # for GQA/MQA.
        max_query_len = self.uniform_decode_query_len if uniform_decode else num_tokens

        # Set num_scheduled_tokens based on num_tokens and max_num_seqs
        # for dummy runs with LoRA, so that the num_reqs collectively
        # have num_tokens in total.
        assert num_tokens <= self.scheduler_config.max_num_batched_tokens
        max_num_reqs = self.scheduler_config.max_num_seqs
        if create_mixed_batch:
            raise NotImplementedError(
                "create_mixed_batch is used to warm up deepgemm; vllm-ascend does not need it")
        elif uniform_decode:
            assert not create_mixed_batch
            num_reqs = min(max_num_reqs, cdiv(num_tokens, max_query_len))
            num_scheduled_tokens_list = [max_query_len] * num_reqs
            if num_tokens % max_query_len != 0:
                num_scheduled_tokens_list[-1] = num_tokens % max_query_len
        else:
            num_reqs = min(num_tokens, max_num_reqs)
            min_tokens_per_req = num_tokens // num_reqs
            num_scheduled_tokens_list = [min_tokens_per_req] * num_reqs
            num_scheduled_tokens_list[-1] += num_tokens % num_reqs
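        # Illustrative, commented-out sketch (hypothetical values, not part of the
        # runner): how the dummy batch splits num_tokens over num_reqs. Uniform
        # decode gives every request max_query_len tokens (with a shorter tail
        # request), while the generic path above spreads tokens evenly and puts
        # the remainder on the last request.
        #
        #     num_tokens, max_num_reqs = 10, 4
        #     num_reqs = min(num_tokens, max_num_reqs)          # -> 4
        #     per_req = num_tokens // num_reqs                  # -> 2
        #     lst = [per_req] * num_reqs
        #     lst[-1] += num_tokens % num_reqs                  # -> [2, 2, 2, 4]
        #     assert sum(lst) == num_tokens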
        assert sum(num_scheduled_tokens_list) == num_tokens
        assert len(num_scheduled_tokens_list) == num_reqs

        num_scheduled_tokens = np.array(num_scheduled_tokens_list, dtype=np.int32)
self . query_lens = torch . from_numpy ( num_scheduled_tokens )
num_tokens_unpadded = int ( num_scheduled_tokens . sum ( ) )
num_sampled_tokens = np . ones ( num_reqs , dtype = np . int32 )
_cudagraph_mode , batch_desc , _ , num_tokens_across_dp , _ = (
self . _determine_batch_execution_and_padding (
num_tokens = num_tokens_unpadded ,
num_reqs = num_reqs ,
num_scheduled_tokens_np = num_scheduled_tokens ,
max_num_scheduled_tokens = max_query_len ,
use_cascade_attn = False ,
allow_microbatching = allow_microbatching ,
force_eager = is_profile
or ( cudagraph_runtime_mode == CUDAGraphMode . NONE ) ,
# `force_uniform_decode` is used for cudagraph capture; because for
# capturing mixed prefill-decode batches, we sometimes use
# num_tokens == num_reqs which looks like a uniform decode batch to the
# dispatcher; but we actually want to capture a piecewise cudagraph
force_uniform_decode = uniform_decode ,
# `force_has_lora` is used for cudagraph capture; because LoRA is
# activated later in the context manager, but we need to know the
# LoRA state when determining the batch descriptor for capture
force_has_lora = activate_lora ,
)
)
if cudagraph_runtime_mode is None :
cudagraph_runtime_mode = _cudagraph_mode
else :
assert cudagraph_runtime_mode == _cudagraph_mode , (
f " Cudagraph runtime mode mismatch in dummy_run. "
f " Expected { _cudagraph_mode } , but got { cudagraph_runtime_mode } . "
)
num_tokens_padded = batch_desc . num_tokens
num_reqs_padded = (
batch_desc . num_reqs if batch_desc . num_reqs is not None else num_reqs
)
if num_tokens_across_dp is not None and num_tokens_padded != num_tokens :
            # Padding is needed if padding of `num_tokens` was triggered inside the
            # CudagraphDispatcher.
num_tokens_across_dp [ : ] = num_tokens_padded
num_scheduled_tokens = num_scheduled_tokens . repeat ( num_reqs_padded )
        # vllm-ascend does not support ubatching yet
        ubatch_slices, ubatch_slices_padded = None, None
attn_metadata : PerLayerAttnMetadata | None = None
# If force_attention is True, we always capture attention. Otherwise,
# it only happens for cudagraph_runtime_mode=FULL.
if force_attention or cudagraph_runtime_mode == CUDAGraphMode . FULL :
if create_mixed_batch :
raise NotImplementedError ( " create_mixed_batch is used for warmup deepgemm, vllm-ascend does not need it " )
self . attn_state = AscendAttentionState . DecodeOnly
if self . speculative_config and \
self . speculative_config . method == " mtp " :
# `AscendAttentionState.SpecDecoding` is only designed for mla
if self . vllm_config . model_config . use_mla :
self . attn_state = AscendAttentionState . SpecDecoding
else :
self . attn_state = AscendAttentionState . ChunkedPrefill
# The reason why we use a fixed seq_len rather than max_query_len is that
# _npu_paged_attention_get_workspace only returns max workspace with specific
# seq_lens. We use this seq_len only when capturing graph, and still use max_query_len
# in inference. This will be removed once npu_fused_infer_attention_score
# outperforms _npu_paged_attention on all cases.
seq_lens = SEQ_LEN_WITH_MAX_PA_WORKSPACE if is_graph_capturing and using_paged_attention ( num_tokens , self . vllm_config ) else max_query_len # type: ignore[assignment]
self . seq_lens . np [ : num_reqs_padded ] = seq_lens
self . seq_lens . np [ num_reqs_padded : ] = 0
self . seq_lens . copy_to_gpu ( )
cum_num_tokens , _ = self . _get_cumsum_and_arange ( num_scheduled_tokens )
self . query_start_loc . np [ 1 : num_reqs_padded + 1 ] = cum_num_tokens
self . query_start_loc . copy_to_gpu ( )
            num_reqs_padded = self._pad_query_start_loc_for_fia(
                num_tokens_padded, num_reqs_padded, num_reqs
            )
pad_attn = cudagraph_runtime_mode == CUDAGraphMode . FULL
attn_metadata , _ = self . _build_attention_metadata (
num_tokens = num_tokens_unpadded ,
num_tokens_padded = num_tokens_padded ,
num_reqs = num_reqs_padded ,
max_query_len = max_query_len ,
ubatch_slices = ubatch_slices_padded if pad_attn else ubatch_slices ,
for_cudagraph_capture = is_graph_capturing ,
num_scheduled_tokens_np = num_scheduled_tokens ,
)
with self . maybe_dummy_run_with_lora (
self . lora_config ,
num_scheduled_tokens ,
num_sampled_tokens ,
) :
            # Make sure padding doesn't exceed max_num_tokens
            assert num_tokens_padded <= self.max_num_tokens
            if self.is_multimodal_model and not self.model_config.is_encoder_decoder:
                input_ids = None
                inputs_embeds = self.inputs_embeds.gpu[:num_tokens_padded]
            elif self.enable_prompt_embeds:
                input_ids = None
                inputs_embeds = self.inputs_embeds.gpu[:num_tokens_padded]
            else:
                input_ids = self.input_ids.gpu[:num_tokens_padded]
                inputs_embeds = None

            if self.uses_mrope:
                positions = self.mrope_positions.gpu[:, :num_tokens_padded]
            elif self.uses_xdrope_dim > 0:
                positions = self.xdrope_positions.gpu[:, :num_tokens_padded]
            else:
                positions = self.positions.gpu[:num_tokens_padded]

            # update global cos, sin
            update_cos_sin(positions)
if get_pp_group ( ) . is_first_rank :
intermediate_tensors = None
else :
                # When PP and flashcomm1 are enabled, the space estimated during
                # dummy_run must divide num_tokens by tp_size; otherwise, non-first
                # PP ranks would effectively perform an extra all-gather, leading
                # to incorrect memory estimation and potentially causing OOM.
                intermediate_tokens = num_tokens_padded
if enable_sp ( ) :
tp_size = get_tensor_model_parallel_world_size ( )
                    intermediate_tokens = (num_tokens_padded + tp_size - 1) // tp_size
                if self.intermediate_tensors is None:
max_actual_tokens = self . max_num_tokens
if enable_sp ( ) :
max_actual_tokens = ( self . max_num_tokens + tp_size -
1 ) / / tp_size
                        max_actual_tokens = (self.max_num_tokens + tp_size -
                                             1) // tp_size
                    self.intermediate_tensors = (
dtype = self . dtype ,
device = self . device ) )
intermediate_tensors = IntermediateTensors ( {
                    k: v[:intermediate_tokens]
for k , v in self . intermediate_tensors . items ( )
} )
need_dummy_logits = ( not is_profile and lmhead_tp_enable ( ) )
            max_num_reqs_across_dp = max_num_reqs * self.uniform_decode_query_len
dummy_indices = torch . zeros ( max_num_reqs_across_dp ,
dtype = torch . int32 )
def dummy_compute_logits ( hidden_states ) :
if not need_dummy_logits :
return None
return self . model . compute_logits ( hidden_states [ dummy_indices ] )
def dummy_drafter_compute_logits ( hidden_states ) :
if not need_dummy_logits or self . drafter is None :
return
if hasattr ( self . drafter , " model " ) and hasattr (
self . drafter . model , " compute_logits " ) :
return self . drafter . model . compute_logits (
hidden_states [ dummy_indices ] )
with set_ascend_forward_context (
attn_metadata ,
self . vllm_config ,
                num_tokens=num_tokens_padded,
num_tokens_across_dp = num_tokens_across_dp ,
                in_profile_run=is_profile,
                num_actual_tokens=num_tokens_padded,
                aclgraph_runtime_mode=cudagraph_runtime_mode,
                batch_descriptor=batch_desc,
                model_instance=self.model):
outputs = self . _model_forward (
num_tokens_padded , input_ids , positions ,
                    intermediate_tensors, inputs_embeds)
if self . use_aux_hidden_state_outputs :
hidden_states , _ = outputs
else :
hidden_states = outputs
dummy_compute_logits ( hidden_states )
if self . drafter :
self . drafter . dummy_run (
num_tokens = num_tokens_padded ,
with_prefill = with_prefill ,
                        num_reqs=num_reqs_padded,
                        num_tokens_across_dp=num_tokens_across_dp,
                        aclgraph_runtime_mode=cudagraph_runtime_mode,
                        batch_descriptor=batch_desc,
                        dummy_compute_logits=dummy_drafter_compute_logits,
in_graph_capturing = not force_attention ,
is_profile = is_profile )
            if is_profile and self.dynamic_eplb:
                self.model.clear_all_moe_loads()
            if self.dynamic_eplb:
                self.eplb_updator.take_update_info_from_eplb_process()
                self.eplb_updator.forward_end()
        return hidden_states, hidden_states
    @torch.inference_mode()
    def _dummy_sampler_run(
        self,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        output = None

        # For profiling, use the maximum num_reqs that collectively have the
        # maximum num_tokens.
        min_tokens_per_req = self.max_num_tokens // self.max_num_reqs
        num_scheduled_tokens_list = [min_tokens_per_req] * self.max_num_reqs
        num_scheduled_tokens_list[-1] += self.max_num_tokens % self.max_num_reqs
        num_scheduled_tokens = np.array(num_scheduled_tokens_list, dtype=np.int32)
        logit_indices = np.cumsum(num_scheduled_tokens) - 1
        # TODO: need to run a dummy sampler for the generate task
        hidden_states = hidden_states[logit_indices]
        output = self.model.compute_logits(hidden_states)
        return output
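    # Illustrative, commented-out sketch (hypothetical values, not part of the
    # runner): cumsum - 1 picks the index of the last scheduled token of every
    # request, which is the only position whose logits the sampler needs.
    #
    #     import numpy as np
    #     num_scheduled_tokens = np.array([3, 1, 4], dtype=np.int32)
    #     logit_indices = np.cumsum(num_scheduled_tokens) - 1   # -> [2, 3, 7]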
    def profile_run(self) -> None:
        self.eplb_warmup()
        mc2_tokens_capacity = get_mc2_tokens_capacity()
        if self.max_num_tokens > mc2_tokens_capacity and \
                select_moe_comm_method(mc2_tokens_capacity, self.vllm_config) in {MoECommType.MC2, MoECommType.FUSED_MC2}:
            self._dummy_run(mc2_tokens_capacity,
                            with_prefill=True,
                            is_profile=True)
        origin_max_num_tokens = self.max_num_tokens
        # In the PCP scenario, the profile run must use the split sequence length.
        # TODO: once the vLLM PCP feature lands, upstream this logic to the community.
        if self.pcp_size > 1:
            self.max_num_tokens = math.ceil(self.max_num_tokens /
                                            (self.pcp_size * 2)) * 2
        super().profile_run()
        self.max_num_tokens = origin_max_num_tokens
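    # Illustrative, commented-out sketch (hypothetical values, not part of the
    # runner): the PCP profile run sizes the dummy batch with the per-rank split
    # sequence length, rounded up to an even number of tokens.
    #
    #     import math
    #     max_num_tokens, pcp_size = 1000, 4
    #     split = math.ceil(max_num_tokens / (pcp_size * 2)) * 2   # -> 250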
def eplb_warmup ( self ) :
if self . dynamic_eplb and not self . is_eplb_warmuped :
self . is_eplb_warmuped = True
self . eplb_adaptor = VllmEplbAdaptor ( model = self . model )
self . eplb_loader . set_adator ( self . eplb_adaptor )
self . eplb_updator . set_adaptor ( self . eplb_adaptor )
self . eplb_updator . warm_up_eplb ( )
def load_model ( self ) - > None :
logger . info ( " Starting to load model %s ... " , self . model_config . model )
with DeviceMemoryProfiler ( ) as m : # noqa: SIM117
self . model = get_model ( vllm_config = self . vllm_config )
            if self.dynamic_eplb:
                model_register(self.model, self.model_config)
if self . drafter :
logger . info ( " Loading drafter model... " )
with get_tp_context ( self . drafter ) :
self . drafter . load_model ( self . model )
            if self.use_aux_hidden_state_outputs:
self . model . set_aux_hidden_state_layers (
self . model . get_eagle3_aux_hidden_state_layers ( ) )
            if self.lora_config:
self . model = self . load_lora_model ( self . model , self . vllm_config ,
self . device )
        logger.info("Loading model weights took %.4f GB",
                    m.consumed_memory / float(2**30))
# wrap the model with full graph wrapper if needed.
if self . compilation_config . cudagraph_mode . has_full_cudagraphs ( ) :
            self.update_stream: torch.npu.Stream = torch.npu.Stream()
self . model = ACLGraphWrapper ( self . model ,
self . vllm_config ,
runtime_mode = CUDAGraphMode . FULL )
def initialize_kv_cache ( self , kv_cache_config : KVCacheConfig ) - > None :
"""
Initialize KV cache based on ` kv_cache_config ` .
Args :
kv_cache_config : Configuration for the KV cache , including the KV
cache size of each layer
"""
kv_cache_config = deepcopy ( kv_cache_config )
self . kv_cache_config = kv_cache_config
        self.may_add_encoder_only_layers_to_kv_cache_config()
        self.maybe_add_kv_sharing_layers_to_kv_cache_groups(kv_cache_config)
        # NOTE(cmq): initialize_attn_backend must run before self.attn_groups is used
self . initialize_attn_backend ( kv_cache_config )
        self.use_hybrid_blocks = (len(self.attn_groups) > 1)
        # NOTE: Currently, we determine whether we need `num_accepted_tokens` through `MambaSpec`.
self . need_accepted_tokens = any ( [
isinstance ( attn_group [ 0 ] . kv_cache_spec , MambaSpec )
for attn_group in self . attn_groups
] )
        self.may_reinitialize_input_batch(kv_cache_config)
kv_caches = self . initialize_kv_cache_tensors ( kv_cache_config )
if has_kv_transfer_group ( ) :
get_kv_transfer_group ( ) . register_kv_caches ( kv_caches )
def _align_memory ( self , tensor : torch . Tensor ,
alignment : int ) - > torch . Tensor :
data_ptr = tensor . data_ptr ( )
aligned_addr = ( data_ptr + alignment - 1 ) / / alignment * alignment
offset = ( aligned_addr - data_ptr ) / / tensor . element_size ( )
return tensor [ int ( offset ) : ]
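    # Illustrative, commented-out sketch (hypothetical address, not part of the
    # runner): rounding a raw data pointer up to the next 2 MB boundary and
    # expressing the gap as an element offset, which is what _align_memory does
    # for int8 tensors.
    #
    #     alignment = 2 * 1024 * 1024
    #     data_ptr = 0x7F0000000123
    #     aligned_addr = (data_ptr + alignment - 1) // alignment * alignment
    #     offset_elems = aligned_addr - data_ptr   # element_size == 1 for int8
    #     assert aligned_addr % alignment == 0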
    def initialize_kv_cache_tensors(
            self, kv_cache_config: KVCacheConfig) -> dict[str, torch.Tensor]:
"""
Initialize the memory buffer for KV cache .
Args :
kv_cache_config : The KV cache config
Returns :
Dict [ str , torch . Tensor ] : A map between layer names to their
corresponding memory buffer for KV cache .
"""
# Initialize the memory buffer for KV cache
kv_cache_raw_tensors = self . _allocate_kv_cache_tensors ( kv_cache_config )
# Change the memory buffer to the desired shape
kv_caches = self . _reshape_kv_cache_tensors ( kv_cache_config ,
kv_cache_raw_tensors )
# Set up cross-layer KV cache sharing
for layer_name , target_layer_name in self . shared_kv_cache_layers . items (
) :
logger . debug ( " %s reuses KV cache of %s " , layer_name ,
target_layer_name )
kv_caches [ layer_name ] = kv_caches [ target_layer_name ]
        from vllm.v1.worker.utils import bind_kv_cache
        num_attn_module = 2 if self.model_config.hf_text_config.model_type == "longcat_flash" else 1
bind_kv_cache ( kv_caches ,
self . compilation_config . static_forward_context ,
                      self.kv_caches, num_attn_module)
        return kv_caches
    def _allocate_kv_cache_tensors(
            self, kv_cache_config: KVCacheConfig) -> dict[str, torch.Tensor]:
        """
        Initializes the KV cache buffer with the correct size. The buffer needs
        to be reshaped to the desired shape before being used by the models.
        NOTE: To support prefill disaggregation, we need to split the kvcache
        tensor into a k_cache and a v_cache, and the addresses of both are
        aligned to 2 MB.
        Args:
            kv_cache_config: The KV cache config
        Returns:
            dict[str, torch.Tensor]: A map from layer names to their
                corresponding memory buffers for KV cache.
            dict[str, tuple(torch.Tensor, torch.Tensor)]: A map from layer names
                to their corresponding memory buffers for K cache and V cache.
        """
        # init kv cache tensors
        kv_cache_raw_tensors: dict[str, Union[torch.Tensor,
                                              Optional[torch.Tensor]]] = {}
        # prefill disaggregation needs the cache tensor addresses to be 2 MB aligned
        alignment = 2 * 1024 * 1024
for kv_cache_tensor in kv_cache_config . kv_cache_tensors :
# TODO: REFACTOR ME to sharing hybrid cache
for idx in range ( len ( kv_cache_tensor . shared_by ) ) :
layer_name = kv_cache_tensor . shared_by [ idx ]
if " linear_attn " in layer_name and layer_name not in kv_cache_raw_tensors . keys (
) :
                    # for mamba linear attention
if self . vllm_config . kv_transfer_config is None :
tensor = torch . zeros ( kv_cache_tensor . size ,
dtype = torch . int8 ,
device = self . device )
else :
cache_size_aligned = kv_cache_tensor . size + alignment
tensor = torch . zeros ( cache_size_aligned ,
dtype = torch . int8 ,
device = self . device )
tensor = self . _align_memory (
tensor , alignment ) [ : kv_cache_tensor . size ]
                    for layer_name_inner in kv_cache_tensor.shared_by:
                        # share the kvcache between the self_attn specs in the same group
if " linear_attn " in layer_name_inner :
kv_cache_raw_tensors [ layer_name_inner ] = tensor
elif " attn " in layer_name and layer_name not in kv_cache_raw_tensors . keys (
) :
                    # NOTE: We need to init the k cache tensor (the nope cache tensor
                    # in mla) and the v cache tensor (the rope cache tensor in mla)
                    # separately to support prefill disaggregation, as it only
                    # supports `num_blocks` as the 0th dim of kv_cache.
                    # For deepseek mla, we need to split the cache tensor according
                    # to the nope head dim and the rope head dim.
if self . model_config . use_mla :
                    if self.model_config.use_mla:
                        head_size = self.model_config.hf_text_config.qk_rope_head_dim + \
                            self.model_config.hf_text_config.kv_lora_rank
                    dsa_k_cache_factor = None
                    dsa_k_cache_size = None
if not self . model_config . use_mla :
2025-11-04 17:26:54 +08:00
# for non-mla model, use FullAttentionSpec
k_tensor_split_factor = 2
v_tensor_split_factor = 2
elif self . use_sparse :
# for deepseek v3.2, DSA use FullAttentionSpec
# FullAttentionSpec allocate 2 * mla page size bytes,
# and we use half of that for k cache in DSA
dsa_k_cache_factor = 2
k_tensor_split_factor = 2 * head_size / self . model_config . hf_text_config . kv_lora_rank
v_tensor_split_factor = 2 * head_size / self . model_config . hf_text_config . qk_rope_head_dim
dsa_k_cache_size = int ( kv_cache_tensor . size / /
dsa_k_cache_factor )
else :
# for other deepseek models, use MLAAttentionSpec
k_tensor_split_factor = head_size / self . model_config . hf_text_config . kv_lora_rank
v_tensor_split_factor = head_size / self . model_config . hf_text_config . qk_rope_head_dim
k_tensor_size = int ( kv_cache_tensor . size / /
k_tensor_split_factor )
v_tensor_size = int ( kv_cache_tensor . size / /
v_tensor_split_factor )
                    # for other attentions, e.g., self_attn, sliding window attn
if self . vllm_config . kv_transfer_config is None :
                        k_tensor = torch.zeros(k_tensor_size,
                                               dtype=torch.int8,
                                               device=self.device)
                        v_tensor = torch.zeros(v_tensor_size,
                                               dtype=torch.int8,
                                               device=self.device)
#### k cache: for deepseek sparse attention
if dsa_k_cache_factor is not None :
dsa_k_cache_tensor = torch . zeros (
dsa_k_cache_size ,
dtype = torch . int8 ,
device = self . device )
                    else:
                        k_tensor = torch.zeros(k_tensor_size + alignment,
                                               dtype=torch.int8,
                                               device=self.device)
                        v_tensor = torch.zeros(v_tensor_size + alignment,
                                               dtype=torch.int8,
                                               device=self.device)
k_tensor = self . _align_memory (
k_tensor , alignment ) [ : k_tensor_size ]
v_tensor = self . _align_memory (
v_tensor , alignment ) [ : v_tensor_size ]
#### k cache: for deepseek sparse attention
if dsa_k_cache_factor is not None and dsa_k_cache_size is not None :
dsa_k_cache_tensor = torch . zeros (
dsa_k_cache_size + alignment ,
dtype = torch . int8 ,
device = self . device )
dsa_k_cache_tensor = self . _align_memory (
dsa_k_cache_tensor ,
alignment ) [ : dsa_k_cache_size ]
                for layer_name_inner in kv_cache_tensor.shared_by:
                    # share the kvcache between the self_attn specs in the same group
                    if ("attn" in layer_name_inner
                            and "linear_attn" not in layer_name_inner):
                        kv_cache_raw_tensors[layer_name_inner] = (k_tensor, v_tensor) if \
                            not self.use_sparse else (k_tensor, v_tensor, dsa_k_cache_tensor)
layer_names = set ( )
for group in kv_cache_config . kv_cache_groups :
for layer_name in group . layer_names :
if layer_name in self . runner_only_attn_layers :
continue
layer_names . add ( layer_name )
assert layer_names == set ( kv_cache_raw_tensors . keys (
) ) , " Some layers are not correctly initialized "
return kv_cache_raw_tensors
def _reshape_kv_cache_tensors (
self ,
kv_cache_config : KVCacheConfig ,
kv_cache_raw_tensors : dict [ str , torch . Tensor ] ,
) - > dict [ str , torch . Tensor ] :
"""
Reshape the KV cache tensors to the desired shape and dtype .
Args :
kv_cache_config : The KV cache config
kv_cache_raw_tensors : The KV cache buffer of each layer , with
correct size but uninitialized shape .
Returns :
Dict [ str , torch . Tensor ] : A map between layer names to their
corresponding memory buffer for KV cache .
"""
        kv_caches: Dict[str, torch.Tensor] = {}
        for group in self._kv_cache_spec_attn_group_iterator():
            kv_cache_spec = group.kv_cache_spec
            attn_backend = group.backend
            for layer_name in group.layer_names:
                if layer_name in self.runner_only_attn_layers:
                    continue
                # TODO: remove this after the OOM issue is located and fixed; otherwise,
                # some models may encounter OOM issues.
                if isinstance(kv_cache_spec, AttentionSpec):
raw_dsa_k_tensor = None
if self . use_sparse :
raw_k_tensor , raw_v_tensor , raw_dsa_k_tensor = kv_cache_raw_tensors [ # type: ignore
layer_name ]
assert raw_dsa_k_tensor is not None
sum_page_size_bytes = raw_k_tensor . numel (
) + raw_v_tensor . numel ( ) + raw_dsa_k_tensor . numel ( )
else :
raw_k_tensor , raw_v_tensor = kv_cache_raw_tensors [ # type: ignore
layer_name ]
sum_page_size_bytes = raw_k_tensor . numel (
) + raw_v_tensor . numel ( )
                    assert raw_k_tensor is not None
                    assert raw_v_tensor is not None
                    assert sum_page_size_bytes % kv_cache_spec.page_size_bytes == 0
                    num_blocks = sum_page_size_bytes // kv_cache_spec.page_size_bytes
# `num_blocks` is the number of blocks the model runner can use.
# `kv_cache_config.num_blocks` is the number of blocks that
# KVCacheManager may allocate.
# Since different GPUs may have different number of layers and
# different memory capacities, `num_blocks` can be different on
# different GPUs, and `kv_cache_config.num_blocks` is set to
# the min of all `num_blocks`. Verify it here.
assert num_blocks > = kv_cache_config . num_blocks
                    if hasattr(attn_backend, "get_supported_block_size"
                               ) and self.use_hybrid_blocks:
                        block_size = attn_backend.get_supported_block_size()[0]
block_size_chunk = kv_cache_spec . block_size / / block_size
kv_cache_shape = attn_backend . get_kv_cache_shape (
num_blocks * block_size_chunk , block_size ,
kv_cache_spec . num_kv_heads ,
kv_cache_spec . head_size )
else :
kv_cache_shape = self . attn_backend . get_kv_cache_shape (
num_blocks , kv_cache_spec . block_size ,
kv_cache_spec . num_kv_heads ,
kv_cache_spec . head_size )
                    dtype = kv_cache_spec.dtype
                    if not self.model_config.use_mla:
k_shape = kv_cache_shape [ 1 : ]
v_shape = k_shape
else :
# k_cache: nope_cache v_cache: rope_cache
mla_num_blocks , mla_block_size , num_kv_heads , _ = kv_cache_shape
k_shape = [
mla_num_blocks , mla_block_size , num_kv_heads ,
self . model_config . hf_text_config . kv_lora_rank
]
v_shape = [
mla_num_blocks , mla_block_size , num_kv_heads ,
self . model_config . hf_text_config . qk_rope_head_dim
]
                    k_cache = raw_k_tensor.view(dtype).view(k_shape)
                    v_cache = raw_v_tensor.view(dtype).view(v_shape)
                    if get_ascend_device_type() == AscendDeviceType._310P:
                        k_cache = maybe_trans_nz(k_cache)
                        v_cache = maybe_trans_nz(v_cache)
                    if self.use_sparse and raw_dsa_k_tensor is not None:
                        dsa_k_cache_shape = (num_blocks,
                                             kv_cache_spec.block_size, 1, 128)
                        dsa_k_cache_size = (num_blocks *
                                            kv_cache_spec.block_size * 128 *
                                            dtype.itemsize)
                        dsa_k_cache = raw_dsa_k_tensor[:dsa_k_cache_size].view(
                            dtype).view(dsa_k_cache_shape)
                        kv_caches[layer_name] = (k_cache, v_cache, dsa_k_cache)
                    else:
                        kv_caches[layer_name] = (k_cache, v_cache)
                elif isinstance(kv_cache_spec, MambaSpec):
                    raw_tensor = kv_cache_raw_tensors[layer_name]
                    assert raw_tensor is not None
                    assert raw_tensor.numel() % kv_cache_spec.page_size_bytes == 0
                    num_blocks = raw_tensor.numel() // kv_cache_spec.page_size_bytes
                    # `num_blocks` is the number of blocks the model runner can use.
                    # `kv_cache_config.num_blocks` is the number of blocks that
                    # KVCacheManager may allocate.
                    # Since different GPUs may have different number of layers and
                    # different memory capacities, `num_blocks` can be different on
                    # different GPUs, and `kv_cache_config.num_blocks` is set to
                    # the min of all `num_blocks`. Verify it here.
                    assert num_blocks >= kv_cache_config.num_blocks
                    state_tensors = []
                    target_idx = 0
                    start_idx = 0
                    for shape, dtype in zip(kv_cache_spec.shapes,
                                            kv_cache_spec.dtypes):
                        # Normally this loop sees both a conv state and an ssm
                        # state; some special models only have a conv state.
                        target_shape = (num_blocks, *shape)
                        target_idx += torch.prod(
                            torch.tensor(target_shape)).item()
                        tensor = raw_tensor.view(
                            dtype)[start_idx:target_idx].view(target_shape)
                        start_idx = target_idx
                        state_tensors.append(tensor)
                    kv_caches[layer_name] = state_tensors
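                    # Illustrative example (hypothetical spec): with
                    # shapes == [(4, 8), (16,)] and num_blocks == 2, the raw
                    # buffer is sliced into a (2, 4, 8) conv-state tensor
                    # followed by a (2, 16) ssm-state tensor, each viewed with
                    # its own dtype.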
                else:
                    raise ValueError("Unknown KV cache spec type.")

        return kv_caches

    def may_reinitialize_input_batch(self,
                                     kv_cache_config: KVCacheConfig) -> None:
        """
        Re-initialize the input batch if the block sizes are different from
        `[self.cache_config.block_size]`. This usually happens when there
        are multiple KV cache groups.

        Args:
            kv_cache_config: The KV cache configuration.
        """
        block_sizes = [
            kv_cache_group.kv_cache_spec.block_size
            for kv_cache_group in kv_cache_config.kv_cache_groups
            if not isinstance(kv_cache_group.kv_cache_spec,
                              EncoderOnlyAttentionSpec)
        ]
        # Generate kernel_block_sizes that matches each block_size.
        # For attention backends that support virtual block splitting,
        # use the supported block sizes from the backend.
        # For other backends (like Mamba), use [0] (no splitting).
        kernel_block_sizes = []
        for kv_cache_group_id, kv_cache_group in enumerate(
                kv_cache_config.kv_cache_groups):
            kv_cache_spec = kv_cache_group.kv_cache_spec
            if isinstance(kv_cache_spec, UniformTypeKVCacheSpecs):
                # All layers in the UniformTypeKVCacheSpecs have the same type,
                # so pick an arbitrary one to dispatch on.
                kv_cache_spec = next(
                    iter(kv_cache_spec.kv_cache_specs.values()))
            if isinstance(kv_cache_spec, EncoderOnlyAttentionSpec):
                continue
            elif isinstance(kv_cache_spec, AttentionSpec):
                # This is an attention backend that supports virtual
                # block splitting. Get the supported block sizes from
                # the backend.
                try:
                    attn_groups = self.attn_groups[kv_cache_group_id]
                except IndexError:
                    attn_groups = None
                if attn_groups and self.use_hybrid_blocks:
                    # Use the backend's supported block size list.
                    backend = attn_groups[0].backend
                    supported_sizes = backend.get_supported_block_size()
                    # If no specific sizes are supported, fall back to the
                    # cache config block_size.
                    kernel_block_size_list = (supported_sizes
                                              if supported_sizes else
                                              [self.cache_config.block_size])
                else:
                    # Fall back to the cache config block_size if no backend
                    # is found.
                    kernel_block_size_list = [self.cache_config.block_size]
                kernel_block_sizes.append(kernel_block_size_list)
            else:
                # This is likely Mamba or another non-attention cache;
                # no splitting.
                # NOTE: set kernel_block_sizes to [0] to disable slot-mapping
                # computation for mamba blocks. In this case,
                # BlockTable.block_size will never equal kernel_block_sizes[0].
                kernel_block_sizes.append([0])
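        # Illustrative outcome (hypothetical groups): with one full-attention
        # group (block_size 128) and one Mamba group (block_size 256), the lists
        # above could end up as block_sizes == [128, 256] and
        # kernel_block_sizes == [[128], [0]], which then triggers the
        # re-initialization below.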
        if block_sizes != [
                self.cache_config.block_size
        ] or kernel_block_sizes != [[self.cache_config.block_size]]:
            assert self.cache_config.cpu_offload_gb == 0, (
                "Cannot re-initialize the input batch when CPU weight "
                "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 "  # noqa: E501
                "for more details.")
            self.input_batch = NPUInputBatch(
                max_num_reqs=self.max_num_reqs,
                max_model_len=max(self.model_config.max_model_len,
                                  self.max_encoder_len),
                max_num_batched_tokens=self.max_num_tokens,
                device=self.device,
                pin_memory=self.pin_memory,
                vocab_size=self.model_config.get_vocab_size(),
                block_sizes=block_sizes,
                is_spec_decode=bool(self.vllm_config.speculative_config),
                logitsprocs=self.input_batch.logitsprocs,
                is_pooling_model=self.is_pooling_model,
                num_speculative_tokens=(
                    self.vllm_config.speculative_config.num_speculative_tokens
                    if self.vllm_config.speculative_config else 0),
                kernel_block_sizes=kernel_block_sizes,
            )

    def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None:
        """
        Initialize the attention backends and attention metadata builders.
        """
        assert len(self.attn_groups) == 0, \
            "Attention backends are already initialized"

        class AttentionGroupKey(NamedTuple):
            attn_backend: type[AttentionBackend]
            kv_cache_spec: KVCacheSpec

        def get_attn_backends_for_group(
            kv_cache_group_spec: KVCacheGroupSpec,
        ) -> tuple[dict[AttentionGroupKey, list[str]],
                   set[type[AttentionBackend]]]:
            layers = get_layers_from_vllm_config(
                self.vllm_config, AttentionLayerBase,
                kv_cache_group_spec.layer_names)
            attn_backends = {}
            attn_backend_layers = defaultdict(list)
            # Dedupe based on full class name; this is a bit safer than
            # using the class itself as the key because when we create dynamic
            # attention backend subclasses (e.g. ChunkedLocalAttention), unless
            # they are cached correctly, there will be different objects per
            # layer.
            for layer_name in kv_cache_group_spec.layer_names:
                attn_backend = layers[layer_name].get_attn_backend()
                full_cls_name = attn_backend.full_cls_name()
                layer_kv_cache_spec = kv_cache_group_spec.kv_cache_spec
                if isinstance(layer_kv_cache_spec, UniformTypeKVCacheSpecs):
                    layer_kv_cache_spec = layer_kv_cache_spec.kv_cache_specs[
                        layer_name]
                key = (full_cls_name, layer_kv_cache_spec)
                attn_backends[key] = AttentionGroupKey(attn_backend,
                                                       layer_kv_cache_spec)
                attn_backend_layers[key].append(layer_name)
            return (
                {
                    attn_backends[k]: v
                    for k, v in attn_backend_layers.items()
                },
                set(group_key.attn_backend
                    for group_key in attn_backends.values()),
            )

        def create_attn_groups(
                attn_backends_map: dict[AttentionBackend, list[str]],
                kv_cache_group_id: int) -> list[AttentionGroup]:
            attn_groups: list[AttentionGroup] = []
            for (attn_backend,
                 kv_cache_spec), layer_names in attn_backends_map.items():
                attn_metadata_builders = []
                attn_metadata_builders.append(attn_backend.get_builder_cls()(
                    kv_cache_spec,
                    layer_names,
                    self.vllm_config,
                    self.device,
                ))
                attn_group = AttentionGroup(attn_backend, layer_names,
                                            kv_cache_spec, kv_cache_group_id,
                                            attn_metadata_builders)
                attn_groups.append(attn_group)
            return attn_groups

        attention_backend_maps = []
        attention_backend_list = []
        for kv_cache_group_spec in kv_cache_config.kv_cache_groups:
            attn_backends = get_attn_backends_for_group(kv_cache_group_spec)
            attention_backend_maps.append(attn_backends[0])
            attention_backend_list.append(attn_backends[1])
        self._check_and_update_cudagraph_mode(attention_backend_list,
                                              kv_cache_config.kv_cache_groups)

        for i, kv_cache_group_spec in enumerate(
                kv_cache_config.kv_cache_groups):
            attn_backends = get_attn_backends_for_group(  # type: ignore
                kv_cache_group_spec)
            self.attn_groups.append(create_attn_groups(attn_backends[0], i))

        # Calculate reorder batch threshold (if needed)
        self.calculate_reorder_batch_threshold()

    def calculate_reorder_batch_threshold(self) -> None:
        """
        Check that, if any backends reorder batches, the reordering
        is compatible (e.g., the decode threshold is the same).
        """
        for group in self._attn_group_iterator():
            attn_metadata_builder_i = group.get_metadata_builder()
            if hasattr(attn_metadata_builder_i,
                       "reorder_batch_threshold"):  # noqa
                # Check that, if any backends reorder batches, the reordering
                # is compatible (e.g., the decode threshold is the same).
                reorder_batch_threshold_i = (
                    attn_metadata_builder_i.reorder_batch_threshold)
                if reorder_batch_threshold_i is not None:  # noqa
                    if self.reorder_batch_threshold is not None:
                        if reorder_batch_threshold_i != \
                                self.reorder_batch_threshold:
                            raise ValueError(
                                f"Attention backend reorders decodes with "
                                f"threshold {reorder_batch_threshold_i} but "
                                f"another backend uses threshold "
                                f"{self.reorder_batch_threshold}")
                    else:
                        self.reorder_batch_threshold = reorder_batch_threshold_i  # noqa

    def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
        """
        Generates the KVCacheSpec by parsing the kv cache format from each
        Attention module in the static forward context.

        Returns:
            KVCacheSpec: A dictionary mapping layer names to their KV cache
            format. Layers that do not need KV cache are not included.
        """
        if has_ec_transfer() and get_ec_transfer().is_producer:
            return {}
        kv_cache_spec: dict[str, KVCacheSpec] = {}
        attn_layers = get_layers_from_vllm_config(self.vllm_config,
                                                  AttentionLayerBase)
        # NOTE: Must process Attention/MLAAttention before MambaBase to maintain
        # the ordering expected by graph parameter update logic in attention
        # backends.
        mamba_layers: dict[str, MambaBase] = {}
        for layer_name, attn_module in attn_layers.items():
            if isinstance(attn_module, Attention):
                if (kv_tgt_layer :=
                        attn_module.kv_sharing_target_layer_name) is not None:
                    # The layer doesn't need its own KV cache and will use that of
                    # the target layer. We skip creating a KVCacheSpec for it, so
                    # that KV cache management logic will act as if this layer does
                    # not exist, and doesn't allocate KV cache for the layer. This
                    # enables the memory saving of cross-layer KV sharing, allowing
                    # a given amount of memory to accommodate longer context lengths
                    # or enable more requests to be processed simultaneously.
                    self.shared_kv_cache_layers[layer_name] = kv_tgt_layer
                    continue
                if spec := attn_module.get_kv_cache_spec(self.vllm_config):
                    kv_cache_spec[layer_name] = spec
            elif isinstance(attn_module, MLAAttention):
                if self.use_sparse:
                    # TODO(cmq): This is a hacky way to fix the deepseek kv cache
                    # when using DSA. Fixing the spec in vLLM is the proper
                    # long-term solution.
                    block_size = self.vllm_config.cache_config.block_size
                    kv_cache_spec[layer_name] = FullAttentionSpec(
                        block_size=block_size,
                        num_kv_heads=1,
                        head_size=attn_module.head_size,
                        dtype=self.kv_cache_dtype)
                elif spec := attn_module.get_kv_cache_spec(self.vllm_config):
                    kv_cache_spec[layer_name] = spec
            elif isinstance(attn_module, MambaBase):
                mamba_layers[layer_name] = attn_module
        if len(mamba_layers) > 0:
            if self.vllm_config.cache_config.enable_prefix_caching:
                raise NotImplementedError(
                    "Prefix caching is not supported for Mamba yet.")
            for layer_name, mamba_module in mamba_layers.items():
                if spec := mamba_module.get_kv_cache_spec(self.vllm_config):
                    kv_cache_spec[layer_name] = spec

        return kv_cache_spec

    def _check_and_update_cudagraph_mode(
        self,
        attention_backends: list[set[type[AttentionBackend]]],
        kv_cache_groups: list[KVCacheGroupSpec],
    ) -> None:
        super()._check_and_update_cudagraph_mode(attention_backends,
                                                 kv_cache_groups)

        # NOTE: Since aclgraph_batch_sizes cannot be determined until here,
        # we set the graph params right before initializing the keys.
        if self.use_aclgraph:
            set_graph_params(self.cudagraph_batch_sizes)
            if self.speculative_config:
                set_draft_graph_params(self.cudagraph_batch_sizes)

    def capture_model(self) -> None:
        gpu_model_runner_cls = next((cls for cls in self.__class__.__mro__
                                     if cls.__name__ == "GPUModelRunner"),
                                    None)
        if gpu_model_runner_cls is None:
            raise TypeError("Could not find GPUModelRunner in the MRO. "
                            "The class hierarchy may have changed.")
        parent_module_name = gpu_model_runner_cls.__module__
        with _torch_cuda_wrapper(), _replace_gpu_model_runner_function_wrapper(
                parent_module_name):
            GPUModelRunner.capture_model(self)

    def _prepare_multimodal_fields(self):
        """
        Ensures specific multimodal tensors are on CPU.

        This is necessary for fields like 'grid_thw' which are converted to
        numpy inside the model's forward pass.
        """
        if not self.multimodal_cpu_fields:
            return
        req_ids = self.input_batch.req_ids
        for req_id in req_ids:
            req = self.requests.get(req_id)
            if req is None:
                continue
            mm_data = getattr(req, 'multimodal_data', None)
            if not mm_data:
                continue
            for field in self.multimodal_cpu_fields:
                if field in mm_data:
                    tensor = mm_data[field]
                    if isinstance(
                            tensor,
                            torch.Tensor) and tensor.device.type != 'cpu':
                        mm_data[field] = tensor.cpu()


def _post_process_cudagraph_mode(tensor: torch.Tensor) -> int:
    """
    Synchronize cudagraph_mode across DP ranks by taking the minimum.
    If any rank has NONE (0), all ranks use NONE.
    This ensures all ranks send consistent values (all padded or all unpadded).
    """
    return int(tensor[1, :].min().item())
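
# Illustrative example (hypothetical values): if row 1 of the exchanged DP
# tensor is [2, 0, 2] (two ranks reporting a non-NONE mode value of 2 and one
# rank reporting NONE == 0), the minimum is 0, so every rank falls back to
# NONE for this step.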


@contextmanager
def _torch_cuda_wrapper():

    class _EventPlaceholder:

        def __init__(self, *args, **kwargs) -> None:
            self.record = lambda: None
            self.synchronize = lambda: None

    class _StreamPlaceholder:

        def __init__(self, *args, **kwargs) -> None:
            pass

    try:
        # Replace CUDA APIs with NPU APIs; this should work by default.
        torch.Event = torch.npu.Event
        torch.cuda.Event = torch.npu.Event
        torch.cuda.Stream = torch.npu.Stream
        torch.cuda.default_stream = torch.npu.default_stream
        torch.cuda.current_stream = torch.npu.current_stream
        torch.cuda.stream = torch.npu.stream
        torch.cuda.synchronize = torch.npu.synchronize
        torch.cuda.mem_get_info = torch.npu.mem_get_info
        yield
    except Exception as e:
        torch.cuda.Event = _EventPlaceholder
        torch.cuda.Stream = _StreamPlaceholder
        torch.cuda.default_stream = _StreamPlaceholder
        torch.cuda.current_stream = _StreamPlaceholder
        torch.cuda.stream = _StreamPlaceholder
        torch.cuda.synchronize = _StreamPlaceholder
        torch.cuda.mem_get_info = _StreamPlaceholder
        raise RuntimeError(f"NPUModelRunner init failed, error is {e}")
    finally:
        # If anything goes wrong, just patch it with a placeholder.
        torch.cuda.Event = _EventPlaceholder
        torch.cuda.Stream = torch.npu.Stream
        torch.cuda.default_stream = torch.npu.default_stream
        torch.cuda.current_stream = torch.npu.current_stream
        torch.cuda.stream = torch.npu.stream
        torch.cuda.synchronize = torch.npu.synchronize
        torch.cuda.mem_get_info = torch.npu.mem_get_info


# TODO: This method will be removed subsequently and implemented in platform.
@contextmanager
def _replace_gpu_model_runner_function_wrapper(target_module_name):
    try:
        target_module = sys.modules[target_module_name]
        setattr(target_module, "graph_capture", graph_capture)
        yield
    finally:
        setattr(target_module, "graph_capture", graph_capture)