from typing import Optional, Union

[V1] MTP supports torchair (#2145)
### What this PR does / why we need it?
Support MTP with:
- [x] V0 Scheduler
- [x] TorchAir
- [x] Single DP
- [x] Multi DP
- [x] Disaggregated PD
Known issues:
- [ ] The V1 Scheduler (chunked prefill) is not supported yet; support will
land in a few weeks.
- [ ] vllm v0.10.0 does not support metrics with `DP > 1` right now, so
lines 171-175 in `vllm/vllm/v1/metrics/loggers.py` need to be commented out:
```
if (len(self.engine_indexes) > 1
        and vllm_config.speculative_config is not None):
    raise NotImplementedError("Prometheus metrics with Spec Decoding "
                              "with >1 EngineCore per AsyncLLM is not "
                              "supported yet.")
```
To start an online server with torchair enabled, here is an example:
```
python -m vllm.entrypoints.openai.api_server \
    --model="/weights/DeepSeek-R1_w8a8/" \
    --trust-remote-code \
    --max-model-len 40000 \
    --tensor-parallel-size 4 \
    --data_parallel_size 4 \
    --max-num-seqs 16 \
    --no-enable-prefix-caching \
    --enable_expert_parallel \
    --served-model-name deepseekr1 \
    --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
    --quantization ascend \
    --host 0.0.0.0 \
    --port 1234 \
    --additional-config '{"ascend_scheduler_config":{"enabled":true,"enable_chunked_prefill":false},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]},"enable_weight_nz_layout":true}' \
    --gpu_memory_utilization 0.9
```
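Once the server is up, a quick sanity check can be made against the OpenAI-compatible completions endpoint. A minimal sketch, assuming the default `/v1/completions` route and the `--served-model-name`/`--port` values from the command above (adjust to your deployment):
```
import requests

# Smoke test against the server started above; host, port and model name are
# the values used in the example command, change them if yours differ.
resp = requests.post(
    "http://127.0.0.1:1234/v1/completions",
    json={
        "model": "deepseekr1",  # matches --served-model-name
        "prompt": "The capital of France is",
        "max_tokens": 16,
        "temperature": 0,
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])
```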
Offline example with torchair enabled:
```
from vllm import LLM, SamplingParams

prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=16, temperature=0)
# Create an LLM.
llm = LLM(
    model="/home/data/DeepSeek-R1_w8a8/",
    tensor_parallel_size=16,
    max_num_seqs=16,
    gpu_memory_utilization=0.9,
    distributed_executor_backend="mp",
    enable_expert_parallel=True,
    speculative_config={
        "method": "deepseek_mtp",
        "num_speculative_tokens": 1,
    },
    trust_remote_code=True,
    enforce_eager=False,
    max_model_len=2000,
    additional_config={
        'torchair_graph_config': {
            'enabled': True,
            "graph_batch_sizes": [16],
            'enable_multistream_shared_expert': False,
        },
        "ascend_scheduler_config": {
            "enabled": True
        },
        # 'expert_tensor_parallel_size': 16,
    },
)
# Generate texts from the prompts.
# llm.start_profile()
outputs = llm.generate(prompts, sampling_params)
# llm.stop_profile()
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
- vLLM version: v0.10.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/302962e806e9820643ae25987e8e38ed035e05d3
---------
Signed-off-by: xuyexiong <xuyexiong@huawei.com>
import torch
import torch.nn as nn
from vllm.config import CUDAGraphMode
from vllm.distributed import get_pcp_group
from vllm.forward_context import get_forward_context
from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.spec_decode.eagle import PADDING_SLOT_ID

from vllm_ascend.ascend_forward_context import set_ascend_forward_context

[Feat] Support MTP to running in full graph mode (#3892)
### What this PR does / why we need it?
Currently, the MTP model still runs in eager mode even when full graph mode is
enabled. This PR adapts MTP to full graph capture and execution: when the graph
mode is set to "FULL_DECODE_ONLY", MTP runs as a full graph to improve
performance.
The changes, covering both the disable_padded_drafter_batch=True and
disable_padded_drafter_batch=False cases, include:
1. Add _mtp_graph_params in acl_graph.py to isolate the main model's data from
the MTP data.
2. Pad some metadata in mla_v1.py when in full graph mode.
3. Fix the essential data addresses that are used in model.forward.
4. Adapt to the aclgraph capture framework:
   1) Rebuild the MTP model with ACLGraphWrapper.
   2) Add common attn metadata when capture starts in the MTP dummy_run.
   3) Add common attn metadata updates in MTP.
   4) Adapt data updates when num_speculative_tokens > 1.
5. Add a patch for MTP to adapt to vllm v0.11.0.
Existing issues:
1. When disable_padded_drafter_batch=True and running in full graph mode, the
data of the first-round requests in MTP is abnormal. We still need to identify
the cause.
2. When disable_padded_drafter_batch=False and running in full graph mode, the
acceptance rate of the second and third tokens decreases (for example, with
num_speculative_tokens=3 the acceptance rate of the first token is 90%, the
second drops to about 50% from 60%, and the third to about 20% from 30%). The
reason is that the data processed after the model runs does not match. This
comes from another PR: it works fine in eager and PIECEWISE mode but misbehaves
in full graph mode. Once we have a solution, we will submit a bugfix.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.11.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/2918c1b49c88c29783c86f78d2c4221cb9622379
---------
Signed-off-by: anon189Ty <Stari_Falcon@outlook.com>
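For reference, a minimal offline sketch of opting the model (and hence the MTP drafter described above) into the full-graph path. This assumes the upstream vLLM `compilation_config` field `cudagraph_mode` accepts the "FULL_DECODE_ONLY" mode named in this PR; the exact knob and accepted values may differ between vLLM and vllm-ascend releases, so treat it as an illustration rather than the canonical switch:
```
from vllm import LLM

# Hypothetical sketch: enable the FULL_DECODE_ONLY graph mode together with
# deepseek_mtp speculative decoding; the config keys are assumptions, not a spec.
llm = LLM(
    model="/weights/DeepSeek-R1_w8a8/",  # reuse the path from the examples above
    trust_remote_code=True,
    speculative_config={
        "method": "deepseek_mtp",
        "num_speculative_tokens": 1,
    },
    compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY"},
)
```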
from vllm_ascend.attention.attention_v1 import AscendAttentionState
from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
from vllm_ascend.compilation.acl_graph import ACLGraphWrapper
from vllm_ascend.ops.rotary_embedding import get_cos_and_sin_mla
from vllm_ascend.spec_decode.eagle_proposer import EagleProposer
from vllm_ascend.utils import ProfileExecuteDuration, lmhead_tp_enable
class MtpProposer(EagleProposer):

    # TODO: Find out why ModelRunner does not need this explicit typing?
    model: Union[nn.Module, ACLGraphWrapper]

    @torch.inference_mode()
    def dummy_run(self,
                  num_tokens: int,
                  with_prefill: bool = False,
                  in_graph_capturing: bool = False,
                  num_reqs: int = 0,
                  num_tokens_across_dp=None,
                  aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
                  batch_descriptor=None,
                  dummy_compute_logits=lambda hidden_states: None,
                  is_profile=False) -> None:
        (
            num_tokens,
            num_tokens_across_dp,
            with_prefill,
        ) = self.runner._sync_metadata_across_dp(num_tokens, with_prefill)
        if not self.use_cuda_graph:
            # There is synchronization between MTP steps when aclgraph is
            # enabled, so disable aclgraph when async scheduling is used to
            # avoid the synchronization overhead.
            # NOTE: we need to set aclgraph_runtime_mode to CUDAGraphMode.NONE
            # in both dummy_run and _propose.
            aclgraph_runtime_mode = CUDAGraphMode.NONE
        if aclgraph_runtime_mode == CUDAGraphMode.FULL:
            if len(self.runner.attn_groups) > 0:
                num_computed_tokens_cpu = (
                    self.runner.input_batch.
                    num_computed_tokens_cpu_tensor[:num_reqs])
                common_attn_metadata = AscendCommonAttentionMetadata(
                    query_start_loc=self.runner.query_start_loc.gpu[:num_reqs +
                                                                    1],
                    query_start_loc_cpu=self.runner.query_start_loc.
                    cpu[:num_reqs + 1],
                    seq_lens_cpu=self.runner.seq_lens.cpu,
                    seq_lens=self.runner.seq_lens.gpu[:num_reqs],
                    num_reqs=num_reqs,
                    num_actual_tokens=num_tokens,
                    num_input_tokens=num_tokens,
                    max_query_len=self.num_speculative_tokens + 1,
                    num_computed_tokens_cpu=num_computed_tokens_cpu,
                    actual_seq_lengths_q=self.runner.actual_seq_lengths_q,
                    block_table_tensor=self.runner.input_batch.block_table[0].
                    get_device_tensor(),
                    slot_mapping=self.runner.input_batch.block_table[0].
                    slot_mapping.gpu,
                    positions=self.runner.positions.gpu,
                    attn_state=self.runner.attn_state,
                    decode_token_per_req=self.runner.decode_token_per_req,
                    max_seq_len=0)
                if self.pcp_size * self.dcp_size > 1:
                    # update long_seq related params and flatten block_table
                    common_attn_metadata.prefill_context_parallel_metadata = \
                        self.runner.pcp_manager.long_seq_metadata
                    common_attn_metadata.block_table_tensor = \
                        self.runner.input_batch.block_table[0].get_device_tensor()[
                            :num_reqs * self.decode_threshold]
                builder = self.runner.attn_groups[0][0].get_metadata_builder()
                # `AscendAttentionState.SpecDecoding` is only designed for MLA;
                # `AscendAttentionState.ChunkedPrefill` is used for self-attention.
                attn_state = AscendAttentionState.SpecDecoding if self.vllm_config.model_config.use_mla else AscendAttentionState.ChunkedPrefill
                attn_metadata_mtp = builder.build_for_graph_capture(
                    common_attn_metadata, attn_state)
                attn_metadata = {}
                for layer_name in self.attn_layer_names:
                    attn_metadata[layer_name] = attn_metadata_mtp
            else:
                attn_metadata = None
        else:
            attn_metadata = None

        input_ids = self.input_ids[:num_tokens]
        positions = self.positions[:num_tokens]
        previous_hidden_states = self.hidden_states[:num_tokens]
        for i in range(self.num_speculative_tokens):
            if i > 0 and not in_graph_capturing and aclgraph_runtime_mode == CUDAGraphMode.FULL:
                aclgraph_runtime_mode = CUDAGraphMode.NONE
            with set_ascend_forward_context(
                    attn_metadata,
                    self.vllm_config,
                    num_tokens=num_tokens,
                    num_tokens_across_dp=num_tokens_across_dp,
                    num_actual_tokens=0,
                    aclgraph_runtime_mode=aclgraph_runtime_mode,
                    batch_descriptor=batch_descriptor,
                    is_draft_model=True,
                    in_profile_run=is_profile):
                previous_hidden_states, positions = self.maybe_pad_and_reduce(
                    previous_hidden_states, positions)
                self.model(input_ids=input_ids,
                           positions=positions,
                           hidden_states=previous_hidden_states)
                forward_context = get_forward_context()
                if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL and \
                        not forward_context.capturing and not self.use_sparse:
                    self._update_full_graph_params(forward_context, num_tokens)

                previous_hidden_states, positions, _ = self.maybe_all_gather_and_unpad(
                    previous_hidden_states, positions)
                dummy_compute_logits(previous_hidden_states)
            if with_prefill:
                break

    def _propose(
        self,
        # [num_tokens]
        target_token_ids: torch.Tensor,
        # [num_tokens] or [3, num_tokens] when M-RoPE is enabled
        target_positions: torch.Tensor,
        # [num_tokens, hidden_size]
        target_hidden_states: torch.Tensor,
        # [batch_size]
        next_token_ids: torch.Tensor,
        last_token_indices: Optional[torch.Tensor],
        common_attn_metadata: CommonAttentionMetadata,
        sampling_metadata: SamplingMetadata,
        mm_embed_inputs: Optional[tuple[list[torch.Tensor],
                                        torch.Tensor]] = None,
        req_scheduled_tokens=None,
        long_seq_metadata=None,
        num_prefill_reqs=0,
        num_decode_reqs=0,
        scheduler_output: SchedulerOutput = None,
        num_scheduled_tokens: int = 0,
    ) -> torch.Tensor:
        num_tokens = target_token_ids.shape[0]
        batch_size = next_token_ids.shape[0]

        if last_token_indices is None:
            last_token_indices = common_attn_metadata.query_start_loc[1:] - 1

        if self.method == "eagle3":
            assert isinstance(self.model, Eagle3LlamaForCausalLM)
            target_hidden_states = self.model.combine_hidden_states(
                target_hidden_states)
            assert target_hidden_states.shape[-1] == self.hidden_size

        # Shift the input ids by one token.
        # E.g., [a1, b1, b2, c1, c2, c3] -> [b1, b2, c1, c2, c3, c3]
        self.input_ids[:num_tokens - 1] = target_token_ids[1:]
        # Replace the last token with the next token.
        # E.g., [b1, b2, c1, c2, c3, c3] -> [a2, b2, b3, c2, c3, c4]
        self.input_ids[last_token_indices] = next_token_ids

        # update pcp related params
        if self.pcp_size * self.dcp_size > 1:
            assert long_seq_metadata is not None
            common_attn_metadata.prefill_context_parallel_metadata = long_seq_metadata
            ori_last_token_indices = last_token_indices.clone()
            query_lens_d = self.runner.query_lens[:num_decode_reqs]
            if self.pcp_size > 1:
                # 1. preprocess decode/prefill input_ids & target_hidden_states
                # decode input_ids: keep unchanged
                # decode target_hidden_states: remove padding
                # prefill input_ids: add padding and pcp split
                # prefill target_hidden_states: pcp split
                num_tokens_d = query_lens_d.sum().item()
                num_tokens_d_padded = num_tokens_d * self.pcp_size
                input_ids_d = self.input_ids[:num_tokens_d]
                input_ids_p = self.input_ids[num_tokens_d:num_tokens]
                target_hidden_states_d_padded = \
                    target_hidden_states[:num_tokens_d_padded]
                if num_tokens_d:
                    # remove padding (from pcp all-gather) in decode part
                    mask_start_loc = torch.cat([
                        torch.tensor([0], dtype=torch.int32),
                        torch.cumsum(query_lens_d * self.pcp_size, dim=0)[:-1]
                    ])
                    mask_len = query_lens_d
                    mask = []
                    for req_id in range(num_decode_reqs):
                        mask += list(
                            range(mask_start_loc[req_id],
                                  mask_start_loc[req_id] + mask_len[req_id]))
                    target_hidden_states_d = target_hidden_states_d_padded[mask]
                else:
                    target_hidden_states_d = target_hidden_states_d_padded
                target_hidden_states_p = target_hidden_states[num_tokens_d_padded:]
                req_scheduled_tokens_p = {}
                for i, req_id in enumerate(self.runner.input_batch.req_ids):
                    if i >= num_decode_reqs:
                        req_scheduled_tokens_p[req_id] = \
                            req_scheduled_tokens[req_id]
                (num_tokens_p, input_ids_p, target_hidden_states_p,
                 max_query_len_p, seq_lens_p, cu_num_tokens_p) = \
                    self._split_pcp_input(
                        req_scheduled_tokens_p, input_ids_p, target_hidden_states_p)
                num_tokens = num_tokens_d + num_tokens_p
                target_positions = target_positions[:num_tokens]
                self.input_ids[:num_tokens].copy_(
                    torch.cat([input_ids_d, input_ids_p], dim=0))
                target_hidden_states = torch.cat(
                    [target_hidden_states_d, target_hidden_states_p], dim=0)
                # 2. update sample_indices according to main model
                if num_decode_reqs:
                    last_token_indices[:num_decode_reqs] = \
                        self.runner.logits_indices[last_token_indices[:num_decode_reqs]]
                if num_prefill_reqs:
                    last_token_indices[-num_prefill_reqs:] = \
                        self.runner.logits_indices[-num_prefill_reqs:]
                # 3. update attn_metadata params that may be influenced by pcp
                common_attn_metadata.num_actual_tokens = num_tokens
                common_attn_metadata.max_query_len = max(
                    self.decode_threshold, max_query_len_p)
                common_attn_metadata.seq_lens[-num_prefill_reqs:] = seq_lens_p
                common_attn_metadata.seq_lens_cpu[
                    -num_prefill_reqs:] = seq_lens_p
                query_start_loc_p = cu_num_tokens_p[1:] + \
                    common_attn_metadata.query_start_loc[num_decode_reqs].item()
                common_attn_metadata.query_start_loc[-num_prefill_reqs:] = \
                    query_start_loc_p
                common_attn_metadata.query_start_loc_cpu[-num_prefill_reqs:] = \
                    query_start_loc_p

        assert self.runner is not None

        # Note(qcs): We may need to refactor this check logic.
        if self.use_cuda_graph and num_scheduled_tokens <= self.runner.cudagraph_batch_sizes[
                -1]:
[Feat] Support MTP to running in full graph mode (#3892)
### What this PR does / why we need it?
Currently, the MTP model still runs in eager in full graph mode. This PR
adapts the MTP with the full graph capture and execution. When the graph
mode is set to "FULL_DECODE_ONLY", the MTP will run in full-graph to
improve the performance.
The change in both disable_padded_drafter_batch is True and False case
include:
1. Add _mtp_graph_params in acl_graph.py to isolate the data of main
model and the data of MTP.
2. Padding some metadata in mla_v1.py when in fullgraph mode.
3. Fixed the essential data address that will be used in model.forward.
4. Adapted according to the aclgraph capture framwork:
1). Rebuild MTP model with ACLGraphWrapper.
2). Add common attn metadata when start capture in MTP dummy_run.
3). Add common attn metadata update in MTP.
4). Addapted data update when num_speculative_tokens > 1.
5. Add a patch of MTP to adapt vllm v0.11.0.
Existing Issues:
1. When disable_padded_drafter_batch=True and running in FullGraph mode,
the data of the first-round requests in MTP is abnormal. We need to
identify the cause subsequently.
2. When disable_padded_drafter_batch=False and running in FullGraph
mode, the acceptance rate of the second and third tokens will decrease
(For example, if we set the num_speculative_tokens=3, the acceptance
rate of first token is 90%, the second is only 50% lower than 60%, the
third is only 20% lower than 30%). The reason is that the data processed
after the model runs does not match. This is a problem from another PR.
It works fine in eager and PIECEWISE mode, but has problem in FullGraph
mode. Once we have a solution, we will submit a bugfix.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.11.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/2918c1b49c88c29783c86f78d2c4221cb9622379
---------
Signed-off-by: anon189Ty <Stari_Falcon@outlook.com>
2025-11-20 20:34:54 +08:00
|
|
|
|
num_input_tokens = self.vllm_config.pad_for_cudagraph(
|
|
|
|
|
|
num_scheduled_tokens)
|
[V1] MTP supports torchair (#2145)
### What this PR does / why we need it?
Support MTP with:
- [x] V0 Scheduler
- [x] TorchAir
- [x] Single DP
- [x] Multi DP
- [x] Disaggregate PD
Known issues:
- [ ] Not support V1 Scheduler (chunked prefill), will be supported in a
few weeks
- [ ] vllm v0.10.0 does not support metrics with `DP > 1` right now,
need to comment out the line 171-175 in file
`vllm/vllm/v1/metrics/loggers.py`
```
if (len(self.engine_indexes) > 1
and vllm_config.speculative_config is not None):
raise NotImplementedError("Prometheus metrics with Spec Decoding "
"with >1 EngineCore per AsyncLLM is not "
"supported yet.")
```
To start an online server with torchair enabled, here is an example:
```
python -m vllm.entrypoints.openai.api_server \
--model="/weights/DeepSeek-R1_w8a8/" \
--trust-remote-code \
--max-model-len 40000 \
--tensor-parallel-size 4 \
--data_parallel_size 4 \
--max-num-seqs 16 \
--no-enable-prefix-caching \
--enable_expert_parallel \
--served-model-name deepseekr1 \
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
--quantization ascend \
--host 0.0.0.0 \
--port 1234 \
--additional-config '{"ascend_scheduler_config":{"enabled":true,"enable_chunked_prefill":false},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]},"enable_weight_nz_layout":true}' \
--gpu_memory_utilization 0.9
```
offline example with torchair enabled
```
from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=16, temperature=0)
# Create an LLM.
llm = LLM(
model="/home/data/DeepSeek-R1_w8a8/",
tensor_parallel_size=16,
max_num_seqs=16,
gpu_memory_utilization=0.9,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method": "deepseek_mtp",
"num_speculative_tokens": 1,
},
trust_remote_code=True,
enforce_eager=False,
max_model_len=2000,
additional_config = {
'torchair_graph_config': {
'enabled': True,
"graph_batch_sizes": [16],
'enable_multistream_shared_expert': False,
},
"ascend_scheduler_config": {
"enabled": True
},
# 'expert_tensor_parallel_size': 16,
}
)
# Generate texts from the prompts.
# llm.start_profile()
outputs = llm.generate(prompts, sampling_params)
# llm.stop_profile()
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
- vLLM version: v0.10.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/302962e806e9820643ae25987e8e38ed035e05d3
---------
Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-08-06 19:37:43 +08:00
|
|
|
|
else:
|
2025-09-18 14:05:33 +08:00
|
|
|
|
# Eager mode, no padding needed
|
[V1] MTP supports torchair (#2145)
### What this PR does / why we need it?
Support MTP with:
- [x] V0 Scheduler
- [x] TorchAir
- [x] Single DP
- [x] Multi DP
- [x] Disaggregate PD
Known issues:
- [ ] Not support V1 Scheduler (chunked prefill), will be supported in a
few weeks
- [ ] vllm v0.10.0 does not support metrics with `DP > 1` right now,
need to comment out the line 171-175 in file
`vllm/vllm/v1/metrics/loggers.py`
```
if (len(self.engine_indexes) > 1
and vllm_config.speculative_config is not None):
raise NotImplementedError("Prometheus metrics with Spec Decoding "
"with >1 EngineCore per AsyncLLM is not "
"supported yet.")
```
To start an online server with torchair enabled, here is an example:
```
python -m vllm.entrypoints.openai.api_server \
--model="/weights/DeepSeek-R1_w8a8/" \
--trust-remote-code \
--max-model-len 40000 \
--tensor-parallel-size 4 \
--data_parallel_size 4 \
--max-num-seqs 16 \
--no-enable-prefix-caching \
--enable_expert_parallel \
--served-model-name deepseekr1 \
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
--quantization ascend \
--host 0.0.0.0 \
--port 1234 \
--additional-config '{"ascend_scheduler_config":{"enabled":true,"enable_chunked_prefill":false},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]},"enable_weight_nz_layout":true}' \
--gpu_memory_utilization 0.9
```
offline example with torchair enabled
```
from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=16, temperature=0)
# Create an LLM.
llm = LLM(
model="/home/data/DeepSeek-R1_w8a8/",
tensor_parallel_size=16,
max_num_seqs=16,
gpu_memory_utilization=0.9,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method": "deepseek_mtp",
"num_speculative_tokens": 1,
},
trust_remote_code=True,
enforce_eager=False,
max_model_len=2000,
additional_config = {
'torchair_graph_config': {
'enabled': True,
"graph_batch_sizes": [16],
'enable_multistream_shared_expert': False,
},
"ascend_scheduler_config": {
"enabled": True
},
# 'expert_tensor_parallel_size': 16,
}
)
# Generate texts from the prompts.
# llm.start_profile()
outputs = llm.generate(prompts, sampling_params)
# llm.stop_profile()
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
- vLLM version: v0.10.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/302962e806e9820643ae25987e8e38ed035e05d3
---------
Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-08-06 19:37:43 +08:00
|
|
|
|
num_input_tokens = num_tokens
|
2025-05-30 08:59:58 +08:00
|
|
|
|
|
2025-10-30 16:53:05 +08:00
|
|
|
|
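        # Illustrative padding example, assuming cudagraph_batch_sizes=[4, 8, 16]:
        # 5 scheduled tokens are padded up to num_input_tokens=8 so a captured graph
        # can be replayed, while 20 tokens exceed the largest capture size and take
        # the eager branch above with no padding.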
        # copy inputs to buffer for cudagraph
        self.positions[:num_tokens] = target_positions
        self.hidden_states[:num_tokens] = target_hidden_states

        # eager/acl piecewise mode needs to update num_tokens_across_dp
        (num_input_tokens, num_tokens_across_dp,
         with_prefill) = self.runner._sync_metadata_across_dp(
             num_input_tokens, self.runner.with_prefill)
        # Enabling shared_expert_dp together with MTP FULL graph may cause
        # accuracy issues.
        if scheduler_output and not self.enable_shared_expert_dp:
            max_query_len = common_attn_metadata.max_query_len
            uniform_decode = (max_query_len in list(
                range(1, self.num_speculative_tokens +
                      2))) and (scheduler_output.total_num_scheduled_tokens
                                == self.runner.input_batch.num_reqs *
                                (self.num_speculative_tokens + 1))
        else:
            uniform_decode = False
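        # `uniform_decode` above is True only when every scheduled request is
        # decoding with the same query length (num_speculative_tokens + 1 tokens
        # each), i.e. the uniform batch shape that full-graph capture expects.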
        has_lora = len(self.runner.input_batch.lora_id_to_lora_request) > 0
        aclgraph_runtime_mode, batch_descriptor = \
            self.runner.cudagraph_dispatcher.dispatch(
                num_tokens=num_input_tokens,
                uniform_decode=uniform_decode,
                has_lora=has_lora)
        if not self.use_cuda_graph:
            # There is synchronization between mtp steps when aclgraph is enabled,
            # so disable aclgraph when using async scheduling to avoid the
            # synchronization overhead.
            # NOTE: we need to set aclgraph_runtime_mode to CUDAGraphMode.NONE in
            # both dummy_run and _propose.
            aclgraph_runtime_mode = CUDAGraphMode.NONE
        if self.vllm_config.compilation_config.cudagraph_mode.has_full_cudagraphs(
        ) and aclgraph_runtime_mode == CUDAGraphMode.FULL:
            graph_pad_size = num_input_tokens
        else:
            graph_pad_size = -1
        # If full graph is used and disable_padded_drafter_batch=True, we need to
        # update graph_pad_size in common_attn_metadata so that the builder pads
        # the required elements.
        common_attn_metadata.graph_pad_size = graph_pad_size
        common_attn_metadata.num_input_tokens = num_input_tokens
        builder = self.runner.attn_groups[0][0].get_metadata_builder()
        attn_metadata_mtp = builder.build(0, common_attn_metadata,
                                          self.runner.get_model())
        attn_metadata = {}
        for layer_name in self.attn_layer_names:
            attn_metadata[layer_name] = attn_metadata_mtp
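        # Each speculative step below runs one draft forward, greedily samples the
        # next draft token per request, and (when num_speculative_tokens > 1)
        # rewrites positions, slot mappings and sequence lengths in place so the
        # sampled tokens can be fed back into the next step.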
        for step in range(self.num_speculative_tokens):
            with set_ascend_forward_context(
                    attn_metadata,
                    self.vllm_config,
                    num_tokens=num_input_tokens,
                    num_tokens_across_dp=num_tokens_across_dp,
                    aclgraph_runtime_mode=aclgraph_runtime_mode,
                    batch_descriptor=batch_descriptor,
                    num_actual_tokens=num_tokens,
                    is_draft_model=True):
                with ProfileExecuteDuration().capture_async('mtp_forward'):
                    model_kwargs = {}
                    model_kwargs["attn_metadata"] = attn_metadata
                    input_ids = self.input_ids[:num_input_tokens]
                    positions = self.positions[:num_input_tokens]
                    hidden_states = self.hidden_states[:num_input_tokens]

                    hidden_states, positions = self.maybe_pad_and_reduce(
                        hidden_states, positions)

                    hidden_states = self.model(input_ids=input_ids,
                                               positions=positions,
                                               hidden_states=hidden_states)
                    forward_context = get_forward_context()
                    if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL and not self.use_sparse:
                        self._update_full_graph_params(forward_context,
                                                       num_input_tokens)

                    hidden_states, positions, _ = self.maybe_all_gather_and_unpad(
                        hidden_states, positions)
                num_indices = last_token_indices.shape[0]
                if lmhead_tp_enable():
                    max_num_reqs_across_dp = (
                        self.vllm_config.scheduler_config.max_num_seqs *
                        self.runner.uniform_decode_query_len)
                    last_token_indices = nn.functional.pad(
                        last_token_indices,
                        (0, max_num_reqs_across_dp - num_indices))
                if self.pcp_size > 1 and step == 0:
                    # remove graph padding before all_gather
                    hidden_states = hidden_states[:num_tokens]
                    hidden_states = get_pcp_group().all_gather(hidden_states, 0)
                    hidden_states = torch.index_select(
                        hidden_states, 0, self.runner.pcp_manager.
                        pcp_allgather_restore_idx.gpu[:hidden_states.shape[0]])
                sample_hidden_states = hidden_states[last_token_indices]
                logits = self.model.compute_logits(sample_hidden_states)
                if lmhead_tp_enable() and num_indices < logits.shape[0]:
                    logits = logits[:num_indices]
                    last_token_indices = last_token_indices[:num_indices]
                draft_token_ids = logits.argmax(dim=-1)

                if self.num_speculative_tokens == 1:
                    # [batch_size, 1]
                    return draft_token_ids.view(-1, 1)
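                # Draft tokens above are picked greedily (argmax over the draft
                # logits); per-request sampling parameters are applied later when
                # the main model verifies these proposals, not here.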
                if step == 0:
                    draft_token_ids_list = [draft_token_ids]
                else:
                    draft_token_ids_list.append(draft_token_ids)
                # prepare next mtp inputs
                # mtp>1: prefill requests skip the remaining steps (their draft
                # token is simply replicated), while decode only skips the last
                # loop iteration.
                if with_prefill:
                    for _ in range(self.num_speculative_tokens - 1):
                        draft_token_ids_list.append(draft_token_ids)
                if step == self.num_speculative_tokens - 1 or with_prefill:
                    break
                attn_metadata_i = attn_metadata[self.attn_layer_names[0]]

                if step == 0:
                    positions = target_positions[last_token_indices]
                    hidden_states = hidden_states[last_token_indices]
                    slot_mapping = attn_metadata_i.slot_mapping[last_token_indices]
                    attn_metadata_i.slot_mapping.fill_(-1)
                    attn_metadata_i.query_start_loc = self.arange[:batch_size + 1]
                    last_token_indices = self.arange[:batch_size]
                    if getattr(attn_metadata_i, "num_decode_tokens", 0):
                        attn_metadata_i.num_decode_tokens = batch_size
                    if self.pcp_size * self.dcp_size > 1:
                        positions = target_positions[ori_last_token_indices]
                        # For pcp/dcp, tokens are split across different cp ranks,
                        # so we cannot simply update slot_mapping by += 1.
                        # Instead, we pre-allocate mtp slot_mapping in model_runner
                        # (_generate_pcp_mtp_input), and use updated slot_indices
                        # to get the corresponding slot_mapping in each step.
                        num_reject_tokens = torch.tensor(
                            self.runner.pcp_manager.cu_num_tokens_pcp_full,
                            dtype=torch.int32).to(
                                self.device) - ori_last_token_indices - 1
                        num_accept_tokens = \
                            query_lens_d.to(self.device) - num_reject_tokens
                        ori_seq_len = attn_metadata_i.seq_lens
                        mtp_slot_mapping = self.runner.pcp_manager.mtp_slot_pad

                        # slot_mapping index base offset:
                        # scheduled tokens + pre-allocated mtp tokens + accepted tokens
                        slot_idx_base = (
                            torch.cat([
                                torch.tensor(
                                    [0], dtype=torch.int32, device=self.device),
                                (torch.cumsum(query_lens_d, dim=0)[:-1] *
                                 self.pcp_size).to(self.device)
                            ]) +
                            torch.arange(num_decode_reqs, device=self.device) *
                            (self.num_speculative_tokens - 1) * self.pcp_size +
                            (num_accept_tokens - 1) * self.pcp_size)
                        slot_indices_list = []
                        for req_id in range(num_decode_reqs):
                            slot_indices_list.append(
                                torch.arange(slot_idx_base[req_id],
                                             slot_idx_base[req_id] + self.pcp_size,
                                             device=self.device))
                        slot_indices = torch.cat(slot_indices_list, dim=0)
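                        # Worked example (illustrative) of the offset above, with
                        # pcp_size=2, num_speculative_tokens=2, query_lens_d=[2, 1]
                        # and num_accept_tokens=[2, 1]:
                        #   per-request base        [0, 4]
                        #   + req_idx*(k-1)*pcp  -> [0, 6]
                        #   + (accept-1)*pcp     -> [2, 6]
                        # so request 0 reads pre-allocated slots [2, 3] and
                        # request 1 reads slots [6, 7].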
                        # fold block_table (restore it to its original size before
                        # it was flattened)
                        block_indices = torch.cat([
                            torch.tensor([0], dtype=torch.int32),
                            torch.cumsum(query_lens_d, dim=0)[:-1]
                        ])
                        attn_metadata_i.decode.block_table[:batch_size] = \
                            attn_metadata_i.decode.block_table[block_indices]
                        attn_metadata_i.decode.block_table = \
                            attn_metadata_i.decode.block_table[:batch_size]
                input_ids = draft_token_ids_list[-1].int()
                positions += 1

                decode_metadata = getattr(attn_metadata_i, "decode", None)
                prefill_metadata = getattr(attn_metadata_i, "prefill", None)
                # When disable_padded_drafter_batch=False, these params probably
                # should not be updated.
                if decode_metadata is not None and \
                        (self.speculative_config.disable_padded_drafter_batch or
                         aclgraph_runtime_mode != CUDAGraphMode.FULL):
                    decode_metadata.actual_seq_lengths_q = self.arange_cpu[
                        1:batch_size + 1].tolist()
                    if aclgraph_runtime_mode == CUDAGraphMode.FULL:
                        decode_metadata.actual_seq_lengths_q = \
                            builder.pad_actual_seq_len_q_mtp_disable_pad(
                                graph_pad_size - batch_size,
                                batch_size,
                                decode_metadata.actual_seq_lengths_q)
                    decode_metadata.cos, decode_metadata.sin = get_cos_and_sin_mla(
                        positions[:batch_size])
                # NOTE(woosuk): We should handle the case where the draft model
                # generates tokens beyond the max model length. Since it is complex
                # to remove such requests from the batch, we keep them in the batch
                # but adjust the position ids and slot mappings to avoid the
                # out-of-range access during the model execution. The draft tokens
                # generated with this adjustment should be ignored.
                exceeds_max_model_len = positions[:batch_size] >= \
                    self.runner.model_config.max_model_len
                # Mask out the position ids that exceed the max model length.
                # Otherwise, we may get an out-of-range error in RoPE.
                clamped_positions = torch.where(exceeds_max_model_len, 0,
                                                positions[:batch_size])
                # Increment the sequence lengths.
                # This is an out-of-place operation to avoid modifying the original
                # tensor when async_scheduling is enabled.
                attn_metadata_i.seq_lens = attn_metadata_i.seq_lens + 1
                # For the requests that exceed the max model length, we set the
                # sequence length to 1 to minimize their overheads in attention.
                exceeds_mask = attn_metadata_i.seq_lens[:batch_size] > \
                    self.runner.model_config.max_model_len
                attn_metadata_i.seq_lens[:batch_size].masked_fill_(exceeds_mask, 1)
                # Mask out the slot mappings that exceed the max model length.
                # Otherwise, the KV cache would be inadvertently updated with the
                # padding tokens.
                slot_mapping += 1
                if self.pcp_size > 1:
                    exceeds_max_model_len = exceeds_max_model_len.repeat_interleave(
                        slot_mapping.size(0) // exceeds_max_model_len.size(0))
                slot_mapping.masked_fill_(exceeds_max_model_len, PADDING_SLOT_ID)
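                # Without context parallel, the next draft token simply occupies
                # the next KV slot, hence slot_mapping += 1 above; the pcp/dcp
                # branch below instead re-reads slots from the pre-allocated mtp
                # slot mapping via slot_indices.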
                # copy inputs to buffer for cudagraph
                self.input_ids[:batch_size] = input_ids
                self.positions[:batch_size] = clamped_positions
                self.hidden_states[:hidden_states.shape[0]] = hidden_states
                if self.pcp_size * self.dcp_size > 1:
                    # update local seq_len and batch_seq_mask
                    num_computed_tokens_of_pcp_dcp = \
                        self.runner.pcp_manager._get_cp_local_seq_lens(
                            ori_seq_len + step + 1,
                            self.pcp_size,
                            self.dcp_size,
                            self.runner.parallel_config.cp_kv_cache_interleave_size,
                        )
                    cp_seq_len = \
                        num_computed_tokens_of_pcp_dcp[:, self.pcp_rank,
                                                       self.dcp_rank]
                    batch_seq_mask = (cp_seq_len == 0)
                    builder.batch_seq_mask_buf[:batch_seq_mask.shape[0]].copy_(
                        batch_seq_mask, non_blocking=True)
                    batch_seq_mask = \
                        builder.batch_seq_mask_buf[:batch_seq_mask.shape[0]]
                    cp_seq_len = torch.where(cp_seq_len == 0, 1, cp_seq_len)
                    attn_metadata_i.decode.cp_seq_len = cp_seq_len
                    attn_metadata_i.decode.batch_seq_mask = batch_seq_mask
                    # update slot_mapping
                    slot_indices += self.pcp_size
                    slot_mapping = mtp_slot_mapping[slot_indices]
                    attn_metadata_i.slot_mapping[:batch_size *
                                                 self.pcp_size] = slot_mapping
                else:
                    attn_metadata_i.slot_mapping[:batch_size] = slot_mapping
                if self.speculative_config.disable_padded_drafter_batch:
                    self.positions[batch_size:num_input_tokens] = 0
                    self.input_ids[batch_size:num_input_tokens] = 0
                    self.hidden_states[batch_size:num_input_tokens].fill_(0)
                if prefill_metadata is not None:
                    prefill_metadata.seq_lens = attn_metadata_i.seq_lens
                    prefill_metadata.seq_lens_list = \
                        prefill_metadata.seq_lens.tolist()
                    prefill_metadata.context_lens = attn_metadata_i.seq_lens
                    prefill_metadata.input_positions = \
                        self.positions[:num_input_tokens]
                    prefill_metadata.max_seq_lens += 1
                    prefill_metadata.max_seq_lens = min(
                        prefill_metadata.max_seq_lens,
                        self.runner.model_config.max_model_len)
                if decode_metadata is not None:
                    decode_metadata.seq_lens = attn_metadata_i.seq_lens
                    decode_metadata.seq_lens_list = \
                        decode_metadata.seq_lens.tolist()
                    decode_seq_lens_list = decode_metadata.seq_lens_list
                    if aclgraph_runtime_mode == CUDAGraphMode.FULL and \
                            self.speculative_config.disable_padded_drafter_batch:
                        decode_metadata.seq_lens_list = decode_seq_lens_list + [
                            0
                        ] * (graph_pad_size - len(decode_seq_lens_list))
                    decode_metadata.input_positions = \
                        self.positions[:num_input_tokens]
                    decode_metadata.max_seq_lens += 1
                    decode_metadata.max_seq_lens = min(
                        decode_metadata.max_seq_lens,
                        self.runner.model_config.max_model_len)
        # mtp>1: [batch_size, k]
        draft_token_ids = torch.stack(draft_token_ids_list, dim=1)
        return draft_token_ids