[V1] MTP supports torchair (#2145)
### What this PR does / why we need it?
Support MTP with:
- [x] V0 Scheduler
- [x] TorchAir
- [x] Single DP
- [x] Multi DP
- [x] Disaggregate PD
Known issues:
- [ ] The V1 Scheduler (chunked prefill) is not supported yet; support is
planned within a few weeks.
- [ ] vllm v0.10.0 does not support metrics with `DP > 1` right now; as a
workaround, comment out lines 171-175 in the file
`vllm/vllm/v1/metrics/loggers.py`:
```
if (len(self.engine_indexes) > 1
and vllm_config.speculative_config is not None):
raise NotImplementedError("Prometheus metrics with Spec Decoding "
"with >1 EngineCore per AsyncLLM is not "
"supported yet.")
```
To start an online server with torchair enabled, here is an example:
```
python -m vllm.entrypoints.openai.api_server \
--model="/weights/DeepSeek-R1_w8a8/" \
--trust-remote-code \
--max-model-len 40000 \
--tensor-parallel-size 4 \
--data_parallel_size 4 \
--max-num-seqs 16 \
--no-enable-prefix-caching \
--enable_expert_parallel \
--served-model-name deepseekr1 \
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
--quantization ascend \
--host 0.0.0.0 \
--port 1234 \
--additional-config '{"ascend_scheduler_config":{"enabled":true,"enable_chunked_prefill":false},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]},"enable_weight_nz_layout":true}' \
--gpu_memory_utilization 0.9
```
Here is an offline example with torchair enabled:
```
from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=16, temperature=0)
# Create an LLM.
llm = LLM(
model="/home/data/DeepSeek-R1_w8a8/",
tensor_parallel_size=16,
max_num_seqs=16,
gpu_memory_utilization=0.9,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method": "deepseek_mtp",
"num_speculative_tokens": 1,
},
trust_remote_code=True,
enforce_eager=False,
max_model_len=2000,
additional_config = {
'torchair_graph_config': {
'enabled': True,
"graph_batch_sizes": [16],
'enable_multistream_shared_expert': False,
},
"ascend_scheduler_config": {
"enabled": True
},
# 'expert_tensor_parallel_size': 16,
}
)
# Generate texts from the prompts.
# llm.start_profile()
outputs = llm.generate(prompts, sampling_params)
# llm.stop_profile()
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
- vLLM version: v0.10.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/302962e806e9820643ae25987e8e38ed035e05d3
---------
Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-08-06 19:37:43 +08:00
|
|
|
import types
|
|
|
|
|
|
2025-05-30 08:59:58 +08:00
|
|
|
import torch
|
[V1] MTP supports torchair (#2145)
### What this PR does / why we need it?
Support MTP with:
- [x] V0 Scheduler
- [x] TorchAir
- [x] Single DP
- [x] Multi DP
- [x] Disaggregate PD
Known issues:
- [ ] The V1 Scheduler (chunked prefill) is not supported yet; support is
planned within a few weeks.
- [ ] vllm v0.10.0 does not support metrics with `DP > 1` right now; as a
workaround, comment out lines 171-175 in the file
`vllm/vllm/v1/metrics/loggers.py`:
```
if (len(self.engine_indexes) > 1
and vllm_config.speculative_config is not None):
raise NotImplementedError("Prometheus metrics with Spec Decoding "
"with >1 EngineCore per AsyncLLM is not "
"supported yet.")
```
To start an online server with torchair enabled, here is an example:
```
python -m vllm.entrypoints.openai.api_server \
--model="/weights/DeepSeek-R1_w8a8/" \
--trust-remote-code \
--max-model-len 40000 \
--tensor-parallel-size 4 \
--data_parallel_size 4 \
--max-num-seqs 16 \
--no-enable-prefix-caching \
--enable_expert_parallel \
--served-model-name deepseekr1 \
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
--quantization ascend \
--host 0.0.0.0 \
--port 1234 \
--additional-config '{"ascend_scheduler_config":{"enabled":true,"enable_chunked_prefill":false},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]},"enable_weight_nz_layout":true}' \
--gpu_memory_utilization 0.9
```
Here is an offline example with torchair enabled:
```
from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=16, temperature=0)
# Create an LLM.
llm = LLM(
model="/home/data/DeepSeek-R1_w8a8/",
tensor_parallel_size=16,
max_num_seqs=16,
gpu_memory_utilization=0.9,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method": "deepseek_mtp",
"num_speculative_tokens": 1,
},
trust_remote_code=True,
enforce_eager=False,
max_model_len=2000,
additional_config = {
'torchair_graph_config': {
'enabled': True,
"graph_batch_sizes": [16],
'enable_multistream_shared_expert': False,
},
"ascend_scheduler_config": {
"enabled": True
},
# 'expert_tensor_parallel_size': 16,
}
)
# Generate texts from the prompts.
# llm.start_profile()
outputs = llm.generate(prompts, sampling_params)
# llm.stop_profile()
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
- vLLM version: v0.10.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/302962e806e9820643ae25987e8e38ed035e05d3
---------
Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-08-06 19:37:43 +08:00
|
|
|
import torch.nn as nn
|
|
|
|
|
import torchair
|
|
|
|
|
from torchair import patch_for_hcom
|
2025-05-30 08:59:58 +08:00
|
|
|
from vllm.attention.layer import Attention
|
2025-10-17 18:14:49 +08:00
|
|
|
from vllm.config import (CUDAGraphMode, VllmConfig,
|
|
|
|
|
get_layers_from_vllm_config, set_current_vllm_config)
|
2025-09-18 14:05:33 +08:00
|
|
|
from vllm.forward_context import BatchDescriptor, get_forward_context
|
2025-05-30 08:59:58 +08:00
|
|
|
from vllm.model_executor.model_loader import get_model_loader
|
2025-06-09 22:21:42 +08:00
|
|
|
from vllm.model_executor.model_loader.utils import (
|
|
|
|
|
process_weights_after_loading, set_default_torch_dtype)
|
2025-09-04 11:34:47 +08:00
|
|
|
from vllm.v1.core.sched.output import SchedulerOutput
|
2025-05-30 08:59:58 +08:00
|
|
|
from vllm.v1.sample.metadata import SamplingMetadata
|
2025-09-04 11:34:47 +08:00
|
|
|
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
|
2025-05-30 08:59:58 +08:00
|
|
|
|
[V1] MTP supports torchair (#2145)
### What this PR does / why we need it?
Support MTP with:
- [x] V0 Scheduler
- [x] TorchAir
- [x] Single DP
- [x] Multi DP
- [x] Disaggregate PD
Known issues:
- [ ] The V1 Scheduler (chunked prefill) is not supported yet; support is
planned within a few weeks.
- [ ] vllm v0.10.0 does not support metrics with `DP > 1` right now; as a
workaround, comment out lines 171-175 in the file
`vllm/vllm/v1/metrics/loggers.py`:
```
if (len(self.engine_indexes) > 1
and vllm_config.speculative_config is not None):
raise NotImplementedError("Prometheus metrics with Spec Decoding "
"with >1 EngineCore per AsyncLLM is not "
"supported yet.")
```
To start an online server with torchair enabled, here is an example:
```
python -m vllm.entrypoints.openai.api_server \
--model="/weights/DeepSeek-R1_w8a8/" \
--trust-remote-code \
--max-model-len 40000 \
--tensor-parallel-size 4 \
--data_parallel_size 4 \
--max-num-seqs 16 \
--no-enable-prefix-caching \
--enable_expert_parallel \
--served-model-name deepseekr1 \
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
--quantization ascend \
--host 0.0.0.0 \
--port 1234 \
--additional-config '{"ascend_scheduler_config":{"enabled":true,"enable_chunked_prefill":false},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]},"enable_weight_nz_layout":true}' \
--gpu_memory_utilization 0.9
```
Here is an offline example with torchair enabled:
```
from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=16, temperature=0)
# Create an LLM.
llm = LLM(
model="/home/data/DeepSeek-R1_w8a8/",
tensor_parallel_size=16,
max_num_seqs=16,
gpu_memory_utilization=0.9,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method": "deepseek_mtp",
"num_speculative_tokens": 1,
},
trust_remote_code=True,
enforce_eager=False,
max_model_len=2000,
additional_config = {
'torchair_graph_config': {
'enabled': True,
"graph_batch_sizes": [16],
'enable_multistream_shared_expert': False,
},
"ascend_scheduler_config": {
"enabled": True
},
# 'expert_tensor_parallel_size': 16,
}
)
# Generate texts from the prompts.
# llm.start_profile()
outputs = llm.generate(prompts, sampling_params)
# llm.stop_profile()
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
- vLLM version: v0.10.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/302962e806e9820643ae25987e8e38ed035e05d3
---------
Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-08-06 19:37:43 +08:00
|
|
|
from vllm_ascend.ascend_config import get_ascend_config
|
2025-07-28 14:06:20 +08:00
|
|
|
from vllm_ascend.ascend_forward_context import set_ascend_forward_context
|
2025-08-20 09:01:04 +08:00
|
|
|
from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
|
2025-05-30 08:59:58 +08:00
|
|
|
from vllm_ascend.models.deepseek_mtp import CustomDeepSeekMTP
|
2025-09-04 11:34:47 +08:00
|
|
|
from vllm_ascend.spec_decode.interface import Proposer, SpecDcodeType
|
2025-09-02 11:12:41 +08:00
|
|
|
from vllm_ascend.torchair.models.torchair_deepseek_mtp import \
|
|
|
|
|
TorchairDeepSeekMTP
|
2025-09-03 17:56:12 +08:00
|
|
|
from vllm_ascend.torchair.utils import (TORCHAIR_CACHE_DIR,
|
|
|
|
|
TorchairCommonAttentionMetadata)
|
2025-10-09 10:28:38 +08:00
|
|
|
from vllm_ascend.utils import ProfileExecuteDuration, lmhead_tp_enable
|
2025-05-30 08:59:58 +08:00
|
|
|
|
2025-09-05 09:11:22 +08:00
|
|
|
PADDING_SLOT_ID = -1
|
|
|
|
|
|
2025-05-30 08:59:58 +08:00
|
|
|
|
2025-09-04 11:34:47 +08:00
|
|
|
class MtpProposer(Proposer):
|
2025-05-30 08:59:58 +08:00
|
|
|
|
|
|
|
|
def __init__(
    self,
    vllm_config: VllmConfig,
    device,
    runner,
):
    """Initialise the MTP proposer and allocate its persistent buffers.

    Args:
        vllm_config: Engine configuration. The speculative, model and
            scheduler sub-configs are read here.
        device: Device on which the ``input_ids``, ``positions`` and
            ``hidden_states`` buffers are allocated.
        runner: Owning model runner; ``max_num_tokens``, ``dtype`` and
            ``device`` are read from it.
    """
    self.name = SpecDcodeType.MTP
    self.vllm_config = vllm_config
    self.device = device
    self.runner = runner

    spec_config = vllm_config.speculative_config
    self.num_speculative_tokens = spec_config.num_speculative_tokens

    # Persistent buffers for graph mode; each has one row per token, up to
    # the runner's max_num_tokens.
    token_capacity = self.runner.max_num_tokens
    self.input_ids = torch.zeros(token_capacity,
                                 dtype=torch.int32,
                                 device=self.device)
    self.positions = torch.zeros(token_capacity,
                                 dtype=torch.int64,
                                 device=self.device)
    hidden_shape = (token_capacity,
                    vllm_config.model_config.get_hidden_size())
    self.hidden_states = torch.zeros(hidden_shape,
                                     dtype=self.runner.dtype,
                                     device=self.device)

    # Torchair compilation state: a single compiled model plus a dict of
    # compiled models, both empty until populated elsewhere.
    self.torchair_compiled_model = None  # type: ignore
    self.torchair_compiled_models = {}  # type: ignore

    ascend_config = get_ascend_config()
    self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
    self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp

    # The arange is used to set query_start_loc, which has one more element
    # than batch_size, hence the +1.
    arange_len = vllm_config.scheduler_config.max_num_seqs + 1
    self.arange = torch.arange(arange_len,
                               device=self.runner.device,
                               dtype=torch.int32)

    # True when the HF config exposes an `index_topk` attribute.
    self.use_sparse = hasattr(vllm_config.model_config.hf_config,
                              "index_topk")
|
2025-05-30 08:59:58 +08:00
|
|
|
|
2025-09-04 11:34:47 +08:00
|
|
|
def load_model(self, model) -> None:
    """Build the draft MTP model, record its attention layer, load weights.

    The draft model's attention layer is identified by comparing the set
    of registered ``Attention`` layers before and after the draft model is
    constructed; exactly one new layer is expected.

    Args:
        model: Not referenced by this method.
    """
    config = self.vllm_config
    weight_loader = get_model_loader(config.load_config)

    # Snapshot the attention layers that exist before the draft model is
    # created, so the new one can be found by set difference afterwards.
    layers_before = set(get_layers_from_vllm_config(config, Attention).keys())

    draft_model_config = config.speculative_config.draft_model_config
    target_device = config.device_config.device

    # Torchair graph mode uses its own MTP implementation.
    if self.torchair_graph_enabled:
        model_cls = TorchairDeepSeekMTP
    else:
        model_cls = CustomDeepSeekMTP

    with set_default_torch_dtype(draft_model_config.dtype):
        with set_current_vllm_config(config):
            self.model = model_cls(vllm_config=config).to(target_device)

    layers_after = get_layers_from_vllm_config(config, Attention).keys()
    draft_attn_layer_names = layers_after - layers_before

    assert len(draft_attn_layer_names) == 1
    self.attn_layer_name = list(draft_attn_layer_names)

    draft_weights = weight_loader.get_all_weights(draft_model_config,
                                                  self.model)
    self.model.load_weights(draft_weights)
    process_weights_after_loading(self.model, draft_model_config,
                                  target_device)
|
|
|
|
|
|
|
|
|
|
@torch.inference_mode()
def dummy_run(self,
              num_tokens: int,
              with_prefill: bool = False,
              skip_attn: bool = False,
              num_reqs: int = 0,
              num_tokens_across_dp=None,
              aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
              batch_descriptor=None) -> None:
    """Run the draft model on the persistent buffers without real inputs.

    Args:
        num_tokens: Number of leading rows of the persistent buffers to
            feed to the model.
        with_prefill: When true the torchair compiled path is not used and
            only a single forward pass is executed.
        skip_attn: When true, no attention metadata is built. Ignored
            (treated as false) on the torchair compiled path.
        num_reqs: Forwarded to the dummy attention metadata.
        num_tokens_across_dp: Forwarded to the forward context; replaced
            by the synchronised value when torchair graph mode is off.
        aclgraph_runtime_mode: Forwarded to the forward context.
        batch_descriptor: Forwarded to the forward context.
    """
    runner = self.runner

    if not self.torchair_graph_enabled:
        # TODO: adapt enable_dbo later
        synced = runner._sync_metadata_across_dp(num_tokens, with_prefill,
                                                 False)
        num_tokens, num_tokens_across_dp, with_prefill, _ = synced

    moe_comm_type = runner._select_moe_comm_method(num_tokens, with_prefill)

    use_compiled_graph = self.torchair_graph_enabled and not with_prefill

    # The compiled torchair path always needs attention metadata, so a
    # skip_attn request is only honoured outside of it.
    if skip_attn and not use_compiled_graph:
        attn_metadata = None
    else:
        dummy_common_metadata = TorchairCommonAttentionMetadata(
            num_reqs=num_reqs,
            num_actual_tokens=1,
            actual_seq_lengths_q=runner.actual_seq_lengths_q,
            attn_mask=runner.attn_mask,
            spec_attn_mask=runner.spec_attn_mask,
            decode_token_per_req=runner.decode_token_per_req,
        )
        attn_metadata = runner.attn_metadata_builder.build_torchair_graph_dummy(
            dummy_common_metadata)

    input_ids = self.input_ids[:num_tokens]
    positions = self.positions[:num_tokens]
    previous_hidden_states = self.hidden_states[:num_tokens]

    for _ in range(self.num_speculative_tokens):
        with set_ascend_forward_context(
                attn_metadata,
                self.vllm_config,
                num_tokens=num_tokens,
                with_prefill=with_prefill,
                num_tokens_across_dp=num_tokens_across_dp,
                reserved_mc2_mask=runner.reserved_mc2_mask,
                moe_comm_type=moe_comm_type,
                in_profile_run=runner.in_profile_run,
                num_actual_tokens=0,
                aclgraph_runtime_mode=aclgraph_runtime_mode,
                batch_descriptor=batch_descriptor):
            if not use_compiled_graph:
                self.model(input_ids=input_ids,
                           positions=positions,
                           previous_hidden_states=previous_hidden_states)
            else:
                assert attn_metadata is not None
                decode_metadata = attn_metadata.decode

                # Collect every tensor handed to the compiled graph, in the
                # order they are marked static.
                static_tensors = [
                    input_ids,
                    positions,
                    previous_hidden_states,
                    decode_metadata.block_table,
                    decode_metadata.input_positions,
                ]
                if hasattr(decode_metadata, "sin"):
                    static_tensors.append(decode_metadata.sin)
                    static_tensors.append(decode_metadata.cos)
                static_tensors.append(get_forward_context().mc2_mask)
                static_tensors.append(attn_metadata.slot_mapping)
                static_tensors.append(decode_metadata.attn_mask)
                for tensor in static_tensors:
                    torch._dynamo.mark_static(tensor)

                compiled_model = self._get_torchair_lazy_compiled_model(
                    num_tokens)
                compiled_model(
                    input_ids=input_ids,
                    positions=positions,
                    previous_hidden_states=previous_hidden_states,
                    inputs_embeds=None,
                    intermediate_tensors=None,
                    attn_metadata=attn_metadata,
                    kv_caches=runner.kv_caches[-1:],
                    spec_step_idx=0)
        # With prefill only the first speculative step is run.
        if with_prefill:
            break
|
2025-09-04 11:34:47 +08:00
|
|
|
|
|
|
|
|
def generate_token_ids(self,
                       valid_sampled_token_ids: list[list[int]],
                       sampling_metadata: SamplingMetadata = None,
                       scheduler_output: SchedulerOutput = None,
                       spec_decode_metadata: SpecDecodeMetadata = None,
                       positions: torch.Tensor = None,
                       num_scheduled_tokens: int = 0,
                       hidden_states: torch.Tensor = None,
                       attn_metadata=None,
                       aux_hidden_states: torch.Tensor = None):
    """Build the drafter inputs for the current step and return draft tokens.

    Args:
        valid_sampled_token_ids: Per-request lists of token ids sampled by
            the main model; an empty list marks a request without a sampled
            token in this step.
        sampling_metadata: Forwarded unchanged to ``self._propose``.
        scheduler_output: Only read for requests whose sampled list is
            empty, to look up ``num_scheduled_tokens[req_id]``.
        spec_decode_metadata: When ``None`` the scheduled tokens are used
            as-is; otherwise its ``num_draft_tokens`` is used to derive the
            per-request rejected-token counts.
        positions: Position tensor; the first ``num_scheduled_tokens``
            entries are used.
        num_scheduled_tokens: Number of leading entries taken from
            ``self.runner.input_ids``, ``positions`` and ``hidden_states``.
        hidden_states: Hidden states of the main model; the first
            ``num_scheduled_tokens`` rows are used.
        attn_metadata: Attention metadata object, or a dict of them from
            which the ``'model.layers.0.self_attn.attn'`` entry is taken.
        aux_hidden_states: Accepted for interface compatibility; not read
            by this method.

    Returns:
        The result of ``self._propose(...)`` converted with ``.tolist()``.
    """
    # A dict of metadata is reduced to the entry of the first attention layer.
    if isinstance(attn_metadata, dict):
        attn_metadata = attn_metadata['model.layers.0.self_attn.attn']

    # Collect one "next token" per request.
    last_sampled_ids: list[int] = []
    for req_idx, sampled_ids in enumerate(valid_sampled_token_ids):
        if not sampled_ids:
            # Partial prefill (rare case).
            # Get the next token id from the request state.
            req_id = self.runner.input_batch.req_ids[req_idx]
            req_state = self.runner.requests[req_id]
            seq_len = (req_state.num_computed_tokens +
                       scheduler_output.num_scheduled_tokens[req_id])
            last_sampled_ids.append(req_state.get_token_id(seq_len))
        else:
            # Common case.
            last_sampled_ids.append(sampled_ids[-1])
    next_token_ids = torch.tensor(last_sampled_ids,
                                  dtype=torch.int32,
                                  device=self.device)

    if spec_decode_metadata is not None:
        # TODO(woosuk): Refactor this.
        rejected_counts = []
        for req_idx, draft_count in enumerate(
                spec_decode_metadata.num_draft_tokens):
            if draft_count > 0:
                rejected_counts.append(
                    draft_count + 1 - len(valid_sampled_token_ids[req_idx]))
            else:
                rejected_counts.append(0)
        num_rejected_tokens = torch.tensor(
            rejected_counts,
            dtype=torch.int32,
            device=self.device,
        )
        (cu_num_tokens, accepted_token_indices, target_token_ids,
         target_positions, target_hidden_states,
         target_slot_mapping) = self._prepare_inputs(
             attn_metadata.query_start_loc,
             num_rejected_tokens,
             self.runner.input_ids[:num_scheduled_tokens],
             positions[:num_scheduled_tokens],
             hidden_states[:num_scheduled_tokens],
             attn_metadata.slot_mapping[:num_scheduled_tokens],
             is_torchair_graph=self.runner.
             _build_drafter_prepare_inputs_torchair_param(),
         )
    else:
        # No speculative tokens were verified: feed the scheduled tokens
        # straight through.
        # input_ids can be None for multimodal models.
        accepted_token_indices = None
        target_token_ids = self.runner.input_ids[:num_scheduled_tokens]
        target_positions = positions[:num_scheduled_tokens]
        target_hidden_states = hidden_states[:num_scheduled_tokens]
        target_slot_mapping = attn_metadata.slot_mapping
        cu_num_tokens = attn_metadata.query_start_loc

    draft_token_ids = self._propose(
        target_token_ids=target_token_ids,
        target_positions=target_positions,
        target_hidden_states=target_hidden_states,
        target_slot_mapping=target_slot_mapping,
        next_token_ids=next_token_ids,
        cu_num_tokens=cu_num_tokens,
        block_table=attn_metadata.block_tables,
        sampling_metadata=sampling_metadata,
        token_indices=accepted_token_indices)
    return draft_token_ids.tolist()
|
|
|
|
|
|
|
|
|
|
def _prepare_inputs(
|
|
|
|
|
self,
|
2025-05-30 08:59:58 +08:00
|
|
|
# [batch_size + 1]
|
|
|
|
|
cu_target_query_lens: torch.Tensor,
|
|
|
|
|
# [batch_size]
|
|
|
|
|
num_rejected_tokens: torch.Tensor,
|
[V1] MTP supports torchair (#2145)
### What this PR does / why we need it?
Support MTP with:
- [x] V0 Scheduler
- [x] TorchAir
- [x] Single DP
- [x] Multi DP
- [x] Disaggregate PD
Known issues:
- [ ] Not support V1 Scheduler (chunked prefill), will be supported in a
few weeks
- [ ] vllm v0.10.0 does not support metrics with `DP > 1` right now,
need to comment out the line 171-175 in file
`vllm/vllm/v1/metrics/loggers.py`
```
if (len(self.engine_indexes) > 1
and vllm_config.speculative_config is not None):
raise NotImplementedError("Prometheus metrics with Spec Decoding "
"with >1 EngineCore per AsyncLLM is not "
"supported yet.")
```
To start an online server with torchair enabled, here is an example:
```
python -m vllm.entrypoints.openai.api_server \
--model="/weights/DeepSeek-R1_w8a8/" \
--trust-remote-code \
--max-model-len 40000 \
--tensor-parallel-size 4 \
--data_parallel_size 4 \
--max-num-seqs 16 \
--no-enable-prefix-caching \
--enable_expert_parallel \
--served-model-name deepseekr1 \
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
--quantization ascend \
--host 0.0.0.0 \
--port 1234 \
--additional-config '{"ascend_scheduler_config":{"enabled":true,"enable_chunked_prefill":false},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]},"enable_weight_nz_layout":true}' \
--gpu_memory_utilization 0.9
```
offline example with torchair enabled
```
from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=16, temperature=0)
# Create an LLM.
llm = LLM(
model="/home/data/DeepSeek-R1_w8a8/",
tensor_parallel_size=16,
max_num_seqs=16,
gpu_memory_utilization=0.9,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method": "deepseek_mtp",
"num_speculative_tokens": 1,
},
trust_remote_code=True,
enforce_eager=False,
max_model_len=2000,
additional_config = {
'torchair_graph_config': {
'enabled': True,
"graph_batch_sizes": [16],
'enable_multistream_shared_expert': False,
},
"ascend_scheduler_config": {
"enabled": True
},
# 'expert_tensor_parallel_size': 16,
}
)
# Generate texts from the prompts.
# llm.start_profile()
outputs = llm.generate(prompts, sampling_params)
# llm.stop_profile()
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
- vLLM version: v0.10.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/302962e806e9820643ae25987e8e38ed035e05d3
---------
Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-08-06 19:37:43 +08:00
|
|
|
token_ids: torch.Tensor,
|
|
|
|
|
positions: torch.Tensor,
|
|
|
|
|
hidden_states: torch.Tensor,
|
|
|
|
|
slot_mapping: torch.Tensor,
|
|
|
|
|
is_torchair_graph: bool = False
|
|
|
|
|
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor,
|
|
|
|
|
torch.Tensor, torch.Tensor]:
|
2025-05-30 08:59:58 +08:00
|
|
|
# cu_target_query_lens: [0, a, a + b, a + b + c]
|
|
|
|
|
# num_rejected_tokens: [n1, n2, n3]
|
|
|
|
|
# num_tokens_per_req: [a - n1, b - n2, c - n3]
|
|
|
|
|
# cu_num_tokens: [0, a - n1, a + b - n1 - n2, a + b + c - n1 - n2 - n3]
|
|
|
|
|
# token_indices: [0, 1, ..., a - n1 - 1,
|
|
|
|
|
# a, a + 1, ..., a + b - n2 - 1,
|
|
|
|
|
# a + b, a + b + 1, ..., a + b + c - n3 - 1]
|
|
|
|
|
# [0, a, a + b, a + b + c] -> [a, b, c]
|
|
|
|
|
query_len_per_req = (cu_target_query_lens[1:] -
|
|
|
|
|
cu_target_query_lens[:-1])
|
|
|
|
|
# [a, b, c] -> [a - n1, b - n2, c - n3]
|
|
|
|
|
num_tokens_per_req = query_len_per_req - num_rejected_tokens
|
[V1] MTP supports torchair (#2145)
### What this PR does / why we need it?
Support MTP with:
- [x] V0 Scheduler
- [x] TorchAir
- [x] Single DP
- [x] Multi DP
- [x] Disaggregate PD
Known issues:
- [ ] Not support V1 Scheduler (chunked prefill), will be supported in a
few weeks
- [ ] vllm v0.10.0 does not support metrics with `DP > 1` right now,
need to comment out the line 171-175 in file
`vllm/vllm/v1/metrics/loggers.py`
```
if (len(self.engine_indexes) > 1
and vllm_config.speculative_config is not None):
raise NotImplementedError("Prometheus metrics with Spec Decoding "
"with >1 EngineCore per AsyncLLM is not "
"supported yet.")
```
To start an online server with torchair enabled, here is an example:
```
python -m vllm.entrypoints.openai.api_server \
--model="/weights/DeepSeek-R1_w8a8/" \
--trust-remote-code \
--max-model-len 40000 \
--tensor-parallel-size 4 \
--data_parallel_size 4 \
--max-num-seqs 16 \
--no-enable-prefix-caching \
--enable_expert_parallel \
--served-model-name deepseekr1 \
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
--quantization ascend \
--host 0.0.0.0 \
--port 1234 \
--additional-config '{"ascend_scheduler_config":{"enabled":true,"enable_chunked_prefill":false},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]},"enable_weight_nz_layout":true}' \
--gpu_memory_utilization 0.9
```
offline example with torchair enabled
```
from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=16, temperature=0)
# Create an LLM.
llm = LLM(
model="/home/data/DeepSeek-R1_w8a8/",
tensor_parallel_size=16,
max_num_seqs=16,
gpu_memory_utilization=0.9,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method": "deepseek_mtp",
"num_speculative_tokens": 1,
},
trust_remote_code=True,
enforce_eager=False,
max_model_len=2000,
additional_config = {
'torchair_graph_config': {
'enabled': True,
"graph_batch_sizes": [16],
'enable_multistream_shared_expert': False,
},
"ascend_scheduler_config": {
"enabled": True
},
# 'expert_tensor_parallel_size': 16,
}
)
# Generate texts from the prompts.
# llm.start_profile()
outputs = llm.generate(prompts, sampling_params)
# llm.stop_profile()
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
- vLLM version: v0.10.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/302962e806e9820643ae25987e8e38ed035e05d3
---------
Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-08-06 19:37:43 +08:00
|
|
|
if is_torchair_graph:
|
|
|
|
|
cu_num_tokens = cu_target_query_lens
|
|
|
|
|
relative_index = query_len_per_req - num_rejected_tokens - 1
|
|
|
|
|
token_indices = cu_num_tokens[:-1] + relative_index
|
|
|
|
|
# the seq len of each bath is padded to 1+num_speculative_tokens, thus input is same as the main model
|
|
|
|
|
target_token_ids = token_ids
|
|
|
|
|
target_positions = positions
|
|
|
|
|
target_hidden_states = hidden_states
|
|
|
|
|
target_slot_mapping = slot_mapping
|
|
|
|
|
else:
|
|
|
|
|
cu_num_tokens = torch.empty_like(cu_target_query_lens)
|
|
|
|
|
torch.cumsum(num_tokens_per_req, dim=0, out=cu_num_tokens[1:])
|
|
|
|
|
cu_num_tokens[0] = 0
|
|
|
|
|
|
|
|
|
|
# FIXME(woosuk): Avoid synchronization.
|
|
|
|
|
num_tokens = cu_num_tokens[-1].item()
|
2025-08-20 09:01:04 +08:00
|
|
|
token_indices = torch.zeros(
|
[V1] MTP supports torchair (#2145)
### What this PR does / why we need it?
Support MTP with:
- [x] V0 Scheduler
- [x] TorchAir
- [x] Single DP
- [x] Multi DP
- [x] Disaggregate PD
Known issues:
- [ ] Not support V1 Scheduler (chunked prefill), will be supported in a
few weeks
- [ ] vllm v0.10.0 does not support metrics with `DP > 1` right now,
need to comment out the line 171-175 in file
`vllm/vllm/v1/metrics/loggers.py`
```
if (len(self.engine_indexes) > 1
and vllm_config.speculative_config is not None):
raise NotImplementedError("Prometheus metrics with Spec Decoding "
"with >1 EngineCore per AsyncLLM is not "
"supported yet.")
```
To start an online server with torchair enabled, here is an example:
```
python -m vllm.entrypoints.openai.api_server \
--model="/weights/DeepSeek-R1_w8a8/" \
--trust-remote-code \
--max-model-len 40000 \
--tensor-parallel-size 4 \
--data_parallel_size 4 \
--max-num-seqs 16 \
--no-enable-prefix-caching \
--enable_expert_parallel \
--served-model-name deepseekr1 \
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
--quantization ascend \
--host 0.0.0.0 \
--port 1234 \
--additional-config '{"ascend_scheduler_config":{"enabled":true,"enable_chunked_prefill":false},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]},"enable_weight_nz_layout":true}' \
--gpu_memory_utilization 0.9
```
offline example with torchair enabled
```
from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=16, temperature=0)
# Create an LLM.
llm = LLM(
model="/home/data/DeepSeek-R1_w8a8/",
tensor_parallel_size=16,
max_num_seqs=16,
gpu_memory_utilization=0.9,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method": "deepseek_mtp",
"num_speculative_tokens": 1,
},
trust_remote_code=True,
enforce_eager=False,
max_model_len=2000,
additional_config = {
'torchair_graph_config': {
'enabled': True,
"graph_batch_sizes": [16],
'enable_multistream_shared_expert': False,
},
"ascend_scheduler_config": {
"enabled": True
},
# 'expert_tensor_parallel_size': 16,
}
)
# Generate texts from the prompts.
# llm.start_profile()
outputs = llm.generate(prompts, sampling_params)
# llm.stop_profile()
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
- vLLM version: v0.10.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/302962e806e9820643ae25987e8e38ed035e05d3
---------
Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-08-06 19:37:43 +08:00
|
|
|
num_tokens,
|
|
|
|
|
dtype=torch.int32,
|
|
|
|
|
device=cu_num_tokens.device,
|
|
|
|
|
)
|
2025-05-30 08:59:58 +08:00
|
|
|
|
[V1] MTP supports torchair (#2145)
### What this PR does / why we need it?
Support MTP with:
- [x] V0 Scheduler
- [x] TorchAir
- [x] Single DP
- [x] Multi DP
- [x] Disaggregate PD
Known issues:
- [ ] Not support V1 Scheduler (chunked prefill), will be supported in a
few weeks
- [ ] vllm v0.10.0 does not support metrics with `DP > 1` right now,
need to comment out the line 171-175 in file
`vllm/vllm/v1/metrics/loggers.py`
```
if (len(self.engine_indexes) > 1
and vllm_config.speculative_config is not None):
raise NotImplementedError("Prometheus metrics with Spec Decoding "
"with >1 EngineCore per AsyncLLM is not "
"supported yet.")
```
To start an online server with torchair enabled, here is an example:
```
python -m vllm.entrypoints.openai.api_server \
--model="/weights/DeepSeek-R1_w8a8/" \
--trust-remote-code \
--max-model-len 40000 \
--tensor-parallel-size 4 \
--data_parallel_size 4 \
--max-num-seqs 16 \
--no-enable-prefix-caching \
--enable_expert_parallel \
--served-model-name deepseekr1 \
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
--quantization ascend \
--host 0.0.0.0 \
--port 1234 \
--additional-config '{"ascend_scheduler_config":{"enabled":true,"enable_chunked_prefill":false},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]},"enable_weight_nz_layout":true}' \
--gpu_memory_utilization 0.9
```
offline example with torchair enabled
```
from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=16, temperature=0)
# Create an LLM.
llm = LLM(
model="/home/data/DeepSeek-R1_w8a8/",
tensor_parallel_size=16,
max_num_seqs=16,
gpu_memory_utilization=0.9,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method": "deepseek_mtp",
"num_speculative_tokens": 1,
},
trust_remote_code=True,
enforce_eager=False,
max_model_len=2000,
additional_config = {
'torchair_graph_config': {
'enabled': True,
"graph_batch_sizes": [16],
'enable_multistream_shared_expert': False,
},
"ascend_scheduler_config": {
"enabled": True
},
# 'expert_tensor_parallel_size': 16,
}
)
# Generate texts from the prompts.
# llm.start_profile()
outputs = llm.generate(prompts, sampling_params)
# llm.stop_profile()
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
- vLLM version: v0.10.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/302962e806e9820643ae25987e8e38ed035e05d3
---------
Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-08-06 19:37:43 +08:00
|
|
|
BLOCK_SIZE = 1024
|
2025-09-04 11:34:47 +08:00
|
|
|
self._prepare_input_kernel(
|
[V1] MTP supports torchair (#2145)
### What this PR does / why we need it?
Support MTP with:
- [x] V0 Scheduler
- [x] TorchAir
- [x] Single DP
- [x] Multi DP
- [x] Disaggregate PD
Known issues:
- [ ] Not support V1 Scheduler (chunked prefill), will be supported in a
few weeks
- [ ] vllm v0.10.0 does not support metrics with `DP > 1` right now,
need to comment out the line 171-175 in file
`vllm/vllm/v1/metrics/loggers.py`
```
if (len(self.engine_indexes) > 1
and vllm_config.speculative_config is not None):
raise NotImplementedError("Prometheus metrics with Spec Decoding "
"with >1 EngineCore per AsyncLLM is not "
"supported yet.")
```
To start an online server with torchair enabled, here is an example:
```
python -m vllm.entrypoints.openai.api_server \
--model="/weights/DeepSeek-R1_w8a8/" \
--trust-remote-code \
--max-model-len 40000 \
--tensor-parallel-size 4 \
--data_parallel_size 4 \
--max-num-seqs 16 \
--no-enable-prefix-caching \
--enable_expert_parallel \
--served-model-name deepseekr1 \
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
--quantization ascend \
--host 0.0.0.0 \
--port 1234 \
--additional-config '{"ascend_scheduler_config":{"enabled":true,"enable_chunked_prefill":false},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]},"enable_weight_nz_layout":true}' \
--gpu_memory_utilization 0.9
```
offline example with torchair enabled
```
from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=16, temperature=0)
# Create an LLM.
llm = LLM(
model="/home/data/DeepSeek-R1_w8a8/",
tensor_parallel_size=16,
max_num_seqs=16,
gpu_memory_utilization=0.9,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method": "deepseek_mtp",
"num_speculative_tokens": 1,
},
trust_remote_code=True,
enforce_eager=False,
max_model_len=2000,
additional_config = {
'torchair_graph_config': {
'enabled': True,
"graph_batch_sizes": [16],
'enable_multistream_shared_expert': False,
},
"ascend_scheduler_config": {
"enabled": True
},
# 'expert_tensor_parallel_size': 16,
}
)
# Generate texts from the prompts.
# llm.start_profile()
outputs = llm.generate(prompts, sampling_params)
# llm.stop_profile()
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
- vLLM version: v0.10.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/302962e806e9820643ae25987e8e38ed035e05d3
---------
Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-08-06 19:37:43 +08:00
|
|
|
token_indices,
|
|
|
|
|
cu_target_query_lens,
|
|
|
|
|
cu_num_tokens,
|
|
|
|
|
block_size=BLOCK_SIZE,
|
|
|
|
|
)
|
|
|
|
|
target_token_ids = token_ids[token_indices]
|
|
|
|
|
target_positions = positions[token_indices]
|
|
|
|
|
target_hidden_states = hidden_states[token_indices]
|
|
|
|
|
target_slot_mapping = slot_mapping[token_indices]
|
|
|
|
|
return cu_num_tokens, token_indices, target_token_ids, target_positions, target_hidden_states, target_slot_mapping
|
2025-05-30 08:59:58 +08:00
|
|
|
|
2025-09-04 11:34:47 +08:00
|
|
|
def _propose(
|
[V1] MTP supports torchair (#2145)
### What this PR does / why we need it?
Support MTP with:
- [x] V0 Scheduler
- [x] TorchAir
- [x] Single DP
- [x] Multi DP
- [x] Disaggregate PD
Known issues:
- [ ] Not support V1 Scheduler (chunked prefill), will be supported in a
few weeks
- [ ] vllm v0.10.0 does not support metrics with `DP > 1` right now,
need to comment out the line 171-175 in file
`vllm/vllm/v1/metrics/loggers.py`
```
if (len(self.engine_indexes) > 1
and vllm_config.speculative_config is not None):
raise NotImplementedError("Prometheus metrics with Spec Decoding "
"with >1 EngineCore per AsyncLLM is not "
"supported yet.")
```
To start an online server with torchair enabled, here is an example:
```
python -m vllm.entrypoints.openai.api_server \
--model="/weights/DeepSeek-R1_w8a8/" \
--trust-remote-code \
--max-model-len 40000 \
--tensor-parallel-size 4 \
--data_parallel_size 4 \
--max-num-seqs 16 \
--no-enable-prefix-caching \
--enable_expert_parallel \
--served-model-name deepseekr1 \
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
--quantization ascend \
--host 0.0.0.0 \
--port 1234 \
--additional-config '{"ascend_scheduler_config":{"enabled":true,"enable_chunked_prefill":false},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]},"enable_weight_nz_layout":true}' \
--gpu_memory_utilization 0.9
```
offline example with torchair enabled
```
from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=16, temperature=0)
# Create an LLM.
llm = LLM(
model="/home/data/DeepSeek-R1_w8a8/",
tensor_parallel_size=16,
max_num_seqs=16,
gpu_memory_utilization=0.9,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method": "deepseek_mtp",
"num_speculative_tokens": 1,
},
trust_remote_code=True,
enforce_eager=False,
max_model_len=2000,
additional_config = {
'torchair_graph_config': {
'enabled': True,
"graph_batch_sizes": [16],
'enable_multistream_shared_expert': False,
},
"ascend_scheduler_config": {
"enabled": True
},
# 'expert_tensor_parallel_size': 16,
}
)
# Generate texts from the prompts.
# llm.start_profile()
outputs = llm.generate(prompts, sampling_params)
# llm.stop_profile()
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
- vLLM version: v0.10.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/302962e806e9820643ae25987e8e38ed035e05d3
---------
Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-08-06 19:37:43 +08:00
|
|
|
self,
|
|
|
|
|
# [num_tokens]
|
|
|
|
|
target_token_ids: torch.Tensor,
|
|
|
|
|
# [num_tokens]
|
|
|
|
|
target_positions: torch.Tensor,
|
|
|
|
|
# [num_tokens, hidden_size]
|
|
|
|
|
target_hidden_states: torch.Tensor,
|
|
|
|
|
# [num_tokens]
|
|
|
|
|
target_slot_mapping: torch.Tensor,
|
|
|
|
|
# [batch_size]
|
|
|
|
|
next_token_ids: torch.Tensor,
|
|
|
|
|
# [batch_size + 1] starting with 0
|
|
|
|
|
cu_num_tokens: torch.Tensor,
|
|
|
|
|
# [batch_size, max_num_blocks_per_req]
|
|
|
|
|
block_table: torch.Tensor,
|
|
|
|
|
sampling_metadata: SamplingMetadata,
|
|
|
|
|
token_indices=None) -> torch.Tensor:
|
2025-05-30 08:59:58 +08:00
|
|
|
num_tokens = target_token_ids.shape[0]
|
|
|
|
|
batch_size = next_token_ids.shape[0]
|
|
|
|
|
last_token_indices = cu_num_tokens[1:] - 1
|
|
|
|
|
|
|
|
|
|
# Shift the input ids by one token.
|
|
|
|
|
# E.g., [a1, b1, b2, c1, c2, c3] -> [b1, b2, c1, c2, c3, c3]
|
[V1] MTP supports torchair (#2145)
### What this PR does / why we need it?
Support MTP with:
- [x] V0 Scheduler
- [x] TorchAir
- [x] Single DP
- [x] Multi DP
- [x] Disaggregate PD
Known issues:
- [ ] Not support V1 Scheduler (chunked prefill), will be supported in a
few weeks
- [ ] vllm v0.10.0 does not support metrics with `DP > 1` right now,
need to comment out the line 171-175 in file
`vllm/vllm/v1/metrics/loggers.py`
```
if (len(self.engine_indexes) > 1
and vllm_config.speculative_config is not None):
raise NotImplementedError("Prometheus metrics with Spec Decoding "
"with >1 EngineCore per AsyncLLM is not "
"supported yet.")
```
To start an online server with torchair enabled, here is an example:
```
python -m vllm.entrypoints.openai.api_server \
--model="/weights/DeepSeek-R1_w8a8/" \
--trust-remote-code \
--max-model-len 40000 \
--tensor-parallel-size 4 \
--data_parallel_size 4 \
--max-num-seqs 16 \
--no-enable-prefix-caching \
--enable_expert_parallel \
--served-model-name deepseekr1 \
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
--quantization ascend \
--host 0.0.0.0 \
--port 1234 \
--additional-config '{"ascend_scheduler_config":{"enabled":true,"enable_chunked_prefill":false},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]},"enable_weight_nz_layout":true}' \
--gpu_memory_utilization 0.9
```
offline example with torchair enabled
```
from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=16, temperature=0)
# Create an LLM.
llm = LLM(
model="/home/data/DeepSeek-R1_w8a8/",
tensor_parallel_size=16,
max_num_seqs=16,
gpu_memory_utilization=0.9,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method": "deepseek_mtp",
"num_speculative_tokens": 1,
},
trust_remote_code=True,
enforce_eager=False,
max_model_len=2000,
additional_config = {
'torchair_graph_config': {
'enabled': True,
"graph_batch_sizes": [16],
'enable_multistream_shared_expert': False,
},
"ascend_scheduler_config": {
"enabled": True
},
# 'expert_tensor_parallel_size': 16,
}
)
# Generate texts from the prompts.
# llm.start_profile()
outputs = llm.generate(prompts, sampling_params)
# llm.stop_profile()
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
- vLLM version: v0.10.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/302962e806e9820643ae25987e8e38ed035e05d3
---------
Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-08-06 19:37:43 +08:00
|
|
|
self.input_ids[:num_tokens - 1] = target_token_ids[1:]
|
2025-05-30 08:59:58 +08:00
|
|
|
# Replace the last token with the next token.
|
|
|
|
|
# E.g., [b1, b2, c1, c2, c3, c3] -> [a2, b2, b3, c2, c3, c4]
|
2025-08-21 08:54:57 +08:00
|
|
|
if token_indices is not None and self.torchair_graph_enabled:
|
[V1] MTP supports torchair (#2145)
### What this PR does / why we need it?
Support MTP with:
- [x] V0 Scheduler
- [x] TorchAir
- [x] Single DP
- [x] Multi DP
- [x] Disaggregate PD
Known issues:
- [ ] Not support V1 Scheduler (chunked prefill), will be supported in a
few weeks
- [ ] vllm v0.10.0 does not support metrics with `DP > 1` right now,
need to comment out the line 171-175 in file
`vllm/vllm/v1/metrics/loggers.py`
```
if (len(self.engine_indexes) > 1
and vllm_config.speculative_config is not None):
raise NotImplementedError("Prometheus metrics with Spec Decoding "
"with >1 EngineCore per AsyncLLM is not "
"supported yet.")
```
To start an online server with torchair enabled, here is an example:
```
python -m vllm.entrypoints.openai.api_server \
--model="/weights/DeepSeek-R1_w8a8/" \
--trust-remote-code \
--max-model-len 40000 \
--tensor-parallel-size 4 \
--data_parallel_size 4 \
--max-num-seqs 16 \
--no-enable-prefix-caching \
--enable_expert_parallel \
--served-model-name deepseekr1 \
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
--quantization ascend \
--host 0.0.0.0 \
--port 1234 \
--additional-config '{"ascend_scheduler_config":{"enabled":true,"enable_chunked_prefill":false},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]},"enable_weight_nz_layout":true}' \
--gpu_memory_utilization 0.9
```
offline example with torchair enabled
```
from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=16, temperature=0)
# Create an LLM.
llm = LLM(
model="/home/data/DeepSeek-R1_w8a8/",
tensor_parallel_size=16,
max_num_seqs=16,
gpu_memory_utilization=0.9,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method": "deepseek_mtp",
"num_speculative_tokens": 1,
},
trust_remote_code=True,
enforce_eager=False,
max_model_len=2000,
additional_config = {
'torchair_graph_config': {
'enabled': True,
"graph_batch_sizes": [16],
'enable_multistream_shared_expert': False,
},
"ascend_scheduler_config": {
"enabled": True
},
# 'expert_tensor_parallel_size': 16,
}
)
# Generate texts from the prompts.
# llm.start_profile()
outputs = llm.generate(prompts, sampling_params)
# llm.stop_profile()
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
- vLLM version: v0.10.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/302962e806e9820643ae25987e8e38ed035e05d3
---------
Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-08-06 19:37:43 +08:00
|
|
|
last_token_indices = token_indices
|
|
|
|
|
|
|
|
|
|
self.input_ids[last_token_indices] = next_token_ids
|
2025-05-30 08:59:58 +08:00
|
|
|
|
|
|
|
|
query_lens = cu_num_tokens[1:] - cu_num_tokens[:-1]
|
|
|
|
|
max_query_len = query_lens.max().item()
|
|
|
|
|
|
|
|
|
|
# FIXME: reorder_batch() needs to be called before build()
|
|
|
|
|
# because fields of attn_metadata_builder needs to be updated.
|
|
|
|
|
# However, currently reorder_batch() takes input_batch and
|
|
|
|
|
# scheduler_output as arguments, we should probably refactor
|
|
|
|
|
# the method to use new data structures which are independent
|
|
|
|
|
# from input_batch and scheduler_output.
|
|
|
|
|
# self.runner.attn_metadata_builder.reorder_batch(
|
|
|
|
|
# input_batch=self.runner.input_batch,
|
|
|
|
|
# scheduler_output=self.runner.scheduler_output,
|
|
|
|
|
# )
|
2025-08-21 08:54:57 +08:00
|
|
|
is_running_torchair = self.torchair_graph_enabled and \
|
[V1] MTP supports torchair (#2145)
### What this PR does / why we need it?
Support MTP with:
- [x] V0 Scheduler
- [x] TorchAir
- [x] Single DP
- [x] Multi DP
- [x] Disaggregate PD
Known issues:
- [ ] Not support V1 Scheduler (chunked prefill), will be supported in a
few weeks
- [ ] vllm v0.10.0 does not support metrics with `DP > 1` right now,
need to comment out the line 171-175 in file
`vllm/vllm/v1/metrics/loggers.py`
```
if (len(self.engine_indexes) > 1
and vllm_config.speculative_config is not None):
raise NotImplementedError("Prometheus metrics with Spec Decoding "
"with >1 EngineCore per AsyncLLM is not "
"supported yet.")
```
To start an online server with torchair enabled, here is an example:
```
python -m vllm.entrypoints.openai.api_server \
--model="/weights/DeepSeek-R1_w8a8/" \
--trust-remote-code \
--max-model-len 40000 \
--tensor-parallel-size 4 \
--data_parallel_size 4 \
--max-num-seqs 16 \
--no-enable-prefix-caching \
--enable_expert_parallel \
--served-model-name deepseekr1 \
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
--quantization ascend \
--host 0.0.0.0 \
--port 1234 \
--additional-config '{"ascend_scheduler_config":{"enabled":true,"enable_chunked_prefill":false},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]},"enable_weight_nz_layout":true}' \
--gpu_memory_utilization 0.9
```
offline example with torchair enabled
```
from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=16, temperature=0)
# Create an LLM.
llm = LLM(
model="/home/data/DeepSeek-R1_w8a8/",
tensor_parallel_size=16,
max_num_seqs=16,
gpu_memory_utilization=0.9,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method": "deepseek_mtp",
"num_speculative_tokens": 1,
},
trust_remote_code=True,
enforce_eager=False,
max_model_len=2000,
additional_config = {
'torchair_graph_config': {
'enabled': True,
"graph_batch_sizes": [16],
'enable_multistream_shared_expert': False,
},
"ascend_scheduler_config": {
"enabled": True
},
# 'expert_tensor_parallel_size': 16,
}
)
# Generate texts from the prompts.
# llm.start_profile()
outputs = llm.generate(prompts, sampling_params)
# llm.stop_profile()
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
- vLLM version: v0.10.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/302962e806e9820643ae25987e8e38ed035e05d3
---------
Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-08-06 19:37:43 +08:00
|
|
|
not self.runner.with_prefill
|
|
|
|
|
|
|
|
|
|
if is_running_torchair:
|
2025-09-18 14:05:33 +08:00
|
|
|
# Torchair graph mode, padding is same as the main model
|
[V1] MTP supports torchair (#2145)
### What this PR does / why we need it?
Support MTP with:
- [x] V0 Scheduler
- [x] TorchAir
- [x] Single DP
- [x] Multi DP
- [x] Disaggregate PD
Known issues:
- [ ] Not support V1 Scheduler (chunked prefill), will be supported in a
few weeks
- [ ] vllm v0.10.0 does not support metrics with `DP > 1` right now,
need to comment out the line 171-175 in file
`vllm/vllm/v1/metrics/loggers.py`
```
if (len(self.engine_indexes) > 1
and vllm_config.speculative_config is not None):
raise NotImplementedError("Prometheus metrics with Spec Decoding "
"with >1 EngineCore per AsyncLLM is not "
"supported yet.")
```
To start an online server with torchair enabled, here is an example:
```
python -m vllm.entrypoints.openai.api_server \
--model="/weights/DeepSeek-R1_w8a8/" \
--trust-remote-code \
--max-model-len 40000 \
--tensor-parallel-size 4 \
--data_parallel_size 4 \
--max-num-seqs 16 \
--no-enable-prefix-caching \
--enable_expert_parallel \
--served-model-name deepseekr1 \
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
--quantization ascend \
--host 0.0.0.0 \
--port 1234 \
--additional-config '{"ascend_scheduler_config":{"enabled":true,"enable_chunked_prefill":false},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]},"enable_weight_nz_layout":true}' \
--gpu_memory_utilization 0.9
```
offline example with torchair enabled
```
from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=16, temperature=0)
# Create an LLM.
llm = LLM(
model="/home/data/DeepSeek-R1_w8a8/",
tensor_parallel_size=16,
max_num_seqs=16,
gpu_memory_utilization=0.9,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method": "deepseek_mtp",
"num_speculative_tokens": 1,
},
trust_remote_code=True,
enforce_eager=False,
max_model_len=2000,
additional_config = {
'torchair_graph_config': {
'enabled': True,
"graph_batch_sizes": [16],
'enable_multistream_shared_expert': False,
},
"ascend_scheduler_config": {
"enabled": True
},
# 'expert_tensor_parallel_size': 16,
}
)
# Generate texts from the prompts.
# llm.start_profile()
outputs = llm.generate(prompts, sampling_params)
# llm.stop_profile()
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
- vLLM version: v0.10.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/302962e806e9820643ae25987e8e38ed035e05d3
---------
Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-08-06 19:37:43 +08:00
|
|
|
num_input_tokens = self.runner.graph_pad_size
|
2025-09-18 14:05:33 +08:00
|
|
|
elif (self.runner.use_aclgraph
|
|
|
|
|
and num_tokens <= self.runner.aclgraph_batch_sizes[-1]):
|
|
|
|
|
# Acl graph mode, add padding to the batch size
|
|
|
|
|
num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)
|
[V1] MTP supports torchair (#2145)
### What this PR does / why we need it?
Support MTP with:
- [x] V0 Scheduler
- [x] TorchAir
- [x] Single DP
- [x] Multi DP
- [x] Disaggregate PD
Known issues:
- [ ] Not support V1 Scheduler (chunked prefill), will be supported in a
few weeks
- [ ] vllm v0.10.0 does not support metrics with `DP > 1` right now,
need to comment out the line 171-175 in file
`vllm/vllm/v1/metrics/loggers.py`
```
if (len(self.engine_indexes) > 1
and vllm_config.speculative_config is not None):
raise NotImplementedError("Prometheus metrics with Spec Decoding "
"with >1 EngineCore per AsyncLLM is not "
"supported yet.")
```
To start an online server with torchair enabled, here is an example:
```
python -m vllm.entrypoints.openai.api_server \
--model="/weights/DeepSeek-R1_w8a8/" \
--trust-remote-code \
--max-model-len 40000 \
--tensor-parallel-size 4 \
--data_parallel_size 4 \
--max-num-seqs 16 \
--no-enable-prefix-caching \
--enable_expert_parallel \
--served-model-name deepseekr1 \
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
--quantization ascend \
--host 0.0.0.0 \
--port 1234 \
--additional-config '{"ascend_scheduler_config":{"enabled":true,"enable_chunked_prefill":false},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]},"enable_weight_nz_layout":true}' \
--gpu_memory_utilization 0.9
```
offline example with torchair enabled
```
from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=16, temperature=0)
# Create an LLM.
llm = LLM(
model="/home/data/DeepSeek-R1_w8a8/",
tensor_parallel_size=16,
max_num_seqs=16,
gpu_memory_utilization=0.9,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method": "deepseek_mtp",
"num_speculative_tokens": 1,
},
trust_remote_code=True,
enforce_eager=False,
max_model_len=2000,
additional_config = {
'torchair_graph_config': {
'enabled': True,
"graph_batch_sizes": [16],
'enable_multistream_shared_expert': False,
},
"ascend_scheduler_config": {
"enabled": True
},
# 'expert_tensor_parallel_size': 16,
}
)
# Generate texts from the prompts.
# llm.start_profile()
outputs = llm.generate(prompts, sampling_params)
# llm.stop_profile()
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
- vLLM version: v0.10.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/302962e806e9820643ae25987e8e38ed035e05d3
---------
Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-08-06 19:37:43 +08:00
|
|
|
else:
|
2025-09-18 14:05:33 +08:00
|
|
|
# Eager mode, no padding needed
|
[V1] MTP supports torchair (#2145)
### What this PR does / why we need it?
Support MTP with:
- [x] V0 Scheduler
- [x] TorchAir
- [x] Single DP
- [x] Multi DP
- [x] Disaggregate PD
Known issues:
- [ ] Not support V1 Scheduler (chunked prefill), will be supported in a
few weeks
- [ ] vllm v0.10.0 does not support metrics with `DP > 1` right now,
need to comment out the line 171-175 in file
`vllm/vllm/v1/metrics/loggers.py`
```
if (len(self.engine_indexes) > 1
and vllm_config.speculative_config is not None):
raise NotImplementedError("Prometheus metrics with Spec Decoding "
"with >1 EngineCore per AsyncLLM is not "
"supported yet.")
```
To start an online server with torchair enabled, here is an example:
```
python -m vllm.entrypoints.openai.api_server \
--model="/weights/DeepSeek-R1_w8a8/" \
--trust-remote-code \
--max-model-len 40000 \
--tensor-parallel-size 4 \
--data_parallel_size 4 \
--max-num-seqs 16 \
--no-enable-prefix-caching \
--enable_expert_parallel \
--served-model-name deepseekr1 \
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
--quantization ascend \
--host 0.0.0.0 \
--port 1234 \
--additional-config '{"ascend_scheduler_config":{"enabled":true,"enable_chunked_prefill":false},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]},"enable_weight_nz_layout":true}' \
--gpu_memory_utilization 0.9
```
offline example with torchair enabled
```
from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=16, temperature=0)
# Create an LLM.
llm = LLM(
model="/home/data/DeepSeek-R1_w8a8/",
tensor_parallel_size=16,
max_num_seqs=16,
gpu_memory_utilization=0.9,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method": "deepseek_mtp",
"num_speculative_tokens": 1,
},
trust_remote_code=True,
enforce_eager=False,
max_model_len=2000,
additional_config = {
'torchair_graph_config': {
'enabled': True,
"graph_batch_sizes": [16],
'enable_multistream_shared_expert': False,
},
"ascend_scheduler_config": {
"enabled": True
},
# 'expert_tensor_parallel_size': 16,
}
)
# Generate texts from the prompts.
# llm.start_profile()
outputs = llm.generate(prompts, sampling_params)
# llm.stop_profile()
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
- vLLM version: v0.10.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/302962e806e9820643ae25987e8e38ed035e05d3
---------
Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-08-06 19:37:43 +08:00
|
|
|
num_input_tokens = num_tokens
|
2025-05-30 08:59:58 +08:00
|
|
|
|
2025-08-20 09:01:04 +08:00
|
|
|
seq_lens = target_positions[last_token_indices] + 1
|
|
|
|
|
seq_lens = seq_lens.int()
|
|
|
|
|
common_attn_metadata = AscendCommonAttentionMetadata(
|
|
|
|
|
query_start_loc=cu_num_tokens[:batch_size + 1],
|
|
|
|
|
query_start_loc_cpu=cu_num_tokens[:batch_size + 1].cpu(),
|
|
|
|
|
seq_lens_cpu=seq_lens.cpu(),
|
2025-05-30 08:59:58 +08:00
|
|
|
num_reqs=batch_size,
|
|
|
|
|
num_actual_tokens=num_tokens,
|
|
|
|
|
max_query_len=max_query_len,
|
2025-08-20 09:01:04 +08:00
|
|
|
actual_seq_lengths_q=self.runner.actual_seq_lengths_q,
|
|
|
|
|
block_table_tensor=self.runner.input_batch.block_table[0].
|
|
|
|
|
get_device_tensor(),
|
2025-09-22 17:14:28 +08:00
|
|
|
slot_mapping=target_slot_mapping,
|
2025-08-20 09:01:04 +08:00
|
|
|
positions=target_positions,
|
|
|
|
|
attn_mask=self.runner.attn_mask,
|
|
|
|
|
spec_attn_mask=self.runner.spec_attn_mask,
|
|
|
|
|
attn_state=self.runner.attn_state,
|
|
|
|
|
graph_pad_size=self.runner.graph_pad_size,
|
|
|
|
|
decode_token_per_req=self.runner.decode_token_per_req,
|
2025-09-16 01:17:42 +08:00
|
|
|
num_computed_tokens_cpu=None,
|
|
|
|
|
seq_lens=None)
|
|
|
|
|
|
|
|
|
|
if not self.torchair_graph_enabled:
|
2025-10-09 10:28:38 +08:00
|
|
|
builder = self.runner.attn_groups[0][0].get_metadata_builder()
|
2025-09-16 01:17:42 +08:00
|
|
|
attn_metadata_mtp = builder.build(0, common_attn_metadata,
|
|
|
|
|
self.runner.get_model())
|
|
|
|
|
|
|
|
|
|
attn_metadata = {}
|
|
|
|
|
for layer_name in self.attn_layer_name:
|
|
|
|
|
attn_metadata[layer_name] = attn_metadata_mtp
|
|
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
attn_metadata = self.runner.attn_metadata_builder.build(
|
|
|
|
|
0, common_attn_metadata, self.runner.get_model())
|
2025-05-30 08:59:58 +08:00
|
|
|
|
[V1] MTP supports torchair (#2145)
### What this PR does / why we need it?
Support MTP with:
- [x] V0 Scheduler
- [x] TorchAir
- [x] Single DP
- [x] Multi DP
- [x] Disaggregate PD
Known issues:
- [ ] Not support V1 Scheduler (chunked prefill), will be supported in a
few weeks
- [ ] vllm v0.10.0 does not support metrics with `DP > 1` right now,
need to comment out the line 171-175 in file
`vllm/vllm/v1/metrics/loggers.py`
```
if (len(self.engine_indexes) > 1
and vllm_config.speculative_config is not None):
raise NotImplementedError("Prometheus metrics with Spec Decoding "
"with >1 EngineCore per AsyncLLM is not "
"supported yet.")
```
To start an online server with torchair enabled, here is an example:
```
python -m vllm.entrypoints.openai.api_server \
--model="/weights/DeepSeek-R1_w8a8/" \
--trust-remote-code \
--max-model-len 40000 \
--tensor-parallel-size 4 \
--data_parallel_size 4 \
--max-num-seqs 16 \
--no-enable-prefix-caching \
--enable_expert_parallel \
--served-model-name deepseekr1 \
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
--quantization ascend \
--host 0.0.0.0 \
--port 1234 \
--additional-config '{"ascend_scheduler_config":{"enabled":true,"enable_chunked_prefill":false},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]},"enable_weight_nz_layout":true}' \
--gpu_memory_utilization 0.9
```
offline example with torchair enabled
```
from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=16, temperature=0)
# Create an LLM.
llm = LLM(
model="/home/data/DeepSeek-R1_w8a8/",
tensor_parallel_size=16,
max_num_seqs=16,
gpu_memory_utilization=0.9,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method": "deepseek_mtp",
"num_speculative_tokens": 1,
},
trust_remote_code=True,
enforce_eager=False,
max_model_len=2000,
additional_config = {
'torchair_graph_config': {
'enabled': True,
"graph_batch_sizes": [16],
'enable_multistream_shared_expert': False,
},
"ascend_scheduler_config": {
"enabled": True
},
# 'expert_tensor_parallel_size': 16,
}
)
# Generate texts from the prompts.
# llm.start_profile()
outputs = llm.generate(prompts, sampling_params)
# llm.stop_profile()
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
- vLLM version: v0.10.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/302962e806e9820643ae25987e8e38ed035e05d3
---------
Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-08-06 19:37:43 +08:00
|
|
|
self.positions[:num_tokens] = target_positions
|
|
|
|
|
self.hidden_states[:num_tokens] = target_hidden_states
|
|
|
|
|
|
2025-08-21 08:54:57 +08:00
|
|
|
if not self.torchair_graph_enabled:
|
[V1] MTP supports torchair (#2145)
### What this PR does / why we need it?
Support MTP with:
- [x] V0 Scheduler
- [x] TorchAir
- [x] Single DP
- [x] Multi DP
- [x] Disaggregate PD
Known issues:
- [ ] Not support V1 Scheduler (chunked prefill), will be supported in a
few weeks
- [ ] vllm v0.10.0 does not support metrics with `DP > 1` right now,
need to comment out the line 171-175 in file
`vllm/vllm/v1/metrics/loggers.py`
```
if (len(self.engine_indexes) > 1
and vllm_config.speculative_config is not None):
raise NotImplementedError("Prometheus metrics with Spec Decoding "
"with >1 EngineCore per AsyncLLM is not "
"supported yet.")
```
To start an online server with torchair enabled, here is an example:
```
python -m vllm.entrypoints.openai.api_server \
--model="/weights/DeepSeek-R1_w8a8/" \
--trust-remote-code \
--max-model-len 40000 \
--tensor-parallel-size 4 \
--data_parallel_size 4 \
--max-num-seqs 16 \
--no-enable-prefix-caching \
--enable_expert_parallel \
--served-model-name deepseekr1 \
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
--quantization ascend \
--host 0.0.0.0 \
--port 1234 \
--additional-config '{"ascend_scheduler_config":{"enabled":true,"enable_chunked_prefill":false},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]},"enable_weight_nz_layout":true}' \
--gpu_memory_utilization 0.9
```
offline example with torchair enabled
```
from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=16, temperature=0)
# Create an LLM.
llm = LLM(
model="/home/data/DeepSeek-R1_w8a8/",
tensor_parallel_size=16,
max_num_seqs=16,
gpu_memory_utilization=0.9,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method": "deepseek_mtp",
"num_speculative_tokens": 1,
},
trust_remote_code=True,
enforce_eager=False,
max_model_len=2000,
additional_config = {
'torchair_graph_config': {
'enabled': True,
"graph_batch_sizes": [16],
'enable_multistream_shared_expert': False,
},
"ascend_scheduler_config": {
"enabled": True
},
# 'expert_tensor_parallel_size': 16,
}
)
# Generate texts from the prompts.
# llm.start_profile()
outputs = llm.generate(prompts, sampling_params)
# llm.stop_profile()
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
- vLLM version: v0.10.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/302962e806e9820643ae25987e8e38ed035e05d3
---------
Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-08-06 19:37:43 +08:00
|
|
|
# torch mode needs to update num_tokens_across_dp
|
|
|
|
|
# TODO: adapt enable_dbo later
|
|
|
|
|
(num_input_tokens, num_tokens_across_dp, with_prefill,
|
[Fix] Fix DP-related padding logic (#2582)
### What this PR does / why we need it?
The determination of attention state, padding, and other forward
metadata has been moved to an earlier stage within the input preparation
process. This change enables us to utilize a single all-reduce
operation, maximizing synchronization efficiency as early as possible.
The logic for synchronizing metadata—such as the number of tokens,
prefill status, and DBO status—across data parallel (DP) ranks has now
been unified and simplified.
For performance improvements, the all-reduce operation has been switched
from the `gloo` backend to the `npu` backend, which results in a
reduction of several milliseconds per step (**approximately 10%
performance gain for TPOT!**).
Additionally, the multi-DP server hang issue has been resolved, ensuring
no more hangs occur when `num_requests < dp_size`. At last, a relief.
Finally, the miscalculated memory usage issue has been addressed by
removing the unnecessary `DummyCommImpl`, allowing the system to use the
real communication method when determining available memory.
### Does this PR introduce _any_ user-facing change?
None.
### How was this patch tested?
Maybe we should add a test case for the multi-DP online server?
@MengqingCao
- vLLM version: v0.10.1.1
- vLLM main:
https://github.com/vllm-project/vllm/commit/c5d004aaaf3b2106d33974c673bec0568c18f762
---------
Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-08-28 19:39:58 +08:00
|
|
|
_) = self.runner._sync_metadata_across_dp(
|
2025-09-18 14:05:33 +08:00
|
|
|
num_input_tokens, self.runner.with_prefill, False)
|
[V1] MTP supports torchair (#2145)
### What this PR does / why we need it?
Support MTP with:
- [x] V0 Scheduler
- [x] TorchAir
- [x] Single DP
- [x] Multi DP
- [x] Disaggregate PD
Known issues:
- [ ] Not support V1 Scheduler (chunked prefill), will be supported in a
few weeks
- [ ] vllm v0.10.0 does not support metrics with `DP > 1` right now,
need to comment out the line 171-175 in file
`vllm/vllm/v1/metrics/loggers.py`
```
if (len(self.engine_indexes) > 1
and vllm_config.speculative_config is not None):
raise NotImplementedError("Prometheus metrics with Spec Decoding "
"with >1 EngineCore per AsyncLLM is not "
"supported yet.")
```
To start an online server with torchair enabled, here is an example:
```
python -m vllm.entrypoints.openai.api_server \
--model="/weights/DeepSeek-R1_w8a8/" \
--trust-remote-code \
--max-model-len 40000 \
--tensor-parallel-size 4 \
--data_parallel_size 4 \
--max-num-seqs 16 \
--no-enable-prefix-caching \
--enable_expert_parallel \
--served-model-name deepseekr1 \
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
--quantization ascend \
--host 0.0.0.0 \
--port 1234 \
--additional-config '{"ascend_scheduler_config":{"enabled":true,"enable_chunked_prefill":false},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]},"enable_weight_nz_layout":true}' \
--gpu_memory_utilization 0.9
```
offline example with torchair enabled
```
from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=16, temperature=0)
# Create an LLM.
llm = LLM(
model="/home/data/DeepSeek-R1_w8a8/",
tensor_parallel_size=16,
max_num_seqs=16,
gpu_memory_utilization=0.9,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method": "deepseek_mtp",
"num_speculative_tokens": 1,
},
trust_remote_code=True,
enforce_eager=False,
max_model_len=2000,
additional_config = {
'torchair_graph_config': {
'enabled': True,
"graph_batch_sizes": [16],
'enable_multistream_shared_expert': False,
},
"ascend_scheduler_config": {
"enabled": True
},
# 'expert_tensor_parallel_size': 16,
}
)
# Generate texts from the prompts.
# llm.start_profile()
outputs = llm.generate(prompts, sampling_params)
# llm.stop_profile()
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
- vLLM version: v0.10.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/302962e806e9820643ae25987e8e38ed035e05d3
---------
Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-08-06 19:37:43 +08:00
|
|
|
else:
|
|
|
|
|
# torchair mode can reuse self.runner.num_tokens_across_dp
|
|
|
|
|
num_tokens_across_dp = self.runner.num_tokens_across_dp
|
|
|
|
|
with_prefill = self.runner.with_prefill
|
|
|
|
|
|
2025-09-22 19:12:58 +08:00
|
|
|
moe_comm_type = self.runner._select_moe_comm_method(
|
2025-09-16 11:06:00 +08:00
|
|
|
num_input_tokens, with_prefill)
|
2025-09-18 14:05:33 +08:00
|
|
|
batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens,
|
|
|
|
|
uniform_decode=False)
|
|
|
|
|
aclgraph_runtime_mode, batch_descriptor = \
|
|
|
|
|
self.runner.aclgraph_dispatcher.dispatch(batch_descriptor)
|
2025-09-16 11:06:00 +08:00
|
|
|
|
2025-09-05 09:11:22 +08:00
|
|
|
for step in range(self.num_speculative_tokens):
|
|
|
|
|
with set_ascend_forward_context(
|
|
|
|
|
attn_metadata,
|
|
|
|
|
self.vllm_config,
|
|
|
|
|
num_tokens=num_input_tokens,
|
|
|
|
|
with_prefill=with_prefill,
|
|
|
|
|
num_tokens_across_dp=num_tokens_across_dp,
|
|
|
|
|
reserved_mc2_mask=self.runner.reserved_mc2_mask,
|
2025-09-22 19:12:58 +08:00
|
|
|
moe_comm_type=moe_comm_type,
|
2025-09-18 14:05:33 +08:00
|
|
|
aclgraph_runtime_mode=aclgraph_runtime_mode,
|
2025-10-17 18:14:49 +08:00
|
|
|
batch_descriptor=batch_descriptor,
|
2025-09-05 09:11:22 +08:00
|
|
|
in_profile_run=self.runner.in_profile_run,
|
|
|
|
|
num_actual_tokens=num_tokens):
|
|
|
|
|
with ProfileExecuteDuration().capture_async('mtp_forward'):
|
|
|
|
|
model_kwargs = {}
|
|
|
|
|
model_kwargs["attn_metadata"] = attn_metadata
|
|
|
|
|
if self.torchair_graph_enabled:
|
|
|
|
|
model_kwargs["kv_caches"] = self.runner.kv_caches[-1:]
|
|
|
|
|
if is_running_torchair:
|
|
|
|
|
torchair_compiled_model = self._get_torchair_lazy_compiled_model(
|
|
|
|
|
num_input_tokens)
|
|
|
|
|
hidden_states = torchair_compiled_model(
|
|
|
|
|
input_ids=self.input_ids[:num_input_tokens],
|
|
|
|
|
positions=self.positions[:num_input_tokens],
|
|
|
|
|
previous_hidden_states=self.
|
|
|
|
|
hidden_states[:num_input_tokens],
|
|
|
|
|
inputs_embeds=None,
|
|
|
|
|
intermediate_tensors=None,
|
|
|
|
|
spec_step_idx=0,
|
|
|
|
|
**model_kwargs)
|
|
|
|
|
else:
|
|
|
|
|
hidden_states = self.model(
|
|
|
|
|
input_ids=self.input_ids[:num_input_tokens],
|
|
|
|
|
positions=self.positions[:num_input_tokens],
|
|
|
|
|
previous_hidden_states=self.
|
|
|
|
|
hidden_states[:num_input_tokens],
|
|
|
|
|
kv_caches=self.runner.kv_caches[-1:])
|
|
|
|
|
|
|
|
|
|
num_indices = last_token_indices.shape[0]
|
|
|
|
|
if lmhead_tp_enable():
|
|
|
|
|
if not self.runner.with_prefill:
|
|
|
|
|
max_num_reqs_across_dp = num_input_tokens
|
[V1] MTP supports torchair (#2145)
### What this PR does / why we need it?
Support MTP with:
- [x] V0 Scheduler
- [x] TorchAir
- [x] Single DP
- [x] Multi DP
- [x] Disaggregate PD
Known issues:
- [ ] Not support V1 Scheduler (chunked prefill), will be supported in a
few weeks
- [ ] vllm v0.10.0 does not support metrics with `DP > 1` right now,
need to comment out the line 171-175 in file
`vllm/vllm/v1/metrics/loggers.py`
```
if (len(self.engine_indexes) > 1
and vllm_config.speculative_config is not None):
raise NotImplementedError("Prometheus metrics with Spec Decoding "
"with >1 EngineCore per AsyncLLM is not "
"supported yet.")
```
To start an online server with torchair enabled, here is an example:
```
python -m vllm.entrypoints.openai.api_server \
--model="/weights/DeepSeek-R1_w8a8/" \
--trust-remote-code \
--max-model-len 40000 \
--tensor-parallel-size 4 \
--data_parallel_size 4 \
--max-num-seqs 16 \
--no-enable-prefix-caching \
--enable_expert_parallel \
--served-model-name deepseekr1 \
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
--quantization ascend \
--host 0.0.0.0 \
--port 1234 \
--additional-config '{"ascend_scheduler_config":{"enabled":true,"enable_chunked_prefill":false},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]},"enable_weight_nz_layout":true}' \
--gpu_memory_utilization 0.9
```
offline example with torchair enabled
```
from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=16, temperature=0)
# Create an LLM.
llm = LLM(
model="/home/data/DeepSeek-R1_w8a8/",
tensor_parallel_size=16,
max_num_seqs=16,
gpu_memory_utilization=0.9,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method": "deepseek_mtp",
"num_speculative_tokens": 1,
},
trust_remote_code=True,
enforce_eager=False,
max_model_len=2000,
additional_config = {
'torchair_graph_config': {
'enabled': True,
"graph_batch_sizes": [16],
'enable_multistream_shared_expert': False,
},
"ascend_scheduler_config": {
"enabled": True
},
# 'expert_tensor_parallel_size': 16,
}
)
# Generate texts from the prompts.
# llm.start_profile()
outputs = llm.generate(prompts, sampling_params)
# llm.stop_profile()
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
- vLLM version: v0.10.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/302962e806e9820643ae25987e8e38ed035e05d3
---------
Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-08-06 19:37:43 +08:00
|
|
|
else:
|
2025-09-05 09:11:22 +08:00
|
|
|
max_num_reqs_across_dp = self.vllm_config.scheduler_config.max_num_seqs
|
|
|
|
|
last_token_indices = nn.functional.pad(
|
|
|
|
|
last_token_indices,
|
|
|
|
|
(0, max_num_reqs_across_dp - num_indices))
|
|
|
|
|
|
|
|
|
|
sample_hidden_states = hidden_states[last_token_indices]
|
|
|
|
|
logits = self.model.compute_logits(sample_hidden_states, None)
|
|
|
|
|
if lmhead_tp_enable() and num_indices < logits.shape[0]:
|
|
|
|
|
logits = logits[:num_indices]
|
|
|
|
|
draft_token_ids = logits.argmax(dim=-1)
|
|
|
|
|
|
|
|
|
|
if self.num_speculative_tokens == 1:
|
|
|
|
|
# [batch_size, 1]
|
|
|
|
|
return draft_token_ids.view(-1, 1)
|
|
|
|
|
|
|
|
|
|
if step == 0:
|
|
|
|
|
draft_token_ids_list = [draft_token_ids]
|
2025-08-29 11:41:21 +08:00
|
|
|
else:
|
2025-09-05 09:11:22 +08:00
|
|
|
draft_token_ids_list.append(draft_token_ids)
|
|
|
|
|
|
|
|
|
|
# prepare next mtp inputs
|
|
|
|
|
# mtp>1: prefill skip or decode skip last loop
|
|
|
|
|
if with_prefill and self.torchair_graph_enabled:
|
|
|
|
|
for _ in range(self.num_speculative_tokens - 1):
|
|
|
|
|
draft_token_ids_list.append(draft_token_ids)
|
|
|
|
|
if step == self.num_speculative_tokens - 1 or with_prefill:
|
|
|
|
|
break
|
|
|
|
|
|
2025-09-16 01:17:42 +08:00
|
|
|
if not self.torchair_graph_enabled:
|
|
|
|
|
attn_metadata_i = attn_metadata[self.attn_layer_name[0]]
|
|
|
|
|
else:
|
|
|
|
|
attn_metadata_i = attn_metadata
|
|
|
|
|
|
2025-09-05 09:11:22 +08:00
|
|
|
if step == 0:
|
|
|
|
|
positions = target_positions[last_token_indices]
|
|
|
|
|
hidden_states = hidden_states[last_token_indices]
|
2025-09-16 01:17:42 +08:00
|
|
|
slot_mapping = attn_metadata_i.slot_mapping[last_token_indices]
|
|
|
|
|
attn_metadata_i.slot_mapping.fill_(-1)
|
|
|
|
|
attn_metadata_i.query_start_loc = self.arange[:batch_size + 1]
|
2025-09-05 09:11:22 +08:00
|
|
|
last_token_indices = self.arange[:batch_size]
|
2025-09-16 01:17:42 +08:00
|
|
|
if attn_metadata_i.num_decode_tokens != 0:
|
|
|
|
|
attn_metadata_i.num_decode_tokens = batch_size
|
2025-09-05 09:11:22 +08:00
|
|
|
if is_running_torchair:
|
2025-09-16 01:17:42 +08:00
|
|
|
attn_metadata_i.num_actual_tokens = batch_size
|
|
|
|
|
attn_metadata_i.query_lens = [1] * batch_size
|
2025-09-05 09:11:22 +08:00
|
|
|
|
|
|
|
|
input_ids = draft_token_ids_list[-1].int()
|
|
|
|
|
positions += 1
|
|
|
|
|
|
2025-10-09 19:22:46 +08:00
|
|
|
if not self.torchair_graph_enabled:
|
|
|
|
|
attn_metadata_i.decode.actual_seq_lengths_q = attn_metadata_i.query_start_loc[
|
|
|
|
|
1:batch_size + 1].tolist()
|
|
|
|
|
attn_metadata_i.decode.cos = builder.cos_cache[
|
|
|
|
|
positions].unsqueeze(1).unsqueeze(2)
|
|
|
|
|
attn_metadata_i.decode.sin = builder.sin_cache[
|
|
|
|
|
positions].unsqueeze(1).unsqueeze(2)
|
|
|
|
|
|
2025-09-05 09:11:22 +08:00
|
|
|
# NOTE(woosuk): We should handle the case where the draft model
|
|
|
|
|
# generates tokens beyond the max model length. Since it is complex
|
|
|
|
|
# to remove such requests from the batch, we keep them in the batch
|
|
|
|
|
# but adjust the position ids and slot mappings to avoid the
|
|
|
|
|
# out-of-range access during the model execution. The draft tokens
|
|
|
|
|
# generated with this adjustment should be ignored.
|
|
|
|
|
exceeds_max_model_len = positions >= self.runner.model_config.max_model_len
|
|
|
|
|
# Mask out the position ids that exceed the max model length.
|
|
|
|
|
# Otherwise, we may get out-of-range error in RoPE.
|
|
|
|
|
clamped_positions = torch.where(exceeds_max_model_len, 0,
|
|
|
|
|
positions)
|
|
|
|
|
# Increment the sequence lengths.
|
2025-09-16 01:17:42 +08:00
|
|
|
attn_metadata_i.seq_lens[:batch_size] += 1
|
2025-09-05 09:11:22 +08:00
|
|
|
# For the requests that exceed the max model length, we set the
|
|
|
|
|
# sequence length to 1 to minimize their overheads in attention.
|
|
|
|
|
exceeds_max_model_len_cpu = exceeds_max_model_len.to(
|
2025-09-16 01:17:42 +08:00
|
|
|
attn_metadata_i.seq_lens.device, non_blocking=True)
|
|
|
|
|
attn_metadata_i.seq_lens[:batch_size].masked_fill_(
|
2025-09-05 09:11:22 +08:00
|
|
|
exceeds_max_model_len_cpu, 1)
|
|
|
|
|
# Mask out the slot mappings that exceed the max model length.
|
|
|
|
|
# Otherwise, the KV cache will be inadvertently updated with the
|
|
|
|
|
# padding tokens.
|
|
|
|
|
slot_mapping += 1
|
|
|
|
|
slot_mapping.masked_fill_(exceeds_max_model_len, PADDING_SLOT_ID)
|
|
|
|
|
|
|
|
|
|
# copy inputs to buffer for cudagraph
|
|
|
|
|
self.input_ids[:batch_size] = input_ids
|
|
|
|
|
self.positions[:batch_size] = clamped_positions
|
2025-09-26 09:04:16 +08:00
|
|
|
self.hidden_states[:hidden_states.shape[0]] = hidden_states
|
2025-09-16 01:17:42 +08:00
|
|
|
attn_metadata_i.slot_mapping[:batch_size] = slot_mapping
|
|
|
|
|
|
|
|
|
|
if attn_metadata_i.prefill is not None:
|
|
|
|
|
attn_metadata_i.prefill.seq_lens = attn_metadata_i.seq_lens
|
2025-10-09 19:22:46 +08:00
|
|
|
attn_metadata_i.prefill.seq_lens_list = attn_metadata_i.prefill.seq_lens.tolist(
|
|
|
|
|
)
|
2025-09-16 01:17:42 +08:00
|
|
|
attn_metadata_i.prefill.context_lens = attn_metadata_i.seq_lens
|
|
|
|
|
attn_metadata_i.prefill.input_positions = self.positions[:
|
|
|
|
|
num_input_tokens]
|
|
|
|
|
attn_metadata_i.prefill.max_seq_lens += 1
|
|
|
|
|
attn_metadata_i.prefill.max_seq_lens = min(
|
|
|
|
|
attn_metadata_i.prefill.max_seq_lens,
|
2025-09-05 09:11:22 +08:00
|
|
|
self.runner.model_config.max_model_len)
|
2025-09-16 01:17:42 +08:00
|
|
|
if attn_metadata_i.decode is not None:
|
|
|
|
|
attn_metadata_i.decode.seq_lens = attn_metadata_i.seq_lens
|
2025-10-09 19:22:46 +08:00
|
|
|
attn_metadata_i.decode.seq_lens_list = attn_metadata_i.decode.seq_lens.tolist(
|
|
|
|
|
)
|
2025-09-16 01:17:42 +08:00
|
|
|
attn_metadata_i.decode.input_positions = self.positions[:
|
|
|
|
|
num_input_tokens]
|
|
|
|
|
attn_metadata_i.decode.max_seq_lens += 1
|
|
|
|
|
attn_metadata_i.decode.max_seq_lens = min(
|
|
|
|
|
attn_metadata_i.decode.max_seq_lens,
|
2025-09-05 09:11:22 +08:00
|
|
|
self.runner.model_config.max_model_len)
|
|
|
|
|
|
|
|
|
|
# mtp>1: [batch_size, k]
|
|
|
|
|
draft_token_ids = torch.stack(draft_token_ids_list, dim=1)
|
|
|
|
|
return draft_token_ids
|
2025-05-30 08:59:58 +08:00
|
|
|
|
[V1] MTP supports torchair (#2145)
### What this PR does / why we need it?
Support MTP with:
- [x] V0 Scheduler
- [x] TorchAir
- [x] Single DP
- [x] Multi DP
- [x] Disaggregate PD
Known issues:
- [ ] Not support V1 Scheduler (chunked prefill), will be supported in a
few weeks
- [ ] vllm v0.10.0 does not support metrics with `DP > 1` right now,
need to comment out the line 171-175 in file
`vllm/vllm/v1/metrics/loggers.py`
```
if (len(self.engine_indexes) > 1
and vllm_config.speculative_config is not None):
raise NotImplementedError("Prometheus metrics with Spec Decoding "
"with >1 EngineCore per AsyncLLM is not "
"supported yet.")
```
To start an online server with torchair enabled, here is an example:
```
python -m vllm.entrypoints.openai.api_server \
--model="/weights/DeepSeek-R1_w8a8/" \
--trust-remote-code \
--max-model-len 40000 \
--tensor-parallel-size 4 \
--data_parallel_size 4 \
--max-num-seqs 16 \
--no-enable-prefix-caching \
--enable_expert_parallel \
--served-model-name deepseekr1 \
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
--quantization ascend \
--host 0.0.0.0 \
--port 1234 \
--additional-config '{"ascend_scheduler_config":{"enabled":true,"enable_chunked_prefill":false},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]},"enable_weight_nz_layout":true}' \
--gpu_memory_utilization 0.9
```
offline example with torchair enabled
```
from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=16, temperature=0)
# Create an LLM.
llm = LLM(
model="/home/data/DeepSeek-R1_w8a8/",
tensor_parallel_size=16,
max_num_seqs=16,
gpu_memory_utilization=0.9,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method": "deepseek_mtp",
"num_speculative_tokens": 1,
},
trust_remote_code=True,
enforce_eager=False,
max_model_len=2000,
additional_config = {
'torchair_graph_config': {
'enabled': True,
"graph_batch_sizes": [16],
'enable_multistream_shared_expert': False,
},
"ascend_scheduler_config": {
"enabled": True
},
# 'expert_tensor_parallel_size': 16,
}
)
# Generate texts from the prompts.
# llm.start_profile()
outputs = llm.generate(prompts, sampling_params)
# llm.stop_profile()
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
- vLLM version: v0.10.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/302962e806e9820643ae25987e8e38ed035e05d3
---------
Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-08-06 19:37:43 +08:00
|
|
|
def _get_torchair_lazy_compiled_model(self, batch_size: int):
|
|
|
|
|
if batch_size < 0 or batch_size > self.runner.torchair_graph_batch_sizes[
|
|
|
|
|
-1]:
|
|
|
|
|
raise ValueError(
|
|
|
|
|
f"Bad graph batch size:{batch_size}! max_graph_batch_sizes:{self.runner.torchair_graph_batch_sizes[-1]}"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
compiled_model = self.torchair_compiled_models.get(
|
|
|
|
|
batch_size
|
|
|
|
|
) if self.runner.use_cached_npu_graph else self.torchair_compiled_model
|
|
|
|
|
|
|
|
|
|
if compiled_model:
|
|
|
|
|
return compiled_model
|
|
|
|
|
|
|
|
|
|
patch_for_hcom()
|
|
|
|
|
config = torchair.CompilerConfig()
|
|
|
|
|
config.experimental_config.frozen_parameter = True
|
|
|
|
|
config.experimental_config.tiling_schedule_optimize = True
|
|
|
|
|
config.experimental_config.enable_view_optimize = \
|
|
|
|
|
get_ascend_config().torchair_graph_config.enable_view_optimize
|
|
|
|
|
torch.npu.set_compile_mode(jit_compile=False)
|
|
|
|
|
if not self.runner.use_cached_npu_graph:
|
|
|
|
|
npu_backend = torchair.get_npu_backend(compiler_config=config)
|
2025-09-30 03:25:58 +08:00
|
|
|
self.torchair_compiled_model = torch.compile(
|
|
|
|
|
self.model,
|
2025-10-15 17:48:58 +08:00
|
|
|
dynamic=not self.use_sparse,
|
2025-09-30 03:25:58 +08:00
|
|
|
fullgraph=True,
|
|
|
|
|
backend=npu_backend)
|
[V1] MTP supports torchair (#2145)
### What this PR does / why we need it?
Support MTP with:
- [x] V0 Scheduler
- [x] TorchAir
- [x] Single DP
- [x] Multi DP
- [x] Disaggregate PD
Known issues:
- [ ] Not support V1 Scheduler (chunked prefill), will be supported in a
few weeks
- [ ] vllm v0.10.0 does not support metrics with `DP > 1` right now,
need to comment out the line 171-175 in file
`vllm/vllm/v1/metrics/loggers.py`
```
if (len(self.engine_indexes) > 1
and vllm_config.speculative_config is not None):
raise NotImplementedError("Prometheus metrics with Spec Decoding "
"with >1 EngineCore per AsyncLLM is not "
"supported yet.")
```
To start an online server with torchair enabled, here is an example:
```
python -m vllm.entrypoints.openai.api_server \
--model="/weights/DeepSeek-R1_w8a8/" \
--trust-remote-code \
--max-model-len 40000 \
--tensor-parallel-size 4 \
--data_parallel_size 4 \
--max-num-seqs 16 \
--no-enable-prefix-caching \
--enable_expert_parallel \
--served-model-name deepseekr1 \
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
--quantization ascend \
--host 0.0.0.0 \
--port 1234 \
--additional-config '{"ascend_scheduler_config":{"enabled":true,"enable_chunked_prefill":false},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]},"enable_weight_nz_layout":true}' \
--gpu_memory_utilization 0.9
```
offline example with torchair enabled
```
from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=16, temperature=0)
# Create an LLM.
llm = LLM(
model="/home/data/DeepSeek-R1_w8a8/",
tensor_parallel_size=16,
max_num_seqs=16,
gpu_memory_utilization=0.9,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method": "deepseek_mtp",
"num_speculative_tokens": 1,
},
trust_remote_code=True,
enforce_eager=False,
max_model_len=2000,
additional_config = {
'torchair_graph_config': {
'enabled': True,
"graph_batch_sizes": [16],
'enable_multistream_shared_expert': False,
},
"ascend_scheduler_config": {
"enabled": True
},
# 'expert_tensor_parallel_size': 16,
}
)
# Generate texts from the prompts.
# llm.start_profile()
outputs = llm.generate(prompts, sampling_params)
# llm.stop_profile()
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
- vLLM version: v0.10.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/302962e806e9820643ae25987e8e38ed035e05d3
---------
Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-08-06 19:37:43 +08:00
|
|
|
return self.torchair_compiled_model
|
|
|
|
|
else:
|
|
|
|
|
# Generate a new forward proxy code object to prevent the invalidation of
|
|
|
|
|
# compilation cache caused by dynamo retracing
|
|
|
|
|
forward_proxy_name = f"{self.model.__class__.__name__}_forward_with_batch_size_{batch_size}"
|
|
|
|
|
forward_fn = self.model.forward
|
|
|
|
|
code = forward_fn.__code__
|
|
|
|
|
# Mark code object with a new proxy name
|
|
|
|
|
modified_code = code.replace(co_name=forward_proxy_name, )
|
|
|
|
|
|
|
|
|
|
modified_func = types.FunctionType(modified_code,
|
|
|
|
|
forward_fn.__globals__,
|
|
|
|
|
name=forward_proxy_name,
|
|
|
|
|
argdefs=forward_fn.__defaults__)
|
|
|
|
|
|
|
|
|
|
self.model.__dict__[forward_proxy_name] = modified_func.__get__(
|
|
|
|
|
self.model, nn.Module)
|
|
|
|
|
self.torchair_compiled_models[
|
|
|
|
|
batch_size] = torchair.inference.cache_compile(
|
|
|
|
|
self.model.__dict__[forward_proxy_name],
|
2025-10-15 17:48:58 +08:00
|
|
|
dynamic=not self.use_sparse,
|
2025-09-19 17:22:30 -07:00
|
|
|
fullgraph=True,
|
2025-09-03 17:56:12 +08:00
|
|
|
cache_dir=TORCHAIR_CACHE_DIR,
|
[V1] MTP supports torchair (#2145)
### What this PR does / why we need it?
Support MTP with:
- [x] V0 Scheduler
- [x] TorchAir
- [x] Single DP
- [x] Multi DP
- [x] Disaggregate PD
Known issues:
- [ ] Not support V1 Scheduler (chunked prefill), will be supported in a
few weeks
- [ ] vllm v0.10.0 does not support metrics with `DP > 1` right now,
need to comment out the line 171-175 in file
`vllm/vllm/v1/metrics/loggers.py`
```
if (len(self.engine_indexes) > 1
and vllm_config.speculative_config is not None):
raise NotImplementedError("Prometheus metrics with Spec Decoding "
"with >1 EngineCore per AsyncLLM is not "
"supported yet.")
```
To start an online server with torchair enabled, here is an example:
```
python -m vllm.entrypoints.openai.api_server \
--model="/weights/DeepSeek-R1_w8a8/" \
--trust-remote-code \
--max-model-len 40000 \
--tensor-parallel-size 4 \
--data_parallel_size 4 \
--max-num-seqs 16 \
--no-enable-prefix-caching \
--enable_expert_parallel \
--served-model-name deepseekr1 \
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
--quantization ascend \
--host 0.0.0.0 \
--port 1234 \
--additional-config '{"ascend_scheduler_config":{"enabled":true,"enable_chunked_prefill":false},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]},"enable_weight_nz_layout":true}' \
--gpu_memory_utilization 0.9
```
offline example with torchair enabled
```
from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=16, temperature=0)
# Create an LLM.
llm = LLM(
model="/home/data/DeepSeek-R1_w8a8/",
tensor_parallel_size=16,
max_num_seqs=16,
gpu_memory_utilization=0.9,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method": "deepseek_mtp",
"num_speculative_tokens": 1,
},
trust_remote_code=True,
enforce_eager=False,
max_model_len=2000,
additional_config = {
'torchair_graph_config': {
'enabled': True,
"graph_batch_sizes": [16],
'enable_multistream_shared_expert': False,
},
"ascend_scheduler_config": {
"enabled": True
},
# 'expert_tensor_parallel_size': 16,
}
)
# Generate texts from the prompts.
# llm.start_profile()
outputs = llm.generate(prompts, sampling_params)
# llm.stop_profile()
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
- vLLM version: v0.10.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/302962e806e9820643ae25987e8e38ed035e05d3
---------
Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-08-06 19:37:43 +08:00
|
|
|
config=config,
|
|
|
|
|
ge_cache=False)
|
|
|
|
|
return self.torchair_compiled_models[batch_size]
|
|
|
|
|
|
2025-09-04 11:34:47 +08:00
|
|
|
# TODO Using torch instead of triton may result in poor performance
|
|
|
|
|
def _prepare_input_kernel(self, out_ptr: torch.Tensor,
|
|
|
|
|
cu_query_lens: torch.Tensor,
|
|
|
|
|
cu_num_tokens: torch.Tensor, block_size: int):
|
|
|
|
|
device = cu_query_lens.device
|
|
|
|
|
dtype = out_ptr.dtype
|
2025-05-30 08:59:58 +08:00
|
|
|
|
2025-09-04 11:34:47 +08:00
|
|
|
offsets = torch.arange(block_size, device=device, dtype=dtype)
|
|
|
|
|
start_pos = cu_num_tokens[:-1]
|
|
|
|
|
end_pos = cu_num_tokens[1:]
|
|
|
|
|
num_tokens = end_pos - start_pos
|
2025-05-30 08:59:58 +08:00
|
|
|
|
2025-09-04 11:34:47 +08:00
|
|
|
global_indices = (start_pos.view(-1, 1) + offsets.view(1, -1))
|
|
|
|
|
values = (cu_query_lens[:-1].view(-1, 1) + offsets.view(1, -1))
|
2025-05-30 08:59:58 +08:00
|
|
|
|
2025-09-04 11:34:47 +08:00
|
|
|
mask = (offsets.view(1, -1) < num_tokens.view(-1, 1))
|
2025-05-30 08:59:58 +08:00
|
|
|
|
2025-09-04 11:34:47 +08:00
|
|
|
global_indices_flat = global_indices[mask]
|
|
|
|
|
values_flat = values[mask]
|
2025-09-05 09:11:22 +08:00
|
|
|
out_ptr[global_indices_flat] = values_flat
|