[V1] MTP supports torchair (#2145)
### What this PR does / why we need it?
Support MTP with:
- [x] V0 Scheduler
- [x] TorchAir
- [x] Single DP
- [x] Multi DP
- [x] Disaggregate PD
Known issues:
- [ ] V1 Scheduler (chunked prefill) is not supported yet; support will be
added in a few weeks
- [ ] vLLM v0.10.0 does not support metrics with `DP > 1` right now; as a
workaround, comment out lines 171-175 in the file
`vllm/vllm/v1/metrics/loggers.py`:
```
if (len(self.engine_indexes) > 1
and vllm_config.speculative_config is not None):
raise NotImplementedError("Prometheus metrics with Spec Decoding "
"with >1 EngineCore per AsyncLLM is not "
"supported yet.")
```
To start an online server with torchair enabled, here is an example:
```
python -m vllm.entrypoints.openai.api_server \
--model="/weights/DeepSeek-R1_w8a8/" \
--trust-remote-code \
--max-model-len 40000 \
--tensor-parallel-size 4 \
--data_parallel_size 4 \
--max-num-seqs 16 \
--no-enable-prefix-caching \
--enable_expert_parallel \
--served-model-name deepseekr1 \
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
--quantization ascend \
--host 0.0.0.0 \
--port 1234 \
--additional-config '{"ascend_scheduler_config":{"enabled":true,"enable_chunked_prefill":false},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]},"enable_weight_nz_layout":true}' \
--gpu_memory_utilization 0.9
```
Here is an offline example with torchair enabled:
```
from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=16, temperature=0)
# Create an LLM.
llm = LLM(
model="/home/data/DeepSeek-R1_w8a8/",
tensor_parallel_size=16,
max_num_seqs=16,
gpu_memory_utilization=0.9,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method": "deepseek_mtp",
"num_speculative_tokens": 1,
},
trust_remote_code=True,
enforce_eager=False,
max_model_len=2000,
additional_config = {
'torchair_graph_config': {
'enabled': True,
"graph_batch_sizes": [16],
'enable_multistream_shared_expert': False,
},
"ascend_scheduler_config": {
"enabled": True
},
# 'expert_tensor_parallel_size': 16,
}
)
# Generate texts from the prompts.
# llm.start_profile()
outputs = llm.generate(prompts, sampling_params)
# llm.stop_profile()
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
- vLLM version: v0.10.0
- vLLM main:
302962e806
---------
Signed-off-by: xuyexiong <xuyexiong@huawei.com>
This commit is contained in:
@@ -188,6 +188,7 @@ class TestAscendMLAMetadataBuilder(TestBase):
|
||||
runner.chunked_prefill_enabled = False
|
||||
runner.device = "cpu"
|
||||
runner.block_size = 16
|
||||
runner.decode_token_per_req = 1
|
||||
|
||||
ascend_config = MagicMock()
|
||||
ascend_config.torchair_graph_config = MagicMock()
|
||||
@@ -206,6 +207,7 @@ class TestAscendMLAMetadataBuilder(TestBase):
|
||||
def test_reorder_batch_with_torchair_graph(self, ascend_config):
|
||||
runner = MagicMock()
|
||||
runner.chunked_prefill_enabled = False
|
||||
runner.decode_token_per_req = 1
|
||||
ascend_config.torchair_graph_config = MagicMock()
|
||||
ascend_config.torchair_graph_config.enabled = True
|
||||
|
||||
@@ -238,6 +240,7 @@ class TestAscendMLAMetadataBuilder(TestBase):
|
||||
ascend_config = MagicMock()
|
||||
runner = MagicMock()
|
||||
runner.chunked_prefill_enabled = False
|
||||
runner.decode_token_per_req = 1
|
||||
ascend_config.torchair_graph_config = MagicMock()
|
||||
ascend_config.torchair_graph_config.enabled = False
|
||||
with patch("vllm_ascend.attention.mla_v1.get_ascend_config",
|
||||
@@ -275,6 +278,7 @@ class TestAscendMLAMetadataBuilder(TestBase):
|
||||
runner = MagicMock()
|
||||
runner.graph_block_tables = torch.zeros((8, 64), dtype=torch.int32)
|
||||
runner.chunked_prefill_enabled = False
|
||||
runner.decode_token_per_req = 1
|
||||
builder = AscendMLAMetadataBuilder(runner=runner)
|
||||
block_tables = torch.randint(0, 100, (3, 10), dtype=torch.int32)
|
||||
|
||||
@@ -291,6 +295,7 @@ class TestAscendMLAMetadataBuilder(TestBase):
|
||||
runner = MagicMock()
|
||||
runner.graph_block_tables = torch.zeros((8, 4), dtype=torch.int32)
|
||||
runner.chunked_prefill_enabled = False
|
||||
runner.decode_token_per_req = 1
|
||||
builder = AscendMLAMetadataBuilder(runner=runner)
|
||||
block_tables = torch.randint(0, 100, (3, 10), dtype=torch.int32)
|
||||
|
||||
@@ -308,6 +313,7 @@ class TestAscendMLAMetadataBuilder(TestBase):
|
||||
runner = MagicMock()
|
||||
runner.graph_block_tables = np.zeros((8, 64), dtype=np.int32)
|
||||
runner.chunked_prefill_enabled = False
|
||||
runner.decode_token_per_req = 1
|
||||
builder = AscendMLAMetadataBuilder(runner=runner)
|
||||
|
||||
block_tables = torch.randint(0, 100, (3, 10), dtype=torch.int32)
|
||||
@@ -332,6 +338,7 @@ class TestAscendMLAMetadataBuilder(TestBase):
|
||||
runner.attn_mask = torch.zeros((1, 1), dtype=torch.bool)
|
||||
runner.spec_attn_mask = torch.zeros((1, 1), dtype=torch.bool)
|
||||
runner.dtype = torch.float16
|
||||
runner.decode_token_per_req = 1
|
||||
|
||||
builder = AscendMLAMetadataBuilder(runner=runner,
|
||||
metadata_cls=AscendMLAMetadata)
|
||||
|
||||
@@ -77,6 +77,9 @@ class TestCustomDeepSeekMultiTokenPredictor(PytestBase):
|
||||
mock_vllm_config.model_config = mock_model_config
|
||||
mock_vllm_config.cache_config = CacheConfig()
|
||||
mock_vllm_config.quant_config = mocker.MagicMock()
|
||||
mocker.patch(
|
||||
"vllm.model_executor.layers.vocab_parallel_embedding.VocabParallelEmbedding.__init__",
|
||||
return_value=None)
|
||||
mocker.patch(
|
||||
"vllm_ascend.models.deepseek_mtp.CustomDeepSeekMultiTokenPredictorLayer.__init__",
|
||||
return_value=None)
|
||||
@@ -90,10 +93,9 @@ class TestCustomDeepSeekMultiTokenPredictor(PytestBase):
|
||||
assert predictor.num_mtp_layers == 3
|
||||
assert isinstance(predictor, CustomDeepSeekMultiTokenPredictor)
|
||||
|
||||
@pytest.mark.parametrize('kv_caches, inputs_embeds', [
|
||||
(torch.tensor([[[0.1, 0.2, 0.3]]]), torch.tensor([[0.1, 0.2, 0.3]])),
|
||||
(None, None),
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
'kv_caches, inputs_embeds',
|
||||
[(torch.tensor([[[0.1, 0.2, 0.3]]]), torch.tensor([[0.1, 0.2, 0.3]]))])
|
||||
def test_forward(self, mocker: MockerFixture, setup_predictor, kv_caches,
|
||||
inputs_embeds):
|
||||
predictor = setup_predictor
|
||||
@@ -147,6 +149,9 @@ class TestCustomDeepSeekMTP(PytestBase):
|
||||
mocker.patch("torch.nn.Module.__setattr__")
|
||||
mocker.patch("torch.nn.Module.__getattr__")
|
||||
mocker.patch("torch.nn.Module.__delattr__")
|
||||
mocker.patch(
|
||||
"vllm.model_executor.layers.vocab_parallel_embedding.VocabParallelEmbedding.__init__",
|
||||
return_value=None)
|
||||
mocker.patch(
|
||||
"vllm_ascend.models.deepseek_mtp.CustomDeepSeekMultiTokenPredictorLayer.__call__",
|
||||
return_value=None)
|
||||
@@ -172,4 +177,4 @@ class TestCustomDeepSeekMTP(PytestBase):
|
||||
output = setup_mtp.forward(input_ids, positions, kv_caches, None,
|
||||
previous_hidden_states, inputs_embeds,
|
||||
spec_step_idx)
|
||||
assert torch.allclose(output, torch.tensor([[1.0, 2.0, 3.0]]))
|
||||
assert torch.allclose(output, torch.tensor([[1.0, 2.0, 3.0]]))
|
||||
@@ -3,6 +3,7 @@ from unittest.mock import MagicMock, patch
|
||||
import torch
|
||||
from vllm.attention.layer import Attention
|
||||
from vllm.model_executor.layers.fused_moe import FusedMoE
|
||||
from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
|
||||
from vllm.model_executor.layers.linear import (LinearBase,
|
||||
UnquantizedLinearMethod)
|
||||
|
||||
@@ -111,6 +112,7 @@ class TestAscendQuantConfig(TestBase):
|
||||
|
||||
def test_get_quant_method_for_fused_moe(self):
|
||||
fused_moe_layer = MagicMock(spec=FusedMoE)
|
||||
fused_moe_layer.moe = MagicMock(spec=FusedMoEConfig)
|
||||
|
||||
# Test skipped layer
|
||||
with patch.object(self.ascend_config, 'is_layer_skipped_ascend', return_value=True), \
|
||||
|
||||
Reference in New Issue
Block a user