[V1] MTP supports torchair (#2145)

### What this PR does / why we need it? Support MTP with： - [x] V0 Scheduler - [x] TorchAir - [x] Single DP - [x] Multi DP - [x] Disaggregate PD Known issues： - [ ] Not support V1 Scheduler (chunked prefill), will be supported in a few weeks - [ ] vllm v0.10.0 does not support metrics with `DP > 1` right now, need to comment out the line 171-175 in file `vllm/vllm/v1/metrics/loggers.py` ``` if (len(self.engine_indexes) > 1 and vllm_config.speculative_config is not None): raise NotImplementedError("Prometheus metrics with Spec Decoding " "with >1 EngineCore per AsyncLLM is not " "supported yet.") ``` To start an online server with torchair enabled, here is an example: ``` python -m vllm.entrypoints.openai.api_server \ --model="/weights/DeepSeek-R1_w8a8/" \ --trust-remote-code \ --max-model-len 40000 \ --tensor-parallel-size 4 \ --data_parallel_size 4 \ --max-num-seqs 16 \ --no-enable-prefix-caching \ --enable_expert_parallel \ --served-model-name deepseekr1 \ --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \ --quantization ascend \ --host 0.0.0.0 \ --port 1234 \ --additional-config '{"ascend_scheduler_config":{"enabled":true,"enable_chunked_prefill":false},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]},"enable_weight_nz_layout":true}' \ --gpu_memory_utilization 0.9 ``` offline example with torchair enabled ``` from vllm import LLM, SamplingParams prompts = [ "Hello, my name is", "The president of the United States is", "The capital of France is", "The future of AI is", ] # Create a sampling params object. sampling_params = SamplingParams(max_tokens=16, temperature=0) # Create an LLM. llm = LLM( model="/home/data/DeepSeek-R1_w8a8/", tensor_parallel_size=16, max_num_seqs=16, gpu_memory_utilization=0.9, distributed_executor_backend="mp", enable_expert_parallel=True, speculative_config={ "method": "deepseek_mtp", "num_speculative_tokens": 1, }, trust_remote_code=True, enforce_eager=False, max_model_len=2000, additional_config = { 'torchair_graph_config': { 'enabled': True, "graph_batch_sizes": [16], 'enable_multistream_shared_expert': False, }, "ascend_scheduler_config": { "enabled": True }, # 'expert_tensor_parallel_size': 16, } ) # Generate texts from the prompts. # llm.start_profile() outputs = llm.generate(prompts, sampling_params) # llm.stop_profile() for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` - vLLM version: v0.10.0 - vLLM main: 302962e806 --------- Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-08-06 19:37:43 +08:00
parent bf84f2dbfa
commit 26fc36b0e0
12 changed files with 541 additions and 160 deletions
--- a/tests/ut/models/test_deepseek_mtp.py
+++ b/tests/ut/models/test_deepseek_mtp.py
@@ -77,6 +77,9 @@ class TestCustomDeepSeekMultiTokenPredictor(PytestBase):
        mock_vllm_config.model_config = mock_model_config
        mock_vllm_config.cache_config = CacheConfig()
        mock_vllm_config.quant_config = mocker.MagicMock()
+        mocker.patch(
+            "vllm.model_executor.layers.vocab_parallel_embedding.VocabParallelEmbedding.__init__",
+            return_value=None)
        mocker.patch(
            "vllm_ascend.models.deepseek_mtp.CustomDeepSeekMultiTokenPredictorLayer.__init__",
            return_value=None)
@@ -90,10 +93,9 @@ class TestCustomDeepSeekMultiTokenPredictor(PytestBase):
        assert predictor.num_mtp_layers == 3
        assert isinstance(predictor, CustomDeepSeekMultiTokenPredictor)

-    @pytest.mark.parametrize('kv_caches, inputs_embeds', [
-        (torch.tensor([[[0.1, 0.2, 0.3]]]), torch.tensor([[0.1, 0.2, 0.3]])),
-        (None, None),
-    ])
+    @pytest.mark.parametrize(
+        'kv_caches, inputs_embeds',
+        [(torch.tensor([[[0.1, 0.2, 0.3]]]), torch.tensor([[0.1, 0.2, 0.3]]))])
    def test_forward(self, mocker: MockerFixture, setup_predictor, kv_caches,
                     inputs_embeds):
        predictor = setup_predictor
@@ -147,6 +149,9 @@ class TestCustomDeepSeekMTP(PytestBase):
        mocker.patch("torch.nn.Module.__setattr__")
        mocker.patch("torch.nn.Module.__getattr__")
        mocker.patch("torch.nn.Module.__delattr__")
+        mocker.patch(
+            "vllm.model_executor.layers.vocab_parallel_embedding.VocabParallelEmbedding.__init__",
+            return_value=None)
        mocker.patch(
            "vllm_ascend.models.deepseek_mtp.CustomDeepSeekMultiTokenPredictorLayer.__call__",
            return_value=None)
@@ -172,4 +177,4 @@ class TestCustomDeepSeekMTP(PytestBase):
        output = setup_mtp.forward(input_ids, positions, kv_caches, None,
                                   previous_hidden_states, inputs_embeds,
                                   spec_step_idx)
-        assert torch.allclose(output, torch.tensor([[1.0, 2.0, 3.0]]))
+        assert torch.allclose(output, torch.tensor([[1.0, 2.0, 3.0]]))