[BUGFIX] main-sd-bugfix && [UT] add mtp UT (#593)

### What this PR does / why we need it? The pr will fix some bug about spec decode / MTP The pr add a mtp e2e UT `test_mtp_correctness.py` **vllm_ascend/attention/attention.py** 1. add support `self.attn_mask_cache` only has 1 element to cover scene in which both spec docode and chunked prefill are enabled. **vllm_ascend/distributed/parallel_state.py** 1. remove 2 assert because spec decode worker would use init_worker twice **vllm_ascend/models/deepseek_mtp.py** 1. remove unused params; 2. add support w8a8 in `CustomDeepSeekMTP` **vllm_ascend/quantization/quant_config.py** 1. use `AscendUnquantizedFusedMoEMethod` instead of `UnquantizedFusedMoEMethod` **other** 1. replace `from vllm.logger import init_logger` to `from vllm.logger import logger` all of the vllm-ascend project ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? Signed-off-by: mengwei805 <mengwei25@huawei.com>
2025-04-21 19:25:51 +08:00
parent 5442b463fd
commit 0ae9ee0f8a
10 changed files with 375 additions and 31 deletions
--- a/vllm_ascend/models/deepseek_mtp.py
+++ b/vllm_ascend/models/deepseek_mtp.py
@@ -1,6 +1,6 @@
 #
 # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# Adapted from vllm/model_executor/models/qwen2_vl.py
+# Adapted from vllm/model_executor/models/deepseek_mtp.py
 # Copyright 2023 The vLLM team.
 #
 # This file is a part of the vllm-ascend project.
@@ -17,12 +17,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import List, Optional
+from typing import Optional

 import torch
 import torch.nn as nn
 from transformers import PretrainedConfig
-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
@@ -70,8 +69,6 @@ class CustomDeepSeekMultiTokenPredictorLayer(DeepSeekMultiTokenPredictorLayer):
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
-        kv_cache: torch.Tensor,
-        attn_metadata: AttentionMetadata,
        previous_hidden_states: torch.Tensor,
        inputs_embeds: Optional[torch.Tensor] = None,
        spec_step_index: int = 0,
@@ -91,8 +88,6 @@ class CustomDeepSeekMultiTokenPredictorLayer(DeepSeekMultiTokenPredictorLayer):

        hidden_states, residual = self.mtp_block(positions=positions,
                                                 hidden_states=hidden_states,
-                                                 kv_cache=kv_cache,
-                                                 attn_metadata=attn_metadata,
                                                 residual=None)
        hidden_states = residual + hidden_states
        return hidden_states
@@ -130,8 +125,6 @@ class CustomDeepSeekMultiTokenPredictor(DeepSeekMultiTokenPredictor):
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
-        kv_caches: List[torch.Tensor],
-        attn_metadata: AttentionMetadata,
        previous_hidden_states: torch.Tensor,
        inputs_embeds: Optional[torch.Tensor] = None,
        spec_step_idx: int = 0,
@@ -140,8 +133,6 @@ class CustomDeepSeekMultiTokenPredictor(DeepSeekMultiTokenPredictor):
        return self.layers_list[current_step_idx](
            input_ids,
            positions,
-            kv_caches[current_step_idx],
-            attn_metadata,
            previous_hidden_states,
            inputs_embeds,
            current_step_idx,
@@ -162,6 +153,14 @@ class CustomDeepSeekMultiTokenPredictor(DeepSeekMultiTokenPredictor):


 class CustomDeepSeekMTP(DeepSeekMTP):
+    # NOTE 1.The quantized MTP layer of deepseek on the NPU is not quantized;
+    # NOTE 2.The description file generated by the current msmodelslim tool does not have
+    # MTP layer info. Please manually add it and set the value to FLOAT.
+    packed_modules_mapping = {
+        "gate_up_proj": ["gate_proj", "up_proj"],
+        "experts":
+        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]
+    }

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        nn.Module.__init__(self)