[BUGFIX] main-sd-bugfix && [UT] add mtp UT (#593)

### What this PR does / why we need it?
This PR fixes several bugs in spec decode / MTP.
It also adds an MTP e2e UT, `test_mtp_correctness.py`.

**vllm_ascend/attention/attention.py**
1. Handle the case where `self.attn_mask_cache` has only one element, to cover the scenario in which both spec decode and chunked prefill are enabled (see the sketch below).
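
For illustration, a minimal standalone sketch of the guarded lookup (the helper name is hypothetical; the real logic lives in `AttentionMaskBuilder`, see the diff below):

```python
import torch


def needs_rescaled_mask(attn_mask_cache: torch.Tensor) -> bool:
    # With spec decode and chunked prefill enabled together, the cached mask
    # can hold a single element; indexing [0][1] would then be out of range,
    # so the numel() check must come first and short-circuit the expression.
    return attn_mask_cache.numel() > 1 and attn_mask_cache[0][1] > 0


print(needs_rescaled_mask(torch.zeros(1, 1)))  # False, no longer raises
print(needs_rescaled_mask(torch.ones(4, 4)))   # True, mask gets rescaled
```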

**vllm_ascend/distributed/parallel_state.py**
1. Remove two asserts, because the spec decode worker calls `init_worker` twice (illustrated below).
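
A simplified, hypothetical sketch of why those asserts fired (plain values stand in for the real process groups):

```python
_EP = None  # module-level handle for the expert parallel group


def init_expert_parallel(group_ranks):
    """Hypothetical stand-in for init_ascend_model_parallel."""
    global _EP
    # The removed check looked like this; because the spec decode worker runs
    # init_worker twice, the second call used to fail here even though it was
    # re-creating the very same group:
    # assert _EP is None, ("expert parallel group is already initialized")
    _EP = group_ranks  # stand-in for building the actual group


init_expert_parallel([[0, 1]])
init_expert_parallel([[0, 1]])  # second call now succeeds after the fix
```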

**vllm_ascend/models/deepseek_mtp.py**
1. Remove unused params.
2. Add w8a8 support in `CustomDeepSeekMTP` (see the mapping sketch below).
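
The w8a8 path hangs off the new `packed_modules_mapping` class attribute (shown in the diff below). As a rough sketch of how such a mapping is typically consumed, with a hypothetical helper rather than vLLM's actual loader:

```python
# One entry of the mapping from the diff below; the helper is illustrative only.
packed_modules_mapping = {
    "gate_up_proj": ["gate_proj", "up_proj"],
}


def expand_fused_name(prefix: str) -> list[str]:
    # Resolve a fused module name back to the per-shard names that the
    # quantization description file actually lists.
    for fused, shards in packed_modules_mapping.items():
        if prefix.endswith(fused):
            base = prefix[:-len(fused)]
            return [base + shard for shard in shards]
    return [prefix]


print(expand_fused_name("model.layers.0.mlp.gate_up_proj"))
# ['model.layers.0.mlp.gate_proj', 'model.layers.0.mlp.up_proj']
```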

**vllm_ascend/quantization/quant_config.py**
1. Use `AscendUnquantizedFusedMoEMethod` instead of `UnquantizedFusedMoEMethod` for skipped `FusedMoE` layers (toy sketch below).
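
Since the quantization description marks the MTP layer as FLOAT (see the NOTE in the deepseek_mtp.py diff), its MoE layers take the skipped branch, which now returns the Ascend-aware method. A toy sketch of the dispatch, with stub classes standing in for the real ones:

```python
class AscendUnquantizedFusedMoEMethod:  # stub for vllm_ascend.ops.fused_moe
    pass


class AscendFusedMoEMethod:  # stub for the quantized Ascend MoE method
    pass


def moe_quant_method(layer_is_skipped: bool):
    # Skipped (FLOAT) MoE layers now get the Ascend-aware unquantized method
    # instead of vLLM's generic UnquantizedFusedMoEMethod.
    if layer_is_skipped:
        return AscendUnquantizedFusedMoEMethod()
    return AscendFusedMoEMethod()


print(type(moe_quant_method(True)).__name__)   # AscendUnquantizedFusedMoEMethod
print(type(moe_quant_method(False)).__name__)  # AscendFusedMoEMethod
```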

**other**
1. Replace `from vllm.logger import init_logger` with `from vllm.logger import logger` throughout the vllm-ascend project (example below).
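
In each touched module the change is mechanical, for example:

```python
# Before: every module built its own logger instance.
# from vllm.logger import init_logger
# logger = init_logger(__name__)

# After: the shared logger exported by vllm is imported directly.
from vllm.logger import logger

logger.info("vllm-ascend module loaded")
```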



### Does this PR introduce _any_ user-facing change?


### How was this patch tested?

Signed-off-by: mengwei805 <mengwei25@huawei.com>
Author: wemaster
Date: 2025-04-21 19:25:51 +08:00
Committed by: GitHub
Parent: 5442b463fd
Commit: 0ae9ee0f8a
10 changed files with 375 additions and 31 deletions

View File

@@ -113,7 +113,8 @@ class AttentionMaskBuilder:
             self.update_attn_cache(max_seq_len, dtype, device)
         # FIXME: Currently the mask value of chunked-prefill situation and Prefill-Only situation
         # is not the same. Fix this in the future when kernel is ready.
-        if self.attn_mask_cache[0][1] > 0:
+        if self.attn_mask_cache.numel(
+        ) > 1 and self.attn_mask_cache[0][1] > 0:
             attn_mask = self.get_attn_mask(  # type: ignore
                 max_seq_len, dtype, device)
             attn_mask *= -10000

View File

@@ -6,7 +6,6 @@ import torch_npu
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer,
                                               AttentionMetadata,
                                               MLAAttentionImpl)
-from vllm.logger import init_logger
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                LinearBase, RowParallelLinear,
                                                UnquantizedLinearMethod)
@@ -21,8 +20,6 @@ if TYPE_CHECKING:
     from vllm.v1.core.sched.output import SchedulerOutput
     from vllm.v1.worker.gpu_input_batch import InputBatch
-logger = init_logger(__name__)
 class AscendMLABackend(AttentionBackend):

View File

@@ -16,14 +16,12 @@
 #
 from collections import deque
-from vllm.logger import init_logger
+from vllm.logger import logger
 from vllm.utils import cdiv
 from vllm.v1.core.sched.output import NewRequestData, SchedulerOutput
 from vllm.v1.core.sched.scheduler import Scheduler
 from vllm.v1.request import Request, RequestStatus
-logger = init_logger(__name__)
 class AscendScheduler(Scheduler):
     """This Scheduler extends vllm's original v1 scheduler

View File

@@ -36,7 +36,6 @@ def init_ascend_model_parallel(
         expert_tensor_parallel_size)
     global _EP
-    assert _EP is None, ("expert parallel group is already initialized")
     group_ranks = []
     for i in range(num_expert_parallel_groups):
         ranks = list(range(i, world_size, num_expert_parallel_groups))
@@ -49,8 +48,6 @@ def init_ascend_model_parallel(
     group_ranks = []
     global _ETP
-    assert _ETP is None, (
-        "expert tensor parallel group is already initialized")
     for i in range(num_expert_tensor_parallel_groups):
         ranks = list(
             range(i * expert_tensor_parallel_size,

View File

@@ -1,6 +1,6 @@
 #
 # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# Adapted from vllm/model_executor/models/qwen2_vl.py
+# Adapted from vllm/model_executor/models/deepseek_mtp.py
 # Copyright 2023 The vLLM team.
 #
 # This file is a part of the vllm-ascend project.
@@ -17,12 +17,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import List, Optional
+from typing import Optional
 import torch
 import torch.nn as nn
 from transformers import PretrainedConfig
-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
@@ -70,8 +69,6 @@ class CustomDeepSeekMultiTokenPredictorLayer(DeepSeekMultiTokenPredictorLayer):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        kv_cache: torch.Tensor,
-        attn_metadata: AttentionMetadata,
         previous_hidden_states: torch.Tensor,
         inputs_embeds: Optional[torch.Tensor] = None,
         spec_step_index: int = 0,
@@ -91,8 +88,6 @@ class CustomDeepSeekMultiTokenPredictorLayer(DeepSeekMultiTokenPredictorLayer):
         hidden_states, residual = self.mtp_block(positions=positions,
                                                  hidden_states=hidden_states,
-                                                 kv_cache=kv_cache,
-                                                 attn_metadata=attn_metadata,
                                                  residual=None)
         hidden_states = residual + hidden_states
         return hidden_states
@@ -130,8 +125,6 @@ class CustomDeepSeekMultiTokenPredictor(DeepSeekMultiTokenPredictor):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        kv_caches: List[torch.Tensor],
-        attn_metadata: AttentionMetadata,
         previous_hidden_states: torch.Tensor,
         inputs_embeds: Optional[torch.Tensor] = None,
         spec_step_idx: int = 0,
@@ -140,8 +133,6 @@ class CustomDeepSeekMultiTokenPredictor(DeepSeekMultiTokenPredictor):
         return self.layers_list[current_step_idx](
             input_ids,
             positions,
-            kv_caches[current_step_idx],
-            attn_metadata,
             previous_hidden_states,
             inputs_embeds,
             current_step_idx,
@@ -162,6 +153,14 @@ class CustomDeepSeekMultiTokenPredictor(DeepSeekMultiTokenPredictor):
 class CustomDeepSeekMTP(DeepSeekMTP):
+    # NOTE 1.The quantized MTP layer of deepseek on the NPU is not quantized;
+    # NOTE 2.The description file generated by the current msmodelslim tool does not have
+    # MTP layer info. Please manually add it and set the value to FLOAT.
+    packed_modules_mapping = {
+        "gate_up_proj": ["gate_proj", "up_proj"],
+        "experts":
+        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]
+    }
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         nn.Module.__init__(self)

View File

@@ -18,7 +18,7 @@
 from typing import Any, Dict, Optional
 from vllm.config import ParallelConfig
-from vllm.logger import init_logger
+from vllm.logger import logger
 from vllm.model_executor.layers.rejection_sampler import RejectionSampler
 from vllm.model_executor.layers.spec_decode_base_sampler import \
     SpecDecodeBaseSampler
@@ -34,8 +34,6 @@ from vllm.worker.worker_base import WorkerBase
 from vllm_ascend.worker.draft_model_runner import TP1DraftModelRunner
-logger = init_logger(__name__)
 def create_worker(
     cls,

View File

@@ -23,8 +23,6 @@ import torch_npu  # noqa: F401
 from vllm.distributed import get_tensor_model_parallel_rank
 from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
                                                   FusedMoeWeightScaleSupported)
-from vllm.model_executor.layers.fused_moe.layer import \
-    UnquantizedFusedMoEMethod
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
                                                RowParallelLinear,
                                                UnquantizedLinearMethod)
@@ -36,6 +34,8 @@ from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.model_executor.parameter import PerTensorScaleParameter
 from vllm.model_executor.utils import set_weight_attrs
+from vllm_ascend.ops.fused_moe import AscendUnquantizedFusedMoEMethod
 from .quantizer import AscendQuantizer
@@ -97,7 +97,7 @@ class AscendQuantConfig(QuantizationConfig):
         elif isinstance(layer, FusedMoE):
             if self.is_layer_skipped_ascend(prefix,
                                             self.packed_modules_mapping):
-                return UnquantizedFusedMoEMethod()
+                return AscendUnquantizedFusedMoEMethod()
             return AscendFusedMoEMethod(self, prefix,
                                         self.packed_modules_mapping)
         return None

View File

@@ -19,7 +19,7 @@ from typing import List, Optional
 import torch
 from vllm.forward_context import set_forward_context
-from vllm.logger import init_logger
+from vllm.logger import logger
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.multimodal import MultiModalKwargs
 from vllm.sequence import ExecuteModelRequest, IntermediateTensors
@@ -29,8 +29,6 @@ from vllm.worker.model_runner_base import (ModelRunnerBase,
 from vllm_ascend.attention.attention import AscendMetadata
-logger = init_logger(__name__)
 # A flag to enable debug prints for the updated input tensors
 # before each step.
 debug_advance_input = False