diff --git a/.github/workflows/vllm_ascend_test_full_vllm_0.11.0.yaml b/.github/workflows/vllm_ascend_test_full_vllm_0.11.0.yaml deleted file mode 100644 index 0269fb6..0000000 --- a/.github/workflows/vllm_ascend_test_full_vllm_0.11.0.yaml +++ /dev/null @@ -1,51 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# -name: 'ascend test / vllm 0.11.0' - -on: - # Run 1-card and 2-cards e2e tests per 2h - schedule: - - cron: '0 */2 * * *' - pull_request: - branches: - - 'main' - paths: - # If we are changing the doctest we should do a PR test - - 'vllm_ascend_test_full_vllm_0.11.0.yaml' - workflow_dispatch: - -# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly -# declared as "shell: bash -el {0}" on steps that need to be properly activated. -# It's used to activate ascend-toolkit environment variables. -defaults: - run: - shell: bash -el {0} - -# only cancel in-progress runs of the same workflow -# and ignore the lint / 1 card / 4 cards test type -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - e2e-test: - uses: ./.github/workflows/_e2e_test.yaml - with: - vllm: v0.11.0 - runner: linux-aarch64-a2 - image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11 - type: full diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py index d0f1b76..23b6e0c 100644 --- a/tests/e2e/conftest.py +++ b/tests/e2e/conftest.py @@ -32,14 +32,7 @@ from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer, BatchEncoding, BatchFeature) from transformers.models.auto.auto_factory import _BaseAutoModelClass from vllm import LLM, SamplingParams - -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.10.2"): - from vllm.config import TaskOption, _get_and_verify_dtype -else: - from vllm.config.model import TaskOption, _get_and_verify_dtype - +from vllm.config.model import TaskOption, _get_and_verify_dtype from vllm.inputs import TextPrompt from vllm.outputs import RequestOutput from vllm.transformers_utils.utils import maybe_model_redirect diff --git a/tests/e2e/model_utils.py b/tests/e2e/model_utils.py index e5b353e..54b0f93 100644 --- a/tests/e2e/model_utils.py +++ b/tests/e2e/model_utils.py @@ -19,12 +19,7 @@ from typing import Dict, List, Optional, Sequence, Tuple, Union -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.10.2"): - from vllm.sequence import PromptLogprobs, SampleLogprobs -else: - from vllm.logprobs import PromptLogprobs, SampleLogprobs +from vllm.logprobs import PromptLogprobs, SampleLogprobs TokensText = Tuple[List[int], str] diff --git a/tests/e2e/singlecard/test_guided_decoding.py b/tests/e2e/singlecard/test_guided_decoding.py index ac2426e..e0e6314 100644 --- a/tests/e2e/singlecard/test_guided_decoding.py +++ b/tests/e2e/singlecard/test_guided_decoding.py @@ -22,15 +22,8 @@ from typing import Any, Dict import jsonschema import pytest import regex as re - -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.10.2"): - from vllm.sampling_params import GuidedDecodingParams, SamplingParams -else: - from vllm.sampling_params import SamplingParams, StructuredOutputsParams - from vllm.outputs import RequestOutput +from vllm.sampling_params import SamplingParams, StructuredOutputsParams from tests.e2e.conftest import VllmRunner @@ -91,27 +84,16 @@ def sample_json_schema(): def test_guided_json_completion(guided_decoding_backend: str, sample_json_schema): runner_kwargs: Dict[str, Any] = {} - if vllm_version_is("0.10.2"): - sampling_params = SamplingParams( - temperature=1.0, - max_tokens=500, - guided_decoding=GuidedDecodingParams(json=sample_json_schema)) - runner_kwargs = { - "seed": 0, - "guided_decoding_backend": guided_decoding_backend, - } - else: - sampling_params = SamplingParams( - temperature=1.0, - max_tokens=500, - structured_outputs=StructuredOutputsParams( - json=sample_json_schema)) - runner_kwargs = { - "seed": 0, - "structured_outputs_config": { - "backend": guided_decoding_backend - }, - } + sampling_params = SamplingParams( + temperature=1.0, + max_tokens=500, + structured_outputs=StructuredOutputsParams(json=sample_json_schema)) + runner_kwargs = { + "seed": 0, + "structured_outputs_config": { + "backend": guided_decoding_backend + }, + } with VllmRunner(MODEL_NAME, **runner_kwargs) as vllm_model: prompts = [ f"Give an example JSON for an employee profile " @@ -141,26 +123,16 @@ def test_guided_regex(guided_decoding_backend: str, sample_regex): if guided_decoding_backend == "outlines": pytest.skip("Outlines doesn't support regex-based guided decoding.") runner_kwargs: Dict[str, Any] = {} - if vllm_version_is("0.10.2"): - sampling_params = SamplingParams( - temperature=0.8, - top_p=0.95, - guided_decoding=GuidedDecodingParams(regex=sample_regex)) - runner_kwargs = { - "seed": 0, - "guided_decoding_backend": guided_decoding_backend, - } - else: - sampling_params = SamplingParams( - temperature=0.8, - top_p=0.95, - structured_outputs=StructuredOutputsParams(regex=sample_regex)) - runner_kwargs = { - "seed": 0, - "structured_outputs_config": { - "backend": guided_decoding_backend - }, - } + sampling_params = SamplingParams( + temperature=0.8, + top_p=0.95, + structured_outputs=StructuredOutputsParams(regex=sample_regex)) + runner_kwargs = { + "seed": 0, + "structured_outputs_config": { + "backend": guided_decoding_backend + }, + } with VllmRunner(MODEL_NAME, **runner_kwargs) as vllm_model: prompts = [ diff --git a/tests/ut/ops/test_fused_moe_prepare_and_finalize.py b/tests/ut/ops/test_fused_moe_prepare_and_finalize.py index ce7970c..3f46fbb 100644 --- a/tests/ut/ops/test_fused_moe_prepare_and_finalize.py +++ b/tests/ut/ops/test_fused_moe_prepare_and_finalize.py @@ -8,7 +8,6 @@ from vllm_ascend.ops.moe.fused_moe_prepare_and_finalize import ( FusedMoEPrepareAndFinalizeWithAll2All, FusedMoEPrepareAndFinalizeWithAllGather, FusedMoEPrepareAndFinalizeWithMC2, FusedMoEPrepareAndFinalizeWithNaiveMulticast) -from vllm_ascend.utils import vllm_version_is class TestFusedMoEPrepareAndFinalize(unittest.TestCase): @@ -231,12 +230,8 @@ class TestFusedMoEPrepareAndFinalize(unittest.TestCase): mock_get_dp_group): # Mock forward context with DP metadata mock_context = MagicMock() - if vllm_version_is("0.10.2"): - mock_context.dp_metadata.cu_tokens_across_dp_cpu = torch.tensor( - [2, 5, 7]) - else: - mock_context.dp_metadata.cu_tokens_across_sp.return_value = torch.tensor( - [2, 5, 7]) + mock_context.dp_metadata.cu_tokens_across_sp.return_value = torch.tensor( + [2, 5, 7]) mock_get_forward_context.return_value = mock_context # Setup DP group mock diff --git a/tests/ut/ops/test_fused_ops.py b/tests/ut/ops/test_fused_ops.py index a5bdfe2..b59dfb0 100644 --- a/tests/ut/ops/test_fused_ops.py +++ b/tests/ut/ops/test_fused_ops.py @@ -28,7 +28,7 @@ from vllm_ascend.ops.fused_moe import (AscendFusedMoE, AscendUnquantizedFusedMoEMethod) from vllm_ascend.ops.moe.experts_selector import select_experts from vllm_ascend.ops.moe.moe_mlp import cumsum_group_list, unified_apply_mlp -from vllm_ascend.utils import AscendSocVersion, adapt_patch, vllm_version_is +from vllm_ascend.utils import AscendSocVersion, adapt_patch adapt_patch(True) @@ -92,11 +92,7 @@ def mock_dist_env(mocker: MockerFixture): return hidden_states mock_moe_comm_method.finalize.side_effect = mock_finalize - - if vllm_version_is("0.10.2"): - dp_metadata = MagicMock(cu_tokens_across_dp_cpu=[5, 10]) - else: - dp_metadata = MagicMock(num_tokens_across_dp_cpu=[5, 5]) + dp_metadata = MagicMock(num_tokens_across_dp_cpu=[5, 5]) mock_forward_context_obj = MagicMock(moe_comm_method=mock_moe_comm_method, moe_comm_type=MoECommType.MC2, max_tokens_across_dp=10, diff --git a/tests/ut/torchair/ops/test_torchair_fused_moe.py b/tests/ut/torchair/ops/test_torchair_fused_moe.py index fb1cd81..70418a2 100644 --- a/tests/ut/torchair/ops/test_torchair_fused_moe.py +++ b/tests/ut/torchair/ops/test_torchair_fused_moe.py @@ -27,7 +27,7 @@ from vllm_ascend.quantization.quant_config import AscendFusedMoEMethod from vllm_ascend.torchair.ops.torchair_fused_moe import ( TorchairAscendFusedMoE, TorchairAscendUnquantizedFusedMoEMethod) from vllm_ascend.utils import adapt_patch # noqa E402 -from vllm_ascend.utils import AscendSocVersion, vllm_version_is +from vllm_ascend.utils import AscendSocVersion adapt_patch(True) @@ -54,10 +54,7 @@ def mock_dp_and_tp_group(mocker): @pytest.fixture def mock_dist_env(mocker: MockerFixture): # init dist env patch - if vllm_version_is("0.10.2"): - dp_metadata = MagicMock(cu_tokens_across_dp_cpu=[5, 10]) - else: - dp_metadata = MagicMock(num_tokens_across_dp_cpu=[5, 5]) + dp_metadata = MagicMock(num_tokens_across_dp_cpu=[5, 5]) with patch('torch.distributed.get_rank', return_value=0), \ patch('torch.distributed.get_world_size', return_value=4), \ diff --git a/vllm_ascend/patch/platform/patch_common/__init__.py b/vllm_ascend/patch/platform/patch_common/__init__.py index 89c74e7..7942ac0 100644 --- a/vllm_ascend/patch/platform/patch_common/__init__.py +++ b/vllm_ascend/patch/platform/patch_common/__init__.py @@ -19,6 +19,5 @@ import vllm_ascend.patch.platform.patch_common.patch_config # noqa import vllm_ascend.patch.platform.patch_common.patch_distributed # noqa import vllm_ascend.patch.platform.patch_common.patch_mamba_config # noqa import vllm_ascend.patch.platform.patch_common.patch_multimodal_merge # noqa -import vllm_ascend.patch.platform.patch_common.patch_transformers_utils # noqa import vllm_ascend.patch.worker.patch_common.patch_attention_selector # noqa import vllm_ascend.patch.worker.patch_common.patch_attentionspec # noqa diff --git a/vllm_ascend/patch/platform/patch_common/patch_config.py b/vllm_ascend/patch/platform/patch_common/patch_config.py index 9b6f5c2..d615038 100644 --- a/vllm_ascend/patch/platform/patch_common/patch_config.py +++ b/vllm_ascend/patch/platform/patch_common/patch_config.py @@ -1,87 +1,10 @@ import ast import vllm.envs as envs -from transformers import PretrainedConfig -from vllm.config import ModelConfig from vllm.config.speculative import SpeculativeConfig from vllm.logger import logger -# mypy: ignore-errors -@property -def is_deepseek_mla(self: ModelConfig): - if not hasattr(self.hf_text_config, "model_type"): - return False - elif self.hf_text_config.model_type in \ - ('deepseek_v2', 'deepseek_v3', 'deepseek_mtp', - 'kimi_k2', 'longcat_flash', 'deepseek_v32'): - return self.hf_text_config.kv_lora_rank is not None - elif self.hf_text_config.model_type == 'eagle': - # if the model is an EAGLE module, check for the - # underlying architecture - return self.hf_text_config.model.model_type in \ - ('deepseek_v2', 'deepseek_v3', 'deepseek_v32') \ - and self.hf_text_config.kv_lora_rank is not None - return False - - -@staticmethod -def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig: - if hf_config.model_type in ("deepseek_v3", "deepseek_v32"): - hf_config.model_type = "deepseek_mtp" - if hf_config.model_type == "deepseek_mtp": - n_predict = getattr(hf_config, "num_nextn_predict_layers", None) - hf_config.update({ - "n_predict": n_predict, - "architectures": ["DeepSeekMTPModel"] - }) - - if hf_config.architectures[0] == "MiMoForCausalLM": - hf_config.model_type = "mimo_mtp" - n_predict = getattr(hf_config, "num_nextn_predict_layers", None) - hf_config.update({ - "num_hidden_layers": 0, - "n_predict": n_predict, - "architectures": ["MiMoMTPModel"] - }) - - if hf_config.architectures[0] == "Glm4MoeForCausalLM": - hf_config.model_type = "glm4_moe_mtp" - n_predict = getattr(hf_config, "num_nextn_predict_layers", None) - hf_config.update({ - "num_hidden_layers": 0, - "n_predict": n_predict, - "architectures": ["Glm4MoeMTPModel"] - }) - - if hf_config.model_type == "ernie4_5_moe": - hf_config.model_type = "ernie_mtp" - if hf_config.model_type == "ernie_mtp": - n_predict = getattr(hf_config, "num_nextn_predict_layers", None) - hf_config.update({ - "n_predict": n_predict, - "architectures": ["ErnieMTPModel"] - }) - - if hf_config.model_type == "qwen3_next": - hf_config.model_type = "qwen3_next_mtp" - if hf_config.model_type == "qwen3_next_mtp": - n_predict = getattr(hf_config, "num_nextn_predict_layers", None) - hf_config.update({ - "n_predict": n_predict, - "architectures": ["Qwen3NextMTP"] - }) - if hf_config.model_type == "longcat_flash": - hf_config.model_type = "longcat_flash_mtp" - n_predict = getattr(hf_config, "num_nextn_predict_layers", 1) - hf_config.update({ - "n_predict": n_predict, - "architectures": ["LongCatFlashMTPModel"] - }) - - return hf_config - - def __post_init__(self): # Note: "method" is a new parameter that helps to extend the @@ -308,6 +231,4 @@ def __post_init__(self): self.draft_tensor_parallel_size)) -ModelConfig.is_deepseek_mla = is_deepseek_mla SpeculativeConfig.__post_init__ = __post_init__ -SpeculativeConfig.hf_config_override = hf_config_override diff --git a/vllm_ascend/patch/platform/patch_common/patch_transformers_utils.py b/vllm_ascend/patch/platform/patch_common/patch_transformers_utils.py deleted file mode 100644 index 55db190..0000000 --- a/vllm_ascend/patch/platform/patch_common/patch_transformers_utils.py +++ /dev/null @@ -1,200 +0,0 @@ -import vllm.transformers_utils.configs -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging -from vllm.transformers_utils import config - -logger = logging.get_logger(__name__) - - -class DeepseekV3Config(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate an DeepSeek - model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the DeepSeek-V3. - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - Args: - vocab_size (`int`, *optional*, defaults to 129280): - Vocabulary size of the Deep model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`DeepseekV3Model`] - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 11008): - Dimension of the MLP representations. - moe_intermediate_size (`int`, *optional*, defaults to 1407): - Dimension of the MoE representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer decoder. - num_nextn_predict_layers (`int`, *optional*, defaults to 1): - Number of nextn predict layers in the DeepSeekV3 Model. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer decoder. - n_shared_experts (`int`, *optional*, defaults to None): - Number of shared experts, None means dense model. - n_routed_experts (`int`, *optional*, defaults to None): - Number of routed experts, None means dense model. - routed_scaling_factor (`float`, *optional*, defaults to 1.0): - Scaling factor or routed experts. - topk_method (`str`, *optional*, defaults to `gready`): - Topk method used in routed gate. - n_group (`int`, *optional*, defaults to None): - Number of groups for routed experts. - topk_group (`int`, *optional*, defaults to None): - Number of selected groups for each token(for each token, ensuring the selected experts is only within `topk_group` groups). - num_experts_per_tok (`int`, *optional*, defaults to None): - Number of selected experts, None means dense model. - moe_layer_freq (`int`, *optional*, defaults to 1): - The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers. - first_k_dense_replace (`int`, *optional*, defaults to 0): - Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head). - \--k dense layers--/ - norm_topk_prob (`bool`, *optional*, defaults to False): - Whether to normalize the weights of the routed experts. - scoring_func (`str`, *optional*, defaults to 'softmax'): - Method of computing expert weights. - aux_loss_alpha (`float`, *optional*, defaults to 0.001): - Auxiliary loss weight coefficient. - seq_aux = (`bool`, *optional*, defaults to True): - Whether to compute the auxiliary loss for each individual sample. - num_key_value_heads (`int`, *optional*): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to - `num_attention_heads`. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 2048): - The maximum sequence length that this model might ever be used with. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - pad_token_id (`int`, *optional*): - Padding token id. - bos_token_id (`int`, *optional*, defaults to 1): - Beginning of stream token id. - eos_token_id (`int`, *optional*, defaults to 2): - End of stream token id. - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling - strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is - `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update - `max_position_embeddings` to the expected new maximum. - attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): - Whether to use a bias in the query, key, value and output projection layers during self-attention. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - ```python - >>> from transformers import DeepseekV3Model, DeepseekV3Config - >>> # Initializing a Deepseek-V3 style configuration - >>> configuration = DeepseekV3Config() - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "deepseek_v3" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=129280, - hidden_size=7168, - intermediate_size=18432, - moe_intermediate_size=2048, - num_hidden_layers=61, - num_nextn_predict_layers=1, - num_attention_heads=128, - num_key_value_heads=128, - n_shared_experts=1, - n_routed_experts=256, - ep_size=1, - routed_scaling_factor=2.5, - kv_lora_rank=512, - q_lora_rank=1536, - qk_rope_head_dim=64, - v_head_dim=128, - qk_nope_head_dim=128, - topk_method='noaux_tc', - n_group=8, - topk_group=4, - num_experts_per_tok=8, - moe_layer_freq=1, - first_k_dense_replace=3, - norm_topk_prob=True, - scoring_func='sigmoid', - hidden_act="silu", - max_position_embeddings=4096, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=None, - bos_token_id=0, - eos_token_id=1, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.moe_intermediate_size = moe_intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_nextn_predict_layers = num_nextn_predict_layers - self.num_attention_heads = num_attention_heads - self.n_shared_experts = n_shared_experts - self.n_routed_experts = n_routed_experts - self.ep_size = ep_size - self.routed_scaling_factor = routed_scaling_factor - self.kv_lora_rank = kv_lora_rank - self.q_lora_rank = q_lora_rank - self.qk_rope_head_dim = qk_rope_head_dim - self.v_head_dim = v_head_dim - self.qk_nope_head_dim = qk_nope_head_dim - self.topk_method = topk_method - self.n_group = n_group - self.topk_group = topk_group - self.num_experts_per_tok = num_experts_per_tok - self.moe_layer_freq = moe_layer_freq - self.first_k_dense_replace = first_k_dense_replace - self.norm_topk_prob = norm_topk_prob - self.scoring_func = scoring_func - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - - -vllm.transformers_utils.configs.__all__.append("DeepseekV3Config") -vllm.transformers_utils.configs.DeepseekV3Config = DeepseekV3Config -config._CONFIG_REGISTRY["deepseek_v32"] = "DeepseekV3Config"