diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
index f5238c96..6d408f1c 100644
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -115,6 +115,7 @@ jobs:
           pytest -sv --durations=0 tests/e2e/singlecard/test_sampler.py
           pytest -sv --durations=0 tests/e2e/singlecard/test_vlm.py
           pytest -sv --durations=0 tests/e2e/singlecard/test_xlite.py
+          pytest -sv --durations=0 tests/e2e/singlecard/test_models.py
           pytest -sv --durations=0 tests/e2e/singlecard/pooling/
           pytest -sv --durations=0 tests/e2e/singlecard/compile/test_norm_quant_fusion.py
           pytest -sv --durations=0 tests/e2e/singlecard/test_multistream_overlap_shared_expert.py
diff --git a/tests/e2e/singlecard/test_models.py b/tests/e2e/singlecard/test_models.py
new file mode 100644
index 00000000..20fe6f77
--- /dev/null
+++ b/tests/e2e/singlecard/test_models.py
@@ -0,0 +1,36 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+# Adapted from vllm/tests/entrypoints/llm/test_guided_generate.py
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+
+from tests.e2e.conftest import VllmRunner
+
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
+
+def test_minicpm_2b() -> None:
+    example_prompts = [
+        "Hello, my name is",
+    ]
+    max_tokens = 5
+
+    with VllmRunner("openbmb/MiniCPM-2B-sft-bf16",
+                    max_model_len=512,
+                    gpu_memory_utilization=0.7) as runner:
+        runner.generate_greedy(example_prompts, max_tokens)
diff --git a/vllm_ascend/ops/linear.py b/vllm_ascend/ops/linear.py
index 51dc011a..2dd9689b 100644
--- a/vllm_ascend/ops/linear.py
+++ b/vllm_ascend/ops/linear.py
@@ -36,7 +36,7 @@ from vllm.model_executor.layers.quantization.base_config import \
 from vllm.model_executor.utils import set_weight_attrs
 
 from vllm_ascend.ops.linear_op import get_parallel_op, get_replicated_op
-from vllm_ascend.utils import maybe_trans_nz
+from vllm_ascend.utils import enable_sp, maybe_trans_nz
 
 
 class AscendUnquantizedLinearMethod(UnquantizedLinearMethod):
@@ -219,6 +219,9 @@ class AscendRowParallelLinear(RowParallelLinear):
     and the original TP group in other modules.
     """
 
+    # NOTE: Globally unique prefix identifier used in SP scenarios
+    unique_prefix_idx = 0
+
     def __init__(
         self,
         input_size: int,
@@ -234,14 +237,15 @@ class AscendRowParallelLinear(RowParallelLinear):
         return_bias: bool = True,
         disable_tp: bool = False,
     ):
-        compilation_config = get_current_vllm_config().compilation_config
-        # TODO(shaopeng-666): Remove the visual check after the mm model reconstruction is complete.
-        # TODO(MengqingCao): Remove the empty string check, after specifying the prefix in linear layers of some models in the vLLM.
-        if prefix in compilation_config.static_forward_context and \
-            prefix != "" and \
-            "visual" not in prefix:
-            raise ValueError(f"Duplicate layer name: {prefix}")
-        compilation_config.static_forward_context[prefix] = self
+        # TODO(kunpengW-code): Specify the prefix in the linear layers of some models in vLLM.
+        if enable_sp():
+            compilation_config = get_current_vllm_config().compilation_config
+            unique_prefix = prefix
+            if prefix in compilation_config.static_forward_context:
+                unique_prefix = f"{prefix}.unique_prefix{AscendRowParallelLinear.unique_prefix_idx}"
+                AscendRowParallelLinear.unique_prefix_idx += 1
+            self.unique_prefix = unique_prefix
+            compilation_config.static_forward_context[unique_prefix] = self
 
         self.custom_op, self.tp_rank, self.tp_size = get_parallel_op(
             disable_tp, prefix, self, "row")
diff --git a/vllm_ascend/ops/linear_op.py b/vllm_ascend/ops/linear_op.py
index eec63dd3..980fd2a2 100644
--- a/vllm_ascend/ops/linear_op.py
+++ b/vllm_ascend/ops/linear_op.py
@@ -484,6 +484,10 @@ class SequenceColumnParallelOp(CustomColumnParallelOp):
 
 class SequenceRowParallelOp(CustomRowParallelOp):
 
+    def __init__(self, layer):
+        super().__init__(layer)
+        self.unique_prefix = None
+
     def apply_impl(
         self, input_: torch.Tensor
     ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
@@ -509,7 +513,7 @@ class SequenceRowParallelOp(CustomRowParallelOp):
                 bias=bias_)
         else:
             output = torch.ops.vllm.matmul_and_reduce(input_parallel,
-                                                      self.prefix)
+                                                      self.unique_prefix)
 
         output_bias = self.bias if self.skip_bias_add else None
         return output, output_bias
@@ -602,6 +606,7 @@ class SequenceRowParallelOp(CustomRowParallelOp):
         super().update_attrs()
         self.input_is_parallel = self.layer.input_is_parallel
         self.reduce_results = self.layer.reduce_results
+        self.unique_prefix = self.layer.unique_prefix
 
 
 def _get_column_parallel_op(