diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
index 97ccc4a6..92dc7f7b 100644
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -208,7 +208,6 @@ jobs:
           pytest -sv --durations=0 tests/e2e/multicard/test_expert_parallel.py
           pytest -sv --durations=0 tests/e2e/multicard/test_external_launcher.py
           pytest -sv --durations=0 tests/e2e/multicard/test_single_request_aclgraph.py
-          pytest -sv --durations=0 tests/e2e/multicard/test_fused_moe_allgather_ep.py

           # torch 2.8 doesn't work with lora, fix me
           #pytest -sv --durations=0 tests/e2e/multicard/test_ilama_lora_tp2.py
diff --git a/tests/e2e/multicard/test_fused_moe_allgather_ep.py b/tests/e2e/multicard/test_fused_moe_allgather_ep.py
deleted file mode 100644
index 8404fe7c..00000000
--- a/tests/e2e/multicard/test_fused_moe_allgather_ep.py
+++ /dev/null
@@ -1,74 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# Copyright 2023 The vLLM team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-"""
-Execute the inference of fused_moe_allgather_ep and fused_moe_alltoall_ep.
-
-Run 'pytest tests/multicard/test_fused_moe_allgather_ep.py'.
-"""
-
-import os
-from unittest.mock import patch
-
-import pytest
-from modelscope import snapshot_download  # type: ignore
-from vllm import SamplingParams
-
-from tests.e2e.conftest import VllmRunner
-
-
-@pytest.mark.skipif(
-    True,
-    reason=
-    "Current disaggregated pd implementation may cause memory pulse, which will cause this test OOM, skip this test until the ringmla is ready "
-)
-@patch.dict(
-    os.environ, {
-        "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
-        "TASK_QUEUE_ENABLE": "1",
-        "VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP": "1"
-    })
-def test_deepseek_v3_moe_fused_allgather_ep_tp2():
-    example_prompts = ["Hello, my name is"]
-    sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
-
-    with VllmRunner(snapshot_download("vllm-ascend/DeepSeek-V3-Pruning"),
-                    tensor_parallel_size=2,
-                    max_model_len=1024,
-                    dtype="auto",
-                    enable_expert_parallel=True) as vllm_model:
-        vllm_model.generate(example_prompts, sampling_params)
-
-
-@pytest.mark.skipif(
-    True,
-    reason=
-    "Current disaggregated pd implementation may cause memory pulse, which will cause this test OOM, skip this test until the ringmla is ready "
-)
-@patch.dict(os.environ, {
-    "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
-    "TASK_QUEUE_ENABLE": "1"
-})
-def test_deepseek_v3_moe_fused_alltoall_ep_tp2():
-    example_prompts = ["Hello, my name is"]
-    sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
-
-    with VllmRunner(snapshot_download("vllm-ascend/DeepSeek-V3-Pruning"),
-                    tensor_parallel_size=2,
-                    max_model_len=1024,
-                    dtype="auto",
-                    enable_expert_parallel=True) as vllm_model:
-        vllm_model.generate(example_prompts, sampling_params)
diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py
index 0242c332..8d8af052 100644
--- a/vllm_ascend/envs.py
+++ b/vllm_ascend/envs.py
@@ -68,11 +68,6 @@ env_variables: Dict[str, Callable[[], Any]] = {
     # that the
     # correct package is installed.
     "VLLM_VERSION": lambda: os.getenv("VLLM_VERSION", None),
-    # Whether to enable fused_experts_allgather_ep. MoeInitRoutingV3 and
-    # GroupedMatmulFinalizeRouting operators are combined to implement EP.
-    "VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP":
-    lambda: bool(int(os.getenv("VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP", '0'))
-                 ),
     # Whether to enable the model execute time observe profile. Disable it when
     # running vllm ascend in production environment.
     "VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE":