Drop torchair (#4814)

aclgraph is now stable and fast, so this change drops torchair graph mode. TODO: the remaining logic that exists only to adapt to torchair should be cleaned up as well; that will be done in a follow-up PR. - vLLM version: v0.12.0 - vLLM main: ad32e3e19c Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> Co-authored-by: Mengqing Cao <cmq0113@163.com>
2025-12-10 09:20:40 +08:00
parent ba9cda9dfd
commit 835b4c8f1d
84 changed files with 77 additions and 16881 deletions
--- a/tests/e2e/multicard/test_offline_inference_distributed.py
+++ b/tests/e2e/multicard/test_offline_inference_distributed.py
@@ -78,9 +78,6 @@ def test_models_distributed_DeepSeek_multistream_moe():
            tensor_parallel_size=2,
            distributed_executor_backend="mp",
            additional_config={
-                "torchair_graph_config": {
-                    "enabled": True,
-                },
                "enable_multistream_moe": True,
                "refresh": True,
            },
@@ -144,17 +141,12 @@ def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
        "Hello, my name is",
    ]
    max_tokens = 5
-    with VllmRunner(
-            snapshot_download(model),
-            dtype="auto",
-            tensor_parallel_size=2,
-            quantization="ascend",
-            enforce_eager=True,
-            enable_expert_parallel=True,
-            additional_config={"torchair_graph_config": {
-                "enabled": False,
-            }},
-    ) as vllm_model:
+    with VllmRunner(snapshot_download(model),
+                    dtype="auto",
+                    tensor_parallel_size=2,
+                    quantization="ascend",
+                    enforce_eager=True,
+                    enable_expert_parallel=True) as vllm_model:
        vllm_model.generate_greedy(prompts, max_tokens)


--- a/tests/e2e/multicard/test_torchair_graph_mode.py
+++ b/tests/e2e/multicard/test_torchair_graph_mode.py
@@ -1,290 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# Copyright 2023 The vLLM team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# This file is a part of the vllm-ascend project.
-#
-"""Compare the short outputs of HF and vLLM when using greedy sampling.
-
-Run `pytest tests/multicard/test_torchair_graph_mode.py`.
-"""
-import os
-from typing import Dict
-
-import pytest
-
-from tests.e2e.conftest import VllmRunner
-
-os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
-
-
-def _deepseek_torchair_test_fixture(
-    additional_config: Dict,
-    *,
-    tensor_parallel_size=2,
-    use_v1_schduler=False,
-):
-    example_prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-    kwargs = {}
-    if not use_v1_schduler:
-        kwargs = {
-            "refresh": True,
-        }
-    additional_config.update(**kwargs)
-
-    with VllmRunner(
-            "vllm-ascend/DeepSeek-V3-Pruning",
-            dtype="half",
-            tensor_parallel_size=tensor_parallel_size,
-            distributed_executor_backend="mp",
-            additional_config=additional_config,
-    ) as vllm_model:
-        # use greedy sampler to make sure the generated results are fix
-        vllm_output = vllm_model.generate_greedy(example_prompts, 5)
-
-    # NOTE: vllm-ascend/DeepSeek-V3-Pruning is a random weight of
-    # DeepSeek-V3 with 2 hidden layers, thus the golden results seems
-    # inaccurate. This will only change if accuracy improves with the
-    # official weights of DeepSeek-V3.
-    golden_results = [
-        'Hello, my name is下载早点向前很有่อง',
-        'The president of the United States isSender)## physiological Albany',
-        'The capital of France is Rocky转角 hospitalizedinterval sparked',
-        'The future of AI is её asegο BIOS一扫',
-    ]
-
-    assert len(golden_results) == len(vllm_output)
-    for i in range(len(vllm_output)):
-        assert golden_results[i] == vllm_output[i][1]
-        print(f"Generated text: {vllm_output[i][1]!r}")
-
-
-def test_e2e_deepseekv3_with_torchair():
-    additional_config = {
-        "torchair_graph_config": {
-            "enabled": True,
-        },
-    }
-    _deepseek_torchair_test_fixture(additional_config)
-
-
-def test_e2e_deepseekv3_with_torchair_ms_mla():
-    additional_config = {
-        "torchair_graph_config": {
-            "enabled": True,
-            "enable_multistream_mla": True,
-        },
-    }
-    _deepseek_torchair_test_fixture(additional_config)
-
-
-@pytest.mark.skip("accuracy test failed. Fix me")
-def test_e2e_deepseekv3_with_torchair_v1scheduler():
-    additional_config = {
-        "torchair_graph_config": {
-            "enabled": True,
-        },
-    }
-    _deepseek_torchair_test_fixture(additional_config, use_v1_schduler=True)
-
-
-def _pangu_torchair_test_fixture(
-    additional_config: Dict,
-    *,
-    tensor_parallel_size=2,
-):
-    example_prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-
-    # torchair is only work without chunked-prefill now
-    kwargs = {
-        "refresh": True,
-    }
-    additional_config.update(**kwargs)
-
-    with VllmRunner(
-            "vllm-ascend/pangu-pro-moe-pruing",
-            dtype="half",
-            tensor_parallel_size=tensor_parallel_size,
-            distributed_executor_backend="mp",
-            additional_config=additional_config,
-            enable_expert_parallel=True,
-    ) as vllm_model:
-        # use greedy sampler to make sure the generated results are fix
-        vllm_output = vllm_model.generate_greedy(example_prompts, 5)
-
-    # NOTE: vllm-ascend/pangu-pro-moe-pruing is only part of PanguProMoE
-    # with 2 hidden layers, thus the golden results seems inaccurate.
-    # This will only change if accuracy changes with the official weights
-    # of PanguProMoE.
-    golden_results = [
-        'Hello, my name is Remempondeprecatedmiot忱',
-        'The president of the United States is Remem下的一个 rever ceremoni Segnali',
-        'The capital of France is Rememvoud administrativ Remem投',
-        'The future of AI isotope Segnali Zoeken精细化 supus',
-    ]
-
-    assert len(golden_results) == len(vllm_output)
-    for i in range(len(vllm_output)):
-        assert golden_results[i] == vllm_output[i][1]
-        print(f"Generated text: {vllm_output[i][1]!r}")
-
-
-@pytest.mark.skip("skipping test_e2e_pangu_with_torchair")
-def test_e2e_pangu_with_torchair():
-    additional_config = {
-        "torchair_graph_config": {
-            "enabled": True,
-        },
-    }
-    _pangu_torchair_test_fixture(additional_config)
-
-
-def _qwen_torchair_test_fixture(
-    model,
-    tp,
-    enable_expert_parallel,
-):
-    # The current access control does not support 16 cards,
-    # so the MC2 operator in Qwen's graph mode cannot run.
-    # Once 16-card support is available,
-    # this e2e can be switched to graph mode.
-    example_prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-
-    additional_config = {
-        "torchair_graph_config": {
-            "enabled": False,
-        },
-        "refresh": True,
-    }
-
-    with VllmRunner(
-            model,
-            dtype="half",
-            tensor_parallel_size=tp,
-            distributed_executor_backend="mp",
-            enforce_eager=True,
-            additional_config=additional_config,
-            enable_expert_parallel=enable_expert_parallel,
-    ) as vllm_model:
-        # use greedy sampler to make sure the generated results are fix
-        vllm_output = vllm_model.generate_greedy(example_prompts, 5)
-
-    # NOTE: vllm-ascend/pangu-pro-moe-pruing is only part of PanguProMoE
-    # with 2 hidden layers, thus the golden results seems inaccurate.
-    # This will only change if accuracy changes with the official weights
-    # of PanguProMoE.
-    golden_results = [
-        'Hello, my name is Remempondeprecatedmiot忱',
-        'The president of the United States is Remem下的一个 rever ceremoni Segnali',
-        'The capital of France is Rememvoud administrativ Remem投',
-        'The future of AI isotope Segnali Zoeken精细化 supus',
-    ]
-
-    assert len(golden_results) == len(vllm_output)
-    for i in range(len(vllm_output)):
-        print(f"Generated text: {vllm_output[i][1]!r}")
-
-
-def test_e2e_qwen2_with_torchair():
-    _qwen_torchair_test_fixture("Qwen/Qwen2.5-0.5B-Instruct", 2, False)
-
-
-def test_e2e_qwen3_moe_with_torchair():
-    _qwen_torchair_test_fixture("Qwen/Qwen3-30B-A3B", 2, True)
-
-
-# test deepseek-v2-lite
-def _deepseek_v2_lite_torchair_test_fixure(
-    additional_config: Dict,
-    *,
-    tensor_parallel_size=2,
-    use_v1_schduler=False,
-):
-    example_prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-
-    kwargs = {}
-    if not use_v1_schduler:
-        kwargs = {
-            "refresh": True,
-        }
-    additional_config.update(**kwargs)
-
-    with VllmRunner(
-            "deepseek-ai/DeepSeek-V2-Lite",
-            dtype="half",
-            tensor_parallel_size=tensor_parallel_size,
-            distributed_executor_backend="mp",
-            additional_config=additional_config,
-    ) as vllm_model:
-        vllm_output = vllm_model.generate_greedy(example_prompts, 5)
-
-    # NOTE: deepseek-ai/DeepSeek-V2-Lite is a random weight of
-    # DeepSeek-V2-Lite with 2 hidden layers, thus the golden results seems
-    # inaccurate. This will only change if accuracy improves with the
-    # official weights of DeepSeek-V2-Lite.
-
-    for i in range(len(vllm_output)):
-        generated_text = vllm_output[i][1]
-        assert len(
-            generated_text.strip()) > 0, f"The {i}-th output is null, failed"
-
-
-def test_e2e_deepseekv2lite_with_torchair():
-    additional_config = {
-        "torchair_graph_config": {
-            "enabled": True,
-        },
-    }
-    _deepseek_v2_lite_torchair_test_fixure(additional_config)
-
-
-def test_e2e_deepseekv2lite_with_torchair_v1scheduler():
-    additional_config = {
-        "torchair_graph_config": {
-            "enabled": True,
-        },
-    }
-    _deepseek_v2_lite_torchair_test_fixure(additional_config,
-                                           use_v1_schduler=True)
-
-
-# kv_cache enable e2e test
-def test_e2e_deepseekv2lite_with_nz():
-    additional_config = {
-        "torchair_graph_config": {
-            "enabled": True,
-            "enable_kv_nz": True,
-        },
-    }
-    _deepseek_v2_lite_torchair_test_fixure(additional_config)