Drop torchair (#4814)

aclgraph is now stable and fast, so this change drops the torchair graph mode. TODO: the remaining logic that was written to adapt to torchair should be cleaned up as well; that will be done in a follow-up PR. - vLLM version: v0.12.0 - vLLM main: ad32e3e19c Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> Co-authored-by: Mengqing Cao <cmq0113@163.com>
2025-12-10 09:20:40 +08:00
parent ba9cda9dfd
commit 835b4c8f1d
84 changed files with 77 additions and 16881 deletions
--- a/tests/e2e/multicard/test_offline_inference_distributed.py
+++ b/tests/e2e/multicard/test_offline_inference_distributed.py
@@ -78,9 +78,6 @@ def test_models_distributed_DeepSeek_multistream_moe():
            tensor_parallel_size=2,
            distributed_executor_backend="mp",
            additional_config={
-                "torchair_graph_config": {
-                    "enabled": True,
-                },
                "enable_multistream_moe": True,
                "refresh": True,
            },
@@ -144,17 +141,12 @@ def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
        "Hello, my name is",
    ]
    max_tokens = 5
-    with VllmRunner(
-            snapshot_download(model),
-            dtype="auto",
-            tensor_parallel_size=2,
-            quantization="ascend",
-            enforce_eager=True,
-            enable_expert_parallel=True,
-            additional_config={"torchair_graph_config": {
-                "enabled": False,
-            }},
-    ) as vllm_model:
+    with VllmRunner(snapshot_download(model),
+                    dtype="auto",
+                    tensor_parallel_size=2,
+                    quantization="ascend",
+                    enforce_eager=True,
+                    enable_expert_parallel=True) as vllm_model:
        vllm_model.generate_greedy(prompts, max_tokens)