Drop torchair (#4814)
aclgraph is now stable and fast, so let's drop torchair graph mode.
TODO: some leftover logic that adapts to torchair should be cleaned up as
well; we'll do that in a follow-up PR.
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
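With torchair gone, graph capture is selected through vLLM's standard compilation config, exactly as the deleted test below already does for its speculative run. A minimal sketch, assuming the aclgraph path keeps using CompilationConfig with a cudagraph_mode string (the precise post-cleanup wiring is left to the follow-up PR mentioned above):

from vllm.config import CompilationConfig

# Piecewise capture is what the deleted test defaults to; "FULL" would
# capture the whole graph instead.
compilation_config = CompilationConfig(cudagraph_mode="PIECEWISE")  # or "FULL"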
@@ -1,106 +0,0 @@
from __future__ import annotations

import pytest

from vllm import SamplingParams
from vllm.config import CompilationConfig, CUDAGraphMode

from tests.e2e.conftest import VllmRunner


@pytest.fixture
def sampling_config():
    return SamplingParams(temperature=0, max_tokens=256, ignore_eos=False)


@pytest.fixture
def model_name():
    return "wemaster/deepseek_mtp_main_random_bf16"


def mtp_torchair_correctness(
    sampling_config: SamplingParams,
    model_name: str,
    graph_mode: CUDAGraphMode = CUDAGraphMode.PIECEWISE,
):
    '''
    Compare the outputs of an original LLM and a speculative LLM;
    they should be the same when using MTP speculative decoding.
    '''
    example_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    with VllmRunner(model_name,
                    tensor_parallel_size=1,
                    gpu_memory_utilization=0.7,
                    max_model_len=256,
                    enforce_eager=False,
                    additional_config={
                        "torchair_graph_config": {
                            "enabled": True,
                            "use_cached_graph": False,
                            "graph_batch_sizes": [1, 2, 4],
                        },
                        "multistream_overlap_shared_expert": "True"
                    }) as ref_llm:
        ref_outputs = ref_llm.generate(example_prompts, sampling_config)

    graph_mode_str = "PIECEWISE"
    if graph_mode == CUDAGraphMode.FULL:
        graph_mode_str = "FULL"

    with VllmRunner(model_name,
                    tensor_parallel_size=1,
                    max_num_seqs=256,
                    gpu_memory_utilization=0.7,
                    distributed_executor_backend="mp",
                    enable_expert_parallel=True,
                    speculative_config={
                        "method": "mtp",
                        "num_speculative_tokens": 1,
                    },
                    enforce_eager=False,
                    max_model_len=2000,
                    compilation_config=CompilationConfig(
                        cudagraph_mode=graph_mode_str),
                    additional_config={
                        "torchair_graph_config": {
                            "enabled": True,
                            "use_cached_graph": False,
                            "graph_batch_sizes": [1, 2, 4],
                        },
                        "multistream_overlap_shared_expert": "True"
                    }) as spec_llm:
        spec_outputs = spec_llm.generate(example_prompts, sampling_config)

    matches = 0
    misses = 0
    for ref_output, spec_output in zip(ref_outputs, spec_outputs):
        ref_token_ids = ref_output[0][0]
        spec_token_ids = spec_output[0][0]
        if ref_token_ids == spec_token_ids[:len(ref_token_ids)]:
            matches += 1
        else:
            misses += 1
            print(f"ref_output: {ref_output[1][0]}")
            print(f"spec_output: {spec_output[1][0]}")

    # Heuristic: expect at least 66% of the prompts to match exactly.
    # Upon failure, inspect the outputs to check for inaccuracy.
    assert matches > int(0.66 * len(ref_outputs))


def test_mtp_torchair_correctness_piecewise(
    sampling_config: SamplingParams,
    model_name: str,
):
    mtp_torchair_correctness(sampling_config, model_name)


def test_mtp_torchair_correctness_full(
    sampling_config: SamplingParams,
    model_name: str,
):
    mtp_torchair_correctness(sampling_config, model_name, CUDAGraphMode.FULL)
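For contrast, a minimal sketch of what the reference run above might look like once torchair is dropped: the torchair_graph_config block in additional_config disappears and the capture mode is driven only by compilation_config. The names mirror the deleted test; the exact post-cleanup configuration is an assumption, not part of this commit.

from vllm import SamplingParams
from vllm.config import CompilationConfig

from tests.e2e.conftest import VllmRunner

example_prompts = ["Hello, my name is"]
sampling_config = SamplingParams(temperature=0, max_tokens=256, ignore_eos=False)

# Hypothetical aclgraph-era equivalent of the removed reference run:
# no torchair_graph_config; the graph mode comes from compilation_config.
with VllmRunner("wemaster/deepseek_mtp_main_random_bf16",
                tensor_parallel_size=1,
                gpu_memory_utilization=0.7,
                max_model_len=256,
                enforce_eager=False,
                compilation_config=CompilationConfig(
                    cudagraph_mode="PIECEWISE")) as ref_llm:
    ref_outputs = ref_llm.generate(example_prompts, sampling_config)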