Drop torchair (#4814)

aclgraph is now stable and fast, so let's drop the torchair graph mode; a minimal aclgraph-only configuration sketch is included after the diff below.

TODO: some leftover logic that was added to adapt torchair should be cleaned up as well. We'll do that in a follow-up PR.

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
Author: wangxiyuan
Date: 2025-12-10 09:20:40 +08:00
Committed by: GitHub
Parent: ba9cda9dfd
Commit: 835b4c8f1d
84 changed files with 77 additions and 16881 deletions


@@ -1,106 +0,0 @@
from __future__ import annotations

import pytest

from vllm import SamplingParams
from vllm.config import CompilationConfig, CUDAGraphMode

from tests.e2e.conftest import VllmRunner


@pytest.fixture
def sampling_config():
    return SamplingParams(temperature=0, max_tokens=256, ignore_eos=False)


@pytest.fixture
def model_name():
    return "wemaster/deepseek_mtp_main_random_bf16"


def mtp_torchair_correctness(
        sampling_config: SamplingParams,
        model_name: str,
        graph_mode: CUDAGraphMode = CUDAGraphMode.PIECEWISE,
):
    '''
    The outputs of the original LLM and the speculative LLM should match
    when using MTP speculative decoding.
    '''
    example_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    # Reference run: torchair graph mode, no speculative decoding.
    with VllmRunner(model_name,
                    tensor_parallel_size=1,
                    gpu_memory_utilization=0.7,
                    max_model_len=256,
                    enforce_eager=False,
                    additional_config={
                        "torchair_graph_config": {
                            "enabled": True,
                            "use_cached_graph": False,
                            "graph_batch_sizes": [1, 2, 4],
                        },
                        "multistream_overlap_shared_expert": "True"
                    }) as ref_llm:
        ref_outputs = ref_llm.generate(example_prompts, sampling_config)

    graph_mode_str = "PIECEWISE"
    if graph_mode == CUDAGraphMode.FULL:
        graph_mode_str = "FULL"

    # Speculative run: torchair graph mode plus MTP speculative decoding.
    with VllmRunner(model_name,
                    tensor_parallel_size=1,
                    max_num_seqs=256,
                    gpu_memory_utilization=0.7,
                    distributed_executor_backend="mp",
                    enable_expert_parallel=True,
                    speculative_config={
                        "method": "mtp",
                        "num_speculative_tokens": 1,
                    },
                    enforce_eager=False,
                    max_model_len=2000,
                    compilation_config=CompilationConfig(
                        cudagraph_mode=graph_mode_str),
                    additional_config={
                        "torchair_graph_config": {
                            "enabled": True,
                            "use_cached_graph": False,
                            "graph_batch_sizes": [1, 2, 4],
                        },
                        "multistream_overlap_shared_expert": "True"
                    }) as spec_llm:
        spec_outputs = spec_llm.generate(example_prompts, sampling_config)

    matches = 0
    misses = 0
    for ref_output, spec_output in zip(ref_outputs, spec_outputs):
        ref_token_ids = ref_output[0][0]
        spec_token_ids = spec_output[0][0]
        if ref_token_ids == spec_token_ids[:len(ref_token_ids)]:
            matches += 1
        else:
            misses += 1
            print(f"ref_output: {ref_output[1][0]}")
            print(f"spec_output: {spec_output[1][0]}")

    # Heuristic: expect at least 66% of the prompts to match exactly.
    # Upon failure, inspect the outputs to check for inaccuracy.
    assert matches > int(0.66 * len(ref_outputs))


def test_mtp_torchair_correctness_piecewise(
        sampling_config: SamplingParams,
        model_name: str,
):
    mtp_torchair_correctness(sampling_config, model_name)


def test_mtp_torchair_correctness_full(
        sampling_config: SamplingParams,
        model_name: str,
):
    mtp_torchair_correctness(sampling_config, model_name, CUDAGraphMode.FULL)
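
For reference, here is a minimal sketch (not part of this commit) of what the reference run above looks like once torchair is dropped and graph capture is driven by aclgraph through CompilationConfig alone. It assumes VllmRunner keeps the constructor arguments used in the deleted test; the prompt and sampling values are illustrative only.

# Hedged sketch, not part of this commit: the reference run above with the
# "torchair_graph_config" block removed, so graph capture is handled by
# aclgraph via CompilationConfig. VllmRunner arguments are assumed to stay
# the same as in the deleted test.
from vllm import SamplingParams
from vllm.config import CompilationConfig

from tests.e2e.conftest import VllmRunner

with VllmRunner("wemaster/deepseek_mtp_main_random_bf16",
                tensor_parallel_size=1,
                gpu_memory_utilization=0.7,
                max_model_len=256,
                enforce_eager=False,
                compilation_config=CompilationConfig(
                    cudagraph_mode="PIECEWISE")) as llm:
    outputs = llm.generate(["Hello, my name is"],
                           SamplingParams(temperature=0, max_tokens=256))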