Drop torchair (#4814)
aclgraph is now stable and fast, so this change drops the torchair graph mode.
TODO: the remaining logic that was added to adapt to torchair should be cleaned up as well;
that will be done in a follow-up PR.
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
This commit is contained in:
@@ -1,59 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
import pytest
|
||||
import vllm # noqa: F401
|
||||
|
||||
import vllm_ascend # noqa: F401
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
|
||||
# Model identifiers exercised by the Pangu end-to-end test.
MODELS = ["IntervitensInc/pangu-pro-moe-model"]

# Parametrization values that request the torchair graph mode.
# NOTE(review): each entry is itself wrapped in an "additional_config" key and
# is later passed as `additional_config=...`, so "torchair_graph_config" ends
# up one level deeper than a top-level option - confirm this is intended.
ADDITIONAL_CONFIG = [
    {"additional_config": {"torchair_graph_config": {"enabled": True}}},
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float16"])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enfore_eager", [True, False])
@pytest.mark.parametrize("additional_config", ADDITIONAL_CONFIG)
def test_pangu_model(model: str, dtype: str, max_tokens: int,
                     enfore_eager: bool, additional_config: dict) -> None:
    """Smoke-test greedy generation for the Pangu model on four cards.

    When ``enfore_eager`` is true the parametrized ``additional_config`` is
    discarded and an empty dict is used instead. The runner itself is always
    started with ``enforce_eager=True``; the parameter only selects which
    additional config is forwarded.
    """
    # NOTE(review): `enfore_eager` (sic) is never forwarded to the runner -
    # confirm whether `enforce_eager` was meant to follow it.
    runner_config = {} if enfore_eager else additional_config
    prompts = ["Hello, my name is", "The future of AI is"]

    runner_kwargs = dict(
        tensor_parallel_size=4,
        dtype=dtype,
        max_model_len=1024,
        enforce_eager=True,
        enable_expert_parallel=True,
        additional_config=runner_config,
        distributed_executor_backend="mp",
    )
    with VllmRunner(model, **runner_kwargs) as vllm_model:
        vllm_model.generate_greedy(prompts, max_tokens)
|
||||
@@ -78,9 +78,6 @@ def test_models_distributed_DeepSeek_multistream_moe():
|
||||
tensor_parallel_size=2,
|
||||
distributed_executor_backend="mp",
|
||||
additional_config={
|
||||
"torchair_graph_config": {
|
||||
"enabled": True,
|
||||
},
|
||||
"enable_multistream_moe": True,
|
||||
"refresh": True,
|
||||
},
|
||||
@@ -144,17 +141,12 @@ def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
|
||||
"Hello, my name is",
|
||||
]
|
||||
max_tokens = 5
|
||||
with VllmRunner(
|
||||
snapshot_download(model),
|
||||
dtype="auto",
|
||||
tensor_parallel_size=2,
|
||||
quantization="ascend",
|
||||
enforce_eager=True,
|
||||
enable_expert_parallel=True,
|
||||
additional_config={"torchair_graph_config": {
|
||||
"enabled": False,
|
||||
}},
|
||||
) as vllm_model:
|
||||
with VllmRunner(snapshot_download(model),
|
||||
dtype="auto",
|
||||
tensor_parallel_size=2,
|
||||
quantization="ascend",
|
||||
enforce_eager=True,
|
||||
enable_expert_parallel=True) as vllm_model:
|
||||
vllm_model.generate_greedy(prompts, max_tokens)
|
||||
|
||||
|
||||
|
||||
@@ -1,290 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
#
|
||||
"""Compare the short outputs of HF and vLLM when using greedy sampling.
|
||||
|
||||
Run `pytest tests/multicard/test_torchair_graph_mode.py`.
|
||||
"""
|
||||
import os
|
||||
from typing import Dict
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
|
||||
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
|
||||
|
||||
|
||||
def _deepseek_torchair_test_fixture(
    additional_config: Dict,
    *,
    tensor_parallel_size=2,
    use_v1_schduler=False,
):
    """Generate greedily with DeepSeek-V3-Pruning and compare to golden text.

    Args:
        additional_config: Options forwarded to ``VllmRunner`` as
            ``additional_config``. The dict is copied, never modified.
        tensor_parallel_size: Tensor-parallel degree passed to the runner.
        use_v1_schduler: When false, ``"refresh": True`` is added to the
            forwarded config; when true the config is forwarded as given.

    Raises:
        AssertionError: If the number of outputs or any generated text
            differs from the golden results.
    """
    example_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    # Work on a shallow copy so the caller's dict is left untouched.
    runner_config = dict(additional_config)
    if not use_v1_schduler:
        runner_config["refresh"] = True

    with VllmRunner(
            "vllm-ascend/DeepSeek-V3-Pruning",
            dtype="half",
            tensor_parallel_size=tensor_parallel_size,
            distributed_executor_backend="mp",
            additional_config=runner_config,
    ) as vllm_model:
        # Greedy sampling keeps the generated results deterministic.
        vllm_output = vllm_model.generate_greedy(example_prompts, 5)

    # NOTE: vllm-ascend/DeepSeek-V3-Pruning is a random weight of
    # DeepSeek-V3 with 2 hidden layers, thus the golden results look
    # meaningless. They should only change if accuracy improves with the
    # official weights of DeepSeek-V3.
    golden_results = [
        'Hello, my name is下载早点向前很有่อง',
        'The president of the United States isSender)## physiological Albany',
        'The capital of France is Rocky转角 hospitalizedinterval sparked',
        'The future of AI is её asegο BIOS一扫',
    ]

    assert len(golden_results) == len(vllm_output)
    for expected_text, output in zip(golden_results, vllm_output):
        assert expected_text == output[1]
        print(f"Generated text: {output[1]!r}")
|
||||
|
||||
|
||||
def test_e2e_deepseekv3_with_torchair():
    """Run the DeepSeek-V3 fixture with torchair graph mode switched on."""
    torchair_settings = {"enabled": True}
    _deepseek_torchair_test_fixture(
        {"torchair_graph_config": torchair_settings})
|
||||
|
||||
|
||||
def test_e2e_deepseekv3_with_torchair_ms_mla():
    """Run the DeepSeek-V3 fixture with torchair and multistream MLA enabled."""
    torchair_settings = {
        "enabled": True,
        "enable_multistream_mla": True,
    }
    _deepseek_torchair_test_fixture(
        {"torchair_graph_config": torchair_settings})
|
||||
|
||||
|
||||
@pytest.mark.skip("accuracy test failed. Fix me")
def test_e2e_deepseekv3_with_torchair_v1scheduler():
    """Run the DeepSeek-V3 fixture with torchair and ``use_v1_schduler=True``.

    Currently skipped; see the skip reason on the decorator.
    """
    torchair_settings = {"enabled": True}
    _deepseek_torchair_test_fixture(
        {"torchair_graph_config": torchair_settings},
        use_v1_schduler=True,
    )
|
||||
|
||||
|
||||
def _pangu_torchair_test_fixture(
    additional_config: Dict,
    *,
    tensor_parallel_size=2,
):
    """Generate greedily with the pruned Pangu model and compare to golden text.

    Args:
        additional_config: Options forwarded to ``VllmRunner`` as
            ``additional_config`` with ``"refresh": True`` added. The dict is
            copied, never modified.
        tensor_parallel_size: Tensor-parallel degree passed to the runner.

    Raises:
        AssertionError: If the number of outputs or any generated text
            differs from the golden results.
    """
    example_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    # Copy first so the caller's dict is left untouched.
    # NOTE(review): the original comment here said torchair only works
    # without chunked prefill, yet only "refresh" is set - presumably a
    # leftover from an earlier scheduler option; confirm.
    runner_config = dict(additional_config)
    runner_config["refresh"] = True

    with VllmRunner(
            "vllm-ascend/pangu-pro-moe-pruing",
            dtype="half",
            tensor_parallel_size=tensor_parallel_size,
            distributed_executor_backend="mp",
            additional_config=runner_config,
            enable_expert_parallel=True,
    ) as vllm_model:
        # Greedy sampling keeps the generated results deterministic.
        vllm_output = vllm_model.generate_greedy(example_prompts, 5)

    # NOTE: vllm-ascend/pangu-pro-moe-pruing is only part of PanguProMoE
    # with 2 hidden layers, thus the golden results look meaningless.
    # They should only change if accuracy changes with the official weights
    # of PanguProMoE.
    golden_results = [
        'Hello, my name is Remempondeprecatedmiot忱',
        'The president of the United States is Remem下的一个 rever ceremoni Segnali',
        'The capital of France is Rememvoud administrativ Remem投',
        'The future of AI isotope Segnali Zoeken精细化 supus',
    ]

    assert len(golden_results) == len(vllm_output)
    for expected_text, output in zip(golden_results, vllm_output):
        assert expected_text == output[1]
        print(f"Generated text: {output[1]!r}")
|
||||
|
||||
|
||||
@pytest.mark.skip("skipping test_e2e_pangu_with_torchair")
def test_e2e_pangu_with_torchair():
    """Run the Pangu fixture with torchair graph mode on (currently skipped)."""
    torchair_settings = {"enabled": True}
    _pangu_torchair_test_fixture(
        {"torchair_graph_config": torchair_settings})
|
||||
|
||||
|
||||
def _qwen_torchair_test_fixture(
    model,
    tp,
    enable_expert_parallel,
):
    """Run a Qwen model in eager mode and check one output per prompt.

    Args:
        model: Model identifier passed to ``VllmRunner``.
        tp: Tensor-parallel size.
        enable_expert_parallel: Forwarded to ``VllmRunner`` unchanged.

    Raises:
        AssertionError: If the number of outputs differs from the number of
            prompts.
    """
    # The current access control does not support 16 cards,
    # so the MC2 operator in Qwen's graph mode cannot run.
    # Once 16-card support is available,
    # this e2e can be switched to graph mode.
    example_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    additional_config = {
        "torchair_graph_config": {
            "enabled": False,
        },
        "refresh": True,
    }

    with VllmRunner(
            model,
            dtype="half",
            tensor_parallel_size=tp,
            distributed_executor_backend="mp",
            enforce_eager=True,
            additional_config=additional_config,
            enable_expert_parallel=enable_expert_parallel,
    ) as vllm_model:
        # Greedy sampling keeps the generated results deterministic.
        vllm_output = vllm_model.generate_greedy(example_prompts, 5)

    # The generated text is only printed, not compared. The previous golden
    # strings were copied from the Pangu fixture and only their count was
    # asserted, so the check is expressed against the prompts instead.
    assert len(example_prompts) == len(vllm_output)
    for output in vllm_output:
        print(f"Generated text: {output[1]!r}")
|
||||
|
||||
|
||||
def test_e2e_qwen2_with_torchair():
    """Run the Qwen fixture for Qwen2.5-0.5B-Instruct, TP=2, no expert parallel."""
    _qwen_torchair_test_fixture(
        model="Qwen/Qwen2.5-0.5B-Instruct",
        tp=2,
        enable_expert_parallel=False,
    )
|
||||
|
||||
|
||||
def test_e2e_qwen3_moe_with_torchair():
    """Run the Qwen fixture for Qwen3-30B-A3B, TP=2, with expert parallel."""
    _qwen_torchair_test_fixture(
        model="Qwen/Qwen3-30B-A3B",
        tp=2,
        enable_expert_parallel=True,
    )
|
||||
|
||||
|
||||
# test deepseek-v2-lite
def _deepseek_v2_lite_torchair_test_fixure(
    additional_config: Dict,
    *,
    tensor_parallel_size=2,
    use_v1_schduler=False,
):
    """Generate greedily with DeepSeek-V2-Lite and require non-empty output.

    Args:
        additional_config: Options forwarded to ``VllmRunner`` as
            ``additional_config``. The dict is copied, never modified.
        tensor_parallel_size: Tensor-parallel degree passed to the runner.
        use_v1_schduler: When false, ``"refresh": True`` is added to the
            forwarded config; when true the config is forwarded as given.

    Raises:
        AssertionError: If any generated text is empty after stripping
            whitespace.
    """
    example_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    # Work on a shallow copy so the caller's dict is left untouched.
    runner_config = dict(additional_config)
    if not use_v1_schduler:
        runner_config["refresh"] = True

    with VllmRunner(
            "deepseek-ai/DeepSeek-V2-Lite",
            dtype="half",
            tensor_parallel_size=tensor_parallel_size,
            distributed_executor_backend="mp",
            additional_config=runner_config,
    ) as vllm_model:
        vllm_output = vllm_model.generate_greedy(example_prompts, 5)

    # No golden text is compared here; each output only has to be non-empty.
    # NOTE(review): the original comment described this checkpoint as a
    # random-weight, 2-hidden-layer model - that looks copied from the
    # DeepSeek-V3 fixture; confirm before relying on it.
    for i, output in enumerate(vllm_output):
        generated_text = output[1]
        assert len(
            generated_text.strip()) > 0, f"The {i}-th output is null, failed"
|
||||
|
||||
|
||||
def test_e2e_deepseekv2lite_with_torchair():
    """Run the DeepSeek-V2-Lite fixture with torchair graph mode switched on."""
    torchair_settings = {"enabled": True}
    _deepseek_v2_lite_torchair_test_fixure(
        {"torchair_graph_config": torchair_settings})
|
||||
|
||||
|
||||
def test_e2e_deepseekv2lite_with_torchair_v1scheduler():
    """Run the DeepSeek-V2-Lite fixture with torchair and ``use_v1_schduler=True``."""
    torchair_settings = {"enabled": True}
    _deepseek_v2_lite_torchair_test_fixure(
        {"torchair_graph_config": torchair_settings},
        use_v1_schduler=True,
    )
|
||||
|
||||
|
||||
# e2e test with the "enable_kv_nz" torchair option turned on
def test_e2e_deepseekv2lite_with_nz():
    """Run the DeepSeek-V2-Lite fixture with torchair and ``enable_kv_nz``."""
    torchair_settings = {
        "enabled": True,
        "enable_kv_nz": True,
    }
    _deepseek_v2_lite_torchair_test_fixure(
        {"torchair_graph_config": torchair_settings})
|
||||
@@ -73,7 +73,6 @@ async def test_models(model: str, mode: str) -> None:
|
||||
"VLLM_RPC_TIMEOUT": "3600000",
|
||||
"VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": "3600000"
|
||||
}
|
||||
additional_config: dict[str, Any] = {}
|
||||
speculative_config = {"num_speculative_tokens": 2, "method": "mtp"}
|
||||
compilation_config = {
|
||||
"cudagraph_capture_sizes": [56],
|
||||
@@ -104,7 +103,6 @@ async def test_models(model: str, mode: str) -> None:
|
||||
["--speculative-config",
|
||||
json.dumps(speculative_config)])
|
||||
server_args.extend(["--gpu-memory-utilization", "0.92"])
|
||||
additional_config["torchair_graph_config"] = {"enabled": True}
|
||||
aisbench_cases = aisbench_gsm8k
|
||||
if mode == "mtp3":
|
||||
env_dict["HCCL_OP_EXPANSION_MODE"] = "AIV"
|
||||
@@ -117,9 +115,7 @@ async def test_models(model: str, mode: str) -> None:
|
||||
server_args.extend(
|
||||
["--compilation-config",
|
||||
json.dumps(compilation_config)])
|
||||
additional_config["torchair_graph_config"] = {"enabled": False}
|
||||
aisbench_cases = aisbench_aime
|
||||
server_args.extend(["--additional-config", json.dumps(additional_config)])
|
||||
request_keyword_args: dict[str, Any] = {
|
||||
**api_keyword_args,
|
||||
}
|
||||
|
||||
@@ -74,13 +74,6 @@ async def test_models(model: str) -> None:
|
||||
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
|
||||
}
|
||||
additional_config = {
|
||||
"torchair_graph_config": {
|
||||
"enabled": True,
|
||||
"enable_multistream_moe": False,
|
||||
"enable_multistream_mla": True,
|
||||
"graph_batch_size": [16],
|
||||
"use_cached_graph": True
|
||||
},
|
||||
"chunked_prefill_for_mla": True,
|
||||
"enable_weight_nz_layout": True
|
||||
}
|
||||
|
||||
@@ -29,7 +29,6 @@ MODELS = [
|
||||
]
|
||||
|
||||
MODES = [
|
||||
"torchair",
|
||||
"single",
|
||||
"aclgraph",
|
||||
"aclgraph_mlapo",
|
||||
@@ -78,13 +77,6 @@ async def test_models(model: str, mode: str) -> None:
|
||||
}
|
||||
speculative_config = {"num_speculative_tokens": 1, "method": "mtp"}
|
||||
additional_config = {
|
||||
"torchair_graph_config": {
|
||||
"enabled": True,
|
||||
"enable_multistream_moe": False,
|
||||
"enable_multistream_mla": True,
|
||||
"graph_batch_sizes": [16],
|
||||
"use_cached_graph": True
|
||||
},
|
||||
"chunked_prefill_for_mla": True,
|
||||
"enable_weight_nz_layout": True
|
||||
}
|
||||
@@ -99,12 +91,8 @@ async def test_models(model: str, mode: str) -> None:
|
||||
]
|
||||
if mode == "single":
|
||||
server_args.append("--enforce-eager")
|
||||
additional_config["torchair_graph_config"] = {"enabled": False}
|
||||
if mode == "aclgraph":
|
||||
additional_config["torchair_graph_config"] = {"enabled": False}
|
||||
if mode == "aclgraph_mlapo":
|
||||
env_dict["VLLM_ASCEND_ENABLE_MLAPO"] = "1"
|
||||
additional_config["torchair_graph_config"] = {"enabled": False}
|
||||
server_args.extend(["--additional-config", json.dumps(additional_config)])
|
||||
request_keyword_args: dict[str, Any] = {
|
||||
**api_keyword_args,
|
||||
|
||||
@@ -68,9 +68,6 @@ async def test_models(model: str) -> None:
|
||||
"cudagraph_mode": "FULL_DECODE_ONLY"
|
||||
}
|
||||
additional_config: dict[str, Any] = {
|
||||
"torchair_graph_config": {
|
||||
"enabled": True
|
||||
},
|
||||
"enable_shared_expert_dp": False,
|
||||
"multistream_overlap_shared_expert": False,
|
||||
"dynamic_eplb": True,
|
||||
|
||||
@@ -72,27 +72,13 @@ async def test_models(model: str, tp_size: int, dp_size: int,
|
||||
port = get_open_port()
|
||||
env_dict = {"HCCL_BUFFSIZE": "1024", "VLLM_ASCEND_ENABLE_MLAPO": "0"}
|
||||
server_args = [
|
||||
"--no-enable-prefix-caching",
|
||||
"--enable-expert-parallel",
|
||||
"--no-enable-prefix-caching", "--enable-expert-parallel",
|
||||
"--tensor-parallel-size",
|
||||
str(tp_size),
|
||||
"--data-parallel-size",
|
||||
str(dp_size),
|
||||
"--port",
|
||||
str(port),
|
||||
"--max-model-len",
|
||||
"16384",
|
||||
"--max-num-batched-tokens",
|
||||
"16384",
|
||||
"--block-size",
|
||||
"16",
|
||||
"--trust-remote-code",
|
||||
"--quantization",
|
||||
"ascend",
|
||||
"--gpu-memory-utilization",
|
||||
"0.9",
|
||||
"--additional-config",
|
||||
'{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}',
|
||||
str(tp_size), "--data-parallel-size",
|
||||
str(dp_size), "--port",
|
||||
str(port), "--max-model-len", "16384", "--max-num-batched-tokens",
|
||||
"16384", "--block-size", "16", "--trust-remote-code", "--quantization",
|
||||
"ascend", "--gpu-memory-utilization", "0.9"
|
||||
]
|
||||
if full_graph:
|
||||
server_args += [
|
||||
|
||||
@@ -1,64 +0,0 @@
|
||||
test_name: "test DeepSeek-R1-W8A8 torchair on A2"
|
||||
model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
|
||||
num_nodes: 2
|
||||
npu_per_node: 8
|
||||
env_common:
|
||||
VLLM_USE_MODELSCOPE: true
|
||||
HCCL_BUFFSIZE: 1024
|
||||
SERVER_PORT: 8080
|
||||
OMP_PROC_BIND: false
|
||||
OMP_NUM_THREADS: 10
|
||||
|
||||
|
||||
deployment:
|
||||
-
|
||||
server_cmd: >
|
||||
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
|
||||
--host 0.0.0.0
|
||||
--port $SERVER_PORT
|
||||
--data-parallel-size 4
|
||||
--data-parallel-size-local 2
|
||||
--data-parallel-address $LOCAL_IP
|
||||
--data-parallel-rpc-port 13399
|
||||
--no-enable-prefix-caching
|
||||
--max-num-seqs 16
|
||||
--tensor-parallel-size 4
|
||||
--max-model-len 36864
|
||||
--max-num-batched-tokens 6000
|
||||
--enable-expert-parallel
|
||||
--trust-remote-code
|
||||
--quantization ascend
|
||||
--gpu-memory-utilization 0.9
|
||||
--speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
|
||||
--additional-config '{"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
|
||||
|
||||
-
|
||||
server_cmd: >
|
||||
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
|
||||
--headless
|
||||
--data-parallel-size 4
|
||||
--data-parallel-rpc-port 13399
|
||||
--data-parallel-size-local 2
|
||||
--data-parallel-start-rank 2
|
||||
--data-parallel-address $MASTER_IP
|
||||
--no-enable-prefix-caching
|
||||
--max-num-seqs 16
|
||||
--tensor-parallel-size 4
|
||||
--max-model-len 36864
|
||||
--max-num-batched-tokens 6000
|
||||
--enable-expert-parallel
|
||||
--trust-remote-code
|
||||
--quantization ascend
|
||||
--gpu-memory-utilization 0.9
|
||||
--speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
|
||||
--additional-config '{"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
|
||||
benchmarks:
|
||||
acc:
|
||||
case_type: accuracy
|
||||
dataset_path: vllm-ascend/gsm8k
|
||||
request_conf: vllm_api_general_chat
|
||||
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
|
||||
max_out_len: 32768
|
||||
batch_size: 512
|
||||
baseline: 95
|
||||
threshold: 5
|
||||
@@ -58,7 +58,7 @@ deployment:
|
||||
}
|
||||
}'
|
||||
--additional-config
|
||||
'{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
|
||||
'{"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
|
||||
|
||||
-
|
||||
server_cmd: >
|
||||
@@ -96,7 +96,7 @@ deployment:
|
||||
}
|
||||
}'
|
||||
--additional-config
|
||||
'{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
|
||||
'{"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
|
||||
-
|
||||
server_cmd: >
|
||||
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
|
||||
@@ -135,7 +135,7 @@ deployment:
|
||||
}
|
||||
}'
|
||||
--additional-config
|
||||
'{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
|
||||
'{"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
|
||||
-
|
||||
server_cmd: >
|
||||
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
|
||||
@@ -173,7 +173,7 @@ deployment:
|
||||
}
|
||||
}'
|
||||
--additional-config
|
||||
'{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
|
||||
'{"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
|
||||
benchmarks:
|
||||
perf:
|
||||
case_type: performance
|
||||
|
||||
@@ -57,7 +57,7 @@ deployment:
|
||||
}
|
||||
}'
|
||||
--additional-config
|
||||
'{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
|
||||
'{"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
|
||||
|
||||
-
|
||||
server_cmd: >
|
||||
@@ -95,7 +95,7 @@ deployment:
|
||||
}
|
||||
}'
|
||||
--additional-config
|
||||
'{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
|
||||
'{"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
|
||||
-
|
||||
server_cmd: >
|
||||
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
|
||||
@@ -134,7 +134,7 @@ deployment:
|
||||
}
|
||||
}'
|
||||
--additional-config
|
||||
'{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
|
||||
'{"multistream_overlap_shared_expert":true}'
|
||||
-
|
||||
server_cmd: >
|
||||
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
|
||||
@@ -172,7 +172,7 @@ deployment:
|
||||
}
|
||||
}'
|
||||
--additional-config
|
||||
'{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
|
||||
'{"multistream_overlap_shared_expert":true}'
|
||||
benchmarks:
|
||||
perf:
|
||||
case_type: performance
|
||||
|
||||
@@ -82,7 +82,6 @@ deployment:
|
||||
--trust-remote-code
|
||||
--no-enable-prefix-caching
|
||||
--gpu-memory-utilization 0.9
|
||||
--additional-config '{"torchair_graph_config":{"enabled":true}}'
|
||||
--kv-transfer-config
|
||||
'{"kv_connector": "MooncakeConnector",
|
||||
"kv_role": "kv_consumer",
|
||||
|
||||
@@ -29,7 +29,6 @@ deployment:
|
||||
--trust-remote-code
|
||||
--no-enable-prefix-caching
|
||||
--gpu-memory-utilization 0.9
|
||||
--additional-config '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
|
||||
|
||||
-
|
||||
server_cmd: >
|
||||
@@ -49,5 +48,4 @@ deployment:
|
||||
--trust-remote-code
|
||||
--no-enable-prefix-caching
|
||||
--gpu-memory-utilization 0.92
|
||||
--additional-config '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
|
||||
benchmarks:
|
||||
|
||||
@@ -1,106 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
from vllm import SamplingParams
|
||||
from vllm.config import CompilationConfig, CUDAGraphMode
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
|
||||
|
||||
@pytest.fixture
def sampling_config():
    """Provide sampling parameters: temperature 0, up to 256 tokens, EOS honoured."""
    params = SamplingParams(temperature=0, max_tokens=256, ignore_eos=False)
    return params
|
||||
|
||||
|
||||
@pytest.fixture
def model_name():
    """Provide the identifier of the model used by the MTP correctness tests."""
    name = "wemaster/deepseek_mtp_main_random_bf16"
    return name
|
||||
|
||||
|
||||
def mtp_torchair_correctness(
    sampling_config: SamplingParams,
    model_name: str,
    graph_mode: CUDAGraphMode = CUDAGraphMode.PIECEWISE,
):
    """Check that MTP speculative decoding reproduces the reference outputs.

    The same prompts are generated twice: once by a reference runner without
    a speculative config and once by a runner using the ``mtp`` method with
    one speculative token. The outputs of both should be the same.

    Args:
        sampling_config: Sampling parameters used for both runs.
        model_name: Model identifier passed to ``VllmRunner``.
        graph_mode: ``CUDAGraphMode.FULL`` selects ``"FULL"`` for the
            speculative run; any other value selects ``"PIECEWISE"``.

    Raises:
        AssertionError: If the number of prompts whose reference token ids
            are a prefix of the speculative token ids is not greater than
            ``int(0.66 * len(ref_outputs))``.
    """

    def _torchair_config():
        # Build a fresh dict per runner so the two runs never share state.
        return {
            "torchair_graph_config": {
                "enabled": True,
                "use_cached_graph": False,
                "graph_batch_sizes": [1, 2, 4],
            },
            "multistream_overlap_shared_expert": "True"
        }

    example_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    with VllmRunner(model_name,
                    tensor_parallel_size=1,
                    gpu_memory_utilization=0.7,
                    max_model_len=256,
                    enforce_eager=False,
                    additional_config=_torchair_config()) as ref_llm:
        ref_outputs = ref_llm.generate(example_prompts, sampling_config)

    graph_mode_str = "FULL" if graph_mode == CUDAGraphMode.FULL else "PIECEWISE"

    with VllmRunner(model_name,
                    tensor_parallel_size=1,
                    max_num_seqs=256,
                    gpu_memory_utilization=0.7,
                    distributed_executor_backend="mp",
                    enable_expert_parallel=True,
                    speculative_config={
                        "method": "mtp",
                        "num_speculative_tokens": 1,
                    },
                    enforce_eager=False,
                    max_model_len=2000,
                    compilation_config=CompilationConfig(
                        cudagraph_mode=graph_mode_str),
                    additional_config=_torchair_config()) as spec_llm:
        spec_outputs = spec_llm.generate(example_prompts, sampling_config)

    matches = 0
    for ref_output, spec_output in zip(ref_outputs, spec_outputs):
        ref_token_ids = ref_output[0][0]
        spec_token_ids = spec_output[0][0]
        if ref_token_ids == spec_token_ids[:len(ref_token_ids)]:
            matches += 1
        else:
            # Show the diverging pair to make a failure easier to inspect.
            print(f"ref_output: {ref_output[1][0]}")
            print(f"spec_output: {spec_output[1][0]}")

    # Heuristic: expect at least 66% of the prompts to match exactly
    # Upon failure, inspect the outputs to check for inaccuracy.
    assert matches > int(0.66 * len(ref_outputs))
|
||||
|
||||
|
||||
def test_mtp_torchair_correctness_piecewise(
    sampling_config: SamplingParams,
    model_name: str,
):
    """Run the MTP correctness check with the default (piecewise) graph mode."""
    mtp_torchair_correctness(
        sampling_config=sampling_config,
        model_name=model_name,
    )
|
||||
|
||||
|
||||
def test_mtp_torchair_correctness_full(
    sampling_config: SamplingParams,
    model_name: str,
):
    """Run the MTP correctness check with ``CUDAGraphMode.FULL``."""
    mtp_torchair_correctness(
        sampling_config=sampling_config,
        model_name=model_name,
        graph_mode=CUDAGraphMode.FULL,
    )
|
||||
Reference in New Issue
Block a user