[Misc] Refactor additional_config (#1029)

More and more config options are being added to additional_config. This PR
provides a new AscendConfig to manage these config options in an easier
way, making the code cleaner and more readable.

This PR also adds the `additional_config` doc for users.

Added test_ascend_config.py to make sure the new AscendConfig works
as expected.

TODO: Add e2e test with torchair and deepseek once the CI resource is
available.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
wangxiyuan
2025-06-05 16:28:01 +08:00
committed by GitHub
parent 7737aaa40f
commit e1ab6d318e
23 changed files with 456 additions and 208 deletions

View File

@@ -167,17 +167,17 @@ def run_equality_correctness_test(
# TODO current torchair graph mode needs clean torchair cache.
# if do not clean, it will raise error
additional_config = common_llm_kwargs.get("additional_config")
enable_graph_mode = additional_config.get(
"enable_graph_mode") if additional_config else False
torchair_graph_enabled = common_llm_kwargs.get(
"additional_config", {}).get("torchair_graph_config",
{}).get("enabled", False)
with vllm_runner(**org_args) as vllm_model:
if enable_graph_mode:
if torchair_graph_enabled:
_clean_torchair_cache()
org_outputs = vllm_model.generate_w_logprobs(prompts, sampling_params)
with vllm_runner(**sd_args) as vllm_model:
if enable_graph_mode:
if torchair_graph_enabled:
_clean_torchair_cache()
if ensure_all_accepted or expected_acceptance_rate is not None:
# Force log interval to be 0 to catch all metrics.

View File

@@ -218,7 +218,9 @@ def test_mtp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
"common_llm_kwargs",
[{
"additional_config": {
'enable_graph_mode': True,
'torchair_graph_config': {
"enabled": True,
},
},
# Print spec metrics.
@@ -262,7 +264,9 @@ def test_mtp_e2e_greedy_correctness_torchair_graph(
"common_llm_kwargs",
[{
"additional_config": {
'enable_graph_mode': True,
'torchair_graph_config': {
"enabled": True,
},
},
# Print spec metrics.

View File

@@ -18,8 +18,6 @@ import pytest
import torch
from vllm import LLM, SamplingParams
from vllm_ascend.utils import vllm_version_is
MODELS = [
"Qwen/Qwen2.5-0.5B-Instruct",
]
@@ -32,9 +30,6 @@ prompts = [
]
@pytest.mark.skipif(
(vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1")),
reason="aclgraph not supported in v0.8.5 and v0.8.5.post1")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
@pytest.mark.parametrize("max_tokens", [64])

View File

@@ -31,9 +31,7 @@ os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
def test_models_distributed_QwQ():
example_prompts = [
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
"Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
"Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
"Hello, my name is",
]
dtype = "half"
max_tokens = 5
@@ -48,9 +46,7 @@ def test_models_distributed_QwQ():
def test_models_distributed_DeepSeek():
example_prompts = [
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
"Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
"Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
"Hello, my name is",
]
dtype = "half"
max_tokens = 5

View File

@@ -28,16 +28,12 @@ from vllm import LLM, SamplingParams
from tests.conftest import VllmRunner
from tests.model_utils import check_outputs_equal
from vllm_ascend.utils import vllm_version_is
MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]
@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
reason="aclgraph only support on v1")
@pytest.mark.skipif(
(vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1")),
reason="aclgraph not supported in v0.8.5 and v0.8.5.post1")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
def test_models(
@@ -88,9 +84,6 @@ def test_models(
@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
reason="aclgraph only support on v1")
@pytest.mark.skipif(
(vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1")),
reason="aclgraph not supported in v0.8.5 and v0.8.5.post1")
def test_deepseek_raises_error(monkeypatch: pytest.MonkeyPatch) -> None:
with monkeypatch.context() as m:
m.setenv("VLLM_USE_MODELSCOPE", "True")

View File

@@ -0,0 +1,118 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest
from tests.conftest import VllmRunner
from vllm_ascend.ascend_config import clear_ascend_config, get_ascend_config
def _clean_up_ascend_config(func):
    """Wrap a test so ``clear_ascend_config()`` runs before and after it.

    The trailing ``clear_ascend_config()`` call sits in a ``finally`` clause,
    so it still runs when the wrapped test raises (for example on a failed
    assertion) and one failing test cannot skip the clean-up that the next
    test relies on.

    Args:
        func: The test callable to wrap.

    Returns:
        A wrapper that forwards every positional and keyword argument to
        ``func`` and returns whatever ``func`` returns.
    """
    # Imported locally so this helper does not depend on the module's
    # top-level import block.
    import functools

    # ``wraps`` keeps the test's name, docstring and signature visible to
    # tooling that introspects the decorated function.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        clear_ascend_config()
        try:
            return func(*args, **kwargs)
        finally:
            clear_ascend_config()

    return wrapper
@_clean_up_ascend_config
def test_run_without_ascend_config():
    """Check what AscendConfig reports when no additional_config is passed."""
    with VllmRunner("facebook/opt-125m"):
        config = get_ascend_config()
        torchair = config.torchair_graph_config

        # Torchair graph options: all expected to be off / empty.
        assert not torchair.enabled
        assert not torchair.use_cached_graph
        assert torchair.graph_batch_sizes == []
        assert not torchair.graph_batch_sizes_init

        # Scheduler and expert-parallel options.
        assert not config.ascend_scheduler_config.enabled
        assert config.expert_tensor_parallel_size == 1
@_clean_up_ascend_config
def test_run_with_ascend_config():
    """Check that values passed via additional_config reach AscendConfig."""
    torchair_options = {
        # torchair graph only works with deepseek. The e2e test should be added
        # in multicard test with deepseek models.
        "enabled": False,
        "use_cached_graph": True,
        "graph_batch_sizes": [1, 2, 4, 8],
        "graph_batch_sizes_init": False,
    }
    scheduler_options = {
        "enabled": True,
        "enable_chunked_prefill": True,
    }
    input_additional_config = {
        "torchair_graph_config": torchair_options,
        "ascend_scheduler_config": scheduler_options,
        "expert_tensor_parallel_size": 1,
    }

    with VllmRunner("facebook/opt-125m",
                    additional_config=input_additional_config):
        config = get_ascend_config()
        torchair = config.torchair_graph_config
        scheduler = config.ascend_scheduler_config

        # Each assertion mirrors one key of the input dictionary above.
        assert not torchair.enabled
        assert torchair.use_cached_graph
        assert torchair.graph_batch_sizes == [1, 2, 4, 8]
        assert not torchair.graph_batch_sizes_init

        assert scheduler.enabled
        assert scheduler.enable_chunked_prefill

        assert config.expert_tensor_parallel_size == 1
@_clean_up_ascend_config
def test_ascend_config_init_error():
    """Expect RuntimeError from get_ascend_config() before initialization."""
    # No VllmRunner is created in this test, and the decorator calls
    # clear_ascend_config() first; the config should be initialized before
    # it can be fetched.
    with pytest.raises(RuntimeError):
        get_ascend_config()
@_clean_up_ascend_config
def test_ascend_config_load_error():
    """Check that each invalid additional_config raises its expected error."""
    # (expected exception, additional_config) pairs, run in this order.
    invalid_cases = [
        # graph_batch_sizes should be list.
        (
            TypeError,
            {
                "torchair_graph_config": {
                    "graph_batch_sizes": "fake_size",
                },
            },
        ),
        # graph_batch_sizes_init should not be True when graph_batch_sizes
        # is not empty.
        (
            ValueError,
            {
                "torchair_graph_config": {
                    "graph_batch_sizes": [1, 2, 4, 8],
                    "graph_batch_sizes_init": True,
                },
            },
        ),
        # torchair graph only works with deepseek.
        (
            NotImplementedError,
            {
                "torchair_graph_config": {
                    "enabled": True,
                },
            },
        ),
    ]

    for expected_error, bad_additional_config in invalid_cases:
        with pytest.raises(expected_error):
            with VllmRunner("facebook/opt-125m",
                            additional_config=bad_additional_config):
                pass