From a3b5af830709960465496e8fed3473b997d28668 Mon Sep 17 00:00:00 2001
From: Mengqing Cao
Date: Sat, 14 Jun 2025 16:59:00 +0800
Subject: [PATCH] [CI/UT][Graph] Add ut for torchair graph mode (#1103)

### What this PR does / why we need it?
Add a unit test for torchair graph mode on DeepSeekV3.

### How was this patch tested?
CI passed with the newly added test.

---------

Signed-off-by: MengqingCao
Signed-off-by: Mengqing Cao
---
 docs/source/user_guide/additional_config.md | 18 ++---
 docs/source/user_guide/graph_mode.md        |  5 +-
 tests/conftest.py                           |  9 ++-
 tests/multicard/test_torchair_graph_mode.py | 80 +++++++++++++++++++++
 4 files changed, 100 insertions(+), 12 deletions(-)
 create mode 100644 tests/multicard/test_torchair_graph_mode.py

diff --git a/docs/source/user_guide/additional_config.md b/docs/source/user_guide/additional_config.md
index 4608326..778938a 100644
--- a/docs/source/user_guide/additional_config.md
+++ b/docs/source/user_guide/additional_config.md
@@ -53,7 +53,7 @@ The details of each config option are as follows:
 | ---- | ---- | ------- | ----------- |
 | `enabled` | bool | `False` | Whether to enable ascend scheduler for V1 engine|

-ascend_scheduler_config also support the options from [vllm scheduler config](https://docs.vllm.ai/en/stable/api/vllm/config.html#vllm.config.SchedulerConfig). For example, you can add `chunked_prefill_enabled: true` to ascend_scheduler_config as well.
+ascend_scheduler_config also supports the options from [vllm scheduler config](https://docs.vllm.ai/en/stable/api/vllm/config.html#vllm.config.SchedulerConfig). For example, you can add `enable_chunked_prefill: True` to ascend_scheduler_config as well.

 ### Example

@@ -62,18 +62,18 @@ A full example of additional configuration is as follows:
 ```
 {
     "torchair_graph_config": {
-        "enabled": true,
-        "use_cached_graph": true,
+        "enabled": True,
+        "use_cached_graph": True,
         "graph_batch_sizes": [1, 2, 4, 8],
-        "graph_batch_sizes_init": false,
-        "enable_multistream_moe": false,
-        "enable_kv_nz": false
+        "graph_batch_sizes_init": False,
+        "enable_multistream_moe": False,
+        "enable_kv_nz": False
     },
     "ascend_scheduler_config": {
-        "enabled": true,
-        "chunked_prefill_enabled": true,
+        "enabled": True,
+        "enable_chunked_prefill": True,
     },
     "expert_tensor_parallel_size": 1,
-    "refresh": false,
+    "refresh": False,
 }
 ```
diff --git a/docs/source/user_guide/graph_mode.md b/docs/source/user_guide/graph_mode.md
index 6831809..161b89a 100644
--- a/docs/source/user_guide/graph_mode.md
+++ b/docs/source/user_guide/graph_mode.md
@@ -47,14 +47,15 @@ from vllm import LLM

 os.environ["VLLM_USE_V1"] = 1

-model = LLM(model="deepseek-ai/DeepSeek-R1-0528", additional_config={"torchair_graph_config": {"enabled": True}})
+# TorchAirGraph only works without chunked-prefill now
+model = LLM(model="deepseek-ai/DeepSeek-R1-0528", additional_config={"torchair_graph_config": {"enabled": True},"ascend_scheduler_config": {"enabled": True,}})
 outputs = model.generate("Hello, how are you?")
 ```

 online example:

 ```shell
-vllm serve Qwen/Qwen2-7B-Instruct --additional-config='{"torchair_graph_config": {"enabled": true}}'
+vllm serve Qwen/Qwen2-7B-Instruct --additional-config='{"torchair_graph_config": {"enabled": True},"ascend_scheduler_config": {"enabled": True,}}'
 ```

 You can find more detail about additional config [here](./additional_config.md)
diff --git a/tests/conftest.py b/tests/conftest.py
index 16bbc80..e0d70a1 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -17,6 +17,7 @@
 # Adapted from vllm-project/vllm/blob/main/tests/conftest.py
 #
+import contextlib
 import gc

 from typing import List, Optional, Tuple, TypeVar, Union

@@ -53,11 +54,17 @@ PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]]
 PromptVideoInput = _PromptMultiModalInput[np.ndarray]


-def cleanup_dist_env_and_memory():
+def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
     destroy_model_parallel()
     destroy_distributed_environment()
+    with contextlib.suppress(AssertionError):
+        torch.distributed.destroy_process_group()
+    if shutdown_ray:
+        import ray  # Lazy import Ray
+        ray.shutdown()
     gc.collect()
     torch.npu.empty_cache()
+    torch.npu.reset_peak_memory_stats()


 class VllmRunner:
diff --git a/tests/multicard/test_torchair_graph_mode.py b/tests/multicard/test_torchair_graph_mode.py
new file mode 100644
index 0000000..d06ec7d
--- /dev/null
+++ b/tests/multicard/test_torchair_graph_mode.py
@@ -0,0 +1,80 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+"""Compare the short outputs of HF and vLLM when using greedy sampling.
+
+Run `pytest tests/multicard/test_torchair_graph_mode.py`.
+"""
+import os
+
+import pytest
+
+from tests.conftest import VllmRunner
+
+os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
+
+
+@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
+                    reason="torchair graph is not supported on v0")
+def test_e2e_deepseekv3_with_torchair(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_MODELSCOPE", "True")
+        m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
+
+        example_prompts = [
+            "Hello, my name is",
+            "The president of the United States is",
+            "The capital of France is",
+            "The future of AI is",
+        ]
+        dtype = "half"
+        max_tokens = 5
+        # torchair only works without chunked-prefill now
+        with VllmRunner(
+                "vllm-ascend/DeepSeek-V3-Pruning",
+                dtype=dtype,
+                tensor_parallel_size=4,
+                distributed_executor_backend="mp",
+                additional_config={
+                    "torchair_graph_config": {
+                        "enabled": True,
+                    },
+                    "ascend_scheduler_config": {
+                        "enabled": True,
+                    },
+                    "refresh": True,
+                },
+                enforce_eager=False,
+        ) as vllm_model:
+            # use greedy sampler to make sure the generated results are fixed
+            vllm_output = vllm_model.generate_greedy(example_prompts,
+                                                     max_tokens)
+        # NOTE: vllm-ascend/DeepSeek-V3-Pruning is a random-weight version of
+        # DeepSeek-V3 with 2 hidden layers, thus the golden results seem
+        # inaccurate. This will only change if accuracy improves with the
+        # official weights of DeepSeek-V3.
+        golden_results = [
+            'Hello, my name is feasibility伸 spazio debtor添',
+            'The president of the United States is begg"""\n杭州风和 bestimm',
+            'The capital of France is frequentlyশามalinkAllowed',
+            'The future of AI is deleting俯احت怎么样了حراف',
+        ]
+
+        assert len(golden_results) == len(vllm_output)
+        for i in range(len(vllm_output)):
+            assert golden_results[i] == vllm_output[i][1]
+            print(f"Generated text: {vllm_output[i][1]!r}")
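
For local verification outside CI, the new test can be run as noted in its docstring. A minimal sketch, assuming a node with at least 4 Ascend NPUs (the test sets `tensor_parallel_size=4`), a working vllm-ascend install, and network access to ModelScope for the pruned weights:

```shell
# Assumption: >= 4 Ascend NPUs and vllm-ascend installed; adjust env to your setup.
# The test is skipped when VLLM_USE_V1 is "0", so run it with the V1 engine enabled.
export VLLM_USE_V1=1
pytest -s tests/multicard/test_torchair_graph_mode.py
```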