[CI] Add new runner and enable QwQ multinpu test (#417)

### What this PR does / why we need it?

- Add a new runner to the continuous integration system, keeping the
original CI runner in place until the new runner has proven stable
- Add distributed test cases

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
Li Wang
2025-04-08 16:52:45 +08:00
committed by GitHub
parent 5d6239306b
commit afdbf77483
5 changed files with 405 additions and 128 deletions

View File

@@ -17,14 +17,17 @@
# limitations under the License.
#
import gc
from typing import List, Optional, Tuple, TypeVar, Union
import numpy as np
import pytest
import torch
from PIL import Image
from vllm import LLM, SamplingParams
from vllm.config import TaskOption
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.distributed.parallel_state import (destroy_distributed_environment,
destroy_model_parallel)
from vllm.inputs import ExplicitEncoderDecoderPrompt, TextPrompt, TokensPrompt
from vllm.logger import init_logger
from vllm.outputs import RequestOutput
@@ -37,6 +40,7 @@ from tests.model_utils import (TokensTextLogprobs,
# Module-level logger, named after this module.
logger = init_logger(__name__)
# Generic placeholder for the per-item payload type of a multimodal input.
_M = TypeVar("_M")
# A multimodal prompt input is either a flat list of items or a list of
# item lists (presumably one item, or several items, per prompt — confirm
# against the runner that consumes these aliases).
_PromptMultiModalInput = Union[List[_M], List[List[_M]]]
# Image inputs: the generic alias specialised to PIL images.
PromptImageInput = _PromptMultiModalInput[Image.Image]
@@ -44,6 +48,13 @@ PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]]
# Video inputs: the generic multimodal alias specialised to NumPy arrays
# (array layout is not specified here).
PromptVideoInput = _PromptMultiModalInput[np.ndarray]
def cleanup_dist_env_and_memory():
    """Tear down the distributed state and release cached device memory.

    The steps run in a fixed order: the model-parallel state is destroyed
    first, then the distributed environment, then a Python garbage
    collection pass is forced, and finally the NPU backend is asked to
    empty its cache.
    """
    # Distributed teardown: model-parallel state, then the environment.
    destroy_model_parallel()
    destroy_distributed_environment()

    # Memory teardown: run the collector before emptying the NPU cache.
    gc.collect()
    torch.npu.empty_cache()
class VllmRunner:
def __init__(

View File

@@ -31,20 +31,13 @@ import vllm_ascend # noqa: F401
# Model identifiers that test_models is parametrized over.
MODELS = [
"Qwen/Qwen2.5-0.5B-Instruct",
]
# Presumably makes vLLM fetch models from ModelScope — confirm against
# vLLM's environment-variable documentation.
os.environ["VLLM_USE_MODELSCOPE"] = "True"
# Allocator configuration string for the NPU backend; presumably limits
# block splitting to 256 MB — TODO confirm against the torch_npu docs.
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
# Test-suite selector taken from the environment, defaulting to "L4";
# the code that consumes it is not shown in this section.
TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half", "float16"])
@pytest.mark.parametrize("max_tokens", [5])
def test_models(
model: str,
dtype: str,
max_tokens: int,
) -> None:
def test_models(model: str, dtype: str, max_tokens: int) -> None:
# 5042 tokens for gemma2
# gemma2 has alternating sliding window size of 4096
# we need a prompt with more than 4096 tokens to test the sliding window
@@ -60,6 +53,28 @@ def test_models(
vllm_model.generate_greedy(example_prompts, max_tokens)
@pytest.mark.multinpu
@pytest.mark.parametrize(
    "model, distributed_executor_backend",
    [
        ("Qwen/QwQ-32B", "mp"),
    ],
)
def test_models_distributed(vllm_runner, model: str,
                            distributed_executor_backend: str) -> None:
    """Smoke-test greedy generation with the model run under tensor parallelism.

    The runner is created with ``dtype="half"`` and
    ``tensor_parallel_size=4`` using the parametrized executor backend,
    and five tokens are generated greedily for each of three fixed
    prompts. The generated output is not inspected; the test passes when
    generation finishes without raising.

    Args:
        vllm_runner: Fixture; called with the model and runner options and
            used as a context manager.
        model: Model identifier to load.
        distributed_executor_backend: Executor backend name forwarded to
            the runner.
    """
    prompts = [
        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
        "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
        "Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
    ]
    runner_options = {
        "dtype": "half",
        "tensor_parallel_size": 4,
        "distributed_executor_backend": distributed_executor_backend,
    }

    with vllm_runner(model, **runner_options) as llm:
        llm.generate_greedy(prompts, 5)
if __name__ == "__main__":
    import pytest

    # pytest.main() returns the session's exit code. Forward it as the
    # process exit status so that running this file directly reports test
    # failures to the calling shell or CI step instead of always exiting 0.
    raise SystemExit(pytest.main([__file__]))