xc-llm-ascend/tests/e2e/singlecard/test_sampler.py
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/entrypoints/llm/test_guided_generate.py
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from vllm import SamplingParams

from tests.e2e.conftest import VllmRunner


def test_models_topk() -> None:
    # Smoke test: top-k/top-p sampling params are accepted end to end.
    example_prompts = [
        "Hello, my name is",
    ]
    sampling_params = SamplingParams(max_tokens=5,
                                     temperature=0.0,
                                     top_k=50,
                                     top_p=0.9)

    with VllmRunner("Qwen/Qwen3-0.6B",
                    max_model_len=8192,
                    gpu_memory_utilization=0.7) as runner:
        runner.generate(example_prompts, sampling_params)


def test_models_prompt_logprobs() -> None:
    # Smoke test: greedy decoding with logprobs requested per token.
    example_prompts = [
        "Hello, my name is",
    ]

    with VllmRunner("Qwen/Qwen3-0.6B",
                    max_model_len=8192,
                    gpu_memory_utilization=0.7) as runner:
        runner.generate_greedy_logprobs(example_prompts,
                                        max_tokens=5,
                                        num_logprobs=1)


# `enable_async_exponential` (vllm-ascend PR #4501) is an `additional_config`
# switch: set to 1, the exponential-distribution operator used for sampling is
# launched asynchronously and overlapped with model execution; set to 0 (the
# default), the overlap is disabled but the operator still runs on the
# separate stream introduced in PR #4908. It defaults to off because the
# overlap may not help on MoE models such as Qwen3-30B, and the former
# `"enable_async_exponential": 2` option has been removed.
def test_exponential_overlap() -> None:
    # Random sampling (temperature=1.0) with the async exponential overlap on.
    example_prompts = [
        "Hello, my name is",
    ]
    sampling_params = SamplingParams(max_tokens=5,
                                     temperature=1.0,
                                     top_k=50,
                                     top_p=0.9)

    with VllmRunner("Qwen/Qwen3-0.6B",
                    max_model_len=8192,
                    gpu_memory_utilization=0.7,
                    additional_config={
                        "enable_async_exponential": 1,
                    }) as runner:
        runner.generate(example_prompts, sampling_params)
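

# Illustrative companion sketch (not in the original file): per the note
# above, leaving `enable_async_exponential` at its default of 0 keeps the
# overlap off while the exponential operator still runs on its own stream,
# so the same workload should also complete on the non-overlapped path.
def test_exponential_overlap_disabled() -> None:
    example_prompts = [
        "Hello, my name is",
    ]
    sampling_params = SamplingParams(max_tokens=5,
                                     temperature=1.0,
                                     top_k=50,
                                     top_p=0.9)

    with VllmRunner("Qwen/Qwen3-0.6B",
                    max_model_len=8192,
                    gpu_memory_utilization=0.7,
                    additional_config={
                        "enable_async_exponential": 0,
                    }) as runner:
        runner.generate(example_prompts, sampling_params)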