[CI] Drop ascend scheduler from test (#4613)
Drop ascend scheduler from test - vLLM version: v0.11.2 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -1,118 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
from vllm import SamplingParams
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
from tests.e2e.model_utils import check_outputs_equal
|
||||
|
||||
MODEL = "Qwen/Qwen3-0.6B"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("enforce_eager", [True, False])
|
||||
def test_concurrent_partial_prefill(enforce_eager):
|
||||
with VllmRunner(MODEL,
|
||||
max_num_seqs=3,
|
||||
max_num_batched_tokens=8192,
|
||||
enforce_eager=enforce_eager,
|
||||
gpu_memory_utilization=0.7) as vllm_model:
|
||||
outputs = vllm_model.model.generate(["Hello my name is Robert and I"] *
|
||||
3)
|
||||
assert len(outputs) == 3
|
||||
for output in outputs:
|
||||
assert len(output.outputs) == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize("enforce_eager", [True, False])
|
||||
def test_prefix_cache_stats_is_recorded(enforce_eager):
|
||||
with VllmRunner(MODEL,
|
||||
max_num_seqs=3,
|
||||
max_num_batched_tokens=8192,
|
||||
enforce_eager=enforce_eager,
|
||||
gpu_memory_utilization=0.7) as vllm_model:
|
||||
# 17 tokens will make sure first 16 tokens are cached in a block
|
||||
input_tokens = {"prompt_token_ids": [101] * 129}
|
||||
_ = vllm_model.model.generate([input_tokens])
|
||||
outputs = vllm_model.model.generate([input_tokens])
|
||||
assert outputs[0].num_cached_tokens == 128
|
||||
|
||||
|
||||
@pytest.mark.parametrize("max_tokens",
|
||||
[4]) # cannot align results when max_tokens > 4
|
||||
@pytest.mark.parametrize("chunked_prefill_token_size", [2048])
|
||||
def test_chunked_prefill_with_scheduler_dynamic_batch(
|
||||
max_tokens: int, chunked_prefill_token_size: int) -> None:
|
||||
example_prompts = [
|
||||
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
|
||||
]
|
||||
max_num_seqs = chunked_prefill_token_size
|
||||
max_num_batched_tokens = chunked_prefill_token_size
|
||||
with VllmRunner(MODEL,
|
||||
additional_config={
|
||||
'SLO_limits_for_dynamic_batch': 0,
|
||||
},
|
||||
max_num_seqs=max_num_seqs,
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
max_model_len=2048,
|
||||
gpu_memory_utilization=0.7) as vllm_model:
|
||||
dynamic_batch_output = vllm_model.generate_greedy(
|
||||
example_prompts, max_tokens)
|
||||
|
||||
with VllmRunner(MODEL,
|
||||
additional_config={
|
||||
'SLO_limits_for_dynamic_batch': -1,
|
||||
},
|
||||
max_model_len=2048,
|
||||
gpu_memory_utilization=0.7) as vllm_model:
|
||||
vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=vllm_output,
|
||||
outputs_1_lst=dynamic_batch_output,
|
||||
name_0="vllm_output",
|
||||
name_1="chunked_prefill_output",
|
||||
)
|
||||
|
||||
|
||||
def test_async_scheduling_eager() -> None:
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
] * 10
|
||||
sampling_params = SamplingParams(temperature=0.2,
|
||||
max_tokens=10,
|
||||
stop_token_ids=None)
|
||||
|
||||
with VllmRunner(
|
||||
"Qwen/Qwen2.5-0.5B-Instruct",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=50,
|
||||
dtype="bfloat16",
|
||||
gpu_memory_utilization=0.9,
|
||||
async_scheduling=True,
|
||||
) as vllm_model:
|
||||
vllm_model.generate(prompts, sampling_params=sampling_params)
|
||||
|
||||
|
||||
def test_async_scheduling_with_full_graph() -> None:
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
] * 10
|
||||
sampling_params = SamplingParams(temperature=0.2,
|
||||
max_tokens=10,
|
||||
stop_token_ids=None)
|
||||
|
||||
with VllmRunner("Qwen/Qwen3-8B",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=50,
|
||||
dtype="bfloat16",
|
||||
gpu_memory_utilization=0.9,
|
||||
async_scheduling=True,
|
||||
compilation_config={"cudagraph_mode":
|
||||
"FULL"}) as vllm_model:
|
||||
vllm_model.generate(prompts, sampling_params=sampling_params)
|
||||
Reference in New Issue
Block a user