Revert "drop ascend scheduler" (#4580)
Reverts vllm-project/vllm-ascend#4498 - vLLM version: v0.11.2 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2
This commit is contained in:
@@ -48,26 +48,27 @@ def mtp_correctness(sampling_config: SamplingParams,
|
||||
if graph_mode == CUDAGraphMode.FULL:
|
||||
graph_mode_str = "FULL_DECODE_ONLY"
|
||||
|
||||
with VllmRunner(model_name,
|
||||
tensor_parallel_size=1,
|
||||
max_num_seqs=256,
|
||||
gpu_memory_utilization=0.7,
|
||||
distributed_executor_backend="mp",
|
||||
enable_expert_parallel=True,
|
||||
speculative_config={
|
||||
"method":
|
||||
"deepseek_mtp",
|
||||
"num_speculative_tokens":
|
||||
num_speculative_tokens,
|
||||
"disable_padded_drafter_batch":
|
||||
disable_padded_drafter_batch,
|
||||
},
|
||||
enforce_eager=enforce_eager,
|
||||
max_model_len=2000,
|
||||
compilation_config=CompilationConfig(
|
||||
cudagraph_mode=graph_mode_str,
|
||||
cudagraph_capture_sizes=[12],
|
||||
)) as spec_llm:
|
||||
with VllmRunner(
|
||||
model_name,
|
||||
tensor_parallel_size=1,
|
||||
max_num_seqs=256,
|
||||
gpu_memory_utilization=0.7,
|
||||
distributed_executor_backend="mp",
|
||||
enable_expert_parallel=True,
|
||||
speculative_config={
|
||||
"method": "deepseek_mtp",
|
||||
"num_speculative_tokens": num_speculative_tokens,
|
||||
"disable_padded_drafter_batch": disable_padded_drafter_batch,
|
||||
},
|
||||
enforce_eager=enforce_eager,
|
||||
max_model_len=2000,
|
||||
compilation_config=CompilationConfig(
|
||||
cudagraph_mode=graph_mode_str,
|
||||
cudagraph_capture_sizes=[12],
|
||||
),
|
||||
additional_config={"ascend_scheduler_config": {
|
||||
"enabled": False
|
||||
}}) as spec_llm:
|
||||
spec_outputs = spec_llm.generate(example_prompts, sampling_config)
|
||||
|
||||
matches = 0
|
||||
|
||||
170
tests/e2e/singlecard/test_ascend_scheduler.py
Normal file
170
tests/e2e/singlecard/test_ascend_scheduler.py
Normal file
@@ -0,0 +1,170 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
from vllm import SamplingParams
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
from tests.e2e.model_utils import check_outputs_equal
|
||||
|
||||
MODEL = "Qwen/Qwen3-0.6B"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("enforce_eager", [True, False])
def test_concurrent_partial_prefill(enforce_eager):
    """Submit three identical prompts at once with the Ascend scheduler on.

    Runs with ``enforce_eager`` both True and False and checks that one
    result is returned per prompt and that each result holds exactly one
    completion.
    """
    scheduler_config = {
        'ascend_scheduler_config': {
            'enabled': True,
        },
    }
    prompts = ["Hello my name is Robert and I"] * 3

    with VllmRunner(MODEL,
                    additional_config=scheduler_config,
                    max_num_seqs=3,
                    max_num_batched_tokens=8192,
                    enforce_eager=enforce_eager,
                    gpu_memory_utilization=0.7) as runner:
        results = runner.model.generate(prompts)
        # One request result per submitted prompt.
        assert len(results) == 3
        for result in results:
            # Each request should carry a single completion.
            assert len(result.outputs) == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize("enforce_eager", [True, False])
def test_prefix_cache_stats_is_recorded(enforce_eager):
    """Send the same prompt twice and check the reported cached-token count.

    The second, identical request must report ``num_cached_tokens == 128``.
    """
    scheduler_config = {
        'ascend_scheduler_config': {
            'enabled': True,
        },
    }

    with VllmRunner(MODEL,
                    additional_config=scheduler_config,
                    max_num_seqs=3,
                    max_num_batched_tokens=8192,
                    enforce_eager=enforce_eager,
                    gpu_memory_utilization=0.7) as runner:
        # 129 identical tokens: the repeated request is expected to reuse
        # 128 of them (presumably one full 128-token cache block -- TODO
        # confirm the block size), leaving the final token to be computed.
        repeated_prompt = {"prompt_token_ids": [101] * 129}
        # The first pass only populates the cache; its result is discarded.
        runner.model.generate([repeated_prompt])
        second_pass = runner.model.generate([repeated_prompt])
        assert second_pass[0].num_cached_tokens == 128
|
||||
|
||||
|
||||
@pytest.mark.parametrize("max_tokens",
                         [4])  # cannot align results when max_tokens > 4
@pytest.mark.parametrize("chunked_prefill_token_size", [2048])
def test_chunked_prefill_with_ascend_scheduler(
        max_tokens: int, chunked_prefill_token_size: int) -> None:
    """Compare greedy output with and without chunked prefill enabled.

    Both runs enable the Ascend scheduler; only the first additionally sets
    ``enable_chunked_prefill`` and caps sequences / batched tokens at
    ``chunked_prefill_token_size``. The generated outputs must be equal.

    Args:
        max_tokens: Number of tokens to generate per prompt.
        chunked_prefill_token_size: Value used for both ``max_num_seqs`` and
            ``max_num_batched_tokens`` in the chunked-prefill run.
    """
    example_prompts = [
        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
    ]
    chunked_config = {
        'ascend_scheduler_config': {
            'enabled': True,
            'enable_chunked_prefill': True,
        },
    }
    baseline_config = {
        'ascend_scheduler_config': {
            'enabled': True,
        },
    }

    # Run 1: Ascend scheduler with chunked prefill switched on.
    with VllmRunner(MODEL,
                    additional_config=chunked_config,
                    max_num_seqs=chunked_prefill_token_size,
                    max_num_batched_tokens=chunked_prefill_token_size,
                    max_model_len=2048,
                    gpu_memory_utilization=0.7) as chunked_runner:
        chunked_prefill_output = chunked_runner.generate_greedy(
            example_prompts, max_tokens)

    # Run 2: Ascend scheduler only, used as the reference.
    with VllmRunner(MODEL,
                    additional_config=baseline_config,
                    max_model_len=2048,
                    gpu_memory_utilization=0.7) as baseline_runner:
        vllm_output = baseline_runner.generate_greedy(example_prompts,
                                                      max_tokens)

    check_outputs_equal(
        outputs_0_lst=vllm_output,
        outputs_1_lst=chunked_prefill_output,
        name_0="vllm_output",
        name_1="chunked_prefill_output",
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("max_tokens",
                         [4])  # cannot align results when max_tokens > 4
@pytest.mark.parametrize("chunked_prefill_token_size", [2048])
def test_chunked_prefill_with_scheduler_dynamic_batch(
        max_tokens: int, chunked_prefill_token_size: int) -> None:
    """Compare greedy output for two ``SLO_limits_for_dynamic_batch`` values.

    The first run passes ``SLO_limits_for_dynamic_batch=0`` and caps
    sequences / batched tokens at ``chunked_prefill_token_size``; the second
    passes ``-1`` and is used as the reference. The generated outputs must
    be equal.
    NOTE(review): presumably 0 enables dynamic batching and -1 disables it;
    confirm against the additional-config documentation.

    Args:
        max_tokens: Number of tokens to generate per prompt.
        chunked_prefill_token_size: Value used for both ``max_num_seqs`` and
            ``max_num_batched_tokens`` in the first run.
    """
    example_prompts = [
        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
    ]
    max_num_seqs = chunked_prefill_token_size
    max_num_batched_tokens = chunked_prefill_token_size

    with VllmRunner(MODEL,
                    additional_config={
                        'SLO_limits_for_dynamic_batch': 0,
                    },
                    max_num_seqs=max_num_seqs,
                    max_num_batched_tokens=max_num_batched_tokens,
                    max_model_len=2048,
                    gpu_memory_utilization=0.7) as vllm_model:
        dynamic_batch_output = vllm_model.generate_greedy(
            example_prompts, max_tokens)

    with VllmRunner(MODEL,
                    additional_config={
                        'SLO_limits_for_dynamic_batch': -1,
                    },
                    max_model_len=2048,
                    gpu_memory_utilization=0.7) as vllm_model:
        vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)

    # Label the second output after the variable it actually holds so a
    # failure message does not point at the chunked-prefill test.
    check_outputs_equal(
        outputs_0_lst=vllm_output,
        outputs_1_lst=dynamic_batch_output,
        name_0="vllm_output",
        name_1="dynamic_batch_output",
    )
|
||||
|
||||
|
||||
def test_async_scheduling_eager() -> None:
    """Smoke-test generation with ``async_scheduling=True``.

    Generates for 40 prompts (four base prompts repeated ten times); the
    test passes as long as generation completes without raising.
    """
    base_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    prompts = base_prompts * 10
    sampling_params = SamplingParams(temperature=0.2,
                                     max_tokens=10,
                                     stop_token_ids=None)

    # NOTE(review): despite the test name, enforce_eager is not passed here;
    # confirm that the runner's default mode is the one intended.
    runner_kwargs = dict(
        max_model_len=4096,
        max_num_seqs=50,
        dtype="bfloat16",
        gpu_memory_utilization=0.9,
        async_scheduling=True,
    )
    with VllmRunner("Qwen/Qwen2.5-0.5B-Instruct",
                    **runner_kwargs) as runner:
        runner.generate(prompts, sampling_params=sampling_params)
|
||||
|
||||
|
||||
def test_async_scheduling_with_full_graph() -> None:
    """Smoke-test ``async_scheduling=True`` with ``cudagraph_mode`` "FULL".

    Generates for 40 prompts (four base prompts repeated ten times); the
    test passes as long as generation completes without raising.
    """
    base_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    prompts = base_prompts * 10
    sampling_params = SamplingParams(temperature=0.2,
                                     max_tokens=10,
                                     stop_token_ids=None)
    graph_config = {"cudagraph_mode": "FULL"}

    with VllmRunner("Qwen/Qwen3-8B",
                    max_model_len=4096,
                    max_num_seqs=50,
                    dtype="bfloat16",
                    gpu_memory_utilization=0.9,
                    async_scheduling=True,
                    compilation_config=graph_config) as runner:
        runner.generate(prompts, sampling_params=sampling_params)
|
||||
82
tests/e2e/singlecard/test_chunked.py
Normal file
82
tests/e2e/singlecard/test_chunked.py
Normal file
@@ -0,0 +1,82 @@
|
||||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
"""
|
||||
Compare the outputs of vLLM with `long_prefill_token_threshold` set against a run with the Ascend scheduler enabled.
|
||||
|
||||
Run `pytest tests/e2e/singlecard/test_chunked.py`.
|
||||
"""
|
||||
import gc
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from vllm import SamplingParams
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
|
||||
MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [1])
def test_models(
    model: str,
    max_tokens: int,
) -> None:
    """Check that two runner configurations produce near-identical token IDs.

    The first run sets ``long_prefill_token_threshold=20``; the second
    enables the Ascend scheduler. Both use greedy sampling
    (``temperature=0.0``). The token-ID sequences are truncated to a common
    length and must have a cosine similarity above 0.95.

    Args:
        model: Model name passed to ``VllmRunner``.
        max_tokens: Number of tokens to generate per prompt.
    """
    prompts = ["The president of the United States is"]

    sampling_params = SamplingParams(
        max_tokens=max_tokens,
        temperature=0.0,
    )

    try:
        with VllmRunner(model,
                        long_prefill_token_threshold=20,
                        enforce_eager=False) as vllm_model:
            output1 = vllm_model.generate(prompts, sampling_params)

        with VllmRunner(model,
                        enforce_eager=False,
                        additional_config={
                            'ascend_scheduler_config': {
                                'enabled': True
                            },
                        }) as vllm_model:
            output2 = vllm_model.generate(prompts, sampling_params)

        # Extract the token IDs for comparison.
        # NOTE(review): assumes generate() returns, per prompt, a pair whose
        # first element is a list of token-ID lists -- confirm in VllmRunner.
        token_ids1 = output1[0][0][0]
        token_ids2 = output2[0][0][0]

        print(f"Token IDs 1: {token_ids1}")
        print(f"Token IDs 2: {token_ids2}")

        # Truncate to the shorter sequence so both tensors share a shape.
        min_len = min(len(token_ids1), len(token_ids2))

        tensor1 = torch.tensor(token_ids1[:min_len], dtype=torch.float32)
        tensor2 = torch.tensor(token_ids2[:min_len], dtype=torch.float32)

        # Treat the token-ID sequences as vectors and compare directions.
        similarity = torch.cosine_similarity(tensor1, tensor2, dim=0)
        print(f"Token IDs cosine similarity: {similarity.item()}")

        assert similarity > 0.95
    finally:
        # Release memory even when the run or the assertion above fails, so
        # a failure here does not leave it allocated for later tests.
        gc.collect()
        torch.npu.empty_cache()
        torch.npu.reset_peak_memory_stats()
|
||||
torch.npu.reset_peak_memory_stats()
|
||||
@@ -20,6 +20,7 @@
|
||||
|
||||
Run `pytest tests/test_offline_inference.py`.
|
||||
"""
|
||||
import pytest
|
||||
from vllm import SamplingParams
|
||||
from vllm.assets.audio import AudioAsset
|
||||
from vllm.assets.image import ImageAsset
|
||||
@@ -54,6 +55,40 @@ def test_multimodal_vl(prompt_template):
|
||||
assert output_str, "Generated output should not be empty."
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="This e2e test will stuck in multi-batch scenario. "
                  "Add this back after fixing the issue.")
def test_multimodal_ascend_scheduler(prompt_template):
    """Run image question answering with the Ascend scheduler enabled.

    Asks four questions about the same image and checks that one non-empty
    answer is produced per prompt. Currently skipped (see the skip reason).

    Args:
        prompt_template: Callable that turns the list of questions into
            prompts (presumably a pytest fixture -- confirm in conftest).
    """
    image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
    img_questions = [
        "What is the content of this image?",
        "Describe the content of this image in detail.",
        "What's in the image?",
        "Where is this image taken?",
    ]
    # The same image is paired with every question.
    images = [image] * len(img_questions)
    prompts = prompt_template(img_questions)

    scheduler_config = {
        'ascend_scheduler_config': {
            'enabled': True,
        },
    }
    processor_kwargs = {
        "min_pixels": 28 * 28,
        "max_pixels": 1280 * 28 * 28,
        "fps": 1,
    }
    with VllmRunner("Qwen/Qwen2.5-VL-3B-Instruct",
                    max_model_len=4096,
                    additional_config=scheduler_config,
                    mm_processor_kwargs=processor_kwargs,
                    enforce_eager=True) as runner:
        outputs = runner.generate_greedy(prompts=prompts,
                                         images=images,
                                         max_tokens=64)
        assert len(outputs) == len(prompts)
        for _, output_str in outputs:
            assert output_str, "Generated output should not be empty."
|
||||
|
||||
|
||||
def test_multimodal_audio():
|
||||
audio_prompt = "".join([
|
||||
f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
|
||||
|
||||
Reference in New Issue
Block a user