drop ascend scheduler (#4498)

Ascend scheduler was added for non chunk prefill case before, since that
the npu ops didn't work well with chunked prefill.

Now the ops with chunked prefill work better, it's time to remove the
ascend scheduler to use vLLM default scheduler.

- vLLM version: v0.11.2

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
wangxiyuan
2025-11-29 16:18:34 +08:00
committed by GitHub
parent 53a52d6614
commit f10acddb78
52 changed files with 85 additions and 2948 deletions

View File

@@ -24,15 +24,12 @@ from tests.e2e.conftest import VllmRunner
MODELS = [
"IntervitensInc/pangu-pro-moe-model",
]
# set additional config for ascend scheduler and torchair graph
# set additional config for torchair graph
ADDITIONAL_CONFIG = [{
"additional_config": {
"torchair_graph_config": {
"enabled": True
},
"ascend_scheduler_config": {
"enabled": True,
}
}
}]

View File

@@ -15,23 +15,14 @@ def test_e2e_ep_correctness(model_name):
max_tokens = 5
# FIXME: Really strange that chunked prefill might lead to different results, investigate further
with VllmRunner(
model_name,
tensor_parallel_size=2,
additional_config={"ascend_scheduler_config": {
"enabled": True
}},
enforce_eager=False) as vllm_model:
with VllmRunner(model_name, tensor_parallel_size=2,
enforce_eager=False) as vllm_model:
tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)
with VllmRunner(
model_name,
tensor_parallel_size=2,
enable_expert_parallel=True,
additional_config={"ascend_scheduler_config": {
"enabled": True
}},
enforce_eager=False) as vllm_model:
with VllmRunner(model_name,
tensor_parallel_size=2,
enable_expert_parallel=True,
enforce_eager=False) as vllm_model:
ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal(

View File

@@ -49,13 +49,7 @@ def test_generate_with_allgather():
tensor_parallel_size=2,
max_model_len=1024,
dtype="auto",
enable_expert_parallel=True,
additional_config={
"ascend_scheduler_config": {
"enabled": True,
"chunked_prefill_enabled": False,
},
}) as vllm_model:
enable_expert_parallel=True) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)
@@ -76,11 +70,5 @@ def test_generate_with_alltoall():
tensor_parallel_size=2,
max_model_len=1024,
dtype="auto",
enable_expert_parallel=True,
additional_config={
"ascend_scheduler_config": {
"enabled": True,
"chunked_prefill_enabled": False,
},
}) as vllm_model:
enable_expert_parallel=True) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)

View File

@@ -82,9 +82,6 @@ def test_models_distributed_DeepSeek_multistream_moe():
"enabled": True,
},
"enable_multistream_moe": True,
"ascend_scheduler_config": {
"enabled": True,
},
"refresh": True,
},
) as vllm_model:
@@ -154,14 +151,9 @@ def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
quantization="ascend",
enforce_eager=True,
enable_expert_parallel=True,
additional_config={
"torchair_graph_config": {
"enabled": False,
},
"ascend_scheduler_config": {
"enabled": True,
}
},
additional_config={"torchair_graph_config": {
"enabled": False,
}},
) as vllm_model:
vllm_model.generate_greedy(prompts, max_tokens)

View File

@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Compare the with and without prefix caching on V1 scheduler or AscendScheduler."""
"""Compare the with and without prefix caching on V1 scheduler."""
import pytest
@@ -84,67 +84,3 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
name_0="vllm_output",
name_1="prefix_cache_output",
)
@pytest.mark.skip(reason="Fix me, the accuracy is not correct")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [50])
def test_prefix_cache_with_ascend_scheduler(model: str,
max_tokens: int) -> None:
with VllmRunner(model,
additional_config={
'ascend_scheduler_config': {
'enabled': True,
},
},
enforce_eager=False,
max_model_len=2048,
tensor_parallel_size=2,
gpu_memory_utilization=0.7) as vllm_model:
vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)
with VllmRunner(model,
additional_config={
'ascend_scheduler_config': {
'enabled': True,
'enable_prefix_caching': True,
},
},
enforce_eager=False,
max_model_len=2048,
tensor_parallel_size=2,
gpu_memory_utilization=0.7) as vllm_model:
prefix_cache_output = vllm_model.generate_greedy(
INPUT_PROMPTS, max_tokens)
# TODO: enable apc and chunked prefill with ascend scheduler will lead accuracy problem.
# Disable it now. Fix it or drop the ascend scheduler in the future.
# with VllmRunner(model,
# additional_config={
# 'ascend_scheduler_config': {
# 'enabled': True,
# 'enable_prefix_caching': True,
# "enable_chunked_prefill": True,
# },
# },
# enforce_eager=True,
# max_model_len=2048,
# tensor_parallel_size=2,
# gpu_memory_utilization=0.7) as vllm_model:
# chunk_prefill_prefix_cache_output = vllm_model.generate_greedy(
# INPUT_PROMPTS, max_tokens)
check_outputs_equal(
outputs_0_lst=vllm_output,
outputs_1_lst=prefix_cache_output,
name_0="vllm_output",
name_1="prefix_cache_output",
)
# check_outputs_equal(
# outputs_0_lst=chunk_prefill_prefix_cache_output,
# outputs_1_lst=prefix_cache_output,
# name_0="chunk_prefill_prefix_cache_output",
# name_1="prefix_cache_output",
# )

View File

@@ -24,6 +24,7 @@ Run `pytest tests/e2e/multicard/test_qwen3_next.py`.
import os
from unittest.mock import patch
import pytest
from modelscope import snapshot_download # type: ignore
from tests.e2e.conftest import VllmRunner
@@ -63,6 +64,8 @@ def test_models_distributed_Qwen3_NEXT_TP4_FULL_DECODE_ONLY():
del vllm_model
@pytest.mark.skip(
reason="Qwen3-Next + MTP doesn't work with chunked prefill. Fix Me")
def test_models_distributed_Qwen3_NEXT_MTP_TP4_SIMILARITY():
example_prompts = [
"Hello, my name is",
@@ -89,12 +92,6 @@ def test_models_distributed_Qwen3_NEXT_MTP_TP4_SIMILARITY():
gpu_memory_utilization=0.8,
distributed_executor_backend="mp",
enforce_eager=True,
additional_config={
"ascend_scheduler_config": {
"enabled": True,
"enable_chunked_prefill": False
}
},
speculative_config={
"method": "qwen3_next_mtp",
"num_speculative_tokens": 1

View File

@@ -44,9 +44,6 @@ def _deepseek_torchair_test_fixture(
kwargs = {}
if not use_v1_schduler:
kwargs = {
"ascend_scheduler_config": {
"enabled": True,
},
"refresh": True,
}
additional_config.update(**kwargs)
@@ -120,9 +117,6 @@ def _pangu_torchair_test_fixture(
# torchair is only work without chunked-prefill now
kwargs = {
"ascend_scheduler_config": {
"enabled": True,
},
"refresh": True,
}
additional_config.update(**kwargs)
@@ -185,9 +179,6 @@ def _qwen_torchair_test_fixture(
"torchair_graph_config": {
"enabled": False,
},
"ascend_scheduler_config": {
"enabled": True,
},
"refresh": True,
}
@@ -244,9 +235,6 @@ def _deepseek_v2_lite_torchair_test_fixure(
kwargs = {}
if not use_v1_schduler:
kwargs = {
"ascend_scheduler_config": {
"enable": True,
},
"refresh": True,
}
additional_config.update(**kwargs)

View File

@@ -73,11 +73,7 @@ async def test_models(model: str, mode: str) -> None:
"VLLM_RPC_TIMEOUT": "3600000",
"VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": "3600000"
}
additional_config: dict[str, Any] = {
"ascend_scheduler_config": {
"enabled": False
},
}
additional_config: dict[str, Any] = {}
speculative_config = {
"num_speculative_tokens": 2,
"method": "deepseek_mtp"

View File

@@ -74,9 +74,6 @@ async def test_models(model: str) -> None:
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
}
additional_config = {
"ascend_scheduler_config": {
"enabled": False
},
"torchair_graph_config": {
"enabled": True,
"enable_multistream_moe": False,

View File

@@ -68,12 +68,7 @@ aisbench_cases75 = [{
async def test_models(model: str) -> None:
port = get_open_port()
env_dict = {"TASK_QUEUE_ENABLE": "1", "HCCL_OP_EXPANSION_MODE": "AIV"}
additional_config = {
"ascend_scheduler_config": {
"enabled": False
},
"enable_weight_nz_layout": True
}
additional_config = {"enable_weight_nz_layout": True}
server_args = [
"--quantization", "ascend", "--reasoning-parser", "qwen3",
"--tensor-parallel-size", "4", "--port",

View File

@@ -83,8 +83,7 @@ async def test_models(model: str, tp_size: int) -> None:
"0.9", "--block-size", "128", "--max-num-seqs", "256",
"--enforce-eager", "--max-model-len", "35840",
"--max-num-batched-tokens", "35840", "--additional-config",
'{"ascend_scheduler_config":{"enabled":true},"enable_weight_nz_layout":true}',
"--compilation-config",
'{"enable_weight_nz_layout":true}', "--compilation-config",
'{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[1,8,24,48,60]}'
]
with RemoteOpenAIServer(model,

View File

@@ -33,7 +33,6 @@ MODES = [
"single",
"aclgraph",
"aclgraph_mlapo",
"no_chunkprefill",
]
prompts = [
@@ -82,9 +81,6 @@ async def test_models(model: str, mode: str) -> None:
"method": "deepseek_mtp"
}
additional_config = {
"ascend_scheduler_config": {
"enabled": False
},
"torchair_graph_config": {
"enabled": True,
"enable_multistream_moe": False,
@@ -112,10 +108,6 @@ async def test_models(model: str, mode: str) -> None:
if mode == "aclgraph_mlapo":
env_dict["VLLM_ASCEND_ENABLE_MLAPO"] = "1"
additional_config["torchair_graph_config"] = {"enabled": False}
if mode == "no_chunkprefill":
additional_config["ascend_scheduler_config"] = {"enabled": True}
i = server_args.index("--max-num-batched-tokens") + 1
server_args[i] = "36864"
server_args.extend(["--additional-config", json.dumps(additional_config)])
request_keyword_args: dict[str, Any] = {
**api_keyword_args,

View File

@@ -71,9 +71,6 @@ async def test_models(model: str) -> None:
"cudagraph_mode": "FULL_DECODE_ONLY"
}
additional_config: dict[str, Any] = {
"ascend_scheduler_config": {
"enabled": False
},
"torchair_graph_config": {
"enabled": True
},

View File

@@ -92,7 +92,6 @@ async def test_models(model: str, tp_size: int, dp_size: int,
"--gpu-memory-utilization",
"0.9",
"--additional-config",
'{"ascend_scheduler_config":{"enabled":true},'
'"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}',
]
if full_graph:

View File

@@ -85,9 +85,8 @@ async def test_models(model: str, tp_size: int) -> None:
str(tp_size), "--port",
str(port), "--max-model-len", "30000", "--max-num-batched-tokens",
"40000", "--max-num-seqs", "400", "--trust-remote-code",
"--gpu-memory-utilization", "0.8", "--additional-config",
'{"ascend_scheduler_config":{"enabled":false}}',
"--compilation_config", '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
"--gpu-memory-utilization", "0.8", "--compilation_config",
'{"cudagraph_mode": "FULL_DECODE_ONLY"}'
]
request_keyword_args: dict[str, Any] = {
**api_keyword_args,

View File

@@ -60,11 +60,7 @@ async def test_models(model: str) -> None:
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"
}
additional_config: dict[str, Any] = {
"ascend_scheduler_config": {
"enabled": False
},
}
additional_config: dict[str, Any] = {}
compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"}
server_args = [
"--quantization", "ascend", "--async-scheduling",

View File

@@ -63,11 +63,6 @@ async def test_models(model: str, mode: str) -> None:
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"
}
additional_config: dict[str, Any] = {
"ascend_scheduler_config": {
"enabled": False
},
}
compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"}
server_args = [
"--quantization", "ascend", "--async-scheduling",
@@ -82,7 +77,6 @@ async def test_models(model: str, mode: str) -> None:
server_args.extend(
["--compilation-config",
json.dumps(compilation_config)])
server_args.extend(["--additional-config", json.dumps(additional_config)])
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
}

View File

@@ -93,8 +93,6 @@ async def test_models(model: str, mode: str, tp_size: int) -> None:
server_args.remove(
'{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 8, 24, 48, 60]}'
)
server_args.append("--additional-config")
server_args.append('{"ascend_scheduler_config":{"enabled":true}}')
server_args.append("--enforce-eager")
request_keyword_args: dict[str, Any] = {
**api_keyword_args,

View File

@@ -30,7 +30,7 @@ deployment:
--quantization ascend
--gpu-memory-utilization 0.9
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
--additional-config '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
--additional-config '{"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
-
server_cmd: >
@@ -51,7 +51,7 @@ deployment:
--quantization ascend
--gpu-memory-utilization 0.9
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
--additional-config '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
--additional-config '{"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
benchmarks:
acc:
case_type: accuracy

View File

@@ -31,7 +31,7 @@ deployment:
--gpu-memory-utilization 0.9
--enforce-eager
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
--additional-config '{"ascend_scheduler_config":{"enabled":false},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
--additional-config '{"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
-
server_cmd: >
@@ -53,5 +53,5 @@ deployment:
--gpu-memory-utilization 0.9
--enforce-eager
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
--additional-config '{"ascend_scheduler_config":{"enabled":false},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
--additional-config '{"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
benchmarks:

View File

@@ -50,7 +50,7 @@ deployment:
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}'
--additional-config
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
'{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
-
server_cmd: >
@@ -80,7 +80,7 @@ deployment:
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}'
--additional-config
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
'{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
-
server_cmd: >
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
@@ -111,7 +111,7 @@ deployment:
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}'
--additional-config
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
'{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
-
server_cmd: >
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
@@ -141,7 +141,7 @@ deployment:
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}'
--additional-config
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
'{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
benchmarks:
perf:
case_type: performance

View File

@@ -49,7 +49,7 @@ deployment:
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}'
--additional-config
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
'{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
-
server_cmd: >
@@ -79,7 +79,7 @@ deployment:
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}'
--additional-config
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
'{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
-
server_cmd: >
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
@@ -110,7 +110,7 @@ deployment:
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}'
--additional-config
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
'{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
-
server_cmd: >
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
@@ -140,7 +140,7 @@ deployment:
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}'
--additional-config
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
'{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
benchmarks:
perf:
case_type: performance

View File

@@ -29,7 +29,7 @@ deployment:
--trust-remote-code
--no-enable-prefix-caching
--gpu-memory-utilization 0.9
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
--additional-config '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
-
server_cmd: >
@@ -49,5 +49,5 @@ deployment:
--trust-remote-code
--no-enable-prefix-caching
--gpu-memory-utilization 0.92
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
--additional-config '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
benchmarks:

View File

@@ -48,27 +48,26 @@ def mtp_correctness(sampling_config: SamplingParams,
if graph_mode == CUDAGraphMode.FULL:
graph_mode_str = "FULL_DECODE_ONLY"
with VllmRunner(
model_name,
tensor_parallel_size=1,
max_num_seqs=256,
gpu_memory_utilization=0.7,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method": "deepseek_mtp",
"num_speculative_tokens": num_speculative_tokens,
"disable_padded_drafter_batch": disable_padded_drafter_batch,
},
enforce_eager=enforce_eager,
max_model_len=2000,
compilation_config=CompilationConfig(
cudagraph_mode=graph_mode_str,
cudagraph_capture_sizes=[12],
),
additional_config={"ascend_scheduler_config": {
"enabled": False
}}) as spec_llm:
with VllmRunner(model_name,
tensor_parallel_size=1,
max_num_seqs=256,
gpu_memory_utilization=0.7,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method":
"deepseek_mtp",
"num_speculative_tokens":
num_speculative_tokens,
"disable_padded_drafter_batch":
disable_padded_drafter_batch,
},
enforce_eager=enforce_eager,
max_model_len=2000,
compilation_config=CompilationConfig(
cudagraph_mode=graph_mode_str,
cudagraph_capture_sizes=[12],
)) as spec_llm:
spec_outputs = spec_llm.generate(example_prompts, sampling_config)
matches = 0

View File

@@ -1,170 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm import SamplingParams
from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal
MODEL = "Qwen/Qwen3-0.6B"
@pytest.mark.parametrize("enforce_eager", [True, False])
def test_concurrent_partial_prefill(enforce_eager):
with VllmRunner(MODEL,
additional_config={
'ascend_scheduler_config': {
'enabled': True,
},
},
max_num_seqs=3,
max_num_batched_tokens=8192,
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7) as vllm_model:
outputs = vllm_model.model.generate(["Hello my name is Robert and I"] *
3)
assert len(outputs) == 3
for output in outputs:
assert len(output.outputs) == 1
@pytest.mark.parametrize("enforce_eager", [True, False])
def test_prefix_cache_stats_is_recorded(enforce_eager):
with VllmRunner(MODEL,
additional_config={
'ascend_scheduler_config': {
'enabled': True,
},
},
max_num_seqs=3,
max_num_batched_tokens=8192,
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7) as vllm_model:
# 17 tokens will make sure first 16 tokens are cached in a block
input_tokens = {"prompt_token_ids": [101] * 129}
_ = vllm_model.model.generate([input_tokens])
outputs = vllm_model.model.generate([input_tokens])
assert outputs[0].num_cached_tokens == 128
@pytest.mark.parametrize("max_tokens",
[4]) # cannot align results when max_tokens > 4
@pytest.mark.parametrize("chunked_prefill_token_size", [2048])
def test_chunked_prefill_with_ascend_scheduler(
max_tokens: int, chunked_prefill_token_size: int) -> None:
example_prompts = [
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
]
max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size
with VllmRunner(MODEL,
additional_config={
'ascend_scheduler_config': {
'enabled': True,
'enable_chunked_prefill': True,
},
},
max_num_seqs=max_num_seqs,
max_num_batched_tokens=max_num_batched_tokens,
max_model_len=2048,
gpu_memory_utilization=0.7) as vllm_model:
chunked_prefill_output = vllm_model.generate_greedy(
example_prompts, max_tokens)
with VllmRunner(MODEL,
additional_config={
'ascend_scheduler_config': {
'enabled': True,
},
},
max_model_len=2048,
gpu_memory_utilization=0.7) as vllm_model:
vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal(
outputs_0_lst=vllm_output,
outputs_1_lst=chunked_prefill_output,
name_0="vllm_output",
name_1="chunked_prefill_output",
)
@pytest.mark.parametrize("max_tokens",
[4]) # cannot align results when max_tokens > 4
@pytest.mark.parametrize("chunked_prefill_token_size", [2048])
def test_chunked_prefill_with_scheduler_dynamic_batch(
max_tokens: int, chunked_prefill_token_size: int) -> None:
example_prompts = [
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
]
max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size
with VllmRunner(MODEL,
additional_config={
'SLO_limits_for_dynamic_batch': 0,
},
max_num_seqs=max_num_seqs,
max_num_batched_tokens=max_num_batched_tokens,
max_model_len=2048,
gpu_memory_utilization=0.7) as vllm_model:
dynamic_batch_output = vllm_model.generate_greedy(
example_prompts, max_tokens)
with VllmRunner(MODEL,
additional_config={
'SLO_limits_for_dynamic_batch': -1,
},
max_model_len=2048,
gpu_memory_utilization=0.7) as vllm_model:
vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal(
outputs_0_lst=vllm_output,
outputs_1_lst=dynamic_batch_output,
name_0="vllm_output",
name_1="chunked_prefill_output",
)
def test_async_scheduling_eager() -> None:
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
] * 10
sampling_params = SamplingParams(temperature=0.2,
max_tokens=10,
stop_token_ids=None)
with VllmRunner(
"Qwen/Qwen2.5-0.5B-Instruct",
max_model_len=4096,
max_num_seqs=50,
dtype="bfloat16",
gpu_memory_utilization=0.9,
async_scheduling=True,
) as vllm_model:
vllm_model.generate(prompts, sampling_params=sampling_params)
def test_async_scheduling_with_full_graph() -> None:
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
] * 10
sampling_params = SamplingParams(temperature=0.2,
max_tokens=10,
stop_token_ids=None)
with VllmRunner("Qwen/Qwen3-8B",
max_model_len=4096,
max_num_seqs=50,
dtype="bfloat16",
gpu_memory_utilization=0.9,
async_scheduling=True,
compilation_config={"cudagraph_mode":
"FULL"}) as vllm_model:
vllm_model.generate(prompts, sampling_params=sampling_params)

View File

@@ -1,82 +0,0 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Compare the outputs of vLLM with and without aclgraph.
Run `pytest tests/compile/test_aclgraph.py`.
"""
import gc
import pytest
import torch
from vllm import SamplingParams
from tests.e2e.conftest import VllmRunner
MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [1])
def test_models(
model: str,
max_tokens: int,
) -> None:
prompts = ["The president of the United States is"]
sampling_params = SamplingParams(
max_tokens=max_tokens,
temperature=0.0,
)
with VllmRunner(model,
long_prefill_token_threshold=20,
enforce_eager=False) as vllm_model:
output1 = vllm_model.generate(prompts, sampling_params)
with VllmRunner(model,
enforce_eager=False,
additional_config={
'ascend_scheduler_config': {
'enabled': True
},
}) as vllm_model:
output2 = vllm_model.generate(prompts, sampling_params)
# Extract the generated token IDs for comparison
token_ids1 = output1[0][0][0]
token_ids2 = output2[0][0][0]
print(f"Token IDs 1: {token_ids1}")
print(f"Token IDs 2: {token_ids2}")
# Convert token IDs to tensors and calculate cosine similarity
# Take the length of a shorter sequence to ensure consistent dimensions
min_len = min(len(token_ids1), len(token_ids2))
tensor1 = torch.tensor(token_ids1[:min_len], dtype=torch.float32)
tensor2 = torch.tensor(token_ids2[:min_len], dtype=torch.float32)
# Calculate similarity using torch.cosine_similarity
similarity = torch.cosine_similarity(tensor1, tensor2, dim=0)
print(f"Token IDs cosine similarity: {similarity.item()}")
assert similarity > 0.95
gc.collect()
torch.npu.empty_cache()
torch.npu.reset_peak_memory_stats()

View File

@@ -20,7 +20,6 @@
Run `pytest tests/test_offline_inference.py`.
"""
import pytest
from vllm import SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
@@ -55,40 +54,6 @@ def test_multimodal_vl(prompt_template):
assert output_str, "Generated output should not be empty."
@pytest.mark.skip(reason="This e2e test will stuck in multi-batch scenario. "
"Add this back after fixing the issue.")
def test_multimodal_ascend_scheduler(prompt_template):
image = ImageAsset("cherry_blossom") \
.pil_image.convert("RGB")
img_questions = [
"What is the content of this image?",
"Describe the content of this image in detail.",
"What's in the image?",
"Where is this image taken?",
]
images = [image] * len(img_questions)
prompts = prompt_template(img_questions)
with VllmRunner("Qwen/Qwen2.5-VL-3B-Instruct",
max_model_len=4096,
additional_config={
'ascend_scheduler_config': {
'enabled': True,
},
},
mm_processor_kwargs={
"min_pixels": 28 * 28,
"max_pixels": 1280 * 28 * 28,
"fps": 1,
},
enforce_eager=True) as vllm_model:
outputs = vllm_model.generate_greedy(prompts=prompts,
images=images,
max_tokens=64)
assert len(outputs) == len(prompts)
for _, output_str in outputs:
assert output_str, "Generated output should not be empty."
def test_multimodal_audio():
audio_prompt = "".join([
f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"