drop ascend scheduler (#4498)
Ascend scheduler was added for non chunk prefill case before, since that the npu ops didn't work well with chunked prefill. Now the ops with chunked prefill work better, it's time to remove the ascend scheduler to use vLLM default scheduler. - vLLM version: v0.11.2 --------- Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -24,15 +24,12 @@ from tests.e2e.conftest import VllmRunner
|
||||
MODELS = [
|
||||
"IntervitensInc/pangu-pro-moe-model",
|
||||
]
|
||||
# set additional config for ascend scheduler and torchair graph
|
||||
# set additional config for torchair graph
|
||||
ADDITIONAL_CONFIG = [{
|
||||
"additional_config": {
|
||||
"torchair_graph_config": {
|
||||
"enabled": True
|
||||
},
|
||||
"ascend_scheduler_config": {
|
||||
"enabled": True,
|
||||
}
|
||||
}
|
||||
}]
|
||||
|
||||
|
||||
@@ -15,23 +15,14 @@ def test_e2e_ep_correctness(model_name):
|
||||
max_tokens = 5
|
||||
|
||||
# FIXME: Really strange that chunked prefill might lead to different results, investigate further
|
||||
with VllmRunner(
|
||||
model_name,
|
||||
tensor_parallel_size=2,
|
||||
additional_config={"ascend_scheduler_config": {
|
||||
"enabled": True
|
||||
}},
|
||||
enforce_eager=False) as vllm_model:
|
||||
with VllmRunner(model_name, tensor_parallel_size=2,
|
||||
enforce_eager=False) as vllm_model:
|
||||
tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
with VllmRunner(
|
||||
model_name,
|
||||
tensor_parallel_size=2,
|
||||
enable_expert_parallel=True,
|
||||
additional_config={"ascend_scheduler_config": {
|
||||
"enabled": True
|
||||
}},
|
||||
enforce_eager=False) as vllm_model:
|
||||
with VllmRunner(model_name,
|
||||
tensor_parallel_size=2,
|
||||
enable_expert_parallel=True,
|
||||
enforce_eager=False) as vllm_model:
|
||||
ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
check_outputs_equal(
|
||||
|
||||
@@ -49,13 +49,7 @@ def test_generate_with_allgather():
|
||||
tensor_parallel_size=2,
|
||||
max_model_len=1024,
|
||||
dtype="auto",
|
||||
enable_expert_parallel=True,
|
||||
additional_config={
|
||||
"ascend_scheduler_config": {
|
||||
"enabled": True,
|
||||
"chunked_prefill_enabled": False,
|
||||
},
|
||||
}) as vllm_model:
|
||||
enable_expert_parallel=True) as vllm_model:
|
||||
vllm_model.generate(example_prompts, sampling_params)
|
||||
|
||||
|
||||
@@ -76,11 +70,5 @@ def test_generate_with_alltoall():
|
||||
tensor_parallel_size=2,
|
||||
max_model_len=1024,
|
||||
dtype="auto",
|
||||
enable_expert_parallel=True,
|
||||
additional_config={
|
||||
"ascend_scheduler_config": {
|
||||
"enabled": True,
|
||||
"chunked_prefill_enabled": False,
|
||||
},
|
||||
}) as vllm_model:
|
||||
enable_expert_parallel=True) as vllm_model:
|
||||
vllm_model.generate(example_prompts, sampling_params)
|
||||
|
||||
@@ -82,9 +82,6 @@ def test_models_distributed_DeepSeek_multistream_moe():
|
||||
"enabled": True,
|
||||
},
|
||||
"enable_multistream_moe": True,
|
||||
"ascend_scheduler_config": {
|
||||
"enabled": True,
|
||||
},
|
||||
"refresh": True,
|
||||
},
|
||||
) as vllm_model:
|
||||
@@ -154,14 +151,9 @@ def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
|
||||
quantization="ascend",
|
||||
enforce_eager=True,
|
||||
enable_expert_parallel=True,
|
||||
additional_config={
|
||||
"torchair_graph_config": {
|
||||
"enabled": False,
|
||||
},
|
||||
"ascend_scheduler_config": {
|
||||
"enabled": True,
|
||||
}
|
||||
},
|
||||
additional_config={"torchair_graph_config": {
|
||||
"enabled": False,
|
||||
}},
|
||||
) as vllm_model:
|
||||
vllm_model.generate_greedy(prompts, max_tokens)
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Compare the with and without prefix caching on V1 scheduler or AscendScheduler."""
|
||||
"""Compare the with and without prefix caching on V1 scheduler."""
|
||||
|
||||
import pytest
|
||||
|
||||
@@ -84,67 +84,3 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
|
||||
name_0="vllm_output",
|
||||
name_1="prefix_cache_output",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Fix me, the accuracy is not correct")
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("max_tokens", [50])
|
||||
def test_prefix_cache_with_ascend_scheduler(model: str,
|
||||
max_tokens: int) -> None:
|
||||
|
||||
with VllmRunner(model,
|
||||
additional_config={
|
||||
'ascend_scheduler_config': {
|
||||
'enabled': True,
|
||||
},
|
||||
},
|
||||
enforce_eager=False,
|
||||
max_model_len=2048,
|
||||
tensor_parallel_size=2,
|
||||
gpu_memory_utilization=0.7) as vllm_model:
|
||||
vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)
|
||||
|
||||
with VllmRunner(model,
|
||||
additional_config={
|
||||
'ascend_scheduler_config': {
|
||||
'enabled': True,
|
||||
'enable_prefix_caching': True,
|
||||
},
|
||||
},
|
||||
enforce_eager=False,
|
||||
max_model_len=2048,
|
||||
tensor_parallel_size=2,
|
||||
gpu_memory_utilization=0.7) as vllm_model:
|
||||
prefix_cache_output = vllm_model.generate_greedy(
|
||||
INPUT_PROMPTS, max_tokens)
|
||||
|
||||
# TODO: enable apc and chunked prefill with ascend scheduler will lead accuracy problem.
|
||||
# Disable it now. Fix it or drop the ascend scheduler in the future.
|
||||
# with VllmRunner(model,
|
||||
# additional_config={
|
||||
# 'ascend_scheduler_config': {
|
||||
# 'enabled': True,
|
||||
# 'enable_prefix_caching': True,
|
||||
# "enable_chunked_prefill": True,
|
||||
# },
|
||||
# },
|
||||
# enforce_eager=True,
|
||||
# max_model_len=2048,
|
||||
# tensor_parallel_size=2,
|
||||
# gpu_memory_utilization=0.7) as vllm_model:
|
||||
# chunk_prefill_prefix_cache_output = vllm_model.generate_greedy(
|
||||
# INPUT_PROMPTS, max_tokens)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=vllm_output,
|
||||
outputs_1_lst=prefix_cache_output,
|
||||
name_0="vllm_output",
|
||||
name_1="prefix_cache_output",
|
||||
)
|
||||
|
||||
# check_outputs_equal(
|
||||
# outputs_0_lst=chunk_prefill_prefix_cache_output,
|
||||
# outputs_1_lst=prefix_cache_output,
|
||||
# name_0="chunk_prefill_prefix_cache_output",
|
||||
# name_1="prefix_cache_output",
|
||||
# )
|
||||
|
||||
@@ -24,6 +24,7 @@ Run `pytest tests/e2e/multicard/test_qwen3_next.py`.
|
||||
import os
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
from modelscope import snapshot_download # type: ignore
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
@@ -63,6 +64,8 @@ def test_models_distributed_Qwen3_NEXT_TP4_FULL_DECODE_ONLY():
|
||||
del vllm_model
|
||||
|
||||
|
||||
@pytest.mark.skip(
|
||||
reason="Qwen3-Next + MTP doesn't work with chunked prefill. Fix Me")
|
||||
def test_models_distributed_Qwen3_NEXT_MTP_TP4_SIMILARITY():
|
||||
example_prompts = [
|
||||
"Hello, my name is",
|
||||
@@ -89,12 +92,6 @@ def test_models_distributed_Qwen3_NEXT_MTP_TP4_SIMILARITY():
|
||||
gpu_memory_utilization=0.8,
|
||||
distributed_executor_backend="mp",
|
||||
enforce_eager=True,
|
||||
additional_config={
|
||||
"ascend_scheduler_config": {
|
||||
"enabled": True,
|
||||
"enable_chunked_prefill": False
|
||||
}
|
||||
},
|
||||
speculative_config={
|
||||
"method": "qwen3_next_mtp",
|
||||
"num_speculative_tokens": 1
|
||||
|
||||
@@ -44,9 +44,6 @@ def _deepseek_torchair_test_fixture(
|
||||
kwargs = {}
|
||||
if not use_v1_schduler:
|
||||
kwargs = {
|
||||
"ascend_scheduler_config": {
|
||||
"enabled": True,
|
||||
},
|
||||
"refresh": True,
|
||||
}
|
||||
additional_config.update(**kwargs)
|
||||
@@ -120,9 +117,6 @@ def _pangu_torchair_test_fixture(
|
||||
|
||||
# torchair is only work without chunked-prefill now
|
||||
kwargs = {
|
||||
"ascend_scheduler_config": {
|
||||
"enabled": True,
|
||||
},
|
||||
"refresh": True,
|
||||
}
|
||||
additional_config.update(**kwargs)
|
||||
@@ -185,9 +179,6 @@ def _qwen_torchair_test_fixture(
|
||||
"torchair_graph_config": {
|
||||
"enabled": False,
|
||||
},
|
||||
"ascend_scheduler_config": {
|
||||
"enabled": True,
|
||||
},
|
||||
"refresh": True,
|
||||
}
|
||||
|
||||
@@ -244,9 +235,6 @@ def _deepseek_v2_lite_torchair_test_fixure(
|
||||
kwargs = {}
|
||||
if not use_v1_schduler:
|
||||
kwargs = {
|
||||
"ascend_scheduler_config": {
|
||||
"enable": True,
|
||||
},
|
||||
"refresh": True,
|
||||
}
|
||||
additional_config.update(**kwargs)
|
||||
|
||||
@@ -73,11 +73,7 @@ async def test_models(model: str, mode: str) -> None:
|
||||
"VLLM_RPC_TIMEOUT": "3600000",
|
||||
"VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": "3600000"
|
||||
}
|
||||
additional_config: dict[str, Any] = {
|
||||
"ascend_scheduler_config": {
|
||||
"enabled": False
|
||||
},
|
||||
}
|
||||
additional_config: dict[str, Any] = {}
|
||||
speculative_config = {
|
||||
"num_speculative_tokens": 2,
|
||||
"method": "deepseek_mtp"
|
||||
|
||||
@@ -74,9 +74,6 @@ async def test_models(model: str) -> None:
|
||||
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
|
||||
}
|
||||
additional_config = {
|
||||
"ascend_scheduler_config": {
|
||||
"enabled": False
|
||||
},
|
||||
"torchair_graph_config": {
|
||||
"enabled": True,
|
||||
"enable_multistream_moe": False,
|
||||
|
||||
@@ -68,12 +68,7 @@ aisbench_cases75 = [{
|
||||
async def test_models(model: str) -> None:
|
||||
port = get_open_port()
|
||||
env_dict = {"TASK_QUEUE_ENABLE": "1", "HCCL_OP_EXPANSION_MODE": "AIV"}
|
||||
additional_config = {
|
||||
"ascend_scheduler_config": {
|
||||
"enabled": False
|
||||
},
|
||||
"enable_weight_nz_layout": True
|
||||
}
|
||||
additional_config = {"enable_weight_nz_layout": True}
|
||||
server_args = [
|
||||
"--quantization", "ascend", "--reasoning-parser", "qwen3",
|
||||
"--tensor-parallel-size", "4", "--port",
|
||||
|
||||
@@ -83,8 +83,7 @@ async def test_models(model: str, tp_size: int) -> None:
|
||||
"0.9", "--block-size", "128", "--max-num-seqs", "256",
|
||||
"--enforce-eager", "--max-model-len", "35840",
|
||||
"--max-num-batched-tokens", "35840", "--additional-config",
|
||||
'{"ascend_scheduler_config":{"enabled":true},"enable_weight_nz_layout":true}',
|
||||
"--compilation-config",
|
||||
'{"enable_weight_nz_layout":true}', "--compilation-config",
|
||||
'{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[1,8,24,48,60]}'
|
||||
]
|
||||
with RemoteOpenAIServer(model,
|
||||
|
||||
@@ -33,7 +33,6 @@ MODES = [
|
||||
"single",
|
||||
"aclgraph",
|
||||
"aclgraph_mlapo",
|
||||
"no_chunkprefill",
|
||||
]
|
||||
|
||||
prompts = [
|
||||
@@ -82,9 +81,6 @@ async def test_models(model: str, mode: str) -> None:
|
||||
"method": "deepseek_mtp"
|
||||
}
|
||||
additional_config = {
|
||||
"ascend_scheduler_config": {
|
||||
"enabled": False
|
||||
},
|
||||
"torchair_graph_config": {
|
||||
"enabled": True,
|
||||
"enable_multistream_moe": False,
|
||||
@@ -112,10 +108,6 @@ async def test_models(model: str, mode: str) -> None:
|
||||
if mode == "aclgraph_mlapo":
|
||||
env_dict["VLLM_ASCEND_ENABLE_MLAPO"] = "1"
|
||||
additional_config["torchair_graph_config"] = {"enabled": False}
|
||||
if mode == "no_chunkprefill":
|
||||
additional_config["ascend_scheduler_config"] = {"enabled": True}
|
||||
i = server_args.index("--max-num-batched-tokens") + 1
|
||||
server_args[i] = "36864"
|
||||
server_args.extend(["--additional-config", json.dumps(additional_config)])
|
||||
request_keyword_args: dict[str, Any] = {
|
||||
**api_keyword_args,
|
||||
|
||||
@@ -71,9 +71,6 @@ async def test_models(model: str) -> None:
|
||||
"cudagraph_mode": "FULL_DECODE_ONLY"
|
||||
}
|
||||
additional_config: dict[str, Any] = {
|
||||
"ascend_scheduler_config": {
|
||||
"enabled": False
|
||||
},
|
||||
"torchair_graph_config": {
|
||||
"enabled": True
|
||||
},
|
||||
|
||||
@@ -92,7 +92,6 @@ async def test_models(model: str, tp_size: int, dp_size: int,
|
||||
"--gpu-memory-utilization",
|
||||
"0.9",
|
||||
"--additional-config",
|
||||
'{"ascend_scheduler_config":{"enabled":true},'
|
||||
'"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}',
|
||||
]
|
||||
if full_graph:
|
||||
|
||||
@@ -85,9 +85,8 @@ async def test_models(model: str, tp_size: int) -> None:
|
||||
str(tp_size), "--port",
|
||||
str(port), "--max-model-len", "30000", "--max-num-batched-tokens",
|
||||
"40000", "--max-num-seqs", "400", "--trust-remote-code",
|
||||
"--gpu-memory-utilization", "0.8", "--additional-config",
|
||||
'{"ascend_scheduler_config":{"enabled":false}}',
|
||||
"--compilation_config", '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
|
||||
"--gpu-memory-utilization", "0.8", "--compilation_config",
|
||||
'{"cudagraph_mode": "FULL_DECODE_ONLY"}'
|
||||
]
|
||||
request_keyword_args: dict[str, Any] = {
|
||||
**api_keyword_args,
|
||||
|
||||
@@ -60,11 +60,7 @@ async def test_models(model: str) -> None:
|
||||
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
|
||||
"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"
|
||||
}
|
||||
additional_config: dict[str, Any] = {
|
||||
"ascend_scheduler_config": {
|
||||
"enabled": False
|
||||
},
|
||||
}
|
||||
additional_config: dict[str, Any] = {}
|
||||
compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"}
|
||||
server_args = [
|
||||
"--quantization", "ascend", "--async-scheduling",
|
||||
|
||||
@@ -63,11 +63,6 @@ async def test_models(model: str, mode: str) -> None:
|
||||
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
|
||||
"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"
|
||||
}
|
||||
additional_config: dict[str, Any] = {
|
||||
"ascend_scheduler_config": {
|
||||
"enabled": False
|
||||
},
|
||||
}
|
||||
compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"}
|
||||
server_args = [
|
||||
"--quantization", "ascend", "--async-scheduling",
|
||||
@@ -82,7 +77,6 @@ async def test_models(model: str, mode: str) -> None:
|
||||
server_args.extend(
|
||||
["--compilation-config",
|
||||
json.dumps(compilation_config)])
|
||||
server_args.extend(["--additional-config", json.dumps(additional_config)])
|
||||
request_keyword_args: dict[str, Any] = {
|
||||
**api_keyword_args,
|
||||
}
|
||||
|
||||
@@ -93,8 +93,6 @@ async def test_models(model: str, mode: str, tp_size: int) -> None:
|
||||
server_args.remove(
|
||||
'{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 8, 24, 48, 60]}'
|
||||
)
|
||||
server_args.append("--additional-config")
|
||||
server_args.append('{"ascend_scheduler_config":{"enabled":true}}')
|
||||
server_args.append("--enforce-eager")
|
||||
request_keyword_args: dict[str, Any] = {
|
||||
**api_keyword_args,
|
||||
|
||||
@@ -30,7 +30,7 @@ deployment:
|
||||
--quantization ascend
|
||||
--gpu-memory-utilization 0.9
|
||||
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
|
||||
--additional-config '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
|
||||
--additional-config '{"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
|
||||
|
||||
-
|
||||
server_cmd: >
|
||||
@@ -51,7 +51,7 @@ deployment:
|
||||
--quantization ascend
|
||||
--gpu-memory-utilization 0.9
|
||||
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
|
||||
--additional-config '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
|
||||
--additional-config '{"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
|
||||
benchmarks:
|
||||
acc:
|
||||
case_type: accuracy
|
||||
|
||||
@@ -31,7 +31,7 @@ deployment:
|
||||
--gpu-memory-utilization 0.9
|
||||
--enforce-eager
|
||||
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
|
||||
--additional-config '{"ascend_scheduler_config":{"enabled":false},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
|
||||
--additional-config '{"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
|
||||
|
||||
-
|
||||
server_cmd: >
|
||||
@@ -53,5 +53,5 @@ deployment:
|
||||
--gpu-memory-utilization 0.9
|
||||
--enforce-eager
|
||||
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
|
||||
--additional-config '{"ascend_scheduler_config":{"enabled":false},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
|
||||
--additional-config '{"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
|
||||
benchmarks:
|
||||
|
||||
@@ -50,7 +50,7 @@ deployment:
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
||||
}'
|
||||
--additional-config
|
||||
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
|
||||
'{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
|
||||
|
||||
-
|
||||
server_cmd: >
|
||||
@@ -80,7 +80,7 @@ deployment:
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
||||
}'
|
||||
--additional-config
|
||||
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
|
||||
'{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
|
||||
-
|
||||
server_cmd: >
|
||||
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
|
||||
@@ -111,7 +111,7 @@ deployment:
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
||||
}'
|
||||
--additional-config
|
||||
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
|
||||
'{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
|
||||
-
|
||||
server_cmd: >
|
||||
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
|
||||
@@ -141,7 +141,7 @@ deployment:
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
||||
}'
|
||||
--additional-config
|
||||
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
|
||||
'{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
|
||||
benchmarks:
|
||||
perf:
|
||||
case_type: performance
|
||||
|
||||
@@ -49,7 +49,7 @@ deployment:
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
||||
}'
|
||||
--additional-config
|
||||
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
|
||||
'{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
|
||||
|
||||
-
|
||||
server_cmd: >
|
||||
@@ -79,7 +79,7 @@ deployment:
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
||||
}'
|
||||
--additional-config
|
||||
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
|
||||
'{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
|
||||
-
|
||||
server_cmd: >
|
||||
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
|
||||
@@ -110,7 +110,7 @@ deployment:
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
||||
}'
|
||||
--additional-config
|
||||
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
|
||||
'{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
|
||||
-
|
||||
server_cmd: >
|
||||
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
|
||||
@@ -140,7 +140,7 @@ deployment:
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
||||
}'
|
||||
--additional-config
|
||||
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
|
||||
'{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
|
||||
benchmarks:
|
||||
perf:
|
||||
case_type: performance
|
||||
|
||||
@@ -29,7 +29,7 @@ deployment:
|
||||
--trust-remote-code
|
||||
--no-enable-prefix-caching
|
||||
--gpu-memory-utilization 0.9
|
||||
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
|
||||
--additional-config '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
|
||||
|
||||
-
|
||||
server_cmd: >
|
||||
@@ -49,5 +49,5 @@ deployment:
|
||||
--trust-remote-code
|
||||
--no-enable-prefix-caching
|
||||
--gpu-memory-utilization 0.92
|
||||
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
|
||||
--additional-config '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
|
||||
benchmarks:
|
||||
|
||||
@@ -48,27 +48,26 @@ def mtp_correctness(sampling_config: SamplingParams,
|
||||
if graph_mode == CUDAGraphMode.FULL:
|
||||
graph_mode_str = "FULL_DECODE_ONLY"
|
||||
|
||||
with VllmRunner(
|
||||
model_name,
|
||||
tensor_parallel_size=1,
|
||||
max_num_seqs=256,
|
||||
gpu_memory_utilization=0.7,
|
||||
distributed_executor_backend="mp",
|
||||
enable_expert_parallel=True,
|
||||
speculative_config={
|
||||
"method": "deepseek_mtp",
|
||||
"num_speculative_tokens": num_speculative_tokens,
|
||||
"disable_padded_drafter_batch": disable_padded_drafter_batch,
|
||||
},
|
||||
enforce_eager=enforce_eager,
|
||||
max_model_len=2000,
|
||||
compilation_config=CompilationConfig(
|
||||
cudagraph_mode=graph_mode_str,
|
||||
cudagraph_capture_sizes=[12],
|
||||
),
|
||||
additional_config={"ascend_scheduler_config": {
|
||||
"enabled": False
|
||||
}}) as spec_llm:
|
||||
with VllmRunner(model_name,
|
||||
tensor_parallel_size=1,
|
||||
max_num_seqs=256,
|
||||
gpu_memory_utilization=0.7,
|
||||
distributed_executor_backend="mp",
|
||||
enable_expert_parallel=True,
|
||||
speculative_config={
|
||||
"method":
|
||||
"deepseek_mtp",
|
||||
"num_speculative_tokens":
|
||||
num_speculative_tokens,
|
||||
"disable_padded_drafter_batch":
|
||||
disable_padded_drafter_batch,
|
||||
},
|
||||
enforce_eager=enforce_eager,
|
||||
max_model_len=2000,
|
||||
compilation_config=CompilationConfig(
|
||||
cudagraph_mode=graph_mode_str,
|
||||
cudagraph_capture_sizes=[12],
|
||||
)) as spec_llm:
|
||||
spec_outputs = spec_llm.generate(example_prompts, sampling_config)
|
||||
|
||||
matches = 0
|
||||
|
||||
@@ -1,170 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
from vllm import SamplingParams
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
from tests.e2e.model_utils import check_outputs_equal
|
||||
|
||||
MODEL = "Qwen/Qwen3-0.6B"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("enforce_eager", [True, False])
|
||||
def test_concurrent_partial_prefill(enforce_eager):
|
||||
with VllmRunner(MODEL,
|
||||
additional_config={
|
||||
'ascend_scheduler_config': {
|
||||
'enabled': True,
|
||||
},
|
||||
},
|
||||
max_num_seqs=3,
|
||||
max_num_batched_tokens=8192,
|
||||
enforce_eager=enforce_eager,
|
||||
gpu_memory_utilization=0.7) as vllm_model:
|
||||
outputs = vllm_model.model.generate(["Hello my name is Robert and I"] *
|
||||
3)
|
||||
assert len(outputs) == 3
|
||||
for output in outputs:
|
||||
assert len(output.outputs) == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize("enforce_eager", [True, False])
|
||||
def test_prefix_cache_stats_is_recorded(enforce_eager):
|
||||
with VllmRunner(MODEL,
|
||||
additional_config={
|
||||
'ascend_scheduler_config': {
|
||||
'enabled': True,
|
||||
},
|
||||
},
|
||||
max_num_seqs=3,
|
||||
max_num_batched_tokens=8192,
|
||||
enforce_eager=enforce_eager,
|
||||
gpu_memory_utilization=0.7) as vllm_model:
|
||||
# 17 tokens will make sure first 16 tokens are cached in a block
|
||||
input_tokens = {"prompt_token_ids": [101] * 129}
|
||||
_ = vllm_model.model.generate([input_tokens])
|
||||
outputs = vllm_model.model.generate([input_tokens])
|
||||
assert outputs[0].num_cached_tokens == 128
|
||||
|
||||
|
||||
@pytest.mark.parametrize("max_tokens",
|
||||
[4]) # cannot align results when max_tokens > 4
|
||||
@pytest.mark.parametrize("chunked_prefill_token_size", [2048])
|
||||
def test_chunked_prefill_with_ascend_scheduler(
|
||||
max_tokens: int, chunked_prefill_token_size: int) -> None:
|
||||
example_prompts = [
|
||||
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
|
||||
]
|
||||
max_num_seqs = chunked_prefill_token_size
|
||||
max_num_batched_tokens = chunked_prefill_token_size
|
||||
with VllmRunner(MODEL,
|
||||
additional_config={
|
||||
'ascend_scheduler_config': {
|
||||
'enabled': True,
|
||||
'enable_chunked_prefill': True,
|
||||
},
|
||||
},
|
||||
max_num_seqs=max_num_seqs,
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
max_model_len=2048,
|
||||
gpu_memory_utilization=0.7) as vllm_model:
|
||||
chunked_prefill_output = vllm_model.generate_greedy(
|
||||
example_prompts, max_tokens)
|
||||
|
||||
with VllmRunner(MODEL,
|
||||
additional_config={
|
||||
'ascend_scheduler_config': {
|
||||
'enabled': True,
|
||||
},
|
||||
},
|
||||
max_model_len=2048,
|
||||
gpu_memory_utilization=0.7) as vllm_model:
|
||||
vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=vllm_output,
|
||||
outputs_1_lst=chunked_prefill_output,
|
||||
name_0="vllm_output",
|
||||
name_1="chunked_prefill_output",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("max_tokens",
|
||||
[4]) # cannot align results when max_tokens > 4
|
||||
@pytest.mark.parametrize("chunked_prefill_token_size", [2048])
|
||||
def test_chunked_prefill_with_scheduler_dynamic_batch(
|
||||
max_tokens: int, chunked_prefill_token_size: int) -> None:
|
||||
example_prompts = [
|
||||
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
|
||||
]
|
||||
max_num_seqs = chunked_prefill_token_size
|
||||
max_num_batched_tokens = chunked_prefill_token_size
|
||||
with VllmRunner(MODEL,
|
||||
additional_config={
|
||||
'SLO_limits_for_dynamic_batch': 0,
|
||||
},
|
||||
max_num_seqs=max_num_seqs,
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
max_model_len=2048,
|
||||
gpu_memory_utilization=0.7) as vllm_model:
|
||||
dynamic_batch_output = vllm_model.generate_greedy(
|
||||
example_prompts, max_tokens)
|
||||
|
||||
with VllmRunner(MODEL,
|
||||
additional_config={
|
||||
'SLO_limits_for_dynamic_batch': -1,
|
||||
},
|
||||
max_model_len=2048,
|
||||
gpu_memory_utilization=0.7) as vllm_model:
|
||||
vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=vllm_output,
|
||||
outputs_1_lst=dynamic_batch_output,
|
||||
name_0="vllm_output",
|
||||
name_1="chunked_prefill_output",
|
||||
)
|
||||
|
||||
|
||||
def test_async_scheduling_eager() -> None:
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
] * 10
|
||||
sampling_params = SamplingParams(temperature=0.2,
|
||||
max_tokens=10,
|
||||
stop_token_ids=None)
|
||||
|
||||
with VllmRunner(
|
||||
"Qwen/Qwen2.5-0.5B-Instruct",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=50,
|
||||
dtype="bfloat16",
|
||||
gpu_memory_utilization=0.9,
|
||||
async_scheduling=True,
|
||||
) as vllm_model:
|
||||
vllm_model.generate(prompts, sampling_params=sampling_params)
|
||||
|
||||
|
||||
def test_async_scheduling_with_full_graph() -> None:
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
] * 10
|
||||
sampling_params = SamplingParams(temperature=0.2,
|
||||
max_tokens=10,
|
||||
stop_token_ids=None)
|
||||
|
||||
with VllmRunner("Qwen/Qwen3-8B",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=50,
|
||||
dtype="bfloat16",
|
||||
gpu_memory_utilization=0.9,
|
||||
async_scheduling=True,
|
||||
compilation_config={"cudagraph_mode":
|
||||
"FULL"}) as vllm_model:
|
||||
vllm_model.generate(prompts, sampling_params=sampling_params)
|
||||
@@ -1,82 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
"""
|
||||
Compare the outputs of vLLM with and without aclgraph.
|
||||
|
||||
Run `pytest tests/compile/test_aclgraph.py`.
|
||||
"""
|
||||
import gc
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from vllm import SamplingParams
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
|
||||
MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("max_tokens", [1])
|
||||
def test_models(
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
) -> None:
|
||||
prompts = ["The president of the United States is"]
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
max_tokens=max_tokens,
|
||||
temperature=0.0,
|
||||
)
|
||||
|
||||
with VllmRunner(model,
|
||||
long_prefill_token_threshold=20,
|
||||
enforce_eager=False) as vllm_model:
|
||||
output1 = vllm_model.generate(prompts, sampling_params)
|
||||
|
||||
with VllmRunner(model,
|
||||
enforce_eager=False,
|
||||
additional_config={
|
||||
'ascend_scheduler_config': {
|
||||
'enabled': True
|
||||
},
|
||||
}) as vllm_model:
|
||||
output2 = vllm_model.generate(prompts, sampling_params)
|
||||
|
||||
# Extract the generated token IDs for comparison
|
||||
token_ids1 = output1[0][0][0]
|
||||
token_ids2 = output2[0][0][0]
|
||||
|
||||
print(f"Token IDs 1: {token_ids1}")
|
||||
print(f"Token IDs 2: {token_ids2}")
|
||||
|
||||
# Convert token IDs to tensors and calculate cosine similarity
|
||||
# Take the length of a shorter sequence to ensure consistent dimensions
|
||||
min_len = min(len(token_ids1), len(token_ids2))
|
||||
|
||||
tensor1 = torch.tensor(token_ids1[:min_len], dtype=torch.float32)
|
||||
tensor2 = torch.tensor(token_ids2[:min_len], dtype=torch.float32)
|
||||
|
||||
# Calculate similarity using torch.cosine_similarity
|
||||
similarity = torch.cosine_similarity(tensor1, tensor2, dim=0)
|
||||
print(f"Token IDs cosine similarity: {similarity.item()}")
|
||||
|
||||
assert similarity > 0.95
|
||||
|
||||
gc.collect()
|
||||
torch.npu.empty_cache()
|
||||
torch.npu.reset_peak_memory_stats()
|
||||
@@ -20,7 +20,6 @@
|
||||
|
||||
Run `pytest tests/test_offline_inference.py`.
|
||||
"""
|
||||
import pytest
|
||||
from vllm import SamplingParams
|
||||
from vllm.assets.audio import AudioAsset
|
||||
from vllm.assets.image import ImageAsset
|
||||
@@ -55,40 +54,6 @@ def test_multimodal_vl(prompt_template):
|
||||
assert output_str, "Generated output should not be empty."
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="This e2e test will stuck in multi-batch scenario. "
|
||||
"Add this back after fixing the issue.")
|
||||
def test_multimodal_ascend_scheduler(prompt_template):
|
||||
image = ImageAsset("cherry_blossom") \
|
||||
.pil_image.convert("RGB")
|
||||
img_questions = [
|
||||
"What is the content of this image?",
|
||||
"Describe the content of this image in detail.",
|
||||
"What's in the image?",
|
||||
"Where is this image taken?",
|
||||
]
|
||||
images = [image] * len(img_questions)
|
||||
prompts = prompt_template(img_questions)
|
||||
with VllmRunner("Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
max_model_len=4096,
|
||||
additional_config={
|
||||
'ascend_scheduler_config': {
|
||||
'enabled': True,
|
||||
},
|
||||
},
|
||||
mm_processor_kwargs={
|
||||
"min_pixels": 28 * 28,
|
||||
"max_pixels": 1280 * 28 * 28,
|
||||
"fps": 1,
|
||||
},
|
||||
enforce_eager=True) as vllm_model:
|
||||
outputs = vllm_model.generate_greedy(prompts=prompts,
|
||||
images=images,
|
||||
max_tokens=64)
|
||||
assert len(outputs) == len(prompts)
|
||||
for _, output_str in outputs:
|
||||
assert output_str, "Generated output should not be empty."
|
||||
|
||||
|
||||
def test_multimodal_audio():
|
||||
audio_prompt = "".join([
|
||||
f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
|
||||
|
||||
Reference in New Issue
Block a user