drop ascend scheduler (#4498)

The Ascend scheduler was added earlier for the non-chunked-prefill case, since the NPU ops did not work well with chunked prefill at the time.

Now that the ops work well with chunked prefill, it is time to remove the Ascend scheduler and use the vLLM default scheduler instead.

- vLLM version: v0.11.2

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
wangxiyuan
2025-11-29 16:18:34 +08:00
committed by GitHub
parent 53a52d6614
commit f10acddb78
52 changed files with 85 additions and 2948 deletions

View File

@@ -91,10 +91,8 @@ jobs:
pytest -sv tests/e2e/singlecard/test_completion_with_prompt_embeds.py
pytest -sv tests/e2e/singlecard/test_aclgraph.py
pytest -sv tests/e2e/singlecard/test_aclgraph_mem.py
pytest -sv tests/e2e/singlecard/test_ascend_scheduler.py
pytest -sv tests/e2e/singlecard/test_bge_model.py
pytest -sv tests/e2e/singlecard/test_camem.py
pytest -sv tests/e2e/singlecard/test_chunked.py
pytest -sv tests/e2e/singlecard/test_embedding.py
# pytest -sv tests/e2e/singlecard/test_embedding_aclgraph.py
pytest -sv tests/e2e/singlecard/test_guided_decoding.py

View File

@@ -108,7 +108,7 @@ vllm serve vllm-ascend/DeepSeek-V3.2-Exp-W8A8 \
--trust-remote-code \
--no-enable-prefix-caching \
--gpu-memory-utilization 0.92 \
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
--additional-config '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
```
### Multi-node Deployment
@@ -160,7 +160,7 @@ vllm serve /root/.cache/Modelers_Park/DeepSeek-V3.2-Exp \
--trust-remote-code \
--no-enable-prefix-caching \
--gpu-memory-utilization 0.9 \
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
--additional-config '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
```
**Node 1**
@@ -204,7 +204,7 @@ vllm serve /root/.cache/Modelers_Park/DeepSeek-V3.2-Exp \
--trust-remote-code \
--no-enable-prefix-caching \
--gpu-memory-utilization 0.92 \
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
--additional-config '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
```
::::
@@ -252,7 +252,7 @@ vllm serve vllm-ascend/DeepSeek-V3.2-Exp-W8A8 \
--quantization ascend \
--no-enable-prefix-caching \
--gpu-memory-utilization 0.9 \
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
--additional-config '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
```
**Node 1**
@@ -299,7 +299,7 @@ vllm serve vllm-ascend/DeepSeek-V3.2-Exp-W8A8 \
--quantization ascend \
--no-enable-prefix-caching \
--gpu-memory-utilization 0.92 \
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
--additional-config '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
```
::::

View File

@@ -137,7 +137,7 @@ vllm serve vllm-ascend/DeepSeek-V3.1-W8A8 \
--trust-remote-code \
--no-enable-prefix-caching \
--gpu-memory-utilization 0.9 \
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}'
--additional-config '{"torchair_graph_config":{"enabled":true}}'
```
**Node 1**
@@ -182,7 +182,7 @@ vllm serve vllm-ascend/DeepSeek-V3.1-W8A8 \
--trust-remote-code \
--no-enable-prefix-caching \
--gpu-memory-utilization 0.92 \
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}'
--additional-config '{"torchair_graph_config":{"enabled":true}}'
```
The deployment view looks like:

View File

@@ -93,7 +93,7 @@ vllm serve /home/cache/weights/Kimi-K2-Instruct-W8A8 \
--trust-remote-code \
--no-enable-prefix-caching \
--gpu-memory-utilization 0.9 \
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}'
--additional-config '{"torchair_graph_config":{"enabled":true}}'
```
**Node 1**
@@ -137,7 +137,7 @@ vllm serve /home/cache/weights/Kimi-K2-Instruct-W8A8 \
--trust-remote-code \
--no-enable-prefix-caching \
--gpu-memory-utilization 0.92 \
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}'
--additional-config '{"torchair_graph_config":{"enabled":true}}'
```
The deployment view looks like:

View File

@@ -158,11 +158,6 @@ if __name__ == "__main__":
'torchair_graph_config': {
'enabled': True,
},
'ascend_scheduler_config':{
'enabled': True,
'enable_chunked_prefill' : False,
'chunked_prefill_enabled': False
},
})
outputs = llm.generate(prompts, sampling_params)

View File

@@ -27,7 +27,6 @@ The following table lists additional configuration options available in vLLM Asc
| Name | Type | Default | Description |
|-------------------------------------|------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------|
| `torchair_graph_config` | dict | `{}` | Configuration options for torchair graph mode |
| `ascend_scheduler_config` | dict | `{}` | Configuration options for ascend scheduler |
| `weight_prefetch_config` | dict | `{}` | Configuration options for weight prefetch |
| `refresh` | bool | `false` | Whether to refresh global Ascend configuration content. This is usually used by rlhf or ut/e2e test case. |
| `expert_map_path` | str | `None` | When using expert load balancing for an MoE model, an expert map path needs to be passed in. |
@@ -61,18 +60,6 @@ The details of each configuration option are as follows:
| `enable_kv_nz`| bool | `False` | Whether to enable KV Cache NZ layout. This option only takes effect on models using MLA (for example, DeepSeek). |
| `enable_super_kernel` | bool | `False` | Whether to enable super kernel to fuse operators in deepseek moe layers. This option only takes effects on moe models using dynamic w8a8 quantization.|
**ascend_scheduler_config**
| Name | Type | Default | Description |
| ---- | ---- | ------- | ----------- |
| `enabled` | bool | `False` | Whether to enable ascend scheduler for V1 engine.|
| `enable_pd_transfer` | bool | `False` | Whether to enable P-D transfer. When it is enabled, decode is started only when prefill of all requests is done. This option only takes effect on offline inference. |
| `decode_max_num_seqs` | int | `0` | Whether to change max_num_seqs of decode phase when P-D transfer is enabled. This option only takes effect when enable_pd_transfer is True. |
| `max_long_partial_prefills` | Union[int, float] | `float('inf')` | The maximum number of prompts longer than long_prefill_token_threshold that will be prefilled concurrently. |
| `long_prefill_token_threshold` | Union[int, float] | `float('inf')` | a request is considered long if the prompt is longer than this number of tokens. |
ascend_scheduler_config also supports the options from [vllm scheduler config](https://docs.vllm.ai/en/stable/api/vllm/config.html#vllm.config.SchedulerConfig). For example, you can add `enable_chunked_prefill: True` to ascend_scheduler_config as well.
**weight_prefetch_config**
| Name | Type | Default | Description |
@@ -93,12 +80,6 @@ An example of additional configuration is as follows:
"graph_batch_sizes_init": False,
"enable_kv_nz": False
},
"ascend_scheduler_config": {
"enabled": True,
"enable_chunked_prefill": True,
"max_long_partial_prefills": 1,
"long_prefill_token_threshold": 4096,
},
"weight_prefetch_config": {
"enabled": True,
"prefetch_ratio": {

View File

@@ -45,14 +45,14 @@ import os
from vllm import LLM
# TorchAirGraph only works without chunked-prefill now
model = LLM(model="path/to/DeepSeek-R1-0528", additional_config={"torchair_graph_config": {"enabled": True},"ascend_scheduler_config": {"enabled": True}})
model = LLM(model="path/to/DeepSeek-R1-0528", additional_config={"torchair_graph_config": {"enabled": True}})
outputs = model.generate("Hello, how are you?")
```
Online example:
```shell
vllm serve path/to/DeepSeek-R1-0528 --additional-config='{"torchair_graph_config": {"enabled": true},"ascend_scheduler_config": {"enabled": true}}'
vllm serve path/to/DeepSeek-R1-0528 --additional-config='{"torchair_graph_config": {"enabled": true}}'
```
You can find more details about additional configuration [here](../configuration/additional_config.md).

View File

@@ -42,7 +42,6 @@ if __name__ == "__main__":
enable_chunked_prefill=False,
max_num_batched_tokens=2048,
max_model_len=1024,
additional_config={"ascend_scheduler_config": {"enabled": False}},
max_num_seqs=1,
block_size=128,
gpu_memory_utilization=0.9

View File

@@ -28,4 +28,4 @@ vllm serve Qwen/Qwen1.5-MoE-A2.7B \
--gpu-memory-utilization 0.9 \
--trust-remote-code \
--enforce-eager \
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":false, "use_cached_graph":false}}'
--additional-config '{"torchair_graph_config":{"enabled":false, "use_cached_graph":false}}'

View File

@@ -24,15 +24,12 @@ from tests.e2e.conftest import VllmRunner
MODELS = [
"IntervitensInc/pangu-pro-moe-model",
]
# set additional config for ascend scheduler and torchair graph
# set additional config for torchair graph
ADDITIONAL_CONFIG = [{
"additional_config": {
"torchair_graph_config": {
"enabled": True
},
"ascend_scheduler_config": {
"enabled": True,
}
}
}]

View File

@@ -15,23 +15,14 @@ def test_e2e_ep_correctness(model_name):
max_tokens = 5
# FIXME: Really strange that chunked prefill might lead to different results, investigate further
with VllmRunner(
model_name,
tensor_parallel_size=2,
additional_config={"ascend_scheduler_config": {
"enabled": True
}},
enforce_eager=False) as vllm_model:
with VllmRunner(model_name, tensor_parallel_size=2,
enforce_eager=False) as vllm_model:
tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)
with VllmRunner(
model_name,
tensor_parallel_size=2,
enable_expert_parallel=True,
additional_config={"ascend_scheduler_config": {
"enabled": True
}},
enforce_eager=False) as vllm_model:
with VllmRunner(model_name,
tensor_parallel_size=2,
enable_expert_parallel=True,
enforce_eager=False) as vllm_model:
ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal(

View File

@@ -49,13 +49,7 @@ def test_generate_with_allgather():
tensor_parallel_size=2,
max_model_len=1024,
dtype="auto",
enable_expert_parallel=True,
additional_config={
"ascend_scheduler_config": {
"enabled": True,
"chunked_prefill_enabled": False,
},
}) as vllm_model:
enable_expert_parallel=True) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)
@@ -76,11 +70,5 @@ def test_generate_with_alltoall():
tensor_parallel_size=2,
max_model_len=1024,
dtype="auto",
enable_expert_parallel=True,
additional_config={
"ascend_scheduler_config": {
"enabled": True,
"chunked_prefill_enabled": False,
},
}) as vllm_model:
enable_expert_parallel=True) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)

View File

@@ -82,9 +82,6 @@ def test_models_distributed_DeepSeek_multistream_moe():
"enabled": True,
},
"enable_multistream_moe": True,
"ascend_scheduler_config": {
"enabled": True,
},
"refresh": True,
},
) as vllm_model:
@@ -154,14 +151,9 @@ def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
quantization="ascend",
enforce_eager=True,
enable_expert_parallel=True,
additional_config={
"torchair_graph_config": {
"enabled": False,
},
"ascend_scheduler_config": {
"enabled": True,
}
},
additional_config={"torchair_graph_config": {
"enabled": False,
}},
) as vllm_model:
vllm_model.generate_greedy(prompts, max_tokens)

View File

@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Compare the with and without prefix caching on V1 scheduler or AscendScheduler."""
"""Compare the with and without prefix caching on V1 scheduler."""
import pytest
@@ -84,67 +84,3 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
name_0="vllm_output",
name_1="prefix_cache_output",
)
@pytest.mark.skip(reason="Fix me, the accuracy is not correct")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [50])
def test_prefix_cache_with_ascend_scheduler(model: str,
max_tokens: int) -> None:
with VllmRunner(model,
additional_config={
'ascend_scheduler_config': {
'enabled': True,
},
},
enforce_eager=False,
max_model_len=2048,
tensor_parallel_size=2,
gpu_memory_utilization=0.7) as vllm_model:
vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)
with VllmRunner(model,
additional_config={
'ascend_scheduler_config': {
'enabled': True,
'enable_prefix_caching': True,
},
},
enforce_eager=False,
max_model_len=2048,
tensor_parallel_size=2,
gpu_memory_utilization=0.7) as vllm_model:
prefix_cache_output = vllm_model.generate_greedy(
INPUT_PROMPTS, max_tokens)
# TODO: enable apc and chunked prefill with ascend scheduler will lead accuracy problem.
# Disable it now. Fix it or drop the ascend scheduler in the future.
# with VllmRunner(model,
# additional_config={
# 'ascend_scheduler_config': {
# 'enabled': True,
# 'enable_prefix_caching': True,
# "enable_chunked_prefill": True,
# },
# },
# enforce_eager=True,
# max_model_len=2048,
# tensor_parallel_size=2,
# gpu_memory_utilization=0.7) as vllm_model:
# chunk_prefill_prefix_cache_output = vllm_model.generate_greedy(
# INPUT_PROMPTS, max_tokens)
check_outputs_equal(
outputs_0_lst=vllm_output,
outputs_1_lst=prefix_cache_output,
name_0="vllm_output",
name_1="prefix_cache_output",
)
# check_outputs_equal(
# outputs_0_lst=chunk_prefill_prefix_cache_output,
# outputs_1_lst=prefix_cache_output,
# name_0="chunk_prefill_prefix_cache_output",
# name_1="prefix_cache_output",
# )

View File

@@ -24,6 +24,7 @@ Run `pytest tests/e2e/multicard/test_qwen3_next.py`.
import os
from unittest.mock import patch
import pytest
from modelscope import snapshot_download # type: ignore
from tests.e2e.conftest import VllmRunner
@@ -63,6 +64,8 @@ def test_models_distributed_Qwen3_NEXT_TP4_FULL_DECODE_ONLY():
del vllm_model
@pytest.mark.skip(
reason="Qwen3-Next + MTP doesn't work with chunked prefill. Fix Me")
def test_models_distributed_Qwen3_NEXT_MTP_TP4_SIMILARITY():
example_prompts = [
"Hello, my name is",
@@ -89,12 +92,6 @@ def test_models_distributed_Qwen3_NEXT_MTP_TP4_SIMILARITY():
gpu_memory_utilization=0.8,
distributed_executor_backend="mp",
enforce_eager=True,
additional_config={
"ascend_scheduler_config": {
"enabled": True,
"enable_chunked_prefill": False
}
},
speculative_config={
"method": "qwen3_next_mtp",
"num_speculative_tokens": 1

View File

@@ -44,9 +44,6 @@ def _deepseek_torchair_test_fixture(
kwargs = {}
if not use_v1_schduler:
kwargs = {
"ascend_scheduler_config": {
"enabled": True,
},
"refresh": True,
}
additional_config.update(**kwargs)
@@ -120,9 +117,6 @@ def _pangu_torchair_test_fixture(
# torchair is only work without chunked-prefill now
kwargs = {
"ascend_scheduler_config": {
"enabled": True,
},
"refresh": True,
}
additional_config.update(**kwargs)
@@ -185,9 +179,6 @@ def _qwen_torchair_test_fixture(
"torchair_graph_config": {
"enabled": False,
},
"ascend_scheduler_config": {
"enabled": True,
},
"refresh": True,
}
@@ -244,9 +235,6 @@ def _deepseek_v2_lite_torchair_test_fixure(
kwargs = {}
if not use_v1_schduler:
kwargs = {
"ascend_scheduler_config": {
"enable": True,
},
"refresh": True,
}
additional_config.update(**kwargs)

View File

@@ -73,11 +73,7 @@ async def test_models(model: str, mode: str) -> None:
"VLLM_RPC_TIMEOUT": "3600000",
"VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": "3600000"
}
additional_config: dict[str, Any] = {
"ascend_scheduler_config": {
"enabled": False
},
}
additional_config: dict[str, Any] = {}
speculative_config = {
"num_speculative_tokens": 2,
"method": "deepseek_mtp"

View File

@@ -74,9 +74,6 @@ async def test_models(model: str) -> None:
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
}
additional_config = {
"ascend_scheduler_config": {
"enabled": False
},
"torchair_graph_config": {
"enabled": True,
"enable_multistream_moe": False,

View File

@@ -68,12 +68,7 @@ aisbench_cases75 = [{
async def test_models(model: str) -> None:
port = get_open_port()
env_dict = {"TASK_QUEUE_ENABLE": "1", "HCCL_OP_EXPANSION_MODE": "AIV"}
additional_config = {
"ascend_scheduler_config": {
"enabled": False
},
"enable_weight_nz_layout": True
}
additional_config = {"enable_weight_nz_layout": True}
server_args = [
"--quantization", "ascend", "--reasoning-parser", "qwen3",
"--tensor-parallel-size", "4", "--port",

View File

@@ -83,8 +83,7 @@ async def test_models(model: str, tp_size: int) -> None:
"0.9", "--block-size", "128", "--max-num-seqs", "256",
"--enforce-eager", "--max-model-len", "35840",
"--max-num-batched-tokens", "35840", "--additional-config",
'{"ascend_scheduler_config":{"enabled":true},"enable_weight_nz_layout":true}',
"--compilation-config",
'{"enable_weight_nz_layout":true}', "--compilation-config",
'{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[1,8,24,48,60]}'
]
with RemoteOpenAIServer(model,

View File

@@ -33,7 +33,6 @@ MODES = [
"single",
"aclgraph",
"aclgraph_mlapo",
"no_chunkprefill",
]
prompts = [
@@ -82,9 +81,6 @@ async def test_models(model: str, mode: str) -> None:
"method": "deepseek_mtp"
}
additional_config = {
"ascend_scheduler_config": {
"enabled": False
},
"torchair_graph_config": {
"enabled": True,
"enable_multistream_moe": False,
@@ -112,10 +108,6 @@ async def test_models(model: str, mode: str) -> None:
if mode == "aclgraph_mlapo":
env_dict["VLLM_ASCEND_ENABLE_MLAPO"] = "1"
additional_config["torchair_graph_config"] = {"enabled": False}
if mode == "no_chunkprefill":
additional_config["ascend_scheduler_config"] = {"enabled": True}
i = server_args.index("--max-num-batched-tokens") + 1
server_args[i] = "36864"
server_args.extend(["--additional-config", json.dumps(additional_config)])
request_keyword_args: dict[str, Any] = {
**api_keyword_args,

View File

@@ -71,9 +71,6 @@ async def test_models(model: str) -> None:
"cudagraph_mode": "FULL_DECODE_ONLY"
}
additional_config: dict[str, Any] = {
"ascend_scheduler_config": {
"enabled": False
},
"torchair_graph_config": {
"enabled": True
},

View File

@@ -92,7 +92,6 @@ async def test_models(model: str, tp_size: int, dp_size: int,
"--gpu-memory-utilization",
"0.9",
"--additional-config",
'{"ascend_scheduler_config":{"enabled":true},'
'"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}',
]
if full_graph:

View File

@@ -85,9 +85,8 @@ async def test_models(model: str, tp_size: int) -> None:
str(tp_size), "--port",
str(port), "--max-model-len", "30000", "--max-num-batched-tokens",
"40000", "--max-num-seqs", "400", "--trust-remote-code",
"--gpu-memory-utilization", "0.8", "--additional-config",
'{"ascend_scheduler_config":{"enabled":false}}',
"--compilation_config", '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
"--gpu-memory-utilization", "0.8", "--compilation_config",
'{"cudagraph_mode": "FULL_DECODE_ONLY"}'
]
request_keyword_args: dict[str, Any] = {
**api_keyword_args,

View File

@@ -60,11 +60,7 @@ async def test_models(model: str) -> None:
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"
}
additional_config: dict[str, Any] = {
"ascend_scheduler_config": {
"enabled": False
},
}
additional_config: dict[str, Any] = {}
compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"}
server_args = [
"--quantization", "ascend", "--async-scheduling",

View File

@@ -63,11 +63,6 @@ async def test_models(model: str, mode: str) -> None:
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"
}
additional_config: dict[str, Any] = {
"ascend_scheduler_config": {
"enabled": False
},
}
compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"}
server_args = [
"--quantization", "ascend", "--async-scheduling",
@@ -82,7 +77,6 @@ async def test_models(model: str, mode: str) -> None:
server_args.extend(
["--compilation-config",
json.dumps(compilation_config)])
server_args.extend(["--additional-config", json.dumps(additional_config)])
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
}

View File

@@ -93,8 +93,6 @@ async def test_models(model: str, mode: str, tp_size: int) -> None:
server_args.remove(
'{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 8, 24, 48, 60]}'
)
server_args.append("--additional-config")
server_args.append('{"ascend_scheduler_config":{"enabled":true}}')
server_args.append("--enforce-eager")
request_keyword_args: dict[str, Any] = {
**api_keyword_args,

View File

@@ -30,7 +30,7 @@ deployment:
--quantization ascend
--gpu-memory-utilization 0.9
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
--additional-config '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
--additional-config '{"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
-
server_cmd: >
@@ -51,7 +51,7 @@ deployment:
--quantization ascend
--gpu-memory-utilization 0.9
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
--additional-config '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
--additional-config '{"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
benchmarks:
acc:
case_type: accuracy

View File

@@ -31,7 +31,7 @@ deployment:
--gpu-memory-utilization 0.9
--enforce-eager
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
--additional-config '{"ascend_scheduler_config":{"enabled":false},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
--additional-config '{"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
-
server_cmd: >
@@ -53,5 +53,5 @@ deployment:
--gpu-memory-utilization 0.9
--enforce-eager
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
--additional-config '{"ascend_scheduler_config":{"enabled":false},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
--additional-config '{"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
benchmarks:

View File

@@ -50,7 +50,7 @@ deployment:
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}'
--additional-config
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
'{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
-
server_cmd: >
@@ -80,7 +80,7 @@ deployment:
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}'
--additional-config
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
'{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
-
server_cmd: >
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
@@ -111,7 +111,7 @@ deployment:
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}'
--additional-config
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
'{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
-
server_cmd: >
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
@@ -141,7 +141,7 @@ deployment:
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}'
--additional-config
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
'{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
benchmarks:
perf:
case_type: performance

View File

@@ -49,7 +49,7 @@ deployment:
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}'
--additional-config
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
'{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
-
server_cmd: >
@@ -79,7 +79,7 @@ deployment:
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}'
--additional-config
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
'{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
-
server_cmd: >
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
@@ -110,7 +110,7 @@ deployment:
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}'
--additional-config
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
'{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
-
server_cmd: >
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
@@ -140,7 +140,7 @@ deployment:
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}'
--additional-config
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
'{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
benchmarks:
perf:
case_type: performance

View File

@@ -29,7 +29,7 @@ deployment:
--trust-remote-code
--no-enable-prefix-caching
--gpu-memory-utilization 0.9
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
--additional-config '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
-
server_cmd: >
@@ -49,5 +49,5 @@ deployment:
--trust-remote-code
--no-enable-prefix-caching
--gpu-memory-utilization 0.92
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
--additional-config '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
benchmarks:

View File

@@ -48,27 +48,26 @@ def mtp_correctness(sampling_config: SamplingParams,
if graph_mode == CUDAGraphMode.FULL:
graph_mode_str = "FULL_DECODE_ONLY"
with VllmRunner(
model_name,
tensor_parallel_size=1,
max_num_seqs=256,
gpu_memory_utilization=0.7,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method": "deepseek_mtp",
"num_speculative_tokens": num_speculative_tokens,
"disable_padded_drafter_batch": disable_padded_drafter_batch,
},
enforce_eager=enforce_eager,
max_model_len=2000,
compilation_config=CompilationConfig(
cudagraph_mode=graph_mode_str,
cudagraph_capture_sizes=[12],
),
additional_config={"ascend_scheduler_config": {
"enabled": False
}}) as spec_llm:
with VllmRunner(model_name,
tensor_parallel_size=1,
max_num_seqs=256,
gpu_memory_utilization=0.7,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method":
"deepseek_mtp",
"num_speculative_tokens":
num_speculative_tokens,
"disable_padded_drafter_batch":
disable_padded_drafter_batch,
},
enforce_eager=enforce_eager,
max_model_len=2000,
compilation_config=CompilationConfig(
cudagraph_mode=graph_mode_str,
cudagraph_capture_sizes=[12],
)) as spec_llm:
spec_outputs = spec_llm.generate(example_prompts, sampling_config)
matches = 0

View File

@@ -1,170 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm import SamplingParams
from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal
MODEL = "Qwen/Qwen3-0.6B"
@pytest.mark.parametrize("enforce_eager", [True, False])
def test_concurrent_partial_prefill(enforce_eager):
with VllmRunner(MODEL,
additional_config={
'ascend_scheduler_config': {
'enabled': True,
},
},
max_num_seqs=3,
max_num_batched_tokens=8192,
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7) as vllm_model:
outputs = vllm_model.model.generate(["Hello my name is Robert and I"] *
3)
assert len(outputs) == 3
for output in outputs:
assert len(output.outputs) == 1
@pytest.mark.parametrize("enforce_eager", [True, False])
def test_prefix_cache_stats_is_recorded(enforce_eager):
with VllmRunner(MODEL,
additional_config={
'ascend_scheduler_config': {
'enabled': True,
},
},
max_num_seqs=3,
max_num_batched_tokens=8192,
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7) as vllm_model:
# 17 tokens will make sure first 16 tokens are cached in a block
input_tokens = {"prompt_token_ids": [101] * 129}
_ = vllm_model.model.generate([input_tokens])
outputs = vllm_model.model.generate([input_tokens])
assert outputs[0].num_cached_tokens == 128
@pytest.mark.parametrize("max_tokens",
[4]) # cannot align results when max_tokens > 4
@pytest.mark.parametrize("chunked_prefill_token_size", [2048])
def test_chunked_prefill_with_ascend_scheduler(
max_tokens: int, chunked_prefill_token_size: int) -> None:
example_prompts = [
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
]
max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size
with VllmRunner(MODEL,
additional_config={
'ascend_scheduler_config': {
'enabled': True,
'enable_chunked_prefill': True,
},
},
max_num_seqs=max_num_seqs,
max_num_batched_tokens=max_num_batched_tokens,
max_model_len=2048,
gpu_memory_utilization=0.7) as vllm_model:
chunked_prefill_output = vllm_model.generate_greedy(
example_prompts, max_tokens)
with VllmRunner(MODEL,
additional_config={
'ascend_scheduler_config': {
'enabled': True,
},
},
max_model_len=2048,
gpu_memory_utilization=0.7) as vllm_model:
vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal(
outputs_0_lst=vllm_output,
outputs_1_lst=chunked_prefill_output,
name_0="vllm_output",
name_1="chunked_prefill_output",
)
@pytest.mark.parametrize("max_tokens",
                         [4])  # cannot align results when max_tokens > 4
@pytest.mark.parametrize("chunked_prefill_token_size", [2048])
def test_chunked_prefill_with_scheduler_dynamic_batch(
        max_tokens: int, chunked_prefill_token_size: int) -> None:
    """Greedy outputs with SLO-driven dynamic batching enabled must match
    the outputs produced with dynamic batching disabled.

    NOTE(review): ``SLO_limits_for_dynamic_batch`` of 0 appears to enable
    the feature and -1 to disable it -- confirm against the config docs.
    """
    example_prompts = [
        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
    ]
    max_num_seqs = chunked_prefill_token_size
    max_num_batched_tokens = chunked_prefill_token_size
    with VllmRunner(MODEL,
                    additional_config={
                        'SLO_limits_for_dynamic_batch': 0,
                    },
                    max_num_seqs=max_num_seqs,
                    max_num_batched_tokens=max_num_batched_tokens,
                    max_model_len=2048,
                    gpu_memory_utilization=0.7) as vllm_model:
        dynamic_batch_output = vllm_model.generate_greedy(
            example_prompts, max_tokens)
    with VllmRunner(MODEL,
                    additional_config={
                        'SLO_limits_for_dynamic_batch': -1,
                    },
                    max_model_len=2048,
                    gpu_memory_utilization=0.7) as vllm_model:
        vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
    check_outputs_equal(
        outputs_0_lst=vllm_output,
        outputs_1_lst=dynamic_batch_output,
        name_0="vllm_output",
        # Fix: this label previously read "chunked_prefill_output", a
        # copy-paste from the chunked-prefill test; the list compared here
        # is the dynamic-batch output, so failure messages were misleading.
        name_1="dynamic_batch_output",
    )
def test_async_scheduling_eager() -> None:
    """Smoke test: async scheduling completes generation in eager mode."""
    base_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    prompts = base_prompts * 10
    params = SamplingParams(temperature=0.2,
                            max_tokens=10,
                            stop_token_ids=None)
    runner = VllmRunner(
        "Qwen/Qwen2.5-0.5B-Instruct",
        max_model_len=4096,
        max_num_seqs=50,
        dtype="bfloat16",
        gpu_memory_utilization=0.9,
        async_scheduling=True,
    )
    with runner as vllm_model:
        vllm_model.generate(prompts, sampling_params=params)
def test_async_scheduling_with_full_graph() -> None:
    """Smoke test: async scheduling works with FULL cudagraph mode."""
    base_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    prompts = base_prompts * 10
    params = SamplingParams(temperature=0.2,
                            max_tokens=10,
                            stop_token_ids=None)
    graph_cfg = {"cudagraph_mode": "FULL"}
    with VllmRunner("Qwen/Qwen3-8B",
                    max_model_len=4096,
                    max_num_seqs=50,
                    dtype="bfloat16",
                    gpu_memory_utilization=0.9,
                    async_scheduling=True,
                    compilation_config=graph_cfg) as vllm_model:
        vllm_model.generate(prompts, sampling_params=params)

View File

@@ -1,82 +0,0 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Compare the outputs of vLLM with and without aclgraph.
Run `pytest tests/compile/test_aclgraph.py`.
"""
import gc
import pytest
import torch
from vllm import SamplingParams
from tests.e2e.conftest import VllmRunner
MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [1])
def test_models(
    model: str,
    max_tokens: int,
) -> None:
    """Compare greedy tokens from the default scheduler (with a
    long-prefill threshold) against the ascend scheduler; the two runs
    should produce near-identical token IDs."""
    prompts = ["The president of the United States is"]
    sampling_params = SamplingParams(
        max_tokens=max_tokens,
        temperature=0.0,
    )

    with VllmRunner(model,
                    long_prefill_token_threshold=20,
                    enforce_eager=False) as vllm_model:
        output1 = vllm_model.generate(prompts, sampling_params)

    ascend_cfg = {
        'ascend_scheduler_config': {
            'enabled': True
        },
    }
    with VllmRunner(model,
                    enforce_eager=False,
                    additional_config=ascend_cfg) as vllm_model:
        output2 = vllm_model.generate(prompts, sampling_params)

    # Extract the generated token IDs for comparison.
    token_ids1 = output1[0][0][0]
    token_ids2 = output2[0][0][0]
    print(f"Token IDs 1: {token_ids1}")
    print(f"Token IDs 2: {token_ids2}")

    # Truncate to the shorter sequence so both tensors share a shape,
    # then compare via cosine similarity of the raw token-ID vectors.
    shared_len = min(len(token_ids1), len(token_ids2))
    vec1 = torch.tensor(token_ids1[:shared_len], dtype=torch.float32)
    vec2 = torch.tensor(token_ids2[:shared_len], dtype=torch.float32)
    similarity = torch.cosine_similarity(vec1, vec2, dim=0)
    print(f"Token IDs cosine similarity: {similarity.item()}")
    assert similarity > 0.95

    gc.collect()
    torch.npu.empty_cache()
    torch.npu.reset_peak_memory_stats()

View File

@@ -20,7 +20,6 @@
Run `pytest tests/test_offline_inference.py`.
"""
import pytest
from vllm import SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
@@ -55,40 +54,6 @@ def test_multimodal_vl(prompt_template):
assert output_str, "Generated output should not be empty."
@pytest.mark.skip(reason="This e2e test will stuck in multi-batch scenario. "
                  "Add this back after fixing the issue.")
def test_multimodal_ascend_scheduler(prompt_template):
    """End-to-end VL generation with the ascend scheduler enabled.

    Currently skipped -- see the skip reason above.
    """
    # One RGB test image, repeated once per question.
    image = ImageAsset("cherry_blossom") \
        .pil_image.convert("RGB")
    img_questions = [
        "What is the content of this image?",
        "Describe the content of this image in detail.",
        "What's in the image?",
        "Where is this image taken?",
    ]
    images = [image] * len(img_questions)
    prompts = prompt_template(img_questions)
    with VllmRunner("Qwen/Qwen2.5-VL-3B-Instruct",
                    max_model_len=4096,
                    additional_config={
                        'ascend_scheduler_config': {
                            'enabled': True,
                        },
                    },
                    # Bound the visual token count per image / video frame.
                    mm_processor_kwargs={
                        "min_pixels": 28 * 28,
                        "max_pixels": 1280 * 28 * 28,
                        "fps": 1,
                    },
                    enforce_eager=True) as vllm_model:
        outputs = vllm_model.generate_greedy(prompts=prompts,
                                             images=images,
                                             max_tokens=64)
    # Every prompt should yield a non-empty generation.
    assert len(outputs) == len(prompts)
    for _, output_str in outputs:
        assert output_str, "Generated output should not be empty."
def test_multimodal_audio():
audio_prompt = "".join([
f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"

View File

@@ -1,134 +0,0 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from vllm.config import SchedulerConfig
from tests.ut.base import TestBase
from vllm_ascend.core.schedule_config import AscendSchedulerConfig
class TestAscendSchedulerConfig(TestBase):
    """Unit tests for AscendSchedulerConfig.initialize_from_config."""

    def setUp(self):
        # Baseline vLLM scheduler config reused by every test case.
        self.basic_scheduler_config = SchedulerConfig(
            max_num_batched_tokens=8192,
            max_model_len=8192,
            is_multimodal_model=False,
            send_delta_data=False,
        )

    def test_initialize_from_config_with_default(self):
        # No additional config given, check the default value here.
        ascend_config = AscendSchedulerConfig.initialize_from_config(
            self.basic_scheduler_config, {})
        self.assertEqual(ascend_config.enable_chunked_prefill, False)
        self.assertEqual(ascend_config.policy, "fcfs")
        self.assertEqual(ascend_config.scheduler_cls,
                         "vllm_ascend.core.scheduler.AscendScheduler")
        self.assertEqual(ascend_config.max_num_encoder_input_tokens, 8192)
        self.assertEqual(ascend_config.encoder_cache_size, 8192)

    def test_initialize_from_config_with_override(self):
        # test override
        ascend_config = AscendSchedulerConfig.initialize_from_config(
            self.basic_scheduler_config,
            AscendSchedulerConfig(
                enable_chunked_prefill=False,
                policy="fcfs",
                scheduler_cls="vllm_ascend.core.scheduler.AscendScheduler",
                max_num_batched_tokens=8192,
                max_model_len=2048,
                max_long_partial_prefills=1,
                long_prefill_token_threshold=512,
            ),
        )
        self.assertEqual(ascend_config.enable_chunked_prefill, False)
        self.assertEqual(ascend_config.policy, "fcfs")
        self.assertEqual(ascend_config.scheduler_cls,
                         "vllm_ascend.core.scheduler.AscendScheduler")
        self.assertEqual(ascend_config.max_num_batched_tokens, 8192)
        self.assertEqual(ascend_config.encoder_cache_size, 8192)
        self.assertEqual(ascend_config.max_long_partial_prefills, 1)
        self.assertEqual(ascend_config.long_prefill_token_threshold, 512)

    def test_not_implemented_policy(self):
        # Only FCFS scheduling is supported; anything else must raise.
        with self.assertRaises(NotImplementedError) as context:
            AscendSchedulerConfig.initialize_from_config(
                self.basic_scheduler_config,
                AscendSchedulerConfig(
                    policy="custom_policy",
                    max_num_batched_tokens=8192,
                    max_model_len=2048,
                ),
            )
        self.assertIn(
            "currently AscendScheduler only supports fcfs policy",
            str(context.exception),
        )

    def test_no_override(self):
        # Encoder budgets default to max_num_batched_tokens.
        ascend_config = AscendSchedulerConfig.initialize_from_config(
            self.basic_scheduler_config, {})
        self.assertEqual(ascend_config.max_num_encoder_input_tokens, 8192)
        self.assertEqual(ascend_config.encoder_cache_size, 8192)

    def test_valid_config_with_multimodal(self):
        # Multimodal flag from the base config must be carried over.
        config = AscendSchedulerConfig.initialize_from_config(
            SchedulerConfig(is_multimodal_model=True,
                            max_num_batched_tokens=8192), {})
        self.assertTrue(config.is_multimodal_model)

    def test_valid_config_with_chunked_prefill(self):
        # Explicitly enabling chunked prefill must be honored.
        ascend_config = AscendSchedulerConfig.initialize_from_config(
            self.basic_scheduler_config,
            AscendSchedulerConfig(
                enable_chunked_prefill=True,
                max_num_batched_tokens=8192,
                max_model_len=8192,
            ),
        )
        self.assertEqual(ascend_config.max_num_batched_tokens, 8192)
        self.assertEqual(ascend_config.max_model_len, 8192)
        self.assertTrue(ascend_config.enable_chunked_prefill)

    def test_invalid_config_without_chunked_prefill(self):
        # Without chunked prefill, max_num_batched_tokens must cover
        # max_model_len; otherwise initialization raises.
        with self.assertRaises(ValueError) as context:
            AscendSchedulerConfig.initialize_from_config(
                self.basic_scheduler_config,
                AscendSchedulerConfig(
                    enable_chunked_prefill=False,
                    max_num_batched_tokens=2048,
                    max_model_len=8192,
                ),
            )
        self.assertIn(
            "Ascend scheduler is enabled without chunked prefill feature",
            str(context.exception),
        )
        self.assertIn("max_num_batched_tokens (2048)", str(context.exception))
        self.assertIn("max_model_len (8192)", str(context.exception))

    def test_initialize_from_config_with_pd_transfer(self):
        # PD-transfer knobs must pass through the override path.
        ascend_config = AscendSchedulerConfig.initialize_from_config(
            self.basic_scheduler_config,
            AscendSchedulerConfig(
                enable_pd_transfer=True,
                decode_max_num_seqs=48,
                max_num_batched_tokens=8192,
                max_model_len=4096,
            ),
        )
        self.assertEqual(ascend_config.enable_pd_transfer, True)
        self.assertEqual(ascend_config.decode_max_num_seqs, 48)

File diff suppressed because it is too large Load Diff

View File

@@ -99,7 +99,6 @@ class TestAscendRowParallelLinear(BaseLinearTest):
ascend_config._ASCEND_CONFIG = MagicMock()
ascend_config._ASCEND_CONFIG.oproj_tensor_parallel_size = 2
ascend_config._ASCEND_CONFIG.ascend_scheduler_config.enabled = False
linear = AscendRowParallelLinear(
input_size=16,

View File

@@ -209,12 +209,7 @@ class TestAscendLogitsProcessor(unittest.TestCase):
return_value=torch.randn(1, self.vocab_size)),
patch(
"vllm_ascend.ops.vocab_parallel_embedding.get_lmhead_tp_group.all_gather",
return_value=torch.randn(1, self.vocab_size)),
patch(
"vllm_ascend.core.schedule_config.AscendSchedulerConfig.initialize_from_config",
return_value=MagicMock(max_num_batched_tokens=1000,
max_model_len=512,
enable_chunked_prefill=False))
return_value=torch.randn(1, self.vocab_size))
]
for p in self.patches:

View File

@@ -33,13 +33,6 @@ class TestAscendW8A8FusedMoEMethod(TestBase):
mock_get_ep_group.return_value = mock_ep_group
mock_ascend_config = Mock()
# Create a Mock object with concrete attributes to represent ascend_scheduler_config
mock_ascend_scheduler_config = Mock()
mock_ascend_scheduler_config.enabled = False
mock_ascend_scheduler_config.max_num_batched_tokens = 1024
mock_ascend_scheduler_config.max_model_len = 2048
mock_ascend_config.ascend_scheduler_config = mock_ascend_scheduler_config
mock_ascend_config.torchair_graph_config = Mock(enabled=False)
mock_ascend_config.enable_chunked_prefill = False
mock_get_ascend_config.return_value = mock_ascend_config

View File

@@ -56,9 +56,6 @@ class TestAscendConfig(TestBase):
self.assertTrue(torchair_graph_config.enable_frozen_parameter)
self.assertFalse(torchair_graph_config.enable_kv_nz)
ascend_scheduler_config = ascend_config.ascend_scheduler_config
self.assertFalse(ascend_scheduler_config.enabled)
@_clean_up_ascend_config
def test_init_ascend_config_with_additional_config(self):
test_vllm_config = VllmConfig()
@@ -74,9 +71,6 @@ class TestAscendConfig(TestBase):
"enable_kv_nz": True
},
"multistream_overlap_shared_expert": True,
"ascend_scheduler_config": {
"enabled": True
},
"expert_map_path": "test_expert_map_path",
"refresh": True,
}
@@ -94,9 +88,6 @@ class TestAscendConfig(TestBase):
self.assertTrue(torchair_graph_config.enable_frozen_parameter)
self.assertTrue(torchair_graph_config.enable_kv_nz)
ascend_scheduler_config = ascend_config.ascend_scheduler_config
self.assertTrue(ascend_scheduler_config.enabled)
@_clean_up_ascend_config
def test_init_ascend_config_with_refresh(self):
test_vllm_config = VllmConfig()

View File

@@ -32,7 +32,6 @@ class TestNPUPlatform(TestBase):
def mock_vllm_ascend_config():
mock_ascend_config = MagicMock()
mock_ascend_config.torchair_graph_config.enabled = False
mock_ascend_config.ascend_scheduler_config.enabled = False
mock_ascend_config.enable_shared_expert_dp = False
return mock_ascend_config
@@ -522,31 +521,6 @@ class TestNPUPlatform(TestBase):
self.platform.check_and_update_config(vllm_config)
self.assertEqual(vllm_config.compilation_config.custom_ops, [])
@patch('vllm_ascend.utils.get_ascend_device_type',
return_value=AscendDeviceType._910_93)
@patch("vllm_ascend.ascend_config.check_ascend_config")
@patch("vllm_ascend.ascend_config.init_ascend_config")
@patch(
"vllm_ascend.core.recompute_schedule_config.RecomputeSchedulerConfig.initialize_from_config"
)
def test_check_and_update_config_ascend_scheduler_config(
self, mock_init_recompute, mock_init_ascend, mock_check_ascend,
mock_soc_version):
mock_ascend_config = TestNPUPlatform.mock_vllm_ascend_config()
mock_ascend_config.ascend_scheduler_config.enabled = True
mock_init_ascend.return_value = mock_ascend_config
vllm_config = TestNPUPlatform.mock_vllm_config()
vllm_config.parallel_config.tensor_parallel_size = 1
mock_init_recompute.return_value = MagicMock()
with patch("vllm_ascend.core.schedule_config.AscendSchedulerConfig"
) as mock_scheduler:
from vllm_ascend import platform
importlib.reload(platform)
self.platform.check_and_update_config(vllm_config)
mock_scheduler.initialize_from_config.assert_called_once()
@patch('vllm_ascend.platform.get_ascend_config')
def test_get_attn_backend_cls_use_v1_and_mla(self, mock_get_ascend_config):
mock_config = MagicMock()

View File

@@ -253,12 +253,10 @@ class TestUtils(TestBase):
model_path = os.path.join(os.path.dirname(__file__), "fake_weight")
test_model_config = ModelConfig(model=model_path, enforce_eager=True)
test_parallel_config = ParallelConfig()
ascend_config = {"ascend_scheduler_config": {"enabled": False}}
test_vllm_config = VllmConfig(
model_config=test_model_config,
compilation_config=test_compilation_config,
parallel_config=test_parallel_config,
additional_config=ascend_config)
parallel_config=test_parallel_config)
utils.update_aclgraph_sizes(test_vllm_config)
os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV'
utils.update_aclgraph_sizes(test_vllm_config)

View File

@@ -235,8 +235,6 @@ def test_torchair_deepseek_v2_mlp(mock_distributed, base_config):
hidden_act="silu",
quant_config=None)
assert isinstance(mlp.act_fn, TorchairDeepseekV2SiluAndMul)
ascend_config = MagicMock()
ascend_config._ASCEND_CONFIG.ascend_scheduler_config.enabled = False
with patch(
"vllm_ascend.torchair.models.torchair_deepseek_v2.QuantizationConfig"
) as mock_quant_config:

View File

@@ -39,11 +39,6 @@ class AscendConfig:
self.torchair_graph_config = TorchairGraphConfig(
torchair_graph_config, vllm_config, additional_config)
ascend_scheduler_config = additional_config.get(
"ascend_scheduler_config", {})
self.ascend_scheduler_config = AscendSchedulerConfig(
ascend_scheduler_config)
# Dump / PrecisionDebugger configuration
dump_config_path = additional_config.get("dump_config", None)
self.dump_config = DumpConfig(dump_config_path)
@@ -220,20 +215,6 @@ class TorchairGraphConfig:
)
class AscendSchedulerConfig:
"""
Configuration Object for ascend_scheduler_config from additional_config
"""
def __init__(self, ascend_scheduler_config: dict):
self.enabled = ascend_scheduler_config.get("enabled", False)
# Ascend scheduler is based on vllm v0 scheduler, so we should support
# all vllm v0 scheduler configs as well.
for k, v in ascend_scheduler_config.items():
if not hasattr(self, k):
setattr(self, k, v)
class DumpConfig:
"""
Configuration object for dump/PrecisionDebugger settings.

View File

@@ -1,105 +0,0 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from dataclasses import dataclass, fields
from typing import Type, Union
from vllm.config import SchedulerConfig
MAX_INT = 2147483647
@dataclass
class AscendSchedulerConfig(SchedulerConfig):
    """SchedulerConfig variant used when the Ascend scheduler is enabled.

    Defaults differ from vLLM's SchedulerConfig: chunked prefill is off,
    the policy is fixed to FCFS, and scheduler_cls points at
    AscendScheduler.
    """

    # Chunked prefill is disabled by default for the Ascend scheduler.
    enable_chunked_prefill: bool = False
    # Number of concurrent long partial prefills; 1 means the feature is
    # effectively not enabled.
    max_long_partial_prefills: int = 1
    # Prompts at or above this token count are treated as "long".
    long_prefill_token_threshold: int = MAX_INT
    # Only "fcfs" is supported (enforced in __post_init__).
    policy: str = "fcfs"
    scheduler_cls: Union[str, Type[object]] = (
        "vllm_ascend.core.scheduler.AscendScheduler")
    # PD-transfer mode: prefill all requests first, then switch to decode.
    enable_pd_transfer: bool = False
    decode_max_num_seqs: int = 0

    @classmethod
    def initialize_from_config(
        cls,
        vllm_scheduler_config: SchedulerConfig,
        ascend_scheduler_config,
    ):
        """Build an AscendSchedulerConfig from a vLLM SchedulerConfig.

        Copies every init field of ``vllm_scheduler_config``, resets the
        ascend-specific fields to their defaults, then overrides any field
        for which ``ascend_scheduler_config`` has a matching attribute.
        """
        scheduler_config = {
            field.name: getattr(vllm_scheduler_config, field.name)
            for field in fields(vllm_scheduler_config) if field.init
        }
        # Override default values into original SchedulerConfig
        scheduler_config["enable_chunked_prefill"] = False
        # None marks "not set by the user" -- __post_init__ fills in the
        # real defaults for these two fields.
        scheduler_config["max_long_partial_prefills"] = None
        scheduler_config["long_prefill_token_threshold"] = None
        scheduler_config["policy"] = "fcfs"
        scheduler_config["scheduler_cls"] = (
            "vllm_ascend.core.scheduler.AscendScheduler")
        scheduler_config["enable_pd_transfer"] = False
        scheduler_config["decode_max_num_seqs"] = 0
        # Override params in original SchedulerConfig with params in ascend_scheduler_config
        for k, _ in scheduler_config.items():
            if hasattr(ascend_scheduler_config, k):
                scheduler_config[k] = getattr(ascend_scheduler_config, k)
        return cls(**scheduler_config)

    def __post_init__(self, *args) -> None:
        """Validate the config and normalize derived fields.

        Raises:
            ValueError: budget/threshold constraints are violated.
            NotImplementedError: unsupported policy or delay factor.
        """
        # Encoder budgets mirror the decoder token budget.
        self.max_num_encoder_input_tokens = self.max_num_batched_tokens
        self.encoder_cache_size = self.max_num_batched_tokens
        self.chunked_prefill_enabled = self.enable_chunked_prefill
        # Without chunked prefill a prompt must fit in one batch, so the
        # token budget must cover the full model length.
        if (self.max_num_batched_tokens < self.max_model_len
                and not self.chunked_prefill_enabled):
            raise ValueError(
                "Ascend scheduler is enabled without chunked prefill feature. "
                f"Argument max_num_batched_tokens ({self.max_num_batched_tokens}) is "
                f"smaller than max_model_len ({self.max_model_len}). "
                "This effectively limits the maximum sequence length to "
                "max_num_batched_tokens and makes vLLM reject longer "
                "sequences. Please increase max_num_batched_tokens or "
                "decrease max_model_len.")
        # concurrent partial prefills. Default is 1 meaning not enabled.
        if self.max_long_partial_prefills is None:
            self.max_long_partial_prefills = 1
            self.long_prefill_token_threshold = MAX_INT
        if self.long_prefill_token_threshold is None or \
            self.long_prefill_token_threshold <= 0:
            if self.max_model_len is None:
                self.long_prefill_token_threshold = MAX_INT
            else:
                # NOTE(review): 4% of max_model_len looks like it mirrors
                # an upstream vLLM heuristic -- confirm against vLLM.
                self.long_prefill_token_threshold = \
                    max(1, int(self.max_model_len * 0.04))
        if self.max_long_partial_prefills < 0:
            raise ValueError(
                f"max_long_partial_prefills must be non-negative, but got "
                f"{self.max_long_partial_prefills}")
        if self.long_prefill_token_threshold < 0:
            raise ValueError(
                f"long_prefill_token_threshold must be non-negative, but got "
                f"{self.long_prefill_token_threshold}")
        if self.policy != "fcfs":
            raise NotImplementedError(
                f"currently AscendScheduler only supports fcfs policy, got {self.policy}"
            )
        if getattr(self, "scheduler_delay_factor", 0) > 0:
            raise NotImplementedError(
                "currently AscendScheduler doesn't support scheduler_delay_factor."
            )

View File

@@ -1,592 +0,0 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import time
from collections import deque
from typing import Iterable, Optional, Union
from vllm.config import VllmConfig
from vllm.distributed.kv_events import KVEventBatch
from vllm.logger import logger
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.utils.math_utils import cdiv
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
from vllm.v1.core.sched.output import NewRequestData, SchedulerOutput
from vllm.v1.core.sched.scheduler import Scheduler
from vllm.v1.engine import EngineCoreEventType, EngineCoreOutputs
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager
class AscendScheduler(Scheduler):
"""This Scheduler extends vllm's original v1 scheduler
with prefill-first scheduling strategy."""
def _initialize_common(self) -> None:
    """Initialize common attributes shared across all versions."""
    # Request IDs already scheduled in the current step.
    self.scheduled_req_ids: set[str] = set()
    self.running: list[Request] = []
    # Requests that finished prefill and await the decode phase (used by
    # the PD-transfer flow).
    self.finished_prefill_reqs: deque[Request] = deque()
    # getattr with defaults: these knobs only exist on the ascend-specific
    # scheduler config, not on the stock vLLM SchedulerConfig.
    enable_pd_transfer = getattr(self.scheduler_config,
                                 'enable_pd_transfer', False)
    decode_max_num_seqs = getattr(self.scheduler_config,
                                  'decode_max_num_seqs', 0)
    # Empty phase means normal scheduling; with PD transfer we start in
    # "prefill" and later flip to "decode".
    self.phase = "" if not enable_pd_transfer else "prefill"
    # Decode phase may run more concurrent requests than prefill.
    self.decode_max_num_running_reqs = max(self.max_num_running_reqs,
                                           decode_max_num_seqs)
def __init__(
    self,
    vllm_config: VllmConfig,
    kv_cache_config: KVCacheConfig,
    structured_output_manager: StructuredOutputManager,
    block_size: Optional[int] = None,
    mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
    include_finished_set: bool = False,
    log_stats: bool = False,
) -> None:
    """Construct the base vLLM scheduler, then set up ascend state."""
    # Call the parent class's __init__ method
    super().__init__(vllm_config, kv_cache_config,
                     structured_output_manager, block_size, mm_registry,
                     include_finished_set, log_stats)
    # Initialize common attributes
    self._initialize_common()
def schedule(self) -> SchedulerOutput:
if self.scheduler_config.chunked_prefill_enabled:
return super().schedule()
scheduled_new_reqs: list[Request] = []
scheduled_resumed_reqs: list[Request] = []
scheduled_running_reqs: list[Request] = []
preempted_reqs: list[Request] = []
req_to_new_blocks: dict[str, KVCacheBlocks] = {}
num_scheduled_tokens: dict[str, int] = {}
token_budget = self.max_num_scheduled_tokens
# Encoder-related.
scheduled_encoder_inputs: dict[str, list[int]] = {}
encoder_budget = self.max_num_encoder_input_tokens
# Spec decode-related.
scheduled_spec_decode_tokens: dict[str, list[int]] = {}
# For logging.
scheduled_timestamp = time.monotonic()
# Record scheduled LoRA requests.
scheduled_loras: set[int] = set()
# Use a temporary deque to collect requests that need to be skipped
# and put back at the head of the waiting queue later
skipped_waiting_requests: deque[Request] = deque()
if self.phase == "prefill":
remaining_running_reqs = []
for request in self.running:
# move request has finished prefill to finished_prefill_reqs
if request.num_tokens > request.num_prompt_tokens:
self.finished_prefill_reqs.append(request)
else:
remaining_running_reqs.append(request)
self.running = remaining_running_reqs
# all request prefilled, change phase to decode
if not self.waiting and not self.running:
self.phase = "decode"
# Skip long prompt requests in prefill stage.
# long_prefill_budget is float('inf') if not use.
if self.vllm_config.scheduler_config.long_prefill_token_threshold == 0:
long_prefill_budget = float('inf')
long_prefill_token_threshold = float('inf')
else:
long_prefill_budget = self.vllm_config.scheduler_config.max_long_partial_prefills
long_prefill_token_threshold = self.vllm_config.scheduler_config.long_prefill_token_threshold
# Schedule prefill requests first.
while self.waiting and token_budget > 0:
if len(self.running) == (self.decode_max_num_running_reqs
if self.phase == "decode" else
self.max_num_running_reqs):
break
request = self.waiting[0]
def skip_cur_request():
self.waiting.popleft()
skipped_waiting_requests.appendleft(request)
# P/D: skip request if still waiting for remote kvs.
if request.status == RequestStatus.WAITING_FOR_REMOTE_KVS:
is_ready = self._update_waiting_for_remote_kv(request)
if is_ready:
request.status = RequestStatus.WAITING
else:
skip_cur_request()
continue
# Check that adding the request still respects the max_loras
# constraint.
if (self.lora_config and request.lora_request and
(len(scheduled_loras) == self.lora_config.max_loras
and request.lora_request.lora_int_id not in scheduled_loras)):
# Scheduling would exceed max_loras, skip.
skip_cur_request()
continue
num_external_computed_tokens = 0
load_kv_async = False
# Get already-cached tokens.
if request.num_computed_tokens == 0:
new_computed_blocks, num_new_local_computed_tokens = \
self.kv_cache_manager.get_computed_blocks(
request)
# Get externally-cached tokens if using a KVConnector.
if self.connector is not None:
num_external_computed_tokens, load_kv_async = (
self.connector.get_num_new_matched_tokens(
request, num_new_local_computed_tokens))
# Total computed tokens (local + external).
num_computed_tokens = (num_new_local_computed_tokens +
num_external_computed_tokens)
else:
# P/D: skip checking prefix cache if loaded from remote kvs.
new_computed_blocks = (
self.kv_cache_manager.create_empty_block_list())
num_new_local_computed_tokens = 0
num_computed_tokens = request.num_computed_tokens
encoder_inputs_to_schedule = None
new_encoder_budget = encoder_budget
# P/D: loading remote KV, do not allocate for new work.
if load_kv_async:
assert num_external_computed_tokens > 0
num_new_tokens = 0
blocks = None
# Number of tokens to be scheduled.
else:
prompt_limit = self._get_prompt_limit(request)
# We use `request.num_tokens` instead of
# `request.num_prompt_tokens` to consider the resumed
# requests, which have output tokens.
num_new_tokens = request.num_tokens - num_computed_tokens
max_tokens_in_kvcache = (self.kv_cache_config.num_blocks *
self.block_size)
prompt_limit = min(prompt_limit, max_tokens_in_kvcache)
# Finish request that exceeds prompt_limit or kv cache size.
if num_new_tokens > prompt_limit:
logger.warning(
"Input prompt (%d tokens) is too long"
" and exceeds limit of %d",
num_new_tokens,
prompt_limit,
)
request.status = RequestStatus.FINISHED_IGNORED
self.finished_req_ids.add( # type: ignore
request.request_id) # type: ignore
self.waiting.popleft()
continue
if num_new_tokens > token_budget:
# Scheduling would exceed token_budget, skip.
skip_cur_request()
continue
assert num_new_tokens > 0
blocks = new_computed_blocks.blocks[0]
# Schedule encoder inputs.
if request.has_encoder_inputs:
(encoder_inputs_to_schedule, num_new_tokens,
new_encoder_budget,
_) = self._try_schedule_encoder_inputs(
request, num_computed_tokens, num_new_tokens,
encoder_budget)
if num_new_tokens == 0 or len(
encoder_inputs_to_schedule) == 0:
# The request cannot be scheduled.
break
watermark = getattr(self.scheduler_config, "watermark", 0.01)
if not self._check_watermark_for_prefill(request, num_new_tokens,
blocks, watermark):
# Scheduling would exceed watermark, skip.
skip_cur_request()
continue
if num_new_tokens > long_prefill_token_threshold \
and long_prefill_budget <= 0:
skip_cur_request()
continue
new_blocks = self.kv_cache_manager.allocate_slots(
request,
num_new_tokens + num_external_computed_tokens,
num_new_local_computed_tokens,
new_computed_blocks=new_computed_blocks,
num_lookahead_tokens=self.num_lookahead_tokens,
delay_cache_blocks=load_kv_async)
if new_blocks is None:
# The request cannot be scheduled.
break
# KVConnector: update internal state after allocation.
# This information is used to determine if a load is
# needed for this request.
if self.connector is not None:
self.connector.update_state_after_alloc(
request,
new_computed_blocks + new_blocks,
num_external_computed_tokens,
)
self.waiting.popleft()
if load_kv_async:
# If loading async, allocate memory and put request
# into the WAITING_FOR_REMOTE_KV state.
skipped_waiting_requests.appendleft(request)
request.status = RequestStatus.WAITING_FOR_REMOTE_KVS
continue
self.running.append(request)
if self.log_stats:
request.record_event(EngineCoreEventType.SCHEDULED,
scheduled_timestamp)
self.scheduled_req_ids.add(request.request_id)
# Check request status.
if request.status == RequestStatus.WAITING:
scheduled_new_reqs.append(request)
elif request.status == RequestStatus.PREEMPTED:
scheduled_resumed_reqs.append(request)
else:
raise RuntimeError(f"Invalid request status: {request.status}")
if self.lora_config and request.lora_request:
scheduled_loras.add(request.lora_request.lora_int_id)
req_to_new_blocks[
request.request_id] = self.kv_cache_manager.get_blocks(
request.request_id)
# Update request info.
num_scheduled_tokens[request.request_id] = num_new_tokens
token_budget -= num_new_tokens
if num_new_tokens > long_prefill_token_threshold:
long_prefill_budget -= 1
request.status = RequestStatus.RUNNING
request.num_computed_tokens = num_computed_tokens
# Count the number of prefix cached tokens.
if request.num_cached_tokens < 0:
request.num_cached_tokens = num_computed_tokens
# Encoder-related.
if encoder_inputs_to_schedule:
scheduled_encoder_inputs[request.request_id] = (
encoder_inputs_to_schedule)
# Allocate the encoder cache.
for i in encoder_inputs_to_schedule:
self.encoder_cache_manager.allocate(request, i)
encoder_budget = new_encoder_budget
# Put back any skipped requests at the head of the waiting queue
if skipped_waiting_requests:
self.waiting.extendleft(skipped_waiting_requests)
if self.phase == "decode":
while len(
self.running
) < self.decode_max_num_running_reqs and self.finished_prefill_reqs:
request = self.finished_prefill_reqs.popleft()
self.running.append(request)
# If no prefill requests are scheduled,
# Schedule decode requests next.
if len(self.scheduled_req_ids) == 0:
req_index = 0
while req_index < len(self.running) and token_budget > 0:
request = self.running[req_index]
if request.request_id in self.scheduled_req_ids:
# This request has already been scheduled.
req_index += 1
continue
num_new_tokens = (request.num_tokens_with_spec -
request.num_computed_tokens)
assert (request.num_tokens - request.num_computed_tokens) == 1
num_new_tokens = min(num_new_tokens, token_budget)
# Make sure the input position does not exceed the max model len.
# This is necessary when using spec decoding.
num_new_tokens = min(
num_new_tokens,
self.max_model_len - request.num_computed_tokens)
# Schedule encoder inputs.
encoder_inputs_to_schedule = None
new_encoder_budget = encoder_budget
if request.has_encoder_inputs:
(encoder_inputs_to_schedule, num_new_tokens,
new_encoder_budget) = self._try_schedule_encoder_inputs(
request, request.num_computed_tokens, num_new_tokens,
encoder_budget)
# Check that adding the request still respects the max_loras
# constraint.
if self.lora_config and request.lora_request and (
len(scheduled_loras) == self.lora_config.max_loras
and request.lora_request.lora_int_id
not in scheduled_loras):
# Scheduling would exceed max_loras, skip.
num_new_tokens = 0
if num_new_tokens == 0:
# The request cannot be scheduled because one of the following
# reason:
# 1. No new tokens to schedule. This may happen when PP>1 and
# we have already scheduled all prompt tokens but they are
# not finished yet.
# 2. Adding the request exceeds the max_loras constraint.
# NOTE(woosuk): Here, by doing `continue` instead of `break`,
# we do not strictly follow the FCFS scheduling policy and
# allow the lower-priority requests to be scheduled.
req_index += 1
continue
while True:
new_blocks = self.kv_cache_manager.allocate_slots(
request,
num_new_tokens,
num_lookahead_tokens=self.num_lookahead_tokens)
if new_blocks is None:
# The request cannot be scheduled.
# Preempt the lowest-priority request.
preempted_req = self.running.pop()
self.kv_cache_manager.free(preempted_req)
preempted_req.status = RequestStatus.PREEMPTED
preempted_req.num_computed_tokens = 0
if self.log_stats:
preempted_req.record_event(
EngineCoreEventType.PREEMPTED,
scheduled_timestamp)
self.waiting.appendleft(preempted_req)
preempted_reqs.append(preempted_req)
if preempted_req == request:
# No more request to preempt.
can_schedule = False
break
else:
# The request can be scheduled.
can_schedule = True
break
if not can_schedule:
break
assert new_blocks is not None
# Schedule the request.
scheduled_running_reqs.append(request)
self.scheduled_req_ids.add(request.request_id)
req_to_new_blocks[request.request_id] = new_blocks
num_scheduled_tokens[request.request_id] = num_new_tokens
token_budget -= num_new_tokens
req_index += 1
# Speculative decode related.
if request.spec_token_ids:
num_scheduled_spec_tokens = (num_new_tokens +
request.num_computed_tokens -
request.num_tokens)
if num_scheduled_spec_tokens > 0:
# Trim spec_token_ids list to num_scheduled_spec_tokens.
del request.spec_token_ids[num_scheduled_spec_tokens:]
scheduled_spec_decode_tokens[request.request_id] = (
request.spec_token_ids)
# Encoder-related.
if encoder_inputs_to_schedule:
scheduled_encoder_inputs[request.request_id] = (
encoder_inputs_to_schedule)
# Allocate the encoder cache.
for i in encoder_inputs_to_schedule:
self.encoder_cache_manager.allocate(request, i)
encoder_budget = new_encoder_budget
# Record scheduled LoRA requests.
if self.lora_config and request.lora_request:
scheduled_loras.add(request.lora_request.lora_int_id)
# Check if the scheduling constraints are satisfied.
total_num_scheduled_tokens = sum(num_scheduled_tokens.values())
assert total_num_scheduled_tokens <= self.max_num_scheduled_tokens
assert token_budget >= 0
assert len(
self.running
) <= self.decode_max_num_running_reqs if self.phase == "decode" else self.max_num_running_reqs
assert len(scheduled_new_reqs) + len(scheduled_resumed_reqs) + len(
scheduled_running_reqs) <= len(self.running)
# Get the longest common prefix among all requests in the running queue.
# This can be potentially used for cascade attention.
num_common_prefix_blocks = [0] * len(
self.kv_cache_config.kv_cache_groups)
if self.running:
any_request = self.running[0]
num_common_prefix_blocks = (
self.kv_cache_manager.get_num_common_prefix_blocks(
any_request.request_id))
# Construct the scheduler output.
new_reqs_data = [
NewRequestData.from_request(
req, req_to_new_blocks[req.request_id].get_block_ids())
for req in scheduled_new_reqs
]
cached_reqs_data = self._make_cached_request_data(
scheduled_running_reqs, scheduled_resumed_reqs,
num_scheduled_tokens, scheduled_spec_decode_tokens,
req_to_new_blocks)
scheduled_cached_reqs = cached_reqs_data
scheduler_output = SchedulerOutput(
scheduled_new_reqs=new_reqs_data,
scheduled_cached_reqs=scheduled_cached_reqs,
num_scheduled_tokens=num_scheduled_tokens,
total_num_scheduled_tokens=total_num_scheduled_tokens,
scheduled_spec_decode_tokens=scheduled_spec_decode_tokens,
scheduled_encoder_inputs=scheduled_encoder_inputs,
num_common_prefix_blocks=num_common_prefix_blocks,
# finished_req_ids is an existing state in the scheduler,
# instead of being newly scheduled in this step.
# It contains the request IDs that are finished in between
# the previous and the current steps.
finished_req_ids=self.finished_req_ids, # type: ignore
free_encoder_mm_hashes=self.encoder_cache_manager.
get_freed_mm_hashes(),
)
# NOTE(Kuntai): this function is designed for multiple purposes:
# 1. Plan the KV cache store
# 2. Wrap up all the KV cache load / save ops into an opaque object
# 3. Clear the internal states of the connector
if self.connector is not None:
meta = self.connector.build_connector_meta(scheduler_output)
scheduler_output.kv_connector_metadata = meta
events = self.kv_cache_manager.take_events()
if events:
batch = KVEventBatch(ts=time.time(), events=events)
self.kv_event_publisher.publish(batch)
# Advance the number of computed tokens for the request AFTER
# the request is scheduled.
# 1. The scheduler_output of the current step has to include the
# original number of scheduled tokens to determine input IDs.
# 2. Advance the number of computed tokens here allowing us to
# schedule the prefill request again immediately in the next
# scheduling step.
# 3. If some tokens (e.g. spec tokens) are rejected later, the number of
# computed tokens will be adjusted in update_from_output.
for req_id, num_scheduled_token in num_scheduled_tokens.items():
self.requests[req_id].num_computed_tokens += num_scheduled_token
self.finished_req_ids = set() # type: ignore
return scheduler_output
def _check_watermark_for_prefill(self,
                                 request,
                                 num_new_tokens,
                                 computed_blocks,
                                 watermark=0.01):
    """Return True iff allocating blocks for this prefill keeps the free
    KV-cache block pool above ``watermark`` (a fraction of total blocks).

    Used to decide whether a waiting prefill request may be admitted
    without starving already-running requests of KV-cache space.
    """
    cached_blocks = computed_blocks or []
    min_free_blocks = self.kv_cache_config.num_blocks * watermark
    # Tokens already accounted for: computed on device plus prefix-cache hits.
    total_computed_tokens = (request.num_computed_tokens +
                             len(cached_blocks) * self.block_size)
    # Ceiling division: blocks needed to hold all tokens after this step.
    required_blocks = -(-(num_new_tokens + total_computed_tokens) //
                        self.block_size)
    allocated = self.kv_cache_manager.coordinator.get_blocks(
        request.request_id)
    fresh_blocks = (required_blocks - len(allocated[0]) -
                    len(cached_blocks))
    # Cached blocks with no references can be evicted on demand, so they
    # effectively shrink the amount we must draw from the free pool.
    evictable = sum(blk.ref_cnt == 0 for blk in cached_blocks)
    free_after_alloc = (
        self.kv_cache_manager.block_pool.get_num_free_blocks() - evictable -
        fresh_blocks)
    # Refuse the allocation if it would dip below the watermark.
    return free_after_alloc >= min_free_blocks
def _get_prompt_limit(self, request: Request) -> int:
    """Return the maximum prompt length accepted for ``request``."""
    sched_cfg = self.scheduler_config
    max_model_len = self.vllm_config.model_config.max_model_len
    if sched_cfg.chunked_prefill_enabled and not sched_cfg.is_multi_step:
        # Chunked prefill splits long prompts across steps, so only the
        # model's context limit applies.
        prompt_limit = max_model_len
    else:
        prompt_limit = min(max_model_len,
                           sched_cfg.max_num_batched_tokens)
    lora_req = request.lora_request
    if lora_req and lora_req.long_lora_max_len:
        # Model is fine tuned with long context. Return the fine tuned max_len.
        assert prompt_limit <= lora_req.long_lora_max_len
        return lora_req.long_lora_max_len
    return prompt_limit
def finish_requests(
    self,
    request_ids: Union[str, Iterable[str]],
    finished_status: RequestStatus,
) -> None:
    """Handles the finish signal from outside the scheduler.

    For example, the API server can abort a request when the client
    disconnects.

    Args:
        request_ids: A single request ID, or an iterable of request IDs,
            to mark as finished.
        finished_status: The terminal status to record for the requests.
    """
    # A bare str would otherwise be iterated character by character,
    # silently skipping the scheduled_req_ids cleanup for the request.
    if isinstance(request_ids, str):
        request_ids = (request_ids, )
    for req_id in request_ids:
        request = self.requests.get(req_id)
        if request is None:
            # Invalid request ID.
            continue
        if request.status == RequestStatus.RUNNING:
            # Drop the request from the scheduled set so it is no longer
            # treated as in-flight after being finished externally.
            self.scheduled_req_ids.discard(request.request_id)
    super().finish_requests(request_ids, finished_status)
def update_from_output(
    self,
    scheduler_output: SchedulerOutput,
    model_runner_output: ModelRunnerOutput,
) -> EngineCoreOutputs:
    """Clear the scheduled marker for every running request that received
    tokens this step, then delegate the rest of the per-step bookkeeping
    to the base scheduler."""
    scheduled = scheduler_output.num_scheduled_tokens
    # NOTE(woosuk): As len(self.running) can be up to 1K or more, the below
    # loop can be a performance bottleneck. We should do our best to avoid
    # expensive operations inside the loop.
    for req in self.running:
        if scheduled.get(req.request_id, 0):
            # The request was scheduled this step; it is no longer pending.
            self.scheduled_req_ids.discard(req.request_id)
    return super().update_from_output(scheduler_output,
                                      model_runner_output)

View File

@@ -153,7 +153,6 @@ class NPUPlatform(Platform):
model_config = vllm_config.model_config
parallel_config = vllm_config.parallel_config
cache_config = vllm_config.cache_config
ascend_scheduler_config = ascend_config.ascend_scheduler_config
kv_cache_dtype = vllm_config.additional_config.get(
"kv_cache_dtype", None)
@@ -291,35 +290,23 @@ class NPUPlatform(Platform):
if cache_config:
if cache_config.block_size is None:
cache_config.block_size = 128
if cache_config.enable_prefix_caching or \
not ascend_scheduler_config.enabled or \
getattr(ascend_scheduler_config, "enable_chunked_prefill", False):
logger.warning(
"If chunked prefill or prefix caching is enabled, block size must be set to 128."
)
origin_block_size = cache_config.block_size
cache_config.block_size = 128
# TODO(MengqingCao): Remove the model_type check, after resolving the hidden error in get_kv_cache_groups.
if model_config and model_config.hf_config.model_type == "qwen3_next":
logger.warning(
"When running qwen3-next model, block_size needs to be restored to its original value."
)
cache_config.block_size = origin_block_size
# ignore block size check if model is qwen3-next
# TODO(MengqingCao): Remove the model_type check, after resolving the hidden error in get_kv_cache_groups.
if not (model_config
and model_config.hf_config.model_type == "qwen3_next"):
# we must set block size to 128 if prefix caching is enabled or chunked prefill is enabled
if cache_config.enable_prefix_caching or \
(vllm_config.scheduler_config and vllm_config.scheduler_config.enable_chunked_prefill):
if cache_config.block_size != 128:
logger.warning(
"block size must be set to 128 on NPU platform.")
cache_config.block_size = 128
# Activate custom ops for v1, except on 310P
if get_ascend_device_type() != AscendDeviceType._310P:
compilation_config.custom_ops = ["all"]
# If ascend_scheduler_config is enabled,
# extents original scheduler_config to use AscendScheduler.
if ascend_config.ascend_scheduler_config.enabled:
from vllm_ascend.core.schedule_config import AscendSchedulerConfig
ascend_scheduler_config = AscendSchedulerConfig.initialize_from_config(
vllm_config.scheduler_config,
ascend_config.ascend_scheduler_config)
vllm_config.scheduler_config = ascend_scheduler_config
elif ascend_config.recompute_scheduler_enable:
if ascend_config.recompute_scheduler_enable:
from vllm_ascend.core.recompute_schedule_config import \
RecomputeSchedulerConfig
recompute_scheduler_config = RecomputeSchedulerConfig.initialize_from_config(

View File

@@ -44,11 +44,6 @@ SERVICE_PROFILING_SYMBOLS_YAML = """
handler: msserviceprofiler.vllm_profiler.vllm_v1.batch_hookers:schedule
name: batchFrameworkProcessing
- symbol: vllm_ascend.core.scheduler:AscendScheduler.schedule
min_version: "0.9.1"
handler: msserviceprofiler.vllm_profiler.vllm_v1.batch_hookers:schedule
name: batchFrameworkProcessing
- symbol: vllm.v1.core.sched.scheduler:Scheduler._free_request
min_version: "0.9.1"
handler: msserviceprofiler.vllm_profiler.vllm_v1.batch_hookers:free_request

View File

@@ -451,8 +451,7 @@ class AscendAttentionTorchairBackendImpl(AttentionImpl):
else:
raise NotImplementedError(
"Torchair graph mode with non-MLA attention backend is still experimental."
"v1 scheduler(chunked prefill) is not supported at this moment. Please"
"setting 'ascend_scheduler_config':{'enabled':true} in additional_config"
"to use ascend scheduler.")
"v1 scheduler(chunked prefill) is not supported at this moment. "
)
return output.view(num_tokens, self.hidden_size)

View File

@@ -330,10 +330,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
# Ascend-specific configurations
self.ascend_config = get_ascend_config()
if self.ascend_config.ascend_scheduler_config.enabled:
self.chunked_prefill_enabled = self.scheduler_config.chunked_prefill_enabled
else:
self.chunked_prefill_enabled = True
self.weight_prefetch_method = WeightPrefetchMethod(
self.ascend_config.weight_prefetch_config)
# Dump / PrecisionDebugger configuration now comes from AscendConfig
@@ -1942,7 +1938,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
def _build_attn_state(self, num_reqs, num_scheduled_tokens,
num_valid_tokens):
ascend_config = get_ascend_config()
if np.array_equal(self.seq_lens_np[:num_reqs], num_scheduled_tokens):
attn_state = AscendAttentionState.PrefillNoCache
# We assume it is the decode stage, where prefill occurs but only one token is not hit in cache.
@@ -1959,7 +1954,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
else:
attn_state = AscendAttentionState.ChunkedPrefill
# splitfuse
elif not ascend_config.ascend_scheduler_config.enabled or self.chunked_prefill_enabled:
elif self.scheduler_config.enable_chunked_prefill:
attn_state = AscendAttentionState.ChunkedPrefill
else:
attn_state = AscendAttentionState.PrefillCacheHit