diff --git a/docs/source/tutorials/DeepSeek-V3.2-Exp.md b/docs/source/tutorials/DeepSeek-V3.2-Exp.md
index 97e6d1fd..84a5863f 100644
--- a/docs/source/tutorials/DeepSeek-V3.2-Exp.md
+++ b/docs/source/tutorials/DeepSeek-V3.2-Exp.md
@@ -174,7 +174,7 @@ vllm serve vllm-ascend/DeepSeek-V3.2-Exp-W8A8 \
 --trust-remote-code \
 --no-enable-prefix-caching \
 --gpu-memory-utilization 0.92 \
---additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
+--additional-config '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
 ```
 
 ### Multi-node Deployment
@@ -226,7 +226,7 @@ vllm serve /root/.cache/Modelers_Park/DeepSeek-V3.2-Exp \
 --trust-remote-code \
 --no-enable-prefix-caching \
 --gpu-memory-utilization 0.9 \
---additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
+--additional-config '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
 ```
 
 **Node 1**
@@ -270,7 +270,7 @@ vllm serve /root/.cache/Modelers_Park/DeepSeek-V3.2-Exp \
 --trust-remote-code \
 --no-enable-prefix-caching \
 --gpu-memory-utilization 0.92 \
---additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
+--additional-config '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
 ```
 
 ::::
@@ -318,7 +318,7 @@ vllm serve vllm-ascend/DeepSeek-V3.2-Exp-W8A8 \
 --quantization ascend \
 --no-enable-prefix-caching \
 --gpu-memory-utilization 0.9 \
---additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
+--additional-config '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
 ```
 
 **Node 1**
@@ -365,7 +365,7 @@ vllm serve vllm-ascend/DeepSeek-V3.2-Exp-W8A8 \
 --quantization ascend \
 --no-enable-prefix-caching \
 --gpu-memory-utilization 0.92 \
---additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
+--additional-config '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
 ```
 
 ::::
diff --git a/docs/source/tutorials/multi_node.md b/docs/source/tutorials/multi_node.md
index 68c7056b..d04fa090 100644
--- a/docs/source/tutorials/multi_node.md
+++ b/docs/source/tutorials/multi_node.md
@@ -137,7 +137,7 @@ vllm serve vllm-ascend/DeepSeek-V3.1-W8A8 \
 --trust-remote-code \
 --no-enable-prefix-caching \
 --gpu-memory-utilization 0.9 \
---additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}'
+--additional-config '{"torchair_graph_config":{"enabled":true}}'
 ```
 
 **Node 1**
@@ -182,7 +182,7 @@ vllm serve vllm-ascend/DeepSeek-V3.1-W8A8 \
 --trust-remote-code \
 --no-enable-prefix-caching \
 --gpu-memory-utilization 0.92 \
---additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}'
+--additional-config '{"torchair_graph_config":{"enabled":true}}'
 ```
 
 The deployment view looks like:
diff --git a/docs/source/tutorials/multi_node_kimi.md b/docs/source/tutorials/multi_node_kimi.md
index cb28bca9..84840cdf 100644
--- a/docs/source/tutorials/multi_node_kimi.md
+++ b/docs/source/tutorials/multi_node_kimi.md
@@ -93,7 +93,7 @@ vllm serve /home/cache/weights/Kimi-K2-Instruct-W8A8 \
 --trust-remote-code \
 --no-enable-prefix-caching \
 --gpu-memory-utilization 0.9 \
---additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}'
+--additional-config '{"torchair_graph_config":{"enabled":true}}'
 ```
 
 **Node 1**
@@ -137,7 +137,7 @@ vllm serve /home/cache/weights/Kimi-K2-Instruct-W8A8 \
 --trust-remote-code \
 --no-enable-prefix-caching \
 --gpu-memory-utilization 0.92 \
---additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}'
+--additional-config '{"torchair_graph_config":{"enabled":true}}'
 ```
 
 The deployment view looks like:
diff --git a/docs/source/tutorials/multi_npu_moge.md b/docs/source/tutorials/multi_npu_moge.md
index e426c0f3..91806ba7 100644
--- a/docs/source/tutorials/multi_npu_moge.md
+++ b/docs/source/tutorials/multi_npu_moge.md
@@ -157,12 +157,7 @@ if __name__ == "__main__":
         additional_config={
             'torchair_graph_config': {
                 'enabled': True,
-            },
-            'ascend_scheduler_config':{
-                'enabled': True,
-                'enable_chunked_prefill' : False,
-                'chunked_prefill_enabled': False
-            },
+            }
         })
 
     outputs = llm.generate(prompts, sampling_params)
diff --git a/docs/source/user_guide/configuration/additional_config.md b/docs/source/user_guide/configuration/additional_config.md
index 448f2ec4..a77d0d53 100644
--- a/docs/source/user_guide/configuration/additional_config.md
+++ b/docs/source/user_guide/configuration/additional_config.md
@@ -27,7 +27,6 @@ The following table lists additional configuration options available in vLLM Asc
 | Name | Type | Default | Description |
 |-------------------------------------|------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------|
 | `torchair_graph_config` | dict | `{}` | Configuration options for torchair graph mode |
-| `ascend_scheduler_config` | dict | `{}` | Configuration options for ascend scheduler |
 | `weight_prefetch_config` | dict | `{}` | Configuration options for weight prefetch |
 | `refresh` | bool | `false` | Whether to refresh global Ascend configuration content. This is usually used by rlhf or ut/e2e test case. |
 | `expert_map_path` | str | `None` | When using expert load balancing for an MoE model, an expert map path needs to be passed in. |
@@ -61,18 +60,6 @@ The details of each configuration option are as follows:
 | `enable_kv_nz`| bool | `False` | Whether to enable KV Cache NZ layout. This option only takes effect on models using MLA (for example, DeepSeek). |
 | `enable_super_kernel` | bool | `False` | Whether to enable super kernel to fuse operators in deepseek moe layers. This option only takes effects on moe models using dynamic w8a8 quantization.|
 
-**ascend_scheduler_config**
-
-| Name | Type | Default | Description |
-| ---- | ---- | ------- | ----------- |
-| `enabled` | bool | `False` | Whether to enable ascend scheduler for V1 engine.|
-| `enable_pd_transfer` | bool | `False` | Whether to enable P-D transfer. When it is enabled, decode is started only when prefill of all requests is done. This option only takes effect on offline inference. |
-| `decode_max_num_seqs` | int | `0` | Whether to change max_num_seqs of decode phase when P-D transfer is enabled. This option only takes effect when enable_pd_transfer is True. |
-| `max_long_partial_prefills` | Union[int, float] | `float('inf')` | The maximum number of prompts longer than long_prefill_token_threshold that will be prefilled concurrently. |
-| `long_prefill_token_threshold` | Union[int, float] | `float('inf')` | a request is considered long if the prompt is longer than this number of tokens. |
-
-ascend_scheduler_config also supports the options from [vllm scheduler config](https://docs.vllm.ai/en/stable/api/vllm/config.html#vllm.config.SchedulerConfig). For example, you can add `enable_chunked_prefill: True` to ascend_scheduler_config as well.
-
 **weight_prefetch_config**
 
 | Name | Type | Default | Description |
@@ -93,12 +80,6 @@ An example of additional configuration is as follows:
         "graph_batch_sizes_init": False,
         "enable_kv_nz": False
     },
-    "ascend_scheduler_config": {
-        "enabled": True,
-        "enable_chunked_prefill": True,
-        "max_long_partial_prefills": 1,
-        "long_prefill_token_threshold": 4096,
-    },
     "weight_prefetch_config": {
         "enabled": True,
         "prefetch_ratio": {
diff --git a/docs/source/user_guide/feature_guide/graph_mode.md b/docs/source/user_guide/feature_guide/graph_mode.md
index 43236289..9afa1d52 100644
--- a/docs/source/user_guide/feature_guide/graph_mode.md
+++ b/docs/source/user_guide/feature_guide/graph_mode.md
@@ -45,14 +45,14 @@
 import os
 from vllm import LLM
 # TorchAirGraph only works without chunked-prefill now
-model = LLM(model="path/to/DeepSeek-R1-0528", additional_config={"torchair_graph_config": {"enabled": True},"ascend_scheduler_config": {"enabled": True}})
+model = LLM(model="path/to/DeepSeek-R1-0528", additional_config={"torchair_graph_config": {"enabled": True}})
 outputs = model.generate("Hello, how are you?")
 ```
 
 Online example:
 
 ```shell
-vllm serve path/to/DeepSeek-R1-0528 --additional-config='{"torchair_graph_config": {"enabled": true},"ascend_scheduler_config": {"enabled": true}}'
+vllm serve path/to/DeepSeek-R1-0528 --additional-config='{"torchair_graph_config": {"enabled": true}}'
 ```
 
 You can find more details about additional configuration [here](../configuration/additional_config.md).
diff --git a/examples/offline_inference_npu_long_seq.py b/examples/offline_inference_npu_long_seq.py
index 2ed96f63..7e3afa01 100644
--- a/examples/offline_inference_npu_long_seq.py
+++ b/examples/offline_inference_npu_long_seq.py
@@ -42,7 +42,6 @@ if __name__ == "__main__":
         enable_chunked_prefill=False,
         max_num_batched_tokens=2048,
         max_model_len=1024,
-        additional_config={"ascend_scheduler_config": {"enabled": False}},
         max_num_seqs=1,
         block_size=128,
         gpu_memory_utilization=0.9
diff --git a/examples/run_dp_server.sh b/examples/run_dp_server.sh
index 9b9868c4..ec0cb686 100644
--- a/examples/run_dp_server.sh
+++ b/examples/run_dp_server.sh
@@ -28,4 +28,4 @@ vllm serve Qwen/Qwen1.5-MoE-A2.7B \
   --gpu-memory-utilization 0.9 \
   --trust-remote-code \
   --enforce-eager \
-  --additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":false, "use_cached_graph":false}}'
+  --additional-config '{"torchair_graph_config":{"enabled":false, "use_cached_graph":false}}'
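
When updating an existing launch script to drop `ascend_scheduler_config`, it is easy to leave the remaining `--additional-config` JSON malformed (for example, a stray comma where the removed key used to be). Below is a minimal pre-launch sanity check, assuming only that `python3` is available on the host; the `CONFIG` variable name is illustrative, not part of any of the scripts above:

```shell
# Illustrative check: confirm the trimmed --additional-config string is
# still valid JSON before handing it to vllm serve.
CONFIG='{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
python3 -c 'import json, sys; json.loads(sys.argv[1]); print("valid JSON")' "$CONFIG"
```

If the check passes, `vllm serve ... --additional-config "$CONFIG"` receives exactly the object shown in the hunks above.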