drop ascend scheduler (#4498)
The Ascend scheduler was originally added for the non-chunked-prefill case, because the NPU ops didn't work well with chunked prefill. Now that the ops work better with chunked prefill, it's time to remove the Ascend scheduler and use the vLLM default scheduler instead. - vLLM version: v0.11.2 --------- Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -153,7 +153,6 @@ class NPUPlatform(Platform):
|
||||
model_config = vllm_config.model_config
|
||||
parallel_config = vllm_config.parallel_config
|
||||
cache_config = vllm_config.cache_config
|
||||
ascend_scheduler_config = ascend_config.ascend_scheduler_config
|
||||
|
||||
kv_cache_dtype = vllm_config.additional_config.get(
|
||||
"kv_cache_dtype", None)
|
||||
@@ -291,35 +290,23 @@ class NPUPlatform(Platform):
|
||||
if cache_config:
|
||||
if cache_config.block_size is None:
|
||||
cache_config.block_size = 128
|
||||
|
||||
if cache_config.enable_prefix_caching or \
|
||||
not ascend_scheduler_config.enabled or \
|
||||
getattr(ascend_scheduler_config, "enable_chunked_prefill", False):
|
||||
logger.warning(
|
||||
"If chunked prefill or prefix caching is enabled, block size must be set to 128."
|
||||
)
|
||||
origin_block_size = cache_config.block_size
|
||||
cache_config.block_size = 128
|
||||
# TODO(MengqingCao): Remove the model_type check, after resolving the hidden error in get_kv_cache_groups.
|
||||
if model_config and model_config.hf_config.model_type == "qwen3_next":
|
||||
logger.warning(
|
||||
"When running qwen3-next model, block_size needs to be restored to its original value."
|
||||
)
|
||||
cache_config.block_size = origin_block_size
|
||||
# ignore block size check if model is qwen3-next
|
||||
# TODO(MengqingCao): Remove the model_type check, after resolving the hidden error in get_kv_cache_groups.
|
||||
if not (model_config
|
||||
and model_config.hf_config.model_type == "qwen3_next"):
|
||||
# we must set block size to 128 if prefix caching is enabled or chunked prefill is enabled
|
||||
if cache_config.enable_prefix_caching or \
|
||||
(vllm_config.scheduler_config and vllm_config.scheduler_config.enable_chunked_prefill):
|
||||
if cache_config.block_size != 128:
|
||||
logger.warning(
|
||||
"block size must be set to 128 on NPU platform.")
|
||||
cache_config.block_size = 128
|
||||
|
||||
# Activate custom ops for v1, except on 310P
|
||||
if get_ascend_device_type() != AscendDeviceType._310P:
|
||||
compilation_config.custom_ops = ["all"]
|
||||
|
||||
# If ascend_scheduler_config is enabled,
|
||||
# extends the original scheduler_config to use AscendScheduler.
|
||||
if ascend_config.ascend_scheduler_config.enabled:
|
||||
from vllm_ascend.core.schedule_config import AscendSchedulerConfig
|
||||
ascend_scheduler_config = AscendSchedulerConfig.initialize_from_config(
|
||||
vllm_config.scheduler_config,
|
||||
ascend_config.ascend_scheduler_config)
|
||||
vllm_config.scheduler_config = ascend_scheduler_config
|
||||
elif ascend_config.recompute_scheduler_enable:
|
||||
if ascend_config.recompute_scheduler_enable:
|
||||
from vllm_ascend.core.recompute_schedule_config import \
|
||||
RecomputeSchedulerConfig
|
||||
recompute_scheduler_config = RecomputeSchedulerConfig.initialize_from_config(
|
||||
|
||||
Reference in New Issue
Block a user