[Core] Disable the chunked prefill feature in Non-MLA LLMs (#2894)
### What this PR does / why we need it?
This PR enforces the forcible disabling of the chunked prefill feature
in Non-MLA models, as the performance of operators supporting this
functionality is currently suboptimal. Unless the user has enabled
chunked prefill in the ascend_scheduler_config, we would allow this
feature.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
CI passed with new added/existing test.
Related: https://github.com/vllm-project/vllm-ascend/pull/2659
- vLLM version: main
- vLLM main:
d21a36f5f9
Signed-off-by: rjg-lyh <1318825571@qq.com>
This commit is contained in:
@@ -128,6 +128,35 @@ class NPUPlatform(Platform):
|
||||
model_config = vllm_config.model_config
|
||||
parallel_config = vllm_config.parallel_config
|
||||
cache_config = vllm_config.cache_config
|
||||
decoding_config = vllm_config.decoding_config
|
||||
scheduler_config = vllm_config.scheduler_config
|
||||
ascend_scheduler_config = ascend_config.ascend_scheduler_config
|
||||
|
||||
if model_config is not None and not model_config.use_mla:
|
||||
logger.info(
|
||||
"Non-MLA LLMs forcibly disable the chunked prefill feature,"
|
||||
"as the performance of operators supporting this feature "
|
||||
"functionality is currently suboptimal.")
|
||||
if not model_config.is_multimodal_model and \
|
||||
decoding_config.backend == "auto" and \
|
||||
not scheduler_config.delay_factor > 0 and \
|
||||
not scheduler_config.send_delta_data and \
|
||||
scheduler_config.policy == "fcfs":
|
||||
ascend_scheduler_config.enabled = True
|
||||
chunked_prefill_enabled_in_ascend_scheduler = getattr(
|
||||
ascend_scheduler_config, "enable_chunked_prefill", False)
|
||||
if chunked_prefill_enabled_in_ascend_scheduler:
|
||||
logger.warning(
|
||||
"Chunked prefill feature is enabled in ascend_scheduler,"
|
||||
"but note that the operator supporting this feature "
|
||||
"would lead to performance degradation.")
|
||||
# In this situation, max_num_batched_tokens would have been rewritten.
|
||||
# So we must make sure max_num_batched_tokens is not smaller than max_model_len.
|
||||
if (scheduler_config.max_num_batched_tokens
|
||||
< scheduler_config.max_model_len
|
||||
and not chunked_prefill_enabled_in_ascend_scheduler):
|
||||
scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len
|
||||
|
||||
kv_cache_dtype = vllm_config.additional_config.get(
|
||||
"kv_cache_dtype", None)
|
||||
if kv_cache_dtype is not None:
|
||||
|
||||
Reference in New Issue
Block a user