Upgrade to 0.11.1 newest vllm commit (#3762)
### What this PR does / why we need it?
Upgrade to the newest vLLM 0.11.1 commit (c9461e05a4) and fix the resulting breakages:
- Fix the ```spec decode rejection sampler```, caused by https://github.com/vllm-project/vllm/pull/26060
- Fix several ```import``` paths, caused by https://github.com/vllm-project/vllm/pull/27374
- Fix ```scheduler_config.send_delta_data```, caused by https://github.com/vllm-project/vllm-ascend/pull/3719
- Fix ```init_with_cudagraph_sizes```, caused by https://github.com/vllm-project/vllm/pull/26016
- Fix the ```vl model``` patch that replaces PatchEmbed's conv3d with a linear layer, caused by https://github.com/vllm-project/vllm/pull/27418

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with newly added and existing tests.

- vLLM version: v0.11.0rc3
- vLLM main: c9461e05a4

Signed-off-by: Icey <1790571317@qq.com>
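All of these fixes hinge on the same compatibility pattern: branch on the installed vLLM version via ```vllm_version_is``` so the plugin works against both the pinned release and vLLM main. A minimal sketch of what such a helper can look like (the real one lives in ```vllm_ascend.utils``` and may differ, e.g. in how it normalizes rc/dev suffixes):

```python
# Minimal sketch of a version-pinning helper in the spirit of
# vllm_ascend.utils.vllm_version_is; illustrative only, the real
# implementation may handle rc/dev suffixes differently.
import vllm


def vllm_version_is(target: str) -> bool:
    # True only when the installed vLLM release matches the pinned
    # version, so each compatibility branch targets exactly one release.
    return vllm.__version__ == target
```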
```diff
@@ -33,7 +33,8 @@ from vllm_ascend.torchair.utils import (check_torchair_cache_exist,
                                         delete_torchair_cache_file)
 from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, enable_sp, is_310p,
                                prefill_context_parallel_enable,
-                               update_aclgraph_sizes, vllm_version_is)
+                               update_aclgraph_sizes,
+                               update_cudagraph_capture_sizes, vllm_version_is)
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig, VllmConfig
```
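The hunk above pulls ```update_cudagraph_capture_sizes``` into the import list alongside the existing helpers. For symbols that moved inside vLLM itself (the vllm#27374 breakage), a common compatibility tactic is a guarded import with a fallback; a sketch under assumed module paths, not the concrete imports this patch touches:

```python
# Hedged sketch: tolerate a symbol moving between vLLM releases.
# Both module paths below are illustrative assumptions, not the
# actual locations changed by vllm-project/vllm#27374.
try:
    from vllm.config import ModelConfig  # layout on the pinned release
except ImportError:
    from vllm.config.model import ModelConfig  # hypothetical newer layout
```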
```diff
@@ -142,24 +143,47 @@ class NPUPlatform(Platform):
                 "Non-MLA LLMs forcibly disable the chunked prefill feature,"
                 "as the performance of operators supporting this feature "
                 "functionality is currently suboptimal.")
-        if not model_config.is_multimodal_model and \
-            structured_outputs_config.backend == "auto" and \
-            not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
-            scheduler_config.policy == "fcfs":
-            ascend_scheduler_config.enabled = True
-            chunked_prefill_enabled_in_ascend_scheduler = getattr(
-                ascend_scheduler_config, "enable_chunked_prefill", False)
-            if chunked_prefill_enabled_in_ascend_scheduler:
-                logger.warning(
-                    "Chunked prefill feature is enabled in ascend_scheduler,"
-                    "but note that the operator supporting this feature "
-                    "would lead to performance degradation.")
-            # In this situation, max_num_batched_tokens would have been rewritten.
-            # So we must make sure max_num_batched_tokens is not smaller than max_model_len.
-            if (scheduler_config.max_num_batched_tokens
-                    < scheduler_config.max_model_len
-                    and not chunked_prefill_enabled_in_ascend_scheduler):
-                scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len
+        if vllm_version_is("0.11.0"):
+            if not model_config.is_multimodal_model and \
+                structured_outputs_config.backend == "auto" and \
+                not scheduler_config.send_delta_data and \
+                not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
+                scheduler_config.policy == "fcfs":
+                ascend_scheduler_config.enabled = True
+                chunked_prefill_enabled_in_ascend_scheduler = getattr(
+                    ascend_scheduler_config, "enable_chunked_prefill",
+                    False)
+                if chunked_prefill_enabled_in_ascend_scheduler:
+                    logger.warning(
+                        "Chunked prefill feature is enabled in ascend_scheduler,"
+                        "but note that the operator supporting this feature "
+                        "would lead to performance degradation.")
+                # In this situation, max_num_batched_tokens would have been rewritten.
+                # So we must make sure max_num_batched_tokens is not smaller than max_model_len.
+                if (scheduler_config.max_num_batched_tokens
+                        < scheduler_config.max_model_len and
+                        not chunked_prefill_enabled_in_ascend_scheduler):
+                    scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len
+        else:
+            if not model_config.is_multimodal_model and \
+                structured_outputs_config.backend == "auto" and \
+                not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
+                scheduler_config.policy == "fcfs":
+                ascend_scheduler_config.enabled = True
+                chunked_prefill_enabled_in_ascend_scheduler = getattr(
+                    ascend_scheduler_config, "enable_chunked_prefill",
+                    False)
+                if chunked_prefill_enabled_in_ascend_scheduler:
+                    logger.warning(
+                        "Chunked prefill feature is enabled in ascend_scheduler,"
+                        "but note that the operator supporting this feature "
+                        "would lead to performance degradation.")
+                # In this situation, max_num_batched_tokens would have been rewritten.
+                # So we must make sure max_num_batched_tokens is not smaller than max_model_len.
+                if (scheduler_config.max_num_batched_tokens
+                        < scheduler_config.max_model_len and
+                        not chunked_prefill_enabled_in_ascend_scheduler):
+                    scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len
 
         kv_cache_dtype = vllm_config.additional_config.get(
             "kv_cache_dtype", None)
```
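The two branches above differ only in the ```not scheduler_config.send_delta_data``` condition, which vLLM main no longer defines. An alternative to duplicating the whole block, shown here only as a sketch and not what the patch does, is the same ```getattr``` probing the code already applies to ```scheduler_delay_factor```:

```python
# Sketch: attribute probing instead of full branch duplication.
# On vLLM 0.11.0 send_delta_data exists and is honored; on newer main
# the attribute is gone, the default kicks in, and the condition
# drops out, matching the else-branch of the diff above.
def send_delta_data_disabled(scheduler_config) -> bool:
    return not getattr(scheduler_config, "send_delta_data", False)
```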
```diff
@@ -237,8 +261,12 @@ class NPUPlatform(Platform):
                     f"{vllm_config.parallel_config.tensor_parallel_size}")
             if len(sp_aclgraph_sizes) != len(original_sizes):
                 compilation_config.cudagraph_capture_sizes = sp_aclgraph_sizes
-                vllm_config.compilation_config.init_with_cudagraph_sizes(
-                    sp_aclgraph_sizes)
+                if vllm_version_is("0.11.0"):
+                    compilation_config.init_with_cudagraph_sizes(
+                        sp_aclgraph_sizes)
+                else:
+                    update_cudagraph_capture_sizes(vllm_config,
+                                                   sp_aclgraph_sizes)
 
         # TODO: Full graph is fully supported later, and the default value will be set to full graph.
         if compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE:
```