Upgrade to 0.11.1 newest vllm commit (#3762)

### What this PR does / why we need it?

Upgrade vLLM to the newest main commit c9461e05a4 and fix the resulting breakages:

- Fix the ```spec decode rejection sampler```, broken by
  https://github.com/vllm-project/vllm/pull/26060
- Fix some ```import``` paths, changed by
  https://github.com/vllm-project/vllm/pull/27374
- Fix ```scheduler_config.send_delta_data```, caused by
  https://github.com/vllm-project/vllm-ascend/pull/3719
- Fix ```init_with_cudagraph_sizes```, caused by
  https://github.com/vllm-project/vllm/pull/26016
- Fix the ```vl model``` patch that replaces PatchEmbed's conv3d with a
  linear layer, caused by https://github.com/vllm-project/vllm/pull/27418
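
All of these fixes lean on the same compatibility pattern: branch on the installed vLLM version via ```vllm_version_is``` from ```vllm_ascend.utils``` and keep the 0.11.0 code path alive next to the new one. A minimal sketch of the pattern, using the ```send_delta_data``` fix as the example (the helper name below is ours for illustration, not from the patch):

```python
from vllm_ascend.utils import vllm_version_is


def send_delta_data_allows_ascend_scheduler(scheduler_config) -> bool:
    """Illustrative helper (not in the patch): gate an attribute access
    on the vLLM version, since `send_delta_data` exists on
    SchedulerConfig in v0.11.0 but not on newest main."""
    if vllm_version_is("0.11.0"):
        # Old release: the attribute is present and must be honored.
        return not scheduler_config.send_delta_data
    # Newest main: the attribute is gone, so there is nothing to check.
    return True
```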

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with newly added and existing tests.


- vLLM version: v0.11.0rc3
- vLLM main: c9461e05a4

---------

Signed-off-by: Icey <1790571317@qq.com>
Commit a7450db1bd (parent f846bd20e4), authored by Icey on 2025-10-28 14:55:03 +08:00 and committed by GitHub. 12 changed files with 175 additions and 51 deletions.


```diff
@@ -33,7 +33,8 @@ from vllm_ascend.torchair.utils import (check_torchair_cache_exist,
                                         delete_torchair_cache_file)
 from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, enable_sp, is_310p,
                                prefill_context_parallel_enable,
-                               update_aclgraph_sizes, vllm_version_is)
+                               update_aclgraph_sizes,
+                               update_cudagraph_capture_sizes, vllm_version_is)
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig, VllmConfig
```
```diff
@@ -142,24 +143,47 @@ class NPUPlatform(Platform):
                     "Non-MLA LLMs forcibly disable the chunked prefill feature,"
                     "as the performance of operators supporting this feature "
                     "functionality is currently suboptimal.")
-            if not model_config.is_multimodal_model and \
-                structured_outputs_config.backend == "auto" and \
-                not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
-                scheduler_config.policy == "fcfs":
-                ascend_scheduler_config.enabled = True
-                chunked_prefill_enabled_in_ascend_scheduler = getattr(
-                    ascend_scheduler_config, "enable_chunked_prefill", False)
-                if chunked_prefill_enabled_in_ascend_scheduler:
-                    logger.warning(
-                        "Chunked prefill feature is enabled in ascend_scheduler,"
-                        "but note that the operator supporting this feature "
-                        "would lead to performance degradation.")
-                # In this situation, max_num_batched_tokens would have been rewritten.
-                # So we must make sure max_num_batched_tokens is not smaller than max_model_len.
-                if (scheduler_config.max_num_batched_tokens
-                        < scheduler_config.max_model_len
-                        and not chunked_prefill_enabled_in_ascend_scheduler):
-                    scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len
+            if vllm_version_is("0.11.0"):
+                if not model_config.is_multimodal_model and \
+                    structured_outputs_config.backend == "auto" and \
+                    not scheduler_config.send_delta_data and \
+                    not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
+                    scheduler_config.policy == "fcfs":
+                    ascend_scheduler_config.enabled = True
+                    chunked_prefill_enabled_in_ascend_scheduler = getattr(
+                        ascend_scheduler_config, "enable_chunked_prefill",
+                        False)
+                    if chunked_prefill_enabled_in_ascend_scheduler:
+                        logger.warning(
+                            "Chunked prefill feature is enabled in ascend_scheduler,"
+                            "but note that the operator supporting this feature "
+                            "would lead to performance degradation.")
+                    # In this situation, max_num_batched_tokens would have been rewritten.
+                    # So we must make sure max_num_batched_tokens is not smaller than max_model_len.
+                    if (scheduler_config.max_num_batched_tokens
+                            < scheduler_config.max_model_len and
+                            not chunked_prefill_enabled_in_ascend_scheduler):
+                        scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len
+            else:
+                if not model_config.is_multimodal_model and \
+                    structured_outputs_config.backend == "auto" and \
+                    not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
+                    scheduler_config.policy == "fcfs":
+                    ascend_scheduler_config.enabled = True
+                    chunked_prefill_enabled_in_ascend_scheduler = getattr(
+                        ascend_scheduler_config, "enable_chunked_prefill",
+                        False)
+                    if chunked_prefill_enabled_in_ascend_scheduler:
+                        logger.warning(
+                            "Chunked prefill feature is enabled in ascend_scheduler,"
+                            "but note that the operator supporting this feature "
+                            "would lead to performance degradation.")
+                    # In this situation, max_num_batched_tokens would have been rewritten.
+                    # So we must make sure max_num_batched_tokens is not smaller than max_model_len.
+                    if (scheduler_config.max_num_batched_tokens
+                            < scheduler_config.max_model_len and
+                            not chunked_prefill_enabled_in_ascend_scheduler):
+                        scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len
 
         kv_cache_dtype = vllm_config.additional_config.get(
             "kv_cache_dtype", None)
```
```diff
@@ -237,8 +261,12 @@ class NPUPlatform(Platform):
                     f"{vllm_config.parallel_config.tensor_parallel_size}")
                 if len(sp_aclgraph_sizes) != len(original_sizes):
                     compilation_config.cudagraph_capture_sizes = sp_aclgraph_sizes
-                    vllm_config.compilation_config.init_with_cudagraph_sizes(
-                        sp_aclgraph_sizes)
+                    if vllm_version_is("0.11.0"):
+                        compilation_config.init_with_cudagraph_sizes(
+                            sp_aclgraph_sizes)
+                    else:
+                        update_cudagraph_capture_sizes(vllm_config,
+                                                       sp_aclgraph_sizes)
 
         # TODO: Full graph is fully supported later, and the default value will be set to full graph.
         if compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE:
```
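
On v0.11.0 the sequence-parallel capture sizes are re-applied through ```CompilationConfig.init_with_cudagraph_sizes```; on newest main that entry point is gone, so the patch routes through the new ```update_cudagraph_capture_sizes``` util imported in the first hunk, which takes the whole ```VllmConfig```. A rough sketch of the shape of such a helper, assuming it only needs to overwrite the already-initialized capture sizes (the real implementation lives in ```vllm_ascend/utils.py``` and may do more):

```python
def update_cudagraph_capture_sizes(vllm_config, capture_sizes):
    # Hypothetical sketch, not the vllm_ascend implementation: re-apply a
    # new list of capture sizes after VllmConfig has already been
    # post-initialized, since newest main no longer exposes
    # CompilationConfig.init_with_cudagraph_sizes for this.
    compilation_config = vllm_config.compilation_config
    compilation_config.cudagraph_capture_sizes = list(capture_sizes)
    # Any derived fields computed from the old sizes would need to be
    # refreshed here too; this sketch shows only the top-level update.
```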