Drop 0.11.0 support (#4377)

There is a lot of hack code for v0.11.0, which makes the codebase hard to
upgrade to newer vLLM versions. Since v0.11.2 will be released soon, let's
drop v0.11.0 support first; we'll then upgrade to v0.11.2.
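
For illustration, this is the version-gating pattern being removed (a minimal sketch based on the platform.py hunks below, not a literal copy of any single call site):

    # Before: each call site branches on the installed vLLM version.
    from vllm_ascend.utils import vllm_version_is

    if vllm_version_is("0.11.0"):
        from vllm.config import CompilationLevel   # 0.11.0-only API
    else:
        from vllm.config import CompilationMode    # vLLM main API

    # After: the 0.11.0 branch is deleted and only the vLLM main path remains.
    from vllm.config import CompilationMode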


- vLLM version: v0.11.0
- vLLM main: 2918c1b49c

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Authored by wangxiyuan on 2025-11-24 17:08:20 +08:00; committed by GitHub
parent 41ddb06554, commit a1f142b7ad
80 changed files with 467 additions and 1755 deletions


@@ -34,7 +34,7 @@ from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, enable_sp, is_310p,
                                is_vl_model, prefill_context_parallel_enable,
                                update_aclgraph_sizes,
                                update_cudagraph_capture_sizes,
-                               update_default_aclgraph_sizes, vllm_version_is)
+                               update_default_aclgraph_sizes)
 if TYPE_CHECKING:
     from vllm.config import ModelConfig, VllmConfig
@@ -120,10 +120,7 @@ class NPUPlatform(Platform):
         # initialize ascend config from vllm additional_config
         ascend_config = init_ascend_config(vllm_config)
-        if vllm_version_is("0.11.0"):
-            from vllm.config import CompilationLevel
-        else:
-            from vllm.config import CompilationMode  # noqa: E402
+        from vllm.config import CompilationMode  # noqa: E402
         compilation_config = vllm_config.compilation_config
         model_config = vllm_config.model_config
@@ -149,29 +146,17 @@ class NPUPlatform(Platform):
         from vllm.config.compilation import CUDAGraphMode
         if enforce_eager:
             logger.info("Compilation disabled, using eager mode by default")
-            if vllm_version_is("0.11.0"):
-                compilation_config.level = CompilationLevel.NO_COMPILATION
-            else:
-                compilation_config.mode = CompilationMode.NONE
+            compilation_config.mode = CompilationMode.NONE
         compilation_config.cudagraph_num_of_warmups = 1
-        if vllm_version_is("0.11.0"):
-            if compilation_config.level not in [
-                    CompilationLevel.NO_COMPILATION, CompilationLevel.PIECEWISE
-            ]:
-                logger.warning(
-                    "NPU does not support %s compilation level. Setting CUDAGraphMode to NONE",
-                    compilation_config.level)
-                compilation_config.cudagraph_mode = CUDAGraphMode.NONE
-        else:
-            if compilation_config.mode not in [
-                    CompilationMode.NONE, CompilationMode.VLLM_COMPILE
-            ]:
-                logger.warning(
-                    "NPU does not support %s compilation mode. Setting CUDAGraphMode to NONE",
-                    compilation_config.mode)
-                compilation_config.cudagraph_mode = CUDAGraphMode.NONE
+        if compilation_config.mode not in [
+                CompilationMode.NONE, CompilationMode.VLLM_COMPILE
+        ]:
+            logger.warning(
+                "NPU does not support %s compilation mode. Setting CUDAGraphMode to NONE",
+                compilation_config.mode)
+            compilation_config.cudagraph_mode = CUDAGraphMode.NONE
         # set CUDAGraphMode to None when torchair is enabled, no mather what compilation_config.level is.
         if ascend_config.torchair_graph_config.enabled:
@@ -211,96 +196,49 @@ class NPUPlatform(Platform):
                 f"{vllm_config.parallel_config.tensor_parallel_size}")
             if len(sp_aclgraph_sizes) != len(original_sizes):
                 compilation_config.cudagraph_capture_sizes = sp_aclgraph_sizes
-                if vllm_version_is("0.11.0"):
-                    compilation_config.init_with_cudagraph_sizes(
-                        sp_aclgraph_sizes)
-                else:
-                    update_cudagraph_capture_sizes(vllm_config,
-                                                   sp_aclgraph_sizes)
+                update_cudagraph_capture_sizes(vllm_config, sp_aclgraph_sizes)
         # TODO: Full graph is fully supported later, and the default value will be set to full graph.
         if compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE:
             compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
-        if vllm_version_is("0.11.0"):
-            if compilation_config.cudagraph_mode == CUDAGraphMode.NONE:
-                compilation_config.level = CompilationLevel.NO_COMPILATION
-            elif compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE:
-                logger.info(
-                    "PIECEWISE compilation enabled on NPU. use_inductor not supported - "
-                    "using only ACL Graph mode")
-                assert compilation_config.level == CompilationLevel.PIECEWISE, \
-                    "When enabling piecewise aclgraph, please make sure compilation_config.level == CompilationLevel.PIECEWISE and compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE"
-                compilation_config.set_splitting_ops_for_v1()
-                compilation_config.use_inductor = False
-                compilation_config.splitting_ops.extend([
-                    "vllm.unified_ascend_attention_with_output",
-                    "vllm.mla_forward"
-                ])
-                update_aclgraph_sizes(vllm_config)
-            elif compilation_config.cudagraph_mode == CUDAGraphMode.FULL_DECODE_ONLY or\
-                    compilation_config.cudagraph_mode == CUDAGraphMode.FULL:
-                logger.info(
-                    "FULL_DECODE_ONLY compilation enabled on NPU. use_inductor not supported - "
-                    "using only ACL Graph mode")
-                compilation_config.use_inductor = False
-                warning_message = """\033[91m
-**********************************************************************************
-* WARNING: You have enabled the *full graph* feature.
-* This is an early experimental stage and may involve various unknown issues.
-* A known problem is that capturing too many batch sizes can lead to OOM
-* (Out of Memory) errors or inference hangs. If you encounter such issues,
-* consider reducing `gpu_memory_utilization` or manually specifying a smaller
-* batch size for graph capture.
-* For more details, please refer to:
-* https://docs.vllm.ai/en/stable/configuration/conserving_memory.html#reduce-cuda-graphs
-**********************************************************************************\033[0m
-"""
-                logger.warning(warning_message)
-            else:
-                logger.info(
-                    "%s cudagraph_mode is not support on NPU. falling back to NONE",
-                    compilation_config.cudagraph_mode)
-                compilation_config.cudagraph_mode = CUDAGraphMode.NONE
-                compilation_config.level = CompilationLevel.NO_COMPILATION
+        if compilation_config.cudagraph_mode == CUDAGraphMode.NONE:
+            compilation_config.mode = CompilationMode.NONE
+        elif compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE:
+            logger.info(
+                "PIECEWISE compilation enabled on NPU. use_inductor not supported - "
+                "using only ACL Graph mode")
+            assert compilation_config.mode == CompilationMode.VLLM_COMPILE, \
+                "When enabling VLLM_COMPILE aclgraph, please make sure compilation_config.mode == CompilationMode.VLLM_COMPILE and compilation_config.cudagraph_mode == CUDAGraphMode.VLLM_COMPILE"
+            compilation_config.set_splitting_ops_for_v1()
+            compilation_config.use_inductor = False
+            compilation_config.splitting_ops.extend(["vllm::mla_forward"])
+            update_aclgraph_sizes(vllm_config)
+        elif compilation_config.cudagraph_mode == CUDAGraphMode.FULL_DECODE_ONLY or\
+                compilation_config.cudagraph_mode == CUDAGraphMode.FULL:
+            logger.info(
+                "FULL_DECODE_ONLY compilation enabled on NPU. use_inductor not supported - "
+                "using only ACL Graph mode")
+            compilation_config.use_inductor = False
+            warning_message = """\033[91m
+**********************************************************************************
+* WARNING: You have enabled the *full graph* feature.
+* This is an early experimental stage and may involve various unknown issues.
+* A known problem is that capturing too many batch sizes can lead to OOM
+* (Out of Memory) errors or inference hangs. If you encounter such issues,
+* consider reducing `gpu_memory_utilization` or manually specifying a smaller
+* batch size for graph capture.
+* For more details, please refer to:
+* https://docs.vllm.ai/en/stable/configuration/conserving_memory.html#reduce-cuda-graphs
+**********************************************************************************\033[0m
+"""
+            logger.warning(warning_message)
         else:
-            if compilation_config.cudagraph_mode == CUDAGraphMode.NONE:
-                compilation_config.mode = CompilationMode.NONE
-            elif compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE:
-                logger.info(
-                    "PIECEWISE compilation enabled on NPU. use_inductor not supported - "
-                    "using only ACL Graph mode")
-                assert compilation_config.mode == CompilationMode.VLLM_COMPILE, \
-                    "When enabling VLLM_COMPILE aclgraph, please make sure compilation_config.mode == CompilationMode.VLLM_COMPILE and compilation_config.cudagraph_mode == CUDAGraphMode.VLLM_COMPILE"
-                compilation_config.set_splitting_ops_for_v1()
-                compilation_config.use_inductor = False
-                compilation_config.splitting_ops.extend(["vllm::mla_forward"])
-                update_aclgraph_sizes(vllm_config)
-            elif compilation_config.cudagraph_mode == CUDAGraphMode.FULL_DECODE_ONLY or\
-                    compilation_config.cudagraph_mode == CUDAGraphMode.FULL:
-                logger.info(
-                    "FULL_DECODE_ONLY compilation enabled on NPU. use_inductor not supported - "
-                    "using only ACL Graph mode")
-                compilation_config.use_inductor = False
-                warning_message = """\033[91m
-**********************************************************************************
-* WARNING: You have enabled the *full graph* feature.
-* This is an early experimental stage and may involve various unknown issues.
-* A known problem is that capturing too many batch sizes can lead to OOM
-* (Out of Memory) errors or inference hangs. If you encounter such issues,
-* consider reducing `gpu_memory_utilization` or manually specifying a smaller
-* batch size for graph capture.
-* For more details, please refer to:
-* https://docs.vllm.ai/en/stable/configuration/conserving_memory.html#reduce-cuda-graphs
-**********************************************************************************\033[0m
-"""
-                logger.warning(warning_message)
-            else:
-                logger.info(
-                    "%s cudagraph_mode is not support on NPU. falling back to NONE",
-                    compilation_config.cudagraph_mode)
-                compilation_config.cudagraph_mode = CUDAGraphMode.NONE
-                compilation_config.mode = CompilationMode.NONE
+            logger.info(
+                "%s cudagraph_mode is not support on NPU. falling back to NONE",
+                compilation_config.cudagraph_mode)
+            compilation_config.cudagraph_mode = CUDAGraphMode.NONE
+            compilation_config.mode = CompilationMode.NONE
         # TODO: Remove this check when ACL Graph supports ASCEND_LAUNCH_BLOCKING=1
         # Then, we will have to discuss the error handling strategy and user experience
@@ -315,10 +253,7 @@ class NPUPlatform(Platform):
         if parallel_config and parallel_config.worker_cls == "auto":
             # TODO: this is a tricky way to disable `use_sequence_parallel_moe` in vllm.
-            if vllm_version_is("0.11.0"):
-                os.environ["VLLM_ALL2ALL_BACKEND"] = "flashinfer_all2allv"
-            else:
-                parallel_config.all2all_backend = "flashinfer_all2allv"
+            parallel_config.all2all_backend = "flashinfer_all2allv"
             if ascend_config.torchair_graph_config.enabled or ascend_config.enable_shared_expert_dp:
                 parallel_config.worker_cls = "vllm_ascend.torchair.torchair_worker.NPUTorchairWorker"
             else:
@@ -443,10 +378,7 @@ class NPUPlatform(Platform):
     @classmethod
     def get_punica_wrapper(cls) -> str:
-        if vllm_version_is("0.11.0"):
-            return "vllm_ascend.lora.punica_npu.PunicaWrapperNPU0110"
-        else:
-            return "vllm_ascend.lora.punica_npu.PunicaWrapperNPU"
+        return "vllm_ascend.lora.punica_npu.PunicaWrapperNPU"
     @classmethod
     def get_current_memory_usage(cls,