Drop 0.11.0 support (#4377)

There is a lot hack code for v0.11.0, which makes the code hard to upgrade to newer vLLM version. Since v0.11.0 will release soon. Let's drop v0.11.0 support first. Then we'll upgrade to v0.11.2 soon. - vLLM version: v0.11.0 - vLLM main: 2918c1b49c Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-11-24 17:08:20 +08:00
parent 41ddb06554
commit a1f142b7ad
80 changed files with 467 additions and 1755 deletions
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -412,33 +412,21 @@ def _is_default_capture_sizes(vllm_config: VllmConfig) -> bool:
    Check whether it is vLLM default capture sizes.
    """

-    if vllm_version_is("0.11.0"):
-        cuda_graph_sizes = vllm_config.scheduler_config.cuda_graph_sizes
-        if len(cuda_graph_sizes) == 1:
-            cudagraph_capture_sizes = [1, 2, 4] + [
-                i for i in range(8, cuda_graph_sizes[0] + 1, 8)
-            ]
-    else:
-        max_cudagraph_capture_size = \
-            vllm_config.compilation_config.max_cudagraph_capture_size
-        cudagraph_capture_sizes = [
-            i for i in [1, 2, 4] if i <= max_cudagraph_capture_size
-        ]
-        if max_cudagraph_capture_size >= 8:
-            # Step size 8 for small batch sizes, up to 256(not included)
-            cudagraph_capture_sizes += list(
-                range(8, min(max_cudagraph_capture_size + 1, 256), 8))
-        if max_cudagraph_capture_size >= 256:
-            # Step size 16 for larger batch sizes
-            cudagraph_capture_sizes += list(
-                range(256, max_cudagraph_capture_size + 1, 16))
-
-    if vllm_version_is("0.11.0"):
-        target_cudagraph_capture_sizes = sorted(cudagraph_capture_sizes,
-                                                reverse=True)
-    else:
-        # in newer version, vVLLM use ascending order of cudagraph_capture_sizes.
-        target_cudagraph_capture_sizes = sorted(cudagraph_capture_sizes)
+    max_cudagraph_capture_size = \
+        vllm_config.compilation_config.max_cudagraph_capture_size
+    cudagraph_capture_sizes = [
+        i for i in [1, 2, 4] if i <= max_cudagraph_capture_size
+    ]
+    if max_cudagraph_capture_size >= 8:
+        # Step size 8 for small batch sizes, up to 256(not included)
+        cudagraph_capture_sizes += list(
+            range(8, min(max_cudagraph_capture_size + 1, 256), 8))
+    if max_cudagraph_capture_size >= 256:
+        # Step size 16 for larger batch sizes
+        cudagraph_capture_sizes += list(
+            range(256, max_cudagraph_capture_size + 1, 16))
+    # in newer version, vLLM use ascending order of cudagraph_capture_sizes.
+    target_cudagraph_capture_sizes = sorted(cudagraph_capture_sizes)
    if target_cudagraph_capture_sizes == \
            vllm_config.compilation_config.cudagraph_capture_sizes:
        return True
@@ -465,21 +453,13 @@ def update_default_aclgraph_sizes(vllm_config: VllmConfig) -> None:
    if vllm_config.model_config and vllm_config.model_config.hf_config.model_type == "qwen3_moe" \
        and vllm_config.parallel_config.tensor_parallel_size == 1 \
        and vllm_config.parallel_config.data_parallel_size > 1 :
-        if vllm_version_is("0.11.0"):
-            max_capture_size = vllm_config.scheduler_config.cuda_graph_sizes[0]
-        else:
-            max_capture_size = vllm_config.compilation_config.max_cudagraph_capture_size
+
+        max_capture_size = vllm_config.compilation_config.max_cudagraph_capture_size
        new_cudagraph_capture_sizes = [1, 2, 5, 10, 15, 20] + [
            i for i in range(24, max_capture_size + 1, 8)
        ]
-
-        if vllm_version_is("0.11.0"):
-            vllm_config.compilation_config.cudagraph_capture_sizes = new_cudagraph_capture_sizes
-            vllm_config.compilation_config.init_with_cudagraph_sizes(
-                new_cudagraph_capture_sizes)
-        else:
-            update_cudagraph_capture_sizes(vllm_config,
-                                           new_cudagraph_capture_sizes)
+        update_cudagraph_capture_sizes(vllm_config,
+                                       new_cudagraph_capture_sizes)


 def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
@@ -573,10 +553,7 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
        indices[0], indices[-1] = 0, len(original_sizes) - 1

        sampled_sizes = [original_sizes[i] for i in indices]
-        if vllm_version_is("0.11.0"):
-            compilation_config.init_with_cudagraph_sizes(sampled_sizes)
-        else:
-            update_cudagraph_capture_sizes(vllm_config, sampled_sizes)
+        update_cudagraph_capture_sizes(vllm_config, sampled_sizes)

        logger.info(
            "Adjusted ACL graph batch sizes for %s model (layers: %d): %d → %d sizes",
@@ -607,10 +584,7 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
        if original_sizes[0] < (num_speculative_tokens + 1) * max_num_seqs:
            enlarged_sizes = [(num_speculative_tokens + 1) * size
                              for size in original_sizes]
-            if vllm_version_is("0.11.0"):
-                compilation_config.init_with_cudagraph_sizes(enlarged_sizes)
-            else:
-                update_cudagraph_capture_sizes(vllm_config, enlarged_sizes)
+            update_cudagraph_capture_sizes(vllm_config, enlarged_sizes)
            logger.info(
                "Adjusted ACL graphs: %s → %s for speculative decoding",
                original_sizes, enlarged_sizes)
@@ -719,11 +693,8 @@ def register_ascend_customop(vllm_config: Optional[VllmConfig] = None):
        "GemmaRMSNorm": AscendGemmaRMSNorm,
        "FusedMoE": AscendFusedMoE,
        "SharedFusedMoE": AscendSharedFusedMoE,
+        "MultiHeadLatentAttentionWrapper": AscendMultiHeadLatentAttention,
    }
-    mla_to_register = "MultiHeadLatentAttention" if vllm_version_is(
-        "0.11.0") else "MultiHeadLatentAttentionWrapper"
-    if vllm_config and vllm_config.model_config and vllm_config.model_config.use_mla:
-        REGISTERED_ASCEND_OPS[mla_to_register] = AscendMultiHeadLatentAttention

    for name, op_cls in REGISTERED_ASCEND_OPS.items():
        CustomOp.register_oot(_decorated_op_cls=op_cls, name=name)