init v0.11.0rc0

2025-10-14 10:38:28 +08:00
parent 67afd0ea78
commit 66dc16f966
278 changed files with 28130 additions and 11708 deletions
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -16,6 +16,7 @@
 #

 import gc
+import os
 from datetime import timedelta
 from typing import TYPE_CHECKING, Optional, Tuple

@@ -31,7 +32,7 @@ from vllm_ascend.ascend_config import (check_ascend_config, get_ascend_config,
 from vllm_ascend.torchair.utils import (check_torchair_cache_exist,
                                        delete_torchair_cache_file)
 from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, is_310p,
-                               update_aclgraph_sizes)
+                               update_aclgraph_sizes, vllm_version_is)

 if TYPE_CHECKING:
    from vllm.config import ModelConfig, VllmConfig
@@ -128,11 +129,43 @@ class NPUPlatform(Platform):
        model_config = vllm_config.model_config
        parallel_config = vllm_config.parallel_config
        cache_config = vllm_config.cache_config
+        scheduler_config = vllm_config.scheduler_config
+        ascend_scheduler_config = ascend_config.ascend_scheduler_config
+        if vllm_version_is("0.10.2"):
+            structured_outputs_config = vllm_config.decoding_config
+        else:
+            structured_outputs_config = vllm_config.structured_outputs_config
+
+        if (model_config is not None and not model_config.use_mla
+                and not scheduler_config.async_scheduling):
+            logger.info(
+                "Non-MLA LLMs forcibly disable the chunked prefill feature,"
+                "as the performance of operators supporting this feature "
+                "functionality is currently suboptimal.")
+            if not model_config.is_multimodal_model and \
+                structured_outputs_config.backend == "auto" and \
+                not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
+                not scheduler_config.send_delta_data and \
+                scheduler_config.policy == "fcfs":
+                ascend_scheduler_config.enabled = True
+                chunked_prefill_enabled_in_ascend_scheduler = getattr(
+                    ascend_scheduler_config, "enable_chunked_prefill", False)
+                if chunked_prefill_enabled_in_ascend_scheduler:
+                    logger.warning(
+                        "Chunked prefill feature is enabled in ascend_scheduler,"
+                        "but note that the operator supporting this feature "
+                        "would lead to performance degradation.")
+                # In this situation, max_num_batched_tokens would have been rewritten.
+                # So we must make sure max_num_batched_tokens is not smaller than max_model_len.
+                if (scheduler_config.max_num_batched_tokens
+                        < scheduler_config.max_model_len
+                        and not chunked_prefill_enabled_in_ascend_scheduler):
+                    scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len
+
        kv_cache_dtype = vllm_config.additional_config.get(
            "kv_cache_dtype", None)
        if kv_cache_dtype is not None:
            vllm_config.cache_config.cache_dtype = kv_cache_dtype
-
        if model_config is None:
            logger.warning("Model config is missing. This may indicate "
                           "that we are running a test case")
@@ -148,23 +181,13 @@ class NPUPlatform(Platform):

        compilation_config.cudagraph_num_of_warmups = 1

-        # TODO: make vllm support oot platform to set `compilation_config.cudagraph_mode`
-        # if cudagraph_mode is not explicitly set by users, set default value
-        if compilation_config.level == CompilationLevel.PIECEWISE:
-            compilation_config.cudagraph_mode = \
-                CUDAGraphMode.PIECEWISE
-        elif compilation_config.level not in [
+        if compilation_config.level not in [
                CompilationLevel.NO_COMPILATION, CompilationLevel.PIECEWISE
        ]:
            logger.warning(
                "NPU does not support %s compilation level. Setting CUDAGraphMode to NONE",
                compilation_config.level)
            compilation_config.cudagraph_mode = CUDAGraphMode.NONE
-        else:
-            logger.warning(
-                "compilation_config.level = CompilationLevel.NO_COMPILATION is set, Setting CUDAGraphMode to NONE"
-            )
-            compilation_config.cudagraph_mode = CUDAGraphMode.NONE

        # set CUDAGraphMode to None when torchair is enabled, no mather what compilation_config.level is.
        if ascend_config.torchair_graph_config.enabled:
@@ -185,18 +208,22 @@ class NPUPlatform(Platform):
                    "and use_cached_kv_cache_bytes in torchair_graph_config.")
                delete_torchair_cache_file()

-        if parallel_config.distributed_executor_backend == "ray":
-            logger.warning(
-                "Ray distributed executor backend is not compatible with ACL Graph mode "
-                "right now. Setting CUDAGraphMode to NONE")
-            compilation_config.cudagraph_mode = CUDAGraphMode.NONE
-
        # set cudaprah sizes before extending `compilation_config.splitting_ops`
        vllm_config._set_cudagraph_sizes()

+        # TODO: Once full graph mode is fully supported, make it the default instead of downgrading FULL_AND_PIECEWISE to PIECEWISE.
+        if not vllm_version_is("0.10.2"):
+            if compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE:
+                compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
+
        if compilation_config.cudagraph_mode == CUDAGraphMode.NONE:
            compilation_config.level = CompilationLevel.NO_COMPILATION
-        elif compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE:
+        # TODO: MLA does not support FULL_DECODE_ONLY yet, so MLA models take the PIECEWISE path;
+        # remove the second condition once MLA supports it.
+        elif compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE or (
+                compilation_config.cudagraph_mode
+                == CUDAGraphMode.FULL_DECODE_ONLY and model_config is not None
+                and model_config.use_mla):
            logger.info(
                "PIECEWISE compilation enabled on NPU. use_inductor not supported - "
                "using only ACL Graph mode")
@@ -204,9 +231,28 @@ class NPUPlatform(Platform):
                "When enabling piecewise aclgraph, please make sure compilation_config.level == CompilationLevel.PIECEWISE and compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE"
            compilation_config.set_splitting_ops_for_v1()
            compilation_config.use_inductor = False
-            compilation_config.splitting_ops.extend(
-                ["vllm.unified_ascend_attention_with_output"])
+            compilation_config.splitting_ops.extend([
+                "vllm.unified_ascend_attention_with_output", "vllm.mla_forward"
+            ])
            update_aclgraph_sizes(vllm_config)
+        elif compilation_config.cudagraph_mode == CUDAGraphMode.FULL_DECODE_ONLY:
+            logger.info(
+                "FULL_DECODE_ONLY compilation enabled on NPU. use_inductor not supported - "
+                "using only ACL Graph mode")
+            compilation_config.use_inductor = False
+            warning_message = """\033[91m
+            **********************************************************************************
+            * WARNING: You have enabled the *full graph* feature.
+            * This is an early experimental stage and may involve various unknown issues.
+            * A known problem is that capturing too many batch sizes can lead to OOM
+            * (Out of Memory) errors or inference hangs. If you encounter such issues,
+            * consider reducing `gpu_memory_utilization` or manually specifying a smaller
+            * batch size for graph capture.
+            * For more details, please refer to:
+            * https://docs.vllm.ai/en/stable/configuration/conserving_memory.html#reduce-cuda-graphs
+            **********************************************************************************\033[0m
+            """
+            logger.warning(warning_message)
        else:
            logger.info(
                "%s cudagraph_mode is not support on NPU. falling back to NONE",
@@ -215,7 +261,9 @@ class NPUPlatform(Platform):
            compilation_config.level = CompilationLevel.NO_COMPILATION

        if parallel_config and parallel_config.worker_cls == "auto":
-            if ascend_config.torchair_graph_config.enabled:
+            # TODO: overriding VLLM_ALL2ALL_BACKEND here is a workaround to disable `use_sequence_parallel_moe` in vLLM.
+            os.environ["VLLM_ALL2ALL_BACKEND"] = "flashinfer_all2allv"
+            if ascend_config.torchair_graph_config.enabled or ascend_config.enable_shared_expert_dp:
                parallel_config.worker_cls = "vllm_ascend.torchair.torchair_worker.NPUTorchairWorker"
            else:
                parallel_config.worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker"
@@ -223,6 +271,7 @@ class NPUPlatform(Platform):
        if cache_config:
            if cache_config.block_size is None:
                cache_config.block_size = 128
+
            if cache_config.enable_prefix_caching and cache_config.block_size != 128:
                logger.warning(
                    "If prefix caching is enabled, block size must be set to 128."
@@ -242,12 +291,6 @@ class NPUPlatform(Platform):
                ascend_config.ascend_scheduler_config)
            vllm_config.scheduler_config = ascend_scheduler_config

-        if compilation_config.pass_config.enable_sequence_parallelism:
-            if not parallel_config.enable_expert_parallel or vllm_config.model_config.hf_config.model_type != "qwen3_moe":
-                raise NotImplementedError(
-                    "For better performance in Qwen3 MoE, SP only works exclusively with MC2, AllToAll, and AllToAllV."
-                )
-
    @classmethod
    def get_attn_backend_cls(cls,
                             selected_backend,
@@ -257,27 +300,40 @@ class NPUPlatform(Platform):
                             block_size,
                             use_v1,
                             use_mla,
+                             use_sfa,
                             has_sink=False):
        if not use_v1:
            raise ValueError("vLLM Ascend does not support V0 engine.")

-        use_torchair = get_ascend_config().torchair_graph_config.enabled
+        ascend_config = get_ascend_config()
+
+        if use_mla and ascend_config.enable_shared_expert_dp:
+            if use_mla and not use_sfa:
+                return "vllm_ascend.torchair.torchair_mla.AscendMLATorchairBackend"
+            if use_mla and use_sfa:
+                return "vllm_ascend.torchair.torchair_sfa.AscendSFATorchairBackend"
+
+        use_torchair = ascend_config.torchair_graph_config.enabled
        # choose attention backend based on use_mla and use_torchair
        backend_map = {
-            (True, True):
+            (True, False, True):
            "vllm_ascend.torchair.torchair_mla.AscendMLATorchairBackend",
-            (True, False):
+            (True, False, False):
            "vllm_ascend.attention.mla_v1.AscendMLABackend",
-            (False, True):
+            (False, False, True):
            "vllm_ascend.torchair.torchair_attention.AscendAttentionTorchairBackend",
-            (False, False):
-            "vllm_ascend.attention.attention_v1.AscendAttentionBackend"
+            (False, False, False):
+            "vllm_ascend.attention.attention_v1.AscendAttentionBackend",
+            (True, True, False):
+            "vllm_ascend.attention.sfa_v1.AscendSFABackend",
+            (True, True, True):
+            "vllm_ascend.torchair.torchair_sfa.AscendSFATorchairBackend",
        }
-        return backend_map[(use_mla, use_torchair)]
+        return backend_map[(use_mla, use_sfa, use_torchair)]

    @classmethod
    def get_punica_wrapper(cls) -> str:
-        return "vllm_ascend.lora.punica_wrapper.punica_npu.PunicaWrapperNPU"
+        return "vllm_ascend.lora.punica_npu.PunicaWrapperNPU"

    @classmethod
    def get_current_memory_usage(cls,
@@ -343,3 +399,11 @@ class NPUPlatform(Platform):

        pg._register_backend(device, backend_type, backend_class)
        return pg
+
+    @classmethod
+    def support_hybrid_kv_cache(cls) -> bool:
+        return True
+
+    @classmethod
+    def support_static_graph_mode(cls) -> bool:
+        return True