init v0.11.0rc0

2025-10-14 10:38:28 +08:00
parent 67afd0ea78
commit 66dc16f966
278 changed files with 28130 additions and 11708 deletions
--- a/vllm_ascend/torchair/torchair_model_runner.py
+++ b/vllm_ascend/torchair/torchair_model_runner.py
@@ -17,6 +17,7 @@
 # Adapted from vllm-project/vllm/vllm/worker/gpu_model_runner.py
 # isort: skip_file

+import math
 import types
 from typing import Optional

@@ -24,7 +25,6 @@ import torch
 import torch.distributed as dist
 import torch.nn as nn
 import torch_npu
-import vllm.envs as envs_vllm
 from vllm.config import VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.distributed.parallel_state import get_dp_group
@@ -40,25 +40,39 @@ from vllm_ascend.torchair.utils import (
    register_torchair_model, torchair_ops_patch,
    torchair_quant_method_register, write_kv_cache_bytes_to_file)
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
-                               is_310p)
+                               is_310p, get_ascend_soc_version,
+                               AscendSocVersion)
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner


 class NPUTorchairModelRunner(NPUModelRunner):

    def __init__(self, vllm_config: VllmConfig, device: torch.device):
+        self.ascend_config = get_ascend_config()
+        self.enable_shared_expert_dp = self.ascend_config.enable_shared_expert_dp
        super().__init__(vllm_config, device)
-        ascend_config = get_ascend_config()
+        if self.speculative_config:
+            self.actual_seq_lengths_q = list(
+                range(self.decode_token_per_req, self.max_num_tokens + 1,
+                      self.decode_token_per_req))
+        self.attn_metadata_builder = self.attn_backend.get_builder_cls()(
+            None, None, vllm_config, device)
+
+        register_torchair_model()
+        torchair_ops_patch()
+        torchair_quant_method_register()
+        if self.enable_shared_expert_dp:
+            return
        self.new_kv_cache_bytes = -1
        self.torchair_compiled_model = None  # type: ignore
        self.torchair_compiled_models = {}  # type: ignore
-        self.use_cached_npu_graph = ascend_config.torchair_graph_config.use_cached_graph
-        self.use_cached_kv_cache_bytes = ascend_config.torchair_graph_config.use_cached_kv_cache_bytes
-        self.torchair_graph_batch_sizes = ascend_config.torchair_graph_config.graph_batch_sizes
-        if ascend_config.torchair_graph_config.graph_batch_sizes_init:
+        self.use_cached_npu_graph = self.ascend_config.torchair_graph_config.use_cached_graph
+        self.use_cached_kv_cache_bytes = self.ascend_config.torchair_graph_config.use_cached_kv_cache_bytes
+        self.torchair_graph_batch_sizes = self.ascend_config.torchair_graph_config.graph_batch_sizes
+        if self.ascend_config.torchair_graph_config.graph_batch_sizes_init:
            self.init_torchair_graph_batch_sizes()

-        self.check_torchair_graph_batch_sizes()
+        self.update_torchair_graph_batch_sizes()

        torch._dynamo.cache_size.config.cache_size_limit += len(
            self.torchair_graph_batch_sizes)
@@ -67,14 +81,14 @@ class NPUTorchairModelRunner(NPUModelRunner):
            recompiles=envs_ascend.VLLM_ASCEND_TRACE_RECOMPILES)

        self._check_batch_sizes_consistency()
-        register_torchair_model()
-        torchair_ops_patch()
-        torchair_quant_method_register()

    def _sync_metadata_across_dp(
            self, num_tokens: int, with_prefill: bool, enable_dbo: bool
    ) -> tuple[int, Optional[torch.Tensor], bool, bool]:
        """Override from NPUModelRunner to pad num_tokens"""
+        if self.enable_shared_expert_dp:
+            # Padding is not required for shared_expert_dp cases in eager mode.
+            return num_tokens, None, with_prefill, enable_dbo
        if self.dp_size == 1:
            if not with_prefill:
                maybe_padded_num_tokens = self.select_torchair_padded_batch_size(
@@ -107,10 +121,15 @@ class NPUTorchairModelRunner(NPUModelRunner):

        return maybe_padded_num_tokens, num_tokens_across_dp, with_prefill, enable_dbo

-    def _build_attention_metadata(self, with_prefill, num_reqs, skip_attn):
+    def _build_attention_metadata(self, with_prefill, num_reqs, num_tokens,
+                                  max_query_len, force_attention):
        # NOTE: If torchair graph mode and not with_prefill,
        # we can't skip_attn, it will cause graph recompile.
-        if not with_prefill:
+        if with_prefill or self.enable_shared_expert_dp:
+            attn_metadata = super()._build_attention_metadata(
+                with_prefill, num_reqs, num_tokens, max_query_len,
+                force_attention)
+        else:
            common_attn_metadata = TorchairCommonAttentionMetadata(
                num_reqs=num_reqs,
                num_actual_tokens=1,
@@ -121,17 +140,19 @@ class NPUTorchairModelRunner(NPUModelRunner):
            )
            attn_metadata = self.attn_metadata_builder.build_torchair_graph_dummy(
                common_attn_metadata)
-        else:
-            attn_metadata = super()._build_attention_metadata(
-                with_prefill, num_reqs, skip_attn)
        return attn_metadata

    def _generate_dummy_run_hidden_states(self, with_prefill,
                                          is_torchair_compile, input_ids,
                                          positions, attn_metadata, num_tokens,
                                          intermediate_tensors, inputs_embeds):
-
-        if not with_prefill:
+        if with_prefill or self.enable_shared_expert_dp:
+            if is_310p():
+                converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND)
+            hidden_states = super()._generate_dummy_run_hidden_states(
+                with_prefill, is_torchair_compile, input_ids, positions,
+                attn_metadata, num_tokens, intermediate_tensors, inputs_embeds)
+        else:
            # Only mark static while compiling
            if is_torchair_compile:
                torch._dynamo.mark_static(input_ids)
@@ -163,15 +184,11 @@ class NPUTorchairModelRunner(NPUModelRunner):
                inputs_embeds=None,
                **model_kwargs,
            )
-        else:
-            if is_310p():
-                converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND)
-            hidden_states = super()._generate_dummy_run_hidden_states(
-                with_prefill, is_torchair_compile, input_ids, positions,
-                attn_metadata, num_tokens, intermediate_tensors, inputs_embeds)
        return hidden_states

    def _convert_torch_format(self, kv_cache):
+        if self.enable_shared_expert_dp:
+            return super()._convert_torch_format(kv_cache)
        kv_cache = torch_npu.npu_format_cast(kv_cache, ACL_FORMAT_FRACTAL_ND)
        return kv_cache

@@ -189,6 +206,8 @@ class NPUTorchairModelRunner(NPUModelRunner):

    def _capture_model(self):
        """Override from NPUModelRunner to use torchair graph capture."""
+        if self.enable_shared_expert_dp:
+            return super()._capture_model()
        # TODO(NeverRaR): Calling graph_capture(device=self.device) in
        # torchair graph capture can cause some issues, so now we just
        # temporarily split the codepath for the two different graph patterns.
@@ -228,6 +247,8 @@ class NPUTorchairModelRunner(NPUModelRunner):
                                         self.new_kv_cache_bytes)

    def _use_aclgraph(self) -> bool:
+        if self.enable_shared_expert_dp:
+            return super()._use_aclgraph()
        return False

    def _check_batch_sizes_consistency(self) -> None:
@@ -253,10 +274,10 @@ class NPUTorchairModelRunner(NPUModelRunner):
            )

    def _update_graph_pad_size(self, with_prefill, graph_pad_size):
-        if not with_prefill:
-            self.graph_pad_size = graph_pad_size
-        else:
+        if with_prefill or self.enable_shared_expert_dp:
            super()._update_graph_pad_size(with_prefill, graph_pad_size)
+        else:
+            self.graph_pad_size = graph_pad_size

    def _update_input_ids_and_positions(self, input_ids, positions,
                                        num_input_tokens, with_prefill,
@@ -266,7 +287,9 @@ class NPUTorchairModelRunner(NPUModelRunner):
            input_ids, positions, num_input_tokens, with_prefill,
            padded_num_tokens_across_dp)

-        if not with_prefill:
+        if with_prefill or self.enable_shared_expert_dp:
+            return input_ids, positions
+        else:
            input_ids = self.input_ids[:padded_num_tokens_across_dp]
            positions = self.positions[:padded_num_tokens_across_dp]
        return input_ids, positions
@@ -276,6 +299,13 @@ class NPUTorchairModelRunner(NPUModelRunner):
                                             input_ids, positions,
                                             intermediate_tensors,
                                             inputs_embeds):
+        if attn_metadata is not None and isinstance(attn_metadata, dict):
+            attn_metadata = attn_metadata['model.layers.0.self_attn.attn']
+
+        if self.enable_shared_expert_dp:
+            return super()._generate_process_reqs_hidden_states(
+                attn_metadata, with_prefill, padded_num_tokens_across_dp,
+                input_ids, positions, intermediate_tensors, inputs_embeds)
        model_kwargs = {
            "kv_caches": self.kv_caches,
            "attn_metadata": attn_metadata
@@ -332,21 +362,22 @@ class NPUTorchairModelRunner(NPUModelRunner):
            communication_adaptation_310p()

        config = torchair.CompilerConfig()
-        if get_ascend_config().torchair_graph_config.mode:
-            config.mode = get_ascend_config().torchair_graph_config.mode
-        config.experimental_config.frozen_parameter = True
+        if self.ascend_config.torchair_graph_config.mode:
+            config.mode = self.ascend_config.torchair_graph_config.mode
+        config.experimental_config.frozen_parameter = \
+        self.ascend_config.torchair_graph_config.enable_frozen_parameter
        # enabling tiling_schedule_optimize on 300I Duo has some bugs, so we have to
        # disable it on 300I Duo platform now.
        config.experimental_config.tiling_schedule_optimize = not is_310p()
        config.experimental_config.enable_view_optimize = \
-        get_ascend_config().torchair_graph_config.enable_view_optimize
+        self.ascend_config.torchair_graph_config.enable_view_optimize
        torch.npu.set_compile_mode(jit_compile=False)
        if not self.use_cached_npu_graph:
            npu_backend = torchair.get_npu_backend(compiler_config=config)
            self.torchair_compiled_model = torch.compile(
                self.model,
-                dynamic=True,
-                fullgraph=envs_vllm.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
+                dynamic=not self.ascend_config.use_sfa,
+                fullgraph=True,
                backend=npu_backend)
            return self.torchair_compiled_model
        else:
@@ -368,8 +399,8 @@ class NPUTorchairModelRunner(NPUModelRunner):
            self.torchair_compiled_models[
                batch_size] = torchair.inference.cache_compile(
                    self.model.__dict__[forward_proxy_name],
-                    dynamic=True,
-                    fullgraph=envs_vllm.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
+                    dynamic=not self.ascend_config.use_sfa,
+                    fullgraph=True,
                    cache_dir=TORCHAIR_CACHE_DIR,
                    config=config,
                    ge_cache=False)
@@ -396,10 +427,16 @@ class NPUTorchairModelRunner(NPUModelRunner):
            f"{self.torchair_graph_batch_sizes}, but cur batch_size is {batch_size}."
        )

-    def check_torchair_graph_batch_sizes(self):
+    def update_torchair_graph_batch_sizes(self):
        # return graph_batch_sizes according to the max number of tokens
        # first pad according to the number of requests
-        if len(self.torchair_graph_batch_sizes) == 0:
+        if self.is_kv_consumer and self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
+            # In the PD-disaggregation scenario the batch size may be calculated incorrectly when MTP is enabled, so we force it to max_num_reqs.
+            self.torchair_graph_batch_sizes = [self.max_num_reqs]
+            logger.warning(
+                "is kv_consumer, torch_graph_batch_sizes sets to [max_num_seqs]"
+            )
+        elif len(self.torchair_graph_batch_sizes) == 0:
            self.torchair_graph_batch_sizes = [1, self.max_num_reqs]
        else:
            self.torchair_graph_batch_sizes = sorted(
@@ -420,27 +457,47 @@ class NPUTorchairModelRunner(NPUModelRunner):
            for graph_batch_size in self.torchair_graph_batch_sizes
        ]

-        # NOTE: when enable_expert_parallel, we need to check if `graph_batch_size` is divisible by `tp_size`
+        # NOTE: when enable_expert_parallel is set on A3, each `graph_batch_size` is aligned to a multiple of `tp_size`,
+        # because the dispatch/combine ops on A3 use x_active_mask, which requires the input shape to be the same
+        # on all EP ranks.
+        if get_ascend_soc_version(
+        ) == AscendSocVersion.A3 and self.parallel_config.enable_expert_parallel:
+            self._align_graph_size_divisible_by_tp_size()
+
+    def _align_graph_size_divisible_by_tp_size(self):
        tp_size = self.parallel_config.tensor_parallel_size
-        if self.parallel_config.enable_expert_parallel:
-            new_graph_batch_sizes = []
-            for graph_batch_size in self.torchair_graph_batch_sizes:
-                cur_graph_batch_size = (graph_batch_size + tp_size -
-                                        1) // tp_size * tp_size
-                if cur_graph_batch_size not in new_graph_batch_sizes and \
-                    cur_graph_batch_size <= self.scheduler_config.max_num_batched_tokens:
-                    new_graph_batch_sizes.append(cur_graph_batch_size)
-                elif cur_graph_batch_size > self.scheduler_config.max_num_batched_tokens \
-                        and self.decode_token_per_req > 1:
-                    logger.warning(
-                        f"torchair_graph_batch_sizes {cur_graph_batch_size} is bigger than max_num_batched_tokens",
-                        f"{self.scheduler_config.max_num_batched_tokens} will skip this batch size."
-                    )
+        new_graph_batch_sizes = []
+        for graph_batch_size in self.torchair_graph_batch_sizes:
+            cur_graph_batch_size = (graph_batch_size + tp_size -
+                                    1) // tp_size * tp_size
+            # MTP > 1: use the LCM (least common multiple) of graph_batch_size and tp_size
+            # to accommodate both multi-DP and the FIA operator.
+            if self.speculative_config is not None and self.speculative_config.num_speculative_tokens > 1:
+                cur_graph_batch_size = (tp_size * graph_batch_size) \
+                                       // math.gcd(tp_size, graph_batch_size)
+            if cur_graph_batch_size not in new_graph_batch_sizes and \
+                cur_graph_batch_size <= self.scheduler_config.max_num_batched_tokens:
+                new_graph_batch_sizes.append(cur_graph_batch_size)
+            elif cur_graph_batch_size > self.scheduler_config.max_num_batched_tokens \
+                    and self.decode_token_per_req > 1:
+                logger.warning(
+                    f"torchair_graph_batch_sizes {cur_graph_batch_size} is bigger than max_num_batched_tokens",
+                    f"{self.scheduler_config.max_num_batched_tokens} will skip this batch size."
+                )
+        new_max_num_reqs = max(new_graph_batch_sizes)
+        if self.max_num_reqs != new_max_num_reqs:
+            logger.warning(f"max_num_reqs is updated to {new_max_num_reqs}")
+            self.max_num_reqs = new_max_num_reqs
+            self.scheduler_config.max_num_seqs = new_max_num_reqs
+
+        if new_graph_batch_sizes != self.torchair_graph_batch_sizes:
+            logger.warning(
+                f"torchair_graph_batch_sizes are updated to {new_graph_batch_sizes}."
+            )
            self.torchair_graph_batch_sizes = new_graph_batch_sizes

    def _build_drafter_prepare_inputs_torchair_param(self):
-        return True
-
-    def get_dp_padding(self, num_tokens):
-        """Override from NPUModelRunner to get dp padding"""
-        return 0, None
+        if self.enable_shared_expert_dp:
+            return super()._build_drafter_prepare_inputs_torchair_param()
+        else:
+            return True