[MISC] Cherry pick #1291 from v0.9.1-dev (#1825)

### What this PR does / why we need it? Cherry-pick #1291 from v0.9.1-dev. This PR implements synchronization of whether `dbo` is enabled across all DP ranks. Specifically, it performs an allreduce op across the DP ranks, and `dbo` is enabled only when every DP rank reports `enable_dbo`. Co-authored-by: shikang-hangzhou <459956190@qq.com> Co-authored-by: wangli <wangli858794774@gmail.com> - vLLM version: v0.10.0 - vLLM main: 2836dd73f1 --------- Signed-off-by: wangli <wangli858794774@gmail.com>
2025-08-01 09:08:45 +08:00
parent 9e65da990e
commit 2284289880
6 changed files with 68 additions and 37 deletions
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -79,6 +79,7 @@ from vllm_ascend.attention.attention_v1 import (AscendAttentionState,
                                                AscendMetadata)
 from vllm_ascend.attention.attention_v1_torchair import AscendTorchairMetadata
 from vllm_ascend.attention.mla_v1 import AscendMLAMetadata
+from vllm_ascend.multistream.ms_split import compute_split_seq_index
 from vllm_ascend.platform import NPUPlatform
 from vllm_ascend.sample.rejection_sampler import AscendRejectionSampler
 from vllm_ascend.torchair.utils import (check_torchair_cache_exist,
@@ -606,6 +607,27 @@ class NPUModelRunner(LoRAModelRunnerMixin):
        return maybe_padded_num_tokens, num_tokens_across_dp, with_prefill, not bool(
            forward_metadata[-1])

+    def _check_dbo_is_valid(self, query_lens: torch.Tensor,
+                            attn_state: AscendAttentionState,
+                            num_tokens: int) -> bool:
+        # Local (this rank only) DBO eligibility check; decode-only / spec-decoding batches never qualify. NOTE(review): the visible caller passes query_lens.tolist(), not a Tensor - confirm the annotation.
+        if attn_state in [
+                AscendAttentionState.DecodeOnly,
+                AscendAttentionState.SpecDecoding
+        ]:
+            return False
+        # DBO needs an MLA model and VLLM_ASCEND_ENABLE_DBO set; this is only the local verdict, since one dp rank may enable dbo while others may not.
+        if not self.vllm_config.model_config.use_mla or not envs_ascend.VLLM_ASCEND_ENABLE_DBO:
+            return False
+        # Reject a split index of 0, a sequence split index equal to len(query_lens), or fewer than 256 tokens. TODO: remove this check if token-level microbatch is enabled
+        [token_index,
+         seq_index] = compute_split_seq_index(query_lens, attn_state,
+                                              num_tokens)
+        if token_index == 0 or seq_index == 0 or seq_index == len(
+                query_lens) or num_tokens < 256:
+            return False
+        return True
+
    def get_eagle_atten_dict(
        self,
        scheduler_output: "SchedulerOutput",
@@ -1080,6 +1102,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):
        with_prefill = attn_state not in [
            AscendAttentionState.DecodeOnly, AscendAttentionState.SpecDecoding
        ]
+        enable_dbo = self._check_dbo_is_valid(self.query_lens.tolist(),
+                                              attn_state,
+                                              total_num_scheduled_tokens)

        maybe_padded_num_tokens = total_num_scheduled_tokens
        if self.torchair_graph_enabled and not with_prefill:
@@ -1087,7 +1112,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                total_num_scheduled_tokens)
        (padded_num_tokens_across_dp, num_tokens_across_dp, with_prefill,
         enable_dbo) = self._get_forward_metadata_across_dp(
-             maybe_padded_num_tokens, total_num_scheduled_tokens, with_prefill)
+             maybe_padded_num_tokens, total_num_scheduled_tokens, with_prefill,
+             enable_dbo)
+        extra_builder_kwargs['enable_dbo_across_dp'] = enable_dbo

        if self.torchair_graph_enabled and not with_prefill:
            graph_pad_size = padded_num_tokens_across_dp - total_num_scheduled_tokens
@@ -1739,8 +1766,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):

        # Padding for DP
        (num_tokens, num_tokens_across_dp, with_prefill,
-         enable_dbo) = self._get_forward_metadata_across_dp(
-             maybe_padded_num_tokens, num_tokens, with_prefill, False)
+         _) = self._get_forward_metadata_across_dp(maybe_padded_num_tokens,
+                                                   num_tokens, with_prefill,
+                                                   False)

        # Set num_scheduled_tokens based on num_tokens and max_num_seqs
        # for dummy run with LoRA so that the num_reqs collectively