[DP] Fix dp padding logic in dummyrun (#4705)
### What this PR does / why we need it?
Fix the DP padding logic in dummy_run. After
https://github.com/vllm-project/vllm/pull/28579, `num_tokens` is
padded inside `CudagraphDispatcher`, so we need to apply the same
padding in `dummy_run` as well.
### How was this patch tested?
Tested locally with the following scripts:
```bash
VLLM_USE_MODELSCOPE=true python3 -m vllm.entrypoints.openai.api_server \
--model wemaster/deepseek_mtp_main_random_bf16 \
--trust-remote-code \
--data-parallel-size 4 \
--tensor-parallel-size 1 \
--compilation-config '{"cudagraph_capture_sizes":[96],"cudagraph_mode":"FULL_DECODE_ONLY"}' \
--enable-expert-parallel
```
```bash
vllm bench serve --model wemaster/deepseek_mtp_main_random_bf16 --endpoint /v1/completions --dataset-name random --random-input 512 --random-output 100 --num-prompts 48 --request-rate 1 --ready-check-timeout-sec 0
```
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
Signed-off-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
@@ -3002,8 +3002,6 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
|
||||
with_prefill) = self._sync_metadata_across_dp(num_tokens,
|
||||
with_prefill)
|
||||
|
||||
moe_comm_type = self._select_moe_comm_method(num_tokens)
|
||||
|
||||
# If cudagraph_mode.decode_mode() == FULL and
|
||||
# cudagraph_mode.seperate_routine(). This means that we are using
|
||||
# different graphs and/or modes for mixed prefill-decode batches vs.
|
||||
@@ -3049,23 +3047,62 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
|
||||
if not self.in_profile_run and self.dynamic_eplb:
|
||||
self.eplb_updator.forward_before()
|
||||
|
||||
has_lora = True if self.lora_config and self.compilation_config.cudagraph_specialize_lora else False
|
||||
_ag_mode, batch_descriptor = \
|
||||
self.aclgraph_dispatcher.dispatch(num_tokens=num_tokens, uniform_decode=uniform_decode, has_lora=has_lora)
|
||||
|
||||
num_tokens_padded = batch_descriptor.num_tokens
|
||||
num_reqs_padded = (batch_descriptor.num_reqs if
|
||||
batch_descriptor.num_reqs is not None else num_reqs)
|
||||
if num_tokens_across_dp is not None and num_tokens_padded != num_tokens:
|
||||
# pad is needed if the pad of `num_tokens` is triggered inside CudagraphDispatcher
|
||||
num_tokens_across_dp[:] = num_tokens_padded
|
||||
num_scheduled_tokens = num_scheduled_tokens.repeat(num_reqs_padded)
|
||||
|
||||
moe_comm_type = self._select_moe_comm_method(num_tokens_padded)
|
||||
|
||||
# filter out the valid batch descriptor
|
||||
if aclgraph_runtime_mode is not None:
|
||||
# we allow forcing NONE when the dispatcher disagrees to support
|
||||
# warm ups for aclgraph capture
|
||||
if aclgraph_runtime_mode != CUDAGraphMode.NONE and aclgraph_runtime_mode != _ag_mode:
|
||||
raise ValueError(
|
||||
f"Aclgraph runtime mode mismatch at dummy_run. "
|
||||
f"Expected {_ag_mode}, but got {aclgraph_runtime_mode}.")
|
||||
else:
|
||||
aclgraph_runtime_mode = _ag_mode
|
||||
|
||||
# TODO(Mengqing): Set create_mixed_batch to False since it's only used in FI warmup
|
||||
# and not supported in ASCEND now. We could remove it in the future.
|
||||
attn_metadata = self._build_dummy_attn_metadata(
|
||||
False,
|
||||
num_reqs=num_reqs_padded,
|
||||
num_tokens=num_tokens_padded,
|
||||
max_query_len=max_query_len,
|
||||
aclgraph_runtime_mode=aclgraph_runtime_mode,
|
||||
force_attention=force_attention,
|
||||
num_scheduled_tokens=num_scheduled_tokens,
|
||||
)
|
||||
|
||||
with self.maybe_dummy_run_with_lora(self.lora_config,
|
||||
num_scheduled_tokens,
|
||||
num_sampled_tokens):
|
||||
# Make sure padding doesn't exceed max_num_tokens
|
||||
assert num_tokens_padded <= self.max_num_tokens
|
||||
if self.is_multimodal_model:
|
||||
input_ids = None
|
||||
inputs_embeds = self.inputs_embeds.gpu[:num_tokens]
|
||||
inputs_embeds = self.inputs_embeds.gpu[:num_tokens_padded]
|
||||
elif self.enable_prompt_embeds:
|
||||
input_ids = None
|
||||
inputs_embeds = self.inputs_embeds.gpu[:num_tokens]
|
||||
inputs_embeds = self.inputs_embeds.gpu[:num_tokens_padded]
|
||||
else:
|
||||
input_ids = self.input_ids[:num_tokens]
|
||||
input_ids = self.input_ids[:num_tokens_padded]
|
||||
inputs_embeds = None
|
||||
|
||||
if self.uses_mrope:
|
||||
positions = self.mrope_positions[:, :num_tokens]
|
||||
positions = self.mrope_positions[:, :num_tokens_padded]
|
||||
else:
|
||||
positions = self.positions[:num_tokens]
|
||||
positions = self.positions[:num_tokens_padded]
|
||||
|
||||
if get_pp_group().is_first_rank:
|
||||
intermediate_tensors = None
|
||||
@@ -3077,39 +3114,14 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
|
||||
dtype=self.dtype,
|
||||
device=self.device))
|
||||
intermediate_tensors = IntermediateTensors({
|
||||
k: v[:num_tokens]
|
||||
k:
|
||||
v[:num_tokens_padded]
|
||||
for k, v in self.intermediate_tensors.items()
|
||||
})
|
||||
has_lora = True if self.lora_config and self.compilation_config.cudagraph_specialize_lora else False
|
||||
# filter out the valid batch descriptor
|
||||
_ag_mode, batch_descriptor = \
|
||||
self.aclgraph_dispatcher.dispatch(num_tokens=num_tokens, uniform_decode=uniform_decode, has_lora=has_lora)
|
||||
if aclgraph_runtime_mode is not None:
|
||||
# we allow forcing NONE when the dispatcher disagrees to support
|
||||
# warm ups for aclgraph capture
|
||||
if aclgraph_runtime_mode != CUDAGraphMode.NONE and aclgraph_runtime_mode != _ag_mode:
|
||||
raise ValueError(
|
||||
f"Aclgraph runtime mode mismatch at dummy_run. "
|
||||
f"Expected {_ag_mode}, but got {aclgraph_runtime_mode}."
|
||||
)
|
||||
else:
|
||||
aclgraph_runtime_mode = _ag_mode
|
||||
|
||||
# TODO(Mengqing): Set create_mixed_batch to False since it's only used in FI warmup
|
||||
# and not supported in ASCEND now. We could remove it in the future.
|
||||
attn_metadata = self._build_dummy_attn_metadata(
|
||||
False,
|
||||
num_reqs=num_reqs,
|
||||
num_tokens=num_tokens,
|
||||
max_query_len=max_query_len,
|
||||
aclgraph_runtime_mode=aclgraph_runtime_mode,
|
||||
force_attention=force_attention,
|
||||
num_scheduled_tokens=num_scheduled_tokens,
|
||||
)
|
||||
|
||||
need_dummy_logits = (not self.in_profile_run
|
||||
and lmhead_tp_enable())
|
||||
max_num_reqs_across_dp = num_tokens if not with_prefill else max_num_reqs
|
||||
max_num_reqs_across_dp = num_tokens_padded if not with_prefill else max_num_reqs
|
||||
dummy_indices = torch.zeros(max_num_reqs_across_dp,
|
||||
dtype=torch.int32)
|
||||
|
||||
@@ -3129,7 +3141,7 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
|
||||
with set_ascend_forward_context(
|
||||
attn_metadata,
|
||||
self.vllm_config,
|
||||
num_tokens=num_tokens,
|
||||
num_tokens=num_tokens_padded,
|
||||
num_tokens_across_dp=num_tokens_across_dp,
|
||||
with_prefill=with_prefill,
|
||||
in_profile_run=self.in_profile_run,
|
||||
@@ -3143,15 +3155,15 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
|
||||
weight_prefetch_method=self.weight_prefetch_method):
|
||||
hidden_states = self._generate_dummy_run_hidden_states(
|
||||
with_prefill, is_torchair_compile, input_ids, positions,
|
||||
attn_metadata, num_tokens, intermediate_tensors,
|
||||
attn_metadata, num_tokens_padded, intermediate_tensors,
|
||||
inputs_embeds)
|
||||
dummy_compute_logits(hidden_states)
|
||||
|
||||
if self.drafter:
|
||||
self.drafter.dummy_run(
|
||||
num_tokens=num_tokens,
|
||||
num_tokens=num_tokens_padded,
|
||||
with_prefill=with_prefill,
|
||||
num_reqs=num_reqs,
|
||||
num_reqs=num_reqs_padded,
|
||||
num_tokens_across_dp=num_tokens_across_dp,
|
||||
aclgraph_runtime_mode=aclgraph_runtime_mode,
|
||||
batch_descriptor=batch_descriptor,
|
||||
|
||||
Reference in New Issue
Block a user