[DP] Fix dp padding logic in dummyrun (#4705)
### What this PR does / why we need it?
Fix dp padding logic in dummyrun. After
https://github.com/vllm-project/vllm/pull/28579, `num_tokens` will be
padded in `CudagraphDispatcher`, thus we also need to do the pad in the
dummy_run.
### How was this patch tested?
Test locally with the following scripts
```bash
VLLM_USE_MODELSCOPE=true python3 -m vllm.entrypoints.openai.api_server \
--model wemaster/deepseek_mtp_main_random_bf16 \
--trust-remote-code \
--data-parallel-size 4 \
--tensor-parallel-size 1 \
--compilation-config '{"cudagraph_capture_sizes":[96],"cudagraph_mode":"FULL_DECODE_ONLY"}' \
--enable-expert-parallel
```
```bash
vllm bench serve --model wemaster/deepseek_mtp_main_random_bf16 --endpoint /v1/completions --dataset-name random --random-input 512 --random-output 100 --num-prompts 48 --request-rate 1 --ready-check-timeout-sec 0
```
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
Signed-off-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
@@ -3002,8 +3002,6 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
|
|||||||
with_prefill) = self._sync_metadata_across_dp(num_tokens,
|
with_prefill) = self._sync_metadata_across_dp(num_tokens,
|
||||||
with_prefill)
|
with_prefill)
|
||||||
|
|
||||||
moe_comm_type = self._select_moe_comm_method(num_tokens)
|
|
||||||
|
|
||||||
# If cudagraph_mode.decode_mode() == FULL and
|
# If cudagraph_mode.decode_mode() == FULL and
|
||||||
# cudagraph_mode.seperate_routine(). This means that we are using
|
# cudagraph_mode.seperate_routine(). This means that we are using
|
||||||
# different graphs and/or modes for mixed prefill-decode batches vs.
|
# different graphs and/or modes for mixed prefill-decode batches vs.
|
||||||
@@ -3049,23 +3047,62 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
|
|||||||
if not self.in_profile_run and self.dynamic_eplb:
|
if not self.in_profile_run and self.dynamic_eplb:
|
||||||
self.eplb_updator.forward_before()
|
self.eplb_updator.forward_before()
|
||||||
|
|
||||||
|
has_lora = True if self.lora_config and self.compilation_config.cudagraph_specialize_lora else False
|
||||||
|
_ag_mode, batch_descriptor = \
|
||||||
|
self.aclgraph_dispatcher.dispatch(num_tokens=num_tokens, uniform_decode=uniform_decode, has_lora=has_lora)
|
||||||
|
|
||||||
|
num_tokens_padded = batch_descriptor.num_tokens
|
||||||
|
num_reqs_padded = (batch_descriptor.num_reqs if
|
||||||
|
batch_descriptor.num_reqs is not None else num_reqs)
|
||||||
|
if num_tokens_across_dp is not None and num_tokens_padded != num_tokens:
|
||||||
|
# pad is needed if the pad of `num_tokens` is triggered inside CudagraphDispatcher
|
||||||
|
num_tokens_across_dp[:] = num_tokens_padded
|
||||||
|
num_scheduled_tokens = num_scheduled_tokens.repeat(num_reqs_padded)
|
||||||
|
|
||||||
|
moe_comm_type = self._select_moe_comm_method(num_tokens_padded)
|
||||||
|
|
||||||
|
# filter out the valid batch descriptor
|
||||||
|
if aclgraph_runtime_mode is not None:
|
||||||
|
# we allow forcing NONE when the dispatcher disagrees to support
|
||||||
|
# warm ups for aclgraph capture
|
||||||
|
if aclgraph_runtime_mode != CUDAGraphMode.NONE and aclgraph_runtime_mode != _ag_mode:
|
||||||
|
raise ValueError(
|
||||||
|
f"Aclgraph runtime mode mismatch at dummy_run. "
|
||||||
|
f"Expected {_ag_mode}, but got {aclgraph_runtime_mode}.")
|
||||||
|
else:
|
||||||
|
aclgraph_runtime_mode = _ag_mode
|
||||||
|
|
||||||
|
# TODO(Mengqing): Set create_mixed_batch to False since it's only used in FI warmup
|
||||||
|
# and not supported in ASCEND now. We could remove it in the future.
|
||||||
|
attn_metadata = self._build_dummy_attn_metadata(
|
||||||
|
False,
|
||||||
|
num_reqs=num_reqs_padded,
|
||||||
|
num_tokens=num_tokens_padded,
|
||||||
|
max_query_len=max_query_len,
|
||||||
|
aclgraph_runtime_mode=aclgraph_runtime_mode,
|
||||||
|
force_attention=force_attention,
|
||||||
|
num_scheduled_tokens=num_scheduled_tokens,
|
||||||
|
)
|
||||||
|
|
||||||
with self.maybe_dummy_run_with_lora(self.lora_config,
|
with self.maybe_dummy_run_with_lora(self.lora_config,
|
||||||
num_scheduled_tokens,
|
num_scheduled_tokens,
|
||||||
num_sampled_tokens):
|
num_sampled_tokens):
|
||||||
|
# Make sure padding doesn't exceed max_num_tokens
|
||||||
|
assert num_tokens_padded <= self.max_num_tokens
|
||||||
if self.is_multimodal_model:
|
if self.is_multimodal_model:
|
||||||
input_ids = None
|
input_ids = None
|
||||||
inputs_embeds = self.inputs_embeds.gpu[:num_tokens]
|
inputs_embeds = self.inputs_embeds.gpu[:num_tokens_padded]
|
||||||
elif self.enable_prompt_embeds:
|
elif self.enable_prompt_embeds:
|
||||||
input_ids = None
|
input_ids = None
|
||||||
inputs_embeds = self.inputs_embeds.gpu[:num_tokens]
|
inputs_embeds = self.inputs_embeds.gpu[:num_tokens_padded]
|
||||||
else:
|
else:
|
||||||
input_ids = self.input_ids[:num_tokens]
|
input_ids = self.input_ids[:num_tokens_padded]
|
||||||
inputs_embeds = None
|
inputs_embeds = None
|
||||||
|
|
||||||
if self.uses_mrope:
|
if self.uses_mrope:
|
||||||
positions = self.mrope_positions[:, :num_tokens]
|
positions = self.mrope_positions[:, :num_tokens_padded]
|
||||||
else:
|
else:
|
||||||
positions = self.positions[:num_tokens]
|
positions = self.positions[:num_tokens_padded]
|
||||||
|
|
||||||
if get_pp_group().is_first_rank:
|
if get_pp_group().is_first_rank:
|
||||||
intermediate_tensors = None
|
intermediate_tensors = None
|
||||||
@@ -3077,39 +3114,14 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
|
|||||||
dtype=self.dtype,
|
dtype=self.dtype,
|
||||||
device=self.device))
|
device=self.device))
|
||||||
intermediate_tensors = IntermediateTensors({
|
intermediate_tensors = IntermediateTensors({
|
||||||
k: v[:num_tokens]
|
k:
|
||||||
|
v[:num_tokens_padded]
|
||||||
for k, v in self.intermediate_tensors.items()
|
for k, v in self.intermediate_tensors.items()
|
||||||
})
|
})
|
||||||
has_lora = True if self.lora_config and self.compilation_config.cudagraph_specialize_lora else False
|
|
||||||
# filter out the valid batch descriptor
|
|
||||||
_ag_mode, batch_descriptor = \
|
|
||||||
self.aclgraph_dispatcher.dispatch(num_tokens=num_tokens, uniform_decode=uniform_decode, has_lora=has_lora)
|
|
||||||
if aclgraph_runtime_mode is not None:
|
|
||||||
# we allow forcing NONE when the dispatcher disagrees to support
|
|
||||||
# warm ups for aclgraph capture
|
|
||||||
if aclgraph_runtime_mode != CUDAGraphMode.NONE and aclgraph_runtime_mode != _ag_mode:
|
|
||||||
raise ValueError(
|
|
||||||
f"Aclgraph runtime mode mismatch at dummy_run. "
|
|
||||||
f"Expected {_ag_mode}, but got {aclgraph_runtime_mode}."
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
aclgraph_runtime_mode = _ag_mode
|
|
||||||
|
|
||||||
# TODO(Mengqing): Set create_mixed_batch to False since it's only used in FI warmup
|
|
||||||
# and not supported in ASCEND now. We could remove it in the future.
|
|
||||||
attn_metadata = self._build_dummy_attn_metadata(
|
|
||||||
False,
|
|
||||||
num_reqs=num_reqs,
|
|
||||||
num_tokens=num_tokens,
|
|
||||||
max_query_len=max_query_len,
|
|
||||||
aclgraph_runtime_mode=aclgraph_runtime_mode,
|
|
||||||
force_attention=force_attention,
|
|
||||||
num_scheduled_tokens=num_scheduled_tokens,
|
|
||||||
)
|
|
||||||
|
|
||||||
need_dummy_logits = (not self.in_profile_run
|
need_dummy_logits = (not self.in_profile_run
|
||||||
and lmhead_tp_enable())
|
and lmhead_tp_enable())
|
||||||
max_num_reqs_across_dp = num_tokens if not with_prefill else max_num_reqs
|
max_num_reqs_across_dp = num_tokens_padded if not with_prefill else max_num_reqs
|
||||||
dummy_indices = torch.zeros(max_num_reqs_across_dp,
|
dummy_indices = torch.zeros(max_num_reqs_across_dp,
|
||||||
dtype=torch.int32)
|
dtype=torch.int32)
|
||||||
|
|
||||||
@@ -3129,7 +3141,7 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
|
|||||||
with set_ascend_forward_context(
|
with set_ascend_forward_context(
|
||||||
attn_metadata,
|
attn_metadata,
|
||||||
self.vllm_config,
|
self.vllm_config,
|
||||||
num_tokens=num_tokens,
|
num_tokens=num_tokens_padded,
|
||||||
num_tokens_across_dp=num_tokens_across_dp,
|
num_tokens_across_dp=num_tokens_across_dp,
|
||||||
with_prefill=with_prefill,
|
with_prefill=with_prefill,
|
||||||
in_profile_run=self.in_profile_run,
|
in_profile_run=self.in_profile_run,
|
||||||
@@ -3143,15 +3155,15 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
|
|||||||
weight_prefetch_method=self.weight_prefetch_method):
|
weight_prefetch_method=self.weight_prefetch_method):
|
||||||
hidden_states = self._generate_dummy_run_hidden_states(
|
hidden_states = self._generate_dummy_run_hidden_states(
|
||||||
with_prefill, is_torchair_compile, input_ids, positions,
|
with_prefill, is_torchair_compile, input_ids, positions,
|
||||||
attn_metadata, num_tokens, intermediate_tensors,
|
attn_metadata, num_tokens_padded, intermediate_tensors,
|
||||||
inputs_embeds)
|
inputs_embeds)
|
||||||
dummy_compute_logits(hidden_states)
|
dummy_compute_logits(hidden_states)
|
||||||
|
|
||||||
if self.drafter:
|
if self.drafter:
|
||||||
self.drafter.dummy_run(
|
self.drafter.dummy_run(
|
||||||
num_tokens=num_tokens,
|
num_tokens=num_tokens_padded,
|
||||||
with_prefill=with_prefill,
|
with_prefill=with_prefill,
|
||||||
num_reqs=num_reqs,
|
num_reqs=num_reqs_padded,
|
||||||
num_tokens_across_dp=num_tokens_across_dp,
|
num_tokens_across_dp=num_tokens_across_dp,
|
||||||
aclgraph_runtime_mode=aclgraph_runtime_mode,
|
aclgraph_runtime_mode=aclgraph_runtime_mode,
|
||||||
batch_descriptor=batch_descriptor,
|
batch_descriptor=batch_descriptor,
|
||||||
|
|||||||
Reference in New Issue
Block a user