Fix FIA pad logic in graph mode (#7144)
### What this PR does / why we need it?
Related to vLLM PR #34043, which deletes the function
`relax_for_mixed_batch_cudagraphs`. After that change, `num_reqs` no longer
equals the actual number of requests. Because the FIA operator requires that
`query_start_loc[-1]` equal the total number of computed tokens, removing that
function causes an FIA error.
In full graph mode, set `num_reqs_padded = num_reqs` to fix the error.
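
For context, here is a minimal sketch of the TND-layout constraint involved (illustrative only; `pad_query_start_loc` and the concrete shapes are assumptions for this example, not the runner's actual code):

```python
import numpy as np

def pad_query_start_loc(query_start_loc: np.ndarray, num_reqs: int,
                        num_reqs_padded: int, num_tokens: int) -> np.ndarray:
    """Illustrative helper: extend the cumulative token offsets so that the
    last entry equals the total number of computed tokens, which is what the
    FIA kernel expects of actual_seq_lengths_q in the TND layout."""
    padded = np.empty(num_reqs_padded + 1, dtype=query_start_loc.dtype)
    padded[: num_reqs + 1] = query_start_loc[: num_reqs + 1]
    # Padded "virtual" requests contribute zero tokens, so their offsets all
    # sit at the total token count.
    padded[num_reqs + 1:] = num_tokens
    return padded

# Two real requests with 3 and 5 tokens, padded to 4 request slots:
print(pad_query_start_loc(np.array([0, 3, 8]), num_reqs=2,
                          num_reqs_padded=4, num_tokens=8))
# -> [0 3 8 8 8]
```

The change below keeps that invariant in full graph mode by pinning `num_reqs_padded` to `num_reqs`.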
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.16.0
- vLLM main: 4034c3d32e
---------
Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
@@ -18,7 +18,6 @@
#
import os

import pytest
from vllm import SamplingParams

from tests.e2e.conftest import VllmRunner
@@ -68,7 +67,6 @@ def test_qwen3_moe_full_decode_only_tp2():
    )


@pytest.mark.skip(reason="CANN8.5 failed with this test, fix me")
def test_qwen3_moe_full_graph_tp2():
    if "HCCL_OP_EXPANSION_MODE" in os.environ:
        del os.environ["HCCL_OP_EXPANSION_MODE"]
@@ -521,11 +521,24 @@ class NPUModelRunner(GPUModelRunner):
             return self.model.unwrap()
         return self.model

-    def _pad_query_start_loc_for_fia(self, num_tokens_padded: int, num_reqs_padded: int, num_reqs: int) -> int:
+    def _pad_query_start_loc_for_fia(
+        self,
+        num_tokens_padded: int,
+        num_reqs_padded: int,
+        num_reqs: int,
+        cudagraph_runtime_mode: CUDAGraphMode | None = None,
+        batch_desc_num_reqs: int | None = None,
+    ) -> int:
         """
         This function is only designed to satisfied the constraint that when the layout is TND,
         the first dimension of `hidden_states` must equal the last element of `actual_seq_lengths_q`.
         """
+        # TODO: need refactor later, related to vllm PR #34043 this pr delete func
+        # relax_for_mixed_batch_cudagraphs, num_reqs no longer equals the actual number of requests.
+        if cudagraph_runtime_mode == CUDAGraphMode.FULL:
+            num_reqs_padded = num_reqs
+        else:
+            num_reqs_padded = batch_desc_num_reqs if batch_desc_num_reqs is not None else num_reqs
+
         if num_tokens_padded == num_reqs_padded * self.uniform_decode_query_len:
             # Uniform-batch case: num_reqs must be no greater than num_reqs_padded
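
Read in isolation, the new mode-dependent branch behaves like the following sketch (the `CUDAGraphMode` enum here is a simplified stand-in for illustration, not vLLM's actual definition):

```python
from __future__ import annotations

from enum import Enum

class CUDAGraphMode(Enum):  # simplified stand-in for illustration
    NONE = 0
    PIECEWISE = 1
    FULL = 2

def choose_num_reqs_padded(num_reqs: int,
                           cudagraph_runtime_mode: CUDAGraphMode | None,
                           batch_desc_num_reqs: int | None) -> int:
    # Full graph mode: query_start_loc is filled for exactly `num_reqs`
    # requests, so no extra request slots may be introduced.
    if cudagraph_runtime_mode == CUDAGraphMode.FULL:
        return num_reqs
    # Otherwise use the request count recorded in the batch descriptor,
    # falling back to the real request count when none is given.
    return batch_desc_num_reqs if batch_desc_num_reqs is not None else num_reqs
```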
@@ -1218,7 +1231,9 @@ class NPUModelRunner(GPUModelRunner):
         # Another possible condition is num_tokens_padded != num_tokens_unpadded
         # but this scope is way too big and the consequences are unpredictable
         old_num_reqs_padded = num_reqs_padded
-        num_reqs_padded = self._pad_query_start_loc_for_fia(num_tokens_padded, num_reqs_padded, num_reqs)
+        num_reqs_padded = self._pad_query_start_loc_for_fia(
+            num_tokens_padded, num_reqs_padded, num_reqs, cudagraph_mode, batch_desc.num_reqs
+        )
         if enable_sp() and num_tokens_padded == num_tokens_unpadded:
             if num_reqs_padded > old_num_reqs_padded:
                 num_reqs_padded = old_num_reqs_padded
@@ -2324,8 +2339,9 @@ class NPUModelRunner(GPUModelRunner):
         cum_num_tokens, _ = self._get_cumsum_and_arange(num_scheduled_tokens)
         self.query_start_loc.np[1 : num_reqs_padded + 1] = cum_num_tokens
         self.query_start_loc.copy_to_gpu()

-        num_reqs_padded = self._pad_query_start_loc_for_fia(num_tokens_padded, num_reqs_padded, num_reqs)
+        num_reqs_padded = self._pad_query_start_loc_for_fia(
+            num_tokens_padded, num_reqs_padded, num_reqs, cudagraph_runtime_mode, batch_desc.num_reqs
+        )

         pad_attn = cudagraph_runtime_mode == CUDAGraphMode.FULL
         attn_metadata, _ = self._build_attention_metadata(
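
As a sanity check at this call site, the invariant the FIA kernel needs can be stated as below (a sketch only; `query_start_loc_np` stands in for `self.query_start_loc.np` and is not a helper from the runner):

```python
import numpy as np

def check_fia_query_start_loc(query_start_loc_np: np.ndarray,
                              num_reqs_padded: int,
                              num_tokens: int) -> None:
    # In the TND layout, actual_seq_lengths_q is taken from the cumulative
    # token offsets, and its last entry must equal the first dimension of
    # hidden_states (the total number of computed tokens).
    actual_seq_lengths_q = query_start_loc_np[1 : num_reqs_padded + 1]
    assert actual_seq_lengths_q[-1] == num_tokens, (
        f"FIA expects query_start_loc[-1] == total tokens, "
        f"got {actual_seq_lengths_q[-1]} vs {num_tokens}"
    )
```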