Fix FIA pad logic in graph mode (#7144)

### What this PR does / why we need it?
Related to vLLM PR #34043, which deleted the function
`relax_for_mixed_batch_cudagraphs`. After that change, `num_reqs` no longer
equals the actual number of requests. The FIA operator requires that
`query_start_loc[-1]` equal the total number of computed tokens, so deleting
that function caused the FIA error.
In full graph mode, set `num_reqs_padded = num_reqs` to fix the error.
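
To make the constraint concrete, here is a minimal, self-contained sketch of the padding rule this PR restores. It is not the repository code: `GraphMode`, `pick_num_reqs_padded`, and `pad_query_start_loc` are illustrative stand-ins for `CUDAGraphMode` and `_pad_query_start_loc_for_fia`, and the way the tail of `query_start_loc` absorbs the padding is a simplifying assumption.

```python
from enum import Enum, auto


class GraphMode(Enum):
    """Hypothetical stand-in for vLLM's CUDAGraphMode."""
    PIECEWISE = auto()
    FULL = auto()


def pick_num_reqs_padded(num_tokens_padded: int, num_reqs_padded: int,
                         num_reqs: int, uniform_decode_query_len: int,
                         graph_mode: GraphMode) -> int:
    """Choose the request count used to build actual_seq_lengths_q.

    The FIA operator with a TND layout needs query_start_loc[-1] to match the
    first dimension of hidden_states, i.e. num_tokens_padded.
    """
    if graph_mode == GraphMode.FULL:
        # The fix in this PR: with a full graph, fall back to the real request
        # count, because the padded count no longer tracks the actual requests.
        num_reqs_padded = num_reqs
    if num_tokens_padded == num_reqs_padded * uniform_decode_query_len:
        # Uniform decode batch: every (possibly padded) request contributes the
        # same number of query tokens, so the constraint already holds.
        return num_reqs_padded
    # Otherwise keep only the real requests; the last one absorbs the padding.
    return num_reqs


def pad_query_start_loc(query_start_loc: list[int], num_reqs_padded: int,
                        num_tokens_padded: int) -> list[int]:
    """Force query_start_loc[num_reqs_padded] == num_tokens_padded."""
    padded = query_start_loc[:num_reqs_padded + 1]
    padded += [num_tokens_padded] * (num_reqs_padded + 1 - len(padded))
    padded[-1] = num_tokens_padded
    return padded


if __name__ == "__main__":
    # Two real decode requests (one token each), padded to four tokens for
    # full-graph capture.
    n = pick_num_reqs_padded(num_tokens_padded=4, num_reqs_padded=4, num_reqs=2,
                             uniform_decode_query_len=1,
                             graph_mode=GraphMode.FULL)
    qsl = pad_query_start_loc([0, 1, 2], n, num_tokens_padded=4)
    assert qsl[-1] == 4  # the FIA/TND requirement holds again
    print(n, qsl)        # 2 [0, 1, 4]
```

The key point is the full-graph branch: with `num_reqs_padded` forced back to `num_reqs`, the last entry of `query_start_loc` can again be made equal to the padded token count, which is what the FIA operator's TND layout expects.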
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.16.0
- vLLM main: 4034c3d32e

---------

Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
Author: XiaoxinWang
Date: 2026-03-12 14:50:54 +08:00
Committed by: GitHub
Parent: bbffe58b63
Commit: 37d1bd8c50
2 changed files with 20 additions and 6 deletions


```diff
@@ -18,7 +18,6 @@
 #
 import os
-import pytest
 from vllm import SamplingParams
 from tests.e2e.conftest import VllmRunner
@@ -68,7 +67,6 @@ def test_qwen3_moe_full_decode_only_tp2():
     )
-@pytest.mark.skip(reason="CANN8.5 failed with this test, fix me")
 def test_qwen3_moe_full_graph_tp2():
     if "HCCL_OP_EXPANSION_MODE" in os.environ:
         del os.environ["HCCL_OP_EXPANSION_MODE"]
```


```diff
@@ -521,11 +521,24 @@ class NPUModelRunner(GPUModelRunner):
             return self.model.unwrap()
         return self.model
 
-    def _pad_query_start_loc_for_fia(self, num_tokens_padded: int, num_reqs_padded: int, num_reqs: int) -> int:
+    def _pad_query_start_loc_for_fia(
+        self,
+        num_tokens_padded: int,
+        num_reqs_padded: int,
+        num_reqs: int,
+        cudagraph_runtime_mode: CUDAGraphMode | None = None,
+        batch_desc_num_reqs: int | None = None,
+    ) -> int:
         """
         This function is only designed to satisfied the constraint that when the layout is TND,
         the first dimension of `hidden_states` must equal the last element of `actual_seq_lengths_q`.
         """
+        # TODO: need refactor later, related to vllm PR #34043 this pr delete func
+        # relax_for_mixed_batch_cudagraphs, num_reqs no longer equals the actual number of requests.
+        if cudagraph_runtime_mode == CUDAGraphMode.FULL:
+            num_reqs_padded = num_reqs
+        else:
+            num_reqs_padded = batch_desc_num_reqs if batch_desc_num_reqs is not None else num_reqs
         if num_tokens_padded == num_reqs_padded * self.uniform_decode_query_len:
             # Uniform-batch case: num_reqs must be no greater than num_reqs_padded
@@ -1218,7 +1231,9 @@ class NPUModelRunner(GPUModelRunner):
         # Another possible condition is num_tokens_padded != num_tokens_unpadded
         # but this scope is way too big and the consequences are unpredictable
         old_num_reqs_padded = num_reqs_padded
-        num_reqs_padded = self._pad_query_start_loc_for_fia(num_tokens_padded, num_reqs_padded, num_reqs)
+        num_reqs_padded = self._pad_query_start_loc_for_fia(
+            num_tokens_padded, num_reqs_padded, num_reqs, cudagraph_mode, batch_desc.num_reqs
+        )
         if enable_sp() and num_tokens_padded == num_tokens_unpadded:
             if num_reqs_padded > old_num_reqs_padded:
                 num_reqs_padded = old_num_reqs_padded
@@ -2324,8 +2339,9 @@ class NPUModelRunner(GPUModelRunner):
         cum_num_tokens, _ = self._get_cumsum_and_arange(num_scheduled_tokens)
         self.query_start_loc.np[1 : num_reqs_padded + 1] = cum_num_tokens
         self.query_start_loc.copy_to_gpu()
-        num_reqs_padded = self._pad_query_start_loc_for_fia(num_tokens_padded, num_reqs_padded, num_reqs)
+        num_reqs_padded = self._pad_query_start_loc_for_fia(
+            num_tokens_padded, num_reqs_padded, num_reqs, cudagraph_runtime_mode, batch_desc.num_reqs
+        )
         pad_attn = cudagraph_runtime_mode == CUDAGraphMode.FULL
         attn_metadata, _ = self._build_attention_metadata(
```