[Fix] Pads query_start_loc to satisfy FIA/TND constraint (#6357)
### What this PR does / why we need it?
This handles both uniform and mixed batches (by inserting a dummy
request for mixed batches), consolidates ad-hoc padding into a single
helper, copies the updated buffer to the device, and asserts the layout
constraint before building the attention metadata. Together, these
changes prevent kernel mismatches or failures and ensure correct shapes
for FIA/TND execution in full graph modes.
We currently place this helper in `execute_model`. My original design
was to include it in `_prepare_inputs`, but that doesn’t work because it
must run after padding. While I’d prefer to minimize the impact and
reuse as much of the base class as possible in the future, it doesn’t
seem achievable at the moment.
### Does this PR introduce _any_ user-facing change?
None.
### How was this patch tested?
Test cases added.
- vLLM version: v0.14.1
- vLLM main:
dc917cceb8
---------
Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
This commit is contained in:
@@ -44,6 +44,29 @@ CASE_DS_ACLGRAPH = LLMTestCase(
|
|||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
CASE_QWEN_FULL = LLMTestCase(
|
||||||
|
model="Qwen/Qwen3-0.6B",
|
||||||
|
prompts=PROMPTS_SHORT,
|
||||||
|
golden_answers=[
|
||||||
|
" Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I want to know if there are any",
|
||||||
|
' the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president',
|
||||||
|
' Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of',
|
||||||
|
' not just a technological frontier but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and'
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
CASE_DS_FULL = LLMTestCase(
|
||||||
|
model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
|
||||||
|
quantization="ascend",
|
||||||
|
prompts=PROMPTS_SHORT,
|
||||||
|
golden_answers=[
|
||||||
|
'\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2',
|
||||||
|
' a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the',
|
||||||
|
' Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art',
|
||||||
|
' here, and it’s not what you think.\nThe future of AI is here, and it’s not what you think.\nThe future of'
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
CASE_QWEN_FULL_DECODE_ONLY = LLMTestCase(
|
CASE_QWEN_FULL_DECODE_ONLY = LLMTestCase(
|
||||||
model="Qwen/Qwen3-0.6B",
|
model="Qwen/Qwen3-0.6B",
|
||||||
prompts=PROMPTS_LONG,
|
prompts=PROMPTS_LONG,
|
||||||
@@ -94,6 +117,23 @@ def test_piecewise_res_consistency(cur_case: LLMTestCase):
|
|||||||
sampling_params=cur_case.sampling_params,
|
sampling_params=cur_case.sampling_params,
|
||||||
golden_answers=cur_case.golden_answers)
|
golden_answers=cur_case.golden_answers)
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"cur_case", [CASE_QWEN_FULL, CASE_DS_FULL])
|
||||||
|
def test_full_res_consistency(cur_case: LLMTestCase, monkeypatch):
|
||||||
|
monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)
|
||||||
|
runner_kwargs = {
|
||||||
|
"model_name": cur_case.model,
|
||||||
|
"max_model_len": 1024,
|
||||||
|
"compilation_config": {
|
||||||
|
"cudagraph_capture_sizes": [4, 8, 32, 64],
|
||||||
|
"cudagraph_mode": "FULL_DECODE_ONLY"
|
||||||
|
},
|
||||||
|
"quantization": cur_case.quantization,
|
||||||
|
}
|
||||||
|
gen_and_valid(runner_kwargs=runner_kwargs,
|
||||||
|
prompts=cur_case.prompts,
|
||||||
|
sampling_params=cur_case.sampling_params,
|
||||||
|
golden_answers=cur_case.golden_answers)
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"cur_case", [CASE_QWEN_FULL_DECODE_ONLY, CASE_DS_FULL_DECODE_ONLY])
|
"cur_case", [CASE_QWEN_FULL_DECODE_ONLY, CASE_DS_FULL_DECODE_ONLY])
|
||||||
|
|||||||
@@ -206,6 +206,13 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
vllm_config.scheduler_config.max_num_batched_tokens += max_pcp_pad_tokens
|
vllm_config.scheduler_config.max_num_batched_tokens += max_pcp_pad_tokens
|
||||||
with _torch_cuda_wrapper():
|
with _torch_cuda_wrapper():
|
||||||
super().__init__(vllm_config, device)
|
super().__init__(vllm_config, device)
|
||||||
|
|
||||||
|
# NOTE: For FULL mode we change +1 to +2 to reserve extra space for padding.
|
||||||
|
# See _pad_query_start_loc_for_fia.
|
||||||
|
self.query_start_loc = self._make_buffer(
|
||||||
|
self.max_num_reqs + 2, dtype=torch.int32 # type: ignore[has-type]
|
||||||
|
)
|
||||||
|
|
||||||
vllm_config.scheduler_config.max_num_batched_tokens -= max_pcp_pad_tokens
|
vllm_config.scheduler_config.max_num_batched_tokens -= max_pcp_pad_tokens
|
||||||
self.max_num_tokens = self.scheduler_config.max_num_batched_tokens
|
self.max_num_tokens = self.scheduler_config.max_num_batched_tokens
|
||||||
self.max_num_reqs = self.scheduler_config.max_num_seqs
|
self.max_num_reqs = self.scheduler_config.max_num_seqs
|
||||||
@@ -509,6 +516,36 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
return self.model.unwrap()
|
return self.model.unwrap()
|
||||||
return self.model
|
return self.model
|
||||||
|
|
||||||
|
def _pad_query_start_loc_for_fia(
|
||||||
|
self, num_tokens_padded: int, num_reqs_padded: int, num_reqs: int
|
||||||
|
) -> int:
|
||||||
|
"""
|
||||||
|
This function is only designed to satisfied the constraint that when the layout is TND,
|
||||||
|
the first dimension of `hidden_states` must equal the last element of `actual_seq_lengths_q`.
|
||||||
|
"""
|
||||||
|
|
||||||
|
if num_tokens_padded == num_reqs_padded * self.uniform_decode_query_len:
|
||||||
|
# Uniform-batch case: num_reqs must be no greater than num_reqs_padded
|
||||||
|
assert num_reqs <= num_reqs_padded
|
||||||
|
|
||||||
|
last_loc = self.query_start_loc.np[num_reqs]
|
||||||
|
self.query_start_loc.np[num_reqs + 1 : num_reqs_padded + 1] = (
|
||||||
|
self.arange_np[1 : num_reqs_padded + 1 - num_reqs]
|
||||||
|
* self.uniform_decode_query_len
|
||||||
|
+ last_loc
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Mixed-batch case: num_reqs must equal num_reqs_padded
|
||||||
|
assert num_reqs == num_reqs_padded
|
||||||
|
|
||||||
|
# Insert a dummy request instead of setting query_start_loc[num_reqs] = num_tokens_padded directly
|
||||||
|
self.query_start_loc.np[num_reqs_padded + 1] = num_tokens_padded
|
||||||
|
num_reqs_padded = num_reqs_padded + 1
|
||||||
|
|
||||||
|
self.query_start_loc.copy_to_gpu()
|
||||||
|
|
||||||
|
return num_reqs_padded
|
||||||
|
|
||||||
def _prepare_inputs(
|
def _prepare_inputs(
|
||||||
self,
|
self,
|
||||||
scheduler_output: "SchedulerOutput",
|
scheduler_output: "SchedulerOutput",
|
||||||
@@ -666,10 +703,6 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
|
|
||||||
self.query_start_loc.np[0] = 0
|
self.query_start_loc.np[0] = 0
|
||||||
self.query_start_loc.np[1:num_reqs + 1] = cu_num_tokens
|
self.query_start_loc.np[1:num_reqs + 1] = cu_num_tokens
|
||||||
# NOTE: Due to the FIA operator limitation, here we pad so that hidden_states.shape[0]
|
|
||||||
# and self.query_start_loc[num_reqs_padded] are equal
|
|
||||||
self.query_start_loc.np[num_reqs + 1:] = (self.arange_np[1:self.max_num_reqs + 1 - num_reqs]
|
|
||||||
* self.uniform_decode_query_len + cu_num_tokens[-1])
|
|
||||||
self.query_start_loc.copy_to_gpu()
|
self.query_start_loc.copy_to_gpu()
|
||||||
|
|
||||||
self.seq_lens.np[:num_reqs] = (
|
self.seq_lens.np[:num_reqs] = (
|
||||||
@@ -1153,6 +1186,7 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
scheduler_output,
|
scheduler_output,
|
||||||
num_scheduled_tokens_np,
|
num_scheduled_tokens_np,
|
||||||
)
|
)
|
||||||
|
|
||||||
num_tokens_unpadded = scheduler_output.total_num_scheduled_tokens
|
num_tokens_unpadded = scheduler_output.total_num_scheduled_tokens
|
||||||
if self.pcp_size > 1:
|
if self.pcp_size > 1:
|
||||||
num_tokens_unpadded = self.pcp_manager.total_num_sampled_tokens_pcp
|
num_tokens_unpadded = self.pcp_manager.total_num_sampled_tokens_pcp
|
||||||
@@ -1207,6 +1241,11 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0
|
use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0
|
||||||
ubatch_slices_attn = ubatch_slices_padded if pad_attn else ubatch_slices
|
ubatch_slices_attn = ubatch_slices_padded if pad_attn else ubatch_slices
|
||||||
|
|
||||||
|
if cudagraph_mode != CUDAGraphMode.NONE:
|
||||||
|
num_reqs_padded = self._pad_query_start_loc_for_fia(
|
||||||
|
num_tokens_padded, num_reqs_padded, num_reqs
|
||||||
|
)
|
||||||
|
|
||||||
(attn_metadata, spec_decode_common_attn_metadata) = (
|
(attn_metadata, spec_decode_common_attn_metadata) = (
|
||||||
self._build_attention_metadata(
|
self._build_attention_metadata(
|
||||||
num_tokens=num_tokens_unpadded,
|
num_tokens=num_tokens_unpadded,
|
||||||
@@ -1341,7 +1380,6 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
assert broadcasted is not None
|
assert broadcasted is not None
|
||||||
logits = broadcasted["logits"]
|
logits = broadcasted["logits"]
|
||||||
|
|
||||||
|
|
||||||
# Apply structured output bitmasks if present
|
# Apply structured output bitmasks if present
|
||||||
self.execute_model_state = ExecuteModelState(
|
self.execute_model_state = ExecuteModelState(
|
||||||
scheduler_output,
|
scheduler_output,
|
||||||
@@ -1941,6 +1979,13 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
long_seq_metdadata = _get_pcp_metadata(num_tokens)
|
long_seq_metdadata = _get_pcp_metadata(num_tokens)
|
||||||
block_table_gid_0, slot_mapping_gid_0 = _get_block_table_and_slot_mapping(0)
|
block_table_gid_0, slot_mapping_gid_0 = _get_block_table_and_slot_mapping(0)
|
||||||
|
|
||||||
|
actual_last_loc = self.query_start_loc.np[num_reqs_padded]
|
||||||
|
error_msg = (
|
||||||
|
f"Due to FIA kernel constraints, when the layout is TND, "
|
||||||
|
f"the first dimension of `hidden_states` ({num_tokens_padded}) "
|
||||||
|
f"must equal the last element of `actual_seq_lengths_q` ({actual_last_loc})."
|
||||||
|
)
|
||||||
|
assert self.query_start_loc.np[num_reqs_padded] == num_tokens_padded, error_msg
|
||||||
cm_base = AscendCommonAttentionMetadata(
|
cm_base = AscendCommonAttentionMetadata(
|
||||||
query_start_loc=self.query_start_loc.gpu[: num_reqs_padded + 1],
|
query_start_loc=self.query_start_loc.gpu[: num_reqs_padded + 1],
|
||||||
query_start_loc_cpu=self.query_start_loc.cpu[: num_reqs_padded + 1],
|
query_start_loc_cpu=self.query_start_loc.cpu[: num_reqs_padded + 1],
|
||||||
@@ -2193,9 +2238,15 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
self.seq_lens.np[:num_reqs_padded] = seq_lens
|
self.seq_lens.np[:num_reqs_padded] = seq_lens
|
||||||
self.seq_lens.np[num_reqs_padded:] = 0
|
self.seq_lens.np[num_reqs_padded:] = 0
|
||||||
self.seq_lens.copy_to_gpu()
|
self.seq_lens.copy_to_gpu()
|
||||||
|
|
||||||
cum_num_tokens, _ = self._get_cumsum_and_arange(num_scheduled_tokens)
|
cum_num_tokens, _ = self._get_cumsum_and_arange(num_scheduled_tokens)
|
||||||
self.query_start_loc.np[1 : num_reqs_padded + 1] = cum_num_tokens
|
self.query_start_loc.np[1 : num_reqs_padded + 1] = cum_num_tokens
|
||||||
self.query_start_loc.copy_to_gpu()
|
self.query_start_loc.copy_to_gpu()
|
||||||
|
|
||||||
|
num_reqs_padded = self._pad_query_start_loc_for_fia(
|
||||||
|
num_tokens_padded, num_reqs_padded, num_reqs
|
||||||
|
)
|
||||||
|
|
||||||
pad_attn = cudagraph_runtime_mode == CUDAGraphMode.FULL
|
pad_attn = cudagraph_runtime_mode == CUDAGraphMode.FULL
|
||||||
attn_metadata, _ = self._build_attention_metadata(
|
attn_metadata, _ = self._build_attention_metadata(
|
||||||
num_tokens=num_tokens_unpadded,
|
num_tokens=num_tokens_unpadded,
|
||||||
|
|||||||
Reference in New Issue
Block a user