[CI] Fix EAGLE CI problems (#6702)
### What this PR does / why we need it?
The new FIA operator requires queryT to be equal to the last element of
actualSequenceLengthQ.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Passed the existing test (test_mtp_eagle_correctness.py).
- vLLM version: v0.15.0
- vLLM main:
9562912cea
---------
Signed-off-by: Wangbingjie <wangbj1207@126.com>
Signed-off-by: Wangbingjie <w30061490@china.huawei.com>
Co-authored-by: Wangbingjie <w30061490@china.huawei.com>
This commit is contained in:
@@ -120,7 +120,6 @@ def test_deepseek_mtp_correctness(model_name: str, num_speculative_tokens: int,
|
|||||||
del spec_llm
|
del spec_llm
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skip(reason="Failed with CANN8.5, fix me")
|
|
||||||
@pytest.mark.parametrize("model_name", MODELS_EAGLE)
|
@pytest.mark.parametrize("model_name", MODELS_EAGLE)
|
||||||
@pytest.mark.parametrize("model_name_main", MODELS_MAIN)
|
@pytest.mark.parametrize("model_name_main", MODELS_MAIN)
|
||||||
@pytest.mark.parametrize("num_speculative_tokens", [1, 2])
|
@pytest.mark.parametrize("num_speculative_tokens", [1, 2])
|
||||||
@@ -169,7 +168,6 @@ def test_llama_qwen3_eagle_correctness(
|
|||||||
"draft_tensor_parallel_size":
|
"draft_tensor_parallel_size":
|
||||||
draft_tensor_parallel_size,
|
draft_tensor_parallel_size,
|
||||||
"max_model_len": 128,
|
"max_model_len": 128,
|
||||||
"draft_vocab_size": 128256,
|
|
||||||
},
|
},
|
||||||
compilation_config=CompilationConfig(
|
compilation_config=CompilationConfig(
|
||||||
cudagraph_mode="FULL_DECODE_ONLY",
|
cudagraph_mode="FULL_DECODE_ONLY",
|
||||||
|
|||||||
@@ -417,6 +417,21 @@ class EagleProposer(VllmEagleProposer):
|
|||||||
self.input_ids[last_token_indices] = next_token_ids
|
self.input_ids[last_token_indices] = next_token_ids
|
||||||
if self.use_cuda_graph and num_tokens <= self.runner.cudagraph_batch_sizes[-1]:
|
if self.use_cuda_graph and num_tokens <= self.runner.cudagraph_batch_sizes[-1]:
|
||||||
num_input_tokens = self.runner.cudagraph_dispatcher._bs_to_padded_graph_size[num_tokens]
|
num_input_tokens = self.runner.cudagraph_dispatcher._bs_to_padded_graph_size[num_tokens]
|
||||||
|
if not (
|
||||||
|
self.speculative_config.disable_padded_drafter_batch
|
||||||
|
and self.compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE
|
||||||
|
):
|
||||||
|
# TODO: Due to the inconsistency between the proposer `dispatcher` and model runner, this padding
|
||||||
|
# should have been done in the model runner but is not. For example, at the prefill stage, the target model
|
||||||
|
# is run in eager mode currently, which means `_pad_query_start_loc_for_fia` is not called,
|
||||||
|
# while the draft model is run in graph mode, which means we should pad the `query_start_loc`.
|
||||||
|
# Need to be fixed in the future.
|
||||||
|
num_reqs_padded = self.runner._pad_query_start_loc_for_fia(
|
||||||
|
num_input_tokens, common_attn_metadata.num_reqs, common_attn_metadata.num_reqs
|
||||||
|
)
|
||||||
|
common_attn_metadata.num_reqs = num_reqs_padded
|
||||||
|
common_attn_metadata.query_start_loc = self.runner.query_start_loc.gpu[: num_reqs_padded + 1]
|
||||||
|
common_attn_metadata.query_start_loc_cpu = self.runner.query_start_loc.cpu[: num_reqs_padded + 1]
|
||||||
else:
|
else:
|
||||||
num_input_tokens = num_tokens
|
num_input_tokens = num_tokens
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user