[Bugfix] Fix the graph capture failure issue in the eagle3+full scenario. (#5553)
### What this PR does / why we need it?
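When the service is launched with `cudagraph_mode` set to `FULL` and
Eagle3 speculative decoding enabled, an error in FIA causes graph
capture to fail. This PR fixes the issue: `EagleProposer` now fills and
passes its own `query_start_loc` buffer when building the attention
metadata, and the speculative-only overwrite of the runner's
`query_start_loc` in `NPUModelRunner` is removed.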
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main: 7157596103
Signed-off-by: WithHades <244036962@qq.com>
This commit is contained in:
@@ -73,7 +73,8 @@ class EagleProposer(VllmEagleProposer):
|
|||||||
|
|
||||||
self.pcp_size = self.runner.pcp_size
|
self.pcp_size = self.runner.pcp_size
|
||||||
self.decode_threshold = 1 + self.num_speculative_tokens
|
self.decode_threshold = 1 + self.num_speculative_tokens
|
||||||
|
self.query_start_loc = self.runner._make_buffer(
|
||||||
|
self.runner.max_num_reqs + 1, dtype=torch.int32)
|
||||||
self.arange_cpu = torch.arange(self.arange.shape[0],
|
self.arange_cpu = torch.arange(self.arange.shape[0],
|
||||||
device="cpu",
|
device="cpu",
|
||||||
dtype=torch.int32)
|
dtype=torch.int32)
|
||||||
@@ -200,10 +201,14 @@ class EagleProposer(VllmEagleProposer):
|
|||||||
num_computed_tokens_cpu = (
|
num_computed_tokens_cpu = (
|
||||||
self.runner.input_batch.
|
self.runner.input_batch.
|
||||||
num_computed_tokens_cpu_tensor[:num_reqs])
|
num_computed_tokens_cpu_tensor[:num_reqs])
|
||||||
|
self.query_start_loc.cpu[:num_reqs + 1] = torch.tensor(
|
||||||
|
[0] + self.runner.actual_seq_lengths_q[:num_reqs],
|
||||||
|
device="cpu",
|
||||||
|
dtype=torch.int32)
|
||||||
|
self.query_start_loc.copy_to_gpu()
|
||||||
common_attn_metadata = AscendCommonAttentionMetadata(
|
common_attn_metadata = AscendCommonAttentionMetadata(
|
||||||
query_start_loc=self.runner.query_start_loc.gpu[:num_reqs + 1],
|
query_start_loc=self.query_start_loc.gpu[:num_reqs + 1],
|
||||||
query_start_loc_cpu=self.runner.query_start_loc.cpu[:num_reqs +
|
query_start_loc_cpu=self.query_start_loc.cpu[:num_reqs + 1],
|
||||||
1],
|
|
||||||
seq_lens_cpu=self.runner.seq_lens.cpu,
|
seq_lens_cpu=self.runner.seq_lens.cpu,
|
||||||
seq_lens=self.runner.seq_lens.gpu[:num_reqs],
|
seq_lens=self.runner.seq_lens.gpu[:num_reqs],
|
||||||
num_reqs=num_reqs,
|
num_reqs=num_reqs,
|
||||||
|
|||||||
@@ -1939,14 +1939,7 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
[0] * dcp_world_size for _ in range(pcp_world_size)
|
[0] * dcp_world_size for _ in range(pcp_world_size)
|
||||||
] for _ in range(num_tokens)]
|
] for _ in range(num_tokens)]
|
||||||
long_seq_metadata.num_computed_tokens_of_pcp_dcp = num_computed_tokens_of_pcp_dcp
|
long_seq_metadata.num_computed_tokens_of_pcp_dcp = num_computed_tokens_of_pcp_dcp
|
||||||
# QUESTION: Why do we separately set query_start_loc for spec in the first place?
|
|
||||||
# While in _prepare_inputs we don't?
|
|
||||||
if self.speculative_config:
|
|
||||||
self.query_start_loc.cpu[:num_reqs + 1] = torch.tensor(
|
|
||||||
[0] + self.actual_seq_lengths_q[:num_reqs],
|
|
||||||
device="cpu",
|
|
||||||
dtype=torch.int32)
|
|
||||||
self.query_start_loc.copy_to_gpu()
|
|
||||||
common_attn_metadata = AscendCommonAttentionMetadata(
|
common_attn_metadata = AscendCommonAttentionMetadata(
|
||||||
query_start_loc=self.query_start_loc.gpu[:num_reqs + 1],
|
query_start_loc=self.query_start_loc.gpu[:num_reqs + 1],
|
||||||
query_start_loc_cpu=self.query_start_loc.cpu[:num_reqs +
|
query_start_loc_cpu=self.query_start_loc.cpu[:num_reqs +
|
||||||
|
|||||||
Reference in New Issue
Block a user