Fix some ci issue and refactor modelrunner (#2445)

### What this PR does / why we need it? Fix some ci issue and refactor modelrunner ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? CI passed with existing test. - vLLM version: v0.10.0 - vLLM main: 4d9c61993a --------- Signed-off-by: wangli <wangli858794774@gmail.com> Signed-off-by: MengqingCao <cmq0113@163.com> Signed-off-by: weiguihua2 <weiguihua2@huawei.com> Co-authored-by: wangli <wangli858794774@gmail.com> Co-authored-by: weiguihua2 <weiguihua2@huawei.com>
2025-08-20 09:01:04 +08:00
parent 955411611c
commit 1327f9be1c
28 changed files with 1612 additions and 1020 deletions
--- a/vllm_ascend/worker/eagle_proposer_v1.py
+++ b/vllm_ascend/worker/eagle_proposer_v1.py
@@ -16,6 +16,7 @@ from vllm.v1.sample.metadata import SamplingMetadata
 from vllm_ascend.ascend_forward_context import set_ascend_forward_context
 from vllm_ascend.attention.attention_mask import AttentionMaskBuilder
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
+from vllm_ascend.attention.utils import AscendCommonAttentionMetadata

 PADDING_SLOT_ID = -1

@@ -125,12 +126,27 @@ class EagleProposer:
        query_lens = cu_num_tokens[1:] - cu_num_tokens[:-1]
        max_query_len = query_lens.max().item()

-        # FIXME(woosuk): The below two ops cause synchronization. Optimize.
-        attn_metadata = self.runner.attn_metadata_builder.build(
+        common_attn_metadata = AscendCommonAttentionMetadata(
+            query_start_loc=self.runner.query_start_loc[:batch_size + 1],
+            query_start_loc_cpu=self.runner.query_start_loc_cpu[:batch_size +
+                                                                1],
+            seq_lens_cpu=self.runner.seq_lens_cpu,
+            max_query_len=max_query_len,
            num_reqs=batch_size,
            num_actual_tokens=num_tokens,
-            max_query_len=max_query_len,
+            actual_seq_lengths_q=self.runner.actual_seq_lengths_q,
+            block_table_tensor=self.runner.input_batch.block_table[0].
+            get_device_tensor(),
+            slot_mapping_cpu=target_slot_mapping,
+            positions=target_positions,
+            attn_mask=self.runner.attn_mask,
+            spec_attn_mask=self.runner.spec_attn_mask,
+            attn_state=self.runner.attn_state,
+            decode_token_per_req=self.runner.decode_token_per_req,
        )
+        # FIXME(woosuk): The below two ops cause synchronization. Optimize.
+        attn_metadata = self.runner.attn_metadata_builder.build(
+            common_attn_metadata, self.runner.model)
        if self.use_cuda_graph and \
            num_tokens <= self.cudagraph_batch_sizes[-1]:
            num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)