[refactor] refactor execute_model and _dummy_run method (#6043)
### What this PR does / why we need it?
The structure of the `execute_model` and `_dummy_run` methods in
NPUModelRunner differs greatly from that in GPUModelRunner. This PR
aligns them with GPUModelRunner:
- Split the `_prepare_inputs` method into `_prepare_inputs`,
`_determine_batch_execution_and_padding`, `_build_attention_metadata`,
and `_preprocess`.
- Rename `_generate_process_reqs_hidden_states` to `_model_forward`.
- Align the implementation of the `postprocess` phase (see the sketch below).
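
For orientation, here is a minimal runnable sketch of how the refactored phases compose. The phase and method names come from this PR; `NPUModelRunnerSketch`, its signatures, and all bodies are illustrative placeholders, not the real implementation:

```python
# Hypothetical skeleton mirroring the phase names introduced by this PR.
# All bodies are placeholders; the real NPUModelRunner works on tensors.
class NPUModelRunnerSketch:

    def _prepare_inputs(self, scheduler_output):
        # Gather per-request inputs (token ids, positions, ...).
        return {"tokens": scheduler_output["tokens"]}

    def _determine_batch_execution_and_padding(self, inputs):
        # Decide eager vs. graph execution and pad the batch to a bucket size.
        return {"padded_size": max(8, len(inputs["tokens"]))}

    def _build_attention_metadata(self, inputs, batch_desc):
        # Build backend-specific attention metadata for the padded batch.
        return {"seq_len": len(inputs["tokens"]), **batch_desc}

    def _preprocess(self, inputs, attn_metadata):
        # Last-mile preprocessing before the forward pass.
        return {**inputs, "attn": attn_metadata}

    def _model_forward(self, model_inputs):
        # Renamed from _generate_process_reqs_hidden_states by this PR.
        return [t * 2 for t in model_inputs["tokens"]]  # stand-in hidden states

    def execute_model(self, scheduler_output):
        inputs = self._prepare_inputs(scheduler_output)
        batch_desc = self._determine_batch_execution_and_padding(inputs)
        attn_metadata = self._build_attention_metadata(inputs, batch_desc)
        model_inputs = self._preprocess(inputs, attn_metadata)
        hidden_states = self._model_forward(model_inputs)
        # Postprocess (logits + sampling) is aligned with GPUModelRunner.
        return hidden_states


print(NPUModelRunnerSketch().execute_model({"tokens": [1, 2, 3]}))
```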
**Related-RFC**: https://github.com/vllm-project/vllm-ascend/issues/5449
**Co-authored-by**: @zhenwenqi2024
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main: d68209402d
---------
Signed-off-by: Wang Kunpeng <1289706727@qq.com>
Signed-off-by: gcanlin <canlinguosdu@gmail.com>
Signed-off-by: zhenwenqi2024 <zhenwenqi_2022@qq.com>
Co-authored-by: gcanlin <canlinguosdu@gmail.com>
Co-authored-by: zhenwenqi2024 <zhenwenqi_2022@qq.com>
@@ -123,6 +123,7 @@ def test_update_tokens_for_pcp_basic(tokens, num_reqs, num_computed_tokens,
     vllm_config = MagicMock()
     vllm_config.model_config = MagicMock()
     vllm_config.speculative_config.num_speculative_tokens = 0
+    vllm_config.scheduler_config.max_num_seqs = 1000

     pcp_manager = PCPManager(pcp_world_size=pcp_size,
                              pcp_rank=0,
||||
@@ -170,7 +170,7 @@ class AscendSFAMetadataBuilder(MLACommonMetadataBuilder[AscendSFAMetadata]):
                 npu_fused_infer_attention_score TND layout's limit of 16, \
                 got {self.decode_threshold}"
             )

         self.reorder_batch_threshold = self.decode_threshold
         self.attn_mask_builder = AttentionMaskBuilder(self.device)
         self.rope_dim = self.model_config.hf_text_config.qk_rope_head_dim
         self.enable_dsa_cp = enable_dsa_cp()
File diff suppressed because it is too large.
@@ -75,11 +75,13 @@ class PCPManager:
             dtype=torch.int32,
             device=device,
         )
+        self.pcp_tokens = np.zeros(self.max_num_reqs, dtype=np.int32)
+        self.total_num_sampled_tokens_pcp = 0
         self.num_pcp_pads_cpu_tensor = torch.zeros((max_num_reqs, ),
                                                    device="cpu",
                                                    dtype=torch.int64)
         self.num_pcp_pads_cpu = self.num_pcp_pads_cpu_tensor.numpy()
-        self.pcp_unpad_mask_cpu_tensor = torch.zeros(
+        self.pcp_unpad_mask_cpu_tensor = torch.ones(
             (max_buffer_num_tokens, ),
             device="cpu",
             dtype=torch.bool,
@@ -292,6 +294,8 @@ class PCPManager:
                                               all_positions.argsort())
         self.pcp_allgather_restore_idx.copy_to_gpu(all_positions.shape[0])

+        self.pcp_tokens[:num_reqs] = pcp_tokens[:num_reqs]
+        self.total_num_sampled_tokens_pcp = pcp_tokens[:num_reqs].sum()
         return (
             pcp_tokens[:num_reqs],
             positions,
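
As an aside, the `all_positions.argsort()` line above is the standard trick for undoing an interleaved all-gather: sorting the gathered original positions yields an index that restores sequence order. A toy sketch with made-up shard contents, not the PCPManager code:

```python
import torch

# Toy example: 2 "ranks", each holding an interleaved shard of 6 tokens.
# Positions are the tokens' original indices in the full sequence.
rank_positions = [torch.tensor([0, 2, 4]), torch.tensor([1, 3, 5])]
rank_hidden = [torch.tensor([10, 30, 50]), torch.tensor([20, 40, 60])]

# After an all-gather, shards are simply concatenated rank by rank...
all_positions = torch.cat(rank_positions)  # [0, 2, 4, 1, 3, 5]
all_hidden = torch.cat(rank_hidden)        # [10, 30, 50, 20, 40, 60]

# ...so argsort over the gathered positions gives the restore index.
restore_idx = all_positions.argsort()      # [0, 3, 1, 4, 2, 5]
print(all_hidden[restore_idx])             # tensor([10, 20, 30, 40, 50, 60])
```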
@@ -312,17 +316,16 @@ class PCPManager:
                 num_scheduled_tokens * self.pcp_world_size -
                 self.num_pcp_pads_cpu[:num_reqs]) < num_tokens_np

-    def get_padded_slot_mapping(self, num_tokens: int,
+    def get_padded_slot_mapping(self, num_tokens: int, num_tokens_padded: int,
                                 slot_mapping: torch.Tensor):
         # After pcp allgather and restore, there are padded tokens in kv,
         # so we need to pad the slot mapping for alignment.
-        pcp_padded_slot_mapping = self.pcp_padded_slot_mapping[:num_tokens *
-                                                               self.
-                                                               pcp_world_size]
+        pcp_padded_slot_mapping = self.pcp_padded_slot_mapping[:num_tokens_padded * self.pcp_world_size]

         cp_unpad_mask = self.pcp_unpad_mask_cpu_tensor[:num_tokens *
                                                        self.pcp_world_size]
         pcp_padded_slot_mapping.fill_(-1)
-        pcp_padded_slot_mapping[cp_unpad_mask] = slot_mapping
+        pcp_padded_slot_mapping[:num_tokens * self.pcp_world_size][cp_unpad_mask] = slot_mapping
         return pcp_padded_slot_mapping

     def get_restore_hidden_states(
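
The pad-and-scatter pattern in `get_padded_slot_mapping` is easier to see on a toy tensor: pre-fill the padded buffer with -1 (assumed here to mark slots the KV-cache writer should skip), then scatter the real slot ids through the unpad mask. A minimal sketch with made-up sizes, not the real PCPManager buffers:

```python
import torch

pcp_world_size = 2
num_tokens = 3          # gathered tokens per rank (may include pcp pads)
num_tokens_padded = 4   # per-rank count after extra padding

# True marks a real token in the (num_tokens * pcp_world_size) gather buffer;
# False marks a pcp pad whose KV slot must stay invalid.
cp_unpad_mask = torch.tensor([True, True, False, True, True, False])

# KV-cache slot ids for the real tokens only (mask.sum() entries).
slot_mapping = torch.tensor([7, 8, 20, 21])

padded = torch.full((num_tokens_padded * pcp_world_size,), -1,
                    dtype=torch.int64)  # -1: assumed "skip this slot" marker
padded[:num_tokens * pcp_world_size][cp_unpad_mask] = slot_mapping
print(padded)  # tensor([ 7,  8, -1, 20, 21, -1, -1, -1])
```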