[refactor] refactor execute_model and _dummy_run methods (#6043)

### What this PR does / why we need it?
The structure of the `execute_model` and `_dummy_run` methods in
NPUModelRunner differs greatly from that in GPUModelRunner.
Achieve alignment with GPUModelRunner:
Split the `_prepare_inputs` method into `_prepare_inputs`,
`_determine_batch_execution_and_padding`, `_build_attention_metadata`,
and `_preprocess`.
Rename `_generate_process_reqs_hidden_states` to `_model_forward`.
Align the implementation of the `postprocess` phase

**Related-RFC**: https://github.com/vllm-project/vllm-ascend/issues/5449

**Co-authored-by**: @zhenwenqi2024 
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?

- vLLM version: v0.13.0
- vLLM main:
d68209402d

---------

Signed-off-by: Wang Kunpeng <1289706727@qq.com>
Signed-off-by: gcanlin <canlinguosdu@gmail.com>
Signed-off-by: zhenwenqi2024 <zhenwenqi_2022@qq.com>
Co-authored-by: gcanlin <canlinguosdu@gmail.com>
Co-authored-by: zhenwenqi2024 <zhenwenqi_2022@qq.com>
This commit is contained in:
Wang Kunpeng
2026-01-27 22:27:01 +08:00
committed by GitHub
parent 41eb71d665
commit c498cea22d
4 changed files with 825 additions and 650 deletions

View File

@@ -123,6 +123,7 @@ def test_update_tokens_for_pcp_basic(tokens, num_reqs, num_computed_tokens,
vllm_config = MagicMock()
vllm_config.model_config = MagicMock()
vllm_config.speculative_config.num_speculative_tokens = 0
vllm_config.scheduler_config.max_num_seqs = 1000
pcp_manager = PCPManager(pcp_world_size=pcp_size,
pcp_rank=0,

View File

@@ -170,7 +170,7 @@ class AscendSFAMetadataBuilder(MLACommonMetadataBuilder[AscendSFAMetadata]):
npu_fused_infer_attention_score TND layout's limit of 16, \
got {self.decode_threshold}"
)
self.reorder_batch_threshold = self.decode_threshold
self.attn_mask_builder = AttentionMaskBuilder(self.device)
self.rope_dim = self.model_config.hf_text_config.qk_rope_head_dim
self.enable_dsa_cp = enable_dsa_cp()

File diff suppressed because it is too large Load Diff

View File

@@ -75,11 +75,13 @@ class PCPManager:
dtype=torch.int32,
device=device,
)
self.pcp_tokens = np.zeros(self.max_num_reqs, dtype=np.int32)
self.total_num_sampled_tokens_pcp = 0
self.num_pcp_pads_cpu_tensor = torch.zeros((max_num_reqs, ),
device="cpu",
dtype=torch.int64)
self.num_pcp_pads_cpu = self.num_pcp_pads_cpu_tensor.numpy()
self.pcp_unpad_mask_cpu_tensor = torch.zeros(
self.pcp_unpad_mask_cpu_tensor = torch.ones(
(max_buffer_num_tokens, ),
device="cpu",
dtype=torch.bool,
@@ -292,6 +294,8 @@ class PCPManager:
all_positions.argsort())
self.pcp_allgather_restore_idx.copy_to_gpu(all_positions.shape[0])
self.pcp_tokens[:num_reqs] = pcp_tokens[:num_reqs]
self.total_num_sampled_tokens_pcp = pcp_tokens[:num_reqs].sum()
return (
pcp_tokens[:num_reqs],
positions,
@@ -312,17 +316,16 @@ class PCPManager:
num_scheduled_tokens * self.pcp_world_size -
self.num_pcp_pads_cpu[:num_reqs]) < num_tokens_np
def get_padded_slot_mapping(self, num_tokens: int,
def get_padded_slot_mapping(self, num_tokens: int, num_tokens_padded: int,
slot_mapping: torch.Tensor):
# After pcp allgather and restore, there are padded tokens in kv,
# so we need pad slotmapping for alignment.
pcp_padded_slot_mapping = self.pcp_padded_slot_mapping[:num_tokens *
self.
pcp_world_size]
pcp_padded_slot_mapping = self.pcp_padded_slot_mapping[:num_tokens_padded * self.pcp_world_size]
cp_unpad_mask = self.pcp_unpad_mask_cpu_tensor[:num_tokens *
self.pcp_world_size]
pcp_padded_slot_mapping.fill_(-1)
pcp_padded_slot_mapping[cp_unpad_mask] = slot_mapping
pcp_padded_slot_mapping[:num_tokens * self.pcp_world_size][cp_unpad_mask] = slot_mapping
return pcp_padded_slot_mapping
def get_restore_hidden_states(