[refactor] refactor execute_model and _dummy_run method (#6043)
### What this PR does / why we need it?
The structure of the `execute_model` and `_dummy_run` methods in
NPUModelRunner differs greatly from that in GPUModelRunner. This PR
aligns them with GPUModelRunner:
- Split the `_prepare_inputs` method into `_prepare_inputs`,
`_determine_batch_execution_and_padding`, `_build_attention_metadata`,
and `_preprocess`.
- Rename `_generate_process_reqs_hidden_states` to `_model_forward`.
- Align the implementation of the `postprocess` phase (see the sketch below).
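
For orientation, here is a minimal runnable sketch of how the refactored phases compose. The phase and method names come from this PR; `NPUModelRunnerSketch`, its signatures, and all bodies are illustrative placeholders, not the real implementation:

```python
# Hypothetical skeleton mirroring the phase names introduced by this PR.
# All bodies are placeholders; the real NPUModelRunner works on tensors.
class NPUModelRunnerSketch:

    def _prepare_inputs(self, scheduler_output):
        # Gather per-request inputs (token ids, positions, ...).
        return {"tokens": scheduler_output["tokens"]}

    def _determine_batch_execution_and_padding(self, inputs):
        # Decide eager vs. graph execution and pad the batch to a bucket size.
        return {"padded_size": max(8, len(inputs["tokens"]))}

    def _build_attention_metadata(self, inputs, batch_desc):
        # Build backend-specific attention metadata for the padded batch.
        return {"seq_len": len(inputs["tokens"]), **batch_desc}

    def _preprocess(self, inputs, attn_metadata):
        # Last-mile preprocessing before the forward pass.
        return {**inputs, "attn": attn_metadata}

    def _model_forward(self, model_inputs):
        # Renamed from _generate_process_reqs_hidden_states by this PR.
        return [t * 2 for t in model_inputs["tokens"]]  # stand-in hidden states

    def execute_model(self, scheduler_output):
        inputs = self._prepare_inputs(scheduler_output)
        batch_desc = self._determine_batch_execution_and_padding(inputs)
        attn_metadata = self._build_attention_metadata(inputs, batch_desc)
        model_inputs = self._preprocess(inputs, attn_metadata)
        hidden_states = self._model_forward(model_inputs)
        # Postprocess (logits + sampling) is aligned with GPUModelRunner.
        return hidden_states


print(NPUModelRunnerSketch().execute_model({"tokens": [1, 2, 3]}))
```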
**Related-RFC**: https://github.com/vllm-project/vllm-ascend/issues/5449
**Co-authored-by**: @zhenwenqi2024
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main: d68209402d
---------
Signed-off-by: Wang Kunpeng <1289706727@qq.com>
Signed-off-by: gcanlin <canlinguosdu@gmail.com>
Signed-off-by: zhenwenqi2024 <zhenwenqi_2022@qq.com>
Co-authored-by: gcanlin <canlinguosdu@gmail.com>
Co-authored-by: zhenwenqi2024 <zhenwenqi_2022@qq.com>
@@ -123,6 +123,7 @@ def test_update_tokens_for_pcp_basic(tokens, num_reqs, num_computed_tokens,
     vllm_config = MagicMock()
     vllm_config.model_config = MagicMock()
     vllm_config.speculative_config.num_speculative_tokens = 0
+    vllm_config.scheduler_config.max_num_seqs = 1000

     pcp_manager = PCPManager(pcp_world_size=pcp_size,
                              pcp_rank=0,
||||
@@ -170,7 +170,7 @@ class AscendSFAMetadataBuilder(MLACommonMetadataBuilder[AscendSFAMetadata]):
                 npu_fused_infer_attention_score TND layout's limit of 16, \
                 got {self.decode_threshold}"
             )

         self.reorder_batch_threshold = self.decode_threshold
         self.attn_mask_builder = AttentionMaskBuilder(self.device)
         self.rope_dim = self.model_config.hf_text_config.qk_rope_head_dim
         self.enable_dsa_cp = enable_dsa_cp()
File diff suppressed because it is too large.
@@ -75,11 +75,13 @@ class PCPManager:
             dtype=torch.int32,
             device=device,
         )
+        self.pcp_tokens = np.zeros(self.max_num_reqs, dtype=np.int32)
+        self.total_num_sampled_tokens_pcp = 0
         self.num_pcp_pads_cpu_tensor = torch.zeros((max_num_reqs, ),
                                                    device="cpu",
                                                    dtype=torch.int64)
         self.num_pcp_pads_cpu = self.num_pcp_pads_cpu_tensor.numpy()
-        self.pcp_unpad_mask_cpu_tensor = torch.zeros(
+        self.pcp_unpad_mask_cpu_tensor = torch.ones(
             (max_buffer_num_tokens, ),
             device="cpu",
             dtype=torch.bool,
@@ -292,6 +294,8 @@ class PCPManager:
                                               all_positions.argsort())
         self.pcp_allgather_restore_idx.copy_to_gpu(all_positions.shape[0])

+        self.pcp_tokens[:num_reqs] = pcp_tokens[:num_reqs]
+        self.total_num_sampled_tokens_pcp = pcp_tokens[:num_reqs].sum()
         return (
             pcp_tokens[:num_reqs],
             positions,
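
As an aside, the `all_positions.argsort()` line above is the standard trick for undoing an interleaved all-gather: sorting the gathered original positions yields an index that restores sequence order. A toy sketch with made-up shard contents, not the PCPManager code:

```python
import torch

# Toy example: 2 "ranks", each holding an interleaved shard of 6 tokens.
# Positions are the tokens' original indices in the full sequence.
rank_positions = [torch.tensor([0, 2, 4]), torch.tensor([1, 3, 5])]
rank_hidden = [torch.tensor([10, 30, 50]), torch.tensor([20, 40, 60])]

# After an all-gather, shards are simply concatenated rank by rank...
all_positions = torch.cat(rank_positions)  # [0, 2, 4, 1, 3, 5]
all_hidden = torch.cat(rank_hidden)        # [10, 30, 50, 20, 40, 60]

# ...so argsort over the gathered positions gives the restore index.
restore_idx = all_positions.argsort()      # [0, 3, 1, 4, 2, 5]
print(all_hidden[restore_idx])             # tensor([10, 20, 30, 40, 50, 60])
```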
@@ -312,17 +316,16 @@ class PCPManager:
                 num_scheduled_tokens * self.pcp_world_size -
                 self.num_pcp_pads_cpu[:num_reqs]) < num_tokens_np

-    def get_padded_slot_mapping(self, num_tokens: int,
+    def get_padded_slot_mapping(self, num_tokens: int, num_tokens_padded: int,
                                 slot_mapping: torch.Tensor):
         # After pcp allgather and restore, there are padded tokens in kv,
         # so we need to pad the slot mapping for alignment.
-        pcp_padded_slot_mapping = self.pcp_padded_slot_mapping[:num_tokens *
-                                                               self.
-                                                               pcp_world_size]
+        pcp_padded_slot_mapping = self.pcp_padded_slot_mapping[:num_tokens_padded * self.pcp_world_size]

         cp_unpad_mask = self.pcp_unpad_mask_cpu_tensor[:num_tokens *
                                                        self.pcp_world_size]
         pcp_padded_slot_mapping.fill_(-1)
-        pcp_padded_slot_mapping[cp_unpad_mask] = slot_mapping
+        pcp_padded_slot_mapping[:num_tokens * self.pcp_world_size][cp_unpad_mask] = slot_mapping
         return pcp_padded_slot_mapping

     def get_restore_hidden_states(
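
The pad-and-scatter pattern in `get_padded_slot_mapping` is easier to see on a toy tensor: pre-fill the padded buffer with -1 (assumed here to mark slots the KV-cache writer should skip), then scatter the real slot ids through the unpad mask. A minimal sketch with made-up sizes, not the real PCPManager buffers:

```python
import torch

pcp_world_size = 2
num_tokens = 3          # gathered tokens per rank (may include pcp pads)
num_tokens_padded = 4   # per-rank count after extra padding

# True marks a real token in the (num_tokens * pcp_world_size) gather buffer;
# False marks a pcp pad whose KV slot must stay invalid.
cp_unpad_mask = torch.tensor([True, True, False, True, True, False])

# KV-cache slot ids for the real tokens only (mask.sum() entries).
slot_mapping = torch.tensor([7, 8, 20, 21])

padded = torch.full((num_tokens_padded * pcp_world_size,), -1,
                    dtype=torch.int64)  # -1: assumed "skip this slot" marker
padded[:num_tokens * pcp_world_size][cp_unpad_mask] = slot_mapping
print(padded)  # tensor([ 7,  8, -1, 20, 21, -1, -1, -1])
```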