[refactor] refactor execute_model and _dummy_run methods (#6043)
### What this PR does / why we need it?
The structure of the `execute_model` and `_dummy_run` methods in
NPUModelRunner differs greatly from that in GPUModelRunner.
Achieve alignment with GPUModelRunner:
Split the `_prepare_inputs` method into `_prepare_inputs`,
`_determine_batch_execution_and_padding`, `_build_attention_metadata`,
and `_preprocess`.
Modify `_generate_process_reqs_hidden_states` to `_model_forward`.
Align the implementation of the `postprocess` phase
**Related-RFC**: https://github.com/vllm-project/vllm-ascend/issues/5449
**Co-authored-by**: @zhenwenqi2024
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main:
d68209402d
---------
Signed-off-by: Wang Kunpeng <1289706727@qq.com>
Signed-off-by: gcanlin <canlinguosdu@gmail.com>
Signed-off-by: zhenwenqi2024 <zhenwenqi_2022@qq.com>
Co-authored-by: gcanlin <canlinguosdu@gmail.com>
Co-authored-by: zhenwenqi2024 <zhenwenqi_2022@qq.com>
This commit is contained in:
@@ -123,6 +123,7 @@ def test_update_tokens_for_pcp_basic(tokens, num_reqs, num_computed_tokens,
|
|||||||
vllm_config = MagicMock()
|
vllm_config = MagicMock()
|
||||||
vllm_config.model_config = MagicMock()
|
vllm_config.model_config = MagicMock()
|
||||||
vllm_config.speculative_config.num_speculative_tokens = 0
|
vllm_config.speculative_config.num_speculative_tokens = 0
|
||||||
|
vllm_config.scheduler_config.max_num_seqs = 1000
|
||||||
|
|
||||||
pcp_manager = PCPManager(pcp_world_size=pcp_size,
|
pcp_manager = PCPManager(pcp_world_size=pcp_size,
|
||||||
pcp_rank=0,
|
pcp_rank=0,
|
||||||
|
|||||||
@@ -170,7 +170,7 @@ class AscendSFAMetadataBuilder(MLACommonMetadataBuilder[AscendSFAMetadata]):
|
|||||||
npu_fused_infer_attention_score TND layout's limit of 16, \
|
npu_fused_infer_attention_score TND layout's limit of 16, \
|
||||||
got {self.decode_threshold}"
|
got {self.decode_threshold}"
|
||||||
)
|
)
|
||||||
|
self.reorder_batch_threshold = self.decode_threshold
|
||||||
self.attn_mask_builder = AttentionMaskBuilder(self.device)
|
self.attn_mask_builder = AttentionMaskBuilder(self.device)
|
||||||
self.rope_dim = self.model_config.hf_text_config.qk_rope_head_dim
|
self.rope_dim = self.model_config.hf_text_config.qk_rope_head_dim
|
||||||
self.enable_dsa_cp = enable_dsa_cp()
|
self.enable_dsa_cp = enable_dsa_cp()
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -75,11 +75,13 @@ class PCPManager:
|
|||||||
dtype=torch.int32,
|
dtype=torch.int32,
|
||||||
device=device,
|
device=device,
|
||||||
)
|
)
|
||||||
|
self.pcp_tokens = np.zeros(self.max_num_reqs, dtype=np.int32)
|
||||||
|
self.total_num_sampled_tokens_pcp = 0
|
||||||
self.num_pcp_pads_cpu_tensor = torch.zeros((max_num_reqs, ),
|
self.num_pcp_pads_cpu_tensor = torch.zeros((max_num_reqs, ),
|
||||||
device="cpu",
|
device="cpu",
|
||||||
dtype=torch.int64)
|
dtype=torch.int64)
|
||||||
self.num_pcp_pads_cpu = self.num_pcp_pads_cpu_tensor.numpy()
|
self.num_pcp_pads_cpu = self.num_pcp_pads_cpu_tensor.numpy()
|
||||||
self.pcp_unpad_mask_cpu_tensor = torch.zeros(
|
self.pcp_unpad_mask_cpu_tensor = torch.ones(
|
||||||
(max_buffer_num_tokens, ),
|
(max_buffer_num_tokens, ),
|
||||||
device="cpu",
|
device="cpu",
|
||||||
dtype=torch.bool,
|
dtype=torch.bool,
|
||||||
@@ -292,6 +294,8 @@ class PCPManager:
|
|||||||
all_positions.argsort())
|
all_positions.argsort())
|
||||||
self.pcp_allgather_restore_idx.copy_to_gpu(all_positions.shape[0])
|
self.pcp_allgather_restore_idx.copy_to_gpu(all_positions.shape[0])
|
||||||
|
|
||||||
|
self.pcp_tokens[:num_reqs] = pcp_tokens[:num_reqs]
|
||||||
|
self.total_num_sampled_tokens_pcp = pcp_tokens[:num_reqs].sum()
|
||||||
return (
|
return (
|
||||||
pcp_tokens[:num_reqs],
|
pcp_tokens[:num_reqs],
|
||||||
positions,
|
positions,
|
||||||
@@ -312,17 +316,16 @@ class PCPManager:
|
|||||||
num_scheduled_tokens * self.pcp_world_size -
|
num_scheduled_tokens * self.pcp_world_size -
|
||||||
self.num_pcp_pads_cpu[:num_reqs]) < num_tokens_np
|
self.num_pcp_pads_cpu[:num_reqs]) < num_tokens_np
|
||||||
|
|
||||||
def get_padded_slot_mapping(self, num_tokens: int,
|
def get_padded_slot_mapping(self, num_tokens: int, num_tokens_padded: int,
|
||||||
slot_mapping: torch.Tensor):
|
slot_mapping: torch.Tensor):
|
||||||
# After pcp allgather and restore, there are padded tokens in kv,
|
# After pcp allgather and restore, there are padded tokens in kv,
|
||||||
# so we need pad slotmapping for alignment.
|
# so we need pad slotmapping for alignment.
|
||||||
pcp_padded_slot_mapping = self.pcp_padded_slot_mapping[:num_tokens *
|
pcp_padded_slot_mapping = self.pcp_padded_slot_mapping[:num_tokens_padded * self.pcp_world_size]
|
||||||
self.
|
|
||||||
pcp_world_size]
|
|
||||||
cp_unpad_mask = self.pcp_unpad_mask_cpu_tensor[:num_tokens *
|
cp_unpad_mask = self.pcp_unpad_mask_cpu_tensor[:num_tokens *
|
||||||
self.pcp_world_size]
|
self.pcp_world_size]
|
||||||
pcp_padded_slot_mapping.fill_(-1)
|
pcp_padded_slot_mapping.fill_(-1)
|
||||||
pcp_padded_slot_mapping[cp_unpad_mask] = slot_mapping
|
pcp_padded_slot_mapping[:num_tokens * self.pcp_world_size][cp_unpad_mask] = slot_mapping
|
||||||
return pcp_padded_slot_mapping
|
return pcp_padded_slot_mapping
|
||||||
|
|
||||||
def get_restore_hidden_states(
|
def get_restore_hidden_states(
|
||||||
|
|||||||
Reference in New Issue
Block a user