[1/N][Refactor] Refactor code to adapt with vllm main (#3612)
### What this PR does / why we need it?
This is step 1 of refactoring the code to adapt to vllm main; this PR is aligned with 17c540a993.

1. Refactor deepseek to the latest code arch as of 17c540a993.
2. Bunches of fixes due to vllm changes:
   - Fix `AscendScheduler` `__post_init__`, caused by https://github.com/vllm-project/vllm/pull/25075
   - Fix `AscendScheduler` init got an unexpected arg `block_size`, caused by https://github.com/vllm-project/vllm/pull/26296
   - Fix `KVCacheManager` `get_num_common_prefix_blocks` arg, caused by https://github.com/vllm-project/vllm/pull/23485
   - Fix `MLAAttention` import, caused by https://github.com/vllm-project/vllm/pull/25103
   - Fix `SharedFusedMoE` import, caused by https://github.com/vllm-project/vllm/pull/26145
   - Fix `LazyLoader` import, caused by https://github.com/vllm-project/vllm/pull/27022
   - Fix `vllm.utils.swap_dict_values` import, caused by https://github.com/vllm-project/vllm/pull/26990
   - Fix `Backend` enum import, caused by https://github.com/vllm-project/vllm/pull/25893
   - Fix the `CompilationLevel` to `CompilationMode` renaming issue introduced by https://github.com/vllm-project/vllm/pull/26355
   - Fix fused_moe ops, caused by https://github.com/vllm-project/vllm/pull/24097
   - Fix bert model because of `inputs_embeds`, caused by https://github.com/vllm-project/vllm/pull/25922
   - Fix MRope because of the rename of `get_input_positions_tensor` to `get_mrope_input_positions`, caused by https://github.com/vllm-project/vllm/pull/24172
   - Fix `splitting_ops` changes introduced by https://github.com/vllm-project/vllm/pull/25845
   - Fix multi-modality changes introduced by https://github.com/vllm-project/vllm/issues/16229
   - Fix lora bias dropping issue introduced by https://github.com/vllm-project/vllm/pull/25807
   - Fix structured output break introduced by https://github.com/vllm-project/vllm/issues/26737

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
CI passed with existing tests.

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
Signed-off-by: Icey <1790571317@qq.com>
Co-authored-by: Icey <1790571317@qq.com>
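Many of the fixes listed above are import/rename churn between the pinned vLLM release and vLLM main (for example the `CompilationLevel` to `CompilationMode` rename from vllm#26355). Below is a minimal sketch of the kind of fallback import a plugin can use to work on both sides of such a rename; the module path is an assumption for illustration and is not taken from this PR.

```python
# Hedged sketch: tolerate the CompilationLevel -> CompilationMode rename so
# the same plugin code runs against both vLLM v0.11.0 and vLLM main.
# The import path below is assumed for illustration and may not match where
# this PR actually resolves the name.
try:
    from vllm.config import CompilationMode  # newer vLLM main
except ImportError:
    # Older releases only expose the pre-rename name; alias it so the rest
    # of the code can use the new spelling unconditionally.
    from vllm.config import CompilationLevel as CompilationMode
```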
```diff
@@ -493,21 +493,19 @@ class AscendSFAImpl(MLAAttentionImpl):
        self.qk_head_dim = kwargs['qk_head_dim']
        self.v_head_dim = kwargs['v_head_dim']
        self.rotary_emb = kwargs['rotary_emb']
        self.q_proj = kwargs['q_proj']
        self.q_proj = kwargs['q_proj'] if self.q_lora_rank is None else kwargs[
            'q_b_proj']
        self.fused_qkv_a_proj = kwargs.get('fused_qkv_a_proj', None)
        self.kv_b_proj = kwargs['kv_b_proj']
        self.o_proj = kwargs['o_proj']
        self.indexer = kwargs['indexer']
        self.kv_a_proj_with_mqa = kwargs.get('kv_a_proj_with_mqa', None)
        self.kv_a_layernorm = kwargs.get('kv_a_layernorm', None)
        self.q_a_proj = kwargs.get('q_a_proj', None)
        self.q_a_layernorm = kwargs.get('q_a_layernorm', None)
        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
        self.tp_size = get_tensor_model_parallel_world_size()
        self.num_heads_per_rank = self.num_heads // self.tp_size
        if self.q_a_proj is not None:
            self.q_b_proj = self.q_proj
        else:
            self.q_b_proj = None
        self.q_b_proj = kwargs['q_b_proj']

        ascend_config = get_ascend_config()
        self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp
```
```diff
@@ -629,10 +627,13 @@ class AscendSFAImpl(MLAAttentionImpl):
        if has_decode:
            q_len = 1
            hidden_states_decode = hidden_states[:num_decode_tokens]
            decode_kq = self.q_a_proj(hidden_states_decode)  # q down
            decode_q_c = self.q_a_layernorm(decode_kq)  # q down layernorm
            decode_kv_no_split = self.kv_a_proj_with_mqa(
                hidden_states_decode)  # c_kv
            decode_qkv_lora = self.fused_qkv_a_proj(hidden_states_decode)[0]
            decode_q_c, decode_kv_no_split = decode_qkv_lora.split(
                [self.q_lora_rank, self.kv_lora_rank + self.qk_rope_head_dim],
                dim=-1,
            )
            decode_q_c = self.q_a_layernorm(decode_q_c)  # q down layernorm
            decode_kv_no_split = decode_kv_no_split.contiguous()

            # decode_q_c = q_c[:num_decode_tokens]
            decode_slot_mapping = attn_metadata.slot_mapping[:
```
```diff
@@ -713,10 +714,13 @@ class AscendSFAImpl(MLAAttentionImpl):

            hidden_states_prefill = hidden_states[
                num_decode_tokens:num_actual_tokens]
            prefill_kq = self.q_a_proj(hidden_states_prefill)  # q down
            prefill_q_c = self.q_a_layernorm(prefill_kq)  # q down layernorm
            prefill_kv_no_split = self.kv_a_proj_with_mqa(
                hidden_states_prefill)  # c_kv
            prefill_qkv_lora = self.fused_qkv_a_proj(hidden_states_prefill)[0]
            prefill_q_c, prefill_kv_no_split = prefill_qkv_lora.split(
                [self.q_lora_rank, self.kv_lora_rank + self.qk_rope_head_dim],
                dim=-1,
            )
            prefill_q_c = self.q_a_layernorm(prefill_q_c)  # q down layernorm
            prefill_kv_no_split = prefill_kv_no_split.contiguous()

            # prefill_q_c = q_c[
            #     num_decode_tokens:num_actual_tokens]
```
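Both the decode and prefill hunks above replace the separate `q_a_proj` / `kv_a_proj_with_mqa` down-projections with a single `fused_qkv_a_proj` followed by a `split`. The following is a standalone sketch of that split pattern with made-up dimensions; the real sizes come from the DeepSeek model config, and vLLM's linear layers return an `(output, bias)` tuple, which is why the diff indexes `[0]`.

```python
# Illustrative sketch of the fused QKV-A projection pattern; dimensions are
# assumed for demonstration only.
import torch
import torch.nn as nn

q_lora_rank = 1536        # assumed
kv_lora_rank = 512        # assumed
qk_rope_head_dim = 64     # assumed
hidden_size = 2048        # assumed

# One fused down-projection stands in for separate q_a_proj and
# kv_a_proj_with_mqa projections.
fused_qkv_a_proj = nn.Linear(
    hidden_size, q_lora_rank + kv_lora_rank + qk_rope_head_dim, bias=False)

hidden_states = torch.randn(4, hidden_size)  # 4 tokens
qkv_lora = fused_qkv_a_proj(hidden_states)

# Split back into the low-rank q part and the compressed kv (+ rope) part,
# mirroring decode_qkv_lora.split(...) / prefill_qkv_lora.split(...).
q_c, kv_no_split = qkv_lora.split(
    [q_lora_rank, kv_lora_rank + qk_rope_head_dim], dim=-1)

assert q_c.shape[-1] == q_lora_rank
assert kv_no_split.shape[-1] == kv_lora_rank + qk_rope_head_dim
```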
```diff
@@ -808,7 +812,7 @@ class AscendSFAImpl(MLAAttentionImpl):
        assert output is not None, "Output tensor must be provided."
        if attn_metadata is None:
            # Profiling run.
            return output
            return output.fill_(0)
        num_actual_tokens = attn_metadata.num_actual_tokens
        assert attn_metadata.num_decodes is not None and \
            attn_metadata.num_prefills is not None and \
```
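The last hunk changes the profiling-run early return from `return output` to `return output.fill_(0)`. A small hedged illustration of the difference: a buffer allocated with `torch.empty` holds arbitrary values until written, while `fill_(0)` zeroes it in place and still returns the same tensor object.

```python
import torch

# A preallocated output buffer; its contents are undefined until written.
output = torch.empty(2, 3)

# fill_ is in-place and returns self, so it can be used directly in a
# return statement.
result = output.fill_(0)
assert result is output
assert torch.all(output == 0)
```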