[Feature] Support for cross-attention and the Whisper model (#5592)
### What this PR does / why we need it?
Resolves https://github.com/vllm-project/vllm-ascend/issues/2262 by adding:
- Support for cross-attention when the model is an encoder-decoder model
- Support for the Whisper model (a usage sketch follows the version info below)
- vLLM version: v0.13.0
- vLLM main: 7157596103
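
For context, a minimal offline-transcription sketch of the workload this enables. The model name, prompt format, and parameter values below are illustrative assumptions based on vLLM's generic multimodal API, not something this PR defines:

```python
# Hedged sketch: transcribe audio with a Whisper checkpoint through vLLM.
# Assumes a build that includes this PR; values are examples, not requirements.
from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset

llm = LLM(
    model="openai/whisper-large-v3",      # any Whisper checkpoint
    max_model_len=448,                    # Whisper decoder context length
    limit_mm_per_prompt={"audio": 1},     # one audio clip per request
)

prompt = {
    "prompt": "<|startoftranscript|>",
    "multi_modal_data": {
        # Sample clip bundled with vLLM's test assets.
        "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
    },
}

outputs = llm.generate([prompt],
                       SamplingParams(temperature=0, max_tokens=200))
print(outputs[0].outputs[0].text)
```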
Signed-off-by: gh924 <guihao2@huawei.com>
Co-authored-by: Aoxuan Chen <43376869+chenaoxuan@users.noreply.github.com>
@@ -55,7 +55,7 @@ from vllm.utils.mem_utils import DeviceMemoryProfiler
 from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
 from vllm.v1.attention.backends.utils import CommonAttentionMetadata
 from vllm.v1.core.sched.output import SchedulerOutput
-from vllm.v1.kv_cache_interface import (AttentionSpec,
+from vllm.v1.kv_cache_interface import (AttentionSpec, CrossAttentionSpec,
                                         EncoderOnlyAttentionSpec,
                                         FullAttentionSpec, KVCacheConfig,
                                         KVCacheGroupSpec, KVCacheSpec,
@@ -315,7 +315,8 @@ class NPUModelRunner(GPUModelRunner):
         # the block_sizes in the kv cache config.
         self.input_batch = NPUInputBatch(
             max_num_reqs=self.max_num_reqs,
-            max_model_len=self.model_config.max_model_len,
+            max_model_len=max(self.model_config.max_model_len,
+                              self.max_encoder_len),
             max_num_batched_tokens=self.max_num_tokens,
             device=self.device,
             pin_memory=self.pin_memory,
@@ -485,7 +486,8 @@ class NPUModelRunner(GPUModelRunner):
         intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> tuple[dict[str, Any], torch.Tensor, np.ndarray, int, torch.Tensor,
                int, torch.Tensor, SpecDecodeMetadata, Optional[torch.Tensor],
-               Optional[torch.Tensor], Optional[torch.Tensor], int]:
+               Optional[torch.Tensor], Optional[torch.Tensor], int, dict[str,
+                                                                         Any]]:
         total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
         assert total_num_scheduled_tokens > 0
         num_reqs = self.input_batch.num_reqs
@@ -729,7 +731,11 @@ class NPUModelRunner(GPUModelRunner):
 
         # _prepare_inputs may reorder the batch, so we must gather
         # multi-modal outputs after that to ensure the correct order
-        if self.is_multimodal_model:
+        if vllm_version_is('0.13.0'):
+            model_kwargs = self._init_model_kwargs(num_input_tokens)
+        else:
+            model_kwargs = self._init_model_kwargs()
+        if self.is_multimodal_model and not self.model_config.is_encoder_decoder:
             self.multimodal_cpu_fields = ["grid_thw"]
             self._prepare_multimodal_fields()
             with self.maybe_get_ec_connector_output(
@@ -796,6 +802,13 @@ class NPUModelRunner(GPUModelRunner):
         else:
             positions = self.positions.gpu[:num_input_tokens]
 
+        # Run the encoder, just like we do with other multimodal inputs.
+        if self.model_config.is_encoder_decoder and scheduler_output.scheduled_encoder_inputs:
+            input_ids = self.input_ids.gpu[:total_num_scheduled_tokens]
+            positions = self.positions.gpu[:total_num_scheduled_tokens]
+            encoder_outputs = self._execute_mm_encoder(scheduler_output)
+            model_kwargs.update({"encoder_outputs": encoder_outputs})
+
         # type: ignore
         if get_pp_group().is_first_rank:
             intermediate_tensors = None
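
Note on the hunk above: for encoder-decoder requests the multimodal encoder now runs once per scheduled encoder input, and its outputs reach the model through `model_kwargs["encoder_outputs"]`; every subsequent decode step reuses those fixed encoder states through cross-attention. A toy PyTorch illustration of this run-once/reuse-many pattern (shapes and names are made up for illustration, not vLLM's API):

```python
import torch

# Toy cross-attention: the encoder states are computed once and reused at
# every decoder step, so only they need to be cached for the request.
encoder_states = torch.randn(1, 1500, 1280)    # e.g. Whisper: 1500 frames x d_model

def decode_step(decoder_hidden: torch.Tensor) -> torch.Tensor:
    # Queries come from the decoder, keys/values from the fixed encoder states.
    scores = decoder_hidden @ encoder_states.transpose(1, 2)
    attn = torch.softmax(scores / encoder_states.shape[-1] ** 0.5, dim=-1)
    return attn @ encoder_states

for _ in range(3):                             # many decode steps, one encoder pass
    print(decode_step(torch.randn(1, 1, 1280)).shape)   # torch.Size([1, 1, 1280])
```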
@@ -880,6 +893,11 @@ class NPUModelRunner(GPUModelRunner):
         # in the same group share the same metadata.
         for kv_cache_group_id, kv_cache_group_spec in enumerate(
                 self.kv_cache_config.kv_cache_groups):
+            encoder_seq_lens, encoder_seq_lens_cpu = self._get_encoder_seq_lens(
+                scheduler_output.num_scheduled_tokens or {},
+                kv_cache_group_spec.kv_cache_spec,
+                self.input_batch.num_reqs,
+            )
             if isinstance(kv_cache_group_spec.kv_cache_spec,
                           EncoderOnlyAttentionSpec):
                 # Encoder-only layers do not have KV cache, so we need to
@@ -977,7 +995,8 @@ class NPUModelRunner(GPUModelRunner):
                 decode_token_per_req=self.decode_token_per_req,
                 prefill_context_parallel_metadata=self.long_seq_metadata,
                 max_seq_len=0,
-            )
+                encoder_seq_lens=encoder_seq_lens,
+                encoder_seq_lens_cpu=encoder_seq_lens_cpu)
 
             if self.speculative_config and self.pcp_size * self.dcp_size > 1:
                 # For pcp + spec decode, we flatten block_table
@@ -1059,7 +1078,7 @@ class NPUModelRunner(GPUModelRunner):
                 num_input_tokens, num_tokens_across_dp,
                 maybe_padded_num_tokens, logits_indices, spec_decode_metadata,
                 input_ids, inputs_embeds, intermediate_tensors,
-                max_num_scheduled_tokens)
+                max_num_scheduled_tokens, model_kwargs)
 
     # all-gather one hidden-states in sp scene
     @staticmethod
@@ -1091,22 +1110,13 @@ class NPUModelRunner(GPUModelRunner):
     def _generate_process_reqs_hidden_states(self, maybe_padded_num_tokens,
                                              input_ids, positions,
                                              intermediate_tensors,
-                                             inputs_embeds):
+                                             inputs_embeds, model_kwargs):
         assert self.model is not None
-        if vllm_version_is('0.13.0'):
-            hidden_states = self.model(
-                input_ids=input_ids,
-                positions=positions,
-                intermediate_tensors=intermediate_tensors,
-                inputs_embeds=inputs_embeds,
-                **self._init_model_kwargs(maybe_padded_num_tokens))
-        else:
-            hidden_states = self.model(
-                input_ids=input_ids,
-                positions=positions,
-                intermediate_tensors=intermediate_tensors,
-                inputs_embeds=inputs_embeds,
-                **self._init_model_kwargs())
+        hidden_states = self.model(input_ids=input_ids,
+                                   positions=positions,
+                                   intermediate_tensors=intermediate_tensors,
+                                   inputs_embeds=inputs_embeds,
+                                   **model_kwargs)
 
         forward_context = get_forward_context()
         if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL \
@@ -1465,9 +1475,9 @@ class NPUModelRunner(GPUModelRunner):
             (attn_metadata, positions, num_scheduled_tokens_np,
              num_input_tokens, num_tokens_across_dp, maybe_padded_num_tokens,
              logits_indices, spec_decode_metadata, input_ids, inputs_embeds,
-             intermediate_tensors,
-             max_query_len) = (self._prepare_inputs(scheduler_output,
-                                                    intermediate_tensors))
+             intermediate_tensors, max_query_len,
+             model_kwargs) = (self._prepare_inputs(scheduler_output,
+                                                   intermediate_tensors))
 
             if self.dynamic_eplb:
                 self.eplb_updator.take_update_info_from_eplb_process()
@@ -1512,7 +1522,7 @@ class NPUModelRunner(GPUModelRunner):
 
             hidden_states = self._generate_process_reqs_hidden_states(
                 maybe_padded_num_tokens, input_ids, positions,
-                intermediate_tensors, inputs_embeds)
+                intermediate_tensors, inputs_embeds, model_kwargs)
 
             self.maybe_wait_for_kv_save()
             finished_sending, finished_recving = self.get_finished_kv_transfer(
@@ -2152,7 +2162,7 @@ class NPUModelRunner(GPUModelRunner):
                                num_sampled_tokens):
         # Make sure padding doesn't exceed max_num_tokens
         assert num_tokens_padded <= self.max_num_tokens
-        if self.is_multimodal_model:
+        if self.is_multimodal_model and not self.model_config.is_encoder_decoder:
            input_ids = None
            inputs_embeds = self.inputs_embeds.gpu[:num_tokens_padded]
        elif self.enable_prompt_embeds:
@@ -2546,7 +2556,7 @@ class NPUModelRunner(GPUModelRunner):
 
                 # TODO: remove this after the OOM issue is located and fixed, otherwise, some model may
                 # encounter OOM issue
-                if isinstance(kv_cache_spec, FullAttentionSpec):
+                if isinstance(kv_cache_spec, AttentionSpec):
                     raw_dsa_k_tensor = None
                     if self.use_sparse:
                         raw_k_tensor, raw_v_tensor, raw_dsa_k_tensor = kv_cache_raw_tensors[  # type: ignore
@@ -2721,7 +2731,8 @@ class NPUModelRunner(GPUModelRunner):
                 "for more details.")
         self.input_batch = NPUInputBatch(
             max_num_reqs=self.max_num_reqs,
-            max_model_len=self.model_config.max_model_len,
+            max_model_len=max(self.model_config.max_model_len,
+                              self.max_encoder_len),
             max_num_batched_tokens=self.max_num_tokens,
             device=self.device,
             pin_memory=self.pin_memory,
@@ -2889,7 +2900,11 @@ class NPUModelRunner(GPUModelRunner):
                 # encoder-only attention does not need KV cache.
                 continue
             elif attn_module.attn_type == AttentionType.ENCODER_DECODER:
-                raise NotImplementedError
+                kv_cache_spec[layer_name] = CrossAttentionSpec(
+                    block_size=block_size,
+                    num_kv_heads=attn_module.num_kv_heads,
+                    head_size=attn_module.head_size,
+                    dtype=self.kv_cache_dtype)
             else:
                 raise ValueError(
                     f"Unknown attention type: {attn_module.attn_type}")