This reverts commit95649344aa. The CI failure doesn't related to this change. Let's reapply it. - vLLM version: v0.14.0 - vLLM main:d68209402d
This commit is contained in:
@@ -84,10 +84,7 @@ from vllm_ascend.attention.utils import AscendCommonAttentionMetadata, using_pag
|
||||
from vllm_ascend.compilation.acl_graph import (ACLGraphWrapper,
|
||||
set_draft_graph_params,
|
||||
set_graph_params,
|
||||
update_attn_dcp_pcp_params,
|
||||
update_attn_params,
|
||||
update_mla_attn_dcp_pcp_params,
|
||||
update_mla_attn_params)
|
||||
update_full_graph_params)
|
||||
# yapf: enable
|
||||
from vllm_ascend.eplb.adaptor.vllm_adaptor import VllmEplbAdaptor
|
||||
from vllm_ascend.eplb.core.eplb_device_transfer_loader import \
|
||||
@@ -1142,26 +1139,9 @@ class NPUModelRunner(GPUModelRunner):
|
||||
if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL \
|
||||
and not self.use_sparse:
|
||||
# TODO: maybe_padded_num_tokens will be removed, use num_input_tokens instead
|
||||
if self.vllm_config.model_config.use_mla:
|
||||
if self.pcp_size * self.dcp_size > 1:
|
||||
# FIXME: Try using `auto_dispatch_capture=True`
|
||||
update_mla_attn_dcp_pcp_params(self.update_stream,
|
||||
forward_context,
|
||||
maybe_padded_num_tokens)
|
||||
else:
|
||||
# FIXME: Try using `auto_dispatch_capture=True`
|
||||
update_mla_attn_params(self.update_stream, forward_context,
|
||||
maybe_padded_num_tokens,
|
||||
self.speculative_config)
|
||||
else:
|
||||
if self.pcp_size * self.dcp_size > 1:
|
||||
update_attn_dcp_pcp_params(self.update_stream,
|
||||
forward_context,
|
||||
maybe_padded_num_tokens)
|
||||
else:
|
||||
update_attn_params(self.update_stream, forward_context,
|
||||
maybe_padded_num_tokens,
|
||||
self.vllm_config)
|
||||
update_full_graph_params(self.attn_backend, self.update_stream, forward_context,
|
||||
maybe_padded_num_tokens, self.vllm_config,
|
||||
self.vllm_config.speculative_config)
|
||||
|
||||
if get_forward_context().sp_enabled and not isinstance(
|
||||
hidden_states, IntermediateTensors):
|
||||
@@ -2038,25 +2018,9 @@ class NPUModelRunner(GPUModelRunner):
|
||||
assert forward_context is not None
|
||||
if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL and \
|
||||
not forward_context.capturing and not self.use_sparse:
|
||||
if self.vllm_config.model_config.use_mla:
|
||||
# FIXME: Try using `auto_dispatch_capture=True`
|
||||
if self.pcp_size * self.dcp_size > 1:
|
||||
# FIXME: Try using `auto_dispatch_capture=True`
|
||||
update_mla_attn_dcp_pcp_params(self.update_stream,
|
||||
forward_context,
|
||||
positions.shape[0])
|
||||
else:
|
||||
# FIXME: Try using `auto_dispatch_capture=True`
|
||||
update_mla_attn_params(self.update_stream, forward_context,
|
||||
num_tokens, self.speculative_config)
|
||||
else:
|
||||
if self.pcp_size * self.dcp_size > 1:
|
||||
update_attn_dcp_pcp_params(self.update_stream,
|
||||
forward_context,
|
||||
positions.shape[0])
|
||||
else:
|
||||
update_attn_params(self.update_stream, forward_context,
|
||||
num_tokens, self.vllm_config)
|
||||
update_full_graph_params(self.attn_backend, self.update_stream, forward_context,
|
||||
num_tokens, self.vllm_config,
|
||||
self.speculative_config, positions.shape[0])
|
||||
|
||||
if self.use_aux_hidden_state_outputs:
|
||||
hidden_states, _ = hidden_states
|
||||
@@ -2899,7 +2863,7 @@ class NPUModelRunner(GPUModelRunner):
|
||||
attn_layers = get_layers_from_vllm_config(self.vllm_config,
|
||||
AttentionLayerBase)
|
||||
# NOTE: Must process Attention/MLAAttention before MambaBase to maintain
|
||||
# ordering expected by acl_graph.py's _update_attn_fia_params.
|
||||
# ordering expected by graph parameter update logic in attention backends.
|
||||
mamba_layers: dict[str, MambaBase] = {}
|
||||
for layer_name, attn_module in attn_layers.items():
|
||||
if isinstance(attn_module, Attention):
|
||||
|
||||
Reference in New Issue
Block a user