[Bugfix] Fix the Eagle3 inference failure issue. (#4721)
### What this PR does / why we need it?

Fix the Eagle3 inference failure issue.

Error message: "EngineCore encountered an issue. See stack trace (above) for the root cause."

Fixes https://github.com/vllm-project/vllm-ascend/issues/4323

### How was this patch tested?

```
vllm serve /nfs/1_AscendPackage/05_weights_public/Qwen3-32B \
  --served-model-name Qwen3-32B \
  -tp 4 \
  --host "0.0.0.0" \
  --port "8000" \
  --trust-remote-code \
  --speculative-config '{"method":"eagle3","model":"/home/scd/qwen3_32b_eagle3/","num_speculative_tokens":4,"draft_tensor_parallel_size":1}' \
  --max-num-batched-tokens 4096 \
  --max-model-len 4096
```

```
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen3-32B",
    "prompt": "hi, where is the capital of France?",
    "max_tokens": 10,
    "temperature": 0
  }' | python3 -m json.tool
```

vLLM version: v0.11.0
vLLM-ascend version: v0.11.0rc2

Signed-off-by: 17764591921 <sunchend@outlook.com>
```diff
@@ -72,7 +72,7 @@ class EagleProposer(Proposer):
                                  dtype=torch.int32)
         attn_mask_len = self.vllm_config.model_config.max_model_len
         self.attn_mask_builder = AttentionMaskBuilder(
-            attn_mask_len, self.vllm_config.model_config.dtype)
+            attn_mask_len, self.vllm_config.model_config.dtype, device=device)
 
     def load_model(self, model: nn.Module) -> None:
         target_attn_layer_names = set(
```
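The `AttentionMaskBuilder` is now constructed with an explicit `device` argument, so the shared mask can live on the NPU from the start instead of being moved (or rebuilt) later. A minimal sketch of a builder that pre-allocates its mask on the requested device; the class name matches the diff, but the body here is a hypothetical simplification:

```python
import torch


class AttentionMaskBuilder:
    """Simplified sketch: pre-allocate a causal mask on the target device."""

    def __init__(self, max_seq_len: int, dtype: torch.dtype, device=None):
        # Strictly-upper-triangular "masked" region; building it directly on
        # `device` avoids a host-to-device copy on every decode step.
        fill = torch.finfo(dtype).min if dtype.is_floating_point else 1
        self.attn_mask = torch.triu(
            torch.full((max_seq_len, max_seq_len), fill,
                       dtype=dtype, device=device),
            diagonal=1)
```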
```diff
@@ -424,9 +424,7 @@ class EagleProposer(Proposer):
 
         query_lens = cu_num_tokens[1:] - cu_num_tokens[:-1]
         max_query_len = query_lens.max().item()
-        attn_mask = self.attn_mask_builder.get_splitfuse_attn_mask(
-            seq_lens, target_positions, self.vllm_config.model_config.dtype,
-            self.device)
+        attn_mask = self.runner.attn_mask
 
         common_attn_metadata = AscendCommonAttentionMetadata(
             query_start_loc=cu_num_tokens.to(device),
```
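With the builder owning a device-resident mask, the proposer no longer calls `get_splitfuse_attn_mask(...)` per proposal; it reuses the mask the model runner already holds, keeping the draft pass and the target pass on the same tensor. A sketch of the sharing pattern (class shapes here are hypothetical):

```python
import torch


class RunnerSketch:
    """Stand-in for the model runner, which owns the attention mask."""

    def __init__(self, attn_mask: torch.Tensor):
        self.attn_mask = attn_mask


class ProposerSketch:
    """Stand-in for EagleProposer: borrows the runner's mask."""

    def __init__(self, runner: RunnerSketch):
        self.runner = runner

    def propose(self) -> torch.Tensor:
        # Reuse instead of rebuilding a split-fuse mask on every call.
        return self.runner.attn_mask
```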
```diff
@@ -506,9 +504,15 @@ class EagleProposer(Proposer):
         attn_metadata.num_actual_tokens = batch_size
         attn_metadata.max_query_len = 1
         attn_metadata.query_start_loc = self.arange[:batch_size + 1]
+        attn_metadata.query_start_loc_list = attn_metadata.query_start_loc[
+            1:].tolist()
+        attn_metadata.num_decodes, attn_metadata.num_prefills, attn_metadata.num_decode_tokens, attn_metadata.num_prefill_tokens = 0, batch_size, 0, batch_size
+        attn_metadata.num_actual_tokens_pcp_padded = attn_metadata.num_decode_tokens + attn_metadata.num_prefill_tokens
         query_lens.fill_(1)
         attn_metadata.query_lens = query_lens
 
+        attn_metadata.actual_seq_lengths_q = [1 + i for i in range(batch_size)]
+        attn_metadata.seq_lens_list = seq_lens.tolist()
         attn_metadata.attn_state = AscendAttentionState.ChunkedPrefill
         for now_speculative in range(
                 self.vllm_config.speculative_config.num_speculative_tokens -
```
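In this per-step decode path every request contributes exactly one token, and the added lines fill in the list-typed mirrors and the decode/prefill split that the Ascend attention backend expects. A worked example with a hypothetical `batch_size = 3`, tracing the values the added lines produce:

```python
import torch

batch_size = 3
arange = torch.arange(batch_size + 1)  # stands in for self.arange

query_start_loc = arange[:batch_size + 1]            # tensor([0, 1, 2, 3])
query_start_loc_list = query_start_loc[1:].tolist()  # [1, 2, 3]

# The whole batch is booked as one-token "prefill" chunks:
num_decodes, num_prefills = 0, batch_size                              # 0, 3
num_decode_tokens, num_prefill_tokens = 0, batch_size                  # 0, 3
num_actual_tokens_pcp_padded = num_decode_tokens + num_prefill_tokens  # 3

actual_seq_lengths_q = [1 + i for i in range(batch_size)]  # [1, 2, 3]
```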
```diff
@@ -535,6 +539,9 @@ class EagleProposer(Proposer):
             # TODO: Increment the sequence lengths.
 
             attn_metadata.seq_lens += 1
+            attn_metadata.seq_lens_list = [
+                _ + 1 for _ in attn_metadata.seq_lens_list
+            ]
             # TODO: Consider max model length.
             # attn_metadata.max_seq_len = min(attn_metadata.max_seq_len,
             #                                 self.max_model_len)
```
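`seq_lens` (a tensor) was already incremented once per speculative step; the added lines keep the newly introduced Python-list mirror `seq_lens_list` in lockstep with it. The invariant, as a tiny runnable sketch:

```python
import torch

seq_lens = torch.tensor([5, 9, 12])
seq_lens_list = seq_lens.tolist()

for _ in range(3):  # three speculative steps
    seq_lens += 1
    seq_lens_list = [x + 1 for x in seq_lens_list]
    assert seq_lens_list == seq_lens.tolist()  # list stays in sync
```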
```diff
@@ -61,6 +61,7 @@ _IS_VL_MODEL = None
 _ENABLE_SP = None
 _HAS_LAYER_IDX = None
 _ENABLE_NZ = None
+_IS_EAGLE_MODE = None
 
 
 def is_310p():
```
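The second file adds `_IS_EAGLE_MODE` as another lazily computed module global: like `_ENABLE_NZ`, it is resolved on the first `is_enable_nz` call that supplies a `vllm_config` and then reused. A generic sketch of this lazy module-global pattern (names here are illustrative, not from the codebase):

```python
from typing import Optional

_CACHED_FLAG: Optional[bool] = None


def get_flag(config: Optional[dict] = None) -> bool:
    """Resolve once on first use, then serve the cached value."""
    global _CACHED_FLAG
    if _CACHED_FLAG is None:
        if config is None:
            raise ValueError("config must be provided on the first call")
        _CACHED_FLAG = bool(config.get("enable", False))
    return _CACHED_FLAG
```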
```diff
@@ -73,14 +74,20 @@ def is_310p():
 
 def is_enable_nz(dtype: Optional[torch.dtype] = torch.int8,
                  vllm_config: Optional[VllmConfig] = None) -> bool:
-    global _ENABLE_NZ
+    global _ENABLE_NZ, _IS_EAGLE_MODE
     if _ENABLE_NZ is None:
         if not vllm_config:
             raise ValueError(
                 "vllm_config must be provided when _ENABLE_NZ is None")
         _ENABLE_NZ = envs_ascend.VLLM_ASCEND_ENABLE_NZ and vllm_config.model_config.hf_config.model_type != "qwen3_next"
+
+        _IS_EAGLE_MODE = (
+            vllm_config.speculative_config is not None and
+            getattr(vllm_config.speculative_config, 'method', None) in ("eagle", "eagle3")
+        )
+
     if dtype in [torch.float16, torch.bfloat16]:
-        return False
+        return _ENABLE_NZ if _IS_EAGLE_MODE else False
     return _ENABLE_NZ
 
 
```
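Previously `is_enable_nz` returned `False` unconditionally for `float16`/`bfloat16`; with this change, Eagle/Eagle3 runs let the cached `_ENABLE_NZ` flag decide even for floating dtypes. A condensed, stateless sketch of the resulting decision table (the real function caches both flags in module globals, and `env_enable_nz` here folds in the `VLLM_ASCEND_ENABLE_NZ` switch together with the qwen3_next exclusion):

```python
from typing import Optional

import torch


def nz_enabled(dtype: torch.dtype, env_enable_nz: bool,
               spec_method: Optional[str]) -> bool:
    """Condensed decision logic of is_enable_nz after this patch."""
    is_eagle_mode = spec_method in ("eagle", "eagle3")
    if dtype in (torch.float16, torch.bfloat16):
        # Was: always False. Now Eagle mode defers to the env-driven flag.
        return env_enable_nz if is_eagle_mode else False
    return env_enable_nz


assert nz_enabled(torch.bfloat16, True, "eagle3") is True   # enabled now
assert nz_enabled(torch.bfloat16, True, None) is False      # unchanged
assert nz_enabled(torch.int8, True, None) is True           # unchanged
```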