[0.11.0][Bugfix] Fix ngram precision issue and open e2e ngram test (#4092)

### What this PR does / why we need it?
Fix the ngram precision issue and enable the e2e ngram test.
---------

Signed-off-by: Icey <1790571317@qq.com>
Signed-off-by: zhaomingyu <zhaomingyu13@h-partners.com>
Signed-off-by: zhaomingyu13 <zhaomingyu13@h-partners.com>
Co-authored-by: Icey <1790571317@qq.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
This commit is contained in:
zhaomingyu13
2025-11-11 09:58:03 +08:00
committed by GitHub
parent 2069bef449
commit 650ce8ad19
5 changed files with 34 additions and 25 deletions

View File

@@ -1512,7 +1512,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
extra_attn_metadata_args = dict(
num_accepted_tokens=self.num_accepted_tokens.
gpu[:num_reqs],
num_draft_tokens=self.num_draft_tokens.
num_decode_draft_tokens_cpu=self.num_draft_tokens.
gpu[:num_reqs],
)
attn_metadata_i = builder.build(
@@ -1587,11 +1587,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
attn_state = AscendAttentionState.SpecDecoding
# Speculative decoding.
elif np.all(num_valid_tokens == 1):
if self.drafter and (self.drafter.name == SpecDcodeType.EAGLE
or self.drafter.name == SpecDcodeType.EAGLE3):
attn_state = AscendAttentionState.ChunkedPrefill
else:
if self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
attn_state = AscendAttentionState.SpecDecoding
else:
attn_state = AscendAttentionState.ChunkedPrefill
# splitfuse
elif not ascend_config.ascend_scheduler_config.enabled or self.chunked_prefill_enabled:
attn_state = AscendAttentionState.ChunkedPrefill