[0.11.0][Bugfix] Fix ngram precision issue and open e2e ngram test (#4092)

### What this PR does / why we need it?
Fix the ngram precision issue and enable the e2e ngram test.
---------

Signed-off-by: Icey <1790571317@qq.com>
Signed-off-by: zhaomingyu <zhaomingyu13@h-partners.com>
Signed-off-by: zhaomingyu13 <zhaomingyu13@h-partners.com>
Co-authored-by: Icey <1790571317@qq.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
This commit is contained in:
zhaomingyu13
2025-11-11 09:58:03 +08:00
committed by GitHub
parent 2069bef449
commit 650ce8ad19
5 changed files with 34 additions and 25 deletions

View File

@@ -191,6 +191,14 @@ class AscendAttentionMetadataBuilder:
self.max_num_blocks_per_req = cdiv(
self.model_config.max_model_len,
AscendAttentionBackend.get_supported_block_size()[0])
self.speculative_config = vllm_config.speculative_config
self.decode_threshold = 1
if self.speculative_config:
spec_token_num = self.speculative_config.num_speculative_tokens
self.decode_threshold += spec_token_num
assert self.decode_threshold <= 16, f"decode_threshold exceeded \
npu_fused_infer_attention_score TND layout's limit of 16, \
got {self.decode_threshold}"
def reorder_batch(self, input_batch,
scheduler_output: "SchedulerOutput") -> bool: