[0.11.0][Bugfix] Fix ngram precision issue and open e2e ngram test (#4092)
### What this PR does / why we need it? Fix ngram precision issue and open e2e ngram test --------- Signed-off-by: Icey <1790571317@qq.com> Signed-off-by: zhaomingyu <zhaomingyu13@h-partners.com> Signed-off-by: zhaomingyu13 <zhaomingyu13@h-partners.com> Co-authored-by: Icey <1790571317@qq.com> Co-authored-by: Mengqing Cao <cmq0113@163.com>
This commit is contained in:
@@ -191,6 +191,14 @@ class AscendAttentionMetadataBuilder:
|
||||
self.max_num_blocks_per_req = cdiv(
|
||||
self.model_config.max_model_len,
|
||||
AscendAttentionBackend.get_supported_block_size()[0])
|
||||
self.speculative_config = vllm_config.speculative_config
|
||||
self.decode_threshold = 1
|
||||
if self.speculative_config:
|
||||
spec_token_num = self.speculative_config.num_speculative_tokens
|
||||
self.decode_threshold += spec_token_num
|
||||
assert self.decode_threshold <= 16, f"decode_threshold exceeded \
|
||||
npu_fused_infer_attention_score TND layout's limit of 16, \
|
||||
got {self.decode_threshold}"
|
||||
|
||||
def reorder_batch(self, input_batch,
|
||||
scheduler_output: "SchedulerOutput") -> bool:
|
||||
|
||||
Reference in New Issue
Block a user