[0.11.0][Bugfix] Fix ngram precision issue and open e2e ngram test (#4092)

### What this PR does / why we need it?

Fix the ngram precision issue and enable the e2e ngram test.

---------

Signed-off-by: Icey <1790571317@qq.com>
Signed-off-by: zhaomingyu <zhaomingyu13@h-partners.com>
Signed-off-by: zhaomingyu13 <zhaomingyu13@h-partners.com>
Co-authored-by: Icey <1790571317@qq.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
2025-11-11 09:58:03 +08:00
parent 2069bef449
commit 650ce8ad19
5 changed files with 34 additions and 25 deletions
--- a/vllm_ascend/attention/attention_v1.py
+++ b/vllm_ascend/attention/attention_v1.py
@@ -191,6 +191,14 @@ class AscendAttentionMetadataBuilder:
        self.max_num_blocks_per_req = cdiv(
            self.model_config.max_model_len,
            AscendAttentionBackend.get_supported_block_size()[0])
+        self.speculative_config = vllm_config.speculative_config
+        self.decode_threshold = 1
+        if self.speculative_config:
+            spec_token_num = self.speculative_config.num_speculative_tokens
+            self.decode_threshold += spec_token_num
+            assert self.decode_threshold <= 16, f"decode_threshold exceeded \
+                npu_fused_infer_attention_score TND layout's limit of 16, \
+                got {self.decode_threshold}"

    def reorder_batch(self, input_batch,
                      scheduler_output: "SchedulerOutput") -> bool: