[0.11.0][Bugfix] Fix ngram precision issue and open e2e ngram test (#4092)
### What this PR does / why we need it?

Fix the ngram precision issue and enable the e2e ngram test.

---------

Signed-off-by: Icey <1790571317@qq.com>
Signed-off-by: zhaomingyu <zhaomingyu13@h-partners.com>
Signed-off-by: zhaomingyu13 <zhaomingyu13@h-partners.com>
Co-authored-by: Icey <1790571317@qq.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
This commit is contained in:
@@ -1512,7 +1512,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
extra_attn_metadata_args = dict(
|
||||
num_accepted_tokens=self.num_accepted_tokens.
|
||||
gpu[:num_reqs],
|
||||
num_draft_tokens=self.num_draft_tokens.
|
||||
num_decode_draft_tokens_cpu=self.num_draft_tokens.
|
||||
gpu[:num_reqs],
|
||||
)
|
||||
attn_metadata_i = builder.build(
|
||||
@@ -1587,11 +1587,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
attn_state = AscendAttentionState.SpecDecoding
|
||||
# Speculative decoding.
|
||||
elif np.all(num_valid_tokens == 1):
|
||||
if self.drafter and (self.drafter.name == SpecDcodeType.EAGLE
|
||||
or self.drafter.name == SpecDcodeType.EAGLE3):
|
||||
attn_state = AscendAttentionState.ChunkedPrefill
|
||||
else:
|
||||
if self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
|
||||
attn_state = AscendAttentionState.SpecDecoding
|
||||
else:
|
||||
attn_state = AscendAttentionState.ChunkedPrefill
|
||||
# splitfuse
|
||||
elif not ascend_config.ascend_scheduler_config.enabled or self.chunked_prefill_enabled:
|
||||
attn_state = AscendAttentionState.ChunkedPrefill
|
||||
|
||||
Reference in New Issue
Block a user