[Fix] Delete pooling redundant code (#4940)
### What this PR does / why we need it?
Remove redundant code in #3122.
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
Signed-off-by: lianyibo <lianyibo1@kunlunit.com>
This commit is contained in:
@@ -1173,14 +1173,7 @@ class NPUModelRunner(GPUModelRunner):
     def _build_attn_state(self, num_reqs, num_scheduled_tokens,
                           num_valid_tokens):
         if self.model_config.runner_type == "pooling":
             if isinstance(
                     self.kv_cache_config.kv_cache_groups[0].kv_cache_spec,
                     EncoderOnlyAttentionSpec):
                 attn_state = AscendAttentionState.PrefillNoCache
             else:
                 attn_state = AscendAttentionState.PrefillCacheHit
-        if np.array_equal(self.seq_lens.np[:num_reqs], num_scheduled_tokens):
+        elif np.array_equal(self.seq_lens.np[:num_reqs], num_scheduled_tokens):
             attn_state = AscendAttentionState.PrefillNoCache
         # We assume it is the decode stage, where prefill occurs but only one token is not hit in cache.
         elif np.all(num_scheduled_tokens == 1):
Reference in New Issue
Block a user