[Fix] Delete redundant pooling code (#4940)

### What this PR does / why we need it?
Remove the redundant pooling-specific branch in `_build_attn_state` (see #3122).

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

Signed-off-by: lianyibo <lianyibo1@kunlunit.com>
This commit is contained in:
lianyibo
2025-12-20 20:47:30 +08:00
committed by GitHub
parent 21745221a3
commit 58773af708

View File

@@ -1173,14 +1173,7 @@ class NPUModelRunner(GPUModelRunner):
def _build_attn_state(self, num_reqs, num_scheduled_tokens,
num_valid_tokens):
if self.model_config.runner_type == "pooling":
if isinstance(
self.kv_cache_config.kv_cache_groups[0].kv_cache_spec,
EncoderOnlyAttentionSpec):
attn_state = AscendAttentionState.PrefillNoCache
else:
attn_state = AscendAttentionState.PrefillCacheHit
elif np.array_equal(self.seq_lens.np[:num_reqs], num_scheduled_tokens):
if np.array_equal(self.seq_lens.np[:num_reqs], num_scheduled_tokens):
attn_state = AscendAttentionState.PrefillNoCache
# We assume it is the decode stage, where prefill occurs but only one token is not hit in cache.
elif np.all(num_scheduled_tokens == 1):