[Fix] Delete pooling redundant code (#4940)

### What this PR does / why we need it?

Remove redundant code in #3122.

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

Signed-off-by: lianyibo <lianyibo1@kunlunit.com>
2025-12-20 20:47:30 +08:00
parent 21745221a3
commit 58773af708
1 changed files with 1 additions and 8 deletions
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -1173,14 +1173,7 @@ class NPUModelRunner(GPUModelRunner):
    def _build_attn_state(self, num_reqs, num_scheduled_tokens,
                          num_valid_tokens):
-        if self.model_config.runner_type == "pooling":
+        if np.array_equal(self.seq_lens.np[:num_reqs], num_scheduled_tokens):
-            if isinstance(
-                    self.kv_cache_config.kv_cache_groups[0].kv_cache_spec,
-                    EncoderOnlyAttentionSpec):
-                attn_state = AscendAttentionState.PrefillNoCache
-            else:
-                attn_state = AscendAttentionState.PrefillCacheHit
-        elif np.array_equal(self.seq_lens.np[:num_reqs], num_scheduled_tokens):
            attn_state = AscendAttentionState.PrefillNoCache
        # We assume it is the decode stage, where prefill occurs but only one token is not hit in cache.
        elif np.all(num_scheduled_tokens == 1):