[Fix] Remove redundant pooling code (#4940)
### What this PR does / why we need it?
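Remove the redundant code from #3122: the pooling-specific (`runner_type == "pooling"`) branch in `NPUModelRunner._build_attn_state`.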
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
Signed-off-by: lianyibo <lianyibo1@kunlunit.com>
This commit is contained in:
@@ -1173,14 +1173,7 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
|
|
||||||
def _build_attn_state(self, num_reqs, num_scheduled_tokens,
|
def _build_attn_state(self, num_reqs, num_scheduled_tokens,
|
||||||
num_valid_tokens):
|
num_valid_tokens):
|
||||||
if self.model_config.runner_type == "pooling":
|
if np.array_equal(self.seq_lens.np[:num_reqs], num_scheduled_tokens):
|
||||||
if isinstance(
|
|
||||||
self.kv_cache_config.kv_cache_groups[0].kv_cache_spec,
|
|
||||||
EncoderOnlyAttentionSpec):
|
|
||||||
attn_state = AscendAttentionState.PrefillNoCache
|
|
||||||
else:
|
|
||||||
attn_state = AscendAttentionState.PrefillCacheHit
|
|
||||||
elif np.array_equal(self.seq_lens.np[:num_reqs], num_scheduled_tokens):
|
|
||||||
attn_state = AscendAttentionState.PrefillNoCache
|
attn_state = AscendAttentionState.PrefillNoCache
|
||||||
# We assume it is the decode stage, where prefill occurs but only one token is not hit in cache.
|
# We assume it is the decode stage, where prefill occurs but only one token is not hit in cache.
|
||||||
elif np.all(num_scheduled_tokens == 1):
|
elif np.all(num_scheduled_tokens == 1):
|
||||||
|
|||||||
Reference in New Issue
Block a user