[Bugfix][kvcache] revert multiple kv cache groups (#923)

Revert the changes related to multiple KV cache groups, as this feature has been reverted in vLLM: https://github.com/vllm-project/vllm/pull/18459

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-05-22 15:15:33 +08:00
parent b4d6672d01
commit 7aa4f85f10
3 changed files with 34 additions and 30 deletions
--- a/vllm_ascend/attention/mla_v1.py
+++ b/vllm_ascend/attention/mla_v1.py
@@ -16,7 +16,6 @@ from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding

 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
-from vllm_ascend.utils import vllm_version_is
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner

 if TYPE_CHECKING:
@@ -239,12 +238,8 @@ class AscendMLAMetadataBuilder:
        # function. We should avoid GPU -> CPU sync as much as possible because
        # it blocks on all previous kernels.
        device = self.runner.device
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            block_table = (self.runner.input_batch.block_table.
-                           get_device_tensor()[:num_reqs])
-        else:
-            block_table = (self.runner.input_batch.block_table[0].
-                           get_device_tensor()[:num_reqs])
+        block_table = (
+            self.runner.input_batch.block_table.get_device_tensor()[:num_reqs])
        slot_mapping = self.runner.slot_mapping_cpu[:num_actual_tokens].to(
            device, non_blocking=True)
        input_positions = self.runner.positions_cpu[:num_actual_tokens].to(