upgrade to 0.18.0 (#7502)

### What this PR does / why we need it?

1. Upgrade to 0.18.0.
2. Ensure `kernel_block_sizes` is an int for the Eagle drafter.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.17.0
- vLLM main: 8b6325758c

---------

Signed-off-by: Meihan-chen <jcccx.cmh@gmail.com>
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Co-authored-by: hfadzxy <starmoon_zhang@163.com>
2026-03-21 16:05:38 +08:00
parent 80a4265717
commit bff4fbfca5
16 changed files with 139 additions and 258 deletions
--- a/vllm_ascend/_310p/model_runner_310p.py
+++ b/vllm_ascend/_310p/model_runner_310p.py
@@ -37,7 +37,7 @@ from vllm.v1.kv_cache_interface import (
 )

 from vllm_ascend.attention.attention_v1 import AscendAttentionState
-from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, vllm_version_is
+from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
 from vllm_ascend.worker.npu_input_batch import NPUInputBatch

@@ -451,18 +451,11 @@ class NPUModelRunner310(NPUModelRunner):
                self.kernel_block_sizes.append([0])

        if block_sizes != [self.cache_config.block_size] or self.kernel_block_sizes != [[self.cache_config.block_size]]:
-            if vllm_version_is("0.17.0"):
-                assert self.cache_config.cpu_offload_gb == 0, (
-                    "Cannot re-initialize the input batch when CPU weight "
-                    "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 "  # noqa: E501
-                    "for more details."
-                )
-            else:
-                assert self.offload_config.uva.cpu_offload_gb == 0, (
-                    "Cannot re-initialize the input batch when CPU weight "
-                    "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 "  # noqa: E501
-                    "for more details."
-                )
+            assert self.offload_config.uva.cpu_offload_gb == 0, (
+                "Cannot re-initialize the input batch when CPU weight "
+                "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 "  # noqa: E501
+                "for more details."
+            )
            self.input_batch = NPUInputBatch(
                max_num_reqs=self.max_num_reqs,
                max_model_len=max(self.model_config.max_model_len, self.max_encoder_len),