Fix some ci issue and refactor modelrunner (#2445)

### What this PR does / why we need it? Fix some ci issue and refactor modelrunner ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? CI passed with existing test. - vLLM version: v0.10.0 - vLLM main: 4d9c61993a --------- Signed-off-by: wangli <wangli858794774@gmail.com> Signed-off-by: MengqingCao <cmq0113@163.com> Signed-off-by: weiguihua2 <weiguihua2@huawei.com> Co-authored-by: wangli <wangli858794774@gmail.com> Co-authored-by: weiguihua2 <weiguihua2@huawei.com>
2025-08-20 09:01:04 +08:00
parent 955411611c
commit 1327f9be1c
28 changed files with 1612 additions and 1020 deletions
--- a/tests/ut/kv_connector/test_remote_prefill_lifecycle.py
+++ b/tests/ut/kv_connector/test_remote_prefill_lifecycle.py
@@ -42,7 +42,8 @@ def test_basic_lifecycle():

    request = create_request(request_id=1,
                             num_tokens=NUM_TOKENS,
-                             do_remote_prefill=True)
+                             do_remote_prefill=True,
+                             block_size=BLOCK_SIZE)

    scheduler.add_request(request)
    request_id = request.request_id
--- a/tests/ut/kv_connector/utils.py
+++ b/tests/ut/kv_connector/utils.py
@@ -10,6 +10,8 @@ import torch
 from vllm import SamplingParams
 from vllm.config import (CacheConfig, DeviceConfig, KVTransferConfig,
                         ModelConfig, SchedulerConfig, VllmConfig)
+from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
+                                         init_none_hash)
 from vllm.v1.core.sched.scheduler import Scheduler
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                        KVCacheGroupSpec)
@@ -39,7 +41,6 @@ def assert_scheduler_empty(scheduler: Scheduler):
    # KVCache Manager.
    assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0].
               req_to_blocks) == 0
-    assert len(scheduler.kv_cache_manager.req_to_block_hashes) == 0
    assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0].
               num_cached_block) == 0
    num_free_blocks = (
@@ -118,6 +119,9 @@ def create_scheduler(
    )


+_none_hash_initialized = False
+
+
 def create_request(
    request_id: int,
    num_tokens: int = 10,
@@ -126,8 +130,15 @@ def create_request(
    do_remote_prefill: bool = False,
    use_all_1s_for_prompt_tokens: bool = False,
    num_remote_blocks: int = 3,
+    block_size: int = 16,
 ) -> Request:
    """Make dummy request for testing."""
+    global _none_hash_initialized
+    if not _none_hash_initialized:
+        init_none_hash(hash)
+        _none_hash_initialized = True
+
+    block_hasher = get_request_block_hasher(block_size, hash)

    kv_transfer_params: Optional[dict[str, Any]] = None

@@ -164,6 +175,7 @@ def create_request(
            "pooling_params": []
        } if not vllm_version_is("0.9.1") else {}),
        eos_token_id=EOS_TOKEN_ID,
+        block_hasher=block_hasher,
    )
    req.kv_transfer_params = kv_transfer_params
    return req
@@ -196,7 +208,6 @@ def create_model_runner_output(
        req_ids=req_ids,
        req_id_to_index=req_id_to_index,
        sampled_token_ids=sampled_token_ids,
-        spec_token_ids=None,
        logprobs=None,
        prompt_logprobs_dict={},
        pooler_output=[],