[ModelRunner] Use shared CachedRequestData cross request to fix ci (#1546)

### What this PR does / why we need it?

This PR (adapted from
2863befce3)
updates the CachedRequestData definition to use a single instance shared
across all requests in a batch, instead of creating a new instance per
request.

Found ci boken by the vllm's model_runner change: `ERROR 07-01 09:53:53
[core.py:521] TypeError: 'CachedRequestData' object is not iterable`,
Modify the model_runner to fix it.


### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
pass ci will verify this.

---------

Signed-off-by: ganyi <pleaplusone.gy@gmail.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: Yikun Jiang <yikunkero@gmail.com>
This commit is contained in:
Pleaplusone
2025-07-02 06:05:21 +08:00
committed by GitHub
parent 6db7dc2c85
commit 0e43813120
4 changed files with 179 additions and 85 deletions

View File

@@ -32,6 +32,8 @@ from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager
from vllm_ascend.utils import vllm_version_is
class AscendScheduler(Scheduler):
"""This Scheduler extends vllm's original v1 scheduler
@@ -364,27 +366,36 @@ class AscendScheduler(Scheduler):
req_to_new_block_ids[req.request_id])
for req in scheduled_new_reqs
]
resumed_reqs_data = [
self._make_cached_request_data(
req,
num_scheduled_tokens[req.request_id],
len(scheduled_spec_decode_tokens.get(req.request_id, ())),
req_to_new_block_ids[req.request_id],
resumed_from_preemption=True,
) for req in scheduled_resumed_reqs
]
running_reqs_data = [
self._make_cached_request_data(
req,
num_scheduled_tokens[req.request_id],
len(scheduled_spec_decode_tokens.get(req.request_id, ())),
req_to_new_block_ids[req.request_id],
resumed_from_preemption=False,
) for req in scheduled_running_reqs
]
if vllm_version_is("0.9.1"):
resumed_reqs_data = [
self._make_cached_request_data(
req,
num_scheduled_tokens[req.request_id],
len(scheduled_spec_decode_tokens.get(req.request_id, ())),
req_to_new_block_ids[req.request_id],
resumed_from_preemption=True,
) for req in scheduled_resumed_reqs
]
running_reqs_data = [
self._make_cached_request_data(
req,
num_scheduled_tokens[req.request_id],
len(scheduled_spec_decode_tokens.get(req.request_id, ())),
req_to_new_block_ids[req.request_id],
resumed_from_preemption=False,
) for req in scheduled_running_reqs
]
scheduled_cached_reqs = resumed_reqs_data + running_reqs_data
else:
cached_reqs_data = self._make_cached_request_data(
scheduled_running_reqs, scheduled_resumed_reqs,
num_scheduled_tokens, scheduled_spec_decode_tokens,
req_to_new_block_ids)
scheduled_cached_reqs = cached_reqs_data
scheduler_output = SchedulerOutput(
scheduled_new_reqs=new_reqs_data,
scheduled_cached_reqs=resumed_reqs_data + running_reqs_data,
scheduled_cached_reqs=scheduled_cached_reqs,
num_scheduled_tokens=num_scheduled_tokens,
total_num_scheduled_tokens=total_num_scheduled_tokens,
scheduled_spec_decode_tokens=scheduled_spec_decode_tokens,