vlm: enable radix cache for qwen-vl models (#5349)

Co-authored-by: Xinyuan Tong <justinning0323@outlook.com>
This commit is contained in:
Mick
2025-04-24 12:35:05 +09:00
committed by GitHub
parent 7d0edf3cae
commit c998d04b46
26 changed files with 429 additions and 331 deletions

View File

@@ -909,6 +909,7 @@ def v1_chat_generate_request(
# NOTE: with the OpenAI API, the prompt's logprobs are never computed
is_multimodal = tokenizer_manager.model_config.is_multimodal
for request in all_requests:
# Prep the data needed for the underlying GenerateReqInput:
# - prompt: The full prompt string.
@@ -918,6 +919,7 @@ def v1_chat_generate_request(
# None skips any image processing in GenerateReqInput.
strict_tag = None
prompt = ""
prompt_ids = []
if not isinstance(request.messages, str):
# Apply chat template and its stop strings.
tools = None
@@ -1019,7 +1021,7 @@ def v1_chat_generate_request(
):
encoded = encoded[1:]
prompt_ids += encoded
if tokenizer_manager.model_config.is_multimodal:
if is_multimodal:
prompt = tokenizer_manager.tokenizer.decode(prompt_ids)
stop = request.stop
image_data = None
@@ -1064,8 +1066,9 @@ def v1_chat_generate_request(
stop.append(request.stop)
else:
stop.extend(request.stop)
prompt_ids = tokenizer_manager.tokenizer.encode(prompt)
if not is_multimodal:
prompt_ids = tokenizer_manager.tokenizer.encode(prompt)
else:
# Use the raw prompt and stop strings if messages is already a string.
prompt_ids = request.messages
@@ -1135,7 +1138,7 @@ def v1_chat_generate_request(
audio_data_list.append(audio_data)
modalities_list.append(modalities)
if len(all_requests) == 1:
if tokenizer_manager.model_config.is_multimodal:
if is_multimodal:
# processor will need text input
prompt_kwargs = {"text": prompts[0]}
else: