refactor: multimodal data (#4754)

This commit is contained in:
Mick
2025-04-01 00:57:51 +08:00
committed by GitHub
parent c7457191a0
commit 5cb552b1d4
36 changed files with 989 additions and 1138 deletions

View File

@@ -897,6 +897,7 @@ def v1_chat_generate_request(
request_ids: List[str] = None,
):
input_ids = []
prompts = []
sampling_params_list = []
image_data_list = []
audio_data_list = []
@@ -916,6 +917,7 @@ def v1_chat_generate_request(
# - audio_data: None or a list of audio strings (URLs).
# None skips any image processing in GenerateReqInput.
strict_tag = None
prompt = ""
if not isinstance(request.messages, str):
# Apply chat template and its stop strings.
tools = None
@@ -1005,11 +1007,13 @@ def v1_chat_generate_request(
image_data = None
audio_data = None
modalities = []
prompt = request.messages
input_ids.append(prompt_ids)
return_logprobs.append(request.logprobs)
logprob_start_lens.append(-1)
top_logprobs_nums.append(request.top_logprobs or 0)
lora_paths.append(request.lora_path)
prompts.append(prompt)
sampling_params = {
"temperature": request.temperature,
@@ -1063,10 +1067,14 @@ def v1_chat_generate_request(
audio_data_list.append(audio_data)
modalities_list.append(modalities)
if len(all_requests) == 1:
if isinstance(input_ids[0], str):
prompt_kwargs = {"text": input_ids[0]}
if tokenizer_manager.model_config.is_multimodal:
# Multimodal model: the processor needs the text prompt, so pass text rather than token IDs.
prompt_kwargs = {"text": prompts[0]}
else:
prompt_kwargs = {"input_ids": input_ids[0]}
if isinstance(input_ids[0], str):
prompt_kwargs = {"text": input_ids[0]}
else:
prompt_kwargs = {"input_ids": input_ids[0]}
sampling_params_list = sampling_params_list[0]
image_data_list = image_data_list[0]
audio_data_list = audio_data_list[0]
@@ -1076,10 +1084,14 @@ def v1_chat_generate_request(
modalities_list = modalities_list[0]
lora_paths = lora_paths[0]
else:
if isinstance(input_ids[0], str):
prompt_kwargs = {"text": input_ids}
if tokenizer_manager.model_config.is_multimodal:
# Multimodal model: the processor needs the text prompts, so pass text rather than token IDs.
prompt_kwargs = {"text": prompts}
else:
prompt_kwargs = {"input_ids": input_ids}
if isinstance(input_ids[0], str):
prompt_kwargs = {"text": input_ids}
else:
prompt_kwargs = {"input_ids": input_ids}
adapted_request = GenerateReqInput(
**prompt_kwargs,