[Feat] Add modalities for vision server when handling pixel values for llava (#1346)

This commit is contained in:
Kaichen Zhang - NTU
2024-09-09 17:07:34 +08:00
committed by GitHub
parent 8e6bdf851c
commit 662ecd9368
11 changed files with 40 additions and 2 deletions

View File

@@ -188,6 +188,7 @@ class TokenizerManager:
pixel_values, image_hashes, image_sizes = await self._get_pixel_values(
obj.image_data if not_use_index else obj.image_data[index]
)
modalities = obj.modalities
return_logprob = (
obj.return_logprob if not_use_index else obj.return_logprob[index]
)
@@ -243,6 +244,7 @@ class TokenizerManager:
pixel_values, image_hashes, image_sizes = await self._get_pixel_values(
obj.image_data[0]
)
modalities = obj.modalities
return_logprob = obj.return_logprob[0]
logprob_start_len = obj.logprob_start_len[0]
top_logprobs_num = obj.top_logprobs_num[0]
@@ -263,6 +265,7 @@ class TokenizerManager:
logprob_start_len,
top_logprobs_num,
obj.stream,
modalities,
)
else: # is embedding
tokenized_obj = TokenizedEmbeddingReqInput(
@@ -346,6 +349,7 @@ class TokenizerManager:
pixel_values, image_hashes, image_sizes = (
await self._get_pixel_values(obj.image_data[index])
)
modalities = obj.modalities
tokenized_obj = TokenizedGenerateReqInput(
rid,
@@ -359,6 +363,7 @@ class TokenizerManager:
obj.logprob_start_len[index],
obj.top_logprobs_num[index],
obj.stream,
modalities,
)
else:
tokenized_obj = TokenizedEmbeddingReqInput(