model: support deepseek-ocr (#11891)
Co-authored-by: yhyang201 <47235274+yhyang201@users.noreply.github.com> Co-authored-by: yhyang201 <yhyang201@gmail.com> Co-authored-by: Shi Shuai <126407087+shuaills@users.noreply.github.com> Co-authored-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
This commit is contained in:
@@ -150,6 +150,62 @@ class TestQwen2AudioServer(AudioOpenAITestMixin):
|
||||
model = "Qwen/Qwen2-Audio-7B-Instruct"
|
||||
|
||||
|
||||
class TestDeepseekOCRServer(TestOpenAIMLLMServerBase):
    """Chat-completion test for DeepSeek-OCR served through the OpenAI API.

    Sends one image plus a grounding prompt and checks that the reply
    carries both text and bounding-box style coordinates.
    """

    model = "deepseek-ai/DeepSeek-OCR"
    # Overrides the base-class flag so the server is launched without it.
    trust_remote_code = False

    def verify_single_image_response_for_ocr(self, response):
        """Check a DeepSeek-OCR grounding reply.

        Asserts that the first choice is an assistant message whose string
        content mentions "text" (case-insensitive) and contains a
        ``[[...]]`` coordinate group, and that the response id, creation
        time and token usage counters are all populated.
        """
        import re

        message = response.choices[0].message
        assert message.role == "assistant"

        ocr_output = message.content
        assert isinstance(ocr_output, str)

        # DeepSeek-OCR uses grounding format, outputs coordinates
        assert (
            "text" in ocr_output.lower()
        ), f"OCR text: {ocr_output}, should contain 'text'"

        # Coordinates are expected as [[x1, y1, x2, y2]]: digits, whitespace
        # and commas wrapped in double square brackets.
        has_coordinates = re.search(r"\[\[[\d\s,]+\]\]", ocr_output) is not None
        assert (
            has_coordinates
        ), f"OCR text: {ocr_output}, should contain coordinate format [[x1, y1, x2, y2]]"

        # Basic response bookkeeping must be filled in.
        assert response.id
        assert response.created

        usage = response.usage
        for counter in ("prompt_tokens", "completion_tokens", "total_tokens"):
            assert getattr(usage, counter) > 0

    def test_single_image_chat_completion(self):
        # One user turn: the OCR sample image followed by the grounding prompt.
        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
        image_url = "https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/images/ocr-text.png"

        image_part = {"type": "image_url", "image_url": {"url": image_url}}
        prompt_part = {
            "type": "text",
            "text": "<|grounding|>Convert the document to markdown.",
        }
        conversation = [{"role": "user", "content": [image_part, prompt_part]}]

        response = client.chat.completions.create(
            model="default",
            messages=conversation,
            temperature=0,
            **(self.get_vision_request_kwargs()),
        )

        self.verify_single_image_response_for_ocr(response)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
del (
|
||||
TestOpenAIMLLMServerBase,
|
||||
|
||||
@@ -32,6 +32,7 @@ class TestOpenAIMLLMServerBase(CustomTestCase):
|
||||
model: str
|
||||
extra_args: list = []
|
||||
fixed_args: list = ["--trust-remote-code", "--enable-multimodal"]
|
||||
trust_remote_code: bool = True
|
||||
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
@@ -42,7 +43,11 @@ class TestOpenAIMLLMServerBase(CustomTestCase):
|
||||
cls.base_url,
|
||||
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
api_key=cls.api_key,
|
||||
other_args=cls.extra_args + cls.fixed_args,
|
||||
other_args=(
|
||||
cls.extra_args + cls.fixed_args + ["--trust-remote-code"]
|
||||
if cls.trust_remote_code
|
||||
else []
|
||||
),
|
||||
)
|
||||
cls.base_url += "/v1"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user