model: support deepseek-ocr (#11891)

Co-authored-by: yhyang201 <47235274+yhyang201@users.noreply.github.com>
Co-authored-by: yhyang201 <yhyang201@gmail.com>
Co-authored-by: Shi Shuai <126407087+shuaills@users.noreply.github.com>
Co-authored-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
This commit is contained in:
Mick
2025-10-24 03:15:17 +08:00
committed by GitHub
parent 39c237f02c
commit 770529a731
13 changed files with 2125 additions and 117 deletions

View File

@@ -150,6 +150,62 @@ class TestQwen2AudioServer(AudioOpenAITestMixin):
model = "Qwen/Qwen2-Audio-7B-Instruct"
class TestDeepseekOCRServer(TestOpenAIMLLMServerBase):
    """Single-image chat-completion test against the DeepSeek-OCR model.

    Sends one image plus a ``<|grounding|>`` prompt and checks that the reply
    looks like grounded OCR output (mentions "text" and carries a
    ``[[...]]`` coordinate list).
    """

    model = "deepseek-ai/DeepSeek-OCR"
    # NOTE(review): the base-class launch call shown in this change builds
    # ``other_args`` as ``a + b + c if cls.trust_remote_code else []``. The
    # conditional binds looser than ``+``, so a False value here appears to
    # produce an empty list (dropping extra_args and fixed_args as well) --
    # confirm that this is what is intended.
    trust_remote_code = False

    def verify_single_image_response_for_ocr(self, response):
        """Check a chat-completion response for DeepSeek-OCR grounding output.

        Asserts, in order: the assistant role, a string content, the word
        "text" (case-insensitive) in the content, a ``[[...]]`` run made of
        digits, whitespace and commas, and finally the basic response fields
        (id, created, positive token usage counters).
        """
        import re

        message = response.choices[0].message
        assert message.role == "assistant"

        ocr_output = message.content
        assert isinstance(ocr_output, str)

        # DeepSeek-OCR uses grounding format, outputs coordinates
        mentions_text = "text" in ocr_output.lower()
        assert mentions_text, f"OCR text: {ocr_output}, should contain 'text'"

        # Coordinates are expected in the form [[x1, y1, x2, y2]]
        box_match = re.search(r"\[\[[\d\s,]+\]\]", ocr_output)
        assert (
            box_match
        ), f"OCR text: {ocr_output}, should contain coordinate format [[x1, y1, x2, y2]]"

        # Basic response bookkeeping must be populated
        assert response.id
        assert response.created
        usage = response.usage
        for counter in ("prompt_tokens", "completion_tokens", "total_tokens"):
            assert getattr(usage, counter) > 0

    def test_single_image_chat_completion(self):
        """Request grounded markdown conversion of one image and verify it."""
        client = openai.Client(api_key=self.api_key, base_url=self.base_url)

        image_part = {
            "type": "image_url",
            "image_url": {
                "url": "https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/images/ocr-text.png"
            },
        }
        prompt_part = {
            "type": "text",
            "text": "<|grounding|>Convert the document to markdown.",
        }
        conversation = [{"role": "user", "content": [image_part, prompt_part]}]

        completion = client.chat.completions.create(
            model="default",
            messages=conversation,
            temperature=0,
            **self.get_vision_request_kwargs(),
        )

        self.verify_single_image_response_for_ocr(completion)
if __name__ == "__main__":
del (
TestOpenAIMLLMServerBase,

View File

@@ -32,6 +32,7 @@ class TestOpenAIMLLMServerBase(CustomTestCase):
model: str
extra_args: list = []
fixed_args: list = ["--trust-remote-code", "--enable-multimodal"]
trust_remote_code: bool = True
@classmethod
def setUpClass(cls):
@@ -42,7 +43,11 @@ class TestOpenAIMLLMServerBase(CustomTestCase):
cls.base_url,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
api_key=cls.api_key,
other_args=cls.extra_args + cls.fixed_args,
other_args=(
cls.extra_args + cls.fixed_args + ["--trust-remote-code"]
if cls.trust_remote_code
else []
),
)
cls.base_url += "/v1"