2024-08-24 05:11:16 +08:00
import base64
import io
import os
2024-08-04 20:51:55 -07:00
2024-08-24 05:11:16 +08:00
import numpy as np
2024-08-04 20:51:55 -07:00
import openai
2024-08-24 05:11:16 +08:00
import requests
from PIL import Image
2024-08-04 20:51:55 -07:00
2024-11-28 00:22:39 -08:00
from sglang . srt . utils import kill_process_tree
2025-08-17 13:25:02 +08:00
from sglang . test . test_utils import DEFAULT_URL_FOR_TEST , CustomTestCase
2024-08-04 20:51:55 -07:00
2025-03-23 13:48:49 +08:00
# image
IMAGE_MAN_IRONING_URL = " https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/images/man_ironing_on_back_of_suv.png "
IMAGE_SGL_LOGO_URL = " https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/images/sgl_logo.png "
# video
VIDEO_JOBS_URL = " https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/videos/jobs_presenting_ipod.mp4 "
# audio
AUDIO_TRUMP_SPEECH_URL = " https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/audios/Trump_WEF_2018_10s.mp3 "
AUDIO_BIRD_SONG_URL = " https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/audios/bird_song.mp3 "
2024-08-04 20:51:55 -07:00
2025-08-17 13:25:02 +08:00
class TestOpenAIOmniServerBase ( CustomTestCase ) :
2024-08-04 20:51:55 -07:00
@classmethod
def setUpClass ( cls ) :
2025-08-17 13:25:02 +08:00
cls . model = " "
2024-08-25 19:02:08 -07:00
cls . base_url = DEFAULT_URL_FOR_TEST
2024-08-04 20:51:55 -07:00
cls . api_key = " sk-123456 "
2025-08-17 13:25:02 +08:00
cls . process = None
2024-08-04 20:51:55 -07:00
cls . base_url + = " /v1 "
@classmethod
def tearDownClass ( cls ) :
2024-11-28 00:22:39 -08:00
kill_process_tree ( cls . process . pid )
2024-08-04 20:51:55 -07:00
2025-07-18 21:03:53 -07:00
def get_vision_request_kwargs ( self ) :
return self . get_request_kwargs ( )
2025-06-04 22:08:30 -07:00
def get_request_kwargs ( self ) :
return { }
2025-08-17 13:25:02 +08:00
def get_or_download_file ( self , url : str ) - > str :
cache_dir = os . path . expanduser ( " ~/.cache " )
if url is None :
raise ValueError ( )
file_name = url . split ( " / " ) [ - 1 ]
file_path = os . path . join ( cache_dir , file_name )
os . makedirs ( cache_dir , exist_ok = True )
if not os . path . exists ( file_path ) :
response = requests . get ( url )
response . raise_for_status ( )
with open ( file_path , " wb " ) as f :
f . write ( response . content )
return file_path
class AudioOpenAITestMixin ( TestOpenAIOmniServerBase ) :
def prepare_audio_messages ( self , prompt , audio_file_name ) :
messages = [
{
" role " : " user " ,
" content " : [
{
" type " : " audio_url " ,
" audio_url " : { " url " : f " { audio_file_name } " } ,
} ,
{
" type " : " text " ,
" text " : prompt ,
} ,
] ,
}
]
return messages
def get_audio_request_kwargs ( self ) :
return self . get_request_kwargs ( )
def get_audio_response ( self , url : str , prompt , category ) :
audio_file_path = self . get_or_download_file ( url )
client = openai . Client ( api_key = " sk-123456 " , base_url = self . base_url )
messages = self . prepare_audio_messages ( prompt , audio_file_path )
response = client . chat . completions . create (
model = " default " ,
messages = messages ,
temperature = 0 ,
max_tokens = 128 ,
stream = False ,
* * ( self . get_audio_request_kwargs ( ) ) ,
)
audio_response = response . choices [ 0 ] . message . content
print ( " - " * 30 )
print ( f " audio { category } response: \n { audio_response } " )
print ( " - " * 30 )
audio_response = audio_response . lower ( )
self . assertIsNotNone ( audio_response )
self . assertGreater ( len ( audio_response ) , 0 )
return audio_response . lower ( )
def test_audio_speech_completion ( self ) :
# a fragment of Trump's speech
audio_response = self . get_audio_response (
AUDIO_TRUMP_SPEECH_URL ,
" Listen to this audio and write down the audio transcription in English. " ,
category = " speech " ,
)
check_list = [
" thank you " ,
" it ' s a privilege to be here " ,
" leader " ,
" science " ,
" art " ,
]
for check_word in check_list :
assert (
check_word in audio_response
) , f " audio_response: | { audio_response } | should contain | { check_word } | "
def test_audio_ambient_completion ( self ) :
# bird song
audio_response = self . get_audio_response (
AUDIO_BIRD_SONG_URL ,
" Please listen to the audio snippet carefully and transcribe the content in English. " ,
" ambient " ,
)
assert " bird " in audio_response
class ImageOpenAITestMixin ( TestOpenAIOmniServerBase ) :
2025-03-12 03:35:35 +08:00
def test_single_image_chat_completion ( self ) :
2024-08-04 20:51:55 -07:00
client = openai . Client ( api_key = self . api_key , base_url = self . base_url )
response = client . chat . completions . create (
model = " default " ,
messages = [
{
" role " : " user " ,
" content " : [
{
" type " : " image_url " ,
2025-03-23 13:48:49 +08:00
" image_url " : { " url " : IMAGE_MAN_IRONING_URL } ,
2024-08-04 20:51:55 -07:00
} ,
2024-08-04 22:52:41 -07:00
{
" type " : " text " ,
2025-07-31 21:49:45 -07:00
" text " : " Describe this image in a sentence. " ,
2024-08-04 22:52:41 -07:00
} ,
2024-08-04 20:51:55 -07:00
] ,
} ,
] ,
temperature = 0 ,
2025-07-18 21:03:53 -07:00
* * ( self . get_vision_request_kwargs ( ) ) ,
2024-08-04 20:51:55 -07:00
)
assert response . choices [ 0 ] . message . role == " assistant "
2024-08-04 22:52:41 -07:00
text = response . choices [ 0 ] . message . content
assert isinstance ( text , str )
2025-03-17 08:37:32 +08:00
# `driver` is for gemma-3-it
2025-04-30 12:31:19 +08:00
assert (
" man " in text or " person " or " driver " in text
) , f " text: { text } , should contain man, person or driver "
assert (
" cab " in text
or " taxi " in text
or " SUV " in text
or " vehicle " in text
or " car " in text
) , f " text: { text } , should contain cab, taxi, SUV, vehicle or car "
2025-03-25 11:08:40 +08:00
# MiniCPMO fails to recognize `iron`, but `hanging`
2025-04-30 12:31:19 +08:00
assert (
2025-08-09 00:59:13 -07:00
" iron " in text
or " hang " in text
or " cloth " in text
or " coat " in text
or " holding " in text
or " outfit " in text
) , f " text: { text } , should contain iron, hang, cloth, coat or holding or outfit "
2024-08-30 11:51:44 -07:00
assert response . id
assert response . created
assert response . usage . prompt_tokens > 0
assert response . usage . completion_tokens > 0
assert response . usage . total_tokens > 0
def test_multi_turn_chat_completion ( self ) :
client = openai . Client ( api_key = self . api_key , base_url = self . base_url )
response = client . chat . completions . create (
model = " default " ,
messages = [
{
" role " : " user " ,
" content " : [
{
" type " : " image_url " ,
2025-03-23 13:48:49 +08:00
" image_url " : { " url " : IMAGE_MAN_IRONING_URL } ,
2024-08-30 11:51:44 -07:00
} ,
{
" type " : " text " ,
2025-07-31 21:49:45 -07:00
" text " : " Describe this image in a sentence. " ,
2024-08-30 11:51:44 -07:00
} ,
] ,
} ,
{
" role " : " assistant " ,
" content " : [
{
" type " : " text " ,
" text " : " There is a man at the back of a yellow cab ironing his clothes. " ,
}
] ,
} ,
{
" role " : " user " ,
" content " : [
{ " type " : " text " , " text " : " Repeat your previous answer. " }
] ,
} ,
] ,
temperature = 0 ,
2025-07-18 21:03:53 -07:00
* * ( self . get_vision_request_kwargs ( ) ) ,
2024-08-30 11:51:44 -07:00
)
assert response . choices [ 0 ] . message . role == " assistant "
text = response . choices [ 0 ] . message . content
assert isinstance ( text , str )
2025-04-30 12:31:19 +08:00
assert (
" man " in text or " cab " in text
) , f " text: { text } , should contain man or cab "
2024-08-04 20:51:55 -07:00
assert response . id
assert response . created
assert response . usage . prompt_tokens > 0
assert response . usage . completion_tokens > 0
2024-08-26 01:28:23 +08:00
assert response . usage . total_tokens > 0
2024-11-01 17:47:44 -07:00
def test_multi_images_chat_completion ( self ) :
2024-08-26 01:28:23 +08:00
client = openai . Client ( api_key = self . api_key , base_url = self . base_url )
response = client . chat . completions . create (
model = " default " ,
messages = [
{
" role " : " user " ,
" content " : [
{
" type " : " image_url " ,
2025-04-01 00:57:51 +08:00
" image_url " : { " url " : IMAGE_MAN_IRONING_URL } ,
2024-09-09 17:07:34 +08:00
" modalities " : " multi-images " ,
2024-08-26 01:28:23 +08:00
} ,
{
" type " : " image_url " ,
2025-03-23 13:48:49 +08:00
" image_url " : { " url " : IMAGE_SGL_LOGO_URL } ,
2024-09-09 17:07:34 +08:00
" modalities " : " multi-images " ,
2024-08-26 01:28:23 +08:00
} ,
{
" type " : " text " ,
2024-08-25 19:02:08 -07:00
" text " : " I have two very different images. They are not related at all. "
" Please describe the first image in one sentence, and then describe the second image in another sentence. " ,
2024-08-26 01:28:23 +08:00
} ,
] ,
} ,
] ,
temperature = 0 ,
2025-07-18 21:03:53 -07:00
* * ( self . get_vision_request_kwargs ( ) ) ,
2024-08-26 01:28:23 +08:00
)
assert response . choices [ 0 ] . message . role == " assistant "
text = response . choices [ 0 ] . message . content
assert isinstance ( text , str )
2025-03-25 11:08:40 +08:00
print ( " - " * 30 )
print ( f " Multi images response: \n { text } " )
print ( " - " * 30 )
2025-04-30 12:31:19 +08:00
assert (
2025-08-09 00:59:13 -07:00
" man " in text
or " cab " in text
or " SUV " in text
or " taxi " in text
or " car " in text
) , f " text: { text } , should contain man, cab, SUV, taxi or car "
2025-04-30 12:31:19 +08:00
assert (
2025-08-09 00:59:13 -07:00
" logo " in text or ' " S " ' in text or " SG " in text or " graphic " in text
) , f " text: { text } , should contain logo, S or SG or graphic "
2024-08-26 01:28:23 +08:00
assert response . id
assert response . created
assert response . usage . prompt_tokens > 0
assert response . usage . completion_tokens > 0
2025-08-16 19:55:56 -07:00
assert response . usage . total_tokens > 0
def _test_mixed_image_audio_chat_completion ( self ) :
client = openai . Client ( api_key = self . api_key , base_url = self . base_url )
response = client . chat . completions . create (
model = " default " ,
messages = [
{
" role " : " user " ,
" content " : [
{
" type " : " image_url " ,
" image_url " : { " url " : IMAGE_MAN_IRONING_URL } ,
} ,
{
" type " : " audio_url " ,
" audio_url " : { " url " : AUDIO_TRUMP_SPEECH_URL } ,
} ,
{
" type " : " text " ,
" text " : " Please describe the image in one sentence, and then write down the audio transcription in English. " ,
} ,
] ,
} ,
] ,
temperature = 0 ,
* * ( self . get_vision_request_kwargs ( ) ) ,
)
assert response . choices [ 0 ] . message . role == " assistant "
text = response . choices [ 0 ] . message . content
assert isinstance ( text , str )
print ( " - " * 30 )
print ( f " Mixed image & audio response: \n { text } " )
print ( " - " * 30 )
assert (
" man " in text
or " cab " in text
or " SUV " in text
or " taxi " in text
or " car " in text
) , f " text: { text } , should contain man, cab, SUV, taxi or car "
check_list = [
" thank you " ,
" it ' s a privilege to be here " ,
" leader " ,
" science " ,
" art " ,
]
for check_word in check_list :
assert (
check_word in text
) , f " text: | { text } | should contain | { check_word } | "
assert response . id
assert response . created
assert response . usage . prompt_tokens > 0
assert response . usage . completion_tokens > 0
2024-08-04 20:51:55 -07:00
assert response . usage . total_tokens > 0
2025-07-10 14:48:35 +08:00
def prepare_video_images_messages ( self , video_path ) :
2025-01-28 16:22:13 +08:00
# the memory consumed by the Vision Attention varies a lot, e.g. blocked qkv vs full-sequence sdpa
# the size of the video embeds differs from the `modality` argument when preprocessed
2025-02-24 16:17:38 -08:00
# We import decord here to avoid a strange Segmentation fault (core dumped) issue.
# The following import order will cause Segmentation fault.
# import decord
# from transformers import AutoTokenizer
from decord import VideoReader , cpu
2025-07-10 14:48:35 +08:00
max_frames_num = 10
2024-08-24 05:11:16 +08:00
vr = VideoReader ( video_path , ctx = cpu ( 0 ) )
total_frame_num = len ( vr )
uniform_sampled_frames = np . linspace (
0 , total_frame_num - 1 , max_frames_num , dtype = int
)
frame_idx = uniform_sampled_frames . tolist ( )
frames = vr . get_batch ( frame_idx ) . asnumpy ( )
base64_frames = [ ]
for frame in frames :
pil_img = Image . fromarray ( frame )
buff = io . BytesIO ( )
pil_img . save ( buff , format = " JPEG " )
base64_str = base64 . b64encode ( buff . getvalue ( ) ) . decode ( " utf-8 " )
base64_frames . append ( base64_str )
messages = [ { " role " : " user " , " content " : [ ] } ]
frame_format = {
" type " : " image_url " ,
" image_url " : { " url " : " data:image/jpeg;base64, {} " } ,
2025-07-10 14:48:35 +08:00
" modalities " : " image " ,
2024-08-24 05:11:16 +08:00
}
for base64_frame in base64_frames :
frame_format [ " image_url " ] [ " url " ] = " data:image/jpeg;base64, {} " . format (
base64_frame
)
messages [ 0 ] [ " content " ] . append ( frame_format . copy ( ) )
prompt = { " type " : " text " , " text " : " Please describe the video in detail. " }
messages [ 0 ] [ " content " ] . append ( prompt )
return messages
2025-07-10 14:48:35 +08:00
def test_video_images_chat_completion ( self ) :
url = VIDEO_JOBS_URL
file_path = self . get_or_download_file ( url )
client = openai . Client ( api_key = self . api_key , base_url = self . base_url )
messages = self . prepare_video_images_messages ( file_path )
response = client . chat . completions . create (
model = " default " ,
messages = messages ,
temperature = 0 ,
max_tokens = 1024 ,
stream = False ,
)
video_response = response . choices [ 0 ] . message . content
print ( " - " * 30 )
print ( f " Video images response: \n { video_response } " )
print ( " - " * 30 )
# Add assertions to validate the video response
assert (
" iPod " in video_response
or " device " in video_response
or " microphone " in video_response
2025-07-20 21:43:09 -07:00
) , f """
== == == == == == == == == == == video_response == == == == == == == == == == =
{ video_response }
== == == == == == == == == == == == == == == == == == == == == == == == == == == == == =
should contain ' iPod ' or ' device ' or ' microphone '
"""
2025-07-10 14:48:35 +08:00
assert (
" man " in video_response
or " person " in video_response
or " individual " in video_response
or " speaker " in video_response
2025-08-13 02:08:30 -07:00
or " presenter " in video_response
2025-07-20 21:43:09 -07:00
or " Steve " in video_response
2025-08-09 00:59:13 -07:00
or " hand " in video_response
2025-07-20 21:43:09 -07:00
) , f """
== == == == == == == == == == == video_response == == == == == == == == == == =
{ video_response }
== == == == == == == == == == == == == == == == == == == == == == == == == == == == == =
2025-08-13 02:08:30 -07:00
should contain ' man ' or ' person ' or ' individual ' or ' speaker ' or ' presenter ' or ' Steve ' or ' hand '
2025-07-20 21:43:09 -07:00
"""
2025-07-10 14:48:35 +08:00
assert (
" present " in video_response
or " examine " in video_response
or " display " in video_response
or " hold " in video_response
2025-07-20 21:43:09 -07:00
) , f """
== == == == == == == == == == == video_response == == == == == == == == == == =
{ video_response }
== == == == == == == == == == == == == == == == == == == == == == == == == == == == == =
should contain ' present ' or ' examine ' or ' display ' or ' hold '
"""
2025-07-10 14:48:35 +08:00
self . assertIsNotNone ( video_response )
self . assertGreater ( len ( video_response ) , 0 )
2025-08-17 13:25:02 +08:00
class VideoOpenAITestMixin ( TestOpenAIOmniServerBase ) :
def prepare_video_messages ( self , video_path ) :
messages = [
{
" role " : " user " ,
" content " : [
{
" type " : " video_url " ,
" video_url " : { " url " : f " { video_path } " } ,
} ,
{ " type " : " text " , " text " : " Please describe the video in detail. " } ,
] ,
} ,
]
return messages
def test_video_chat_completion ( self ) :
2025-03-23 13:48:49 +08:00
url = VIDEO_JOBS_URL
file_path = self . get_or_download_file ( url )
2024-08-24 05:11:16 +08:00
client = openai . Client ( api_key = self . api_key , base_url = self . base_url )
messages = self . prepare_video_messages ( file_path )
2025-03-25 11:08:40 +08:00
response = client . chat . completions . create (
2024-08-24 05:11:16 +08:00
model = " default " ,
messages = messages ,
temperature = 0 ,
max_tokens = 1024 ,
2025-03-25 11:08:40 +08:00
stream = False ,
2025-07-18 21:03:53 -07:00
* * ( self . get_vision_request_kwargs ( ) ) ,
2024-08-24 05:11:16 +08:00
)
2024-08-24 08:02:23 -07:00
2025-03-25 11:08:40 +08:00
video_response = response . choices [ 0 ] . message . content
2024-08-24 05:11:16 +08:00
print ( " - " * 30 )
2025-03-25 11:08:40 +08:00
print ( f " Video response: \n { video_response } " )
2024-08-24 05:11:16 +08:00
print ( " - " * 30 )
# Add assertions to validate the video response
2025-05-22 20:32:41 -07:00
assert (
2025-07-10 14:48:35 +08:00
" iPod " in video_response
or " device " in video_response
or " microphone " in video_response
2025-05-22 20:32:41 -07:00
) , f " video_response: { video_response } , should contain ' iPod ' or ' device ' "
2025-02-16 16:58:53 +08:00
assert (
" man " in video_response
or " person " in video_response
or " individual " in video_response
2025-03-18 09:12:38 +08:00
or " speaker " in video_response
2025-08-13 02:08:30 -07:00
or " presenter " in video_response
2025-08-09 00:59:13 -07:00
or " hand " in video_response
2025-08-13 02:08:30 -07:00
) , f " video_response: { video_response } , should either have ' man ' in video_response, or ' person ' in video_response, or ' individual ' in video_response or ' speaker ' in video_response or ' presenter ' or ' hand ' in video_response "
2025-02-16 16:58:53 +08:00
assert (
" present " in video_response
or " examine " in video_response
or " display " in video_response
2025-03-23 13:48:49 +08:00
or " hold " in video_response
2025-05-22 20:32:41 -07:00
) , f " video_response: { video_response } , should contain ' present ' , ' examine ' , ' display ' , or ' hold ' "
assert (
" black " in video_response or " dark " in video_response
) , f " video_response: { video_response } , should contain ' black ' or ' dark ' "
2024-08-24 05:11:16 +08:00
self . assertIsNotNone ( video_response )
self . assertGreater ( len ( video_response ) , 0 )