diff --git a/python/sglang/srt/multimodal/processors/base_processor.py b/python/sglang/srt/multimodal/processors/base_processor.py index c7df9265d..c98720652 100644 --- a/python/sglang/srt/multimodal/processors/base_processor.py +++ b/python/sglang/srt/multimodal/processors/base_processor.py @@ -192,7 +192,12 @@ class BaseMultimodalProcessor(ABC): # name of the feature filed # TODO: pass from processors - self.FEATURE_NAMES = ["pixel_values", "pixel_values_videos", "audio_features"] + self.FEATURE_NAMES = [ + "pixel_values", + "pixel_values_videos", + "audio_features", + "input_features", + ] def process_mm_data( self, input_text, images=None, videos=None, audios=None, **kwargs diff --git a/test/srt/test_vision_openai_server_b.py b/test/srt/test_vision_openai_server_b.py index f5b33a72e..533312aaf 100644 --- a/test/srt/test_vision_openai_server_b.py +++ b/test/srt/test_vision_openai_server_b.py @@ -171,6 +171,11 @@ class TestGemma3nServer(TestOpenAIVisionServer): ) cls.base_url += "/v1" + def test_audio_chat_completion(self): + self._test_audio_speech_completion() + # This _test_audio_ambient_completion test is way too complicated to pass for a small LLM + # self._test_audio_ambient_completion() + class TestKimiVLServer(TestOpenAIVisionServer): @classmethod @@ -252,9 +257,7 @@ class TestPhi4MMServer(TestOpenAIVisionServer): def test_audio_chat_completion(self): self._test_audio_speech_completion() - # TODO: currently phi4-mm cannot pass this test. - # We are investigating this issue. - # Response: La ciudad está situada en la costa este de la isla, en la desembocadura del río St. Lawrence. + # This _test_audio_ambient_completion test is way too complicated to pass for a small LLM # self._test_audio_ambient_completion()