diff --git a/test/srt/test_vision_openai_server_b.py b/test/srt/test_vision_openai_server_b.py
index 3d1b6519c..f5b33a72e 100644
--- a/test/srt/test_vision_openai_server_b.py
+++ b/test/srt/test_vision_openai_server_b.py
@@ -172,29 +172,28 @@ class TestGemma3nServer(TestOpenAIVisionServer):
         cls.base_url += "/v1"
 
 
-# commented out before https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/discussions/27 get fixed
-# class TestKimiVLServer(TestOpenAIVisionServer):
-#     @classmethod
-#     def setUpClass(cls):
-#         cls.model = "moonshotai/Kimi-VL-A3B-Instruct"
-#         cls.base_url = DEFAULT_URL_FOR_TEST
-#         cls.api_key = "sk-123456"
-#         cls.process = popen_launch_server(
-#             cls.model,
-#             cls.base_url,
-#             timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-#             other_args=[
-#                 "--trust-remote-code",
-#                 "--context-length",
-#                 "4096",
-#                 "--dtype",
-#                 "bfloat16",
-#             ],
-#         )
-#         cls.base_url += "/v1"
+class TestKimiVLServer(TestOpenAIVisionServer):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "moonshotai/Kimi-VL-A3B-Instruct"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--context-length",
+                "4096",
+                "--dtype",
+                "bfloat16",
+            ],
+        )
+        cls.base_url += "/v1"
 
-#     def test_video_images_chat_completion(self):
-#         pass
+    def test_video_images_chat_completion(self):
+        pass
 
 
 class TestPhi4MMServer(TestOpenAIVisionServer):
diff --git a/test/srt/test_vlm_input_format.py b/test/srt/test_vlm_input_format.py
index 39f28a4b3..4f9ad64c3 100644
--- a/test/srt/test_vlm_input_format.py
+++ b/test/srt/test_vlm_input_format.py
@@ -189,32 +189,31 @@ class TestGemmaUnderstandsImage(VLMInputTestBase, unittest.IsolatedAsyncioTestCa
     )
 
 
-# commented out before https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/discussions/27 get fixed
-# class TestKimiVLImageUnderstandsImage(
-#     VLMInputTestBase, unittest.IsolatedAsyncioTestCase
-# ):
-#     model_path = "moonshotai/Kimi-VL-A3B-Instruct"
-#     chat_template = "kimi-vl"
+class TestKimiVLImageUnderstandsImage(
+    VLMInputTestBase, unittest.IsolatedAsyncioTestCase
+):
+    model_path = "moonshotai/Kimi-VL-A3B-Instruct"
+    chat_template = "kimi-vl"
 
-#     @classmethod
-#     def _init_visual(cls):
-#         model = AutoModel.from_pretrained(cls.model_path, trust_remote_code=True)
-#         cls.vision_tower = model.vision_tower.eval().to(cls.device)
-#         cls.mm_projector = model.multi_modal_projector.eval().to(cls.device)
+    @classmethod
+    def _init_visual(cls):
+        model = AutoModel.from_pretrained(cls.model_path, trust_remote_code=True)
+        cls.vision_tower = model.vision_tower.eval().to(cls.device)
+        cls.mm_projector = model.multi_modal_projector.eval().to(cls.device)
 
-#         cls.visual = lambda tokenizer_output: cls.mm_projector(
-#             cls.vision_tower(
-#                 pixel_values=tokenizer_output["pixel_values"],
-#                 grid_hws=tokenizer_output["image_grid_hws"],
-#             )
-#         )
+        cls.visual = lambda tokenizer_output: cls.mm_projector(
+            cls.vision_tower(
+                pixel_values=tokenizer_output["pixel_values"],
+                grid_hws=tokenizer_output["image_grid_hws"],
+            )
+        )
 
-#     def _pixel_values_image_data(self, processor_output):
-#         return dict(
-#             modality="IMAGE",
-#             pixel_values=processor_output["pixel_values"],
-#             image_grid_hws=processor_output["image_grid_hws"],
-#         )
+    def _pixel_values_image_data(self, processor_output):
+        return dict(
+            modality="IMAGE",
+            pixel_values=processor_output["pixel_values"],
+            image_grid_hws=processor_output["image_grid_hws"],
+        )
 
 
 # not for CI: too large