diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py index 80816c28e..9625ff44e 100644 --- a/python/sglang/srt/managers/io_struct.py +++ b/python/sglang/srt/managers/io_struct.py @@ -56,6 +56,9 @@ class GenerateReqInput: # LoRA related lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None + # Whether it is a single request or a batch request + is_single: bool = True + def post_init(self): if (self.text is None and self.input_ids is None) or ( self.text is not None and self.input_ids is not None diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 2621ccd4f..2bc7ff04b 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -150,9 +150,13 @@ class TokenizerManager: while self.model_update_lock.locked(): await asyncio.sleep(0.001) + if isinstance(obj, EmbeddingReqInput) and self.is_generation: + raise ValueError( + "This model does not appear to be an embedding model by default. Please add `--is-embedding` when launching the server or try another model." + ) + obj.post_init() is_single = obj.is_single - if is_single: async for response in self._handle_single_request(obj, request): yield response diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 644cb2b8a..9afae99f9 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -542,8 +542,6 @@ def _wait_and_warmup(server_args, pipe_finish_writer, pid): kill_child_process(pid, including_parent=False) return - print(f"{res.json()=}") - logger.info("The server is fired up and ready to roll!") if pipe_finish_writer is not None: pipe_finish_writer.send("ready") diff --git a/test/srt/models/test_generation_models.py b/test/srt/models/test_generation_models.py index ad887e819..4d05eab8d 100755 --- a/test/srt/models/test_generation_models.py +++ b/test/srt/models/test_generation_models.py @@ -40,20 +40,23 @@ class ModelCase: prefill_tolerance: float = 5e-2 decode_tolerance: float = 5e-2 rouge_l_tolerance: float = 1 + skip_long_prompt: bool = False # Popular models that run on the CI CI_MODELS = [ ModelCase("meta-llama/Llama-3.1-8B-Instruct"), - ModelCase("google/gemma-2-2b"), + ModelCase( + "google/gemma-2-2b", skip_long_prompt=True + ), # There is a bug with new transformers library. This can only run with transformers==4.44 ] # All other models that do not run on the CI ALL_OTHER_MODELS = [ ModelCase("Qwen/Qwen2-1.5B"), ModelCase("Qwen/Qwen2.5-14B-Instruct"), - ModelCase("HuggingFaceTB/SmolLM-135M-Instruct"), - ModelCase("allenai/OLMo-1B-0724-hf", decode_tolerance=8e-2), + ModelCase("HuggingFaceTB/SmolLM-135M-Instruct", skip_long_prompt=True), + ModelCase("allenai/OLMo-1B-0724-hf", decode_tolerance=8e-2, skip_long_prompt=True), ] TORCH_DTYPES = [torch.float16] @@ -136,8 +139,15 @@ class TestGenerationModels(unittest.TestCase): def test_ci_models(self): for model_case in CI_MODELS: for torch_dtype in TORCH_DTYPES: + + # Skip long prompts for models that do not have a long context + prompts = DEFAULT_PROMPTS + if model_case.skip_long_prompt: + prompts = [p for p in DEFAULT_PROMPTS if len(p) < 1000] + + # Assert the logits and output strs are close self.assert_close_logits_and_output_strs( - DEFAULT_PROMPTS, model_case, torch_dtype + prompts, model_case, torch_dtype ) def test_others(self): @@ -152,13 +162,9 @@ class TestGenerationModels(unittest.TestCase): ): continue - # Skip long prompts for models that does not have a long context + # Skip long prompts for models that do not have a long context prompts = DEFAULT_PROMPTS - if model_case.model_path in [ - "HuggingFaceTB/SmolLM-135M-Instruct", - "allenai/OLMo-1B-0724-hf", - "google/gemma-2-2b", # There is a bug with new transformers library. This can only run with transformers==4.44 - ]: + if model_case.skip_long_prompt: prompts = [p for p in DEFAULT_PROMPTS if len(p) < 1000] # Assert the logits and output strs are close