From 0f9cc6d8d3c688eeb8d61e5e869a59d8d756044b Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Thu, 2 Jan 2025 18:25:26 -0800 Subject: [PATCH] Fix package loss for small models (#2717) Co-authored-by: sdli1995 < mmlmonkey@163.com> --- docs/references/contribution_guide.md | 3 +- python/sglang/srt/managers/scheduler.py | 10 ++--- .../sglang/srt/managers/tokenizer_manager.py | 40 +++++++++---------- test/README.md | 8 ++-- 4 files changed, 30 insertions(+), 31 deletions(-) diff --git a/docs/references/contribution_guide.md b/docs/references/contribution_guide.md index 43495e21b..b2211f463 100644 --- a/docs/references/contribution_guide.md +++ b/docs/references/contribution_guide.md @@ -6,7 +6,7 @@ Welcome to **SGLang**! We appreciate your interest in contributing. This guide p ### Fork and Clone the Repository -**Note**: New contributors do **not** have the write permission to push to SGLang. Please fork the repository under your GitHub account, then clone your fork locally. +**Note**: New contributors do **not** have the write permission to push to the official SGLang repo. Please fork the repository under your GitHub account, then clone your fork locally. ```bash git clone https://github.com//sglang.git @@ -36,7 +36,6 @@ SGLang uses Python's built-in [unittest](https://docs.python.org/3/library/unitt We recommend new contributors start from writing documentation, which helps you quickly understand SGLang codebase. For more details, please refer to [docs/README.md](https://github.com/sgl-project/sglang/tree/main/docs/README.md). - ## Tips for Newcomers If you want to contribute but don’t have a specific idea in mind, pick issues labeled [“good first issue” or “help wanted”](https://github.com/sgl-project/sglang/issues?q=is%3Aissue+label%3A%22good+first+issue%22%2C%22help+wanted%22). These tasks typically have lower complexity and provide an excellent introduction to the codebase. 
Also check out this [code walk-through](https://github.com/zhaochenyang20/Awesome-ML-SYS-Tutorial/tree/main/sglang/code-walk-through) for a deeper look into SGLang’s workflow. diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 0d51c695a..8956352ca 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -1364,11 +1364,11 @@ class Scheduler: embeddings = [] prompt_tokens = [] for req in reqs: - assert req.finished() - rids.append(req.rid) - finished_reasons.append(req.finished_reason.to_json()) - embeddings.append(req.embedding) - prompt_tokens.append(len(req.origin_input_ids)) + if req.finished(): + rids.append(req.rid) + finished_reasons.append(req.finished_reason.to_json()) + embeddings.append(req.embedding) + prompt_tokens.append(len(req.origin_input_ids)) self.send_to_detokenizer.send_pyobj( BatchEmbeddingOut(rids, finished_reasons, embeddings, prompt_tokens) ) diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 3196e60cb..08dbd02c5 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -222,10 +222,8 @@ class TokenizerManager: is_single = obj.is_single if is_single: tokenized_obj = await self._tokenize_one_request(obj) - self.send_to_scheduler.send_pyobj(tokenized_obj) - async for response in self._wait_one_response( - obj, request, created_time - ): + self._send_one_request(obj, tokenized_obj, created_time) + async for response in self._wait_one_response(obj, request): yield response else: async for response in self._handle_batch_request( @@ -306,16 +304,24 @@ class TokenizerManager: return tokenized_obj + def _send_one_request( + self, + obj: Union[GenerateReqInput, EmbeddingReqInput], + tokenized_obj: Union[TokenizedGenerateReqInput, TokenizedEmbeddingReqInput], + created_time: Optional[float] = None, + ): + event = asyncio.Event() + 
state = ReqState([], False, event, obj, created_time=created_time) + self.rid_to_state[obj.rid] = state + self.send_to_scheduler.send_pyobj(tokenized_obj) + async def _wait_one_response( self, obj: Union[GenerateReqInput, EmbeddingReqInput], request: Optional[fastapi.Request] = None, - created_time: Optional[float] = None, ): """Wait for the response of one request.""" - event = asyncio.Event() - state = ReqState([], False, event, obj, created_time=created_time) - self.rid_to_state[obj.rid] = state + state = self.rid_to_state[obj.rid] while True: try: @@ -361,10 +367,8 @@ class TokenizerManager: for i in range(batch_size): tmp_obj = obj[i] tokenized_obj = await self._tokenize_one_request(tmp_obj) - self.send_to_scheduler.send_pyobj(tokenized_obj) - generators.append( - self._wait_one_response(tmp_obj, request, created_time) - ) + self._send_one_request(tmp_obj, tokenized_obj, created_time) + generators.append(self._wait_one_response(tmp_obj, request)) rids.append(tmp_obj.rid) else: # FIXME: When using batch and parallel_sample_num together, the perf is not optimal. 
@@ -389,10 +393,8 @@ class TokenizerManager: tokenized_obj.sampling_params = copy.copy(tokenized_obj.sampling_params) tokenized_obj.sampling_params.max_new_tokens = 0 tokenized_obj.stream = False - self.send_to_scheduler.send_pyobj(tokenized_obj) - await self._wait_one_response( - tmp_obj, request, created_time - ).__anext__() + self._send_one_request(tmp_obj, tokenized_obj, created_time) + await self._wait_one_response(tmp_obj, request).__anext__() # Expand requests, assign new rids for them, and send them for i in range(batch_size): @@ -400,10 +402,8 @@ class TokenizerManager: tmp_obj = copy.copy(objs[i]) tokenized_obj = copy.copy(tokenized_objs[i]) tokenized_obj.rid = tmp_obj.regenerate_rid() - self.send_to_scheduler.send_pyobj(tokenized_obj) - generators.append( - self._wait_one_response(tmp_obj, request, created_time) - ) + self._send_one_request(tmp_obj, tokenized_obj, created_time) + generators.append(self._wait_one_response(tmp_obj, request)) rids.append(tmp_obj.rid) # Wait for all requests diff --git a/test/README.md b/test/README.md index f06724e4e..3d739cc04 100644 --- a/test/README.md +++ b/test/README.md @@ -13,7 +13,7 @@ python3 test_srt_endpoint.py python3 -m unittest test_srt_endpoint.TestSRTEndpoint.test_simple_decode # Run a suite with multiple files -python3 run_suite.py --suite minimal +python3 run_suite.py --suite per-commit ``` ## Test Frontend Language @@ -28,14 +28,14 @@ python3 test_openai_backend.py python3 -m unittest test_openai_backend.TestOpenAIBackend.test_few_shot_qa # Run a suite with multiple files -python3 run_suite.py --suite minimal +python3 run_suite.py --suite per-commit ``` ## Adding or Updating Tests in CI - Create new test files under `test/srt` or `test/lang` depending on the type of test. -- Ensure they are referenced in the respective `run_suite.py` (e.g., `test/srt/run_suite.py` or `test/lang/run_suite.py`) so they’re picked up in CI. -- In CI, all tests run automatically. 
You may modify the workflows in [`.github/workflows/`](https://github.com/sgl-project/sglang/tree/main/.github/workflows) to add custom test groups or extra checks. +- Ensure they are referenced in the respective `run_suite.py` (e.g., `test/srt/run_suite.py` or `test/lang/run_suite.py`) so they’re picked up in CI. For most small test cases, they can be added to the `per-commit` suite. +- The CI will run the `per-commit` and `nightly` suites automatically. If you need special setup or custom test groups, you may modify the workflows in [`.github/workflows/`](https://github.com/sgl-project/sglang/tree/main/.github/workflows). ## Writing Elegant Test Cases