Fix package loss for small models (#2717)
Co-authored-by: sdli1995 <mmlmonkey@163.com>
This commit is contained in:
@@ -6,7 +6,7 @@ Welcome to **SGLang**! We appreciate your interest in contributing. This guide p
|
|||||||
|
|
||||||
### Fork and Clone the Repository
|
### Fork and Clone the Repository
|
||||||
|
|
||||||
**Note**: New contributors do **not** have the write permission to push to SGLang. Please fork the repository under your GitHub account, then clone your fork locally.
|
**Note**: New contributors do **not** have the write permission to push to the official SGLang repo. Please fork the repository under your GitHub account, then clone your fork locally.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
git clone https://github.com/<your_user_name>/sglang.git
|
git clone https://github.com/<your_user_name>/sglang.git
|
||||||
@@ -36,7 +36,6 @@ SGLang uses Python's built-in [unittest](https://docs.python.org/3/library/unitt
|
|||||||
|
|
||||||
We recommend new contributors start by writing documentation, which helps you quickly understand the SGLang codebase. For more details, please refer to [docs/README.md](https://github.com/sgl-project/sglang/tree/main/docs/README.md).
|
We recommend new contributors start by writing documentation, which helps you quickly understand the SGLang codebase. For more details, please refer to [docs/README.md](https://github.com/sgl-project/sglang/tree/main/docs/README.md).
|
||||||
|
|
||||||
|
|
||||||
## Tips for Newcomers
|
## Tips for Newcomers
|
||||||
|
|
||||||
If you want to contribute but don’t have a specific idea in mind, pick issues labeled [“good first issue” or “help wanted”](https://github.com/sgl-project/sglang/issues?q=is%3Aissue+label%3A%22good+first+issue%22%2C%22help+wanted%22). These tasks typically have lower complexity and provide an excellent introduction to the codebase. Also check out this [code walk-through](https://github.com/zhaochenyang20/Awesome-ML-SYS-Tutorial/tree/main/sglang/code-walk-through) for a deeper look into SGLang’s workflow.
|
If you want to contribute but don’t have a specific idea in mind, pick issues labeled [“good first issue” or “help wanted”](https://github.com/sgl-project/sglang/issues?q=is%3Aissue+label%3A%22good+first+issue%22%2C%22help+wanted%22). These tasks typically have lower complexity and provide an excellent introduction to the codebase. Also check out this [code walk-through](https://github.com/zhaochenyang20/Awesome-ML-SYS-Tutorial/tree/main/sglang/code-walk-through) for a deeper look into SGLang’s workflow.
|
||||||
|
|||||||
@@ -1364,11 +1364,11 @@ class Scheduler:
|
|||||||
embeddings = []
|
embeddings = []
|
||||||
prompt_tokens = []
|
prompt_tokens = []
|
||||||
for req in reqs:
|
for req in reqs:
|
||||||
assert req.finished()
|
if req.finished():
|
||||||
rids.append(req.rid)
|
rids.append(req.rid)
|
||||||
finished_reasons.append(req.finished_reason.to_json())
|
finished_reasons.append(req.finished_reason.to_json())
|
||||||
embeddings.append(req.embedding)
|
embeddings.append(req.embedding)
|
||||||
prompt_tokens.append(len(req.origin_input_ids))
|
prompt_tokens.append(len(req.origin_input_ids))
|
||||||
self.send_to_detokenizer.send_pyobj(
|
self.send_to_detokenizer.send_pyobj(
|
||||||
BatchEmbeddingOut(rids, finished_reasons, embeddings, prompt_tokens)
|
BatchEmbeddingOut(rids, finished_reasons, embeddings, prompt_tokens)
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -222,10 +222,8 @@ class TokenizerManager:
|
|||||||
is_single = obj.is_single
|
is_single = obj.is_single
|
||||||
if is_single:
|
if is_single:
|
||||||
tokenized_obj = await self._tokenize_one_request(obj)
|
tokenized_obj = await self._tokenize_one_request(obj)
|
||||||
self.send_to_scheduler.send_pyobj(tokenized_obj)
|
self._send_one_request(obj, tokenized_obj, created_time)
|
||||||
async for response in self._wait_one_response(
|
async for response in self._wait_one_response(obj, request):
|
||||||
obj, request, created_time
|
|
||||||
):
|
|
||||||
yield response
|
yield response
|
||||||
else:
|
else:
|
||||||
async for response in self._handle_batch_request(
|
async for response in self._handle_batch_request(
|
||||||
@@ -306,16 +304,24 @@ class TokenizerManager:
|
|||||||
|
|
||||||
return tokenized_obj
|
return tokenized_obj
|
||||||
|
|
||||||
|
def _send_one_request(
|
||||||
|
self,
|
||||||
|
obj: Union[GenerateReqInput, EmbeddingReqInput],
|
||||||
|
tokenized_obj: Union[TokenizedGenerateReqInput, TokenizedEmbeddingReqInput],
|
||||||
|
created_time: Optional[float] = None,
|
||||||
|
):
|
||||||
|
event = asyncio.Event()
|
||||||
|
state = ReqState([], False, event, obj, created_time=created_time)
|
||||||
|
self.rid_to_state[obj.rid] = state
|
||||||
|
self.send_to_scheduler.send_pyobj(tokenized_obj)
|
||||||
|
|
||||||
async def _wait_one_response(
|
async def _wait_one_response(
|
||||||
self,
|
self,
|
||||||
obj: Union[GenerateReqInput, EmbeddingReqInput],
|
obj: Union[GenerateReqInput, EmbeddingReqInput],
|
||||||
request: Optional[fastapi.Request] = None,
|
request: Optional[fastapi.Request] = None,
|
||||||
created_time: Optional[float] = None,
|
|
||||||
):
|
):
|
||||||
"""Wait for the response of one request."""
|
"""Wait for the response of one request."""
|
||||||
event = asyncio.Event()
|
state = self.rid_to_state[obj.rid]
|
||||||
state = ReqState([], False, event, obj, created_time=created_time)
|
|
||||||
self.rid_to_state[obj.rid] = state
|
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
@@ -361,10 +367,8 @@ class TokenizerManager:
|
|||||||
for i in range(batch_size):
|
for i in range(batch_size):
|
||||||
tmp_obj = obj[i]
|
tmp_obj = obj[i]
|
||||||
tokenized_obj = await self._tokenize_one_request(tmp_obj)
|
tokenized_obj = await self._tokenize_one_request(tmp_obj)
|
||||||
self.send_to_scheduler.send_pyobj(tokenized_obj)
|
self._send_one_request(tmp_obj, tokenized_obj, created_time)
|
||||||
generators.append(
|
generators.append(self._wait_one_response(tmp_obj, request))
|
||||||
self._wait_one_response(tmp_obj, request, created_time)
|
|
||||||
)
|
|
||||||
rids.append(tmp_obj.rid)
|
rids.append(tmp_obj.rid)
|
||||||
else:
|
else:
|
||||||
# FIXME: When using batch and parallel_sample_num together, the perf is not optimal.
|
# FIXME: When using batch and parallel_sample_num together, the perf is not optimal.
|
||||||
@@ -389,10 +393,8 @@ class TokenizerManager:
|
|||||||
tokenized_obj.sampling_params = copy.copy(tokenized_obj.sampling_params)
|
tokenized_obj.sampling_params = copy.copy(tokenized_obj.sampling_params)
|
||||||
tokenized_obj.sampling_params.max_new_tokens = 0
|
tokenized_obj.sampling_params.max_new_tokens = 0
|
||||||
tokenized_obj.stream = False
|
tokenized_obj.stream = False
|
||||||
self.send_to_scheduler.send_pyobj(tokenized_obj)
|
self._send_one_request(tmp_obj, tokenized_obj, created_time)
|
||||||
await self._wait_one_response(
|
await self._wait_one_response(tmp_obj, request).__anext__()
|
||||||
tmp_obj, request, created_time
|
|
||||||
).__anext__()
|
|
||||||
|
|
||||||
# Expand requests, assign new rids for them, and send them
|
# Expand requests, assign new rids for them, and send them
|
||||||
for i in range(batch_size):
|
for i in range(batch_size):
|
||||||
@@ -400,10 +402,8 @@ class TokenizerManager:
|
|||||||
tmp_obj = copy.copy(objs[i])
|
tmp_obj = copy.copy(objs[i])
|
||||||
tokenized_obj = copy.copy(tokenized_objs[i])
|
tokenized_obj = copy.copy(tokenized_objs[i])
|
||||||
tokenized_obj.rid = tmp_obj.regenerate_rid()
|
tokenized_obj.rid = tmp_obj.regenerate_rid()
|
||||||
self.send_to_scheduler.send_pyobj(tokenized_obj)
|
self._send_one_request(tmp_obj, tokenized_obj, created_time)
|
||||||
generators.append(
|
generators.append(self._wait_one_response(tmp_obj, request))
|
||||||
self._wait_one_response(tmp_obj, request, created_time)
|
|
||||||
)
|
|
||||||
rids.append(tmp_obj.rid)
|
rids.append(tmp_obj.rid)
|
||||||
|
|
||||||
# Wait for all requests
|
# Wait for all requests
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ python3 test_srt_endpoint.py
|
|||||||
python3 -m unittest test_srt_endpoint.TestSRTEndpoint.test_simple_decode
|
python3 -m unittest test_srt_endpoint.TestSRTEndpoint.test_simple_decode
|
||||||
|
|
||||||
# Run a suite with multiple files
|
# Run a suite with multiple files
|
||||||
python3 run_suite.py --suite minimal
|
python3 run_suite.py --suite per-commit
|
||||||
```
|
```
|
||||||
|
|
||||||
## Test Frontend Language
|
## Test Frontend Language
|
||||||
@@ -28,14 +28,14 @@ python3 test_openai_backend.py
|
|||||||
python3 -m unittest test_openai_backend.TestOpenAIBackend.test_few_shot_qa
|
python3 -m unittest test_openai_backend.TestOpenAIBackend.test_few_shot_qa
|
||||||
|
|
||||||
# Run a suite with multiple files
|
# Run a suite with multiple files
|
||||||
python3 run_suite.py --suite minimal
|
python3 run_suite.py --suite per-commit
|
||||||
```
|
```
|
||||||
|
|
||||||
## Adding or Updating Tests in CI
|
## Adding or Updating Tests in CI
|
||||||
|
|
||||||
- Create new test files under `test/srt` or `test/lang` depending on the type of test.
|
- Create new test files under `test/srt` or `test/lang` depending on the type of test.
|
||||||
- Ensure they are referenced in the respective `run_suite.py` (e.g., `test/srt/run_suite.py` or `test/lang/run_suite.py`) so they’re picked up in CI.
|
- Ensure they are referenced in the respective `run_suite.py` (e.g., `test/srt/run_suite.py` or `test/lang/run_suite.py`) so they’re picked up in CI. Most small test cases can be added to the `per-commit` suite.
|
||||||
- In CI, all tests run automatically. You may modify the workflows in [`.github/workflows/`](https://github.com/sgl-project/sglang/tree/main/.github/workflows) to add custom test groups or extra checks.
|
- The CI will run the `per-commit` and `nightly` suites automatically. If you need special setup or custom test groups, you may modify the workflows in [`.github/workflows/`](https://github.com/sgl-project/sglang/tree/main/.github/workflows).
|
||||||
|
|
||||||
|
|
||||||
## Writing Elegant Test Cases
|
## Writing Elegant Test Cases
|
||||||
|
|||||||
Reference in New Issue
Block a user