diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 0926cfbe9..21f9a2111 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,16 +1,15 @@ - + ## Motivation - + -## Modification +## Modifications - + ## Checklist -- [ ] Before submitting a PR for review, make sure it has passed verification in your local development environment **at least**. -- [ ] Ensure pre-commit `pre-commit run --all-files` or other linting tools are used to fix potential lint issues. -- [ ] Confirm that modifications are covered by complete unit tests. If not, please add more unit tests for correctness. -- [ ] Modify documentation as needed, such as docstrings or example tutorials. +- [ ] Format your code according to the [Contributor Guide](https://github.com/sgl-project/sglang/blob/main/docs/en/contributor_guide.md). +- [ ] Add unit tests as outlined in the [Contributor Guide](https://github.com/sgl-project/sglang/blob/main/docs/en/contributor_guide.md). +- [ ] Update documentation as needed, including docstrings or example tutorials. \ No newline at end of file diff --git a/README.md b/README.md index 9ac4fbb30..10b4f95ff 100644 --- a/README.md +++ b/README.md @@ -81,14 +81,17 @@ docker run --gpus all \ ### Method 4: Using docker compose +
> This method is recommended if you plan to serve it as a service. > A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml). 1. Copy the [compose.yml](./docker/compose.yaml) to your local machine 2. Execute the command `docker compose up -d` in your terminal. +
### Method 5: Run on Kubernetes or Clouds with SkyPilot +
To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot). 1. Install SkyPilot and set up Kubernetes cluster or cloud access: see [SkyPilot's documentation](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html). @@ -114,8 +117,6 @@ run: | --port 30000 ``` -
- ```bash # Deploy on any cloud or Kubernetes cluster. Use --cloud to select a specific cloud provider. HF_TOKEN= sky launch -c sglang --env HF_TOKEN sglang.yaml @@ -124,7 +125,7 @@ HF_TOKEN= sky launch -c sglang --env HF_TOKEN sglang.yaml sky status --endpoint 30000 sglang ``` 3. To further scale up your deployment with autoscaling and failure recovery, check out the [SkyServe + SGLang guide](https://github.com/skypilot-org/skypilot/tree/master/llm/sglang#serving-llama-2-with-sglang-for-more-traffic-using-skyserve). - + ### Common Notes diff --git a/python/sglang/srt/hf_transformers_utils.py b/python/sglang/srt/hf_transformers_utils.py index fb198fd73..b3576b47b 100644 --- a/python/sglang/srt/hf_transformers_utils.py +++ b/python/sglang/srt/hf_transformers_utils.py @@ -147,13 +147,12 @@ def get_tokenizer( and kwargs.get("use_fast", True) and tokenizer_name != _FAST_LLAMA_TOKENIZER ): - pass - # warnings.warn( - # "For some LLaMA V1 models, initializing the fast tokenizer may " - # "take a long time. To reduce the initialization time, consider " - # f"using '{_FAST_LLAMA_TOKENIZER}' instead of the original " - # "tokenizer." - # ) + warnings.warn( + "For some LLaMA V1 models, initializing the fast tokenizer may " + "take a long time. To reduce the initialization time, consider " + f"using '{_FAST_LLAMA_TOKENIZER}' instead of the original " + "tokenizer." + ) try: tokenizer = AutoTokenizer.from_pretrained( tokenizer_name, diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index 42c291bb1..14374e580 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -270,7 +270,7 @@ class Req: if all_ids[prompt_tokens - 1] != self.origin_input_ids_unpadded[-1]: # TODO(lsyin): fix token fusion - warnings.warn( + logging.warning( "Token fusion between input and output, try to avoid this by removing the space at the end of the input." 
) return False @@ -791,7 +791,7 @@ class ScheduleBatch: ) if not torch.all(success): - warnings.warn("Sampling failed, fallback to top_k=1 strategy") + logging.warning("Sampling failed, fallback to top_k=1 strategy") probs = probs.masked_fill(torch.isnan(probs), 0.0) argmax_ids = torch.argmax(probs, dim=-1) batch_next_token_ids = torch.where( diff --git a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py index b6cfa68bd..b8a4576f7 100644 --- a/python/sglang/srt/managers/tp_worker.py +++ b/python/sglang/srt/managers/tp_worker.py @@ -774,7 +774,7 @@ class ModelTpServer: torch.cuda.empty_cache() logger.info("Cache flushed successfully!") else: - warnings.warn( + logger.warning( f"Cache not flushed because there are pending requests. " f"#queue-req: {len(self.waiting_queue)}, " f"#running-req: {0 if self.running_batch is None else len(self.running_batch.reqs)}" diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index b74a19e60..2406addc8 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -237,7 +237,7 @@ class ModelRunner: self.max_total_num_tokens = self.profile_max_num_token(total_gpu_memory) if max_total_tokens is not None: if max_total_tokens > self.max_total_num_tokens: - warnings.warn( + logging.warning( f"max_total_tokens={max_total_tokens} is larger than the profiled value " f"{self.max_total_num_tokens}. " f"Use the profiled value instead." diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index 12b40d6c4..582457ae0 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -17,10 +17,10 @@ limitations under the License. 
import asyncio import json +import logging import os import time import uuid -import warnings from http import HTTPStatus from typing import Dict, List, Optional @@ -65,6 +65,8 @@ from sglang.srt.openai_api.protocol import ( UsageInfo, ) +logger = logging.getLogger(__name__) + chat_template_name = None @@ -408,7 +410,7 @@ def v1_generate_request(all_requests: List[CompletionRequest]): "Parallel sampling is not supported for completions from files" ) if request.echo and request.logprobs: - warnings.warn( + logger.warning( "Echo is not compatible with logprobs. " "To compute logprobs of input prompt, please use SGLang /request API." )