diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index 0926cfbe9..21f9a2111 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,16 +1,15 @@
-
+
## Motivation
-
+
-## Modification
+## Modifications
-
+
## Checklist
-- [ ] Before submitting a PR for review, make sure it has passed verification in your local development environment **at least**.
-- [ ] Ensure pre-commit `pre-commit run --all-files` or other linting tools are used to fix potential lint issues.
-- [ ] Confirm that modifications are covered by complete unit tests. If not, please add more unit tests for correctness.
-- [ ] Modify documentation as needed, such as docstrings or example tutorials.
+- [ ] Format your code according to the [Contributor Guide](https://github.com/sgl-project/sglang/blob/main/docs/en/contributor_guide.md).
+- [ ] Add unit tests as outlined in the [Contributor Guide](https://github.com/sgl-project/sglang/blob/main/docs/en/contributor_guide.md).
+- [ ] Update documentation as needed, including docstrings or example tutorials.
\ No newline at end of file
diff --git a/README.md b/README.md
index 9ac4fbb30..10b4f95ff 100644
--- a/README.md
+++ b/README.md
@@ -81,14 +81,17 @@ docker run --gpus all \
### Method 4: Using docker compose
+
> This method is recommended if you plan to serve it as a service.
> A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).
1. Copy the [compose.yml](./docker/compose.yaml) to your local machine
2. Execute the command `docker compose up -d` in your terminal.
+
### Method 5: Run on Kubernetes or Clouds with SkyPilot
+
To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot).
1. Install SkyPilot and set up Kubernetes cluster or cloud access: see [SkyPilot's documentation](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html).
@@ -114,8 +117,6 @@ run: |
--port 30000
```
-
-
```bash
# Deploy on any cloud or Kubernetes cluster. Use --cloud to select a specific cloud provider.
HF_TOKEN= sky launch -c sglang --env HF_TOKEN sglang.yaml
@@ -124,7 +125,7 @@ HF_TOKEN= sky launch -c sglang --env HF_TOKEN sglang.yaml
sky status --endpoint 30000 sglang
```
3. To further scale up your deployment with autoscaling and failure recovery, check out the [SkyServe + SGLang guide](https://github.com/skypilot-org/skypilot/tree/master/llm/sglang#serving-llama-2-with-sglang-for-more-traffic-using-skyserve).
-
+
### Common Notes
diff --git a/python/sglang/srt/hf_transformers_utils.py b/python/sglang/srt/hf_transformers_utils.py
index fb198fd73..b3576b47b 100644
--- a/python/sglang/srt/hf_transformers_utils.py
+++ b/python/sglang/srt/hf_transformers_utils.py
@@ -147,13 +147,12 @@ def get_tokenizer(
and kwargs.get("use_fast", True)
and tokenizer_name != _FAST_LLAMA_TOKENIZER
):
- pass
- # warnings.warn(
- # "For some LLaMA V1 models, initializing the fast tokenizer may "
- # "take a long time. To reduce the initialization time, consider "
- # f"using '{_FAST_LLAMA_TOKENIZER}' instead of the original "
- # "tokenizer."
- # )
+ warnings.warn(
+ "For some LLaMA V1 models, initializing the fast tokenizer may "
+ "take a long time. To reduce the initialization time, consider "
+ f"using '{_FAST_LLAMA_TOKENIZER}' instead of the original "
+ "tokenizer."
+ )
try:
tokenizer = AutoTokenizer.from_pretrained(
tokenizer_name,
diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py
index 42c291bb1..14374e580 100644
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -270,7 +270,7 @@ class Req:
if all_ids[prompt_tokens - 1] != self.origin_input_ids_unpadded[-1]:
# TODO(lsyin): fix token fusion
- warnings.warn(
+            logger.warning(
"Token fusion between input and output, try to avoid this by removing the space at the end of the input."
)
return False
@@ -791,7 +791,7 @@ class ScheduleBatch:
)
if not torch.all(success):
- warnings.warn("Sampling failed, fallback to top_k=1 strategy")
+            logger.warning("Sampling failed, fallback to top_k=1 strategy")
probs = probs.masked_fill(torch.isnan(probs), 0.0)
argmax_ids = torch.argmax(probs, dim=-1)
batch_next_token_ids = torch.where(
diff --git a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py
index b6cfa68bd..b8a4576f7 100644
--- a/python/sglang/srt/managers/tp_worker.py
+++ b/python/sglang/srt/managers/tp_worker.py
@@ -774,7 +774,7 @@ class ModelTpServer:
torch.cuda.empty_cache()
logger.info("Cache flushed successfully!")
else:
- warnings.warn(
+            logger.warning(
f"Cache not flushed because there are pending requests. "
f"#queue-req: {len(self.waiting_queue)}, "
f"#running-req: {0 if self.running_batch is None else len(self.running_batch.reqs)}"
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index b74a19e60..2406addc8 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -237,7 +237,7 @@ class ModelRunner:
self.max_total_num_tokens = self.profile_max_num_token(total_gpu_memory)
if max_total_tokens is not None:
if max_total_tokens > self.max_total_num_tokens:
- warnings.warn(
+            logger.warning(
f"max_total_tokens={max_total_tokens} is larger than the profiled value "
f"{self.max_total_num_tokens}. "
f"Use the profiled value instead."
diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py
index 12b40d6c4..582457ae0 100644
--- a/python/sglang/srt/openai_api/adapter.py
+++ b/python/sglang/srt/openai_api/adapter.py
@@ -17,10 +17,10 @@ limitations under the License.
import asyncio
import json
+import logging
import os
import time
import uuid
-import warnings
from http import HTTPStatus
from typing import Dict, List, Optional
@@ -65,6 +65,8 @@ from sglang.srt.openai_api.protocol import (
UsageInfo,
)
+logger = logging.getLogger(__name__)
+
chat_template_name = None
@@ -408,7 +410,7 @@ def v1_generate_request(all_requests: List[CompletionRequest]):
"Parallel sampling is not supported for completions from files"
)
if request.echo and request.logprobs:
- warnings.warn(
+ logger.warning(
"Echo is not compatible with logprobs. "
"To compute logprobs of input prompt, please use SGLang /request API."
)