Misc clean up; Remove the support of jump forward (#4032)

This commit is contained in:
Lianmin Zheng
2025-03-03 07:02:14 -08:00
committed by GitHub
parent 110e006673
commit 935cda944b
41 changed files with 396 additions and 426 deletions

View File

@@ -385,7 +385,7 @@
"print(gen_response)\n",
"\n",
"# parse the response\n",
"parse_url = f\"http://localhost:{port}/function_call\"\n",
"parse_url = f\"http://localhost:{port}/parse_function_call\"\n",
"\n",
"function_call_input = {\n",
" \"text\": gen_response,\n",

View File

@@ -1,72 +1,284 @@
# Sampling Parameters
# Sampling Parameters in SGLang Runtime
This doc describes the sampling parameters of the SGLang Runtime.
It is the low-level endpoint of the runtime.
If you want a high-level endpoint that can automatically handle chat templates, consider using the [OpenAI Compatible API](https://docs.sglang.ai/backend/openai_api_completions.html).
If you want a high-level endpoint that can automatically handle chat templates, consider using the [OpenAI Compatible API](../backend/openai_api_completions.ipynb).
## `/generate` Endpoint
The `/generate` endpoint accepts the following arguments in the JSON format. You can code examples below.
The `/generate` endpoint accepts the following parameters in JSON format. For detailed usage, see the [native api doc](https://docs.sglang.ai/backend/native_api.html).
```python
@dataclass
class GenerateReqInput:
# The input prompt. It can be a single prompt or a batch of prompts.
text: Optional[Union[List[str], str]] = None
# The token ids for text; one can specify either text or input_ids
input_ids: Optional[Union[List[List[int]], List[int]]] = None
# The embeddings for input_ids; one can specify either text or input_ids or input_embeds.
input_embeds: Optional[Union[List[List[List[float]]], List[List[float]]]] = None
# The image input. It can be a file name, a url, or base64 encoded string.
# See also python/sglang/srt/utils.py:load_image.
image_data: Optional[Union[List[str], str]] = None
# The sampling_params. See descriptions below.
sampling_params: Optional[Union[List[Dict], Dict]] = None
# The request id.
rid: Optional[Union[List[str], str]] = None
# Whether to return logprobs.
return_logprob: Optional[Union[List[bool], bool]] = None
# If return logprobs, the start location in the prompt for returning logprobs.
# By default, this value is "-1", which means it will only return logprobs for output tokens.
logprob_start_len: Optional[Union[List[int], int]] = None
# If return logprobs, the number of top logprobs to return at each position.
top_logprobs_num: Optional[Union[List[int], int]] = None
# If return logprobs, the token ids to return logprob for.
token_ids_logprob: Optional[Union[List[List[int]], List[int]]] = None
# Whether to detokenize tokens in text in the returned logprobs.
return_text_in_logprobs: bool = False
# Whether to stream output.
stream: bool = False
* `prompt`: The input prompt. Can be a single prompt or a batch of prompts. `Optional[Union[List[str], str]] = None`
* `input_ids`: Alternative to `text`. Specify the input as token IDs instead of text. `Optional[Union[List[List[int]], List[int]]] = None`
* `sampling_params`: The sampling parameters as described in the sections below. `Optional[Union[List[Dict], Dict]] = None`
* `return_logprob`: Whether to return log probabilities for tokens. `Optional[Union[List[bool], bool]] = None`
* `logprob_start_len`: If returning log probabilities, specifies the start position in the prompt. Default is "-1" which returns logprobs only for output tokens. `Optional[Union[List[int], int]] = None`
* `top_logprobs_num`: If returning log probabilities, specifies the number of top logprobs to return at each position. `Optional[Union[List[int], int]] = None`
* `stream`: Whether to stream the output. `bool = False`
* `lora_path`: Path to LoRA weights. `Optional[Union[List[Optional[str]], Optional[str]]] = None`
* `custom_logit_processor`: Custom logit processor for advanced sampling control. For usage see below. `Optional[Union[List[Optional[str]], str]] = None`
* `return_hidden_states`: Whether to return hidden states of the model. Note that each time it changes, the cuda graph will be recaptured, which might lead to a performance hit. See the [examples](https://github.com/sgl-project/sglang/blob/main/examples/runtime/engine/hidden_states.py) for more information. `bool = False`
# The modalities of the image data [image, multi-images, video]
modalities: Optional[List[str]] = None
# LoRA related
lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
## Sampling params
# Custom logit processor for advanced sampling control. Must be a serialized instance
# of `CustomLogitProcessor` in python/sglang/srt/sampling/custom_logit_processor.py
# Use the processor's `to_str()` method to generate the serialized string.
custom_logit_processor: Optional[Union[List[Optional[str]], str]] = None
### Core Parameters
# Whether to return hidden states
return_hidden_states: bool = False
```
* `max_new_tokens`: The maximum output length measured in tokens. `int = 128`
* `stop`: One or multiple [stop words](https://platform.openai.com/docs/api-reference/chat/create#chat-create-stop). Generation will stop if one of these words is sampled. `Optional[Union[str, List[str]]] = None`
* `stop_token_ids`: Provide stop words in form of token ids. Generation will stop if one of these token ids is sampled. `Optional[List[int]] = []`
* `temperature`: [Temperature](https://platform.openai.com/docs/api-reference/chat/create#chat-create-temperature) when sampling the next token. `temperature = 0` corresponds to greedy sampling, higher temperature leads to more diversity. `float = 1.0`
* `top_p`: [Top-p](https://platform.openai.com/docs/api-reference/chat/create#chat-create-top_p) selects tokens from the smallest sorted set whose cumulative probability exceeds `top_p`. When `top_p = 1`, this reduces to unrestricted sampling from all tokens. `top_p: float = 1.0`
* `top_k`: [Top-k](https://developer.nvidia.com/blog/how-to-get-better-outputs-from-your-large-language-model/#predictability_vs_creativity) randomly selects from the `k` highest-probability tokens. `int = -1`
* `min_p`: [Min-p](https://github.com/huggingface/transformers/issues/27670) samples from tokens with probability larger than `min_p * highest_token_probability`. `float = 0.0`
The `sampling_params` follows this format
### Penalizers
```python
# The maximum number of output tokens
max_new_tokens: int = 128,
# Stop when hitting any of the strings in this list
stop: Optional[Union[str, List[str]]] = None,
# Stop when hitting any of the token_ids in this list
stop_token_ids: Optional[List[int]] = [],
# Sampling temperature
temperature: float = 1.0,
# Top-p sampling
top_p: float = 1.0,
# Top-k sampling
top_k: int = -1,
# Min-p sampling
min_p: float = 0.0,
# Do parallel sampling and return `n` outputs.
n: int = 1,
To use penalizers you will need to pass `--disable-overlap` when launching the server. Please note that this might degrade performance.
## Structured Outputs
# Only one of the below three can be set for a request.
* `frequency_penalty`: Penalizes tokens based on their frequency in the generation so far. Must be between `-2` and `2` where negative numbers encourage repetition of tokens and positive numbers encourage sampling of new tokens. The penalization grows linearly with each appearance of a token. `float = 0.0`
* `presence_penalty`: Penalizes tokens if they appeared in the generation so far. Must be between `-2` and `2` where negative numbers encourage repetition of tokens and positive numbers encourage sampling of new tokens. The penalization is constant once a token has occurred. `float = 0.0`
* `repetition_penalty`: Penalizes tokens if they appeared in the prompt or generation so far. Must be between `0` and `2` where numbers smaller than `1` encourage repetition of tokens and numbers larger than `1` encourage sampling of new tokens. The penalization scales multiplicatively. `float = 0.0`
* `min_new_tokens`: Forces the model to generate at least `min_new_tokens` tokens until a stop word or EOS token is sampled. Note that this might lead to unintended behavior, for example, if the distribution is highly skewed towards these tokens. `int = 0`
# Constrain the output to follow a given JSON schema.
json_schema: Optional[str] = None,
# Constrain the output to follow a given regular expression.
regex: Optional[str] = None,
# Constrain the output to follow a given EBNF grammar.
ebnf: Optional[str] = None,
### Constrained decoding
## Penalties
Please refer to our dedicated guide on [constrained decoding](https://docs.sglang.ai/backend/structured_outputs.html#Native-API-and-SGLang-Runtime-(SRT)) for the following parameters.
# Float that penalizes new tokens based on their frequency in the generated text so far.
# Values > 0 encourage the model to use new tokens, while values < 0 encourage the model to
# repeat tokens. Must be -2 <= value <= 2. Setting to 0 (default) will disable this penalty.
frequency_penalty: float = 0.0,
# Float that penalizes new tokens based on whether they appear in the generated text so far.
# Values > 0 encourage the model to use new tokens, while values < 0 encourage the model to repeat
# tokens. Must be -2 <= value <= 2. Setting to 0 (default) will disable this penalty.
presence_penalty: float = 0.0,
# Guides inference to generate at least this number of tokens by penalizing logits of tokenizer's
# EOS token and `stop_token_ids` to -inf, until the output token reaches given length.
# Note that any of the `stop` string can be generated before reaching `min_new_tokens`, as it is
# difficult to infer the correct token ID by given `stop` strings.
# Must be 0 <= value < max_new_tokens. Setting to 0 (default) will disable this penalty.
min_new_tokens: int = 0,
* `json_schema`: `Optional[str] = None`
* `regex`: `Optional[str] = None`
* `ebnf`: `Optional[str] = None`
# Whether to ignore EOS token
ignore_eos: bool = False,
# Whether to skip the special tokens during detokenization
skip_special_tokens: bool = True,
# Whether to add spaces between special tokens during detokenization
spaces_between_special_tokens: bool = True,
### Other options
## Custom Parameters for Custom Logit Processor.
# A dictionary of custom parameters for the custom logit processor.
# The custom logit processor takes a list of dictionaries as input, where each
# dictionary is the custom parameters for one token in a batch of the input.
# See also python/sglang/srt/sampling/custom_logit_processor.py
custom_params: Optional[Dict[str, Any]] = None,
```
* `n`: Specifies the number of output sequences to generate per request. (Generating multiple outputs in one request (n > 1) is discouraged; repeating the same prompt several times offers better control and efficiency.) `int = 1`
* `spaces_between_special_tokens`: Whether or not to add spaces between special tokens during detokenization. `bool = True`
* `no_stop_trim`: Don't trim stop words or EOS token from the generated text. `bool = False`
* `ignore_eos`: Don't stop generation when EOS token is sampled. `bool = False`
* `skip_special_tokens`: Remove special tokens during decoding. `bool = True`
* `custom_params`: Used when employing `CustomLogitProcessor`. For usage see below. `Optional[List[Optional[Dict[str, Any]]]] = None`
## Examples
### Normal
Launch a server
```
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000
```
Send a request
```python
import requests
response = requests.post(
"http://localhost:30000/generate",
json={
"text": "The capital of France is",
"sampling_params": {
"temperature": 0,
"max_new_tokens": 32,
},
},
)
print(response.json())
```
### Streaming
Send a request and stream the output
```python
import requests, json
response = requests.post(
"http://localhost:30000/generate",
json={
"text": "The capital of France is",
"sampling_params": {
"temperature": 0,
"max_new_tokens": 32,
},
"stream": True,
},
stream=True,
)
prev = 0
for chunk in response.iter_lines(decode_unicode=False):
chunk = chunk.decode("utf-8")
if chunk and chunk.startswith("data:"):
if chunk == "data: [DONE]":
break
data = json.loads(chunk[5:].strip("\n"))
output = data["text"].strip()
print(output[prev:], end="", flush=True)
prev = len(output)
print("")
```
### Multi modal
Launch a server
```
python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --chat-template chatml-llava
```
Download an image
```
curl -o example_image.png -L https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true
```
Send a request
```python
import requests
response = requests.post(
"http://localhost:30000/generate",
json={
"text": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
"<|im_start|>user\n<image>\nDescribe this image in a very short sentence.<|im_end|>\n"
"<|im_start|>assistant\n",
"image_data": "example_image.png",
"sampling_params": {
"temperature": 0,
"max_new_tokens": 32,
},
},
)
print(response.json())
```
The `image_data` can be a file name, a URL, or a base64 encoded string. See also `python/sglang/srt/utils.py:load_image`.
Streaming is supported in a similar manner as [above](#streaming).
### Structured Outputs (JSON, Regex, EBNF)
You can specify a JSON schema, regular expression or [EBNF](https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form) to constrain the model output. The model output will be guaranteed to follow the given constraints. Only one constraint parameter (`json_schema`, `regex`, or `ebnf`) can be specified for a request.
SGLang supports two grammar backends:
- [Outlines](https://github.com/dottxt-ai/outlines) (default): Supports JSON schema and regular expression constraints.
- [XGrammar](https://github.com/mlc-ai/xgrammar): Supports JSON schema, regular expression, and EBNF constraints.
- XGrammar currently uses the [GGML BNF format](https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md)
Initialize the XGrammar backend using `--grammar-backend xgrammar` flag
```bash
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
--port 30000 --host 0.0.0.0 --grammar-backend [xgrammar|outlines] # xgrammar or outlines (default: outlines)
```
```python
import json
import requests
json_schema = json.dumps({
"type": "object",
"properties": {
"name": {"type": "string", "pattern": "^[\\w]+$"},
"population": {"type": "integer"},
},
"required": ["name", "population"],
})
# JSON (works with both Outlines and XGrammar)
response = requests.post(
"http://localhost:30000/generate",
json={
"text": "Here is the information of the capital of France in the JSON format.\n",
"sampling_params": {
"temperature": 0,
"max_new_tokens": 64,
"json_schema": json_schema,
},
},
)
print(response.json())
# Regular expression (Outlines backend only)
response = requests.post(
"http://localhost:30000/generate",
json={
"text": "Paris is the capital of",
"sampling_params": {
"temperature": 0,
"max_new_tokens": 64,
"regex": "(France|England)",
},
},
)
print(response.json())
# EBNF (XGrammar backend only)
response = requests.post(
"http://localhost:30000/generate",
json={
"text": "Write a greeting.",
"sampling_params": {
"temperature": 0,
"max_new_tokens": 64,
"ebnf": 'root ::= "Hello" | "Hi" | "Hey"',
},
},
)
print(response.json())
```
### Custom Logit Processor
Launch a server with `--enable-custom-logit-processor` flag on.
```
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --enable-custom-logit-processor
```
Define a custom logit processor that will always sample a specific token id.
```python
from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor
@@ -89,7 +301,6 @@ class DeterministicLogitProcessor(CustomLogitProcessor):
```
Send a request
```python
import requests

View File

@@ -68,7 +68,7 @@ Please consult the documentation below to learn more about the parameters you ma
### API configuration
* `api_key`: Sets an API key for the server and the OpenAI-compatible API.
* `file_storage_pth`: Directory for storing uploaded or generated files from API calls.
* `file_storage_path`: Directory for storing uploaded or generated files from API calls.
* `enable_cache_report`: If set, includes detailed usage of cached tokens in the response usage.
## Parallelism
@@ -162,7 +162,6 @@ Please consult the documentation below to learn more about the parameters you ma
*Note: We recommend to stay with the defaults and only use these options for debugging for best possible performance.*
* `disable_radix_cache`: Disable [Radix](https://lmsys.org/blog/2024-01-17-sglang/) backend for prefix caching.
* `disable_jump_forward`: Disable [jump-forward](https://lmsys.org/blog/2024-02-05-compressed-fsm/#our-method-jump-forward-decoding-with-a-compressed-finite-state-machine) for outlines grammar backend.
* `disable_cuda_graph`: Disable [cuda graph](https://pytorch.org/blog/accelerating-pytorch-with-cuda-graphs/) for model forward. Use if encountering uncorrectable CUDA ECC errors.
* `disable_cuda_graph_padding`: Disable cuda graph when padding is needed. In other case still use cuda graph.
* `disable_outlines_disk_cache`: Disable disk cache for outlines grammar backend.

View File

@@ -47,7 +47,7 @@
"server_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model meta-llama/Llama-2-7b-chat-hf --speculative-algorithm EAGLE \\\n",
" --speculative-draft-model-path lmzheng/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 5 \\\n",
" --speculative-draft-model-path lmsys/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 5 \\\n",
" --speculative-eagle-topk 8 --speculative-num-draft-tokens 64\n",
"\"\"\"\n",
")\n",
@@ -104,7 +104,7 @@
"server_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model meta-llama/Llama-2-7b-chat-hf --speculative-algorithm EAGLE \\\n",
" --speculative-draft-model-path lmzheng/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 5 \\\n",
" --speculative-draft-model-path lmsys/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 5 \\\n",
" --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --mem-fraction 0.6 \\\n",
" --enable-torch-compile --cuda-graph-max-bs 2\n",
"\"\"\"\n",
@@ -175,7 +175,7 @@
"server_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3-8B-Instruct --speculative-algorithm EAGLE \\\n",
" --speculative-draft-model-path lmzheng/sglang-EAGLE-LLaMA3-Instruct-8B --speculative-num-steps 5 \\\n",
" --speculative-draft-model-path lmsys/sglang-EAGLE-LLaMA3-Instruct-8B --speculative-num-steps 5 \\\n",
" --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --speculative-token-map thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt \\\n",
" --mem-fraction 0.7 --cuda-graph-max-bs 2 --dtype float16 \n",
"\"\"\"\n",

View File

@@ -43,4 +43,4 @@ If you want to contribute but dont have a specific idea in mind, pick issues
If you have any questions or want to start a discussion, please feel free to ask in our [Slack channel](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2um0ad92q-LkU19KQTxCGzlCgRiOiQEw).
Thank you for your interest in SGLang—**happy coding**!
Thank you for your interest in SGLang. Happy coding!

View File

@@ -71,7 +71,7 @@ srun --ntasks=2 --nodes=2 --output="SLURM_Logs/%x_%j_node$SLURM_NODEID.out" \
--model-path "$model" \
--grammar-backend "xgrammar" \
--tp "$tp_size" \
--nccl-init-addr "$NCCL_INIT_ADDR" \
--dist-init-addr "$NCCL_INIT_ADDR" \
--nnodes 2 \
--node-rank "$SLURM_NODEID" &

View File

@@ -2,9 +2,10 @@
You can install SGLang using any of the methods below.
For running DeepSeek V3/R1, refer to [DeepSeek V3 Support](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3). It is recommended to use the [latest version](https://pypi.org/project/sglang/#history) and deploy it with [Docker](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#using-docker-recommended) to avoid environment-related problems.
For running DeepSeek V3/R1, refer to [DeepSeek V3 Support](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3). It is recommended to use the latest version and deploy it with [Docker](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#using-docker-recommended) to avoid environment-related issues.
It is recommended to use uv to install the dependencies for faster installation:
We recommend using uv to install the dependencies with a higher installation speed:
## Method 1: With pip or uv
```bash
@@ -13,14 +14,13 @@ pip install uv
uv pip install "sglang[all]>=0.4.3.post2" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python
```
**Quick Fixes to Installation**
**Quick Fixes to Common Problems**
- SGLang currently uses torch 2.5, so you need to install flashinfer for torch 2.5. If you want to install flashinfer separately, please refer to [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html). Please note that the FlashInfer pypi package is called `flashinfer-python` instead of `flashinfer`.
- If you encounter `OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root`, please try either of the following solutions:
1. Use `export CUDA_HOME=/usr/local/cuda-<your-cuda-version>` to set the `CUDA_HOME` environment variable.
2. Install FlashInfer first following [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html), then install SGLang as described above.
- If you encounter `OSError: CUDA_HOME environment variable is not set`, please set it to your CUDA install root with either of the following solutions:
1. Use `export CUDA_HOME=/usr/local/cuda-<your-cuda-version>` to set the `CUDA_HOME` environment variable.
2. Install FlashInfer first following [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html), then install SGLang as described above.
- If you encounter `ImportError: cannot import name 'is_valid_list_of_images' from 'transformers.models.llama.image_processing_llama'`, try to use the specified version of `transformers` in [pyproject.toml](https://github.com/sgl-project/sglang/blob/main/python/pyproject.toml). Currently, just run `pip install transformers==4.48.3`.

View File

@@ -17,7 +17,7 @@ def main():
llm = sgl.Engine(
model_path="meta-llama/Llama-2-7b-chat-hf",
speculative_algorithm="EAGLE",
speculative_draft_model_path="lmzheng/sglang-EAGLE-llama2-chat-7B",
speculative_draft_model_path="lmsys/sglang-EAGLE-llama2-chat-7B",
speculative_num_steps=3,
speculative_eagle_topk=4,
speculative_num_draft_tokens=16,

View File

@@ -52,7 +52,7 @@ srt = [
# HIP (Heterogeneous-computing Interface for Portability) for AMD
# => base docker rocm/vllm-dev:20241022, not from public vllm whl
srt_hip = ["sglang[runtime_common]", "sgl-kernel>=0.0.3.post1", "torch", "vllm==0.6.7.dev2", "outlines==0.1.11"]
srt_hip = ["sglang[runtime_common]", "sgl-kernel==0.0.3.post6", "torch", "vllm==0.6.7.dev2", "outlines==0.1.11"]
# xpu is not enabled in public vllm and torch whl,
# need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.html to install vllm

View File

@@ -12,6 +12,5 @@
- `global_config.py`: The global configs and constants.
- `launch_server.py`: The entry point for launching the local server.
- `llama3_eval.py`: Evaluation of Llama 3 using the Meta Llama dataset.
- `profiler.py`: Profile a running server.
- `utils.py`: Common utilities.
- `version.py`: Version info.

View File

@@ -1 +0,0 @@
raise ValueError("bench_latency.py has been renamed to bench_one_batch.py")

View File

@@ -4,6 +4,13 @@ import os
class GlobalConfig:
"""
Store some global constants.
See also python/sglang/srt/managers/schedule_batch.py::global_server_args_dict, which stores
many global runtime arguments as well.
"""
def __init__(self):
# Verbosity level
# 0: do not output anything

View File

@@ -80,7 +80,6 @@ def create_grammar_backend(server_args: ServerArgs, tokenizer, vocab_size):
grammar_backend = OutlinesGrammarBackend(
tokenizer,
whitespace_pattern=server_args.constrained_json_whitespace_pattern,
allow_jump_forward=not server_args.disable_jump_forward,
)
elif server_args.grammar_backend == "xgrammar":
from sglang.srt.constrained.xgrammar_backend import XGrammarGrammarBackend

View File

@@ -115,7 +115,6 @@ class OutlinesGrammarBackend(BaseGrammarBackend):
self,
tokenizer,
whitespace_pattern: bool,
allow_jump_forward: bool,
):
super().__init__()
@@ -140,7 +139,6 @@ class OutlinesGrammarBackend(BaseGrammarBackend):
self.outlines_tokenizer.vocabulary = (
self.outlines_tokenizer.tokenizer.get_vocab()
)
self.allow_jump_forward = allow_jump_forward
self.whitespace_pattern = whitespace_pattern
def init_value_impl(self, key: Tuple[str, str]) -> OutlinesGrammar:
@@ -172,10 +170,7 @@ class OutlinesGrammarBackend(BaseGrammarBackend):
logger.warning(f"skip invalid regex schema: {regex=}, {e=}")
return None
if self.allow_jump_forward:
jump_forward_map = OutlinesJumpForwardMap(regex)
else:
jump_forward_map = None
jump_forward_map = None
return OutlinesGrammar(guide, jump_forward_map)

View File

@@ -438,8 +438,8 @@ async def configure_logging(obj: ConfigureLoggingReq, request: Request):
return Response(status_code=200)
@app.post("/function_call")
async def function_call_request(obj: ParseFunctionCallReq, request: Request):
@app.post("/parse_function_call")
async def parse_function_call_request(obj: ParseFunctionCallReq, request: Request):
"""
A native API endpoint to parse function calls from a text.
"""
@@ -492,7 +492,7 @@ def available_models():
@app.post("/v1/files")
async def openai_v1_files(file: UploadFile = File(...), purpose: str = Form("batch")):
return await v1_files_create(
file, purpose, _global_state.tokenizer_manager.server_args.file_storage_pth
file, purpose, _global_state.tokenizer_manager.server_args.file_storage_path
)

View File

@@ -4,7 +4,7 @@ from typing import TYPE_CHECKING
import torch
from sglang.srt.layers.attention import AttentionBackend
from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
from sglang.srt.managers.schedule_batch import global_server_args_dict
from sglang.srt.model_executor.forward_batch_info import ForwardBatch

View File

@@ -19,9 +19,8 @@ import triton
import triton.language as tl
from sglang.global_config import global_config
from sglang.srt.layers.attention import AttentionBackend
from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
from sglang.srt.layers.dp_attention import get_attention_tp_size
from sglang.srt.managers.schedule_batch import global_server_args_dict
from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
from sglang.srt.utils import is_flashinfer_available

View File

@@ -15,7 +15,7 @@ from typing import TYPE_CHECKING, Optional, Union
import torch
from sglang.global_config import global_config
from sglang.srt.layers.attention import AttentionBackend
from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
from sglang.srt.layers.attention.flashinfer_backend import (
create_flashinfer_kv_indices_triton,
)
@@ -34,7 +34,6 @@ if is_flashinfer_available():
BatchMLAPagedAttentionWrapper,
BatchPrefillWithRaggedKVCacheWrapper,
)
from flashinfer.cascade import merge_state
@dataclass

View File

@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
import torch
from torch.nn.functional import scaled_dot_product_attention
from sglang.srt.layers.attention import AttentionBackend
from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
if TYPE_CHECKING:

View File

@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, Optional, Union
import torch
import triton
from sglang.srt.layers.attention import AttentionBackend
from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
from sglang.srt.layers.attention.flashinfer_backend import (
create_flashinfer_kv_indices_triton,
)

View File

@@ -12,7 +12,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
requantize_with_max_scale,
)
from sglang.srt.layers.attention import AttentionBackend
from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
from sglang.srt.layers.linear import LinearBase, LinearMethodBase
from sglang.srt.layers.parameter import ModelWeightParameter, PerTensorScaleParameter
from sglang.srt.layers.quantization.base_config import (

View File

@@ -57,7 +57,6 @@ DETOKENIZER_MAX_STATES = int(os.environ.get("SGLANG_DETOKENIZER_MAX_STATES", 1 <
class DecodeStatus:
"""Store the status of incremental decoding."""
vid: int
decoded_text: str
decode_ids: List[int]
surr_offset: int
@@ -143,10 +142,8 @@ class DetokenizerManager:
read_ids, surr_ids = [], []
for i in range(bs):
rid = recv_obj.rids[i]
vid = recv_obj.vids[i]
if rid not in self.decode_status or self.decode_status[rid].vid != vid:
if rid not in self.decode_status:
s = DecodeStatus(
vid=vid,
decoded_text=recv_obj.decoded_texts[i],
decode_ids=recv_obj.decode_ids[i],
surr_offset=0,

View File

@@ -376,8 +376,6 @@ class BatchTokenIDOut:
# The finish reason
finished_reasons: List[BaseFinishReason]
# For incremental decoding
# The version id to sync decode status with in detokenizer_manager
vids: List[int]
decoded_texts: List[str]
decode_ids: List[int]
read_offsets: List[int]

View File

@@ -296,7 +296,6 @@ class Req:
# 1: surr_offset
# 2: read_offset
# 3: last token
self.vid = 0 # version id to sync decode status with in detokenizer_manager
self.surr_offset = None # Surrounding offset to defeat the cleanup algorithm
self.read_offset = None
self.decoded_text = ""
@@ -357,11 +356,6 @@ class Req:
) = None
self.hidden_states = []
# Logprobs (internal values)
# The tokens is prefilled but need to be considered as decode tokens
# and should be updated for the decode logprobs
self.last_update_decode_tokens = 0
# Embedding (return values)
self.embedding = None
@@ -500,68 +494,6 @@ class Req:
self.finished_reason = FINISH_MATCHED_STR(matched=stop_str)
return
def jump_forward_and_retokenize(self, jump_forward_str, next_state):
if self.origin_input_text is None:
# Recovering text can only use unpadded ids
self.origin_input_text = self.tokenizer.decode(
self.origin_input_ids_unpadded
)
all_text = self.origin_input_text + self.decoded_text + jump_forward_str
all_ids = self.tokenizer.encode(all_text)
if not all_ids:
logger.warning("Encoded all_text resulted in empty all_ids")
return False
prompt_tokens = len(self.origin_input_ids_unpadded)
if prompt_tokens > len(all_ids):
logger.warning("prompt_tokens is larger than encoded all_ids")
return False
if all_ids[prompt_tokens - 1] != self.origin_input_ids_unpadded[-1]:
# TODO(lsyin): fix token fusion
logger.warning(
"Token fusion between input and output, try to avoid this by removing the space at the end of the input."
)
return False
old_output_ids = self.output_ids
self.output_ids = all_ids[prompt_tokens:]
self.decoded_text = self.decoded_text + jump_forward_str
self.surr_offset = prompt_tokens
self.read_offset = len(all_ids)
# NOTE: A trick to reduce the surrouding tokens decoding overhead
for i in range(0, INIT_INCREMENTAL_DETOKENIZATION_OFFSET):
surr_text_ = self.tokenizer.decode(
all_ids[self.read_offset - i : self.read_offset]
)
if not surr_text_.endswith("�"):
self.surr_offset = self.read_offset - i
break
# update the inner state of the grammar
self.grammar.jump_and_retokenize(old_output_ids, self.output_ids, next_state)
if self.return_logprob:
# For fast-forward part's logprobs
k = 0
for i, old_id in enumerate(old_output_ids):
if old_id == self.output_ids[i]:
k = k + 1
else:
break
self.output_token_logprobs_val = self.output_token_logprobs_val[:k]
self.output_token_logprobs_idx = self.output_token_logprobs_idx[:k]
self.output_top_logprobs_val = self.output_top_logprobs_val[:k]
self.output_top_logprobs_idx = self.output_top_logprobs_idx[:k]
self.output_token_ids_logprobs_val = self.output_token_ids_logprobs_val[:k]
self.output_token_ids_logprobs_idx = self.output_token_ids_logprobs_idx[:k]
self.logprob_start_len = prompt_tokens + k
self.last_update_decode_tokens = len(self.output_ids) - k
return True
def reset_for_retract(self):
self.prefix_indices = []
self.last_node = None
@@ -574,8 +506,6 @@ class Req:
self.is_chunked = 0
self.req_pool_idx = None
self.last_update_decode_tokens = 0
def __repr__(self):
return (
f"Req(rid={self.rid}, "
@@ -672,7 +602,6 @@ class ScheduleBatch:
enable_overlap: bool,
spec_algorithm: SpeculativeAlgorithm,
enable_custom_logit_processor: bool,
return_hidden_states: bool = False,
):
return cls(
reqs=reqs,
@@ -687,7 +616,7 @@ class ScheduleBatch:
device=req_to_token_pool.device,
spec_algorithm=spec_algorithm,
enable_custom_logit_processor=enable_custom_logit_processor,
return_hidden_states=return_hidden_states,
return_hidden_states=any(req.return_hidden_states for req in reqs),
)
def batch_size(self):
@@ -1091,59 +1020,6 @@ class ScheduleBatch:
return retracted_reqs, new_estimate_ratio
def check_for_jump_forward(self, pad_input_ids_func):
jump_forward_reqs = []
keep_indices = set(i for i in range(len(self.reqs)))
for i, req in enumerate(self.reqs):
if req.grammar is not None:
jump_helper = req.grammar.try_jump_forward(req.tokenizer)
if jump_helper:
suffix_ids, _ = jump_helper
# Current ids, for cache and revert
cur_all_ids = tuple(req.origin_input_ids + req.output_ids)[:-1]
cur_output_ids = req.output_ids
req.output_ids.extend(suffix_ids)
decode_res, new_text = req.get_next_inc_detokenization()
if not decode_res:
req.output_ids = cur_output_ids
continue
(
jump_forward_str,
next_state,
) = req.grammar.jump_forward_str_state(jump_helper)
# Make the incrementally decoded text part of jump_forward_str
# so that the UTF-8 will not corrupt
jump_forward_str = new_text + jump_forward_str
if not req.jump_forward_and_retokenize(
jump_forward_str, next_state
):
req.output_ids = cur_output_ids
continue
# The decode status has diverged from detokenizer_manager
req.vid += 1
# insert the old request into tree_cache
self.tree_cache.cache_finished_req(req, cur_all_ids)
# re-applying image padding
if req.image_inputs is not None:
req.origin_input_ids = pad_input_ids_func(
req.origin_input_ids_unpadded, req.image_inputs
)
jump_forward_reqs.append(req)
keep_indices.remove(i)
self.filter_batch(keep_indices=list(keep_indices))
return jump_forward_reqs
def prepare_encoder_info_decode(self):
# Reset the encoder cached status
self.encoder_cached = [True] * len(self.reqs)

View File

@@ -150,7 +150,6 @@ class Scheduler:
self.tp_rank = tp_rank
self.tp_size = server_args.tp_size
self.schedule_policy = server_args.schedule_policy
self.disable_jump_forward = server_args.disable_jump_forward
self.lora_paths = server_args.lora_paths
self.max_loras_per_batch = server_args.max_loras_per_batch
self.enable_overlap = not server_args.disable_overlap_schedule
@@ -251,9 +250,6 @@ class Scheduler:
self.enable_overlap = False
logger.info("Overlap scheduler is disabled for multimodal models.")
if self.enable_overlap:
self.disable_jump_forward = True
# Launch a tensor parallel worker
if self.enable_overlap:
TpWorkerClass = TpModelWorkerClient
@@ -1024,11 +1020,8 @@ class Scheduler:
if self.running_batch is not None
else set([])
)
return_hidden_states = False
# Get requests from the waiting queue to a new prefill batch
for req in self.waiting_queue:
if req.return_hidden_states:
return_hidden_states = True
if (
self.lora_paths
and len(
@@ -1114,7 +1107,6 @@ class Scheduler:
self.enable_overlap,
self.spec_algorithm,
self.server_args.enable_custom_logit_processor,
return_hidden_states,
)
new_batch.prepare_for_extend()
@@ -1168,14 +1160,6 @@ class Scheduler:
self.min_new_token_ratio,
)
# Check for jump-forward
if not self.disable_jump_forward and batch.has_grammar:
jump_forward_reqs = batch.check_for_jump_forward(self.pad_input_ids_func)
self._extend_requests_to_queue(jump_forward_reqs)
if batch.is_empty():
self.batch_is_full = False
return None
if batch.batch_size() < initial_bs:
self.batch_is_full = False
@@ -1530,8 +1514,6 @@ class Scheduler:
prefill (e.g., computing input token logprobs).
"""
assert output.input_token_logprobs is not None
# It is for jump decoding that will be deprecated.
assert req.last_update_decode_tokens == 0
if req.input_token_logprobs is None:
req.input_token_logprobs = []
if req.temp_input_top_logprobs_val is None:
@@ -1658,50 +1640,12 @@ class Scheduler:
self.add_input_logprob_return_values(
i, req, output, pt, num_input_logprobs, last_prefill_chunk=True
)
if req.last_update_decode_tokens != 0:
# Some decode tokens are re-computed in an extend batch
req.output_token_logprobs_val.extend(
output.input_token_logprobs[
pt
+ num_input_logprobs
- 1
- req.last_update_decode_tokens : pt
+ num_input_logprobs
- 1
],
)
req.output_token_logprobs_idx.extend(
req.fill_ids[
len(req.fill_ids)
- req.last_update_decode_tokens : len(req.fill_ids)
]
)
if req.top_logprobs_num > 0:
if req.last_update_decode_tokens != 0:
req.output_top_logprobs_val.extend(
output.input_top_logprobs_val[i][-req.last_update_decode_tokens :]
)
req.output_top_logprobs_idx.extend(
output.input_top_logprobs_idx[i][-req.last_update_decode_tokens :]
)
req.output_top_logprobs_val.append(output.next_token_top_logprobs_val[i])
req.output_top_logprobs_idx.append(output.next_token_top_logprobs_idx[i])
if req.token_ids_logprob is not None:
if req.last_update_decode_tokens != 0:
req.output_token_ids_logprobs_val.extend(
output.input_token_ids_logprobs_val[i][
-req.last_update_decode_tokens :
]
)
req.output_token_ids_logprobs_idx.extend(
output.input_token_ids_logprobs_idx[i][
-req.last_update_decode_tokens :
]
)
req.output_token_ids_logprobs_val.append(
output.next_token_token_ids_logprobs_val[i]
)
@@ -1719,7 +1663,6 @@ class Scheduler:
finished_reasons: List[BaseFinishReason] = []
if self.is_generation:
vids = []
decoded_texts = []
decode_ids_list = []
read_offsets = []
@@ -1786,7 +1729,6 @@ class Scheduler:
finished_reasons.append(
req.finished_reason.to_json() if req.finished_reason else None
)
vids.append(req.vid)
decoded_texts.append(req.decoded_text)
decode_ids, read_offset = req.init_incremental_detokenize()
decode_ids_list.append(decode_ids)
@@ -1842,7 +1784,6 @@ class Scheduler:
BatchTokenIDOut(
rids,
finished_reasons,
vids,
decoded_texts,
decode_ids_list,
read_offsets,

View File

@@ -41,7 +41,7 @@ from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
from sglang.srt.utils import get_compiler_backend
if TYPE_CHECKING:
from sglang.srt.layers.attention import AttentionBackend
from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
from sglang.srt.managers.schedule_batch import ImageInputs, ModelWorkerBatch
from sglang.srt.mem_cache.memory_pool import BaseTokenToKVPool, ReqToTokenPool
from sglang.srt.model_executor.model_runner import ModelRunner

View File

@@ -26,8 +26,6 @@ from fastapi import HTTPException, Request, UploadFile
from fastapi.responses import ORJSONResponse, StreamingResponse
from pydantic import ValidationError
from sglang.lang.chat_template import get_chat_template_by_model_path
try:
from outlines.fsm.json_schema import convert_json_schema_to_str
except ImportError:
@@ -165,24 +163,19 @@ def load_chat_template_for_openai_api(tokenizer_manager, chat_template_arg, mode
else:
chat_template_name = chat_template_arg
# check chat-template
chat_template = get_chat_template_by_model_path(model_path)
if chat_template is not None:
official_chat_template = chat_template.name
used_chat_template = chat_template_name
if official_chat_template != used_chat_template:
logger.warning(
f"Using a chat_template: '{used_chat_template}', "
f"which is different from official chat template: '{official_chat_template}', "
f"This discrepancy may lead to performance degradation."
)
# Check chat-template
# TODO:
# 1. Do not import any code from sglang.lang
# 2. For VLM, when chat_template_arg is None, set it automatically by guessing from model_path.
async def v1_files_create(file: UploadFile, purpose: str, file_storage_pth: str = None):
async def v1_files_create(
file: UploadFile, purpose: str, file_storage_path: str = None
):
try:
global storage_dir
if file_storage_pth:
storage_dir = file_storage_pth
if file_storage_path:
storage_dir = file_storage_path
# Read the file content
file_content = await file.read()

View File

@@ -40,17 +40,23 @@ class SamplingParams:
presence_penalty: float = 0.0,
repetition_penalty: float = 1.0,
min_new_tokens: int = 0,
spaces_between_special_tokens: bool = True,
n: int = 1,
json_schema: Optional[str] = None,
regex: Optional[str] = None,
ebnf: Optional[str] = None,
structural_tag: Optional[str] = None,
no_stop_trim: bool = False,
ignore_eos: bool = False,
skip_special_tokens: bool = True,
spaces_between_special_tokens: bool = True,
no_stop_trim: bool = False,
custom_params: Optional[Dict[str, Any]] = None,
) -> None:
self.max_new_tokens = max_new_tokens
self.stop_strs = stop
if stop_token_ids:
self.stop_token_ids = set(stop_token_ids)
else:
self.stop_token_ids = None
self.temperature = temperature
self.top_p = top_p
self.top_k = top_k
@@ -58,26 +64,21 @@ class SamplingParams:
self.frequency_penalty = frequency_penalty
self.presence_penalty = presence_penalty
self.repetition_penalty = repetition_penalty
self.stop_strs = stop
if stop_token_ids:
self.stop_token_ids = set(stop_token_ids)
else:
self.stop_token_ids = None
self.max_new_tokens = max_new_tokens
self.min_new_tokens = min_new_tokens
self.ignore_eos = ignore_eos
self.skip_special_tokens = skip_special_tokens
self.spaces_between_special_tokens = spaces_between_special_tokens
self.regex = regex
self.n = n
self.json_schema = json_schema
self.ebnf = ebnf
self.structural_tag = structural_tag
self.ignore_eos = ignore_eos
self.skip_special_tokens = skip_special_tokens
self.spaces_between_special_tokens = spaces_between_special_tokens
self.no_stop_trim = no_stop_trim
self.custom_params = custom_params
# Process some special cases
if self.temperature < _SAMPLING_EPS:
# top_k = 1 means greedy sampling
self.temperature = 1.0
self.top_k = 1
if self.top_k == -1:

View File

@@ -15,21 +15,15 @@
import argparse
import dataclasses
import json
import logging
import os
import random
import subprocess
import tempfile
import uuid
from pathlib import Path
from typing import List, Optional
import torch
from sglang.srt.hf_transformers_utils import check_gguf_file
from sglang.srt.utils import (
create_checksum,
get_amdgpu_memory_capacity,
get_hpu_memory_capacity,
get_nvgpu_memory_capacity,
@@ -101,7 +95,7 @@ class ServerArgs:
# API related
api_key: Optional[str] = None
file_storage_pth: str = "sglang_storage"
file_storage_path: str = "sglang_storage"
enable_cache_report: bool = False
# Data parallelism
@@ -149,7 +143,6 @@ class ServerArgs:
# Optimization/debug options
disable_radix_cache: bool = False
disable_jump_forward: bool = False
disable_cuda_graph: bool = False
disable_cuda_graph_padding: bool = False
enable_nccl_nvls: bool = False
@@ -627,9 +620,9 @@ class ServerArgs:
help="Set API key of the server. It is also used in the OpenAI API compatible server.",
)
parser.add_argument(
"--file-storage-pth",
"--file-storage-path",
type=str,
default=ServerArgs.file_storage_pth,
default=ServerArgs.file_storage_path,
help="The path of the file storage in backend.",
)
parser.add_argument(
@@ -836,11 +829,6 @@ class ServerArgs:
action="store_true",
help="Disable RadixAttention for prefix caching.",
)
parser.add_argument(
"--disable-jump-forward",
action="store_true",
help="Disable jump-forward for grammar-guided decoding.",
)
parser.add_argument(
"--disable-cuda-graph",
action="store_true",

View File

@@ -44,7 +44,7 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmzheng/sglang-EAGLE-llama2-chat-7B"
DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
def is_in_ci():

View File

@@ -1,7 +1,7 @@
"""
Usage:
# single GPU
python3 bench_speculative.py --model-path meta-llama/Llama-2-7b-chat-hf --speculative-draft-model-path lmzheng/sglang-EAGLE-llama2-chat-7B
python3 bench_speculative.py --model-path meta-llama/Llama-2-7b-chat-hf --speculative-draft-model-path lmsys/sglang-EAGLE-llama2-chat-7B
"""
import argparse

View File

@@ -17,3 +17,59 @@ For CUDA 12.1 or CUDA 12.4:
```bash
pip3 install sgl-kernel
```
# Developer Guide
## Development Environment Setup
Use Docker to set up the development environment. See [Docker setup guide](https://github.com/sgl-project/sglang/blob/main/docs/developer/development_guide_using_docker.md#setup-docker-container).
Create and enter development container:
```bash
docker run -itd --shm-size 32g --gpus all -v $HOME/.cache:/root/.cache --ipc=host --name sglang_zhyncs lmsysorg/sglang:dev /bin/zsh
docker exec -it sglang_zhyncs /bin/zsh
```
## Project Structure
### Dependencies
Third-party libraries:
- [CCCL](https://github.com/NVIDIA/cccl)
- [CUTLASS](https://github.com/NVIDIA/cutlass)
- [FlashInfer](https://github.com/flashinfer-ai/flashinfer)
- [TurboMind](https://github.com/InternLM/turbomind)
### Kernel Development
Steps to add a new kernel:
1. Implement in [src/sgl-kernel/csrc/](https://github.com/sgl-project/sglang/tree/main/sgl-kernel/src/sgl-kernel/csrc)
2. Expose interface in [src/sgl-kernel/include/sgl_kernels_ops.h](https://github.com/sgl-project/sglang/blob/main/sgl-kernel/src/sgl-kernel/include/sgl_kernels_ops.h)
3. Create torch extension in [src/sgl-kernel/torch_extension.cc](https://github.com/sgl-project/sglang/blob/main/sgl-kernel/src/sgl-kernel/torch_extension.cc)
4. Create Python wrapper in [src/sgl-kernel/ops/\_\_init\_\_.py](https://github.com/sgl-project/sglang/blob/main/sgl-kernel/src/sgl-kernel/ops/__init__.py)
5. Expose Python interface in [src/sgl-kernel/\_\_init\_\_.py](https://github.com/sgl-project/sglang/blob/main/sgl-kernel/src/sgl-kernel/__init__.py)
6. Update [setup.py](https://github.com/sgl-project/sglang/blob/main/sgl-kernel/setup.py) to include new CUDA source
### Build & Install
Development build:
```bash
make build
```
Note:
The `sgl-kernel` is rapidly evolving. If you experience a compilation failure, try using `make rebuild`.
### Testing & Benchmarking
1. Add pytest tests in [tests/](https://github.com/sgl-project/sglang/tree/main/sgl-kernel/tests)
2. Add benchmarks using [triton benchmark](https://triton-lang.org/main/python-api/generated/triton.testing.Benchmark.html) in [benchmark/](https://github.com/sgl-project/sglang/tree/main/sgl-kernel/benchmark)
3. Run test suite
### Release new version
Update version in [pyproject.toml](https://github.com/sgl-project/sglang/blob/main/sgl-kernel/pyproject.toml) and [version.py](https://github.com/sgl-project/sglang/blob/main/sgl-kernel/src/sgl-kernel/version.py)

View File

@@ -1,55 +0,0 @@
# Developer Guide for sgl-kernel
## Development Environment Setup
Use Docker to set up the development environment. See [Docker setup guide](https://github.com/sgl-project/sglang/blob/main/docs/developer/development_guide_using_docker.md#setup-docker-container).
Create and enter development container:
```bash
docker run -itd --shm-size 32g --gpus all -v $HOME/.cache:/root/.cache --ipc=host --name sglang_zhyncs lmsysorg/sglang:dev /bin/zsh
docker exec -it sglang_zhyncs /bin/zsh
```
## Project Structure
### Dependencies
Third-party libraries:
- [CCCL](https://github.com/NVIDIA/cccl)
- [CUTLASS](https://github.com/NVIDIA/cutlass)
- [FlashInfer](https://github.com/flashinfer-ai/flashinfer)
- [TurboMind](https://github.com/InternLM/turbomind)
### Kernel Development
Steps to add a new kernel:
1. Implement in [src/sgl-kernel/csrc/](https://github.com/sgl-project/sglang/tree/main/sgl-kernel/src/sgl-kernel/csrc)
2. Expose interface in [src/sgl-kernel/include/sgl_kernels_ops.h](https://github.com/sgl-project/sglang/blob/main/sgl-kernel/src/sgl-kernel/include/sgl_kernels_ops.h)
3. Create torch extension in [src/sgl-kernel/torch_extension.cc](https://github.com/sgl-project/sglang/blob/main/sgl-kernel/src/sgl-kernel/torch_extension.cc)
4. Create Python wrapper in [src/sgl-kernel/ops/\_\_init\_\_.py](https://github.com/sgl-project/sglang/blob/main/sgl-kernel/src/sgl-kernel/ops/__init__.py)
5. Expose Python interface in [src/sgl-kernel/\_\_init\_\_.py](https://github.com/sgl-project/sglang/blob/main/sgl-kernel/src/sgl-kernel/__init__.py)
6. Update [setup.py](https://github.com/sgl-project/sglang/blob/main/sgl-kernel/setup.py) to include new CUDA source
### Build & Install
Development build:
```bash
make build
```
Note:
The `sgl-kernel` is rapidly evolving. If you experience a compilation failure, try using `make rebuild`.
### Testing & Benchmarking
1. Add pytest tests in [tests/](https://github.com/sgl-project/sglang/tree/main/sgl-kernel/tests)
2. Add benchmarks using [triton benchmark](https://triton-lang.org/main/python-api/generated/triton.testing.Benchmark.html) in [benchmark/](https://github.com/sgl-project/sglang/tree/main/sgl-kernel/benchmark)
3. Run test suite
### Release new version
Update version in [pyproject.toml](https://github.com/sgl-project/sglang/blob/main/sgl-kernel/pyproject.toml) and [version.py](https://github.com/sgl-project/sglang/blob/main/sgl-kernel/src/sgl-kernel/version.py)

View File

@@ -100,6 +100,7 @@ sources = [
"src/sgl-kernel/csrc/activation/fused_add_rms_norm_kernel.cu",
"src/sgl-kernel/csrc/allreduce/trt_reduce_internal.cu",
"src/sgl-kernel/csrc/allreduce/trt_reduce_kernel.cu",
"src/sgl-kernel/csrc/attention/lightning_attention_decode_kernel.cu",
"src/sgl-kernel/csrc/gemm/cublas_grouped_gemm.cu",
"src/sgl-kernel/csrc/gemm/fp8_gemm_kernel.cu",
"src/sgl-kernel/csrc/gemm/fp8_blockwise_gemm_kernel.cu",
@@ -108,7 +109,6 @@ sources = [
"src/sgl-kernel/csrc/moe/moe_align_kernel.cu",
"src/sgl-kernel/csrc/speculative/eagle_utils.cu",
"src/sgl-kernel/csrc/speculative/speculative_sampling.cu",
"src/sgl-kernel/csrc/lightning_attention_decode_kernel.cu",
"3rdparty/flashinfer/csrc/activation.cu",
"3rdparty/flashinfer/csrc/bmm_fp8.cu",
"3rdparty/flashinfer/csrc/norm.cu",

View File

@@ -62,6 +62,11 @@ TORCH_LIBRARY_EXPAND(sgl_kernels, m) {
m.def("register_graph_buffers(int fa, int[][] handles, int[][] offsets) -> ()");
m.impl("register_graph_buffers", torch::kCUDA, &register_graph_buffers);
/*
* From csrc/attention
*/
m.impl("lightning_attention_decode", torch::kCUDA, &lightning_attention_decode);
/*
* From csrc/gemm
*/
@@ -163,11 +168,6 @@ TORCH_LIBRARY_EXPAND(sgl_kernels, m) {
"apply_rope_pos_ids_cos_sin_cache(Tensor q, Tensor k, Tensor! q_rope, Tensor! k_rope, Tensor cos_sin_cache, "
"Tensor pos_ids, bool interleave, int cuda_stream) -> ()");
m.impl("apply_rope_pos_ids_cos_sin_cache", torch::kCUDA, &apply_rope_pos_ids_cos_sin_cache);
/*
* Other
*/
m.impl("lightning_attention_decode", torch::kCUDA, &lightning_attention_decode);
}
REGISTER_EXTENSION(_kernels)

View File

@@ -46,7 +46,6 @@ class TestEBNFConstrained(unittest.TestCase):
@classmethod
def setUpClass(cls):
setup_class(cls, "xgrammar", disable_overlap=False)
cls.check_jump_forward = False
@classmethod
def tearDownClass(cls):
@@ -238,12 +237,5 @@ class TestEBNFConstrained(unittest.TestCase):
)
class TestEBNFConstrainedLLGuidance(TestEBNFConstrained):
@classmethod
def setUpClass(cls):
setup_class(cls, "llguidance", disable_overlap=False)
cls.check_jump_forward = False
if __name__ == "__main__":
unittest.main()

View File

@@ -57,7 +57,6 @@ class TestJSONConstrainedOutlinesBackend(unittest.TestCase):
@classmethod
def setUpClass(cls):
setup_class(cls, backend="outlines", disable_overlap=False)
cls.check_jump_forward = False
@classmethod
def tearDownClass(cls):
@@ -134,26 +133,5 @@ class TestJSONConstrainedOutlinesBackend(unittest.TestCase):
list(executor.map(self.run_decode, json_schemas))
class TestJumpForwardOutlinesBackend(unittest.TestCase):
@classmethod
def setUpClass(cls):
setup_class(cls, backend="outlines", disable_overlap=True)
cls.check_jump_forward = True
class TestJSONConstrainedXGrammarBackend(TestJSONConstrainedOutlinesBackend):
@classmethod
def setUpClass(cls):
setup_class(cls, backend="xgrammar", disable_overlap=False)
cls.check_jump_forward = False
class TestJSONConstrainedLLGuidanceBackend(TestJSONConstrainedOutlinesBackend):
@classmethod
def setUpClass(cls):
setup_class(cls, backend="llguidance", disable_overlap=False)
cls.check_jump_forward = False
if __name__ == "__main__":
unittest.main()

View File

@@ -12,7 +12,9 @@ from sglang.test.test_utils import (
DEFAULT_MOE_MODEL_NAME_FOR_TEST,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
is_in_ci,
popen_launch_server,
write_github_step_summary,
)
@@ -49,6 +51,9 @@ class TestMoEEvalAccuracyLarge(unittest.TestCase):
metrics = run_eval(args)
self.assertGreater(metrics["score"], 0.62)
if is_in_ci():
write_github_step_summary(f"### test_mmlu\n" f'{metrics["score"]=:.4f}\n')
def test_human_eval(self):
args = SimpleNamespace(
base_url=self.base_url,
@@ -61,6 +66,11 @@ class TestMoEEvalAccuracyLarge(unittest.TestCase):
metrics = run_eval(args)
self.assertGreater(metrics["score"], 0.40)
if is_in_ci():
write_github_step_summary(
f"### test_human_eval\n" f'{metrics["score"]=:.4f}\n'
)
def test_mgsm_en(self):
args = SimpleNamespace(
base_url=self.base_url,
@@ -73,6 +83,11 @@ class TestMoEEvalAccuracyLarge(unittest.TestCase):
metrics = run_eval(args)
self.assertGreater(metrics["score"], 0.61)
if is_in_ci():
write_github_step_summary(
f"### test_mgsm_en\n" f'{metrics["score"]=:.4f}\n'
)
if __name__ == "__main__":
unittest.main()

View File

@@ -3,8 +3,6 @@ python3 -m unittest test_regex_constrained.TestRegexConstrained.test_regex_gener
python3 -m unittest test_regex_constrained.TestRegexConstrained.test_regex_generate_greeting
python3 -m unittest test_regex_constrained.TestRegexConstrainedLLGuidance.test_regex_generate_email
python3 -m unittest test_regex_constrained.TestRegexConstrainedLLGuidance.test_regex_generate_greeting
python3 -m unittest test_regex_constrained.TestJumpForwardLLGuidance.test_regex_generate_email
python3 -m unittest test_regex_constrained.TestJumpForwardLLGuidance.test_regex_generate_greeting
"""
import json
@@ -47,7 +45,6 @@ class TestRegexConstrained(unittest.TestCase):
@classmethod
def setUpClass(cls):
setup_class(cls, "xgrammar", disable_overlap=False)
cls.check_jump_forward = False
@classmethod
def tearDownClass(cls):
@@ -179,20 +176,6 @@ class TestRegexConstrained(unittest.TestCase):
)
class TestJumpForward(TestRegexConstrained):
@classmethod
def setUpClass(cls):
setup_class(cls, "xgrammar", disable_overlap=True)
cls.check_jump_forward = True
class TestJumpForwardLLGuidance(TestRegexConstrained):
@classmethod
def setUpClass(cls):
setup_class(cls, "llguidance", disable_overlap=True)
cls.check_jump_forward = True
class TestRegexConstrainedLLGuidance(TestRegexConstrained):
@classmethod
def setUpClass(cls):