diff --git a/README.md b/README.md
index f9034587d..7e895b862 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 
 The core features include:
-- **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, continuous batching, token attention (paged attention), tensor parallelism, flashinfer kernels, jump-forward constrained decoding, and quantization (AWQ/FP8/GPTQ/Marlin).
+- **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, flashinfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
 - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
 
 ## News
@@ -129,7 +129,7 @@ response = client.chat.completions.create(
 print(response)
 ```
 
-It supports streaming and most features of the Chat/Completions/Models endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+It supports streaming, vision, and most features of the Chat/Completions/Models endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
 
 ### Additional Server Arguments
 - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
diff --git a/docs/sampling_params.md b/docs/sampling_params.md
index 745f20823..e9b3140e3 100644
--- a/docs/sampling_params.md
+++ b/docs/sampling_params.md
@@ -8,23 +8,24 @@ The `/generate` endpoint accepts the following arguments in the JSON format.
 class GenerateReqInput:
     # The input prompt. It can be a single prompt or a batch of prompts.
     text: Union[List[str], str]
-    # The token ids for text; one can either specify text or input_ids
+    # The token ids for text; one can either specify text or input_ids.
     input_ids: Optional[Union[List[List[int]], List[int]]] = None
-    # The image input. It can be a file name.
+    # The image input. It can be a file name, a url, or base64 encoded string.
+    # See also python/sglang/srt/utils.py:load_image.
     image_data: Optional[Union[List[str], str]] = None
-    # The sampling_params
+    # The sampling_params.
     sampling_params: Union[List[Dict], Dict] = None
-    # The request id
+    # The request id.
     rid: Optional[Union[List[str], str]] = None
-    # Whether to return logprobs
+    # Whether to return logprobs.
     return_logprob: Optional[Union[List[bool], bool]] = None
-    # The start location of the prompt for return_logprob
+    # The start location of the prompt for return_logprob.
     logprob_start_len: Optional[Union[List[int], int]] = None
-    # The number of top logprobs to return
+    # The number of top logprobs to return.
     top_logprobs_num: Optional[Union[List[int], int]] = None
-    # Whether to detokenize tokens in logprobs
+    # Whether to detokenize tokens in logprobs.
     return_text_in_logprobs: bool = False
-    # Whether to stream output
+    # Whether to stream output.
     stream: bool = False
 ```
 
@@ -48,13 +49,19 @@ class SamplingParams:
     ) -> None:
 ```
 
+- `max_new_tokens`, `stop`, `temperature`, `top_p`, `top_k` are common sampling parameters.
+- `ignore_eos` means ignoring the EOS token and continuing to decode, which is helpful for benchmarking purposes.
+- `regex` constrains the output to follow a given regular expression.
+
 ## Examples
 
 ### Normal
+Launch a server
 ```
-python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000
 ```
 
+Send a request
 ```python
 import requests
 
@@ -72,7 +79,7 @@ print(response.json())
 ```
 
 ### Streaming
-
+Send a request and stream the output
 ```python
 import requests, json
 
@@ -104,4 +111,32 @@ print("")
 
 ### Multi modal
 
-See [test_httpserver_llava.py](../test/srt/test_httpserver_llava.py).
+Launch a server
+```
+python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000
+```
+
+Download an image
+```
+curl -o example_image.png -L https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true
+```
+
+```python
+import requests
+
+response = requests.post(
+    "http://localhost:30000/generate",
+    json={
+        "text": "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: <image>\nDescribe this picture ASSISTANT:",
+        "image_data": "example_image.png",
+        "sampling_params": {
+            "temperature": 0,
+            "max_new_tokens": 32,
+        },
+    },
+)
+print(response.json())
+```
+
+The `image_data` can be a file name, a URL, or a base64 encoded string. See also `python/sglang/srt/utils.py:load_image`.
+Streaming is supported in a similar manner to [above](#streaming).
\ No newline at end of file
diff --git a/python/sglang/README.md b/python/sglang/README.md
index 0b01ec1df..38cfb5a3b 100644
--- a/python/sglang/README.md
+++ b/python/sglang/README.md
@@ -4,7 +4,8 @@
 - `srt`: The backend engine for running local models. (SRT = SGLang Runtime).
 - `test`: Test utilities.
 - `api.py`: Public API.
-- `bench_latency.py`: Benchmark utilities.
+- `bench_latency.py`: Benchmark a single static batch.
+- `bench_serving.py`: Benchmark online serving with dynamic requests.
 - `global_config.py`: The global configs and constants.
 - `launch_server.py`: The entry point of launching local server.
 - `utils.py`: Common utilities.
diff --git a/python/sglang/check_env.py b/python/sglang/check_env.py
index 957fb2852..1b5828c46 100644
--- a/python/sglang/check_env.py
+++ b/python/sglang/check_env.py
@@ -1,3 +1,5 @@
+"""Check environment configurations and dependency versions."""
+
 import importlib
 import os
 import resource
diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py
index f0240f6dc..d1ad4b097 100644
--- a/python/sglang/srt/managers/io_struct.py
+++ b/python/sglang/srt/managers/io_struct.py
@@ -13,25 +13,26 @@ from sglang.srt.sampling_params import SamplingParams
 
 @dataclass
 class GenerateReqInput:
-    # The input prompt
-    text: Optional[Union[List[str], str]] = None
-    # The token ids for text; one can either specify text or input_ids
+    # The input prompt. It can be a single prompt or a batch of prompts.
+    text: Union[List[str], str]
+    # The token ids for text; one can either specify text or input_ids.
     input_ids: Optional[Union[List[List[int]], List[int]]] = None
-    # The image input
+    # The image input. It can be a file name, a url, or base64 encoded string.
+    # See also python/sglang/srt/utils.py:load_image.
     image_data: Optional[Union[List[str], str]] = None
-    # The sampling_params
+    # The sampling_params.
     sampling_params: Union[List[Dict], Dict] = None
-    # The request id
+    # The request id.
     rid: Optional[Union[List[str], str]] = None
-    # Whether to return logprobs
+    # Whether to return logprobs.
     return_logprob: Optional[Union[List[bool], bool]] = None
-    # The start location of the prompt for return_logprob
+    # The start location of the prompt for return_logprob.
     logprob_start_len: Optional[Union[List[int], int]] = None
-    # The number of top logprobs to return
+    # The number of top logprobs to return.
     top_logprobs_num: Optional[Union[List[int], int]] = None
-    # Whether to detokenize tokens in logprobs
+    # Whether to detokenize tokens in logprobs.
     return_text_in_logprobs: bool = False
-    # Whether to stream output
+    # Whether to stream output.
     stream: bool = False
 
     def post_init(self):
diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py
index 522874035..4c1b5f4a4 100644
--- a/python/sglang/srt/server.py
+++ b/python/sglang/srt/server.py
@@ -74,21 +74,6 @@ async def health() -> Response:
     return Response(status_code=200)
 
 
-def get_model_list():
-    """Available models."""
-    model_names = [tokenizer_manager.model_path]
-    return model_names
-
-
-@app.get("/v1/models")
-def available_models():
-    """Show available models."""
-    model_cards = []
-    for model_name in get_model_list():
-        model_cards.append(ModelCard(id=model_name, root=model_name))
-    return ModelList(data=model_cards)
-
-
 @app.get("/get_model_info")
 async def get_model_info():
     result = {
@@ -154,6 +139,16 @@ async def openai_v1_chat_completions(raw_request: Request):
     return await v1_chat_completions(tokenizer_manager, raw_request)
 
 
+@app.get("/v1/models")
+def available_models():
+    """Show available models."""
+    model_names = [tokenizer_manager.model_path]
+    model_cards = []
+    for model_name in model_names:
+        model_cards.append(ModelCard(id=model_name, root=model_name))
+    return ModelList(data=model_cards)
+
+
 def _set_global_server_args(server_args: ServerArgs):
     global global_server_args_dict
     global_server_args_dict = {
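
With the `/v1/models` route registered as above, a quick way to confirm the endpoint is live is to query it directly. This is a minimal sketch, assuming a local server has already been launched on the default `http://localhost:30000` used in the docs examples above:

```python
import requests

# List the models exposed by a running SGLang server (assumes a server was
# started with `python -m sglang.launch_server ... --port 30000`).
response = requests.get("http://localhost:30000/v1/models")
print(response.json())  # one entry per served model path
```

The response mirrors the `ModelList`/`ModelCard` structure built in `available_models` above.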
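
The `regex` option documented in `docs/sampling_params.md` above is passed through `sampling_params` on the `/generate` endpoint. A sketch, assuming the Llama-3 server from the Normal example is running; the prompt and pattern here are illustrative only:

```python
import requests

# Constrain generation with the `regex` sampling parameter
# (hypothetical prompt/pattern; assumes the server from the Normal example).
response = requests.post(
    "http://localhost:30000/generate",
    json={
        "text": "The capital of France is",
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 16,
            "regex": " (Paris|Lyon|Marseille)",
        },
    },
)
print(response.json())
```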