Improve doc strings (#518)
This commit is contained in:
12
README.md
12
README.md
@@ -10,8 +10,8 @@ SGLang is a structured generation language designed for large language models (L
|
|||||||
It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
|
It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
|
||||||
|
|
||||||
The core features include:
|
The core features include:
|
||||||
- **A Flexible Front-End Language**: This allows for easy programming of LLM applications with multiple chained generation calls, advanced prompting techniques, control flow, multiple modalities, parallelism, and external interaction.
|
- **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
|
||||||
- **A High-Performance Runtime with RadixAttention**: This feature significantly accelerates the execution of complex LLM programs by automatically reusing the KV cache across multiple calls. It can also be used as a standalone serving engine with all common techniques implemented, such as continuous batching and tensor parallelism.
|
- **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).
|
||||||
|
|
||||||
## News
|
## News
|
||||||
- [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
|
- [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
|
||||||
@@ -403,10 +403,10 @@ https://github.com/sgl-project/sglang/issues/157
|
|||||||
|
|
||||||
## Citation And Acknowledgment
|
## Citation And Acknowledgment
|
||||||
```
|
```
|
||||||
@misc{zheng2023efficiently,
|
@misc{zheng2024sglang,
|
||||||
title={Efficiently Programming Large Language Models using SGLang},
|
title={SGLang: Efficient Execution of Structured Language Model Programs},
|
||||||
author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Jeff Huang and Chuyue Sun and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
|
author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Chuyue Sun and Jeff Huang and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
|
||||||
year={2023},
|
year={2024},
|
||||||
eprint={2312.07104},
|
eprint={2312.07104},
|
||||||
archivePrefix={arXiv},
|
archivePrefix={arXiv},
|
||||||
primaryClass={cs.AI}
|
primaryClass={cs.AI}
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
"""Some Public API Definitions"""
|
"""Public APIs of the language."""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
"""Launch the inference server."""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
from sglang.srt.server import ServerArgs, launch_server
|
from sglang.srt.server import ServerArgs, launch_server
|
||||||
@@ -8,4 +10,4 @@ if __name__ == "__main__":
|
|||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
server_args = ServerArgs.from_cli_args(args)
|
server_args = ServerArgs.from_cli_args(args)
|
||||||
|
|
||||||
launch_server(server_args, None)
|
launch_server(server_args, None)
|
||||||
@@ -1,3 +1,4 @@
|
|||||||
|
"""Launch the inference server for Llava-video model."""
|
||||||
import argparse
|
import argparse
|
||||||
import multiprocessing as mp
|
import multiprocessing as mp
|
||||||
|
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
"""Cache for the compressed finite state machine."""
|
||||||
from sglang.srt.constrained import RegexFSM, TransformerTokenizer
|
from sglang.srt.constrained import RegexFSM, TransformerTokenizer
|
||||||
from sglang.srt.constrained.base_cache import BaseCache
|
from sglang.srt.constrained.base_cache import BaseCache
|
||||||
|
|
||||||
|
|||||||
@@ -1,3 +1,7 @@
|
|||||||
|
"""
|
||||||
|
Faster constrained decoding.
|
||||||
|
Reference: https://lmsys.org/blog/2024-02-05-compressed-fsm/
|
||||||
|
"""
|
||||||
import interegular
|
import interegular
|
||||||
|
|
||||||
from sglang.srt.constrained import FSMInfo, disk_cache, make_deterministic_fsm
|
from sglang.srt.constrained import FSMInfo, disk_cache, make_deterministic_fsm
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
"""Conversation templates."""
|
||||||
# Adapted from
|
# Adapted from
|
||||||
# https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
|
# https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
|
||||||
import dataclasses
|
import dataclasses
|
||||||
|
|||||||
@@ -1,4 +1,6 @@
|
|||||||
"""
|
"""
|
||||||
|
Flush the KV cache.
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
python3 -m sglang.srt.flush_cache --url http://localhost:30000
|
python3 -m sglang.srt.flush_cache --url http://localhost:30000
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
"""Logits processing."""
|
||||||
import torch
|
import torch
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from vllm.distributed import (
|
from vllm.distributed import (
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
"""Radix attention."""
|
||||||
import torch
|
import torch
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from torch import nn
|
from torch import nn
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
"""A data parallel worker thread."""
|
"""A data parallel worker thread."""
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
import queue
|
import queue
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
"""Meta data for requests and batches"""
|
"""Meta data for requests and batches"""
|
||||||
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from enum import IntEnum, auto
|
from enum import IntEnum, auto
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
"""ModelRunner runs the forward passes of the models."""
|
||||||
import importlib
|
import importlib
|
||||||
import importlib.resources
|
import importlib.resources
|
||||||
import logging
|
import logging
|
||||||
|
|||||||
@@ -1,3 +1,6 @@
|
|||||||
|
"""
|
||||||
|
The radix tree data structure for managing the KV cache.
|
||||||
|
"""
|
||||||
import heapq
|
import heapq
|
||||||
import time
|
import time
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
"""Request scheduler heuristic."""
|
||||||
import random
|
import random
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
"""A tensor parallel worker."""
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
"""DetokenizerManager is a process that detokenizes the token ids."""
|
||||||
import asyncio
|
import asyncio
|
||||||
import inspect
|
import inspect
|
||||||
|
|
||||||
|
|||||||
@@ -1,3 +1,8 @@
|
|||||||
|
"""
|
||||||
|
The definition of objects transferred between different
|
||||||
|
processes (TokenizerManager, DetokenizerManager, Controller).
|
||||||
|
"""
|
||||||
|
|
||||||
import uuid
|
import uuid
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Dict, List, Optional, Union
|
from typing import Dict, List, Optional, Union
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
"""TokenizerManager is a process that tokenizes the text."""
|
||||||
import asyncio
|
import asyncio
|
||||||
import concurrent.futures
|
import concurrent.futures
|
||||||
import dataclasses
|
import dataclasses
|
||||||
@@ -283,7 +284,7 @@ class TokenizerManager:
|
|||||||
req = AbortReq(rid)
|
req = AbortReq(rid)
|
||||||
self.send_to_router.send_pyobj(req)
|
self.send_to_router.send_pyobj(req)
|
||||||
|
|
||||||
def create_abort_task(self, obj):
|
def create_abort_task(self, obj: GenerateReqInput):
|
||||||
# Abort the request if the client is disconnected.
|
# Abort the request if the client is disconnected.
|
||||||
async def abort_request():
|
async def abort_request():
|
||||||
await asyncio.sleep(3)
|
await asyncio.sleep(3)
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
"""pydantic models for OpenAI API protocol"""
|
"""Pydantic models for OpenAI API protocol"""
|
||||||
|
|
||||||
import time
|
import time
|
||||||
from typing import Dict, List, Optional, Union
|
from typing import Dict, List, Optional, Union
|
||||||
|
|||||||
@@ -1,4 +1,7 @@
|
|||||||
"""SRT: SGLang Runtime"""
|
"""
|
||||||
|
The entry point of the inference server.
|
||||||
|
SRT = SGLang Runtime.
|
||||||
|
"""
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import dataclasses
|
import dataclasses
|
||||||
@@ -10,7 +13,7 @@ import sys
|
|||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
from http import HTTPStatus
|
from http import HTTPStatus
|
||||||
from typing import Optional
|
from typing import Optional, Dict
|
||||||
|
|
||||||
# Fix a bug of Python threading
|
# Fix a bug of Python threading
|
||||||
setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
|
setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
|
||||||
@@ -148,7 +151,6 @@ def launch_server(server_args: ServerArgs, pipe_finish_writer, model_overide_arg
|
|||||||
server_args.dp_size,
|
server_args.dp_size,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Init local models port args
|
|
||||||
ports = server_args.additional_ports
|
ports = server_args.additional_ports
|
||||||
tp = server_args.tp_size
|
tp = server_args.tp_size
|
||||||
model_port_args = []
|
model_port_args = []
|
||||||
@@ -269,6 +271,12 @@ def launch_server(server_args: ServerArgs, pipe_finish_writer, model_overide_arg
|
|||||||
|
|
||||||
|
|
||||||
class Runtime:
|
class Runtime:
|
||||||
|
"""
|
||||||
|
A wrapper for the server.
|
||||||
|
This is used for launching the server in a python program without
|
||||||
|
using the command line interface.
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
log_level: str = "error",
|
log_level: str = "error",
|
||||||
@@ -339,7 +347,7 @@ class Runtime:
|
|||||||
async def add_request(
|
async def add_request(
|
||||||
self,
|
self,
|
||||||
prompt: str,
|
prompt: str,
|
||||||
sampling_params,
|
sampling_params: Dict,
|
||||||
):
|
):
|
||||||
json_data = {
|
json_data = {
|
||||||
"text": prompt,
|
"text": prompt,
|
||||||
|
|||||||
@@ -1,6 +1,4 @@
|
|||||||
"""
|
"""This file contains the SGL programs used for unit testing."""
|
||||||
This file contains the SGL programs used for unit testing.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
|
|||||||
Reference in New Issue
Block a user