Improve doc strings (#518)

This commit is contained in:
Lianmin Zheng
2024-06-08 02:06:52 -07:00
parent e8a2327d52
commit f6dbd24043
22 changed files with 52 additions and 17 deletions

View File

@@ -10,8 +10,8 @@ SGLang is a structured generation language designed for large language models (L
It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
The core features include:
- **A Flexible Front-End Language**: This allows for easy programming of LLM applications with multiple chained generation calls, advanced prompting techniques, control flow, multiple modalities, parallelism, and external interaction.
- **A High-Performance Runtime with RadixAttention**: This feature significantly accelerates the execution of complex LLM programs by automatically reusing the KV cache across multiple calls. It can also be used as a standalone serving engine with all common techniques implemented, such as continuous batching and tensor parallelism.
- **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
- **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).
## News
- [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -403,10 +403,10 @@ https://github.com/sgl-project/sglang/issues/157
## Citation And Acknowledgment
```
@misc{zheng2023efficiently,
title={Efficiently Programming Large Language Models using SGLang},
author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Jeff Huang and Chuyue Sun and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
year={2023},
@misc{zheng2024sglang,
title={SGLang: Efficient Execution of Structured Language Model Programs},
author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Chuyue Sun and Jeff Huang and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
year={2024},
eprint={2312.07104},
archivePrefix={arXiv},
primaryClass={cs.AI}

View File

@@ -1,4 +1,4 @@
"""Some Public API Definitions"""
"""Public APIs of the language."""
import os
import re

View File

@@ -1,3 +1,5 @@
"""Launch the inference server."""
import argparse
from sglang.srt.server import ServerArgs, launch_server
@@ -8,4 +10,4 @@ if __name__ == "__main__":
args = parser.parse_args()
server_args = ServerArgs.from_cli_args(args)
launch_server(server_args, None)
launch_server(server_args, None)

View File

@@ -1,3 +1,4 @@
"""Launch the inference server for Llava-video model."""
import argparse
import multiprocessing as mp

View File

@@ -1,3 +1,4 @@
"""Cache for the compressed finite state machine."""
from sglang.srt.constrained import RegexFSM, TransformerTokenizer
from sglang.srt.constrained.base_cache import BaseCache

View File

@@ -1,3 +1,7 @@
"""
Faster constrained decoding.
Reference: https://lmsys.org/blog/2024-02-05-compressed-fsm/
"""
import interegular
from sglang.srt.constrained import FSMInfo, disk_cache, make_deterministic_fsm

View File

@@ -1,3 +1,4 @@
"""Conversation templates."""
# Adapted from
# https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
import dataclasses

View File

@@ -1,4 +1,6 @@
"""
Flush the KV cache.
Usage:
python3 -m sglang.srt.flush_cache --url http://localhost:30000
"""

View File

@@ -1,3 +1,4 @@
"""Logits processing."""
import torch
from torch import nn
from vllm.distributed import (

View File

@@ -1,3 +1,4 @@
"""Radix attention."""
import torch
import numpy as np
from torch import nn

View File

@@ -1,4 +1,5 @@
"""A data parallel worker thread."""
import asyncio
import logging
import queue

View File

@@ -1,4 +1,5 @@
"""Meta data for requests and batches"""
from dataclasses import dataclass
from enum import IntEnum, auto
from typing import List

View File

@@ -1,3 +1,4 @@
"""ModelRunner runs the forward passes of the models."""
import importlib
import importlib.resources
import logging

View File

@@ -1,3 +1,6 @@
"""
The radix tree data structure for managing the KV cache.
"""
import heapq
import time
from collections import defaultdict

View File

@@ -1,3 +1,4 @@
"""Request scheduler heuristic."""
import random
from collections import defaultdict

View File

@@ -1,3 +1,5 @@
"""A tensor parallel worker."""
import asyncio
import logging
import time

View File

@@ -1,3 +1,4 @@
"""DetokenizerManager is a process that detokenizes the token ids."""
import asyncio
import inspect

View File

@@ -1,3 +1,8 @@
"""
The definition of objects transferred between different
processes (TokenizerManager, DetokenizerManager, Controller).
"""
import uuid
from dataclasses import dataclass
from typing import Dict, List, Optional, Union

View File

@@ -1,3 +1,4 @@
"""TokenizerManager is a process that tokenizes the text."""
import asyncio
import concurrent.futures
import dataclasses
@@ -283,7 +284,7 @@ class TokenizerManager:
req = AbortReq(rid)
self.send_to_router.send_pyobj(req)
def create_abort_task(self, obj):
def create_abort_task(self, obj: GenerateReqInput):
# Abort the request if the client is disconnected.
async def abort_request():
await asyncio.sleep(3)

View File

@@ -1,4 +1,4 @@
"""pydantic models for OpenAI API protocol"""
"""Pydantic models for OpenAI API protocol"""
import time
from typing import Dict, List, Optional, Union

View File

@@ -1,4 +1,7 @@
"""SRT: SGLang Runtime"""
"""
The entry point of the inference server.
SRT = SGLang Runtime.
"""
import asyncio
import dataclasses
@@ -10,7 +13,7 @@ import sys
import threading
import time
from http import HTTPStatus
from typing import Optional
from typing import Optional, Dict
# Fix a bug of Python threading
setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
@@ -148,7 +151,6 @@ def launch_server(server_args: ServerArgs, pipe_finish_writer, model_overide_arg
server_args.dp_size,
)
# Init local models port args
ports = server_args.additional_ports
tp = server_args.tp_size
model_port_args = []
@@ -269,6 +271,12 @@ def launch_server(server_args: ServerArgs, pipe_finish_writer, model_overide_arg
class Runtime:
"""
A wrapper for the server.
This is used for launching the server in a python program without
using the command line interface.
"""
def __init__(
self,
log_level: str = "error",
@@ -339,7 +347,7 @@ class Runtime:
async def add_request(
self,
prompt: str,
sampling_params,
sampling_params: Dict,
):
json_data = {
"text": prompt,

View File

@@ -1,6 +1,4 @@
"""
This file contains the SGL programs used for unit testing.
"""
"""This file contains the SGL programs used for unit testing."""
import json
import re