From f6dbd24043b8c18d87a14b3c6fe5c4f567f6c1ba Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sat, 8 Jun 2024 02:06:52 -0700 Subject: [PATCH] Improve doc strings (#518) --- README.md | 12 ++++++------ python/sglang/api.py | 2 +- python/sglang/launch_server.py | 4 +++- python/sglang/launch_server_llavavid.py | 1 + python/sglang/srt/constrained/fsm_cache.py | 1 + python/sglang/srt/constrained/jump_forward.py | 4 ++++ python/sglang/srt/conversation.py | 1 + python/sglang/srt/flush_cache.py | 2 ++ python/sglang/srt/layers/logits_processor.py | 1 + python/sglang/srt/layers/radix_attention.py | 1 + .../sglang/srt/managers/controller/dp_worker.py | 1 + .../srt/managers/controller/infer_batch.py | 1 + .../srt/managers/controller/model_runner.py | 1 + .../srt/managers/controller/radix_cache.py | 3 +++ .../managers/controller/schedule_heuristic.py | 1 + .../sglang/srt/managers/controller/tp_worker.py | 2 ++ .../sglang/srt/managers/detokenizer_manager.py | 1 + python/sglang/srt/managers/io_struct.py | 5 +++++ python/sglang/srt/managers/tokenizer_manager.py | 3 ++- python/sglang/srt/openai_protocol.py | 2 +- python/sglang/srt/server.py | 16 ++++++++++++---- python/sglang/test/test_programs.py | 4 +--- 22 files changed, 52 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 332aaa0af..7aef05673 100644 --- a/README.md +++ b/README.md @@ -10,8 +10,8 @@ SGLang is a structured generation language designed for large language models (L It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system. The core features include: -- **A Flexible Front-End Language**: This allows for easy programming of LLM applications with multiple chained generation calls, advanced prompting techniques, control flow, multiple modalities, parallelism, and external interaction. 
-- **A High-Performance Runtime with RadixAttention**: This feature significantly accelerates the execution of complex LLM programs by automatically reusing the KV cache across multiple calls. It can also be used as a standalone serving engine with all common techniques implemented, such as continuous batching and tensor parallelism. +- **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions. +- **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone engine with all common techniques implemented (e.g., continuous batching and tensor parallelism). ## News - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)). @@ -403,10 +403,10 @@ https://github.com/sgl-project/sglang/issues/157 ## Citation And Acknowledgment ``` -@misc{zheng2023efficiently, - title={Efficiently Programming Large Language Models using SGLang}, - author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Jeff Huang and Chuyue Sun and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng}, - year={2023}, +@misc{zheng2024sglang, + title={SGLang: Efficient Execution of Structured Language Model Programs}, + author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Chuyue Sun and Jeff Huang and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. 
Gonzalez and Clark Barrett and Ying Sheng}, + year={2024}, eprint={2312.07104}, archivePrefix={arXiv}, primaryClass={cs.AI} diff --git a/python/sglang/api.py b/python/sglang/api.py index 2a935a4e0..ff113ab9b 100644 --- a/python/sglang/api.py +++ b/python/sglang/api.py @@ -1,4 +1,4 @@ -"""Some Public API Definitions""" +"""Public APIs of the language.""" import os import re diff --git a/python/sglang/launch_server.py b/python/sglang/launch_server.py index 9d63a2aed..3b4ee3ed8 100644 --- a/python/sglang/launch_server.py +++ b/python/sglang/launch_server.py @@ -1,3 +1,5 @@ +"""Launch the inference server.""" + import argparse from sglang.srt.server import ServerArgs, launch_server @@ -8,4 +10,4 @@ if __name__ == "__main__": args = parser.parse_args() server_args = ServerArgs.from_cli_args(args) - launch_server(server_args, None) + launch_server(server_args, None) \ No newline at end of file diff --git a/python/sglang/launch_server_llavavid.py b/python/sglang/launch_server_llavavid.py index 564ead5e4..294a4fa70 100644 --- a/python/sglang/launch_server_llavavid.py +++ b/python/sglang/launch_server_llavavid.py @@ -1,3 +1,4 @@ +"""Launch the inference server for Llava-video model.""" import argparse import multiprocessing as mp diff --git a/python/sglang/srt/constrained/fsm_cache.py b/python/sglang/srt/constrained/fsm_cache.py index fb1588f95..a8cbde1dd 100644 --- a/python/sglang/srt/constrained/fsm_cache.py +++ b/python/sglang/srt/constrained/fsm_cache.py @@ -1,3 +1,4 @@ +"""Cache for the compressed finite state machine.""" from sglang.srt.constrained import RegexFSM, TransformerTokenizer from sglang.srt.constrained.base_cache import BaseCache diff --git a/python/sglang/srt/constrained/jump_forward.py b/python/sglang/srt/constrained/jump_forward.py index 5955c6147..9e4a58803 100644 --- a/python/sglang/srt/constrained/jump_forward.py +++ b/python/sglang/srt/constrained/jump_forward.py @@ -1,3 +1,7 @@ +""" +Faster constrained decoding. 
+Reference: https://lmsys.org/blog/2024-02-05-compressed-fsm/ +""" import interegular from sglang.srt.constrained import FSMInfo, disk_cache, make_deterministic_fsm diff --git a/python/sglang/srt/conversation.py b/python/sglang/srt/conversation.py index 6f07d0946..992c2021b 100644 --- a/python/sglang/srt/conversation.py +++ b/python/sglang/srt/conversation.py @@ -1,3 +1,4 @@ +"""Conversation templates.""" # Adapted from # https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py import dataclasses diff --git a/python/sglang/srt/flush_cache.py b/python/sglang/srt/flush_cache.py index e962bb38b..575ba9600 100644 --- a/python/sglang/srt/flush_cache.py +++ b/python/sglang/srt/flush_cache.py @@ -1,4 +1,6 @@ """ +Flush the KV cache. + Usage: python3 -m sglang.srt.flush_cache --url http://localhost:30000 """ diff --git a/python/sglang/srt/layers/logits_processor.py b/python/sglang/srt/layers/logits_processor.py index e7efaadec..eb32ff7b1 100644 --- a/python/sglang/srt/layers/logits_processor.py +++ b/python/sglang/srt/layers/logits_processor.py @@ -1,3 +1,4 @@ +"""Logits processing.""" import torch from torch import nn from vllm.distributed import ( diff --git a/python/sglang/srt/layers/radix_attention.py b/python/sglang/srt/layers/radix_attention.py index 7d0475e50..651349735 100644 --- a/python/sglang/srt/layers/radix_attention.py +++ b/python/sglang/srt/layers/radix_attention.py @@ -1,3 +1,4 @@ +"""Radix attention.""" import torch import numpy as np from torch import nn diff --git a/python/sglang/srt/managers/controller/dp_worker.py b/python/sglang/srt/managers/controller/dp_worker.py index 16f5d2308..ca2a03cf2 100644 --- a/python/sglang/srt/managers/controller/dp_worker.py +++ b/python/sglang/srt/managers/controller/dp_worker.py @@ -1,4 +1,5 @@ """A data parallel worker thread.""" + import asyncio import logging import queue diff --git a/python/sglang/srt/managers/controller/infer_batch.py b/python/sglang/srt/managers/controller/infer_batch.py index 
243a4397c..6e235fefa 100644 --- a/python/sglang/srt/managers/controller/infer_batch.py +++ b/python/sglang/srt/managers/controller/infer_batch.py @@ -1,4 +1,5 @@ """Meta data for requests and batches""" + from dataclasses import dataclass from enum import IntEnum, auto from typing import List diff --git a/python/sglang/srt/managers/controller/model_runner.py b/python/sglang/srt/managers/controller/model_runner.py index 0033acbf8..bc622208d 100644 --- a/python/sglang/srt/managers/controller/model_runner.py +++ b/python/sglang/srt/managers/controller/model_runner.py @@ -1,3 +1,4 @@ +"""ModelRunner runs the forward passes of the models.""" import importlib import importlib.resources import logging diff --git a/python/sglang/srt/managers/controller/radix_cache.py b/python/sglang/srt/managers/controller/radix_cache.py index 5309a4265..04a184c10 100644 --- a/python/sglang/srt/managers/controller/radix_cache.py +++ b/python/sglang/srt/managers/controller/radix_cache.py @@ -1,3 +1,6 @@ +""" +The radix tree data structure for managing the KV cache. 
+""" import heapq import time from collections import defaultdict diff --git a/python/sglang/srt/managers/controller/schedule_heuristic.py b/python/sglang/srt/managers/controller/schedule_heuristic.py index 6c585eb9b..6e75a7ad4 100644 --- a/python/sglang/srt/managers/controller/schedule_heuristic.py +++ b/python/sglang/srt/managers/controller/schedule_heuristic.py @@ -1,3 +1,4 @@ +"""Request scheduler heuristic.""" import random from collections import defaultdict diff --git a/python/sglang/srt/managers/controller/tp_worker.py b/python/sglang/srt/managers/controller/tp_worker.py index d85873117..1edd26337 100644 --- a/python/sglang/srt/managers/controller/tp_worker.py +++ b/python/sglang/srt/managers/controller/tp_worker.py @@ -1,3 +1,5 @@ +"""A tensor parallel worker.""" + import asyncio import logging import time diff --git a/python/sglang/srt/managers/detokenizer_manager.py b/python/sglang/srt/managers/detokenizer_manager.py index d60edf273..1c591a6cc 100644 --- a/python/sglang/srt/managers/detokenizer_manager.py +++ b/python/sglang/srt/managers/detokenizer_manager.py @@ -1,3 +1,4 @@ +"""DetokenizerManager is a process that detokenizes the token ids.""" import asyncio import inspect diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py index 004308c3b..abc4d3033 100644 --- a/python/sglang/srt/managers/io_struct.py +++ b/python/sglang/srt/managers/io_struct.py @@ -1,3 +1,8 @@ +""" +The definition of objects transferred between different +processes (TokenizerManager, DetokenizerManager, Controller). 
+""" + import uuid from dataclasses import dataclass from typing import Dict, List, Optional, Union diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 38f07739e..8fe3ff8dc 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -1,3 +1,4 @@ +"""TokenizerManager is a process that tokenizes the text.""" import asyncio import concurrent.futures import dataclasses @@ -283,7 +284,7 @@ class TokenizerManager: req = AbortReq(rid) self.send_to_router.send_pyobj(req) - def create_abort_task(self, obj): + def create_abort_task(self, obj: GenerateReqInput): # Abort the request if the client is disconnected. async def abort_request(): await asyncio.sleep(3) diff --git a/python/sglang/srt/openai_protocol.py b/python/sglang/srt/openai_protocol.py index 79c69ebdb..ce37097a7 100644 --- a/python/sglang/srt/openai_protocol.py +++ b/python/sglang/srt/openai_protocol.py @@ -1,4 +1,4 @@ -"""pydantic models for OpenAI API protocol""" +"""Pydantic models for OpenAI API protocol""" import time from typing import Dict, List, Optional, Union diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 2403ef57f..7b6dca68f 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -1,4 +1,7 @@ -"""SRT: SGLang Runtime""" +""" +The entry point of the inference server. +SRT = SGLang Runtime. 
+""" import asyncio import dataclasses @@ -10,7 +13,7 @@ import sys import threading import time from http import HTTPStatus -from typing import Optional +from typing import Optional, Dict # Fix a bug of Python threading setattr(threading, "_register_atexit", lambda *args, **kwargs: None) @@ -148,7 +151,6 @@ def launch_server(server_args: ServerArgs, pipe_finish_writer, model_overide_arg server_args.dp_size, ) - # Init local models port args ports = server_args.additional_ports tp = server_args.tp_size model_port_args = [] @@ -269,6 +271,12 @@ def launch_server(server_args: ServerArgs, pipe_finish_writer, model_overide_arg class Runtime: + """ + A wrapper for the server. + This is used for launching the server in a Python program without + using the command line interface. + """ + def __init__( self, log_level: str = "error", @@ -339,7 +347,7 @@ class Runtime: async def add_request( self, prompt: str, - sampling_params, + sampling_params: Dict, ): json_data = { "text": prompt, diff --git a/python/sglang/test/test_programs.py b/python/sglang/test/test_programs.py index 2be7ecdb9..4ad480887 100644 --- a/python/sglang/test/test_programs.py +++ b/python/sglang/test/test_programs.py @@ -1,6 +1,4 @@ -""" -This file contains the SGL programs used for unit testing. -""" +"""This file contains the SGL programs used for unit testing.""" import json import re