From f6dbd24043b8c18d87a14b3c6fe5c4f567f6c1ba Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <lianminzheng@gmail.com>
Date: Sat, 8 Jun 2024 02:06:52 -0700
Subject: [PATCH] Improve doc strings (#518)

---
 README.md                                        | 12 ++++++------
 python/sglang/api.py                             |  2 +-
 python/sglang/launch_server.py                   |  4 +++-
 python/sglang/launch_server_llavavid.py          |  1 +
 python/sglang/srt/constrained/fsm_cache.py       |  1 +
 python/sglang/srt/constrained/jump_forward.py    |  4 ++++
 python/sglang/srt/conversation.py                |  1 +
 python/sglang/srt/flush_cache.py                 |  2 ++
 python/sglang/srt/layers/logits_processor.py     |  1 +
 python/sglang/srt/layers/radix_attention.py      |  1 +
 .../sglang/srt/managers/controller/dp_worker.py  |  1 +
 .../srt/managers/controller/infer_batch.py       |  1 +
 .../srt/managers/controller/model_runner.py      |  1 +
 .../srt/managers/controller/radix_cache.py       |  3 +++
 .../managers/controller/schedule_heuristic.py    |  1 +
 .../sglang/srt/managers/controller/tp_worker.py  |  2 ++
 .../sglang/srt/managers/detokenizer_manager.py   |  1 +
 python/sglang/srt/managers/io_struct.py          |  5 +++++
 python/sglang/srt/managers/tokenizer_manager.py  |  3 ++-
 python/sglang/srt/openai_protocol.py             |  2 +-
 python/sglang/srt/server.py                      | 16 ++++++++++++----
 python/sglang/test/test_programs.py              |  4 +---
 22 files changed, 52 insertions(+), 17 deletions(-)

diff --git a/README.md b/README.md
index 332aaa0af..7aef05673 100644
--- a/README.md
+++ b/README.md
@@ -10,8 +10,8 @@ SGLang is a structured generation language designed for large language models (L
 It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
 
 The core features include:
-- **A Flexible Front-End Language**: This allows for easy programming of LLM applications with multiple chained generation calls, advanced prompting techniques, control flow, multiple modalities, parallelism, and external interaction.
-- **A High-Performance Runtime with RadixAttention**: This feature significantly accelerates the execution of complex LLM programs by automatically reusing the KV cache across multiple calls. It can also be used as a standalone serving engine with all common techniques implemented, such as continuous batching and tensor parallelism.
+- **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
+- **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).
 
 ## News
 - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -403,10 +403,10 @@ https://github.com/sgl-project/sglang/issues/157
 
 ## Citation And Acknowledgment
 ```
-@misc{zheng2023efficiently,
-      title={Efficiently Programming Large Language Models using SGLang},
-      author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Jeff Huang and Chuyue Sun and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
-      year={2023},
+@misc{zheng2024sglang,
+      title={SGLang: Efficient Execution of Structured Language Model Programs},
+      author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Chuyue Sun and Jeff Huang and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
+      year={2024},
       eprint={2312.07104},
       archivePrefix={arXiv},
       primaryClass={cs.AI}
diff --git a/python/sglang/api.py b/python/sglang/api.py
index 2a935a4e0..ff113ab9b 100644
--- a/python/sglang/api.py
+++ b/python/sglang/api.py
@@ -1,4 +1,4 @@
-"""Some Public API Definitions"""
+"""Public APIs of the language."""
 
 import os
 import re
diff --git a/python/sglang/launch_server.py b/python/sglang/launch_server.py
index 9d63a2aed..3b4ee3ed8 100644
--- a/python/sglang/launch_server.py
+++ b/python/sglang/launch_server.py
@@ -1,3 +1,5 @@
+"""Launch the inference server."""
+
 import argparse
 
 from sglang.srt.server import ServerArgs, launch_server
@@ -8,4 +10,4 @@ if __name__ == "__main__":
     args = parser.parse_args()
     server_args = ServerArgs.from_cli_args(args)
 
-    launch_server(server_args, None)
+    launch_server(server_args, None)
\ No newline at end of file
diff --git a/python/sglang/launch_server_llavavid.py b/python/sglang/launch_server_llavavid.py
index 564ead5e4..294a4fa70 100644
--- a/python/sglang/launch_server_llavavid.py
+++ b/python/sglang/launch_server_llavavid.py
@@ -1,3 +1,4 @@
+"""Launch the inference server for the Llava-video model."""
 import argparse
 import multiprocessing as mp
 
diff --git a/python/sglang/srt/constrained/fsm_cache.py b/python/sglang/srt/constrained/fsm_cache.py
index fb1588f95..a8cbde1dd 100644
--- a/python/sglang/srt/constrained/fsm_cache.py
+++ b/python/sglang/srt/constrained/fsm_cache.py
@@ -1,3 +1,4 @@
+"""Cache for the compressed finite state machine."""
 from sglang.srt.constrained import RegexFSM, TransformerTokenizer
 from sglang.srt.constrained.base_cache import BaseCache
 
diff --git a/python/sglang/srt/constrained/jump_forward.py b/python/sglang/srt/constrained/jump_forward.py
index 5955c6147..9e4a58803 100644
--- a/python/sglang/srt/constrained/jump_forward.py
+++ b/python/sglang/srt/constrained/jump_forward.py
@@ -1,3 +1,7 @@
+"""
+Faster constrained decoding.
+Reference: https://lmsys.org/blog/2024-02-05-compressed-fsm/
+"""
 import interegular
 
 from sglang.srt.constrained import FSMInfo, disk_cache, make_deterministic_fsm
diff --git a/python/sglang/srt/conversation.py b/python/sglang/srt/conversation.py
index 6f07d0946..992c2021b 100644
--- a/python/sglang/srt/conversation.py
+++ b/python/sglang/srt/conversation.py
@@ -1,3 +1,4 @@
+"""Conversation templates."""
 # Adapted from
 # https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
 import dataclasses
diff --git a/python/sglang/srt/flush_cache.py b/python/sglang/srt/flush_cache.py
index e962bb38b..575ba9600 100644
--- a/python/sglang/srt/flush_cache.py
+++ b/python/sglang/srt/flush_cache.py
@@ -1,4 +1,6 @@
 """
+Flush the KV cache.
+
 Usage:
 python3 -m sglang.srt.flush_cache --url http://localhost:30000
 """
diff --git a/python/sglang/srt/layers/logits_processor.py b/python/sglang/srt/layers/logits_processor.py
index e7efaadec..eb32ff7b1 100644
--- a/python/sglang/srt/layers/logits_processor.py
+++ b/python/sglang/srt/layers/logits_processor.py
@@ -1,3 +1,4 @@
+"""Logits processing."""
 import torch
 from torch import nn
 from vllm.distributed import (
diff --git a/python/sglang/srt/layers/radix_attention.py b/python/sglang/srt/layers/radix_attention.py
index 7d0475e50..651349735 100644
--- a/python/sglang/srt/layers/radix_attention.py
+++ b/python/sglang/srt/layers/radix_attention.py
@@ -1,3 +1,4 @@
+"""Radix attention."""
 import torch
 import numpy as np
 from torch import nn
diff --git a/python/sglang/srt/managers/controller/dp_worker.py b/python/sglang/srt/managers/controller/dp_worker.py
index 16f5d2308..ca2a03cf2 100644
--- a/python/sglang/srt/managers/controller/dp_worker.py
+++ b/python/sglang/srt/managers/controller/dp_worker.py
@@ -1,4 +1,5 @@
 """A data parallel worker thread."""
+
 import asyncio
 import logging
 import queue
diff --git a/python/sglang/srt/managers/controller/infer_batch.py b/python/sglang/srt/managers/controller/infer_batch.py
index 243a4397c..6e235fefa 100644
--- a/python/sglang/srt/managers/controller/infer_batch.py
+++ b/python/sglang/srt/managers/controller/infer_batch.py
@@ -1,4 +1,5 @@
 """Meta data for requests and batches"""
+
 from dataclasses import dataclass
 from enum import IntEnum, auto
 from typing import List
diff --git a/python/sglang/srt/managers/controller/model_runner.py b/python/sglang/srt/managers/controller/model_runner.py
index 0033acbf8..bc622208d 100644
--- a/python/sglang/srt/managers/controller/model_runner.py
+++ b/python/sglang/srt/managers/controller/model_runner.py
@@ -1,3 +1,4 @@
+"""ModelRunner runs the forward passes of the models."""
 import importlib
 import importlib.resources
 import logging
diff --git a/python/sglang/srt/managers/controller/radix_cache.py b/python/sglang/srt/managers/controller/radix_cache.py
index 5309a4265..04a184c10 100644
--- a/python/sglang/srt/managers/controller/radix_cache.py
+++ b/python/sglang/srt/managers/controller/radix_cache.py
@@ -1,3 +1,6 @@
+"""
+The radix tree data structure for managing the KV cache.
+"""
 import heapq
 import time
 from collections import defaultdict
diff --git a/python/sglang/srt/managers/controller/schedule_heuristic.py b/python/sglang/srt/managers/controller/schedule_heuristic.py
index 6c585eb9b..6e75a7ad4 100644
--- a/python/sglang/srt/managers/controller/schedule_heuristic.py
+++ b/python/sglang/srt/managers/controller/schedule_heuristic.py
@@ -1,3 +1,4 @@
+"""Request scheduler heuristic."""
 import random
 from collections import defaultdict
 
diff --git a/python/sglang/srt/managers/controller/tp_worker.py b/python/sglang/srt/managers/controller/tp_worker.py
index d85873117..1edd26337 100644
--- a/python/sglang/srt/managers/controller/tp_worker.py
+++ b/python/sglang/srt/managers/controller/tp_worker.py
@@ -1,3 +1,5 @@
+"""A tensor parallel worker."""
+
 import asyncio
 import logging
 import time
diff --git a/python/sglang/srt/managers/detokenizer_manager.py b/python/sglang/srt/managers/detokenizer_manager.py
index d60edf273..1c591a6cc 100644
--- a/python/sglang/srt/managers/detokenizer_manager.py
+++ b/python/sglang/srt/managers/detokenizer_manager.py
@@ -1,3 +1,4 @@
+"""DetokenizerManager is a process that detokenizes the token ids."""
 import asyncio
 import inspect
 
diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py
index 004308c3b..abc4d3033 100644
--- a/python/sglang/srt/managers/io_struct.py
+++ b/python/sglang/srt/managers/io_struct.py
@@ -1,3 +1,8 @@
+"""
+The definition of objects transferred between different
+processes (TokenizerManager, DetokenizerManager, Controller).
+"""
+
 import uuid
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Union
diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py
index 38f07739e..8fe3ff8dc 100644
--- a/python/sglang/srt/managers/tokenizer_manager.py
+++ b/python/sglang/srt/managers/tokenizer_manager.py
@@ -1,3 +1,4 @@
+"""TokenizerManager is a process that tokenizes the text."""
 import asyncio
 import concurrent.futures
 import dataclasses
@@ -283,7 +284,7 @@ class TokenizerManager:
         req = AbortReq(rid)
         self.send_to_router.send_pyobj(req)
 
-    def create_abort_task(self, obj):
+    def create_abort_task(self, obj: GenerateReqInput):
         # Abort the request if the client is disconnected.
         async def abort_request():
             await asyncio.sleep(3)
diff --git a/python/sglang/srt/openai_protocol.py b/python/sglang/srt/openai_protocol.py
index 79c69ebdb..ce37097a7 100644
--- a/python/sglang/srt/openai_protocol.py
+++ b/python/sglang/srt/openai_protocol.py
@@ -1,4 +1,4 @@
-"""pydantic models for OpenAI API protocol"""
+"""Pydantic models for OpenAI API protocol"""
 
 import time
 from typing import Dict, List, Optional, Union
diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py
index 2403ef57f..7b6dca68f 100644
--- a/python/sglang/srt/server.py
+++ b/python/sglang/srt/server.py
@@ -1,4 +1,7 @@
-"""SRT: SGLang Runtime"""
+"""
+The entry point of the inference server.
+SRT = SGLang Runtime.
+"""
 
 import asyncio
 import dataclasses
@@ -10,7 +13,7 @@ import sys
 import threading
 import time
 from http import HTTPStatus
-from typing import Optional
+from typing import Optional, Dict
 
 # Fix a bug of Python threading
 setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
@@ -148,7 +151,6 @@ def launch_server(server_args: ServerArgs, pipe_finish_writer, model_overide_arg
         server_args.dp_size,
     )
 
-    # Init local models port args
     ports = server_args.additional_ports
     tp = server_args.tp_size
     model_port_args = []
@@ -269,6 +271,12 @@ def launch_server(server_args: ServerArgs, pipe_finish_writer, model_overide_arg
 
 
 class Runtime:
+    """
+    A wrapper for the server.
+    This is used for launching the server in a python program without
+    using the command line interface.
+    """
+
     def __init__(
         self,
         log_level: str = "error",
@@ -339,7 +347,7 @@ class Runtime:
     async def add_request(
         self,
         prompt: str,
-        sampling_params,
+        sampling_params: Dict,
     ):
         json_data = {
             "text": prompt,
diff --git a/python/sglang/test/test_programs.py b/python/sglang/test/test_programs.py
index 2be7ecdb9..4ad480887 100644
--- a/python/sglang/test/test_programs.py
+++ b/python/sglang/test/test_programs.py
@@ -1,6 +1,4 @@
-"""
-This file contains the SGL programs used for unit testing.
-"""
+"""This file contains the SGL programs used for unit testing."""
 
 import json
 import re