Refactor the docs (#9031)

This commit is contained in:
Lianmin Zheng
2025-08-10 19:49:45 -07:00
committed by GitHub
parent 0f229c07f1
commit 2449a0afe2
80 changed files with 619 additions and 750 deletions

View File

@@ -32,16 +32,20 @@ from sglang.lang.choices import (
token_length_normalized,
unconditional_likelihood_normalized,
)
from sglang.srt.entrypoints.engine import Engine
# Lazy import some libraries
from sglang.utils import LazyImport
from sglang.version import __version__
# Language-frontend backends are heavy optional dependencies; bind them
# lazily so `import sglang` stays fast when they are unused.
Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")

# Runtime Engine APIs (also lazy, to avoid pulling in the server stack at
# import time). NOTE: the original text assigned ServerArgs twice verbatim;
# the redundant duplicate assignment is removed here.
ServerArgs = LazyImport("sglang.srt.server_args", "ServerArgs")
Engine = LazyImport("sglang.srt.entrypoints.engine", "Engine")
__all__ = [
"Engine",
"Runtime",

View File

@@ -2175,10 +2175,6 @@ class ServerArgs:
self.mem_fraction_static = (
original_server_arg_mem_fraction * final_overall_factor
)
logger.warning(
f"Multimodal model: Dynamically adjusted --mem-fraction-static "
f"from: {original_server_arg_mem_fraction:.3f} to: {self.mem_fraction_static:.3f}."
)
def prepare_server_args(argv: List[str]) -> ServerArgs:

View File

@@ -0,0 +1,59 @@
"""
Do some monkey patch to make the documentation compilation faster and more reliable.
- Avoid port conflicts
- Reduce the server launch time
"""
import weakref
import nest_asyncio
nest_asyncio.apply()
import sglang.srt.server_args as server_args_mod
from sglang.utils import execute_shell_command, reserve_port
# Conservative resource caps so several doc-build servers can coexist on one machine.
DEFAULT_MAX_RUNNING_REQUESTS = 128
DEFAULT_MAX_TOTAL_TOKENS = 20480  # To allow multiple servers on the same machine

# Capture the stock __post_init__ BEFORE it is replaced below, so the patched
# version can still delegate to it. Statement order here is load-bearing.
_original_post_init = server_args_mod.ServerArgs.__post_init__


def patched_post_init(self):
    """Run the original ServerArgs.__post_init__, then apply doc-build caps.

    max_running_requests / max_total_tokens are only filled in when the user
    left them unset (None); cuda_graph_max_bs is always forced to 4 —
    presumably to keep CUDA-graph memory small in CI (TODO confirm).
    """
    _original_post_init(self)
    if self.max_running_requests is None:
        self.max_running_requests = DEFAULT_MAX_RUNNING_REQUESTS
    if self.max_total_tokens is None:
        self.max_total_tokens = DEFAULT_MAX_TOTAL_TOKENS
    self.cuda_graph_max_bs = 4


# Monkey-patch: every ServerArgs constructed after this import gets the caps.
server_args_mod.ServerArgs.__post_init__ = patched_post_init

# Maps server process -> its port-reservation lock socket. Weak keys mean an
# entry (and thus the socket reference) disappears once the process object is
# garbage-collected.
process_socket_map = weakref.WeakKeyDictionary()
def launch_server_cmd(command: str, host: str = "0.0.0.0", port: int = None):
    """Launch the server using the given command.

    If no port is specified, a free port is reserved; the reservation's lock
    socket is kept alive in ``process_socket_map`` for the lifetime of the
    spawned process so no other launcher grabs the same port.

    Args:
        command: Base shell command that starts the server.
        host: Interface used when reserving a free port.
        port: Explicit port to use; when None, a free port is reserved.

    Returns:
        Tuple ``(process, port)`` — the spawned process handle and the port
        the server was told to listen on.
    """
    if port is None:
        port, lock_socket = reserve_port(host)
    else:
        lock_socket = None

    # Mirror the ServerArgs monkey patch on the command line as well, so the
    # caps apply even if the patched __post_init__ is not in effect.
    extra_flags = (
        f"--max-running-requests {DEFAULT_MAX_RUNNING_REQUESTS} "
        f"--max-total-tokens {DEFAULT_MAX_TOTAL_TOKENS} "
        "--cuda-graph-max-bs 4"  # plain literal: no placeholder needed
    )

    full_command = f"{command} --port {port} {extra_flags}"
    try:
        process = execute_shell_command(full_command)
    except Exception:
        # Fix: don't leak the reserved-port socket when the launch fails.
        if lock_socket is not None:
            lock_socket.close()
        raise

    if lock_socket is not None:
        # Tie the lock socket's lifetime to the process object (weak-key map).
        process_socket_map[process] = lock_socket
    return process, port

View File

@@ -458,7 +458,7 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
We are running those notebooks in a CI parallel environment, so the throughput is not representative of the actual performance.
We are running those notebooks in a CI environment, so the throughput is not representative of the actual performance.
"""
)
break