Refactor the docs (#9031)
This commit is contained in:
@@ -32,16 +32,20 @@ from sglang.lang.choices import (
|
||||
token_length_normalized,
|
||||
unconditional_likelihood_normalized,
|
||||
)
|
||||
from sglang.srt.entrypoints.engine import Engine
|
||||
|
||||
# Lazy import some libraries
|
||||
from sglang.utils import LazyImport
|
||||
from sglang.version import __version__
|
||||
|
||||
ServerArgs = LazyImport("sglang.srt.server_args", "ServerArgs")
|
||||
Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
|
||||
LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
|
||||
OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
|
||||
VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
|
||||
|
||||
# Runtime Engine APIs
|
||||
ServerArgs = LazyImport("sglang.srt.server_args", "ServerArgs")
|
||||
Engine = LazyImport("sglang.srt.entrypoints.engine", "Engine")
|
||||
|
||||
__all__ = [
|
||||
"Engine",
|
||||
"Runtime",
|
||||
|
||||
@@ -2175,10 +2175,6 @@ class ServerArgs:
|
||||
self.mem_fraction_static = (
|
||||
original_server_arg_mem_fraction * final_overall_factor
|
||||
)
|
||||
logger.warning(
|
||||
f"Multimodal model: Dynamically adjusted --mem-fraction-static "
|
||||
f"from: {original_server_arg_mem_fraction:.3f} to: {self.mem_fraction_static:.3f}."
|
||||
)
|
||||
|
||||
|
||||
def prepare_server_args(argv: List[str]) -> ServerArgs:
|
||||
|
||||
59
python/sglang/test/doc_patch.py
Normal file
59
python/sglang/test/doc_patch.py
Normal file
@@ -0,0 +1,59 @@
|
||||
"""
|
||||
Apply monkey patches to make the documentation compilation faster and more reliable.
|
||||
|
||||
- Avoid port conflicts
|
||||
- Reduce the server launch time
|
||||
"""
|
||||
|
||||
import weakref
|
||||
|
||||
import nest_asyncio
|
||||
|
||||
nest_asyncio.apply()
|
||||
|
||||
import sglang.srt.server_args as server_args_mod
|
||||
from sglang.utils import execute_shell_command, reserve_port
|
||||
|
||||
DEFAULT_MAX_RUNNING_REQUESTS = 128
|
||||
DEFAULT_MAX_TOTAL_TOKENS = 20480 # To allow multiple servers on the same machine
|
||||
|
||||
_original_post_init = server_args_mod.ServerArgs.__post_init__
|
||||
|
||||
|
||||
def patched_post_init(self):
    """
    Replacement for ``ServerArgs.__post_init__`` used while building the docs.

    Delegates to the original ``__post_init__`` first, then fills in
    ``max_running_requests`` and ``max_total_tokens`` with the module
    defaults when they are still ``None``, and finally forces
    ``cuda_graph_max_bs`` to 4.
    """
    _original_post_init(self)

    # Only settings that are still unset after the original hook get a fallback.
    fallbacks = (
        ("max_running_requests", DEFAULT_MAX_RUNNING_REQUESTS),
        ("max_total_tokens", DEFAULT_MAX_TOTAL_TOKENS),
    )
    for attr_name, fallback in fallbacks:
        if getattr(self, attr_name) is None:
            setattr(self, attr_name, fallback)

    # Always overridden, regardless of what was configured.
    # NOTE(review): presumably kept small to shorten server launch -- confirm.
    self.cuda_graph_max_bs = 4
|
||||
|
||||
|
||||
server_args_mod.ServerArgs.__post_init__ = patched_post_init
|
||||
|
||||
process_socket_map = weakref.WeakKeyDictionary()
|
||||
|
||||
|
||||
def launch_server_cmd(command: str, host: str = "0.0.0.0", port: int = None):
    """
    Start a server from a shell command string and report the port it was given.

    ``--port`` and the documentation-build limits (``--max-running-requests``,
    ``--max-total-tokens``, ``--cuda-graph-max-bs``) are appended to
    ``command`` before it is passed to ``execute_shell_command``.

    Args:
        command: Base shell command that launches the server.
        host: Host passed to ``reserve_port`` when no port is supplied.
        port: Port to use. When ``None``, one is obtained from
            ``reserve_port(host)``.

    Returns:
        A ``(process, port)`` tuple, where ``process`` is whatever
        ``execute_shell_command`` returned.
    """
    # A lock socket only exists when this function reserved the port itself.
    lock_socket = None
    if port is None:
        port, lock_socket = reserve_port(host)

    doc_build_flags = (
        f"--max-running-requests {DEFAULT_MAX_RUNNING_REQUESTS} "
        f"--max-total-tokens {DEFAULT_MAX_TOTAL_TOKENS} "
        f"--cuda-graph-max-bs 4"
    )
    process = execute_shell_command(f"{command} --port {port} {doc_build_flags}")

    # Store the reservation socket under the process in the weakly keyed map.
    if lock_socket is not None:
        process_socket_map[process] = lock_socket

    return process, port
|
||||
@@ -458,7 +458,7 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
|
||||
NOTE: Typically, the server runs in a separate terminal.
|
||||
In this notebook, we run the server and notebook code together, so their outputs are combined.
|
||||
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
|
||||
We are running those notebooks in a CI parallel environment, so the throughput is not representative of the actual performance.
|
||||
We are running those notebooks in a CI environment, so the throughput is not representative of the actual performance.
|
||||
"""
|
||||
)
|
||||
break
|
||||
|
||||
Reference in New Issue
Block a user