diff --git a/docs/developer_guide/contribution_guide.md b/docs/developer_guide/contribution_guide.md index e2171f447..f8e6f692d 100644 --- a/docs/developer_guide/contribution_guide.md +++ b/docs/developer_guide/contribution_guide.md @@ -72,8 +72,13 @@ If you modify files protected by code owners, their approval is required to merg - Avoid code duplication. If the same code snippet (more than five lines) appears multiple times, extract it into a shared function. - Minimize device synchronization. Reduce expensive CPU-GPU synchronization operations, such as `tensor.item()` or `tensor.cpu()`, whenever possible. Use vectorized code. - Keep files concise. If a file exceeds 2,000 lines of code, split it into multiple smaller files. -- Prioritize extreme efficiency. SGLang is a runtime, and most of your code runs on the critical path for every request. Optimize every minor overhead as much as possible. -- Try to make functions as pure as possible. Avoid in-place modification of arguments. +- Prioritize extreme efficiency. SGLang is a runtime, and most of your code runs on the critical path for every request. Optimize all minor overheads as much as possible, especially in the model forward code. + - A common pattern is to perform runtime checks in the model forward pass (e.g., [this](https://github.com/sgl-project/sglang/blob/f1b0eda55c2c4838e8ab90a0fac7fb1e3d7064ab/python/sglang/srt/models/deepseek_v2.py#L486-L491)). The results of these checks are very likely the same for every layer, so please cache the result as a single boolean value whenever possible. +- Strive to make functions as pure as possible. Avoid in-place modification of arguments. +- When supporting new hardware or features, follow these guidelines: + - Do not drastically change existing code. + - Always prefer introducing hardware-specific components in new files (e.g., `allocator_ascend.py`). 
+ - If you write multiple if/else blocks for new features, ensure the common path (e.g., NVIDIA hardware or the existing code path) is the first branch. ## How to update sgl-kernel Since sglang and sgl-kernel are separate Python packages, our current GitHub CI infrastructure does not support updating a kernel and using it immediately within the same pull request (PR). diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 78515e898..c24c63ce9 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -297,9 +297,6 @@ class ServerArgs: def __post_init__(self): # Check deprecated arguments - def print_deprecated_warning(message: str): - logger.warning(f"\033[33m{message}\033[0m") - if self.enable_ep_moe: self.ep_size = self.tp_size print_deprecated_warning( @@ -1955,24 +1952,25 @@ class ServerArgs: default=None, help="The custom dataloader which used to update the model. Should be set with a valid import path, such as my_package.weight_load_func", ) + parser.add_argument( + "--weight-loader-disable-mmap", + action="store_true", + help="Disable mmap while loading weight using safetensors.", + ) + + # For PD-Multiplexing parser.add_argument( "--enable-pdmux", action="store_true", help="Enable PD-Multiplexing, PD running on greenctx stream.", ) - # For PD-Multiplexing parser.add_argument( "--sm-group-num", type=int, default=ServerArgs.sm_group_num, help="Number of sm partition groups.", ) - parser.add_argument( - "--weight-loader-disable-mmap", - action="store_true", - help="Disable mmap while loading weight using safetensors.", - ) # Deprecated arguments parser.add_argument( @@ -2379,6 +2377,10 @@ class DeprecatedAction(argparse.Action): raise ValueError(self.help) +def print_deprecated_warning(message: str): + logger.warning(f"\033[33m{message}\033[0m") + + def auto_choose_speculative_params(self: ServerArgs): """ Automatically choose the parameters for speculative decoding.