Add typo checker in pre-commit (#6179)

Co-authored-by: Brayden Zhong <b8zhong@uwaterloo.ca>
2025-05-11 00:55:00 -04:00
parent de167cf5fa
commit 2ce8793519
99 changed files with 154 additions and 144 deletions
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -147,3 +147,7 @@ exclude = [
    "scripts*",
    "tests*",
 ]
+
+[tool.codespell]
+ignore-words-list = "ans, als, hel, boostrap, childs, te, vas, hsa, ment"
+skip = "*.json,*.jsonl,*.patch,*.txt"
--- a/python/sglang/bench_offline_throughput.py
+++ b/python/sglang/bench_offline_throughput.py
@@ -315,7 +315,7 @@ def throughput_test(
    tokenizer_id = server_args.tokenizer_path or server_args.model_path
    tokenizer = get_tokenizer(tokenizer_id)

-    # Set global environmnets
+    # Set global environments
    set_ulimit()
    random.seed(bench_args.seed)
    np.random.seed(bench_args.seed)
--- a/python/sglang/bench_serving.py
+++ b/python/sglang/bench_serving.py
@@ -1263,7 +1263,7 @@ async def benchmark(
    print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
    print(
        "{:<40} {:<10}".format(
-            "Max reqeuest concurrency:",
+            "Max request concurrency:",
            max_concurrency if max_concurrency else "not set",
        )
    )
--- a/python/sglang/compile_deep_gemm.py
+++ b/python/sglang/compile_deep_gemm.py
@@ -129,7 +129,7 @@ def launch_server_process_and_send_one_request(


 def refine_server_args(server_args: ServerArgs, compile_args: CompileArgs):
-    # Disbale cuda graph and torch compile to save time
+    # Disable cuda graph and torch compile to save time
    server_args.disable_cuda_graph = True
    server_args.enable_torch_compile = False
    print(f"Disable CUDA Graph and Torch Compile to save time...")
--- a/python/sglang/lang/tracer.py
+++ b/python/sglang/lang/tracer.py
@@ -38,7 +38,7 @@ def extract_prefix_by_tracing(program, backend):
        with TracingScope(tracer):
            tracer.ret_value = program.func(tracer, **arguments)
    except (StopTracing, TypeError, AttributeError):
-        # Some exceptions may not be catched
+        # Some exceptions may not be caught
        pass

    # Run and cache prefix
--- a/python/sglang/srt/code_completion_parser.py
+++ b/python/sglang/srt/code_completion_parser.py
@@ -27,7 +27,7 @@ completion_template_name = None


 class FimPosition:
-    """Postion of fim middle token."""
+    """Position of fim middle token."""

    MIDDLE = auto()
    END = auto()
--- a/python/sglang/srt/configs/deepseekvl2.py
+++ b/python/sglang/srt/configs/deepseekvl2.py
@@ -416,9 +416,9 @@ class DeepseekVLV2Processor(ProcessorMixin):
            h = w = math.ceil(
                (self.image_size // self.patch_size) / self.downsample_ratio
            )
-            # global views tokens h * (w + 1), 1 is for line seperator
+            # global views tokens h * (w + 1), 1 is for line separator
            tokenized_image = [self.image_token_id] * h * (w + 1)
-            # add a seperator between global and local views
+            # add a separator between global and local views
            tokenized_image += [self.image_token_id]
            # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
            tokenized_image += (
--- a/python/sglang/srt/disaggregation/decode.py
+++ b/python/sglang/srt/disaggregation/decode.py
@@ -509,7 +509,7 @@ class SchedulerDisaggregationDecodeMixin:
    def event_loop_overlap_disagg_decode(self: Scheduler):
        result_queue = deque()
        self.last_batch: Optional[ScheduleBatch] = None
-        self.last_batch_in_queue = False  # last batch is modifed in-place, so we need another variable to track if it's extend
+        self.last_batch_in_queue = False  # last batch is modified in-place, so we need another variable to track if it's extend

        while True:
            recv_reqs = self.recv_requests()
--- a/python/sglang/srt/disaggregation/fake/conn.py
+++ b/python/sglang/srt/disaggregation/fake/conn.py
@@ -54,7 +54,7 @@ class FakeKVSender(BaseKVSender):
            logger.info(f"FakeKVSender send success")
        else:
            self.has_sent = False
-            logger.info(f"FakeKVSender send fake transfering")
+            logger.info(f"FakeKVSender send fake transferring")

    def failure_exception(self):
        raise Exception("Fake KVSender Exception")
--- a/python/sglang/srt/disaggregation/mooncake/conn.py
+++ b/python/sglang/srt/disaggregation/mooncake/conn.py
@@ -363,7 +363,7 @@ class MooncakeKVManager(BaseKVManager):
        self.request_status[bootstrap_room] = KVPoll.WaitingForInput

    def check_status(self, bootstrap_room: int):
-        # TOOD: do we really need the poll()?
+        # TODO: do we really need the poll()?

        return self.request_status[bootstrap_room]

--- a/python/sglang/srt/disaggregation/utils.py
+++ b/python/sglang/srt/disaggregation/utils.py
@@ -112,7 +112,7 @@ def get_kv_class(transfer_backend: TransferBackend, class_type: KVClassType):


 def kv_to_page_indices(kv_indices: np.ndarray, page_size: int):
-    # 1. The page is guaruanteed to be full except the last page.
+    # 1. The page is guaranteed to be full except the last page.
    # 2. page index = kv_index // page_size
    # The return vector is kv_indices[::page_size] // page_size
    if page_size == 1:  # shortcut
--- a/python/sglang/srt/function_call_parser.py
+++ b/python/sglang/srt/function_call_parser.py
@@ -86,8 +86,8 @@ class StructureInfo:

 _GetInfoFunc = Callable[[str], StructureInfo]
 """
-helper alias of function
-ususally it is a function that takes a name string and returns a StructureInfo object,
+Helper alias of function
+Usually it is a function that takes a name string and returns a StructureInfo object,
 which can be used to construct a structural_tag object
 """

--- a/python/sglang/srt/layers/attention/flashattention_backend.py
+++ b/python/sglang/srt/layers/attention/flashattention_backend.py
@@ -308,7 +308,7 @@ class FlashAttentionBackend(AttentionBackend):
        ), "Sliding window and cross attention are not supported together"

        self.forward_metadata: FlashAttentionMetadata = None
-        # extra metdata for handling speculative decoding topk > 1, extended draft decode and verify
+        # extra metadata for handling speculative decoding topk > 1, extended draft decode and verify
        self.forward_metadata_spec_decode_expand: FlashAttentionMetadata = None
        self.max_context_len = model_runner.model_config.context_len
        self.device = model_runner.device
--- a/python/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py
+++ b/python/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py
@@ -919,7 +919,7 @@ def _fwd_kernel(

        e_max = n_e_max

-    # stage 2: compute the trianlge part
+    # stage 2: compute the triangle part

    cur_block_m_end = tl.minimum(cur_seq_len_extend, (cur_block_m + 1) * BLOCK_M)
    for start_n in range(0, cur_block_m_end, BLOCK_N):
--- a/python/sglang/srt/layers/dp_attention.py
+++ b/python/sglang/srt/layers/dp_attention.py
@@ -201,7 +201,7 @@ def _dp_gather(
            global_tokens, local_tokens, 0, local_start_pos, local_num_tokens, False
        )

-    # Input IDs are in int 32. We should use inplace_all_reduce for local case becaues of custom all reduce.
+    # Input IDs are in int 32. We should use inplace_all_reduce for local case because of custom all reduce.
    NUM_GPUS_PER_NODE = 8
    if (
        not local_tokens.dtype.is_floating_point
--- a/python/sglang/srt/layers/layernorm.py
+++ b/python/sglang/srt/layers/layernorm.py
@@ -76,7 +76,7 @@ class RMSNorm(CustomOp):
        residual: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        if not x.is_contiguous():
-            # NOTE: Romove this if aiter kernel supports discontinuous input
+            # NOTE: Remove this if aiter kernel supports discontinuous input
            x = x.contiguous()
        if residual is not None:
            fused_add_rms_norm(x, residual, self.weight.data, self.variance_epsilon)
--- a/python/sglang/srt/layers/moe/ep_moe/kernels.py
+++ b/python/sglang/srt/layers/moe/ep_moe/kernels.py
@@ -116,7 +116,7 @@ def deepep_run_moe_deep_preprocess(topk_ids: torch.Tensor, num_experts: int):
    seg_indptr = torch.empty(num_experts + 1, device=topk_ids.device, dtype=torch.int64)
    src2dst = torch.empty(topk_ids.numel(), device=topk_ids.device, dtype=torch.int64)

-    # Find offet
+    # Find offset
    expert_ids = torch.arange(
        num_experts + 1, device=topk_ids.device, dtype=reorder_topk_ids.dtype
    )
--- a/python/sglang/srt/layers/moe/ep_moe/layer.py
+++ b/python/sglang/srt/layers/moe/ep_moe/layer.py
@@ -611,7 +611,7 @@ class Fp8EPMoEMethod(Fp8MoEMethod):
                self.quant_config.weight_block_size[1],
            )
            # NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n.
-            # Required by collum parallel or enabling merged weights
+            # Required by column parallel or enabling merged weights
            if intermediate_size % block_n != 0:
                raise ValueError(
                    f"The output_size of gate's and up's weight = "
--- a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
@@ -994,7 +994,7 @@ def get_default_config(
                    "num_stages": 2 if _is_hip else 4,
                }
        else:
-            # Block-wise quant: BLOCK_SIZE_K must be divisable by block_shape[1]
+            # Block-wise quant: BLOCK_SIZE_K must be divisible by block_shape[1]
            config = {
                "BLOCK_SIZE_M": 64,
                "BLOCK_SIZE_N": block_shape[0],
--- a/python/sglang/srt/layers/moe/topk.py
+++ b/python/sglang/srt/layers/moe/topk.py
@@ -270,7 +270,7 @@ def select_experts(
    routed_scaling_factor: Optional[float] = None,
 ):
    n_share_experts_fusion = global_server_args_dict["n_share_experts_fusion"]
-    # DeekSeek V2/V3/R1 serices models uses grouped_top_k
+    # DeepSeek V2/V3/R1 series models use grouped_top_k
    if use_grouped_topk:
        assert topk_group is not None
        assert num_expert_group is not None
--- a/python/sglang/srt/layers/quantization/__init__.py
+++ b/python/sglang/srt/layers/quantization/__init__.py
@@ -109,7 +109,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
    if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
        raise ValueError(
            f"{quantization} quantization requires some operators from vllm. "
-            "Pleaes install vllm by `pip install vllm==0.8.4`"
+            "Please install vllm by `pip install vllm==0.8.4`"
        )

    return QUANTIZATION_METHODS[quantization]
--- a/python/sglang/srt/layers/quantization/blockwise_int8.py
+++ b/python/sglang/srt/layers/quantization/blockwise_int8.py
@@ -152,7 +152,7 @@ class BlockInt8LinearMethod(LinearMethodBase):
                    f"{input_size_per_partition} is not divisible by "
                    f"weight quantization block_k = {block_k}."
                )
-        # Required by collum parallel or enabling merged weights
+        # Required by column parallel or enabling merged weights
        if (tp_size > 1 and output_size // output_size_per_partition == tp_size) or len(
            output_partition_sizes
        ) > 1:
@@ -285,7 +285,7 @@ class BlockInt8MoEMethod:
            self.quant_config.weight_block_size[1],
        )
        # NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n.
-        # Required by collum parallel or enabling merged weights
+        # Required by column parallel or enabling merged weights
        if intermediate_size % block_n != 0:
            raise ValueError(
                f"The output_size of gate's and up's weight = "
--- a/python/sglang/srt/layers/quantization/deep_gemm.py
+++ b/python/sglang/srt/layers/quantization/deep_gemm.py
@@ -103,10 +103,10 @@ _INITIALIZATION_DICT: Dict[Tuple[DeepGemmKernelType, int, int, int], bool] = dic
 def _compile_warning_1():
    if not _IN_PRECOMPILE_STAGE and _IS_FIRST_RANK_ON_NODE:
        logger.warning(
-            "Entering DeepGEMM JIT Pre-Complie session. "
+            "Entering DeepGEMM JIT Pre-Compile session. "
            "And it may takes a long time(Typically 10-20 mins) "
            "if you have not run `sglang.compile_deep_gemm`. "
-            "Recommand to run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`"
+            "It is recommended to run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`"
            " for pre-compilation to reduce the overhead if you have not run it before. "
            "For example: "
            "`python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code`"
@@ -115,7 +115,7 @@ def _compile_warning_1():

 def _compile_warning_2():
    logger.warning(
-        "Entering DeepGEMM JIT Single Kernel Complie session. "
+        "Entering DeepGEMM JIT Single Kernel Compile session. "
        "And it will makes inference throughput becomes flaky. "
        "Please run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`"
        " for pre-compilation to solve this issue. "
@@ -298,7 +298,7 @@ def _maybe_compile_deep_gemm_one_type_all(
        logger.info(
            f"Try DeepGEMM JIT Compiling for "
            f"<{kernel_helper.name}> N={n}, K={k}, num_groups={num_groups} with all Ms."
-            f"{' It only takes a litte time (typically 1 sec) if you have run `python3 -m sglang.compile_deep_gemm`. ' if not _IN_PRECOMPILE_STAGE else ''}"
+            f"{' It only takes a little time (typically 1 sec) if you have run `python3 -m sglang.compile_deep_gemm`. ' if not _IN_PRECOMPILE_STAGE else ''}"
        )

        # NOTE(alcanderian): get_num_sms should be change when 2-batch-overlap is introduced
--- a/python/sglang/srt/layers/quantization/fp8.py
+++ b/python/sglang/srt/layers/quantization/fp8.py
@@ -235,7 +235,7 @@ class Fp8LinearMethod(LinearMethodBase):
                        f"{input_size_per_partition} is not divisible by "
                        f"weight quantization block_k = {block_k}."
                    )
-            # Required by collum parallel or enabling merged weights
+            # Required by column parallel or enabling merged weights
            if (
                tp_size > 1 and output_size // output_size_per_partition == tp_size
            ) or len(output_partition_sizes) > 1:
@@ -491,7 +491,7 @@ class Fp8MoEMethod:
                self.quant_config.weight_block_size[1],
            )
            # NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n.
-            # Required by collum parallel or enabling merged weights
+            # Required by column parallel or enabling merged weights
            if intermediate_size % block_n != 0:
                raise ValueError(
                    f"The output_size of gate's and up's weight = "
--- a/python/sglang/srt/layers/quantization/fp8_kernel.py
+++ b/python/sglang/srt/layers/quantization/fp8_kernel.py
@@ -104,7 +104,7 @@ def _per_token_group_quant_fp8(
    y_s_ptr,
    # Stride of input
    y_stride,
-    # Collums of input
+    # Columns of input
    N,
    # Avoid to divide zero
    eps,
@@ -342,7 +342,7 @@ def _static_quant_fp8(
    y_s_repeat_ptr,
    # Stride of input
    y_stride,
-    # Collums of input
+    # Columns of input
    N,
    # Information for float8
    fp8_min,
@@ -794,7 +794,7 @@ def w8a8_block_fp8_matmul(
            config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
        else:
            # Default config
-            # Block-wise quant: BLOCK_SIZE_K must be divisable by block_size[1]
+            # Block-wise quant: BLOCK_SIZE_K must be divisible by block_size[1]
            config = {
                "BLOCK_SIZE_M": 64,
                "BLOCK_SIZE_N": block_size[0],
--- a/python/sglang/srt/layers/quantization/int8_kernel.py
+++ b/python/sglang/srt/layers/quantization/int8_kernel.py
@@ -76,7 +76,7 @@ def _per_token_group_quant_int8(
    y_s_ptr,
    # Stride of input
    y_stride,
-    # Collums of input
+    # Columns of input
    N,
    # Avoid to divide zero
    eps,
@@ -370,7 +370,7 @@ def w8a8_block_int8_matmul(
        config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
    else:
        # Default config
-        # Block-wise quant: BLOCK_SIZE_K must be divisable by block_size[1]
+        # Block-wise quant: BLOCK_SIZE_K must be divisible by block_size[1]
        config = {
            "BLOCK_SIZE_M": 64,
            "BLOCK_SIZE_N": block_size[0],
--- a/python/sglang/srt/lora/lora_manager.py
+++ b/python/sglang/srt/lora/lora_manager.py
@@ -100,7 +100,7 @@ class LoRAManager:
            self.configs[name] = LoRAConfig(path)
            self.hf_target_names.update(self.configs[name].target_modules)

-        # Target lora weight names for lora_a and lora_b modules repectively.
+        # Target lora weight names for lora_a and lora_b modules respectively.
        # e.g., {("qkv_proj", "q_proj"), ("qkv_proj", "kv_proj")}
        self.lora_weight_names: Set[Tuple[str]] = set(
            [get_stacked_name(module) for module in self.hf_target_names]
--- a/python/sglang/srt/lora/mem_pool.py
+++ b/python/sglang/srt/lora/mem_pool.py
@@ -50,15 +50,15 @@ class LoRAMemoryPool:
        self.uid_to_buffer_id: Dict[Optional[str], int] = {}

        # Buffer idx -> lora uid in memory pool
-        # All uids are initalized as empty strings for empty buffer slots
-        # Here we don't initalize to None since None is a valid uid
+        # All uids are initialized as empty strings for empty buffer slots
+        # Here we don't initialize to None since None is a valid uid
        self.buffer_id_to_uid: List[Optional[str]] = [""] * self.max_loras_per_batch

    def get_lora_A_shape(
        self, module_name: str, base_model: torch.nn.Module
    ) -> Tuple[int]:
        """
-        Given a module_name (might be a stacked name), return the hidden dims of modules's input and output.
+        Given a module_name (might be a stacked name), return the hidden dims of the module's input and output.
        """
        input_dim, _ = get_hidden_dim(module_name, self.base_hf_config, base_model)
        c = get_stacked_multiply(module_name)
@@ -75,7 +75,7 @@ class LoRAMemoryPool:
        self, module_name: str, base_model: torch.nn.Module
    ) -> Tuple[int]:
        """
-        Given a module_name (might be a stacked name), return the hidden dims of modules's input and output.
+        Given a module_name (might be a stacked name), return the hidden dims of the module's input and output.
        """
        _, output_dim = get_hidden_dim(module_name, self.base_hf_config, base_model)
        c = get_stacked_multiply(module_name)
--- a/python/sglang/srt/lora/triton_ops/gate_up_lora_b.py
+++ b/python/sglang/srt/lora/triton_ops/gate_up_lora_b.py
@@ -77,7 +77,7 @@ def _gate_up_lora_b_kernel(
        k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
    )

-    # Iteate to compute the block in output matrix
+    # Iterate to compute the block in output matrix
    partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, BLOCK_K)):
        x_tile = tl.load(
--- a/python/sglang/srt/lora/triton_ops/qkv_lora_b.py
+++ b/python/sglang/srt/lora/triton_ops/qkv_lora_b.py
@@ -79,7 +79,7 @@ def _qkv_lora_b_kernel(
        k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
    )

-    # Iteate to compute the block in output matrix
+    # Iterate to compute the block in output matrix
    partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, BLOCK_K)):
        x_tile = tl.load(
--- a/python/sglang/srt/lora/triton_ops/sgemm_lora_a.py
+++ b/python/sglang/srt/lora/triton_ops/sgemm_lora_a.py
@@ -67,7 +67,7 @@ def _sgemm_lora_a_kernel(
        k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
    )

-    # Iteate to compute the block in output matrix
+    # Iterate to compute the block in output matrix
    partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, BLOCK_K)):
        x_tile = tl.load(
--- a/python/sglang/srt/lora/triton_ops/sgemm_lora_b.py
+++ b/python/sglang/srt/lora/triton_ops/sgemm_lora_b.py
@@ -69,7 +69,7 @@ def _sgemm_lora_b_kernel(
        k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
    )

-    # Iteate to compute the block in output matrix
+    # Iterate to compute the block in output matrix
    partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, BLOCK_K)):
        x_tile = tl.load(
--- a/python/sglang/srt/lora/utils.py
+++ b/python/sglang/srt/lora/utils.py
@@ -79,7 +79,7 @@ def get_hidden_dim(
    module_name: str, config: AutoConfig, base_model: torch.nn.Module
 ) -> Tuple[int]:
    """
-    Given a module_name (might be a stacked name), return the hidden dims of modules's input and output.
+    Given a module_name (might be a stacked name), return the hidden dims of the module's input and output.
    """

    if hasattr(base_model, "get_hidden_dim"):
--- a/python/sglang/srt/managers/data_parallel_controller.py
+++ b/python/sglang/srt/managers/data_parallel_controller.py
@@ -210,7 +210,7 @@ class DataParallelController:
                    )
                    # compute zmq ports for this dp rank
                    rank_port_args = PortArgs.init_new(server_args, dp_rank)
-                    # Data parallelism resues the tensor parallelism group,
+                    # Data parallelism reuses the tensor parallelism group,
                    # so all dp ranks should use the same nccl port.
                    rank_port_args.nccl_port = port_args.nccl_port

--- a/python/sglang/srt/managers/io_struct.py
+++ b/python/sglang/srt/managers/io_struct.py
@@ -12,7 +12,7 @@
 # limitations under the License.
 # ==============================================================================
 """
-The definition of objects transfered between different
+The definition of objects transferred between different
 processes (TokenizerManager, DetokenizerManager, Controller).
 """

--- a/python/sglang/srt/managers/mm_utils.py
+++ b/python/sglang/srt/managers/mm_utils.py
@@ -51,7 +51,7 @@ class MultiModalityDataPaddingPatternTokenPairs(MultiModalityDataPaddingPattern)
        self, input_ids: List[int], mm_inputs: MultimodalInputs
    ) -> List[int]:
        """
-        This function will replace the data-tokens inbetween with pad_values accordingly
+        This function will replace the data-tokens in between with pad_values accordingly
        """
        pad_values = [item.pad_value for item in mm_inputs.mm_items]
        data_token_pairs = self.data_token_id_pairs
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -879,7 +879,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
            error_msg = (
                f"{phase_str} out of memory. Try to lower your batch size.\n"
                f"Try to allocate {num_tokens} tokens.\n"
-                f"Avaliable tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
+                f"Available tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
            )
            logger.error(error_msg)
            if self.tree_cache is not None:
@@ -920,7 +920,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
            error_msg = (
                f"Prefill out of memory. Try to lower your batch size.\n"
                f"Try to allocate {extend_num_tokens} tokens.\n"
-                f"Avaliable tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
+                f"Available tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
                f"{self.token_to_kv_pool_allocator.available_size()=}\n"
                f"{self.tree_cache.evictable_size()=}\n"
            )
@@ -955,7 +955,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
            error_msg = (
                f"Decode out of memory. Try to lower your batch size.\n"
                f"Try to allocate {len(seq_lens)} tokens.\n"
-                f"Avaliable tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
+                f"Available tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
                f"{self.token_to_kv_pool_allocator.available_size()=}\n"
                f"{self.tree_cache.evictable_size()=}\n"
            )
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -1325,7 +1325,7 @@ class Scheduler(
            return None

        running_bs = len(self.running_batch.reqs)
-        # Igore the check if self.chunked_req is not None.
+        # Ignore the check if self.chunked_req is not None.
        # In the non-PP case, when self.chunked_req is not None, num_allocatable_reqs should always be greater than 0,
        # as the space for the chunked request has just been released.
        # In PP case, a chunked req can start in one microbatch and end in another microbatch, so the max_running_requests per microbatch should not be strict.
--- a/python/sglang/srt/managers/tokenizer_manager.py
+++ b/python/sglang/srt/managers/tokenizer_manager.py
@@ -1273,7 +1273,7 @@ class TokenizerManager:
            self.model_update_result.set_result(recv_obj)
        else:  # self.server_args.dp_size > 1
            self.model_update_tmp.append(recv_obj)
-            # set future if the all results are recevied
+            # set future if all the results are received
            if len(self.model_update_tmp) == self.server_args.dp_size:
                self.model_update_result.set_result(self.model_update_tmp)

--- a/python/sglang/srt/model_executor/cuda_graph_runner.py
+++ b/python/sglang/srt/model_executor/cuda_graph_runner.py
@@ -296,12 +296,12 @@ class CudaGraphRunner:
                self.capture()
        except RuntimeError as e:
            raise Exception(
-                f"Capture cuda graph failed: {e}\n"
+                f"Capture CUDA graph failed: {e}\n"
                "Possible solutions:\n"
                "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
                "2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
                "3. disable torch compile by not using --enable-torch-compile\n"
-                "4. disable cuda graph by --disable-cuda-graph. (Not recommonded. Huge perf loss)\n"
+                "4. disable CUDA graph by --disable-cuda-graph. (Not recommended. Huge performance loss)\n"
                "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
            )

--- a/python/sglang/srt/model_executor/forward_batch_info.py
+++ b/python/sglang/srt/model_executor/forward_batch_info.py
@@ -58,7 +58,7 @@ class ForwardMode(IntEnum):
    DECODE = auto()
    # Contains both EXTEND and DECODE when doing chunked prefill.
    MIXED = auto()
-    # No sequence to forward. For data parallel attention, some workers wil be IDLE if no sequence are allocated.
+    # No sequence to forward. For data parallel attention, some workers will be IDLE if no sequences are allocated.
    IDLE = auto()

    # Used in speculative decoding: verify a batch in the target model.
--- a/python/sglang/srt/models/deepseek_janus_pro.py
+++ b/python/sglang/srt/models/deepseek_janus_pro.py
@@ -188,7 +188,7 @@ def trunc_normal_tf_(
    best when :math:`a \\leq \text{mean} \\leq b`.
    NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
    bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
-    and the result is subsquently scaled and shifted by the mean and std args.
+    and the result is subsequently scaled and shifted by the mean and std args.
    Args:
        tensor: an n-dimensional `torch.Tensor`
        mean: the mean of the normal distribution
@@ -735,7 +735,7 @@ class VisionTransformer(nn.Module):
            img_size: Input image size.
            patch_size: Patch size.
            in_chans: Number of image input channels.
-            num_classes: Mumber of classes for classification head.
+            num_classes: Number of classes for classification head.
            global_pool: Type of global pooling for final sequence (default: 'token').
            embed_dim: Transformer embedding dimension.
            depth: Depth of transformer.
--- a/python/sglang/srt/models/deepseek_v2.py
+++ b/python/sglang/srt/models/deepseek_v2.py
@@ -1287,7 +1287,7 @@ class DeepseekV2DecoderLayer(nn.Module):
        # Fully Connected
        hidden_states = self.mlp(hidden_states)

-        # TODO(ch-wan): ues reduce-scatter in MLP to avoid this scatter
+        # TODO(ch-wan): use reduce-scatter in MLP to avoid this scatter
        # Scatter
        if self.dp_size != 1:
            # important: forward batch.gathered_buffer is used both after scatter and after gather.
@@ -1499,7 +1499,7 @@ class DeepseekV2ForCausalLM(nn.Module):
            else:
                assert (
                    self.n_share_experts_fusion == self.tp_size
-                ), f"Shared experts fusion optimization is enabled in DeepSeek V3/R1, set it to {self.tp_size} can get best optimized performace."
+                ), f"Shared experts fusion optimization is enabled in DeepSeek V3/R1, set it to {self.tp_size} can get best optimized performance."
        elif self.n_share_experts_fusion == 0:
            if (
                _is_cuda
@@ -1665,7 +1665,7 @@ class DeepseekV2ForCausalLM(nn.Module):
        if is_nextn:
            if hasattr(self.config, "num_nextn_predict_layers"):
                num_nextn_layers = self.config.num_nextn_predict_layers
-                assert num_nextn_layers == 1, "Only 1 nextn layer is supportted"
+                assert num_nextn_layers == 1, "Only 1 nextn layer is supported"
                # compatible with old design
                nextn_layer_id = (
                    0
--- a/python/sglang/srt/models/llama4.py
+++ b/python/sglang/srt/models/llama4.py
@@ -428,7 +428,7 @@ class Llama4DecoderLayer(nn.Module):
        # Fully Connected
        hidden_states = self.feed_forward(hidden_states, forward_batch)

-        # TODO(ch-wan): ues reduce-scatter in MLP to avoid this scatter
+        # TODO(ch-wan): use reduce-scatter in MLP to avoid this scatter
        # Scatter
        if self.dp_size != 1:
            # important: forward batch.gathered_buffer is used both after scatter and after gather.
--- a/python/sglang/srt/models/roberta.py
+++ b/python/sglang/srt/models/roberta.py
@@ -57,7 +57,7 @@ class RobertaEmbedding(nn.Module):
        input_shape = input_ids.size()
        inputs_embeds = self.word_embeddings(input_ids)

-        # adpated from vllm: https://github.com/vllm-project/vllm/commit/4a18fd14ba4a349291c798a16bf62fa8a9af0b6b/vllm/model_executor/models/roberta.py
+        # Adapted from vllm: https://github.com/vllm-project/vllm/commit/4a18fd14ba4a349291c798a16bf62fa8a9af0b6b/vllm/model_executor/models/roberta.py

        pos_list = []
        token_list = []
--- a/python/sglang/srt/models/torch_native_llama.py
+++ b/python/sglang/srt/models/torch_native_llama.py
@@ -37,7 +37,7 @@ $ python3 -m sglang.bench_one_batch --correct \
  --tensor-parallel-size 2 \
  --disable-cuda-graph
 ```
-We will eanble CUDA Graph support soon.
+We will enable CUDA Graph support soon.
 """

 import types
--- a/python/sglang/srt/openai_api/adapter.py
+++ b/python/sglang/srt/openai_api/adapter.py
@@ -590,7 +590,7 @@ def v1_generate_response(
    echo = False

    if (not isinstance(request, list)) and request.echo:
-        # TODO: handle the case propmt is token ids
+        # TODO: handle the case where the prompt is token ids
        if isinstance(request.prompt, list) and isinstance(request.prompt[0], str):
            # for the case of multiple str prompts
            prompts = request.prompt
@@ -646,7 +646,7 @@ def v1_generate_response(
        finish_reason = ret_item["meta_info"]["finish_reason"]

        if to_file:
-            # to make the choise data json serializable
+            # to make the choice data json serializable
            choice_data = {
                "index": 0,
                "text": text,
--- a/python/sglang/srt/reasoning_parser.py
+++ b/python/sglang/srt/reasoning_parser.py
@@ -147,7 +147,7 @@ class ReasoningParser:

    Args:
        model_type (str): Type of model to parse reasoning from
-        stream_reasoning (bool): If Flase, accumulates reasoning content until complete.
+        stream_reasoning (bool): If False, accumulates reasoning content until complete.
            If True, streams reasoning content as it arrives.
    """

--- a/python/sglang/srt/sampling/sampling_batch_info.py
+++ b/python/sglang/srt/sampling/sampling_batch_info.py
@@ -294,7 +294,7 @@ class SamplingBatchInfo:
            # Set the flag to True if any of the two has custom logit processor
            self.has_custom_logit_processor = True

-        # Note: becasue the __len()__ operator is defined on the temperatures tensor,
+        # Note: because the __len__() operator is defined on the temperatures tensor,
        # please make sure any merge operation with len(self) or len(other) is done before
        # the merge operation of the temperatures tensor below.
        for item in [
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -825,7 +825,7 @@ class ServerArgs:
        # Multi-node distributed serving
        parser.add_argument(
            "--dist-init-addr",
-            "--nccl-init-addr",  # For backward compatbility. This will be removed in the future.
+            "--nccl-init-addr",  # For backward compatibility. This will be removed in the future.
            type=str,
            help="The host address for initializing distributed backend (e.g., `192.168.0.2:25000`).",
        )
@@ -1096,7 +1096,7 @@ class ServerArgs:
        parser.add_argument(
            "--triton-attention-reduce-in-fp32",
            action="store_true",
-            help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16."
+            help="Cast the intermediate attention results to fp32 to avoid possible crashes related to fp16."
            "This only affects Triton attention kernels.",
        )
        parser.add_argument(
@@ -1188,7 +1188,7 @@ class ServerArgs:
            type=int,
            default=0,
            help="The number of shared_experts need to be replicated to fuse with normal experts in deepseek v3/r1, "
-            "set it to tp_size can get best optimized performace.",
+            "set it to tp_size can get best optimized performance.",
        )
        parser.add_argument(
            "--disable-chunked-prefix-cache",
--- a/python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py
+++ b/python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py
@@ -82,12 +82,12 @@ class EAGLEDraftCudaGraphRunner:
            self.capture()
        except RuntimeError as e:
            raise Exception(
-                f"Capture cuda graph failed: {e}\n"
+                f"Capture CUDA graph failed: {e}\n"
                "Possible solutions:\n"
                "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
                "2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
                "3. disable torch compile by not using --enable-torch-compile\n"
-                "4. disable cuda graph by --disable-cuda-graph. (Not recommonded. Huge perf loss)\n"
+                "4. disable CUDA graph by --disable-cuda-graph. (Not recommended. Huge performance loss)\n"
                "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
            )

@@ -149,7 +149,7 @@ class EAGLEDraftCudaGraphRunner:

        # Run and capture
        def run_once():
-            # Backup two fileds, which will be modified in-place in `draft_forward`.
+            # Backup two fields, which will be modified in-place in `draft_forward`.
            output_cache_loc_backup = forward_batch.out_cache_loc
            hidden_states_backup = forward_batch.spec_info.hidden_states

--- a/python/sglang/srt/speculative/eagle_utils.py
+++ b/python/sglang/srt/speculative/eagle_utils.py
@@ -167,12 +167,12 @@ class EagleVerifyOutput:
    draft_input: EagleDraftInput
    # Logit outputs from target worker
    logits_output: LogitsProcessorOutput
-    # Accepeted token ids including the bonus token
+    # Accepted token ids including the bonus token
    verified_id: torch.Tensor
-    # Accepeted token length per sequence in a batch in CPU.
+    # Accepted token length per sequence in a batch in CPU.
    accept_length_per_req_cpu: List[int]
-    # Accepeted indices from logits_output.next_token_logits
-    accepeted_indices: torch.Tensor
+    # Accepted indices from logits_output.next_token_logits
+    accepted_indices: torch.Tensor


@dataclass
@@ -316,7 +316,7 @@ class EagleVerifyInput:

        This API updates values inside logits_output based on the accepted
        tokens. I.e., logits_output.next_token_logits only contains
-        accepeted token logits.
+        accepted token logits.
        """
        bs = self.retrive_index.shape[0]
        candidates = self.draft_token.reshape(bs, self.draft_token_num)
@@ -493,7 +493,7 @@ class EagleVerifyInput:
                logits_output=logits_output,
                verified_id=verified_id,
                accept_length_per_req_cpu=accept_length_cpu,
-                accepeted_indices=accept_index,
+                accepted_indices=accept_index,
            )
        else:
            assign_req_to_token_pool[(bs,)](
@@ -539,7 +539,7 @@ class EagleVerifyInput:
                logits_output=logits_output,
                verified_id=verified_id,
                accept_length_per_req_cpu=accept_length_cpu,
-                accepeted_indices=accept_index,
+                accepted_indices=accept_index,
            )


--- a/python/sglang/srt/speculative/eagle_worker.py
+++ b/python/sglang/srt/speculative/eagle_worker.py
@@ -201,7 +201,7 @@ class EAGLEWorker(TpModelWorker):
            self.has_prefill_wrapper_verify = False
        else:
            raise ValueError(
-                f"EAGLE is not supportted in attention backend {self.server_args.attention_backend}"
+                f"EAGLE is not supported in attention backend {self.server_args.attention_backend}"
            )

        self.draft_model_runner.draft_attn_backend = self.draft_attn_backend
@@ -245,8 +245,8 @@ class EAGLEWorker(TpModelWorker):
        Args:
            batch: The batch to run forward. The state of the batch is modified as it runs.
        Returns:
-            A tuple of the final logit output of the target model, next tokens accepeted,
-            the batch id (used for overlap schedule), and number of accepeted tokens.
+            A tuple of the final logit output of the target model, next tokens accepted,
+            the batch id (used for overlap schedule), and number of accepted tokens.
        """
        if batch.forward_mode.is_decode():
            with self.draft_tp_context(self.draft_model_runner.tp_group):
@@ -491,11 +491,11 @@ class EAGLEWorker(TpModelWorker):
        )

        # Post process based on verified outputs.
-        # Pick indices that we care (accepeted)
+        # Pick indices that we care about (accepted)
        logits_output.next_token_logits = logits_output.next_token_logits[
-            res.accepeted_indices
+            res.accepted_indices
        ]
-        logits_output.hidden_states = logits_output.hidden_states[res.accepeted_indices]
+        logits_output.hidden_states = logits_output.hidden_states[res.accepted_indices]

        # Prepare the batch for the next draft forwards.
        batch.forward_mode = ForwardMode.DECODE
@@ -597,7 +597,7 @@ class EAGLEWorker(TpModelWorker):
        self.capture_for_decode(logits_output, forward_batch.spec_info)

    def forward_draft_extend_after_decode(self, batch: ScheduleBatch):
-        # Backup fileds that will be modified in-place
+        # Backup fields that will be modified in-place
        seq_lens_backup = batch.seq_lens.clone()
        req_pool_indices_backup = batch.req_pool_indices
        accept_length_backup = batch.spec_info.accept_length
--- a/python/sglang/test/simple_eval_common.py
+++ b/python/sglang/test/simple_eval_common.py
@@ -140,7 +140,7 @@ class ChatCompletionSampler(SamplerBase):
                    max_tokens=self.max_tokens,
                )
                return response.choices[0].message.content
-            # NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are reruning MMMU
+            # NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are rerunning MMMU
            except openai.BadRequestError as e:
                print("Bad Request Error", e)
                return ""
--- a/python/sglang/test/simple_eval_humaneval.py
+++ b/python/sglang/test/simple_eval_humaneval.py
@@ -121,7 +121,7 @@ class HumanEval(Eval):
                convo=convo,
                metrics={
                    f"pass@{k}": estimate_pass_at_k([total], [correct], k)
-                    # this will be aggrated so no need of .mean()
+                    # this will be aggregated so no need of .mean()
                    for k in self._ks_passes
                    if total >= k
                },
--- a/python/sglang/test/test_programs.py
+++ b/python/sglang/test/test_programs.py
@@ -370,7 +370,7 @@ def test_dtype_gen():
    @sgl.function
    def dtype_gen(s):
        s += "Q: What is the full name of DNS?\n"
-        s += "A: The full nams is " + sgl.gen("str_res", dtype=str, stop="\n") + "\n"
+        s += "A: The full name is " + sgl.gen("str_res", dtype=str, stop="\n") + "\n"
        s += "Q: Which year was DNS invented?\n"
        s += "A: " + sgl.gen("int_res", dtype=int) + "\n"
        s += "Q: What is the value of pi?\n"
--- a/python/sglang/utils.py
+++ b/python/sglang/utils.py
@@ -278,7 +278,7 @@ def graceful_registry(sub_module_name: str):
            f"{sub_module_name} Received signal to shutdown. Performing graceful shutdown..."
        )
        if signum == signal.SIGTERM:
-            logger.info(f"{sub_module_name} recive sigterm")
+            logger.info(f"{sub_module_name} receive sigterm")

    signal.signal(signal.SIGTERM, graceful_shutdown)