diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index eb3073577..6b02e38c4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,6 +33,12 @@ repos: rev: 24.10.0 hooks: - id: black-jupyter + - repo: https://github.com/codespell-project/codespell + rev: v2.4.1 + hooks: + - id: codespell + additional_dependencies: ['tomli'] + args: ['--toml', 'python/pyproject.toml'] - repo: https://github.com/pre-commit/mirrors-clang-format rev: v18.1.8 hooks: diff --git a/3rdparty/amd/tuning/TUNING.md b/3rdparty/amd/tuning/TUNING.md index 0638041c9..e7b9b2049 100644 --- a/3rdparty/amd/tuning/TUNING.md +++ b/3rdparty/amd/tuning/TUNING.md @@ -104,7 +104,7 @@ To maximize moe kernel efficiency, need to use below scripts to find out the bes ```bash #Tuning -#for example, we have one case like this "python3 -m sglang.bench_latency --model dummy_grok1/ --load-format dummy --tokenizer-path Xenova/grok-1-tokenizer --tp 8 --batch-size 32 --input 1024 --output 8 --attention-backend triton --sampling-backend pytorch --quantization fp8" to run, it defined batch-size 32 input lenth 1024 and output length 8, from "--batch" in moe view point, the prefill batch is 32*1024 = 32768, the decode batch is 32*1(only one output token generated in each run). +#for example, we have one case like this "python3 -m sglang.bench_latency --model dummy_grok1/ --load-format dummy --tokenizer-path Xenova/grok-1-tokenizer --tp 8 --batch-size 32 --input 1024 --output 8 --attention-backend triton --sampling-backend pytorch --quantization fp8" to run, it defined batch-size 32 input length 1024 and output length 8, from "--batch" in moe view point, the prefill batch is 32*1024 = 32768, the decode batch is 32*1(only one output token generated in each run). 
#so we can tune decode moe use below command python benchmark_moe_rocm.py --model grok1 --tp-size 8 --dtype float8 --batch "32" # and use this command to tune prefill moe diff --git a/benchmark/hicache/bench_serving.py b/benchmark/hicache/bench_serving.py index 90f4c6c5f..e38d0d0ea 100644 --- a/benchmark/hicache/bench_serving.py +++ b/benchmark/hicache/bench_serving.py @@ -267,7 +267,7 @@ async def get_requests( try: request = await asyncio.wait_for( input_requests_queue.get(), timeout=300 - ) # Wait for 5 minites then abort + ) # Wait for 5 minutes then abort except Exception as e: print(f"exception: {e}") break @@ -514,7 +514,7 @@ async def benchmark( print("{:<40} {:<10}".format("Traffic request rate:", request_rate)) print( "{:<40} {:<10}".format( - "Max reqeuest concurrency:", + "Max request concurrency:", max_concurrency if max_concurrency else "not set", ) ) diff --git a/benchmark/json_schema/bench_sglang.py b/benchmark/json_schema/bench_sglang.py index 5a5fd66c6..4693baae3 100644 --- a/benchmark/json_schema/bench_sglang.py +++ b/benchmark/json_schema/bench_sglang.py @@ -95,7 +95,7 @@ def bench_schema(args): latency = time.time() - tic # Check if the outputs are valid - indexs = [] + indexes = [] for i, state in enumerate(states): try: schema = json.loads(arguments[i]["json_schema"]) @@ -103,7 +103,7 @@ def bench_schema(args): assert jsonschema.validate(obj, schema) is None except Exception as e: print(e) - indexs.append(i) + indexes.append(i) return states, latency diff --git a/benchmark/line_retrieval/gen_data.py b/benchmark/line_retrieval/gen_data.py index c88ecba49..a01d40a06 100644 --- a/benchmark/line_retrieval/gen_data.py +++ b/benchmark/line_retrieval/gen_data.py @@ -15,7 +15,7 @@ from tqdm import tqdm def generate_lines(random_words, num_lines, redirect_ratio): prefix = "Here is a list of lines, each with its corresponding REGISTER_CONTENT value. Please memorize them. 
Be prepared to provide the REGISTER_CONTENT value for a specific line index when I ask." - suffix = "The list has ended. Please give the final REGISTER_CONTENT value for a specific line after resovling the redirections and references. For example, the REGISTER_CONTENT of Line __idx0__ is __val0__. The REGISTER_CONTENT of Line __idx1__ is __val1__. The REGISTER_CONTENT of Line __idx2__ is __val2__. The REGISTER_CONTENT of Line ??? is" + suffix = "The list has ended. Please give the final REGISTER_CONTENT value for a specific line after resolving the redirections and references. For example, the REGISTER_CONTENT of Line __idx0__ is __val0__. The REGISTER_CONTENT of Line __idx1__ is __val1__. The REGISTER_CONTENT of Line __idx2__ is __val2__. The REGISTER_CONTENT of Line ??? is" # Raw lines visited_indices = set([None]) diff --git a/benchmark/multi_document_qa/bench_other.py b/benchmark/multi_document_qa/bench_other.py index 97ff41686..6f0addcb7 100644 --- a/benchmark/multi_document_qa/bench_other.py +++ b/benchmark/multi_document_qa/bench_other.py @@ -17,7 +17,7 @@ ASSISTANT_SUFFIX = " " def multi_document_qa(docs, question, generate): s = USER_PREFIX - s += "Pleaes answer a question according to given documents.\n" + s += "Please answer a question according to given documents.\n" s += "Question:" + question + "Documents begin.\n" s += "".join(docs) diff --git a/benchmark/multi_document_qa/bench_sglang.py b/benchmark/multi_document_qa/bench_sglang.py index 2c2db4f21..645520166 100644 --- a/benchmark/multi_document_qa/bench_sglang.py +++ b/benchmark/multi_document_qa/bench_sglang.py @@ -13,7 +13,7 @@ from sglang.utils import dump_state_text, read_jsonl @sgl.function def multi_document_qa(s, docs, question): s += sgl.user_begin() - s += "Pleaes answer a question according to given documents.\n" + s += "Please answer a question according to given documents.\n" s += "Question:" + question + "Documents begin.\n" forks = s.fork(len(docs)) diff --git 
a/docs/backend/function_calling.ipynb b/docs/backend/function_calling.ipynb index 26a0024fa..18ee1a431 100644 --- a/docs/backend/function_calling.ipynb +++ b/docs/backend/function_calling.ipynb @@ -6,7 +6,7 @@ "source": [ "# Tool and Function Calling\n", "\n", - "This guide demonstrates how to use SGLang’s [Funcion calling](https://platform.openai.com/docs/guides/function-calling) functionality." + "This guide demonstrates how to use SGLang’s [Function calling](https://platform.openai.com/docs/guides/function-calling) functionality." ] }, { @@ -399,7 +399,7 @@ " },\n", "}\n", "gen_response = requests.post(gen_url, json=gen_data).json()[\"text\"]\n", - "print_highlight(\"==== Reponse ====\")\n", + "print_highlight(\"==== Response ====\")\n", "print(gen_response)\n", "\n", "# parse the response\n", diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb index 5424e45a0..dbc24f2cc 100644 --- a/docs/backend/openai_api_completions.ipynb +++ b/docs/backend/openai_api_completions.ipynb @@ -275,7 +275,7 @@ "source": [ "## Structured Outputs (JSON, Regex, EBNF)\n", "\n", - "For OpenAI compatible structed outputs API, refer to [Structured Outputs](https://docs.sglang.ai/backend/structured_outputs.html#OpenAI-Compatible-API) for more details.\n" + "For OpenAI compatible structured outputs API, refer to [Structured Outputs](https://docs.sglang.ai/backend/structured_outputs.html#OpenAI-Compatible-API) for more details.\n" ] }, { diff --git a/docs/backend/sampling_params.md b/docs/backend/sampling_params.md index 736d67e05..ff1bf4316 100644 --- a/docs/backend/sampling_params.md +++ b/docs/backend/sampling_params.md @@ -40,7 +40,7 @@ The `/generate` endpoint accepts the following parameters in JSON format. 
For de | Argument | Type/Default | Description | |--------------------|------------------------|------------------------------------------------------------------------------------------------------------------------------------------------| | frequency_penalty | `float = 0.0` | Penalizes tokens based on their frequency in generation so far. Must be between `-2` and `2` where negative numbers encourage repeatment of tokens and positive number encourages sampling of new tokens. The scaling of penalization grows linearly with each appearance of a token. | -| presence_penalty | `float = 0.0` | Penalizes tokens if they appeared in the generation so far. Must be between `-2` and `2` where negative numbers encourage repeatment of tokens and positive number encourages sampling of new tokens. The scaling of the penalization is constant if a token occured. | +| presence_penalty | `float = 0.0` | Penalizes tokens if they appeared in the generation so far. Must be between `-2` and `2` where negative numbers encourage repeatment of tokens and positive number encourages sampling of new tokens. The scaling of the penalization is constant if a token occurred. | | min_new_tokens | `int = 0` | Forces the model to generate at least `min_new_tokens` until a stop word or EOS token is sampled. Note that this might lead to unintended behavior, for example, if the distribution is highly skewed towards these tokens. | ### Constrained decoding diff --git a/docs/backend/send_request.ipynb b/docs/backend/send_request.ipynb index 455c6431b..082e32b56 100644 --- a/docs/backend/send_request.ipynb +++ b/docs/backend/send_request.ipynb @@ -166,7 +166,7 @@ "source": [ "## Using Native Generation APIs\n", "\n", - "You can also use the native `/generate` endpoint with requests, which provides more flexiblity. An API reference is available at [Sampling Parameters](sampling_params.md)." + "You can also use the native `/generate` endpoint with requests, which provides more flexibility. 
An API reference is available at [Sampling Parameters](sampling_params.md)." ] }, { diff --git a/docs/backend/separate_reasoning.ipynb b/docs/backend/separate_reasoning.ipynb index 756ecbaa9..6048c6642 100644 --- a/docs/backend/separate_reasoning.ipynb +++ b/docs/backend/separate_reasoning.ipynb @@ -378,7 +378,7 @@ "\n", " Args:\n", " model_type (str): Type of model to parse reasoning from\n", - " stream_reasoning (bool): If Flase, accumulates reasoning content until complete.\n", + " stream_reasoning (bool): If False, accumulates reasoning content until complete.\n", " If True, streams reasoning content as it arrives.\n", " \"\"\"\n", "\n", diff --git a/docs/backend/speculative_decoding.ipynb b/docs/backend/speculative_decoding.ipynb index 2958b265c..7b68d1d26 100644 --- a/docs/backend/speculative_decoding.ipynb +++ b/docs/backend/speculative_decoding.ipynb @@ -11,7 +11,7 @@ "\n", "### Performance Highlights\n", "\n", - "Please see below for the huge improvements on throughput for LLaMA-Instruct 3.1 8B tested on MT bench that can be archieved via EAGLE3 decoding.\n", + "Please see below for the huge improvements on throughput for LLaMA-Instruct 3.1 8B tested on MT bench that can be achieved via EAGLE3 decoding.\n", "For further details please see the [EAGLE3 paper](https://arxiv.org/pdf/2503.01840).\n", "\n", "| Method | Throughput (tokens/s) |\n", @@ -296,7 +296,7 @@ "- EAGLE-2 additionally uses the draft model to evaluate how probable certain branches in the draft tree are, dynamically stopping the expansion of unlikely branches. 
After the expansion phase, reranking is employed to select only the top `speculative_num_draft_tokens` final nodes as draft tokens.\n", "- EAGLE-3 removes the feature prediction objective, incorporates low and mid-layer features, and is trained in an on-policy manner.\n", "\n", - "This enhances drafting accuracy by operating on the features instead of tokens for more regular inputs and passing the tokens from the next timestep additionaly to minimize randomness effects from sampling. Furthermore the dynamic adjustment of the draft tree and selection of reranked final nodes increases acceptance rate of draft tokens further. For more details see [EAGLE-2](https://arxiv.org/abs/2406.16858) and [EAGLE-3](https://arxiv.org/abs/2503.01840) paper.\n", + "This enhances drafting accuracy by operating on the features instead of tokens for more regular inputs and passing the tokens from the next timestep additionally to minimize randomness effects from sampling. Furthermore the dynamic adjustment of the draft tree and selection of reranked final nodes increases acceptance rate of draft tokens further. For more details see [EAGLE-2](https://arxiv.org/abs/2406.16858) and [EAGLE-3](https://arxiv.org/abs/2503.01840) paper.\n", "\n", "\n", "For guidance how to train your own EAGLE model please see the [EAGLE repo](https://github.com/SafeAILab/EAGLE/tree/main?tab=readme-ov-file#train)." diff --git a/docs/developer/development_guide_using_docker.md b/docs/developer/development_guide_using_docker.md index 8ceaf479e..e38947902 100644 --- a/docs/developer/development_guide_using_docker.md +++ b/docs/developer/development_guide_using_docker.md @@ -52,7 +52,7 @@ docker run -itd --shm-size 32g --gpus all -v --ipc=host --net docker exec -it sglang_dev /bin/zsh ``` Some useful volumes to mount are: -1. **Huggingface model cache**: mounting model cache can avoid re-download everytime docker restarts. Default location on Linux is `~/.cache/huggingface/`. +1. 
**Huggingface model cache**: mounting model cache can avoid re-download every time docker restarts. Default location on Linux is `~/.cache/huggingface/`. 2. **SGLang repository**: code changes in the SGLang local repository will be automatically synced to the .devcontainer. Example 1: Monting local cache folder `/opt/dlami/nvme/.cache` but not the SGLang repo. Use this when you prefer to manually transfer local code changes to the devcontainer. diff --git a/docs/developer/setup_github_runner.md b/docs/developer/setup_github_runner.md index d0144c8b1..b05adc4eb 100644 --- a/docs/developer/setup_github_runner.md +++ b/docs/developer/setup_github_runner.md @@ -29,7 +29,7 @@ Then follow https://github.com/sgl-project/sglang/settings/actions/runners/new?a **Notes** - Do not need to specify the runner group -- Give it a name (e.g., `test-sgl-gpu-0`) and some labels (e.g., `1-gpu-runner`). The labels can be editted later in Github Settings. +- Give it a name (e.g., `test-sgl-gpu-0`) and some labels (e.g., `1-gpu-runner`). The labels can be edited later in Github Settings. - Do not need to change the work folder. ### Step 3: Run the runner by `run.sh` diff --git a/docs/router/router.md b/docs/router/router.md index 26fa94241..8267007e1 100644 --- a/docs/router/router.md +++ b/docs/router/router.md @@ -32,7 +32,7 @@ python -m sglang_router.launch_server --model-path meta-llama/Meta-Llama-3.1-8B- After the server is ready, you can directly send requests to the router as the same way as sending requests to each single worker. -Please adjust the batchsize accordingly to archieve maximum throughput. +Please adjust the batchsize accordingly to achieve maximum throughput. 
```python import requests diff --git a/examples/frontend_language/usage/rag_using_parea/trace_and_evaluate_rag_using_parea.ipynb b/examples/frontend_language/usage/rag_using_parea/trace_and_evaluate_rag_using_parea.ipynb index 3c1b2a6c4..f309142ad 100644 --- a/examples/frontend_language/usage/rag_using_parea/trace_and_evaluate_rag_using_parea.ipynb +++ b/examples/frontend_language/usage/rag_using_parea/trace_and_evaluate_rag_using_parea.ipynb @@ -375,7 +375,7 @@ "\n", "When opening above experiment, we will see an overview of the experiment as shown below. The upper half shows a summary of the statistics on the left and charts to investigate the distribution and relationships of scores on the right. The lower half is a table with the individual traces which we can use to debug individual samples.\n", "\n", - "When looking at the statistics, we can see that the accuracy of our RAG pipeline is 22% as measured by `answer_matches_target_llm_grader`. Though when checking the quality of our retrieval step (`context_query_relevancy`), we can see that our retrival step is fetching relevant information in only 27% of all samples. As shown in the GIF, we investigate the relationship between the two and see the two scores have 95% agreement. This confirms that the retrieval step is a major bottleneck for our RAG pipeline. So, now it's your turn to improve the retrieval step!\n", + "When looking at the statistics, we can see that the accuracy of our RAG pipeline is 22% as measured by `answer_matches_target_llm_grader`. Though when checking the quality of our retrieval step (`context_query_relevancy`), we can see that our retrieval step is fetching relevant information in only 27% of all samples. As shown in the GIF, we investigate the relationship between the two and see the two scores have 95% agreement. This confirms that the retrieval step is a major bottleneck for our RAG pipeline. 
So, now it's your turn to improve the retrieval step!\n", "\n", "Note, above link isn't publicly accessible but the experiment can be accessed through [here](https://app.parea.ai/public-experiments/parea/rag_sglang/30f0244a-d56c-44ff-bdfb-8f47626304b6).\n", "\n", diff --git a/python/pyproject.toml b/python/pyproject.toml index dd166f7ba..2e6105650 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -147,3 +147,7 @@ exclude = [ "scripts*", "tests*", ] + +[tool.codespell] +ignore-words-list = "ans, als, hel, boostrap, childs, te, vas, hsa, ment" +skip = "*.json,*.jsonl,*.patch,*.txt" diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index 69bbc3e4d..d88d535fc 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -315,7 +315,7 @@ def throughput_test( tokenizer_id = server_args.tokenizer_path or server_args.model_path tokenizer = get_tokenizer(tokenizer_id) - # Set global environmnets + # Set global environments set_ulimit() random.seed(bench_args.seed) np.random.seed(bench_args.seed) diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index 922ba1e39..d6133d437 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -1263,7 +1263,7 @@ async def benchmark( print("{:<40} {:<10}".format("Traffic request rate:", request_rate)) print( "{:<40} {:<10}".format( - "Max reqeuest concurrency:", + "Max request concurrency:", max_concurrency if max_concurrency else "not set", ) ) diff --git a/python/sglang/compile_deep_gemm.py b/python/sglang/compile_deep_gemm.py index e78714023..84b52962f 100644 --- a/python/sglang/compile_deep_gemm.py +++ b/python/sglang/compile_deep_gemm.py @@ -129,7 +129,7 @@ def launch_server_process_and_send_one_request( def refine_server_args(server_args: ServerArgs, compile_args: CompileArgs): - # Disbale cuda graph and torch compile to save time + # Disable cuda graph and torch compile to 
save time server_args.disable_cuda_graph = True server_args.enable_torch_compile = False print(f"Disable CUDA Graph and Torch Compile to save time...") diff --git a/python/sglang/lang/tracer.py b/python/sglang/lang/tracer.py index 7b3c72804..0a2a744f9 100644 --- a/python/sglang/lang/tracer.py +++ b/python/sglang/lang/tracer.py @@ -38,7 +38,7 @@ def extract_prefix_by_tracing(program, backend): with TracingScope(tracer): tracer.ret_value = program.func(tracer, **arguments) except (StopTracing, TypeError, AttributeError): - # Some exceptions may not be catched + # Some exceptions may not be caught pass # Run and cache prefix diff --git a/python/sglang/srt/code_completion_parser.py b/python/sglang/srt/code_completion_parser.py index 81cdb4a36..4a94565a2 100644 --- a/python/sglang/srt/code_completion_parser.py +++ b/python/sglang/srt/code_completion_parser.py @@ -27,7 +27,7 @@ completion_template_name = None class FimPosition: - """Postion of fim middle token.""" + """Position of fim middle token.""" MIDDLE = auto() END = auto() diff --git a/python/sglang/srt/configs/deepseekvl2.py b/python/sglang/srt/configs/deepseekvl2.py index 961995410..29fc49696 100644 --- a/python/sglang/srt/configs/deepseekvl2.py +++ b/python/sglang/srt/configs/deepseekvl2.py @@ -416,9 +416,9 @@ class DeepseekVLV2Processor(ProcessorMixin): h = w = math.ceil( (self.image_size // self.patch_size) / self.downsample_ratio ) - # global views tokens h * (w + 1), 1 is for line seperator + # global views tokens h * (w + 1), 1 is for line separator tokenized_image = [self.image_token_id] * h * (w + 1) - # add a seperator between global and local views + # add a separator between global and local views tokenized_image += [self.image_token_id] # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1) tokenized_image += ( diff --git a/python/sglang/srt/disaggregation/decode.py b/python/sglang/srt/disaggregation/decode.py index 0e64c743b..4499afcfe 100644 --- 
a/python/sglang/srt/disaggregation/decode.py +++ b/python/sglang/srt/disaggregation/decode.py @@ -509,7 +509,7 @@ class SchedulerDisaggregationDecodeMixin: def event_loop_overlap_disagg_decode(self: Scheduler): result_queue = deque() self.last_batch: Optional[ScheduleBatch] = None - self.last_batch_in_queue = False # last batch is modifed in-place, so we need another variable to track if it's extend + self.last_batch_in_queue = False # last batch is modified in-place, so we need another variable to track if it's extend while True: recv_reqs = self.recv_requests() diff --git a/python/sglang/srt/disaggregation/fake/conn.py b/python/sglang/srt/disaggregation/fake/conn.py index f65289f44..1e2bd4461 100644 --- a/python/sglang/srt/disaggregation/fake/conn.py +++ b/python/sglang/srt/disaggregation/fake/conn.py @@ -54,7 +54,7 @@ class FakeKVSender(BaseKVSender): logger.info(f"FakeKVSender send success") else: self.has_sent = False - logger.info(f"FakeKVSender send fake transfering") + logger.info(f"FakeKVSender send fake transferring") def failure_exception(self): raise Exception("Fake KVSender Exception") diff --git a/python/sglang/srt/disaggregation/mooncake/conn.py b/python/sglang/srt/disaggregation/mooncake/conn.py index 40b083bf0..4cf1ad9f1 100644 --- a/python/sglang/srt/disaggregation/mooncake/conn.py +++ b/python/sglang/srt/disaggregation/mooncake/conn.py @@ -363,7 +363,7 @@ class MooncakeKVManager(BaseKVManager): self.request_status[bootstrap_room] = KVPoll.WaitingForInput def check_status(self, bootstrap_room: int): - # TOOD: do we really need the poll()? + # TODO: do we really need the poll()? 
return self.request_status[bootstrap_room] diff --git a/python/sglang/srt/disaggregation/utils.py b/python/sglang/srt/disaggregation/utils.py index 90fd6034b..40d63d6a3 100644 --- a/python/sglang/srt/disaggregation/utils.py +++ b/python/sglang/srt/disaggregation/utils.py @@ -112,7 +112,7 @@ def get_kv_class(transfer_backend: TransferBackend, class_type: KVClassType): def kv_to_page_indices(kv_indices: np.ndarray, page_size: int): - # 1. The page is guaruanteed to be full except the last page. + # 1. The page is guaranteed to be full except the last page. # 2. page index = kv_index // page_size # The return vector is kv_indices[::page_size] // page_size if page_size == 1: # shortcut diff --git a/python/sglang/srt/function_call_parser.py b/python/sglang/srt/function_call_parser.py index ef7b51058..549843146 100644 --- a/python/sglang/srt/function_call_parser.py +++ b/python/sglang/srt/function_call_parser.py @@ -86,8 +86,8 @@ class StructureInfo: _GetInfoFunc = Callable[[str], StructureInfo] """ -helper alias of function -ususally it is a function that takes a name string and returns a StructureInfo object, +Helper alias of function +Usually it is a function that takes a name string and returns a StructureInfo object, which can be used to construct a structural_tag object """ diff --git a/python/sglang/srt/layers/attention/flashattention_backend.py b/python/sglang/srt/layers/attention/flashattention_backend.py index ff6d81c85..f200a367b 100644 --- a/python/sglang/srt/layers/attention/flashattention_backend.py +++ b/python/sglang/srt/layers/attention/flashattention_backend.py @@ -308,7 +308,7 @@ class FlashAttentionBackend(AttentionBackend): ), "Sliding window and cross attention are not supported together" self.forward_metadata: FlashAttentionMetadata = None - # extra metdata for handling speculative decoding topk > 1, extended draft decode and verify + # extra metadata for handling speculative decoding topk > 1, extended draft decode and verify 
self.forward_metadata_spec_decode_expand: FlashAttentionMetadata = None self.max_context_len = model_runner.model_config.context_len self.device = model_runner.device diff --git a/python/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py b/python/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py index 6b6ede927..72e0bfe78 100644 --- a/python/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +++ b/python/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py @@ -919,7 +919,7 @@ def _fwd_kernel( e_max = n_e_max - # stage 2: compute the trianlge part + # stage 2: compute the triangle part cur_block_m_end = tl.minimum(cur_seq_len_extend, (cur_block_m + 1) * BLOCK_M) for start_n in range(0, cur_block_m_end, BLOCK_N): diff --git a/python/sglang/srt/layers/dp_attention.py b/python/sglang/srt/layers/dp_attention.py index 69f94407c..2cc399ab7 100644 --- a/python/sglang/srt/layers/dp_attention.py +++ b/python/sglang/srt/layers/dp_attention.py @@ -201,7 +201,7 @@ def _dp_gather( global_tokens, local_tokens, 0, local_start_pos, local_num_tokens, False ) - # Input IDs are in int 32. We should use inplace_all_reduce for local case becaues of custom all reduce. + # Input IDs are in int 32. We should use inplace_all_reduce for local case because of custom all reduce. 
NUM_GPUS_PER_NODE = 8 if ( not local_tokens.dtype.is_floating_point diff --git a/python/sglang/srt/layers/layernorm.py b/python/sglang/srt/layers/layernorm.py index 87322b1b0..98ae3d83d 100644 --- a/python/sglang/srt/layers/layernorm.py +++ b/python/sglang/srt/layers/layernorm.py @@ -76,7 +76,7 @@ class RMSNorm(CustomOp): residual: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: if not x.is_contiguous(): - # NOTE: Romove this if aiter kernel supports discontinuous input + # NOTE: Remove this if aiter kernel supports discontinuous input x = x.contiguous() if residual is not None: fused_add_rms_norm(x, residual, self.weight.data, self.variance_epsilon) diff --git a/python/sglang/srt/layers/moe/ep_moe/kernels.py b/python/sglang/srt/layers/moe/ep_moe/kernels.py index 37c87ed0c..4cf0be7ae 100644 --- a/python/sglang/srt/layers/moe/ep_moe/kernels.py +++ b/python/sglang/srt/layers/moe/ep_moe/kernels.py @@ -116,7 +116,7 @@ def deepep_run_moe_deep_preprocess(topk_ids: torch.Tensor, num_experts: int): seg_indptr = torch.empty(num_experts + 1, device=topk_ids.device, dtype=torch.int64) src2dst = torch.empty(topk_ids.numel(), device=topk_ids.device, dtype=torch.int64) - # Find offet + # Find offset expert_ids = torch.arange( num_experts + 1, device=topk_ids.device, dtype=reorder_topk_ids.dtype ) diff --git a/python/sglang/srt/layers/moe/ep_moe/layer.py b/python/sglang/srt/layers/moe/ep_moe/layer.py index b2c76b33a..6efb48e97 100644 --- a/python/sglang/srt/layers/moe/ep_moe/layer.py +++ b/python/sglang/srt/layers/moe/ep_moe/layer.py @@ -611,7 +611,7 @@ class Fp8EPMoEMethod(Fp8MoEMethod): self.quant_config.weight_block_size[1], ) # NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n. 
- # Required by collum parallel or enabling merged weights + # Required by column parallel or enabling merged weights if intermediate_size % block_n != 0: raise ValueError( f"The output_size of gate's and up's weight = " diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py index 39d52cb53..4d949a843 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +++ b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py @@ -994,7 +994,7 @@ def get_default_config( "num_stages": 2 if _is_hip else 4, } else: - # Block-wise quant: BLOCK_SIZE_K must be divisable by block_shape[1] + # Block-wise quant: BLOCK_SIZE_K must be divisible by block_shape[1] config = { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": block_shape[0], diff --git a/python/sglang/srt/layers/moe/topk.py b/python/sglang/srt/layers/moe/topk.py index 47915cf40..a7af87144 100644 --- a/python/sglang/srt/layers/moe/topk.py +++ b/python/sglang/srt/layers/moe/topk.py @@ -270,7 +270,7 @@ def select_experts( routed_scaling_factor: Optional[float] = None, ): n_share_experts_fusion = global_server_args_dict["n_share_experts_fusion"] - # DeekSeek V2/V3/R1 serices models uses grouped_top_k + # DeepSeek V2/V3/R1 series models use grouped_top_k if use_grouped_topk: assert topk_group is not None assert num_expert_group is not None diff --git a/python/sglang/srt/layers/quantization/__init__.py b/python/sglang/srt/layers/quantization/__init__.py index 81ee11a09..0fa9c0f9c 100644 --- a/python/sglang/srt/layers/quantization/__init__.py +++ b/python/sglang/srt/layers/quantization/__init__.py @@ -109,7 +109,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE: raise ValueError( f"{quantization} quantization requires some operators from vllm. 
" - "Pleaes install vllm by `pip install vllm==0.8.4`" + "Please install vllm by `pip install vllm==0.8.4`" ) return QUANTIZATION_METHODS[quantization] diff --git a/python/sglang/srt/layers/quantization/blockwise_int8.py b/python/sglang/srt/layers/quantization/blockwise_int8.py index 25c91da6e..d79d70de7 100644 --- a/python/sglang/srt/layers/quantization/blockwise_int8.py +++ b/python/sglang/srt/layers/quantization/blockwise_int8.py @@ -152,7 +152,7 @@ class BlockInt8LinearMethod(LinearMethodBase): f"{input_size_per_partition} is not divisible by " f"weight quantization block_k = {block_k}." ) - # Required by collum parallel or enabling merged weights + # Required by column parallel or enabling merged weights if (tp_size > 1 and output_size // output_size_per_partition == tp_size) or len( output_partition_sizes ) > 1: @@ -285,7 +285,7 @@ class BlockInt8MoEMethod: self.quant_config.weight_block_size[1], ) # NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n. - # Required by collum parallel or enabling merged weights + # Required by column parallel or enabling merged weights if intermediate_size % block_n != 0: raise ValueError( f"The output_size of gate's and up's weight = " diff --git a/python/sglang/srt/layers/quantization/deep_gemm.py b/python/sglang/srt/layers/quantization/deep_gemm.py index 3d6ba6281..e14f861fc 100644 --- a/python/sglang/srt/layers/quantization/deep_gemm.py +++ b/python/sglang/srt/layers/quantization/deep_gemm.py @@ -103,10 +103,10 @@ _INITIALIZATION_DICT: Dict[Tuple[DeepGemmKernelType, int, int, int], bool] = dic def _compile_warning_1(): if not _IN_PRECOMPILE_STAGE and _IS_FIRST_RANK_ON_NODE: logger.warning( - "Entering DeepGEMM JIT Pre-Complie session. " + "Entering DeepGEMM JIT Pre-Compile session. " "And it may takes a long time(Typically 10-20 mins) " "if you have not run `sglang.compile_deep_gemm`. 
" - "Recommand to run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`" + "It is recommended to run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`" " for pre-compilation to reduce the overhead if you have not run it before. " "For example: " "`python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code`" @@ -115,7 +115,7 @@ def _compile_warning_1(): def _compile_warning_2(): logger.warning( - "Entering DeepGEMM JIT Single Kernel Complie session. " + "Entering DeepGEMM JIT Single Kernel Compile session. " "And it will makes inference throughput becomes flaky. " "Please run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`" " for pre-compilation to solve this issue. " @@ -298,7 +298,7 @@ def _maybe_compile_deep_gemm_one_type_all( logger.info( f"Try DeepGEMM JIT Compiling for " f"<{kernel_helper.name}> N={n}, K={k}, num_groups={num_groups} with all Ms." - f"{' It only takes a litte time (typically 1 sec) if you have run `python3 -m sglang.compile_deep_gemm`. ' if not _IN_PRECOMPILE_STAGE else ''}" + f"{' It only takes a little time (typically 1 sec) if you have run `python3 -m sglang.compile_deep_gemm`. ' if not _IN_PRECOMPILE_STAGE else ''}" ) # NOTE(alcanderian): get_num_sms should be change when 2-batch-overlap is introduced diff --git a/python/sglang/srt/layers/quantization/fp8.py b/python/sglang/srt/layers/quantization/fp8.py index c36dbf295..521ba7deb 100644 --- a/python/sglang/srt/layers/quantization/fp8.py +++ b/python/sglang/srt/layers/quantization/fp8.py @@ -235,7 +235,7 @@ class Fp8LinearMethod(LinearMethodBase): f"{input_size_per_partition} is not divisible by " f"weight quantization block_k = {block_k}." 
) - # Required by collum parallel or enabling merged weights + # Required by column parallel or enabling merged weights if ( tp_size > 1 and output_size // output_size_per_partition == tp_size ) or len(output_partition_sizes) > 1: @@ -491,7 +491,7 @@ class Fp8MoEMethod: self.quant_config.weight_block_size[1], ) # NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n. - # Required by collum parallel or enabling merged weights + # Required by column parallel or enabling merged weights if intermediate_size % block_n != 0: raise ValueError( f"The output_size of gate's and up's weight = " diff --git a/python/sglang/srt/layers/quantization/fp8_kernel.py b/python/sglang/srt/layers/quantization/fp8_kernel.py index cd63e19a5..226986612 100644 --- a/python/sglang/srt/layers/quantization/fp8_kernel.py +++ b/python/sglang/srt/layers/quantization/fp8_kernel.py @@ -104,7 +104,7 @@ def _per_token_group_quant_fp8( y_s_ptr, # Stride of input y_stride, - # Collums of input + # Columns of input N, # Avoid to divide zero eps, @@ -342,7 +342,7 @@ def _static_quant_fp8( y_s_repeat_ptr, # Stride of input y_stride, - # Collums of input + # Columns of input N, # Information for float8 fp8_min, @@ -794,7 +794,7 @@ def w8a8_block_fp8_matmul( config = configs[min(configs.keys(), key=lambda x: abs(x - M))] else: # Default config - # Block-wise quant: BLOCK_SIZE_K must be divisable by block_size[1] + # Block-wise quant: BLOCK_SIZE_K must be divisible by block_size[1] config = { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": block_size[0], diff --git a/python/sglang/srt/layers/quantization/int8_kernel.py b/python/sglang/srt/layers/quantization/int8_kernel.py index 22f1c5069..32367a5bf 100644 --- a/python/sglang/srt/layers/quantization/int8_kernel.py +++ b/python/sglang/srt/layers/quantization/int8_kernel.py @@ -76,7 +76,7 @@ def _per_token_group_quant_int8( y_s_ptr, # Stride of input 
y_stride, - # Collums of input + # Columns of input N, # Avoid to divide zero eps, @@ -370,7 +370,7 @@ def w8a8_block_int8_matmul( config = configs[min(configs.keys(), key=lambda x: abs(x - M))] else: # Default config - # Block-wise quant: BLOCK_SIZE_K must be divisable by block_size[1] + # Block-wise quant: BLOCK_SIZE_K must be divisible by block_size[1] config = { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": block_size[0], diff --git a/python/sglang/srt/lora/lora_manager.py b/python/sglang/srt/lora/lora_manager.py index d702faab6..70e6ca838 100644 --- a/python/sglang/srt/lora/lora_manager.py +++ b/python/sglang/srt/lora/lora_manager.py @@ -100,7 +100,7 @@ class LoRAManager: self.configs[name] = LoRAConfig(path) self.hf_target_names.update(self.configs[name].target_modules) - # Target lora weight names for lora_a and lora_b modules repectively. + # Target lora weight names for lora_a and lora_b modules respectively. # e.g., {("qkv_proj", "q_proj"), ("qkv_proj", "kv_proj")} self.lora_weight_names: Set[Tuple[str]] = set( [get_stacked_name(module) for module in self.hf_target_names] diff --git a/python/sglang/srt/lora/mem_pool.py b/python/sglang/srt/lora/mem_pool.py index 3226d9587..71495acca 100644 --- a/python/sglang/srt/lora/mem_pool.py +++ b/python/sglang/srt/lora/mem_pool.py @@ -50,15 +50,15 @@ class LoRAMemoryPool: self.uid_to_buffer_id: Dict[Optional[str], int] = {} # Buffer idx -> lora uid in memory pool - # All uids are initalized as empty strings for empty buffer slots - # Here we don't initalize to None since None is a valid uid + # All uids are initialized as empty strings for empty buffer slots + # Here we don't initialize to None since None is a valid uid self.buffer_id_to_uid: List[Optional[str]] = [""] * self.max_loras_per_batch def get_lora_A_shape( self, module_name: str, base_model: torch.nn.Module ) -> Tuple[int]: """ - Given a module_name (might be a stacked name), return the hidden dims of modules's input and output. 
+ Given a module_name (might be a stacked name), return the hidden dims of modules' input and output. """ input_dim, _ = get_hidden_dim(module_name, self.base_hf_config, base_model) c = get_stacked_multiply(module_name) @@ -75,7 +75,7 @@ class LoRAMemoryPool: self, module_name: str, base_model: torch.nn.Module ) -> Tuple[int]: """ - Given a module_name (might be a stacked name), return the hidden dims of modules's input and output. + Given a module_name (might be a stacked name), return the hidden dims of modules' input and output. """ _, output_dim = get_hidden_dim(module_name, self.base_hf_config, base_model) c = get_stacked_multiply(module_name) diff --git a/python/sglang/srt/lora/triton_ops/gate_up_lora_b.py b/python/sglang/srt/lora/triton_ops/gate_up_lora_b.py index 02140408c..ae242dc48 100644 --- a/python/sglang/srt/lora/triton_ops/gate_up_lora_b.py +++ b/python/sglang/srt/lora/triton_ops/gate_up_lora_b.py @@ -77,7 +77,7 @@ def _gate_up_lora_b_kernel( k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1 ) - # Iteate to compute the block in output matrix + # Iterate to compute the block in output matrix partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32) for k in range(0, tl.cdiv(K, BLOCK_K)): x_tile = tl.load( diff --git a/python/sglang/srt/lora/triton_ops/qkv_lora_b.py b/python/sglang/srt/lora/triton_ops/qkv_lora_b.py index 5c43ebdf4..76f3f8671 100644 --- a/python/sglang/srt/lora/triton_ops/qkv_lora_b.py +++ b/python/sglang/srt/lora/triton_ops/qkv_lora_b.py @@ -79,7 +79,7 @@ def _qkv_lora_b_kernel( k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1 ) - # Iteate to compute the block in output matrix + # Iterate to compute the block in output matrix partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32) for k in range(0, tl.cdiv(K, BLOCK_K)): x_tile = tl.load( diff --git a/python/sglang/srt/lora/triton_ops/sgemm_lora_a.py b/python/sglang/srt/lora/triton_ops/sgemm_lora_a.py index 3e0980c7e..201f75269 100644 --- 
a/python/sglang/srt/lora/triton_ops/sgemm_lora_a.py +++ b/python/sglang/srt/lora/triton_ops/sgemm_lora_a.py @@ -67,7 +67,7 @@ def _sgemm_lora_a_kernel( k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1 ) - # Iteate to compute the block in output matrix + # Iterate to compute the block in output matrix partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32) for k in range(0, tl.cdiv(K, BLOCK_K)): x_tile = tl.load( diff --git a/python/sglang/srt/lora/triton_ops/sgemm_lora_b.py b/python/sglang/srt/lora/triton_ops/sgemm_lora_b.py index 28b9f4fbd..89fe2591f 100644 --- a/python/sglang/srt/lora/triton_ops/sgemm_lora_b.py +++ b/python/sglang/srt/lora/triton_ops/sgemm_lora_b.py @@ -69,7 +69,7 @@ def _sgemm_lora_b_kernel( k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1 ) - # Iteate to compute the block in output matrix + # Iterate to compute the block in output matrix partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32) for k in range(0, tl.cdiv(K, BLOCK_K)): x_tile = tl.load( diff --git a/python/sglang/srt/lora/utils.py b/python/sglang/srt/lora/utils.py index 2ae07b24e..3f1f3558d 100644 --- a/python/sglang/srt/lora/utils.py +++ b/python/sglang/srt/lora/utils.py @@ -79,7 +79,7 @@ def get_hidden_dim( module_name: str, config: AutoConfig, base_model: torch.nn.Module ) -> Tuple[int]: """ - Given a module_name (might be a stacked name), return the hidden dims of modules's input and output. + Given a module_name (might be a stacked name), return the hidden dims of modules' input and output. 
""" if hasattr(base_model, "get_hidden_dim"): diff --git a/python/sglang/srt/managers/data_parallel_controller.py b/python/sglang/srt/managers/data_parallel_controller.py index 9870b8902..876472312 100644 --- a/python/sglang/srt/managers/data_parallel_controller.py +++ b/python/sglang/srt/managers/data_parallel_controller.py @@ -210,7 +210,7 @@ class DataParallelController: ) # compute zmq ports for this dp rank rank_port_args = PortArgs.init_new(server_args, dp_rank) - # Data parallelism resues the tensor parallelism group, + # Data parallelism reuses the tensor parallelism group, # so all dp ranks should use the same nccl port. rank_port_args.nccl_port = port_args.nccl_port diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py index 76e57177e..f92bbf044 100644 --- a/python/sglang/srt/managers/io_struct.py +++ b/python/sglang/srt/managers/io_struct.py @@ -12,7 +12,7 @@ # limitations under the License. # ============================================================================== """ -The definition of objects transfered between different +The definition of objects transferred between different processes (TokenizerManager, DetokenizerManager, Controller). 
""" diff --git a/python/sglang/srt/managers/mm_utils.py b/python/sglang/srt/managers/mm_utils.py index 025a3010b..b5ef4cbce 100644 --- a/python/sglang/srt/managers/mm_utils.py +++ b/python/sglang/srt/managers/mm_utils.py @@ -51,7 +51,7 @@ class MultiModalityDataPaddingPatternTokenPairs(MultiModalityDataPaddingPattern) self, input_ids: List[int], mm_inputs: MultimodalInputs ) -> List[int]: """ - This function will replace the data-tokens inbetween with pad_values accordingly + This function will replace the data-tokens in between with pad_values accordingly """ pad_values = [item.pad_value for item in mm_inputs.mm_items] data_token_pairs = self.data_token_id_pairs diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index 1a5865830..05065c237 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -879,7 +879,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin): error_msg = ( f"{phase_str} out of memory. Try to lower your batch size.\n" f"Try to allocate {num_tokens} tokens.\n" - f"Avaliable tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n" + f"Available tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n" ) logger.error(error_msg) if self.tree_cache is not None: @@ -920,7 +920,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin): error_msg = ( f"Prefill out of memory. 
Try to lower your batch size.\n" f"Try to allocate {extend_num_tokens} tokens.\n" - f"Avaliable tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n" + f"Available tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n" f"{self.token_to_kv_pool_allocator.available_size()=}\n" f"{self.tree_cache.evictable_size()=}\n" ) @@ -955,7 +955,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin): error_msg = ( f"Decode out of memory. Try to lower your batch size.\n" f"Try to allocate {len(seq_lens)} tokens.\n" - f"Avaliable tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n" + f"Available tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n" f"{self.token_to_kv_pool_allocator.available_size()=}\n" f"{self.tree_cache.evictable_size()=}\n" ) diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 601f3450a..28c68a41f 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -1325,7 +1325,7 @@ class Scheduler( return None running_bs = len(self.running_batch.reqs) - # Igore the check if self.chunked_req is not None. + # Ignore the check if self.chunked_req is not None. # In the non-PP case, when self.chunked_req is not None, num_allocatable_reqs should always be greater than 0, # as the space for the chunked request has just been released. # In PP case, a chunked req can start in one microbatch and end in another microbatch, so the max_running_requests per microbatch should not be strict. 
diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index dea49e9be..89190d8a4 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -1273,7 +1273,7 @@ class TokenizerManager: self.model_update_result.set_result(recv_obj) else: # self.server_args.dp_size > 1 self.model_update_tmp.append(recv_obj) - # set future if the all results are recevied + # set future if the all results are received if len(self.model_update_tmp) == self.server_args.dp_size: self.model_update_result.set_result(self.model_update_tmp) diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py index b354215c1..ba3882ac6 100644 --- a/python/sglang/srt/model_executor/cuda_graph_runner.py +++ b/python/sglang/srt/model_executor/cuda_graph_runner.py @@ -296,12 +296,12 @@ class CudaGraphRunner: self.capture() except RuntimeError as e: raise Exception( - f"Capture cuda graph failed: {e}\n" + f"Capture CUDA graph failed: {e}\n" "Possible solutions:\n" "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n" "2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n" "3. disable torch compile by not using --enable-torch-compile\n" - "4. disable cuda graph by --disable-cuda-graph. (Not recommonded. Huge perf loss)\n" + "4. disable CUDA graph by --disable-cuda-graph. (Not recommended. Huge performance loss)\n" "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n" ) diff --git a/python/sglang/srt/model_executor/forward_batch_info.py b/python/sglang/srt/model_executor/forward_batch_info.py index 8f84c98e4..5018f92d5 100644 --- a/python/sglang/srt/model_executor/forward_batch_info.py +++ b/python/sglang/srt/model_executor/forward_batch_info.py @@ -58,7 +58,7 @@ class ForwardMode(IntEnum): DECODE = auto() # Contains both EXTEND and DECODE when doing chunked prefill. 
MIXED = auto() - # No sequence to forward. For data parallel attention, some workers wil be IDLE if no sequence are allocated. + # No sequence to forward. For data parallel attention, some workers will be IDLE if no sequence are allocated. IDLE = auto() # Used in speculative decoding: verify a batch in the target model. diff --git a/python/sglang/srt/models/deepseek_janus_pro.py b/python/sglang/srt/models/deepseek_janus_pro.py index bef9c07ed..296983e9d 100644 --- a/python/sglang/srt/models/deepseek_janus_pro.py +++ b/python/sglang/srt/models/deepseek_janus_pro.py @@ -188,7 +188,7 @@ def trunc_normal_tf_( best when :math:`a \\leq \text{mean} \\leq b`. NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0 - and the result is subsquently scaled and shifted by the mean and std args. + and the result is subsequently scaled and shifted by the mean and std args. Args: tensor: an n-dimensional `torch.Tensor` mean: the mean of the normal distribution @@ -735,7 +735,7 @@ class VisionTransformer(nn.Module): img_size: Input image size. patch_size: Patch size. in_chans: Number of image input channels. - num_classes: Mumber of classes for classification head. + num_classes: Number of classes for classification head. global_pool: Type of global pooling for final sequence (default: 'token'). embed_dim: Transformer embedding dimension. depth: Depth of transformer. 
diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py index b297775a1..9770de1c3 100644 --- a/python/sglang/srt/models/deepseek_v2.py +++ b/python/sglang/srt/models/deepseek_v2.py @@ -1287,7 +1287,7 @@ class DeepseekV2DecoderLayer(nn.Module): # Fully Connected hidden_states = self.mlp(hidden_states) - # TODO(ch-wan): ues reduce-scatter in MLP to avoid this scatter + # TODO(ch-wan): use reduce-scatter in MLP to avoid this scatter # Scatter if self.dp_size != 1: # important: forward batch.gathered_buffer is used both after scatter and after gather. @@ -1499,7 +1499,7 @@ class DeepseekV2ForCausalLM(nn.Module): else: assert ( self.n_share_experts_fusion == self.tp_size - ), f"Shared experts fusion optimization is enabled in DeepSeek V3/R1, set it to {self.tp_size} can get best optimized performace." + ), f"Shared experts fusion optimization is enabled in DeepSeek V3/R1, set it to {self.tp_size} can get best optimized performance." elif self.n_share_experts_fusion == 0: if ( _is_cuda @@ -1665,7 +1665,7 @@ class DeepseekV2ForCausalLM(nn.Module): if is_nextn: if hasattr(self.config, "num_nextn_predict_layers"): num_nextn_layers = self.config.num_nextn_predict_layers - assert num_nextn_layers == 1, "Only 1 nextn layer is supportted" + assert num_nextn_layers == 1, "Only 1 nextn layer is supported" # compatible with old design nextn_layer_id = ( 0 diff --git a/python/sglang/srt/models/llama4.py b/python/sglang/srt/models/llama4.py index a84f3106f..a4f2d03a8 100644 --- a/python/sglang/srt/models/llama4.py +++ b/python/sglang/srt/models/llama4.py @@ -428,7 +428,7 @@ class Llama4DecoderLayer(nn.Module): # Fully Connected hidden_states = self.feed_forward(hidden_states, forward_batch) - # TODO(ch-wan): ues reduce-scatter in MLP to avoid this scatter + # TODO(ch-wan): use reduce-scatter in MLP to avoid this scatter # Scatter if self.dp_size != 1: # important: forward batch.gathered_buffer is used both after scatter and after gather. 
diff --git a/python/sglang/srt/models/roberta.py b/python/sglang/srt/models/roberta.py index d9e8c2c7a..b982bc8e3 100644 --- a/python/sglang/srt/models/roberta.py +++ b/python/sglang/srt/models/roberta.py @@ -57,7 +57,7 @@ class RobertaEmbedding(nn.Module): input_shape = input_ids.size() inputs_embeds = self.word_embeddings(input_ids) - # adpated from vllm: https://github.com/vllm-project/vllm/commit/4a18fd14ba4a349291c798a16bf62fa8a9af0b6b/vllm/model_executor/models/roberta.py + # Adapted from vllm: https://github.com/vllm-project/vllm/commit/4a18fd14ba4a349291c798a16bf62fa8a9af0b6b/vllm/model_executor/models/roberta.py pos_list = [] token_list = [] diff --git a/python/sglang/srt/models/torch_native_llama.py b/python/sglang/srt/models/torch_native_llama.py index 1c99c52a8..494ef80ed 100644 --- a/python/sglang/srt/models/torch_native_llama.py +++ b/python/sglang/srt/models/torch_native_llama.py @@ -37,7 +37,7 @@ $ python3 -m sglang.bench_one_batch --correct \ --tensor-parallel-size 2 \ --disable-cuda-graph ``` -We will eanble CUDA Graph support soon. +We will enable CUDA Graph support soon. 
""" import types diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index ba10f2951..b2f138b16 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -590,7 +590,7 @@ def v1_generate_response( echo = False if (not isinstance(request, list)) and request.echo: - # TODO: handle the case propmt is token ids + # TODO: handle the case prompt is token ids if isinstance(request.prompt, list) and isinstance(request.prompt[0], str): # for the case of multiple str prompts prompts = request.prompt @@ -646,7 +646,7 @@ def v1_generate_response( finish_reason = ret_item["meta_info"]["finish_reason"] if to_file: - # to make the choise data json serializable + # to make the choice data json serializable choice_data = { "index": 0, "text": text, diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py index 977e26d3e..d8bf8f09c 100644 --- a/python/sglang/srt/reasoning_parser.py +++ b/python/sglang/srt/reasoning_parser.py @@ -147,7 +147,7 @@ class ReasoningParser: Args: model_type (str): Type of model to parse reasoning from - stream_reasoning (bool): If Flase, accumulates reasoning content until complete. + stream_reasoning (bool): If False, accumulates reasoning content until complete. If True, streams reasoning content as it arrives. 
""" diff --git a/python/sglang/srt/sampling/sampling_batch_info.py b/python/sglang/srt/sampling/sampling_batch_info.py index 66e6552c0..6499d4c6b 100644 --- a/python/sglang/srt/sampling/sampling_batch_info.py +++ b/python/sglang/srt/sampling/sampling_batch_info.py @@ -294,7 +294,7 @@ class SamplingBatchInfo: # Set the flag to True if any of the two has custom logit processor self.has_custom_logit_processor = True - # Note: becasue the __len()__ operator is defined on the temperatures tensor, + # Note: because the __len()__ operator is defined on the temperatures tensor, # please make sure any merge operation with len(self) or len(other) is done before # the merge operation of the temperatures tensor below. for item in [ diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 0aa71e344..a6ff1a888 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -825,7 +825,7 @@ class ServerArgs: # Multi-node distributed serving parser.add_argument( "--dist-init-addr", - "--nccl-init-addr", # For backward compatbility. This will be removed in the future. + "--nccl-init-addr", # For backward compatibility. This will be removed in the future. type=str, help="The host address for initializing distributed backend (e.g., `192.168.0.2:25000`).", ) @@ -1096,7 +1096,7 @@ class ServerArgs: parser.add_argument( "--triton-attention-reduce-in-fp32", action="store_true", - help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16." + help="Cast the intermediate attention results to fp32 to avoid possible crashes related to fp16." 
"This only affects Triton attention kernels.", ) parser.add_argument( @@ -1188,7 +1188,7 @@ class ServerArgs: type=int, default=0, help="The number of shared_experts need to be replicated to fuse with normal experts in deepseek v3/r1, " - "set it to tp_size can get best optimized performace.", + "set it to tp_size can get best optimized performance.", ) parser.add_argument( "--disable-chunked-prefix-cache", diff --git a/python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py b/python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py index e74e275fe..736a0f074 100644 --- a/python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +++ b/python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py @@ -82,12 +82,12 @@ class EAGLEDraftCudaGraphRunner: self.capture() except RuntimeError as e: raise Exception( - f"Capture cuda graph failed: {e}\n" + f"Capture CUDA graph failed: {e}\n" "Possible solutions:\n" "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n" "2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n" "3. disable torch compile by not using --enable-torch-compile\n" - "4. disable cuda graph by --disable-cuda-graph. (Not recommonded. Huge perf loss)\n" + "4. disable CUDA graph by --disable-cuda-graph. (Not recommended. Huge performance loss)\n" "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n" ) @@ -149,7 +149,7 @@ class EAGLEDraftCudaGraphRunner: # Run and capture def run_once(): - # Backup two fileds, which will be modified in-place in `draft_forward`. + # Backup two fields, which will be modified in-place in `draft_forward`. 
output_cache_loc_backup = forward_batch.out_cache_loc hidden_states_backup = forward_batch.spec_info.hidden_states diff --git a/python/sglang/srt/speculative/eagle_utils.py b/python/sglang/srt/speculative/eagle_utils.py index b4d339cce..f7d598de9 100644 --- a/python/sglang/srt/speculative/eagle_utils.py +++ b/python/sglang/srt/speculative/eagle_utils.py @@ -167,12 +167,12 @@ class EagleVerifyOutput: draft_input: EagleDraftInput # Logit outputs from target worker logits_output: LogitsProcessorOutput - # Accepeted token ids including the bonus token + # Accepted token ids including the bonus token verified_id: torch.Tensor - # Accepeted token length per sequence in a batch in CPU. + # Accepted token length per sequence in a batch in CPU. accept_length_per_req_cpu: List[int] - # Accepeted indices from logits_output.next_token_logits - accepeted_indices: torch.Tensor + # Accepted indices from logits_output.next_token_logits + accepted_indices: torch.Tensor @dataclass @@ -316,7 +316,7 @@ class EagleVerifyInput: This API updates values inside logits_output based on the accepted tokens. I.e., logits_output.next_token_logits only contains - accepeted token logits. + accepted token logits. 
""" bs = self.retrive_index.shape[0] candidates = self.draft_token.reshape(bs, self.draft_token_num) @@ -493,7 +493,7 @@ class EagleVerifyInput: logits_output=logits_output, verified_id=verified_id, accept_length_per_req_cpu=accept_length_cpu, - accepeted_indices=accept_index, + accepted_indices=accept_index, ) else: assign_req_to_token_pool[(bs,)]( @@ -539,7 +539,7 @@ class EagleVerifyInput: logits_output=logits_output, verified_id=verified_id, accept_length_per_req_cpu=accept_length_cpu, - accepeted_indices=accept_index, + accepted_indices=accept_index, ) diff --git a/python/sglang/srt/speculative/eagle_worker.py b/python/sglang/srt/speculative/eagle_worker.py index d1fd04e93..7c61307a4 100644 --- a/python/sglang/srt/speculative/eagle_worker.py +++ b/python/sglang/srt/speculative/eagle_worker.py @@ -201,7 +201,7 @@ class EAGLEWorker(TpModelWorker): self.has_prefill_wrapper_verify = False else: raise ValueError( - f"EAGLE is not supportted in attention backend {self.server_args.attention_backend}" + f"EAGLE is not supported in attention backend {self.server_args.attention_backend}" ) self.draft_model_runner.draft_attn_backend = self.draft_attn_backend @@ -245,8 +245,8 @@ class EAGLEWorker(TpModelWorker): Args: batch: The batch to run forward. The state of the batch is modified as it runs. Returns: - A tuple of the final logit output of the target model, next tokens accepeted, - the batch id (used for overlap schedule), and number of accepeted tokens. + A tuple of the final logit output of the target model, next tokens accepted, + the batch id (used for overlap schedule), and number of accepted tokens. """ if batch.forward_mode.is_decode(): with self.draft_tp_context(self.draft_model_runner.tp_group): @@ -491,11 +491,11 @@ class EAGLEWorker(TpModelWorker): ) # Post process based on verified outputs. 
- # Pick indices that we care (accepeted) + # Pick indices that we care (accepted) logits_output.next_token_logits = logits_output.next_token_logits[ - res.accepeted_indices + res.accepted_indices ] - logits_output.hidden_states = logits_output.hidden_states[res.accepeted_indices] + logits_output.hidden_states = logits_output.hidden_states[res.accepted_indices] # Prepare the batch for the next draft forwards. batch.forward_mode = ForwardMode.DECODE @@ -597,7 +597,7 @@ class EAGLEWorker(TpModelWorker): self.capture_for_decode(logits_output, forward_batch.spec_info) def forward_draft_extend_after_decode(self, batch: ScheduleBatch): - # Backup fileds that will be modified in-place + # Backup fields that will be modified in-place seq_lens_backup = batch.seq_lens.clone() req_pool_indices_backup = batch.req_pool_indices accept_length_backup = batch.spec_info.accept_length diff --git a/python/sglang/test/simple_eval_common.py b/python/sglang/test/simple_eval_common.py index 518e6245c..5a60221ed 100644 --- a/python/sglang/test/simple_eval_common.py +++ b/python/sglang/test/simple_eval_common.py @@ -140,7 +140,7 @@ class ChatCompletionSampler(SamplerBase): max_tokens=self.max_tokens, ) return response.choices[0].message.content - # NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are reruning MMMU + # NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are rerunning MMMU except openai.BadRequestError as e: print("Bad Request Error", e) return "" diff --git a/python/sglang/test/simple_eval_humaneval.py b/python/sglang/test/simple_eval_humaneval.py index fe6f32f4d..25dcdd53a 100644 --- a/python/sglang/test/simple_eval_humaneval.py +++ b/python/sglang/test/simple_eval_humaneval.py @@ -121,7 +121,7 @@ class HumanEval(Eval): convo=convo, metrics={ f"pass@{k}": estimate_pass_at_k([total], [correct], k) - # this will be aggrated so no need of .mean() + # this will be aggregated so no need of .mean() for k in self._ks_passes if total 
>= k }, diff --git a/python/sglang/test/test_programs.py b/python/sglang/test/test_programs.py index ca2cdb15e..262637eed 100644 --- a/python/sglang/test/test_programs.py +++ b/python/sglang/test/test_programs.py @@ -370,7 +370,7 @@ def test_dtype_gen(): @sgl.function def dtype_gen(s): s += "Q: What is the full name of DNS?\n" - s += "A: The full nams is " + sgl.gen("str_res", dtype=str, stop="\n") + "\n" + s += "A: The full name is " + sgl.gen("str_res", dtype=str, stop="\n") + "\n" s += "Q: Which year was DNS invented?\n" s += "A: " + sgl.gen("int_res", dtype=int) + "\n" s += "Q: What is the value of pi?\n" diff --git a/python/sglang/utils.py b/python/sglang/utils.py index e817c5b58..e83aa112b 100644 --- a/python/sglang/utils.py +++ b/python/sglang/utils.py @@ -278,7 +278,7 @@ def graceful_registry(sub_module_name: str): f"{sub_module_name} Received signal to shutdown. Performing graceful shutdown..." ) if signum == signal.SIGTERM: - logger.info(f"{sub_module_name} recive sigterm") + logger.info(f"{sub_module_name} receive sigterm") signal.signal(signal.SIGTERM, graceful_shutdown) diff --git a/scripts/ci_install_dependency.sh b/scripts/ci_install_dependency.sh index c061b6226..45290c8a6 100755 --- a/scripts/ci_install_dependency.sh +++ b/scripts/ci_install_dependency.sh @@ -25,7 +25,7 @@ pip install -e "python[all]" pip install torch_memory_saver pip install transformers==4.51.0 sentence_transformers accelerate peft pandas datasets timm torchaudio==2.6.0 -# For compling xgrammar kernels +# For compiling xgrammar kernels pip install cuda-python nvidia-cuda-nvrtc-cu12 # For lmms_evals evaluating MMMU diff --git a/scripts/ci_install_dependency_8_gpu.sh b/scripts/ci_install_dependency_8_gpu.sh index 8d6ccd51b..6bb07677a 100755 --- a/scripts/ci_install_dependency_8_gpu.sh +++ b/scripts/ci_install_dependency_8_gpu.sh @@ -43,7 +43,7 @@ pip install -e "python[all]" pip install torch_memory_saver pip install transformers==4.51.0 sentence_transformers accelerate peft 
pandas datasets timm torchaudio==2.6.0 -# For compling xgrammar kernels +# For compiling xgrammar kernels pip install cuda-python nvidia-cuda-nvrtc-cu12 # For lmms_evals evaluating MMMU diff --git a/scripts/deprecated/convert_yi_vl.py b/scripts/deprecated/convert_yi_vl.py index bdf37ff92..cdd8aebeb 100644 --- a/scripts/deprecated/convert_yi_vl.py +++ b/scripts/deprecated/convert_yi_vl.py @@ -1,5 +1,5 @@ """ -Convert Yi-VL config into a format useable with SGLang +Convert Yi-VL config into a format usable with SGLang Usage: python3 scripts/convert_yi_vl.py --model-path """ diff --git a/scripts/export_deepseek_nextn.py b/scripts/export_deepseek_nextn.py index 35a06b645..5da0e4bc3 100644 --- a/scripts/export_deepseek_nextn.py +++ b/scripts/export_deepseek_nextn.py @@ -90,7 +90,7 @@ def export_nextn_layer_parameters(input_dir, output_dir, nextn_layer_id): if __name__ == "__main__": parser = argparse.ArgumentParser( - description="Export NextN layer paramerters for DeepSeek-V3/R1" + description="Export NextN layer parameters for DeepSeek-V3/R1" ) parser.add_argument( "--input-dir", diff --git a/sgl-kernel/CMakeLists.txt b/sgl-kernel/CMakeLists.txt index f7733388a..05d14b6b3 100755 --- a/sgl-kernel/CMakeLists.txt +++ b/sgl-kernel/CMakeLists.txt @@ -114,7 +114,7 @@ set(SGL_KERNEL_CUDA_FLAGS "--expt-extended-lambda" "--threads=32" - # Supress warnings + # Suppress warnings "-Xcompiler=-Wconversion" "-Xcompiler=-fno-strict-aliasing" diff --git a/sgl-kernel/README.md b/sgl-kernel/README.md index 463982b90..70e99f82c 100644 --- a/sgl-kernel/README.md +++ b/sgl-kernel/README.md @@ -87,7 +87,7 @@ Third-party libraries: The main different Between sm80/sm87 and sm86/sm89 is the shared memory size. you can follow the link below for more information https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared-memory-8-x. - And for sgl-kernel right now, we can build fa3 on sm80/sm86/sm89/sm90a. 
Thats mean if you use **A100(tested)**/A*0/**L20(tested)**/L40/L40s/**3090(tested)** you can use fa3. + And for sgl-kernel right now, we can build fa3 on sm80/sm86/sm89/sm90a. That means if you use **A100(tested)**/A*0/**L20(tested)**/L40/L40s/**3090(tested)** you can use fa3. ### Kernel Development diff --git a/sgl-kernel/benchmark/bench_per_token_group_quant_8bit.py b/sgl-kernel/benchmark/bench_per_token_group_quant_8bit.py index b01e5ceb2..c83172997 100644 --- a/sgl-kernel/benchmark/bench_per_token_group_quant_8bit.py +++ b/sgl-kernel/benchmark/bench_per_token_group_quant_8bit.py @@ -20,7 +20,7 @@ def _per_token_group_quant_8bit( y_s_ptr, # Stride of input y_stride, - # Collums of input + # Columns of input N, # Avoid to divide zero eps, diff --git a/sgl-kernel/csrc/cpu/common.h b/sgl-kernel/csrc/cpu/common.h index 0d340a756..1acdd64a6 100644 --- a/sgl-kernel/csrc/cpu/common.h +++ b/sgl-kernel/csrc/cpu/common.h @@ -49,7 +49,7 @@ namespace { #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") #define CHECK_LAST_DIM_CONTIGUOUS(x) \ - TORCH_CHECK(x.strides()[x.strides().size() - 1] == 1, #x "must be contiguous at last dimention") + TORCH_CHECK(x.strides()[x.strides().size() - 1] == 1, #x "must be contiguous at last dimension") #define CHECK_INPUT(x) \ CHECK_CPU(x); \ diff --git a/sgl-kernel/csrc/cpu/decode.cpp b/sgl-kernel/csrc/cpu/decode.cpp index e469ffdc5..d1305f351 100644 --- a/sgl-kernel/csrc/cpu/decode.cpp +++ b/sgl-kernel/csrc/cpu/decode.cpp @@ -718,7 +718,7 @@ void decode_attention_kernel_impl( m_prime = m_i; - // caculate V' <- s_delta @ V + V' * m_delta + // calculate V' <- s_delta @ V + V' * m_delta index_gemm_kernel_nn( /* A */ s_delta, /* B */ v_buffer + head_id * v_strideH, @@ -925,7 +925,7 @@ void decode_attention_grouped_kernel_impl( m_prime[h] = m_i; } - // caculate V' <- s_delta @ V + V' * m_delta + // calculate V' <- s_delta @ V + V' * m_delta index_gemm_kernel_nn( /* A */ s_delta, /* B */ v_buffer + head_kv_id * 
v_strideH, diff --git a/sgl-kernel/csrc/cpu/extend.cpp b/sgl-kernel/csrc/cpu/extend.cpp index 503cef538..9ae36574f 100644 --- a/sgl-kernel/csrc/cpu/extend.cpp +++ b/sgl-kernel/csrc/cpu/extend.cpp @@ -323,7 +323,7 @@ void extend_attention_kernel_impl( /* ld_src */ v_strideN, /* ld_dst */ head_size_v); - // caculate V' <- s_delta @ V + V' + // calculate V' <- s_delta @ V + V' at::native::cpublas::brgemm( /* M */ m_size, /* N */ head_size_v, @@ -434,7 +434,7 @@ void extend_attention_kernel_impl( /* ld_src */ ve_strideN, /* ld_dst */ head_size_v); - // caculate V' <- s_delta @ V + V' + // calculate V' <- s_delta @ V + V' at::native::cpublas::brgemm( /* M */ m_size, /* N */ head_size_v, diff --git a/sgl-kernel/csrc/cpu/gemm.h b/sgl-kernel/csrc/cpu/gemm.h index 010f50a0c..87dfbd7bf 100644 --- a/sgl-kernel/csrc/cpu/gemm.h +++ b/sgl-kernel/csrc/cpu/gemm.h @@ -79,7 +79,7 @@ void fused_experts_int8_kernel_impl( int64_t topk, int64_t num_tokens_post_pad); -// shared expert implememntation for int8 w8a8 +// shared expert implementation for int8 w8a8 template void shared_expert_int8_kernel_impl( scalar_t* __restrict__ output, diff --git a/sgl-kernel/csrc/cpu/gemm_int8.cpp b/sgl-kernel/csrc/cpu/gemm_int8.cpp index ba383076a..a7a87ce74 100644 --- a/sgl-kernel/csrc/cpu/gemm_int8.cpp +++ b/sgl-kernel/csrc/cpu/gemm_int8.cpp @@ -51,7 +51,7 @@ struct tinygemm_kernel_nn { __m512 vd0; __m512 vd1[COLS]; - // oops! 4x4 spills but luckly we use 4x2 + // oops! 4x4 spills but we use 4x2 __m512 vbias[COLS]; // [NOTE]: s8s8 igemm compensation in avx512-vnni diff --git a/sgl-kernel/csrc/speculative/packbit.cu b/sgl-kernel/csrc/speculative/packbit.cu index c65ba4518..687dbfa2b 100644 --- a/sgl-kernel/csrc/speculative/packbit.cu +++ b/sgl-kernel/csrc/speculative/packbit.cu @@ -1,4 +1,4 @@ -// This is only a pluggin used for flashinfer 0.1.6. The new version does not need it. +// This is only a plugin used for flashinfer 0.1.6. The new version does not need it. 
/* * Copyright (c) 2025 by SGLang team. * Copyright (c) 2025 by FlashInfer team. diff --git a/sgl-kernel/include/sgl_kernel_torch_shim.h b/sgl-kernel/include/sgl_kernel_torch_shim.h index 4101726be..5a26f9076 100644 --- a/sgl-kernel/include/sgl_kernel_torch_shim.h +++ b/sgl-kernel/include/sgl_kernel_torch_shim.h @@ -20,16 +20,16 @@ limitations under the License. #include /** - * Unforunately, the type signatures of the flash_attn ops are not compatible + * Unfortunately, the type signatures of the flash_attn ops are not compatible * with the PyTorch library bindings. To get around that we use - * `make_pytorch_shim` which creates a lambda that exponses the API using + * `make_pytorch_shim` which creates a lambda that exposes the API using * PyTorch compatible types to the types, then converts them to the types * expected by the flash_attn ops. This shims allows us to make minimal changes * to `flash_api.cpp` making it easier to synchronize with upstream changes. * * The `pytorch_library_compatible_type` struct is used to map from the * flash_attn ops types to a PyTorch library compatible one. The main issues is - * that the following types are not support by PyTorch libary bindings: + * that the following types are not supported by PyTorch library bindings: * - `int` * - `float` * - `std::optional &` diff --git a/sgl-kernel/python/sgl_kernel/elementwise.py b/sgl-kernel/python/sgl_kernel/elementwise.py index dd717d0ca..0e15cfff3 100644 --- a/sgl-kernel/python/sgl_kernel/elementwise.py +++ b/sgl-kernel/python/sgl_kernel/elementwise.py @@ -229,7 +229,7 @@ def apply_rope_with_cos_sin_cache_inplace( Whether to use Neox style RoPE, default: ``True``. * If ``True``, the last dimension of the query/key tensor is not interleaved, i.e., - we rorate the first half dimensions ``([..., :head_dim//2])`` and the second half + we rotate the first half dimensions ``([..., :head_dim//2])`` and the second half dimensions ``([..., head_dim//2:])``. 
* If ``False``, the last dimension of the query/key tensor is interleaved, i.e., diff --git a/sgl-kernel/python/sgl_kernel/flash_attn.py b/sgl-kernel/python/sgl_kernel/flash_attn.py index 5a443ab43..fbf0b0d3f 100644 --- a/sgl-kernel/python/sgl_kernel/flash_attn.py +++ b/sgl-kernel/python/sgl_kernel/flash_attn.py @@ -17,7 +17,7 @@ def is_fa3_supported(device=None) -> bool: # Between sm80/sm87 and sm86/sm89 is the shared memory size. you can follow the link below for more information # https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared-memory-8-x # And for sgl-kernel right now, we can build fa3 on sm80/sm86/sm89/sm90a. - # Thats mean if you use A100/A*0/L20/L40/L40s/4090 you can use fa3. + # That means if you use A100/A*0/L20/L40/L40s/4090 you can use fa3. return ( torch.cuda.get_device_capability(device)[0] == 9 or torch.cuda.get_device_capability(device)[0] == 8 diff --git a/sgl-kernel/python/sgl_kernel/moe.py b/sgl-kernel/python/sgl_kernel/moe.py index a5e0b3668..f989fb8f7 100755 --- a/sgl-kernel/python/sgl_kernel/moe.py +++ b/sgl-kernel/python/sgl_kernel/moe.py @@ -45,10 +45,10 @@ def moe_fused_gate( ): # This fused kernel function is used to select topk expert in a hierarchical 2-layer fashion # it split group of expert into num_expert_group, and use top2 expert weight sum in each group - # as the group weight to select exerpt groups and then select topk experts within the selected groups + # as the group weight to select expert groups and then select topk experts within the selected groups # the #experts is decided by the input tensor shape and we currently only support power of 2 #experts - # and #experts should be divisible by num_expert_group. #expert/num_expert_group <= 32 is limitted for now. - # for non-supported case, we suggestion to use the biased_grouped_topk func in sglang.srt.layers.moe.topk + # and #experts should be divisible by num_expert_group. #expert/num_expert_group <= 32 is limited for now. 
+ # for unsupported cases, we suggest using the biased_grouped_topk func in sglang.srt.layers.moe.topk # n_share_experts_fusion: if > 0, the last expert will be replaced with a round-robin shared expert # routed_scaling_factor: if > 0, the last expert will be scaled by this factor return torch.ops.sgl_kernel.moe_fused_gate.default( diff --git a/sgl-kernel/tests/test_flash_attention.py b/sgl-kernel/tests/test_flash_attention.py index 2885dbb4b..def092a34 100644 --- a/sgl-kernel/tests/test_flash_attention.py +++ b/sgl-kernel/tests/test_flash_attention.py @@ -24,7 +24,7 @@ def is_fa3_supported(device=None) -> bool: # Between sm80/sm87 and sm86/sm89 is the shared memory size. you can follow the link below for more information # https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared-memory-8-x # And for sgl-kernel right now, we can build fa3 on sm80/sm86/sm89/sm90a. - # Thats mean if you use A100/A*0/L20/L40/L40s/4090 you can use fa3. + # That means if you use A100/A*0/L20/L40/L40s/4090 you can use fa3. 
return ( torch.cuda.get_device_capability(device)[0] == 9 or torch.cuda.get_device_capability(device)[0] == 8 diff --git a/sgl-kernel/tests/test_per_token_group_quant_8bit.py b/sgl-kernel/tests/test_per_token_group_quant_8bit.py index ba3bafda5..66be47d28 100644 --- a/sgl-kernel/tests/test_per_token_group_quant_8bit.py +++ b/sgl-kernel/tests/test_per_token_group_quant_8bit.py @@ -21,7 +21,7 @@ def _per_token_group_quant_fp8( y_s_ptr, # Stride of input y_stride, - # Collums of input + # Columns of input N, # Avoid to divide zero eps, diff --git a/sgl-router/src/tree.rs b/sgl-router/src/tree.rs index 6ddf79140..112bed0d5 100644 --- a/sgl-router/src/tree.rs +++ b/sgl-router/src/tree.rs @@ -1070,7 +1070,7 @@ mod tests { #[test] fn test_utf8_split_seq() { - // The string should be indexed and splitted by a utf-8 value basis instead of byte basis + // The string should be indexed and split by a utf-8 value basis instead of byte basis // use .chars() to get the iterator of the utf-8 value let tree = Arc::new(Tree::new()); diff --git a/test/srt/test_openai_server.py b/test/srt/test_openai_server.py index e90690f10..f76f9fa65 100644 --- a/test/srt/test_openai_server.py +++ b/test/srt/test_openai_server.py @@ -433,7 +433,7 @@ class TestOpenAIServer(CustomTestCase): ) def test_completion_stream(self): - # parallel sampling adn list input are not supported in streaming mode + # parallel sampling and list input are not supported in streaming mode for echo in [False, True]: for logprobs in [None, 5]: for use_list_input in [True, False]: diff --git a/test/srt/test_session_control.py b/test/srt/test_session_control.py index 7a68b2b17..d4bbfa476 100644 --- a/test/srt/test_session_control.py +++ b/test/srt/test_session_control.py @@ -161,7 +161,7 @@ class TestSessionControl(CustomTestCase): ] ) - # query with a non-existing rid (the last one should be disappeared becuase of backtrack), should see abort + # query with a non-existing rid (the last one should be disappeared because of 
backtrack), should see abort response = requests.post( self.base_url + "/generate", json={ @@ -668,7 +668,7 @@ class TestSessionControlVision(CustomTestCase): ).json() outputs_from_session.append(response["text"]) - # query with a non-existing rid (the last one should be disappeared becuase of backtrack), should see abort + # query with a non-existing rid (the last one should be disappeared because of backtrack), should see abort response = requests.post( self.base_url + "/generate", json={ diff --git a/test/srt/test_srt_endpoint.py b/test/srt/test_srt_endpoint.py index 81671e4a0..17e542156 100644 --- a/test/srt/test_srt_endpoint.py +++ b/test/srt/test_srt_endpoint.py @@ -295,7 +295,7 @@ class TestSRTEndpoint(CustomTestCase): print(f"{output_top_logprobs=}") # Parse results - # This is becaues the grammar constraint allows all prefix tokens + # This is because the grammar constraint allows all prefix tokens logprobs = [None] * 2 for i in range(len(output_top_logprobs)): try: diff --git a/test/srt/test_srt_engine_with_quant_args.py b/test/srt/test_srt_engine_with_quant_args.py index e3b30ea39..47baf5688 100644 --- a/test/srt/test_srt_engine_with_quant_args.py +++ b/test/srt/test_srt_engine_with_quant_args.py @@ -8,7 +8,7 @@ class TestSRTEngineWithQuantArgs(CustomTestCase): def test_1_quantization_args(self): - # we only test fp8 because other methods are currenly depend on vllm. We can add other methods back to test after vllm depency is resolved. + # we only test fp8 because other methods are currently dependent on vllm. We can add other methods back to test after vllm dependency is resolved. quantization_args_list = [ # "awq", "fp8", diff --git a/test/srt/test_vlm_accuracy.py b/test/srt/test_vlm_accuracy.py index a56bc85dc..58b0efa6d 100644 --- a/test/srt/test_vlm_accuracy.py +++ b/test/srt/test_vlm_accuracy.py @@ -116,7 +116,7 @@ class VisionLLMLogitsBase(unittest.IsolatedAsyncioTestCase): }}, {{ "type": "text", - "text": "Whats in this picture?" 
+ "text": "What's in this picture?" }} ] }}