From e8e18dcdcca0e6d4eacccd074bea9da2ad6a3e18 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 12 May 2025 12:53:26 -0700 Subject: [PATCH] Revert "fix some typos" (#6244) --- 3rdparty/amd/profiling/PROFILING.md | 2 +- 3rdparty/amd/tuning/TUNING.md | 14 ++++++------ README.md | 2 +- benchmark/benchmark_vllm_060/README.md | 6 ++--- benchmark/deepseek_v3/README.md | 16 +++++++------- benchmark/gsm8k/README.md | 12 +++++----- benchmark/hellaswag/README.md | 10 ++++----- benchmark/kernels/fused_moe_triton/README.md | 4 ++-- benchmark/mmlu/README.md | 14 ++++++------ benchmark/mtbench/README.md | 10 ++++----- benchmark/multi_chain_reasoning/README.md | 14 ++++++------ benchmark/tree_of_thought_deep/README.md | 14 ++++++------ benchmark/tree_of_thought_v0/README.md | 10 ++++----- docker/k8s-sglang-distributed-sts.yaml | 2 +- docs/README.md | 4 ++-- docs/backend/quantization.md | 2 +- docs/backend/server_arguments.md | 2 +- .../development_guide_using_docker.md | 20 ++++++++--------- docs/developer/setup_github_runner.md | 2 +- docs/index.rst | 2 +- docs/references/benchmark_and_profiling.md | 2 +- docs/references/contribution_guide.md | 2 +- docs/references/deepseek.md | 22 +++++++++---------- docs/references/deploy_on_k8s.md | 2 +- docs/references/modelscope.md | 2 +- docs/start/install.md | 10 ++++----- examples/runtime/README.md | 2 +- .../hidden_states/hidden_states_engine.py | 2 +- .../hidden_states/hidden_states_server.py | 2 +- python/pyproject.toml | 8 +++---- python/sglang/compile_deep_gemm.py | 2 +- python/sglang/srt/_custom_ops.py | 2 +- python/sglang/srt/configs/chatglm.py | 2 +- python/sglang/srt/configs/load_config.py | 2 +- .../device_communicators/custom_all_reduce.py | 12 +++++----- .../custom_all_reduce_utils.py | 14 ++++++------ .../device_communicators/pynccl.py | 2 +- .../device_communicators/pynccl_wrapper.py | 2 +- .../sglang/srt/distributed/parallel_state.py | 2 +- python/sglang/srt/hf_transformers_utils.py | 2 +- 
.../srt/layers/attention/base_attn_backend.py | 6 ++--- .../attention/double_sparsity_backend.py | 2 +- .../layers/attention/flashinfer_backend.py | 4 ++-- .../attention/flashinfer_mla_backend.py | 2 +- .../srt/layers/attention/triton_backend.py | 2 +- python/sglang/srt/layers/attention/vision.py | 2 +- python/sglang/srt/layers/dp_attention.py | 2 +- python/sglang/srt/layers/logits_processor.py | 2 +- .../srt/layers/quantization/__init__.py | 10 ++++----- .../srt/layers/quantization/blockwise_int8.py | 2 +- .../compressed_tensors/compressed_tensors.py | 6 ++--- .../compressed_tensors_moe.py | 2 +- .../schemes/compressed_tensors_w8a16_fp8.py | 6 ++--- .../srt/layers/quantization/fp8_utils.py | 4 ++-- python/sglang/srt/layers/rotary_embedding.py | 4 ++-- .../sglang/srt/lora/backend/base_backend.py | 16 +++++++------- .../srt/lora/backend/flashinfer_backend.py | 2 +- python/sglang/srt/lora/layers.py | 2 +- python/sglang/srt/lora/lora.py | 8 +++---- python/sglang/srt/lora/lora_manager.py | 16 +++++++------- python/sglang/srt/lora/mem_pool.py | 12 +++++----- .../srt/lora/triton_ops/gate_up_lora_b.py | 2 +- .../sglang/srt/lora/triton_ops/qkv_lora_b.py | 2 +- python/sglang/srt/lora/utils.py | 18 +++++++-------- python/sglang/srt/managers/scheduler.py | 4 ++-- .../sglang/srt/managers/tokenizer_manager.py | 2 +- .../srt/managers/tp_worker_overlap_thread.py | 2 +- .../srt/model_executor/cuda_graph_runner.py | 22 +++++++++---------- .../srt/model_executor/forward_batch_info.py | 4 ++-- .../sglang/srt/model_executor/model_runner.py | 10 ++++----- python/sglang/srt/models/commandr.py | 2 +- .../sglang/srt/models/deepseek_janus_pro.py | 2 +- python/sglang/srt/models/gemma.py | 2 +- python/sglang/srt/models/gemma2.py | 2 +- python/sglang/srt/models/gemma3_causal.py | 4 ++-- python/sglang/srt/models/kimi_vl_moonvit.py | 2 +- python/sglang/srt/models/mllama.py | 8 +++---- python/sglang/srt/models/roberta.py | 2 +- python/sglang/srt/platforms/interface.py | 4 ++-- 
python/sglang/srt/server_args.py | 20 ++++++++--------- python/sglang/srt/speculative/eagle_worker.py | 12 +++++----- python/sglang/srt/utils.py | 2 +- .../test/attention/test_flashattn_backend.py | 2 +- .../attention/test_flashattn_mla_backend.py | 2 +- sgl-kernel/README.md | 6 ++--- .../benchmark/bench_moe_align_block_size.py | 4 ++-- .../csrc/allreduce/custom_all_reduce.cuh | 6 ++--- sgl-kernel/python/sgl_kernel/__init__.py | 2 +- sgl-kernel/python/sgl_kernel/flash_attn.py | 8 +++---- sgl-kernel/tests/test_merge_state_v2.py | 2 +- test/srt/models/lora/test_lora_cuda_graph.py | 4 ++-- test/srt/models/lora/utils.py | 10 ++++----- test/srt/test_srt_engine_with_quant_args.py | 4 ++-- test/srt/test_triton_attention_kernels.py | 2 +- .../test_update_weights_from_distributed.py | 2 +- 95 files changed, 276 insertions(+), 276 deletions(-) diff --git a/3rdparty/amd/profiling/PROFILING.md b/3rdparty/amd/profiling/PROFILING.md index 4fcbffee3..7e15ec844 100644 --- a/3rdparty/amd/profiling/PROFILING.md +++ b/3rdparty/amd/profiling/PROFILING.md @@ -356,7 +356,7 @@ client.sh # Start profiling via API curl http://localhost:30000/start_profile -H "Content-Type: application/json" -# Benchmark serving using SGLang with a random dataset and tokenizer +# Benchmark serving using sglang with random dataset and tokenizer # Define the log file with a timestamp TIMESTAMP=$(date +%Y%m%d_%H%M%S) LOGFILE="sglang_client_log_$TIMESTAMP.json" diff --git a/3rdparty/amd/tuning/TUNING.md b/3rdparty/amd/tuning/TUNING.md index b5c6005f5..e7b9b2049 100644 --- a/3rdparty/amd/tuning/TUNING.md +++ b/3rdparty/amd/tuning/TUNING.md @@ -93,21 +93,21 @@ TORCHINDUCTOR_MAX_AUTOTUNE=1 TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1 TORCHINDU #Inference with large improvement on AMD GPU TORCHINDUCTOR_FREEZING=1 your_script.sh ``` -## 4. Fused MoE kernel -To maximize MoE kernel efficiency, need to use below scripts to find out the best launch configuration +## 4. 
Fused MOE kernel +To maximize moe kernel efficiency, need to use below scripts to find out the best launch configuration ### Key parameters: -- **--model**: what MoE model type to do tuning, it will automatically decide the size of d_model, model_intermediate_size, num_layers +- **--model**: what moe model type to do tuning, it will automatically decide the size of d_model, model_intermediate_size, num_layers - **--tp-size**: simulate the whole model run configuration to set the dimension size using tp correctly -- **--batch**: M dimension size of MoE kernel, for prefill MoE kernel the value is batch*input_len, for decode MoE kernel the value is batch +- **--batch**: M dimension size of moe kernel, for prefill moe kernel the value is batch*input_len, for decode moe kernel the value is batch - **--dtype**: computation type ```bash #Tuning -#for example, we have one case like this "python3 -m sglang.bench_latency --model dummy_grok1/ --load-format dummy --tokenizer-path Xenova/grok-1-tokenizer --tp 8 --batch-size 32 --input 1024 --output 8 --attention-backend triton --sampling-backend pytorch --quantization fp8" to run, it defined batch-size 32 input length 1024 and output length 8, from "--batch" in MoE view point, the prefill batch is 32*1024 = 32768, the decode batch is 32*1(only one output token generated in each run). -#so we can tune decode MoE use below command +#for example, we have one case like this "python3 -m sglang.bench_latency --model dummy_grok1/ --load-format dummy --tokenizer-path Xenova/grok-1-tokenizer --tp 8 --batch-size 32 --input 1024 --output 8 --attention-backend triton --sampling-backend pytorch --quantization fp8" to run, it defined batch-size 32 input length 1024 and output length 8, from "--batch" in moe view point, the prefill batch is 32*1024 = 32768, the decode batch is 32*1(only one output token generated in each run). 
+#so we can tune decode moe use below command python benchmark_moe_rocm.py --model grok1 --tp-size 8 --dtype float8 --batch "32" -# and use this command to tune prefill MoE +# and use this command to tune prefill moe python benchmark_moe_rocm.py --model grok1 --tp-size 8 --dtype float8 --batch "32768" ``` diff --git a/README.md b/README.md index 3300f37f7..2abf1e4da 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ SGLang is a fast serving framework for large language models and vision language It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language. The core features include: -- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (PagedAttention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, quantization (FP8/INT4/AWQ/GPTQ), and multi-LoRA batching. +- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching. - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions. - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, Qwen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models. - **Active Community**: SGLang is open-source and backed by an active community with industry adoption. 
diff --git a/benchmark/benchmark_vllm_060/README.md b/benchmark/benchmark_vllm_060/README.md index 37606da8b..b480dabf2 100644 --- a/benchmark/benchmark_vllm_060/README.md +++ b/benchmark/benchmark_vllm_060/README.md @@ -1,6 +1,6 @@ ## How to reproduce the benchmark results for SGLang v0.3.0 compared to vLLM v0.6.0 -In short, with multi step enabled, in online scenarios that we benchmarked, the Median TTFT of vLLM is **3 times** that of SGLang, and the Median ITL is **10 times** that of SGLang. Lower Median TTFT and ITL are better. vLLM's multi-step optimization did not improve throughput while ensuring lower Median TTFT and ITL. Also, under maximum throughput benchmark, if vLLM does not set GPU utilization to 0.95 separately and uses the default configuration instead, its maximum throughput is **lower** than that of SGLang. +In short, with multi step enabled, in online scenarios that we benchmarked, the Median TTFT of vLLM is **3 times** that of SGLang, and the Median ITL is **10 times** that of SGLang. Lower Median TTFT and ITL are better. vLLM's multi-step optimization did not improve throughput while ensuring lower Median TTFT and ITL. Also, under maximum throughput benchmark, if vLLM does not set gpu util to 0.95 separately and uses the default configuration instead, its maximum throughput is **lower** than that of SGLang. 
## Online benchmark results @@ -41,12 +41,12 @@ In short, with multi step enabled, in online scenarios that we benchmarked, the ## Installation ```bash -# install SGLang v0.3.0 +# install sglang v0.3.0 pip install --upgrade pip pip install "sglang[all]"==0.3.0 pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ -# install vLLM v0.6.0 +# install vllm v0.6.0 pip install vllm==0.6.0 ``` diff --git a/benchmark/deepseek_v3/README.md b/benchmark/deepseek_v3/README.md index b83ed28b5..ec3c63adf 100644 --- a/benchmark/deepseek_v3/README.md +++ b/benchmark/deepseek_v3/README.md @@ -45,10 +45,10 @@ Add [performance optimization options](#performance-optimization-options) as nee ### Performance Optimization Options -[MLA optimizations](https://lmsys.org/blog/2024-09-04-sglang-v0-3/#deepseek-multi-head-latent-attention-mla-throughput-optimizations) are enabled by default. Here are some optional optimizations that can be enabled as needed. +[MLA optimizations](https://lmsys.org/blog/2024-09-04-sglang-v0-3/#deepseek-multi-head-latent-attention-mla-throughput-optimizations) are enabled by default. Here are some optional optimizations can be enabled as needed. - [Data Parallelism Attention](https://lmsys.org/blog/2024-12-04-sglang-v0-4/#data-parallelism-attention-for-deepseek-models): For high QPS scenarios, add the `--enable-dp-attention` argument to boost throughput. -- [Torch.compile Optimization](https://lmsys.org/blog/2024-09-04-sglang-v0-3/#torchcompile-latency-optimizations): Add `--enable-torch-compile` argument to enable it. This will take some time while the server starts. The maximum batch size for torch.compile optimization can be controlled with `--torch-compile-max-bs`. It's recommended to set it between `1` and `8`. (e.g., `--torch-compile-max-bs 8`) +- [Torch.compile Optimization](https://lmsys.org/blog/2024-09-04-sglang-v0-3/#torchcompile-latency-optimizations): Add `--enable-torch-compile` argument to enable it. 
This will take some time while server starts. The maximum batch size for torch.compile optimization can be controlled with `--torch-compile-max-bs`. It's recommended to set it between `1` and `8`. (e.g., `--torch-compile-max-bs 8`) ### Example: Sending requests with OpenAI API @@ -90,7 +90,7 @@ If you have two H100 nodes, the usage is similar to the aforementioned H20. > **Note that the launch command here does not enable Data Parallelism Attention or `torch.compile` Optimization**. For optimal performance, please refer to the command options in [Performance Optimization Options](#option_args). -### Example: Serving with two H200\*8 nodes and Docker +### Example: Serving with two H200\*8 nodes and docker There are two H200 nodes, each with 8 GPUs. The first node's IP is `192.168.114.10`, and the second node's IP is `192.168.114.11`. Configure the endpoint to expose it to another Docker container using `--host 0.0.0.0` and `--port 40000`, and set up communications with `--dist-init-addr 192.168.114.10:20000`. A single H200 with 8 devices can run DeepSeek V3, the dual H200 setup is just to demonstrate multi-node usage. @@ -147,7 +147,7 @@ docker run --gpus all \ To serve DeepSeek-V3 with A100 GPUs, we need to convert the [FP8 model checkpoints](https://huggingface.co/deepseek-ai/DeepSeek-V3) to BF16 with [script](https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/fp8_cast_bf16.py) mentioned [here](https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/fp8_cast_bf16.py) first. -Since the BF16 model is over 1.3 TB, we need to prepare four A100 nodes, each with 8 80GB GPUs. Assuming the first node's IP is `10.0.0.1`, and the converted model path is `/path/to/DeepSeek-V3-BF16`, we can run the following commands to launch the server. +Since the BF16 model is over 1.3 TB, we need to prepare four A100 nodes, each with 8 80GB GPUs. 
Assume the first node's IP is `10.0.0.1`, and the converted model path is `/path/to/DeepSeek-V3-BF16`, we can have following commands to launch the server. ```bash # node 1 @@ -178,7 +178,7 @@ python3 -m sglang.bench_one_batch_server --model None --base-url http://10.0.0.1 ### Example: Serving with 8 A100/A800 with AWQ Quantization -Add the `--quantization moe_wna16` flag to enable the MoE wna16 kernel for better performance. +Add `--quantization moe_wna16` flag to enable moe wna16 kernel for better performance. One example is as follows: ```bash @@ -188,12 +188,12 @@ python3 -m sglang.launch_server --model cognitivecomputations/DeepSeek-R1-AWQ -- ### Example: Serving with 16 A100/A800 with int8 Quantization -There are block-wise and per-channel quantization methods, and the quantization parameters have already been uploaded to HuggingFace. One example is as follows: +There are block-wise and per-channel quantization methods, and the quantization parameters have already been uploaded to Huggingface. 
One example is as follows: - [meituan/DeepSeek-R1-Block-INT8](https://huggingface.co/meituan/DeepSeek-R1-Block-INT8) - [meituan/DeepSeek-R1-Channel-INT8](https://huggingface.co/meituan/DeepSeek-R1-Channel-INT8) -Assuming that the master node IP is `MASTER_IP`, checkpoint path is `/path/to/DeepSeek-R1-INT8` and port=5000, we can run the following commands to launch the server: +Assuming that master node IP is `MASTER_IP`, checkpoint path is `/path/to/DeepSeek-R1-INT8` and port=5000, we can have following commands to launch the server: ```bash #master python3 -m sglang.launch_server \ @@ -225,7 +225,7 @@ Running with per-channel quantization model: - [meituan/DeepSeek-R1-Channel-INT8](https://huggingface.co/meituan/DeepSeek-R1-Channel-INT8) -Assuming that the master node IP is `MASTER_IP`, checkpoint path is `/path/to/DeepSeek-R1-Channel-INT8` and port=5000, we can run the following commands to launch the server: +Assuming that master node IP is `MASTER_IP`, checkpoint path is `/path/to/DeepSeek-R1-Channel-INT8` and port=5000, we can have following commands to launch the server: ```bash #master diff --git a/benchmark/gsm8k/README.md b/benchmark/gsm8k/README.md index f5e898903..c110f533c 100644 --- a/benchmark/gsm8k/README.md +++ b/benchmark/gsm8k/README.md @@ -1,6 +1,6 @@ -## Run Benchmark +## Run benchmark -### Benchmark SGLang +### Benchmark sglang ``` python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 ``` @@ -10,7 +10,7 @@ python3 bench_sglang.py --num-questions 200 ``` -### Benchmark vLLM +### Benchmark vllm ``` python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000 ``` @@ -20,7 +20,7 @@ python3 bench_other.py --num-questions 200 --backend vllm ``` -### Benchmark LightLLM +### Benchmark lightllm ``` # A10G python -m lightllm.server.api_server --tokenizer_mode auto --model_dir ~/model_weights/llama-2-7b-chat-hf --max_total_token_num 16000 --port 
22000 @@ -31,13 +31,13 @@ python3 bench_other.py --num-questions 200 --backend lightllm ``` -### Benchmark Guidance +### Benchmark guidance ``` python3 bench_other.py --num-questions 200 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf ``` -### Benchmark LMQL +### Benchmark lmql ``` CUDA_VISIBLE_DEVICES=0,1 lmql serve-model meta-llama/Llama-2-7b-chat-hf --cuda --port 23000 ``` diff --git a/benchmark/hellaswag/README.md b/benchmark/hellaswag/README.md index d11cc659e..cb7e65366 100644 --- a/benchmark/hellaswag/README.md +++ b/benchmark/hellaswag/README.md @@ -1,6 +1,6 @@ ## Run benchmark -### Benchmark SGLang +### Benchmark sglang ``` python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 ``` @@ -10,7 +10,7 @@ python3 bench_sglang.py --num-questions 200 ``` -### Benchmark vLLM +### Benchmark vllm ``` python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000 ``` @@ -20,7 +20,7 @@ python3 bench_other.py --num-questions 200 --backend vllm ``` -### Benchmark LightLLM +### Benchmark lightllm ``` # A10G python -m lightllm.server.api_server --tokenizer_mode auto --model_dir ~/model_weights/llama-2-7b-chat-hf --max_total_token_num 16000 --port 22000 @@ -31,13 +31,13 @@ python3 bench_other.py --num-questions 200 --backend lightllm ``` -### Benchmark Guidance +### Benchmark guidance ``` CUDA_VISIBLE_DEVICES=0,1 python3 bench_other.py --num-questions 200 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf ``` -### Benchmark LMQL +### Benchmark lmql ``` lmql serve-model meta-llama/Llama-2-7b-chat-hf --cuda --port 23000 ``` diff --git a/benchmark/kernels/fused_moe_triton/README.md b/benchmark/kernels/fused_moe_triton/README.md index 0348cabec..a0a7ca9c8 100644 --- a/benchmark/kernels/fused_moe_triton/README.md +++ b/benchmark/kernels/fused_moe_triton/README.md @@ -4,7 +4,7 @@ This directory contains benchmarking tools for MoE 
(Mixture of Experts) kernels. ### Tuning Tool -- `tuning_fused_moe_triton.py`: A tool for tuning the `fused_moe_triton` kernel. Adapted from [vLLM's benchmark_moe.py](https://github.com/vllm-project/vllm/blob/main/benchmarks/kernels/benchmark_moe.py), with added support for various model architectures. +- `tuning_fused_moe_triton.py`: A tool for tuning the `fused_moe_triton` kernel. Adapted from [vllm's benchmark_moe.py](https://github.com/vllm-project/vllm/blob/main/benchmarks/kernels/benchmark_moe.py), with added support for various model architectures. Example usage: ```bash @@ -48,7 +48,7 @@ After tuning, a configuration file (e.g., `E=64,N=640,device_name=NVIDIA_GeForce ### Performance Comparison Tool -- `benchmark_vllm_vs_sglang_fused_moe_triton.py`: A tool for comparing the performance of fused MoE kernels between vLLM and SGLang implementations. Supports various model architectures and data types. +- `benchmark_vllm_vs_sglang_fused_moe_triton.py`: A tool for comparing the performance of fused MoE kernels between vllm and sglang implementations. Supports various model architectures and data types. 
Example usage: ```bash diff --git a/benchmark/mmlu/README.md b/benchmark/mmlu/README.md index 40eb77740..16de20cda 100644 --- a/benchmark/mmlu/README.md +++ b/benchmark/mmlu/README.md @@ -1,11 +1,11 @@ -## Download Data +## Download data ``` bash download_data.sh ``` -## Run Benchmark +## Run benchmark -### Benchmark SGLang +### Benchmark sglang ``` python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 ``` @@ -19,7 +19,7 @@ python3 bench_sglang.py --nsub 10 python3 bench_sglang.py --backend gpt-3.5-turbo --parallel 8 ``` -### Benchmark vLLM +### Benchmark vllm ``` python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000 ``` @@ -29,7 +29,7 @@ python3 bench_other.py --nsub 10 --backend vllm ``` -### Benchmark LightLLM +### Benchmark lightllm ``` # A10G python -m lightllm.server.api_server --tokenizer_mode auto --model_dir ~/model_weights/llama-2-7b-chat-hf --max_total_token_num 16000 --port 22000 @@ -43,13 +43,13 @@ python3 bench_other.py --nsub 10 --backend lightllm ``` -### Benchmark Guidance +### Benchmark guidance ``` python3 bench_other.py --nsub 10 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf ``` -### Benchmark LMQL +### Benchmark lmql ``` CUDA_VISIBLE_DEVICES=0,1 lmql serve-model meta-llama/Llama-2-7b-chat-hf --cuda --port 23000 ``` diff --git a/benchmark/mtbench/README.md b/benchmark/mtbench/README.md index 95136f838..e6babf96e 100644 --- a/benchmark/mtbench/README.md +++ b/benchmark/mtbench/README.md @@ -4,9 +4,9 @@ wget -O question.jsonl https://raw.githubusercontent.com/lm-sys/FastChat/main/fastchat/llm_judge/data/mt_bench/question.jsonl ``` -## Run Benchmark +## Run benchmark -### Benchmark SGLang +### Benchmark sglang ``` python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 ``` @@ -15,7 +15,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port python3 
bench_sglang.py --num-questions 80 ``` -### Benchmark SGLang EAGLE +### Benchmark sglang EAGLE ``` python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3-8B-Instruct --speculative-algo EAGLE \ --speculative-draft lmsys/sglang-EAGLE-LLaMA3-Instruct-8B --speculative-num-steps 5 \ @@ -27,7 +27,7 @@ python3 bench_sglang_eagle.py --num-questions 80 --parallel 1 ``` -### Benchmark vLLM +### Benchmark vllm ``` python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000 ``` @@ -37,7 +37,7 @@ python3 bench_other.py --num-questions 80 --backend vllm ``` -### Benchmark LightLLM +### Benchmark lightllm ``` # A10G python -m lightllm.server.api_server --tokenizer_mode auto --model_dir ~/model_weights/llama-2-7b-chat-hf --max_total_token_num 16000 --port 22000 diff --git a/benchmark/multi_chain_reasoning/README.md b/benchmark/multi_chain_reasoning/README.md index 4a1a71b1d..4c9f740f5 100644 --- a/benchmark/multi_chain_reasoning/README.md +++ b/benchmark/multi_chain_reasoning/README.md @@ -1,11 +1,11 @@ -## Download Data +## Download data ``` wget https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl ``` -## Run Benchmark +## Run benchmark -### Benchmark SGLang +### Benchmark sglang ``` python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --schedule-conservativeness 1.3 ``` @@ -16,7 +16,7 @@ python3 bench_sglang.py --num-questions 32 --parallel 1 ``` -### Benchmark vLLM +### Benchmark vllm ``` python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000 ``` @@ -26,7 +26,7 @@ python3 bench_other.py --num-questions 64 --backend vllm ``` -### Benchmark LightLLM +### Benchmark lightllm ``` # A10G python -m lightllm.server.api_server --tokenizer_mode auto --model_dir ~/model_weights/llama-2-7b-chat-hf --max_total_token_num 16000 --port 22000 @@ -37,12 
+37,12 @@ python3 bench_other.py --num-questions 64 --backend lightllm ``` -### Benchmark Guidance +### Benchmark guidance ``` python3 bench_other.py --num-questions 8 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf ``` -### Benchmark LMQL +### Benchmark lmql ``` python3 bench_other.py --num-questions 64 --backend lmql --parallel 1 diff --git a/benchmark/tree_of_thought_deep/README.md b/benchmark/tree_of_thought_deep/README.md index d6b11f12e..bf5ab1638 100644 --- a/benchmark/tree_of_thought_deep/README.md +++ b/benchmark/tree_of_thought_deep/README.md @@ -1,13 +1,13 @@ -## Download Data +## Download data ``` wget https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl ``` -## Run Benchmark +## Run benchmark NOTE: This is an implementation for throughput/latency benchmark purposes. The prompts are not tuned to achieve good accuracy on the GSM-8K tasks. -### Benchmark SGLang +### Benchmark sglang ``` python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 ``` @@ -18,7 +18,7 @@ python3 bench_sglang.py --num-questions 16 --parallel 1 ``` -### Benchmark vLLM +### Benchmark vllm ``` python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000 ``` @@ -28,7 +28,7 @@ python3 bench_other.py --num-questions 32 --backend vllm ``` -### Benchmark LightLLM +### Benchmark lightllm ``` # A10G python -m lightllm.server.api_server --tokenizer_mode auto --model_dir ~/model_weights/llama-2-7b-chat-hf --max_total_token_num 16000 --port 22000 @@ -39,12 +39,12 @@ python3 bench_other.py --num-questions 32 --backend lightllm ``` -### Benchmark Guidance +### Benchmark guidance ``` python3 bench_other.py --num-questions 8 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf ``` -### Benchmark LMQL +### Benchmark lmql ``` python3 bench_other.py --num-questions 8 --backend lmql --parallel 1 diff --git 
a/benchmark/tree_of_thought_v0/README.md b/benchmark/tree_of_thought_v0/README.md index 08d7e2c29..821bb20d1 100644 --- a/benchmark/tree_of_thought_v0/README.md +++ b/benchmark/tree_of_thought_v0/README.md @@ -1,11 +1,11 @@ -## Download Data +## Download data ``` wget https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl ``` ## Run benchmark -### Benchmark SGLang +### Benchmark sglang ``` python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 ``` @@ -16,7 +16,7 @@ python3 bench_sglang.py --num-questions 10 --parallel 1 ``` -### Benchmark vLLM +### Benchmark vllm ``` python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000 ``` @@ -26,7 +26,7 @@ python3 bench_other.py --num-questions 32 --backend vllm ``` -### Benchmark LightLLM +### Benchmark lightllm ``` # A10G python -m lightllm.server.api_server --tokenizer_mode auto --model_dir ~/model_weights/llama-2-7b-chat-hf --max_total_token_num 16000 --port 22000 @@ -37,7 +37,7 @@ python3 bench_other.py --num-questions 32 --backend lightllm ``` -### Benchmark Guidance +### Benchmark guidance ``` python3 bench_other.py --num-questions 32 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf ``` diff --git a/docker/k8s-sglang-distributed-sts.yaml b/docker/k8s-sglang-distributed-sts.yaml index 15538cd41..6b81d9b14 100644 --- a/docker/k8s-sglang-distributed-sts.yaml +++ b/docker/k8s-sglang-distributed-sts.yaml @@ -22,7 +22,7 @@ spec: command: - /bin/bash - -c - # please modify the SGLang serving arguments below, as necessary. + # please modify the sglang serving arguments below, as necessary. 
# NOTE: the --expert-parallel-size and --enable-ep-moe are for MoE model like DeepSeek-R1 args: - | diff --git a/docs/README.md b/docs/README.md index 95ca0090e..0dfadb1d0 100644 --- a/docs/README.md +++ b/docs/README.md @@ -82,7 +82,7 @@ if is_in_ci(): llm = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct") -# Terminate Engine +# Terminate Engine llm.shutdown() ``` @@ -94,7 +94,7 @@ llm.shutdown() ### **Model Selection** -For demonstrations in the docs, we **prefer smaller models** to reduce memory consumption and speed up inference. Running larger models in CI can lead to instability due to memory constraints. +For demonstrations in the docs, **prefer smaller models** to reduce memory consumption and speed up inference. Running larger models in CI can lead to instability due to memory constraints. ### **Prompt Alignment Example** diff --git a/docs/backend/quantization.md b/docs/backend/quantization.md index 2b5a8afbf..3a229f83d 100644 --- a/docs/backend/quantization.md +++ b/docs/backend/quantization.md @@ -134,7 +134,7 @@ python3 -m sglang.launch_server \ SGLang supports the following quantization methods based on torchao `["int8dq", "int8wo", "fp8wo", "fp8dq-per_tensor", "fp8dq-per_row", "int4wo-32", "int4wo-64", "int4wo-128", "int4wo-256"]`. -Note: According to [this issue](https://github.com/sgl-project/sglang/issues/2219#issuecomment-2561890230), `"int8dq"` method currently has some bugs when using together with CUDA graph capture. So we suggest to disable the CUDA graph capture when using `"int8dq"` method. Namely, please use the following command: +Note: According to [this issue](https://github.com/sgl-project/sglang/issues/2219#issuecomment-2561890230), `"int8dq"` method currently has some bugs when using together with cuda graph capture. So we suggest to disable cuda graph capture when using `"int8dq"` method. 
Namely, please use the following command: ```bash python3 -m sglang.launch_server \ diff --git a/docs/backend/server_arguments.md b/docs/backend/server_arguments.md index 445e72593..50b888cbf 100644 --- a/docs/backend/server_arguments.md +++ b/docs/backend/server_arguments.md @@ -38,7 +38,7 @@ memory management, and optimization techniques. - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments. - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`. - If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](custom_chat_template.md). -- To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port, so you can use the following commands. If you encounter deadlocks, please try to add `--disable-cuda-graph`. +- To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port, you can use the following commands. If you meet deadlock, please try to add `--disable-cuda-graph` ```bash # Node 0 diff --git a/docs/developer/development_guide_using_docker.md b/docs/developer/development_guide_using_docker.md index 308b82b74..e38947902 100644 --- a/docs/developer/development_guide_using_docker.md +++ b/docs/developer/development_guide_using_docker.md @@ -1,9 +1,9 @@ # Development Guide Using Docker ## Setup VSCode on a Remote Host -(Optional - you can skip this step if you plan to run the SGLang dev container locally) +(Optional - you can skip this step if you plan to run sglang dev container locally) -1. 
In the remote host, download `code` from [https://code.visualstudio.com/docs/?dv=linux64cli](https://code.visualstudio.com/download) and run `code tunnel` in a shell. +1. In the remote host, download `code` from [Https://code.visualstudio.com/docs/?dv=linux64cli](https://code.visualstudio.com/download) and run `code tunnel` in a shell. Example ```bash @@ -19,20 +19,20 @@ tar xf vscode_cli_alpine_x64_cli.tar.gz ## Setup Docker Container ### Option 1. Use the default dev container automatically from VSCode -There is a `.devcontainer` folder in the SGLang repository root folder to allow VSCode to automatically start up within dev container. You can read more about this VSCode extension in VSCode official document [Developing inside a Container](https://code.visualstudio.com/docs/devcontainers/containers). +There is a `.devcontainer` folder in the sglang repository root folder to allow VSCode to automatically start up within dev container. You can read more about this VSCode extension in VSCode official document [Developing inside a Container](https://code.visualstudio.com/docs/devcontainers/containers). ![image](https://github.com/user-attachments/assets/6a245da8-2d4d-4ea8-8db1-5a05b3a66f6d) (*Figure 1: Diagram from VSCode official documentation [Developing inside a Container](https://code.visualstudio.com/docs/devcontainers/containers).*) To enable this, you only need to: -1. Start Visual Studio Code and install the [VSCode dev container extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers). +1. Start Visual Studio Code and install [VSCode dev container extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers). 2. Press F1, type and choose "Dev Container: Open Folder in Container. 3. Input the `sglang` local repo path in your machine and press enter. -The first time you open it in the dev container might take longer due to docker pull and build. 
Once it's successful, you should set on your status bar at the bottom left displaying that you are in a dev container: +The first time you open it in dev container might take longer due to docker pull and build. Once it's successful, you should set on your status bar at the bottom left displaying that you are in a dev container: ![image](https://github.com/user-attachments/assets/650bba0b-c023-455f-91f9-ab357340106b) -Now when you run `sglang.launch_server` in the VSCode terminal or start debugging using F5, the SGLang server will be started in the dev container with all your local changes applied automatically: +Now when you run `sglang.launch_server` in the VSCode terminal or start debugging using F5, sglang server will be started in the dev container with all your local changes applied automatically: ![image](https://github.com/user-attachments/assets/748c85ba-7f8c-465e-8599-2bf7a8dde895) @@ -52,21 +52,21 @@ docker run -itd --shm-size 32g --gpus all -v --ipc=host --net docker exec -it sglang_dev /bin/zsh ``` Some useful volumes to mount are: -1. **HuggingFace model cache**: mounting model cache can avoid the need to re-download every time docker restarts. Default location on Linux is `~/.cache/huggingface/`. +1. **Huggingface model cache**: mounting model cache can avoid re-download every time docker restarts. Default location on Linux is `~/.cache/huggingface/`. 2. **SGLang repository**: code changes in the SGLang local repository will be automatically synced to the .devcontainer. -Example 1: Mounting local cache folder `/opt/dlami/nvme/.cache` but not the SGLang repo. Use this when you prefer to manually transfer local code changes to the devcontainer. +Example 1: Monting local cache folder `/opt/dlami/nvme/.cache` but not the SGLang repo. Use this when you prefer to manually transfer local code changes to the devcontainer. 
```bash docker run -itd --shm-size 32g --gpus all -v /opt/dlami/nvme/.cache:/root/.cache --ipc=host --network=host --privileged --name sglang_zhyncs lmsysorg/sglang:dev /bin/zsh docker exec -it sglang_zhyncs /bin/zsh ``` -Example 2: Mounting both the HuggingFace cache and the local SGLang repo. Local code changes are automatically synced to the devcontainer as SGLang is installed in editable mode in the dev image. +Example 2: Mounting both HuggingFace cache and local SGLang repo. Local code changes are automatically synced to the devcontainer as the SGLang is installed in editable mode in the dev image. ```bash docker run -itd --shm-size 32g --gpus all -v $HOME/.cache/huggingface/:/root/.cache/huggingface -v $HOME/src/sglang:/sgl-workspace/sglang --ipc=host --network=host --privileged --name sglang_zhyncs lmsysorg/sglang:dev /bin/zsh docker exec -it sglang_zhyncs /bin/zsh ``` ## Debug SGLang with VSCode Debugger -1. (Create if it does not exist) open `launch.json` in VSCode. +1. (Create if not exist) open `launch.json` in VSCode. 2. Add the following config and save. Please note that you can edit the script as needed to apply different parameters or debug a different program (e.g. benchmark script). ```JSON { diff --git a/docs/developer/setup_github_runner.md b/docs/developer/setup_github_runner.md index f53447656..b05adc4eb 100644 --- a/docs/developer/setup_github_runner.md +++ b/docs/developer/setup_github_runner.md @@ -4,7 +4,7 @@ ### Step 1: Start a docker container. -You can mount a folder for the shared HuggingFace model weights cache. The command below uses `/tmp/huggingface` as an example. +You can mount a folder for the shared huggingface model weights cache. The command below uses `/tmp/huggingface` as an example. 
``` docker pull nvidia/cuda:12.1.1-devel-ubuntu22.04 diff --git a/docs/index.rst b/docs/index.rst index 82b553b2f..e98d5d95b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -5,7 +5,7 @@ SGLang is a fast serving framework for large language models and vision language It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language. The core features include: -- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (PagedAttention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, quantization (FP8/INT4/AWQ/GPTQ), and multi-LoRA batching. +- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching. - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions. - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, Qwen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models. - **Active Community**: SGLang is open-source and backed by an active community with industry adoption. 
diff --git a/docs/references/benchmark_and_profiling.md b/docs/references/benchmark_and_profiling.md index bb6955633..98c67fd4d 100644 --- a/docs/references/benchmark_and_profiling.md +++ b/docs/references/benchmark_and_profiling.md @@ -69,7 +69,7 @@ This command sets the number of prompts to 2 with `--num-prompts` argument and limits the length of output sequences to 100 with `--sharegpt-output-len` argument, which can generate a small trace file for browser to open smoothly. - Additionally, if you want to locate the SGLang Python source code through the CUDA kernel in Trace, you need to disable CUDA Graph when starting the service. This can be done by using the `--disable-cuda-graph` parameter in the command to start the service. + Additionally, if you want to locate the SGLang Python source code through the cuda kernel in Trace, you need to disable CUDA Graph when starting the service. This can be done by using the `--disable-cuda-graph` parameter in the command to start the service. ## Profile with Nsight diff --git a/docs/references/contribution_guide.md b/docs/references/contribution_guide.md index c096f0917..3d5e33104 100644 --- a/docs/references/contribution_guide.md +++ b/docs/references/contribution_guide.md @@ -35,7 +35,7 @@ SGLang uses Python's built-in [unittest](https://docs.python.org/3/library/unitt ## Writing Documentation & Running Docs CI -We recommend new contributors start from writing documentation, which helps you quickly understand the SGLang codebase. For more details, please refer to [docs/README.md](https://github.com/sgl-project/sglang/tree/main/docs/README.md). +We recommend new contributors start from writing documentation, which helps you quickly understand SGLang codebase. For more details, please refer to [docs/README.md](https://github.com/sgl-project/sglang/tree/main/docs/README.md). 
## Tips for Newcomers diff --git a/docs/references/deepseek.md b/docs/references/deepseek.md index 5e4902e04..612885bc5 100644 --- a/docs/references/deepseek.md +++ b/docs/references/deepseek.md @@ -69,13 +69,13 @@ If you encounter errors when starting the server, ensure the weights have finish The DeepSeek series have huge model weights, it takes some time to compile the model with `torch.compile` for the first time if you have added the flag `--enable-torch-compile`. You can refer [here](https://docs.sglang.ai/backend/hyperparameter_tuning.html#try-advanced-options) to optimize the caching of compilation results, so that the cache can be used to speed up the next startup. ### Launch with one node of 8 x H200 -Please refer to [the example](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#using-docker-recommended). **Note that Deepseek V3 is already in FP8**. So we should not run it with any quantization arguments like `--quantization fp8 --kv-cache-dtype fp8_e5m2`. +Please refer to [the example](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#using-docker-recommended). **Note that Deepseek V3 is already in FP8. So we should not run it with any quantization arguments like `--quantization fp8 --kv-cache-dtype fp8_e5m2`. ### Running examples on Multi-node - [Serving with two H20*8 nodes](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-two-h208-nodes). -- [Serving with two H200*8 nodes and Docker](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-two-h2008-nodes-and-docker). +- [Serving with two H200*8 nodes and docker](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-two-h2008-nodes-and-docker). - [Serving with four A100*8 nodes](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-four-a1008-nodes). 
@@ -89,13 +89,13 @@ Please refer to [the example](https://github.com/sgl-project/sglang/tree/main/be - **MLA Attention Backends**: Currently SGLang supports different optimized MLA attention backends, including [FlashAttention3](https://github.com/Dao-AILab/flash-attention), [Flashinfer](https://docs.flashinfer.ai/api/mla.html), [FlashMLA](https://github.com/deepseek-ai/FlashMLA), [CutlassMLA](https://github.com/sgl-project/sglang/pull/5390), and [Triton](https://github.com/triton-lang/triton) backends. The default FA3 provides good performance across wide workloads. -- **FP8 Quantization**: W8A8 FP8 and KV Cache FP8 quantization enables efficient FP8 inference. Additionally, we have implemented the Batched Matrix Multiplication (BMM) operator to facilitate FP8 inference in MLA with weight absorption. +- **FP8 Quantization**: W8A8 FP8 and KV Cache FP8 quantization enables efficient FP8 inference. Additionally, we have implemented Batched Matrix Multiplication (BMM) operator to facilitate FP8 inference in MLA with weight absorption. -- **CUDA Graph & torch.compile**: Both MLA and Mixture of Experts (MoE) are compatible with CUDA Graph and torch.compile, which reduces latency and accelerates decoding speed for small batch sizes. +- **CUDA Graph & Torch.compile**: Both MLA and Mixture of Experts (MoE) are compatible with CUDA Graph and Torch.compile, which reduces latency and accelerates decoding speed for small batch sizes. -- **Chunked Prefix Cache**: Chunked prefix cache optimization can increase throughput by cutting prefix cache into chunks, processing them with multi-head attention and merging their states. Its improvement can be significant when doing chunked prefill on long sequences. Currently this optimization is only available for the FlashAttention3 backend. +- **Chunked Prefix Cache**: Chunked prefix cache optimization can increase throughput by cutting prefix cache into chunks, processing them with multi-head attention and merging their states. 
Its improvement can be significant when doing chunked prefill on long sequences. Currently this optimization is only available for FlashAttention3 backend. -Overall, with these optimizations, we have achieved up to a **7x** acceleration in output throughput compared to the previous version. +Overall, with these optimizations, we have achieved up to **7x** acceleration in output throughput compared to the previous version.

Multi-head Latent Attention for DeepSeek Series Models @@ -113,7 +113,7 @@ Overall, with these optimizations, we have achieved up to a **7x** acceleration Data Parallelism Attention for DeepSeek Series Models

-With data parallelism attention enabled, we have achieved up to a **1.9x** decoding throughput improvement compared to the previous version. +With data parallelism attention enabled, we have achieved up to **1.9x** decoding throughput improvement compared to the previous version.

Data Parallelism Attention Performance Comparison @@ -150,7 +150,7 @@ python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-V3 --tp 8 --tru The precompilation process typically takes around 10 minutes to complete. ### Multi-token Prediction -**Description**: SGLang implements DeepSeek V3 Multi-Token Prediction (MTP) based on [EAGLE speculative decoding](https://docs.sglang.ai/backend/speculative_decoding.html#EAGLE-Decoding). With this optimization, the decoding speed can be improved by **1.8x** for batch size 1 and **1.5x** for batch size 32 respectively with H200 TP8 setting. +**Description**: SGLang implements DeepSeek V3 Multi-Token Prediction (MTP) based on [EAGLE speculative decoding](https://docs.sglang.ai/backend/speculative_decoding.html#EAGLE-Decoding). With this optimization, the decoding speed can be improved by **1.8x** for batch size 1 and **1.5x** for batch size 32 respectively on H200 TP8 setting. **Usage**: Add arguments `--speculative-algorithm`, `--speculative-num-steps`, `--speculative-eagle-topk` and `--speculative-num-draft-tokens` to enable this feature. For example: @@ -161,7 +161,7 @@ python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --spec - FlashAttention3 and Triton backend fully supports MTP usage. For FlashInfer backend (`--attention-backend flashinfer`) with speculative decoding,`--speculative-eagle-topk` parameter should be set to `1`. MTP support for the FlashMLA backend and CutlassMLA backend is still under development. - To enable DeepSeek MTP for large batch sizes (>32), there are some parameters should be changed (Reference [this discussion](https://github.com/sgl-project/sglang/issues/4543#issuecomment-2737413756)): - Adjust `--max-running-requests` to a larger number. The default value is `32` for MTP. For larger batch sizes, you should increase this value beyond the default value. - - Set `--cuda-graph-bs`. It is a list of batch sizes for CUDA graph capture. 
The default captured batch sizes for speculative decoding is set [here](https://github.com/sgl-project/sglang/blob/49420741746c8f3e80e0eb17e7d012bfaf25793a/python/sglang/srt/model_executor/cuda_graph_runner.py#L126). You can include more batch sizes into it. + - Set `--cuda-graph-bs`. It's a list of batch sizes for cuda graph capture. The default captured batch sizes for speculative decoding is set [here](https://github.com/sgl-project/sglang/blob/49420741746c8f3e80e0eb17e7d012bfaf25793a/python/sglang/srt/model_executor/cuda_graph_runner.py#L126). You can include more batch sizes into it. ### Reasoning Content for DeepSeek R1 @@ -209,7 +209,7 @@ data: {"choices":[{"delta":{"tool_calls":[{"function":{"arguments":"\"}"}}]}}]} data: {"choices":[{"delta":{"tool_calls":null}}], "finish_reason": "tool_calls"} data: [DONE] ``` -The client needs to concatenate all fragments of the tool call arguments to reconstruct the complete tool call: +The client needs to concatenate all arguments fragments to reconstruct the complete tool call: ``` {"city": "Qingdao"} ``` @@ -223,4 +223,4 @@ Important Notes: 1. **Question**: What should I do if model loading takes too long and NCCL timeout occurs? - **Answer**: You can try to add `--dist-timeout 3600` when launching the model, this allows for a 1-hour timeout. + **Answer**: You can try to add `--dist-timeout 3600` when launching the model, this allows for 1-hour timeout. diff --git a/docs/references/deploy_on_k8s.md b/docs/references/deploy_on_k8s.md index dd06fd144..cfc099f56 100644 --- a/docs/references/deploy_on_k8s.md +++ b/docs/references/deploy_on_k8s.md @@ -330,7 +330,7 @@ This should resolve most NCCL-related issues. ## Remaining issues * In Kubernetes, Docker, or Containerd environments, we use hostNetwork to prevent performance degradation. -* We utilize privileged mode, which isn't secure. Additionally, in containerized environments, full GPU isolation cannot be achieved. +* We utilize privileged mode, which isn’t secure. 
Additionally, in containerized environments, full GPU isolation cannot be achieved. ## TODO diff --git a/docs/references/modelscope.md b/docs/references/modelscope.md index fe042adbb..4740c2770 100644 --- a/docs/references/modelscope.md +++ b/docs/references/modelscope.md @@ -25,4 +25,4 @@ docker run --gpus all \ python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --port 30000 ``` -Note that ModelScope uses a different cache directory than HuggingFace. You may need to set it manually to avoid running out of disk space. +Note that modelscope uses a different cache directory than huggingface. You may need to set it manually to avoid running out of disk space. diff --git a/docs/start/install.md b/docs/start/install.md index 30577388c..ca7d6dec4 100644 --- a/docs/start/install.md +++ b/docs/start/install.md @@ -23,7 +23,7 @@ uv pip install "sglang[all]>=0.4.6.post3" 1. Use `export CUDA_HOME=/usr/local/cuda-` to set the `CUDA_HOME` environment variable. 2. Install FlashInfer first following [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html), then install SGLang as described above. -- If you encounter `ImportError; cannot import name 'is_valid_list_of_images' from 'transformers.models.llama.image_processing_llama'`, try to use the specified version of `transformers` in [pyproject.toml](https://github.com/sgl-project/sglang/blob/main/python/pyproject.toml). Currently, run `pip install transformers==4.51.1`. +- If you encounter `ImportError; cannot import name 'is_valid_list_of_images' from 'transformers.models.llama.image_processing_llama'`, try to use the specified version of `transformers` in [pyproject.toml](https://github.com/sgl-project/sglang/blob/main/python/pyproject.toml). Currently, just running `pip install transformers==4.51.1`. ## Method 2: From source @@ -54,10 +54,10 @@ cd .. 
pip install -e "python[all_hip]" ``` -## Method 3: Using Docker +## Method 3: Using docker The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](https://github.com/sgl-project/sglang/tree/main/docker). -Replace `` below with your HuggingFace hub [token](https://huggingface.co/docs/hub/en/security-tokens). +Replace `` below with your huggingface hub [token](https://huggingface.co/docs/hub/en/security-tokens). ```bash docker run --gpus all \ @@ -89,7 +89,7 @@ drun -p 30000:30000 \ drun v0.4.6.post3-rocm630 python3 -m sglang.bench_one_batch --batch-size 32 --input 1024 --output 128 --model amd/Meta-Llama-3.1-8B-Instruct-FP8-KV --tp 8 --quantization fp8 ``` -## Method 4: Using Docker Compose +## Method 4: Using docker compose

More @@ -164,4 +164,4 @@ sky status --endpoint 30000 sglang - [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is the default attention kernel backend. It only supports sm75 and above. If you encounter any FlashInfer-related issues on sm75+ devices (e.g., T4, A10, A100, L4, L40S, H100), please switch to other kernels by adding `--attention-backend triton --sampling-backend pytorch` and open an issue on GitHub. - If you only need to use OpenAI models with the frontend language, you can avoid installing other dependencies by using `pip install "sglang[openai]"`. - The language frontend operates independently of the backend runtime. You can install the frontend locally without needing a GPU, while the backend can be set up on a GPU-enabled machine. To install the frontend, run `pip install sglang`, and for the backend, use `pip install sglang[srt]`. `srt` is the abbreviation of SGLang runtime. -- To reinstall FlashInfer locally, use the following command: `pip install "flashinfer-python==0.2.5" -i https://flashinfer.ai/whl/cu124/torch2.6 --force-reinstall --no-deps` and then delete the cache with `rm -rf ~/.cache/flashinfer`. +- To reinstall flashinfer locally, use the following command: `pip install "flashinfer-python==0.2.5" -i https://flashinfer.ai/whl/cu124/torch2.6 --force-reinstall --no-deps` and then delete the cache with `rm -rf ~/.cache/flashinfer`. diff --git a/examples/runtime/README.md b/examples/runtime/README.md index 2e7cf02ba..941863727 100644 --- a/examples/runtime/README.md +++ b/examples/runtime/README.md @@ -28,7 +28,7 @@ The `engine` folder contains that examples that show how to use [Offline Engine ## Hidden States -The `hidden_states` folder contains examples on how to extract hidden states using SGLang. Please note that this might degrade throughput due to CUDA graph rebuilding. +The `hidden_states` folder contains examples on how to extract hidden states using SGLang. 
Please note that this might degrade throughput due to cuda graph rebuilding. * `hidden_states_engine.py`: An example how to extract hidden states using the Engine API. * `hidden_states_server.py`: An example how to extract hidden states using the Server API. diff --git a/examples/runtime/hidden_states/hidden_states_engine.py b/examples/runtime/hidden_states/hidden_states_engine.py index 4db86a7c4..8af883ab1 100644 --- a/examples/runtime/hidden_states/hidden_states_engine.py +++ b/examples/runtime/hidden_states/hidden_states_engine.py @@ -3,7 +3,7 @@ Usage: python hidden_states.py Note that each time you change the `return_hidden_states` parameter, -the CUDA graph will be recaptured, which might lead to a performance hit. +the cuda graph will be recaptured, which might lead to a performance hit. So avoid getting hidden states and completions alternately. """ diff --git a/examples/runtime/hidden_states/hidden_states_server.py b/examples/runtime/hidden_states/hidden_states_server.py index 6b1cdab1f..96045fad9 100644 --- a/examples/runtime/hidden_states/hidden_states_server.py +++ b/examples/runtime/hidden_states/hidden_states_server.py @@ -4,7 +4,7 @@ Usage: python hidden_states_server.py Note that each time you change the `return_hidden_states` parameter, -the CUDA graph will be recaptured, which might lead to a performance hit. +the cuda graph will be recaptured, which might lead to a performance hit. So avoid getting hidden states and completions alternately. 
""" diff --git a/python/pyproject.toml b/python/pyproject.toml index b06451e82..1a0a498f0 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -68,7 +68,7 @@ blackwell = [ ] # HIP (Heterogeneous-computing Interface for Portability) for AMD -# => base docker rocm/vllm-dev:20250114, not from public vLLM whl +# => base docker rocm/vllm-dev:20250114, not from public vllm whl srt_hip = [ "sglang[runtime_common]", "torch", @@ -76,7 +76,7 @@ srt_hip = [ "outlines==0.1.11" ] -# xpu is not enabled in public vLLM and torch whl, +# xpu is not enabled in public vllm and torch whl, # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.htmlinstall vllm srt_xpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"] @@ -84,8 +84,8 @@ srt_xpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"] # https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html srt_hpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"] -# CPU: currently, there are no pre-built vLLM wheels for CPU. -# To install vLLM for CPU, please follow the instruction here: +# CPU: currently, there are no pre-built vllm wheels for CPU. 
+# To install vllm for CPU, please follow the instruction here: # https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html srt_cpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11", "torch"] # https://vllm-ascend.readthedocs.io/en/latest/installation.html diff --git a/python/sglang/compile_deep_gemm.py b/python/sglang/compile_deep_gemm.py index cbd7a4b1c..84b52962f 100644 --- a/python/sglang/compile_deep_gemm.py +++ b/python/sglang/compile_deep_gemm.py @@ -129,7 +129,7 @@ def launch_server_process_and_send_one_request( def refine_server_args(server_args: ServerArgs, compile_args: CompileArgs): - # Disable CUDA graph and torch compile to save time + # Disable cuda graph and torch compile to save time server_args.disable_cuda_graph = True server_args.enable_torch_compile = False print(f"Disable CUDA Graph and Torch Compile to save time...") diff --git a/python/sglang/srt/_custom_ops.py b/python/sglang/srt/_custom_ops.py index 1d5d116dc..07c087bf6 100644 --- a/python/sglang/srt/_custom_ops.py +++ b/python/sglang/srt/_custom_ops.py @@ -12,7 +12,7 @@ use_vllm_custom_allreduce = get_bool_env_var( ) if not is_hpu(): - # ROCm does not use vLLM custom allreduce + # ROCm does not use vllm custom allreduce if use_vllm_custom_allreduce and not is_hip(): try: import vllm._C diff --git a/python/sglang/srt/configs/chatglm.py b/python/sglang/srt/configs/chatglm.py index 044d4f7b0..9370c218a 100644 --- a/python/sglang/srt/configs/chatglm.py +++ b/python/sglang/srt/configs/chatglm.py @@ -53,7 +53,7 @@ class ChatGLMConfig(PretrainedConfig): self.kv_channels = kv_channels self.num_attention_heads = num_attention_heads self.seq_length = seq_length - # It is to be compatible with long LoRA. + # It is to be compatible with long lora. 
self.max_position_embeddings = seq_length self.hidden_dropout = hidden_dropout self.attention_dropout = attention_dropout diff --git a/python/sglang/srt/configs/load_config.py b/python/sglang/srt/configs/load_config.py index e194a9136..be9a40b4b 100644 --- a/python/sglang/srt/configs/load_config.py +++ b/python/sglang/srt/configs/load_config.py @@ -29,7 +29,7 @@ class LoadFormat(str, enum.Enum): class LoadConfig: """ download_dir: Directory to download and load the weights, default to the - default cache directory of HuggingFace. + default cache directory of huggingface. load_format: The format of the model weights to load: "auto" will try to load the weights in the safetensors format and fall back to the pytorch bin format if safetensors format is diff --git a/python/sglang/srt/distributed/device_communicators/custom_all_reduce.py b/python/sglang/srt/distributed/device_communicators/custom_all_reduce.py index c9db2700f..9faff648c 100644 --- a/python/sglang/srt/distributed/device_communicators/custom_all_reduce.py +++ b/python/sglang/srt/distributed/device_communicators/custom_all_reduce.py @@ -172,7 +172,7 @@ class CustomAllreduce: if not custom_ar: # disable because of missing custom allreduce library - # e.g. in a non-CUDA environment + # e.g. 
in a non-cuda environment return self.group = group @@ -389,11 +389,11 @@ class CustomAllreduce: if _is_hip: handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr) handles, offsets = self._gather_ipc_meta((bytes(handle), offset)) - logger.info("Registering %d CUDA graph addresses", len(offset)) + logger.info("Registering %d cuda graph addresses", len(offset)) ops.register_graph_buffers(self._ptr, handles, offsets) else: handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr) - logger.info("Registering %d CUDA graph addresses", len(offset)) + logger.info("Registering %d cuda graph addresses", len(offset)) # We cannot directly use `dist.all_gather_object` here # because it is incompatible with `gloo` backend under inference mode. # see https://github.com/pytorch/pytorch/issues/126032 for details. @@ -435,7 +435,7 @@ class CustomAllreduce: return False # all reduce, assuming inp tensor is IPC registered with register_buffer, - # or, in the context of CUDA graphs, register_graph_buffers + # or, in the context of cuda graphs, register_graph_buffers def all_reduce_reg(self, inp: torch.Tensor, out: torch.Tensor = None): if out is None: out = torch.empty_like(inp) @@ -473,7 +473,7 @@ class CustomAllreduce: return out def custom_all_reduce(self, input: torch.Tensor) -> Optional[torch.Tensor]: - """The main allreduce API that provides support for CUDA graph.""" + """The main allreduce API that provides support for cuda graph.""" # When custom allreduce is disabled, this will be None. 
if self.disabled or not self.should_custom_ar(input): return None @@ -489,7 +489,7 @@ class CustomAllreduce: return torch.empty_like(input) else: if _is_hip: - # note: outside of CUDA graph context, + # note: outside of cuda graph context, # custom allreduce incurs a cost of cudaMemcpy, which should # be small(<=1% of overall latency) compared to the performance # gains of using custom kernels diff --git a/python/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py b/python/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py index 78d10938f..86121ac97 100644 --- a/python/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +++ b/python/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py @@ -121,14 +121,14 @@ def can_actually_p2p( Therefore, we have to perform a real P2P access to check if it is actually possible. - Note on p2p and CUDA IPC: + Note on p2p and cuda IPC: Usually, one process uses one GPU: - GPU src --> CUDA context src --> tensor src --> process src + GPU src --> cuda context src --> tensor src --> process src - We need to combine p2p and CUDA IPC, so that: - GPU src --> CUDA context src --> tensor src --> process src + We need to combine p2p and cuda IPC, so that: + GPU src --> cuda context src --> tensor src --> process src |shared| - GPU tgt --> CUDA context tgt --> tensor tgt --> process tgt + GPU tgt --> cuda context tgt --> tensor tgt --> process tgt That is to say, process src creates a tensor in GPU src, passes IPC handle to process tgt, and process tgt accesses the tensor in GPU tgt. Any operation on the tensor in process tgt will be reflected in the tensor in process src, because @@ -201,9 +201,9 @@ def can_actually_p2p( # then all the processes can read the cache file to check the p2p access status. # Note that the cache file is suffixed by the CUDA_VISIBLE_DEVICES, so that we # can have different cache files for different CUDA_VISIBLE_DEVICES settings, -# e.g. 
used by different vLLM engines. The device id in the cache file is a +# e.g. used by different vllm engines. The device id in the cache file is a # **local** device id, i.e. from 0 to num_dev-1, where num_dev is the number -# of visible devices in the vLLM engine. +# of visible devices in the vllm engine. _gpu_p2p_access_cache: Optional[Dict[str, bool]] = None diff --git a/python/sglang/srt/distributed/device_communicators/pynccl.py b/python/sglang/srt/distributed/device_communicators/pynccl.py index fe4e71a67..6459f70fd 100644 --- a/python/sglang/srt/distributed/device_communicators/pynccl.py +++ b/python/sglang/srt/distributed/device_communicators/pynccl.py @@ -104,7 +104,7 @@ class PyNcclCommunicator: self.device = device # nccl communicator and stream will use this device # `torch.cuda.device` is a context manager that changes the - # current CUDA device to the specified one + # current cuda device to the specified one with torch.cuda.device(device): self.comm: ncclComm_t = self.nccl.ncclCommInitRank( self.world_size, self.unique_id, self.rank diff --git a/python/sglang/srt/distributed/device_communicators/pynccl_wrapper.py b/python/sglang/srt/distributed/device_communicators/pynccl_wrapper.py index 75b52b9d3..afb477334 100644 --- a/python/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +++ b/python/sglang/srt/distributed/device_communicators/pynccl_wrapper.py @@ -6,7 +6,7 @@ # 1. We tried to use `cupy`, it calls NCCL correctly, but `cupy` itself # often gets stuck when initializing the NCCL communicator. # 2. We tried to use `torch.distributed`, but `torch.distributed.all_reduce` -# contains many other potential CUDA APIs, that are not allowed during +# contains many other potential cuda APIs, that are not allowed during # capturing the CUDA graph. For further details, please check # https://discuss.pytorch.org/t/pytorch-cudagraph-with-nccl-operation-failed/ . 
# diff --git a/python/sglang/srt/distributed/parallel_state.py b/python/sglang/srt/distributed/parallel_state.py index ca6a951d0..a9161b5c3 100644 --- a/python/sglang/srt/distributed/parallel_state.py +++ b/python/sglang/srt/distributed/parallel_state.py @@ -170,7 +170,7 @@ class GroupCoordinator: GroupCoordinator takes charge of all the communication operations among the processes in the group. It can route the communication to a specific implementation (e.g. switch allreduce implementation - based on the tensor size and CUDA graph mode). + based on the tensor size and cuda graph mode). """ # available attributes: diff --git a/python/sglang/srt/hf_transformers_utils.py b/python/sglang/srt/hf_transformers_utils.py index ab5637971..0bba48b4d 100644 --- a/python/sglang/srt/hf_transformers_utils.py +++ b/python/sglang/srt/hf_transformers_utils.py @@ -127,7 +127,7 @@ CONTEXT_LENGTH_KEYS = [ def get_context_length(config): - """Get the context length of a model from a HuggingFace model configs.""" + """Get the context length of a model from a huggingface model configs.""" text_config = config rope_scaling = getattr(text_config, "rope_scaling", None) if rope_scaling: diff --git a/python/sglang/srt/layers/attention/base_attn_backend.py b/python/sglang/srt/layers/attention/base_attn_backend.py index be2e4064c..52bcd5fba 100644 --- a/python/sglang/srt/layers/attention/base_attn_backend.py +++ b/python/sglang/srt/layers/attention/base_attn_backend.py @@ -20,7 +20,7 @@ class AttentionBackend(ABC): raise NotImplementedError() def init_cuda_graph_state(self, max_bs: int): - """Init the global shared states for CUDA graph.""" + """Init the global shared states for cuda graph.""" raise NotImplementedError() def init_forward_metadata_capture_cuda_graph( @@ -33,7 +33,7 @@ class AttentionBackend(ABC): forward_mode: ForwardMode, spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]], ): - """Init the metadata for a forward pass for capturing a CUDA graph.""" + """Init the 
metadata for a forward pass for capturing a cuda graph.""" raise NotImplementedError() def init_forward_metadata_replay_cuda_graph( @@ -47,7 +47,7 @@ class AttentionBackend(ABC): spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]], seq_lens_cpu: Optional[torch.Tensor], ): - """Init the metadata for a forward pass for replaying a CUDA graph.""" + """Init the metadata for a forward pass for replaying a cuda graph.""" raise NotImplementedError() def get_cuda_graph_seq_len_fill_value(self): diff --git a/python/sglang/srt/layers/attention/double_sparsity_backend.py b/python/sglang/srt/layers/attention/double_sparsity_backend.py index debbcca9d..47b867f61 100644 --- a/python/sglang/srt/layers/attention/double_sparsity_backend.py +++ b/python/sglang/srt/layers/attention/double_sparsity_backend.py @@ -15,7 +15,7 @@ if TYPE_CHECKING: class DoubleSparseAttnBackend(AttentionBackend): def __init__(self, model_runner: ModelRunner): - # Lazy import to avoid the initialization of CUDA context + # Lazy import to avoid the initialization of cuda context from sglang.srt.layers.attention.triton_ops.double_sparsity_attention import ( extend_attention_fwd, flash_decode_attention_fwd, diff --git a/python/sglang/srt/layers/attention/flashinfer_backend.py b/python/sglang/srt/layers/attention/flashinfer_backend.py index 6ed3af5d9..1c254c4fa 100644 --- a/python/sglang/srt/layers/attention/flashinfer_backend.py +++ b/python/sglang/srt/layers/attention/flashinfer_backend.py @@ -664,7 +664,7 @@ class FlashInferIndicesUpdaterDecode: kv_indptr = kv_indptr[: bs + 1] if wrapper.is_cuda_graph_enabled: - # Directly write to the CUDA graph input buffer + # Directly write to the cuda graph input buffer kv_indices = wrapper._paged_kv_indices_buf else: kv_indices = torch.empty( @@ -1173,7 +1173,7 @@ def fast_decode_plan( """ A faster version of BatchDecodeWithPagedKVCacheWrapper::plan used for FlashInferMultiStepDraftBackend. 
Modifications: - - Remove unnecessary device-to-device copy for the CUDA graph buffers. + - Remove unnecessary device-to-device copy for the cuda graph buffers. - Remove unnecessary host-to-device copy for the metadata buffers. """ batch_size = len(last_page_len) diff --git a/python/sglang/srt/layers/attention/flashinfer_mla_backend.py b/python/sglang/srt/layers/attention/flashinfer_mla_backend.py index dc1a8b4de..cd7778418 100644 --- a/python/sglang/srt/layers/attention/flashinfer_mla_backend.py +++ b/python/sglang/srt/layers/attention/flashinfer_mla_backend.py @@ -874,7 +874,7 @@ def fast_mla_decode_plan( ) -> None: """A faster version of BatchMLAPagedAttentionWrapper::plan, for skipping the stream synchronization in original plan function during - CUDA graph replaying. + cuda graph replaying. """ self._causal = causal self._page_size = page_size diff --git a/python/sglang/srt/layers/attention/triton_backend.py b/python/sglang/srt/layers/attention/triton_backend.py index 964611599..0aa3a695e 100644 --- a/python/sglang/srt/layers/attention/triton_backend.py +++ b/python/sglang/srt/layers/attention/triton_backend.py @@ -92,7 +92,7 @@ class TritonAttnBackend(AttentionBackend): skip_prefill: bool = False, kv_indptr_buf: Optional[torch.Tensor] = None, ): - # Lazy import to avoid the initialization of CUDA context + # Lazy import to avoid the initialization of cuda context from sglang.srt.layers.attention.triton_ops.decode_attention import ( decode_attention_fwd, ) diff --git a/python/sglang/srt/layers/attention/vision.py b/python/sglang/srt/layers/attention/vision.py index 3617ca5ac..429787ec8 100644 --- a/python/sglang/srt/layers/attention/vision.py +++ b/python/sglang/srt/layers/attention/vision.py @@ -257,7 +257,7 @@ class VisionFlash3Attention(nn.Module): **kwargs, ): if not _is_cuda: - raise Exception("VisionFlash3Attention is only available for CUDA") + raise Exception("VisionFlash3Attention is only available for cuda") super().__init__() def forward( diff --git 
a/python/sglang/srt/layers/dp_attention.py b/python/sglang/srt/layers/dp_attention.py index 2e58b1f98..0f1e453bf 100644 --- a/python/sglang/srt/layers/dp_attention.py +++ b/python/sglang/srt/layers/dp_attention.py @@ -237,7 +237,7 @@ def dp_scatter( forward_batch: ForwardBatch, ): # local_num_tokens is not necessarily the same as local_tokens.shape[0], - # since local_tokens may be padded for CUDA graph + # since local_tokens may be padded for cuda graph local_start_pos, local_num_tokens = get_dp_local_info(forward_batch) local_tokens.fill_(0) diff --git a/python/sglang/srt/layers/logits_processor.py b/python/sglang/srt/layers/logits_processor.py index 03346a2aa..5a4f07817 100644 --- a/python/sglang/srt/layers/logits_processor.py +++ b/python/sglang/srt/layers/logits_processor.py @@ -166,7 +166,7 @@ class LogitsMetadata: def compute_dp_attention_metadata(self, hidden_states: torch.Tensor): if self.global_num_tokens_for_logprob_cpu is None: - # we are capturing CUDA graph + # we are capturing cuda graph return cumtokens = torch.cumsum(self.global_num_tokens_for_logprob_gpu, dim=0) diff --git a/python/sglang/srt/layers/quantization/__init__.py b/python/sglang/srt/layers/quantization/__init__.py index c550db026..0fa9c0f9c 100644 --- a/python/sglang/srt/layers/quantization/__init__.py +++ b/python/sglang/srt/layers/quantization/__init__.py @@ -38,7 +38,7 @@ try: except ImportError: VLLM_AVAILABLE = False - # Define empty classes as placeholders when vLLM is not available + # Define empty classes as placeholders when vllm is not available class DummyConfig: def override_quantization_method(self, *args, **kwargs): return None @@ -109,7 +109,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE: raise ValueError( f"{quantization} quantization requires some operators from vllm. 
" - "Please install vLLM by `pip install vllm==0.8.4`" + "Please install vllm by `pip install vllm==0.8.4`" ) return QUANTIZATION_METHODS[quantization] @@ -231,7 +231,7 @@ original_isinstance = builtins.isinstance def monkey_patch_isinstance_for_vllm_base_layer(reverse: bool = False): """ Patch isinstance so that the `get_quant_method` in vllm's QuantizationConfig - can recognize SGLang layers + can recognize sglang layers """ if not VLLM_AVAILABLE: return @@ -267,7 +267,7 @@ def monkey_patch_isinstance_for_vllm_base_layer(reverse: bool = False): def monkey_patch_moe_apply(class_obj: "FusedMoEMethodBase"): """ Monkey patch the apply function of vllm's FusedMoEMethodBase. - Convert SGLang arguments to vLLM arguments. + Convert sglang arguments to vllm arguments. """ original_apply = class_obj.apply sig = inspect.signature(original_apply) @@ -329,6 +329,6 @@ def monkey_patch_quant_configs(): monkey_patch_moe_apply(CompressedTensorsWNA16MoEMethod) -# Only apply monkey patches if vLLM is available +# Only apply monkey patches if vllm is available if VLLM_AVAILABLE: monkey_patch_quant_configs() diff --git a/python/sglang/srt/layers/quantization/blockwise_int8.py b/python/sglang/srt/layers/quantization/blockwise_int8.py index b7c46ab4a..d79d70de7 100644 --- a/python/sglang/srt/layers/quantization/blockwise_int8.py +++ b/python/sglang/srt/layers/quantization/blockwise_int8.py @@ -208,7 +208,7 @@ class BlockInt8LinearMethod(LinearMethodBase): def process_weights_after_loading(self, layer: Module) -> None: # Block quant doesn't need to process weights after loading - # Use torch Parameter to avoid CUDA graph capturing issue + # Use torch Parameter to avoid cuda graph capturing issue layer.weight = torch.nn.Parameter(layer.weight.data, requires_grad=False) layer.weight_scale_inv = torch.nn.Parameter( layer.weight_scale_inv.data, requires_grad=False diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py 
b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py index 7ca9ca80d..7ce89345f 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py @@ -363,7 +363,7 @@ class CompressedTensorsConfig(QuantizationConfig): if self._is_wNa16_group_channel(weight_quant, input_quant): if not VLLM_AVAILABLE: raise ImportError( - "vLLM is not installed, to use CompressedTensorsW4A16Sparse24 and CompressedTensorsWNA16, please install vLLM" + "vllm is not installed, to use CompressedTensorsW4A16Sparse24 and CompressedTensorsWNA16, please install vllm" ) if ( self.quant_format == CompressionFormat.marlin_24.value @@ -409,7 +409,7 @@ class CompressedTensorsConfig(QuantizationConfig): if self._is_fp8_w8a16(weight_quant, input_quant): if not VLLM_AVAILABLE: raise ImportError( - "vLLM is not installed, to use CompressedTensorsW8A16Fp8, please install vLLM" + "vllm is not installed, to use CompressedTensorsW8A16Fp8, please install vllm" ) is_static_input_scheme = input_quant and not input_quant.dynamic return CompressedTensorsW8A16Fp8( @@ -491,7 +491,7 @@ class CompressedTensorsConfig(QuantizationConfig): ): if not VLLM_AVAILABLE: raise ImportError( - "vLLM is not installed, to use CompressedTensors24, please install vLLM" + "vllm is not installed, to use CompressedTensors24, please install vllm" ) # Have a valid sparsity scheme # Validate layer is supported by Cutlass 2:4 Kernel diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py index e31596f3f..e8fd243e4 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -65,7 +65,7 @@ class CompressedTensorsMoEMethod: if 
quant_config._is_wNa16_group_channel(weight_quant, input_quant): if not VLLM_AVAILABLE: raise ImportError( - "vLLM is not installed, to use CompressedTensorsWNA16MoEMethod, please install vLLM." + "vllm is not installed, to use CompressedTensorsWNA16MoEMethod, please install vllm." ) return CompressedTensorsWNA16MoEMethod(quant_config) elif quant_config._is_fp8_w8a8(weight_quant, input_quant): diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py index eaa20c563..fa7d77f28 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py @@ -27,10 +27,10 @@ except ImportError: MARLIN_FP8_AVAILABLE = False def apply_fp8_marlin_linear(*args, **kwargs): - raise ImportError("vLLM is not installed") + raise ImportError("vllm is not installed") def prepare_fp8_layer_for_marlin(*args, **kwargs): - raise ImportError("vLLM is not installed") + raise ImportError("vllm is not installed") __all__ = ["CompressedTensorsW8A16Fp8"] @@ -45,7 +45,7 @@ class CompressedTensorsW8A16Fp8(CompressedTensorsScheme): if not MARLIN_FP8_AVAILABLE: raise ImportError( - "vLLM is not installed. To use CompressedTensorsW8A16Fp8, please install vLLM" + "vllm is not installed. 
To use CompressedTensorsW8A16Fp8, please install vllm" ) @classmethod diff --git a/python/sglang/srt/layers/quantization/fp8_utils.py b/python/sglang/srt/layers/quantization/fp8_utils.py index 32ddb6960..0602144e7 100644 --- a/python/sglang/srt/layers/quantization/fp8_utils.py +++ b/python/sglang/srt/layers/quantization/fp8_utils.py @@ -357,7 +357,7 @@ def apply_fp8_linear( # Fused GEMM_DQ if VLLM_AVAILABLE and use_vllm_cutlass_w8a8_fp8_kernel: - # Fall back to vLLM cutlass w8a8 fp8 kernel + # Fall back to vllm cutlass w8a8 fp8 kernel output = ops.cutlass_scaled_mm( qinput, weight, @@ -493,7 +493,7 @@ def apply_fp8_linear( if cutlass_fp8_supported: try: if VLLM_AVAILABLE and use_vllm_cutlass_w8a8_fp8_kernel: - # Fall back to vLLM cutlass w8a8 fp8 kernel + # Fall back to vllm cutlass w8a8 fp8 kernel output = ops.cutlass_scaled_mm( qinput, weight, diff --git a/python/sglang/srt/layers/rotary_embedding.py b/python/sglang/srt/layers/rotary_embedding.py index b6308b3ab..c5c285ca0 100644 --- a/python/sglang/srt/layers/rotary_embedding.py +++ b/python/sglang/srt/layers/rotary_embedding.py @@ -186,8 +186,8 @@ class LinearScalingRotaryEmbedding(RotaryEmbedding): It supports multiple scaling factors. Since multiple LoRA adapters may have different scaling factors, we need multiple cos/sin caches. In this way, - instead of running rotary embedding kernel per LoRA adapter, we can run multiple - LoRA adapters in a batched way. + instead of running rotary embedding kernel per lora, we can run multiple + lora in a batched way. In addition to that, we also keep the cos/sin cache for the scaling factor of 1 (default) at all times. 
diff --git a/python/sglang/srt/lora/backend/base_backend.py b/python/sglang/srt/lora/backend/base_backend.py index a0d7c119e..e1bdc5408 100644 --- a/python/sglang/srt/lora/backend/base_backend.py +++ b/python/sglang/srt/lora/backend/base_backend.py @@ -41,13 +41,13 @@ class BaseLoRABackend: def run_lora_a_sgemm( self, x: torch.Tensor, weights: torch.Tensor, *args, **kwargs ) -> torch.Tensor: - """Run segment Gemm of LoRA a modules with current backend. + """Run segment Gemm of lora a modules with current backend. The definition of segment Gemm can be referred to https://docs.flashinfer.ai/api/gemm.html. Args: x: input matrix with shape (s, input_dim), here s is the sum of all sequence lengths - weights: a set of LoRA weights with shape (num_lora, c * r, input_dim), - here r is LoRA rank, c is a multiplier for stacked modules (e.g., c=3 for qkv_proj, c=2 for gate_up_proj) + weights: a set of lora weights with shape (num_lora, c * r, input_dim), + here r is lora rank, c is a multiplier for stacked modules (e.g., c=3 for qkv_proj, c=2 for gate_up_proj) usually input_dim is much larger than r Returns: result with shape (s, c * r) @@ -57,12 +57,12 @@ class BaseLoRABackend: def run_lora_b_sgemm( self, x: torch.Tensor, weights: torch.Tensor, *args, **kwargs ) -> torch.Tensor: - """Run segment Gemm of LoRA b modules with current backend. + """Run segment Gemm of lora b modules with current backend. The definition of segment Gemm can be referred to https://docs.flashinfer.ai/api/gemm.html. 
Args: - x: input matrix with shape (s, r), here s is the sum of all sequence lengths, r is LoRA rank - weights: a set of LoRA weights with shape (num_lora, output_dim, r) + x: input matrix with shape (s, r), here s is the sum of all sequence lengths, r is lora rank + weights: a set of lora weights with shape (num_lora, output_dim, r) usually output_dim is much larger than r Returns: result with shape (s, output_dim) @@ -77,7 +77,7 @@ class BaseLoRABackend: *args, **kwargs, ) -> torch.Tensor: - """Run the LoRA pass for QKV Layer. + """Run the lora pass for QKV Layer. Args: x: input matrix with shape (s, input_dim), here s is the sum of all sequence lengths @@ -100,7 +100,7 @@ class BaseLoRABackend: *args, **kwargs, ) -> torch.Tensor: - """Run the LoRA pass for gate_up_proj, usually attached to MergedColumnParallelLayer. + """Run the lora pass for gate_up_proj, usually attached to MergedColumnParallelLayer. Args: x: input matrix with shape (s, input_dim), here s is the sum of all sequence lengths diff --git a/python/sglang/srt/lora/backend/flashinfer_backend.py b/python/sglang/srt/lora/backend/flashinfer_backend.py index c9200edbb..0370c6c81 100644 --- a/python/sglang/srt/lora/backend/flashinfer_backend.py +++ b/python/sglang/srt/lora/backend/flashinfer_backend.py @@ -117,7 +117,7 @@ class FlashInferLoRABackend(BaseLoRABackend): dtype=x.dtype, ) - # Compute LoRA for gate and up proj respectively + # Compute lora for gate and up proj respectively lora_output[:, :output_dim] = self.run_lora_b_sgemm( x=lora_a_output[:, :lora_rank].contiguous(), weights=gate_up_lora_b[0], diff --git a/python/sglang/srt/lora/layers.py b/python/sglang/srt/lora/layers.py index 9f1af72ce..aa10ef6b7 100644 --- a/python/sglang/srt/lora/layers.py +++ b/python/sglang/srt/lora/layers.py @@ -198,7 +198,7 @@ class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): if self.lora_backend.fuse_stacked_lora_b: assert ( B_buffer_q.shape[-1] == B_buffer_kv.shape[-1] - ), "The LoRA rank of q and kv 
should be the same when enabling fusion of qkv lora_b" + ), "The lora rank of q and kv should be the same when enabling fusion of qkv lora_b" output_dim_q, output_dim_kv = B_buffer_q.shape[-2], B_buffer_kv.shape[-2] # B_buffer_qkv: (num_lora, output_dim_q + 2 * output_dim_kv, r) diff --git a/python/sglang/srt/lora/lora.py b/python/sglang/srt/lora/lora.py index 1fa202c31..b0db40d6a 100644 --- a/python/sglang/srt/lora/lora.py +++ b/python/sglang/srt/lora/lora.py @@ -40,7 +40,7 @@ class LoRALayer(nn.Module): self.config: LoRAConfig = config self.base_hf_config: AutoConfig = base_hf_config - # LoRA weights in cpu. The weights are loaded from checkpoint. + # lora weights in cpu. The weights are loaded from checkpoint. self.weights: Dict[str, torch.Tensor] = {} @@ -97,7 +97,7 @@ class LoRAAdapter(nn.Module): def stack_qkv_proj(self, weight_names: List[str], weights: Dict[str, torch.Tensor]): - # Collect target q/k/v modules. This process is necessary since there might be no LoRA attached to k_proj + # Collect target q/k/v modules. 
This process is necessary since there might be no lora attached to k_proj target_module = set() for weight_name in weight_names: if "k_proj" in weight_name: @@ -110,7 +110,7 @@ class LoRAAdapter(nn.Module): return for weight_name in weight_names: - # We assume every LoRA adaptor should contain LoRA modules for q_proj + # We assume every lora adaptor should contain lora modules for q_proj if "q_proj" in weight_name: q_name = weight_name k_name = weight_name.replace("q_proj", "k_proj") @@ -118,7 +118,7 @@ class LoRAAdapter(nn.Module): kv_name = weight_name.replace("q_proj", "kv_proj") qkv_name = weight_name.replace("q_proj", "qkv_proj") - # If k_proj doesn't have LoRA, initialize it to zero + # If k_proj doesn't have lora, initialize it to zero k_proj_weight = ( weights[k_name] if "k_proj" in target_module diff --git a/python/sglang/srt/lora/lora_manager.py b/python/sglang/srt/lora/lora_manager.py index 92f7956d8..70e6ca838 100644 --- a/python/sglang/srt/lora/lora_manager.py +++ b/python/sglang/srt/lora/lora_manager.py @@ -93,14 +93,14 @@ class LoRAManager: # Config of each LoRA adapter self.configs: Dict[str, LoRAConfig] = {} - # Target module names in HuggingFace LoRA configs. + # Target module names in huggingface lora configs. # e.g., {"k_proj", "q_proj", "v_proj", "o_proj"} self.hf_target_names: Set[str] = set() for name, path in self.lora_paths.items(): self.configs[name] = LoRAConfig(path) self.hf_target_names.update(self.configs[name].target_modules) - # Target LoRA weight names for lora_a and lora_b modules respectively. + # Target lora weight names for lora_a and lora_b modules respectively. 
# e.g., {("qkv_proj", "q_proj"), ("qkv_proj", "kv_proj")} self.lora_weight_names: Set[Tuple[str]] = set( [get_stacked_name(module) for module in self.hf_target_names] @@ -119,11 +119,11 @@ class LoRAManager: lora_adapter.initialize_weights() self.loras[name] = lora_adapter - # misc LoRA configs + # misc lora configs self.max_lora_dim: int = max([x.hf_config["r"] for x in self.configs.values()]) if self.lora_backend == "flashinfer": - # FIXME: remove the restrictions after supporting multi-rank for flashinfer backend + # FIXME remove the restrictions after supporting multi-rank for flashinfer backend max_lora_dim = max([x.hf_config["r"] for x in self.configs.values()]) scaling = list(self.loras.values())[0].scaling assert all(x.hf_config["r"] == max_lora_dim for x in self.configs.values()) @@ -144,16 +144,16 @@ class LoRAManager: self.lora_modules, ) - # Initialize target LoRA modules in memory pool + # Initialize target lora modules in memory pool self.memory_pool.init_buffers(self.lora_weight_names, self.base_model) def prepare_lora_batch(self, forward_batch: ForwardBatch): - # load active LoRAs into LoRA memory pool + # load active loras into lora memory pool cur_uids = set(forward_batch.lora_paths) assert len(cur_uids) <= self.max_loras_per_batch self.memory_pool.prepare_lora_batch(cur_uids, self.loras) - # set up batch info shared by all LoRA modules + # set up batch info shared by all lora modules bs = forward_batch.batch_size if ( @@ -221,7 +221,7 @@ class LoRAManager: ) self.lora_backend.set_batch_info(batch_info) - # call set_lora_info for each LoRA modules + # call set_lora_info for each lora modules for layer_id, modules in self.lora_modules.items(): for module_name, module in modules: if "qkv_proj" in module_name: diff --git a/python/sglang/srt/lora/mem_pool.py b/python/sglang/srt/lora/mem_pool.py index 8915a76e7..71495acca 100644 --- a/python/sglang/srt/lora/mem_pool.py +++ b/python/sglang/srt/lora/mem_pool.py @@ -16,7 +16,7 @@ from 
sglang.srt.lora.utils import ( class LoRAMemoryPool: - """Class for memory pool management of LoRA modules""" + """Class for memory pool management of lora modules""" def __init__( self, @@ -38,7 +38,7 @@ class LoRAMemoryPool: self.tp_rank: int = tp_rank self.lora_modules: Dict[int, List[Tuple[str, BaseLayerWithLoRA]]] = lora_modules - # Both A_buffer and B_buffer maps LoRA weight names to its buffer space. + # Both A_buffer and B_buffer maps lora weight names to its buffer space. # A_buffer contains num_layer number of row-major tensors with shape # (max_loras_per_batch, stacked_num * max_lora_dim, input_dim) # B_buffer contains num_layer number of column-major tensors with shape @@ -46,10 +46,10 @@ class LoRAMemoryPool: self.A_buffer: Dict[str, List[torch.Tensor]] = {} self.B_buffer: Dict[str, List[torch.Tensor]] = {} - # LoRA uid -> buffer idx in memory pool + # Lora uid -> buffer idx in memory pool self.uid_to_buffer_id: Dict[Optional[str], int] = {} - # Buffer idx -> LoRA uid in memory pool + # Buffer idx -> lora uid in memory pool # All uids are initialized as empty strings for empty buffer slots # Here we don't initialize to None since None is a valid uid self.buffer_id_to_uid: List[Optional[str]] = [""] * self.max_loras_per_batch @@ -95,7 +95,7 @@ class LoRAMemoryPool: base_model: torch.nn.Module, ): - # lora_weight_names is a set of name pairs indicating each pair of LoRA modules to load + # lora_weight_names is a set of name pairs indicating each pair of lora modules to load # e.g., {("qkv_proj", "q_proj"), ("qkv_proj", "kv_proj"), ("o_proj", "o_proj")} self.lora_weight_names: Set[Tuple[str]] = lora_weight_names device = next(base_model.parameters()).device @@ -137,7 +137,7 @@ class LoRAMemoryPool: return buffer_id, "" for buffer_id in range(self.max_loras_per_batch): - # Evict unneeded LoRA + # Evict unneeded lora if self.buffer_id_to_uid[buffer_id] not in cur_uids: return buffer_id, self.buffer_id_to_uid[buffer_id] diff --git 
a/python/sglang/srt/lora/triton_ops/gate_up_lora_b.py b/python/sglang/srt/lora/triton_ops/gate_up_lora_b.py index 72600d6cf..ae242dc48 100644 --- a/python/sglang/srt/lora/triton_ops/gate_up_lora_b.py +++ b/python/sglang/srt/lora/triton_ops/gate_up_lora_b.py @@ -37,7 +37,7 @@ def _gate_up_lora_b_kernel( ): # This kernel packs 2 sgemms (gate/up) into a single kernel. - # x: (s, 2 * K), s is the sum of sequence lengths, K equals to LoRA rank + # x: (s, 2 * K), s is the sum of sequence lengths, K equals to lora rank # weights: (num_lora, 2 * output_dim, K) # output: (s, 2 * output_dim) # output_dim >> K diff --git a/python/sglang/srt/lora/triton_ops/qkv_lora_b.py b/python/sglang/srt/lora/triton_ops/qkv_lora_b.py index 4c48aacc9..76f3f8671 100644 --- a/python/sglang/srt/lora/triton_ops/qkv_lora_b.py +++ b/python/sglang/srt/lora/triton_ops/qkv_lora_b.py @@ -39,7 +39,7 @@ def _qkv_lora_b_kernel( ): # This kernel packs 3 sgemms (q/k/v) into a single kernel. - # x: (s, 3 * K), s is the sum of sequence lengths, K equals to LoRA rank + # x: (s, 3 * K), s is the sum of sequence lengths, K equals to lora rank # weights: (num_lora, N_Q + 2 * N_KV, K) # output: (s, N_Q + 2 * N_KV) # N_Q >> K, N_KV >> K diff --git a/python/sglang/srt/lora/utils.py b/python/sglang/srt/lora/utils.py index 2ab80032e..3f1f3558d 100644 --- a/python/sglang/srt/lora/utils.py +++ b/python/sglang/srt/lora/utils.py @@ -22,13 +22,13 @@ class LoRABatchInfo: # Maximum sequence length of current batch max_len: int - # The index of LoRA adapter used by each sequence, in shape (bs,) + # The index of lora adapter used by each sequence, in shape (bs,) weight_indices: torch.Tensor - # ranks of each LoRA adapter, in shape (lora_num,) + # ranks of each lora adapter, in shape (lora_num,) lora_ranks: torch.Tensor - # scaling of each LoRA adapter, in shape (lora_num,) + # scaling of each lora adapter, in shape (lora_num,) scalings: torch.Tensor @@ -51,9 +51,9 @@ def get_customized_names_from_hf_names( hf_module_names: 
Set[str], base_model: torch.nn.Module ) -> Set[str]: """ - This function takes in a set of HuggingFace style module names: + This function takes in a set of huggingface style module names: e.g., {"k_proj", "q_proj", "v_proj", "o_proj"} - and outputs a set of module names of customized SGLang layers: + and outputs a set of module names of customized sglang layers: e.g., {"qkv_proj", "o_proj"} """ if hasattr(base_model, "get_module_name"): @@ -87,7 +87,7 @@ def get_hidden_dim( else: """ WARNING: get_hidden_dim() is not defined, - which is used to get the hidden dim for different LoRA modules + which is used to get the hidden dim for different lora modules Use the default one, but please check if it is correct for your model. Please implement the function in the model class if it is not. You can reference this function in llama.py. @@ -108,7 +108,7 @@ def get_hidden_dim( def get_stacked_name(name: str) -> Tuple[str]: """ - Mapping a target LoRA module name to (stacked name for LoRA A, stacked name for LoRA B) + Mapping a target module name to (stacked name for Lora A, stacked name for Lora B) """ params_mapping = { "q_proj": ("qkv_proj", "q_proj"), @@ -122,7 +122,7 @@ def get_stacked_name(name: str) -> Tuple[str]: def get_stacked_multiply(module_name: str) -> int: """ - Mapping a module name to its magnification at output dimension + Mapping a lora module name to its magnification at output dimension """ stacked_rank = { "qkv_proj": 3, @@ -137,7 +137,7 @@ def get_weight_name( ) -> Optional[str]: """ target_name is name of a given module, - lora_weight_names is a set of LoRA stacked name pairs (see get_stacked_name method above) + lora_weight_names is a set of lora stacked name pairs (see get_stacked_name method above) If there is a weight name in lora_weight_names that can match target_name, return this name Else raise ValueError. 
""" diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index b9758465b..68a9df309 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -1667,7 +1667,7 @@ class Scheduler( can_cuda_graph = 0 if not spec_algorithm.is_none(): - # TODO(sang): Support CUDA graph when idle batch is there. + # TODO(sang): Support cuda graph when idle batch is there. if local_batch is None or local_batch.forward_mode.is_idle(): can_cuda_graph = 0 @@ -1704,7 +1704,7 @@ class Scheduler( local_batch.global_num_tokens = global_num_tokens local_batch.global_num_tokens_for_logprob = global_num_tokens_for_logprob - # Check forward mode for CUDA graph + # Check forward mode for cuda graph if not disable_cuda_graph: local_batch.can_run_dp_cuda_graph = can_cuda_graph diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 265f88008..db64dd0a2 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -238,7 +238,7 @@ class TokenizerManager: self.metrics_collector = TokenizerMetricsCollector( labels={ "model_name": self.server_args.served_model_name, - # TODO: Add LoRA name/path in the future, + # TODO: Add lora name/path in the future, }, ) diff --git a/python/sglang/srt/managers/tp_worker_overlap_thread.py b/python/sglang/srt/managers/tp_worker_overlap_thread.py index 13d727e8d..4c6cd576f 100644 --- a/python/sglang/srt/managers/tp_worker_overlap_thread.py +++ b/python/sglang/srt/managers/tp_worker_overlap_thread.py @@ -213,7 +213,7 @@ class TpModelWorkerClient: penalizer_orchestrator=None, ) - # A CUDA stream sync here to avoid the CUDA illegal memory access error. + # A cuda stream sync here to avoid the cuda illegal memory access error. 
self.scheduler_stream.synchronize() # Push a new batch to the queue diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py index e48314474..025c75392 100644 --- a/python/sglang/srt/model_executor/cuda_graph_runner.py +++ b/python/sglang/srt/model_executor/cuda_graph_runner.py @@ -11,7 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Run the model with CUDA graph and torch.compile.""" +"""Run the model with cuda graph and torch.compile.""" from __future__ import annotations @@ -127,7 +127,7 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner): else: capture_bs = [1, 2, 4, 8] + list(range(16, 161, 8)) else: - # Since speculative decoding requires more CUDA graph memory, we + # Since speculative decoding requires more cuda graph memory, we # capture less. capture_bs = ( list(range(1, 9)) + list(range(10, 33, 2)) + list(range(40, 161, 16)) @@ -161,7 +161,7 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner): return capture_bs, compile_bs -# Reuse this memory pool across all CUDA graph runners. +# Reuse this memory pool across all cuda graph runners. 
global_graph_memory_pool = None @@ -175,7 +175,7 @@ def set_global_graph_memory_pool(val): class CudaGraphRunner: - """A CudaGraphRunner runs the forward pass of a model with CUDA graph and torch.compile.""" + """A CudaGraphRunner runs the forward pass of a model with cuda graph and torch.compile.""" def __init__(self, model_runner: ModelRunner): # Parse args @@ -194,7 +194,7 @@ class CudaGraphRunner: # Batch sizes to capture self.capture_bs, self.compile_bs = get_batch_sizes_to_capture(model_runner) - rank0_log(f"Capture CUDA graph bs {self.capture_bs}") + rank0_log(f"Capture cuda graph bs {self.capture_bs}") self.capture_forward_mode = ForwardMode.DECODE self.capture_hidden_mode = CaptureHiddenMode.NULL self.num_tokens_per_bs = 1 @@ -334,8 +334,8 @@ class CudaGraphRunner: else forward_batch.batch_size <= self.max_bs ) - # NOTE: CUDA graph cannot handle mixed batch (encoder_len = 0) - # If mixed batch cannot be supported, then encoder_lens can be removed in CUDA graph + # NOTE: cuda graph cannot handle mixed batch (encoder_len = 0) + # If mixed batch cannot be supported, then encoder_lens can be removed in cuda graph # because the full_text_row_masked_out_mask tensor will always be ones is_encoder_lens_supported = ( torch.all(forward_batch.encoder_lens > 0) @@ -350,7 +350,7 @@ class CudaGraphRunner: avail_mem = get_available_gpu_memory( self.model_runner.device, self.model_runner.gpu_id, empty_cache=False ) - # Reverse the order to enable better memory sharing across CUDA graphs. + # Reverse the order to enable better memory sharing across cuda graphs. 
capture_range = ( tqdm.tqdm(list(reversed(self.capture_bs))) if get_tensor_model_parallel_rank() == 0 @@ -429,9 +429,9 @@ class CudaGraphRunner: spec_info.capture_hidden_mode if spec_info else CaptureHiddenMode.NULL ) if self.model_runner.server_args.lora_paths is not None: - # Currently, if the lora_path in `lora_paths` is None, the LoRA backend will use a - # different logic to handle LoRA, so we need to set `lora_paths` to a list of non-None - # values if LoRA is enabled. + # Currently, if the lora_path in `lora_paths` is None, the lora backend will use a + # different logic to handle lora, so we need to set `lora_paths` to a list of non-None + # values if lora is enabled. lora_paths = [next(iter(self.model_runner.server_args.lora_paths))] * bs else: lora_paths = None diff --git a/python/sglang/srt/model_executor/forward_batch_info.py b/python/sglang/srt/model_executor/forward_batch_info.py index 316340bd9..5018f92d5 100644 --- a/python/sglang/srt/model_executor/forward_batch_info.py +++ b/python/sglang/srt/model_executor/forward_batch_info.py @@ -229,7 +229,7 @@ class ForwardBatch: # For DP attention global_num_tokens_cpu: Optional[List[int]] = None global_num_tokens_gpu: Optional[torch.Tensor] = None - # Has to be None when CUDA graph is captured. + # Has to be None when cuda graph is captured. 
global_num_tokens_for_logprob_cpu: Optional[List[int]] = None global_num_tokens_for_logprob_gpu: Optional[torch.Tensor] = None # for extend, local start pos and num tokens is different in logits processor @@ -356,7 +356,7 @@ class ForwardBatch: if model_runner.model_is_mrope: ret._compute_mrope_positions(model_runner, batch) - # Init LoRA information + # Init lora information if model_runner.server_args.lora_paths is not None: model_runner.lora_manager.prepare_lora_batch(ret) diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 235869b35..9fd9013c4 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -225,7 +225,7 @@ class ModelRunner: if self.tp_size > 1 and supports_torch_tp: self.apply_torch_tp() - # Init LoRA + # Init lora if server_args.lora_paths is not None: self.init_lora_manager() @@ -1009,11 +1009,11 @@ class ModelRunner: ) def init_cuda_graphs(self): - """Capture CUDA graphs.""" + """Capture cuda graphs.""" self.cuda_graph_runner = None if not self.is_generation: - # TODO: Currently, CUDA graph only captures decode steps, which only exists for generation models + # TODO: Currently, cuda graph only captures decode steps, which only exists for generation models return if self.server_args.disable_cuda_graph: @@ -1022,12 +1022,12 @@ class ModelRunner: tic = time.time() before_mem = get_available_gpu_memory(self.device, self.gpu_id) logger.info( - f"Capture CUDA graph begin. This can take up to several minutes. avail mem={before_mem:.2f} GB" + f"Capture cuda graph begin. This can take up to several minutes. avail mem={before_mem:.2f} GB" ) self.cuda_graph_runner = CudaGraphRunner(self) after_mem = get_available_gpu_memory(self.device, self.gpu_id) logger.info( - f"Capture CUDA graph end. Time elapsed: {time.time() - tic:.2f} s. " + f"Capture cuda graph end. Time elapsed: {time.time() - tic:.2f} s. 
" f"mem usage={(before_mem - after_mem):.2f} GB. avail mem={after_mem:.2f} GB." ) diff --git a/python/sglang/srt/models/commandr.py b/python/sglang/srt/models/commandr.py index ec37721fc..ebbf8ed64 100644 --- a/python/sglang/srt/models/commandr.py +++ b/python/sglang/srt/models/commandr.py @@ -393,7 +393,7 @@ class CohereForCausalLM(nn.Module): weight_loader(param, loaded_weight, shard_id) break else: - # lm_head is not used in vLLM as it is tied with embed_token. + # lm_head is not used in vllm as it is tied with embed_token. # To prevent errors, skip loading lm_head.weight. if "lm_head.weight" in name: continue diff --git a/python/sglang/srt/models/deepseek_janus_pro.py b/python/sglang/srt/models/deepseek_janus_pro.py index c3ddf7478..296983e9d 100644 --- a/python/sglang/srt/models/deepseek_janus_pro.py +++ b/python/sglang/srt/models/deepseek_janus_pro.py @@ -1190,7 +1190,7 @@ class CLIPVisionTower(nn.Module): # vision_tower = create_sam_vit(**vision_tower_params) forward_kwargs = dict() - else: # HuggingFace + else: # huggingface from transformers import CLIPVisionModel vision_tower = CLIPVisionModel.from_pretrained(**vision_tower_params) diff --git a/python/sglang/srt/models/gemma.py b/python/sglang/srt/models/gemma.py index 5ed5e5f99..d8074487c 100644 --- a/python/sglang/srt/models/gemma.py +++ b/python/sglang/srt/models/gemma.py @@ -342,7 +342,7 @@ class GemmaForCausalLM(nn.Module): weight_loader(param, loaded_weight, shard_id) break else: - # lm_head is not used in vLLM as it is tied with embed_token. + # lm_head is not used in vllm as it is tied with embed_token. # To prevent errors, skip loading lm_head.weight. 
if "lm_head.weight" in name: continue diff --git a/python/sglang/srt/models/gemma2.py b/python/sglang/srt/models/gemma2.py index faeb04958..9056b0b0c 100644 --- a/python/sglang/srt/models/gemma2.py +++ b/python/sglang/srt/models/gemma2.py @@ -441,7 +441,7 @@ class Gemma2ForCausalLM(nn.Module): weight_loader(param, loaded_weight, shard_id) break else: - # lm_head is not used in vLLM as it is tied with embed_token. + # lm_head is not used in vllm as it is tied with embed_token. # To prevent errors, skip loading lm_head.weight. if "lm_head.weight" in name: continue diff --git a/python/sglang/srt/models/gemma3_causal.py b/python/sglang/srt/models/gemma3_causal.py index b82ed8c59..511d9c7e8 100644 --- a/python/sglang/srt/models/gemma3_causal.py +++ b/python/sglang/srt/models/gemma3_causal.py @@ -174,7 +174,7 @@ class Gemma3Attention(nn.Module): # Local attention. Override the values in config.json. self.rope_theta = config.rope_local_base_freq self.rope_scaling = {"rope_type": "default"} - # FIXME(mick): idk why vLLM does this + # FIXME(mick): idk why vllm does this # self.sliding_window = config.interleaved_sliding_window self.sliding_window = get_attention_sliding_window_size(config) else: @@ -667,7 +667,7 @@ class Gemma3ForCausalLM(PreTrainedModel): weight_loader(param, loaded_weight, shard_id) break else: - # lm_head is not used in vLLM as it is tied with embed_token. + # lm_head is not used in vllm as it is tied with embed_token. # To prevent errors, skip loading lm_head.weight. 
if "lm_head.weight" in name: continue diff --git a/python/sglang/srt/models/kimi_vl_moonvit.py b/python/sglang/srt/models/kimi_vl_moonvit.py index 86c16fc71..a16ee5923 100644 --- a/python/sglang/srt/models/kimi_vl_moonvit.py +++ b/python/sglang/srt/models/kimi_vl_moonvit.py @@ -418,7 +418,7 @@ class MoonVitEncoderLayer(nn.Module): hidden_dim: int, mlp_dim: int, *, - attn_implementation: str = "flash_attention_2", # use fa2 in SGLang by default + attn_implementation: str = "flash_attention_2", # use fa2 in sglang by default activation=F.gelu, attn_bias: bool = False, ): diff --git a/python/sglang/srt/models/mllama.py b/python/sglang/srt/models/mllama.py index 5a5e06306..8d63d9cfc 100644 --- a/python/sglang/srt/models/mllama.py +++ b/python/sglang/srt/models/mllama.py @@ -537,8 +537,8 @@ class MllamaTextCrossAttention(nn.Module): quant_config=quant_config, prefix=add_prefix("o_proj", prefix), ) - # vllm.model_executor.layers.layernorm.RMSNorm has a precision issue, - # use HuggingFace's instead + # vllm.model_executor.layers.layernorm.RMSNorm has precision issue, + # use huggingface's instead self.q_norm = MllamaTextRMSNorm(self.head_dim, eps=config.rms_norm_eps) self.k_norm = MllamaTextRMSNorm(self.head_dim, eps=config.rms_norm_eps) self.scaling = self.head_dim**-0.5 @@ -979,8 +979,8 @@ class MllamaForConditionalGeneration(nn.Module): cross_attention_states = None if self.capture_mode: - # NOTE: when doing CUDA graph capture, we do not want to skip cross attention - # Make is a constant value to avoid CUDA graph capture issue + # NOTE: when doing cuda graph capture, we do not want to skip cross attention + # Make is a constant value to avoid cuda graph capture issue skip_cross_attention = False else: # NOTE: we do not need image_inputs when prefill diff --git a/python/sglang/srt/models/roberta.py b/python/sglang/srt/models/roberta.py index 8ca11d384..b982bc8e3 100644 --- a/python/sglang/srt/models/roberta.py +++ b/python/sglang/srt/models/roberta.py @@ -57,7 +57,7 
@@ class RobertaEmbedding(nn.Module): input_shape = input_ids.size() inputs_embeds = self.word_embeddings(input_ids) - # Adapted from vLLM: https://github.com/vllm-project/vllm/commit/4a18fd14ba4a349291c798a16bf62fa8a9af0b6b/vllm/model_executor/models/roberta.py + # Adapted from vllm: https://github.com/vllm-project/vllm/commit/4a18fd14ba4a349291c798a16bf62fa8a9af0b6b/vllm/model_executor/models/roberta.py pos_list = [] token_list = [] diff --git a/python/sglang/srt/platforms/interface.py b/python/sglang/srt/platforms/interface.py index 915572a84..eb8dbd469 100644 --- a/python/sglang/srt/platforms/interface.py +++ b/python/sglang/srt/platforms/interface.py @@ -67,7 +67,7 @@ class Platform: # Real device name of current platform. device_name: str - # For specifying torch device for CUDA alike platform's capability. + # For specifying torch device for cuda alike platform's capability. device_type: str # The torch.distributed backend on current platform @@ -254,7 +254,7 @@ class Platform: @classmethod def check_and_update_lora_backend(cls, backend: str) -> str: """ - Check and update the LoRA backend for the current platform. + Check and update the lora backend for the current platform. 
""" raise NotImplementedError diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index a0c9bba1d..6051d2409 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -246,7 +246,7 @@ class ServerArgs: self.mem_fraction_static = min( mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem, (gpu_mem - 1024 * 18) - / gpu_mem, # 15 GB + additional 3GB for CUDA graph + / gpu_mem, # 15 GB + additional 3GB for cuda graph ) # Set chunked prefill size, which depends on the gpu memory capacity @@ -276,9 +276,9 @@ class ServerArgs: ) self.page_size = 128 - # Set CUDA graph max batch size + # Set cuda graph max batch size if self.cuda_graph_max_bs is None: - # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable CUDA graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating CUDA graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable CUDA graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating CUDA graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating CUDA graphs on lower-end GPUs compared to the original 160, avoiding OOM issues. + # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. 
Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues. if gpu_mem is not None and gpu_mem < 25_000: if self.tp_size < 4: self.cuda_graph_max_bs = 8 @@ -729,7 +729,7 @@ class ServerArgs: "--download-dir", type=str, default=ServerArgs.download_dir, - help="Model download directory for HuggingFace.", + help="Model download directory for huggingface.", ) parser.add_argument( "--base-gpu-id", @@ -1024,12 +1024,12 @@ class ServerArgs: parser.add_argument( "--disable-cuda-graph", action="store_true", - help="Disable CUDA graph.", + help="Disable cuda graph.", ) parser.add_argument( "--disable-cuda-graph-padding", action="store_true", - help="Disable CUDA graph when padding is needed. Still uses CUDA graph when padding is not needed.", + help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.", ) parser.add_argument( "--enable-nccl-nvls", @@ -1075,7 +1075,7 @@ class ServerArgs: parser.add_argument( "--enable-ep-moe", action="store_true", - help="Enabling expert parallelism for MoE. The ep size is equal to the tp size.", + help="Enabling expert parallelism for moe. The ep size is equal to the tp size.", ) parser.add_argument( "--enable-torch-compile", @@ -1092,13 +1092,13 @@ class ServerArgs: "--cuda-graph-max-bs", type=int, default=ServerArgs.cuda_graph_max_bs, - help="Set the maximum batch size for CUDA graph. It will extend the CUDA graph capture batch size to this value.", + help="Set the maximum batch size for cuda graph. 
It will extend the cuda graph capture batch size to this value.", ) parser.add_argument( "--cuda-graph-bs", type=int, nargs="+", - help="Set the list of batch sizes for CUDA graph.", + help="Set the list of batch sizes for cuda graph.", ) parser.add_argument( "--torchao-config", @@ -1334,7 +1334,7 @@ class ServerArgs: self.max_loras_per_batch > 0 # FIXME and (self.lora_paths is None or self.disable_radix_cache) - ), "compatibility of LoRA and CUDA graph and RadixAttention is in progress" + ), "compatibility of lora and cuda graph and radix attention is in progress" assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative" assert self.gpu_id_step >= 1, "gpu_id_step must be positive" diff --git a/python/sglang/srt/speculative/eagle_worker.py b/python/sglang/srt/speculative/eagle_worker.py index 3369e6f75..7ea48102d 100644 --- a/python/sglang/srt/speculative/eagle_worker.py +++ b/python/sglang/srt/speculative/eagle_worker.py @@ -78,7 +78,7 @@ class EAGLEWorker(TpModelWorker): # Override context length with target model's context length server_args.context_length = target_worker.model_runner.model_config.context_len - # Do not capture CUDA graph in `super().__init__()` + # Do not capture cuda graph in `super().__init__()` # It will be captured later. 
backup_disable_cuda_graph = server_args.disable_cuda_graph server_args.disable_cuda_graph = True @@ -136,7 +136,7 @@ class EAGLEWorker(TpModelWorker): # Share the embedding and lm_head self.draft_model_runner.model.set_embed_and_head(embed, head) - # Init attention backend and CUDA graphs + # Init attention backend and cuda graphs self.draft_model_runner.server_args.disable_cuda_graph = ( backup_disable_cuda_graph ) @@ -148,7 +148,7 @@ class EAGLEWorker(TpModelWorker): self.init_cuda_graphs() def init_attention_backend(self): - # Create multi-step attn backends and CUDA graph runners + # Create multi-step attn backends and cuda graph runners if self.server_args.attention_backend == "flashinfer": if not global_server_args_dict["use_mla_backend"]: from sglang.srt.layers.attention.flashinfer_backend import ( @@ -207,7 +207,7 @@ class EAGLEWorker(TpModelWorker): self.draft_model_runner.draft_attn_backend = self.draft_attn_backend def init_cuda_graphs(self): - """Capture CUDA graphs.""" + """Capture cuda graphs.""" self.cuda_graph_runner = None self.cuda_graph_runner_for_draft_extend = None @@ -218,12 +218,12 @@ class EAGLEWorker(TpModelWorker): tic = time.time() before_mem = get_available_gpu_memory(self.device, self.gpu_id) logger.info( - f"Capture draft CUDA graph begin. This can take up to several minutes. avail mem={before_mem:.2f} GB" + f"Capture draft cuda graph begin. This can take up to several minutes. avail mem={before_mem:.2f} GB" ) self.cuda_graph_runner = EAGLEDraftCudaGraphRunner(self) after_mem = get_available_gpu_memory(self.device, self.gpu_id) logger.info( - f"Capture draft CUDA graph end. Time elapsed: {time.time() - tic:.2f} s. avail mem={after_mem:.2f} GB. mem usage={(before_mem - after_mem):.2f} GB." + f"Capture draft cuda graph end. Time elapsed: {time.time() - tic:.2f} s. avail mem={after_mem:.2f} GB. mem usage={(before_mem - after_mem):.2f} GB." 
) # Capture extend diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index 1ab14c819..85cb807c5 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -1117,7 +1117,7 @@ prometheus_multiproc_dir: tempfile.TemporaryDirectory def set_prometheus_multiproc_dir(): # Set prometheus multiprocess directory - # SGLang uses prometheus multiprocess mode + # sglang uses prometheus multiprocess mode # we need to set this before importing prometheus_client # https://prometheus.github.io/client_python/multiprocess/ global prometheus_multiproc_dir diff --git a/python/sglang/test/attention/test_flashattn_backend.py b/python/sglang/test/attention/test_flashattn_backend.py index ae814ba59..5e5ebbaf1 100644 --- a/python/sglang/test/attention/test_flashattn_backend.py +++ b/python/sglang/test/attention/test_flashattn_backend.py @@ -42,7 +42,7 @@ class MockModelRunner: "TokenPool", (), { - # A typical max_bs * max_context_len for CUDA graph decode + # A typical max_bs * max_context_len for cuda graph decode "size": max_batch_size, # Add req_to_token attribute "req_to_token": torch.zeros( diff --git a/python/sglang/test/attention/test_flashattn_mla_backend.py b/python/sglang/test/attention/test_flashattn_mla_backend.py index 9a64ee533..ebfd0b395 100644 --- a/python/sglang/test/attention/test_flashattn_mla_backend.py +++ b/python/sglang/test/attention/test_flashattn_mla_backend.py @@ -37,7 +37,7 @@ class MockModelRunner: "TokenPool", (), { - # A typical max_bs * max_context_len for CUDA graph decode + # A typical max_bs * max_context_len for cuda graph decode "size": batch_size, # Add req_to_token attribute "req_to_token": torch.zeros( diff --git a/sgl-kernel/README.md b/sgl-kernel/README.md index 90980f593..70e99f82c 100644 --- a/sgl-kernel/README.md +++ b/sgl-kernel/README.md @@ -83,11 +83,11 @@ Third-party libraries: ### FlashAttention FYI - FA3 can fail without a enough shared memory for some shapes, such as higher hidden_dim or some special 
cases. Right now, FA3 is supported for sm80/sm87 and sm86/sm89. + FA3 can fail without a enough shared memory for a some shapes, such as higher hidden_dim or some special cases. Right now, fa3 is supported for sm80/sm87 and sm86/sm89. The main different Between sm80/sm87 and sm86/sm89 is the shared memory size. you can follow the link below for more information https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared-memory-8-x. - And for sgl-kernel right now, we can build FA3 on sm80/sm86/sm89/sm90a. That means if you use **A100(tested)**/A*0/**L20(tested)**/L40/L40s/**3090(tested)** you can use FA3. + And for sgl-kernel right now, we can build fa3 on sm80/sm86/sm89/sm90a. That means if you use **A100(tested)**/A*0/**L20(tested)**/L40/L40s/**3090(tested)** you can use fa3. ### Kernel Development @@ -164,7 +164,7 @@ template <> struct pytorch_library_compatible_type { using type = int64_t; static int convert_from_type(int64_t arg) { - TORCH_CHECK(arg <= std::numeric_limits::max(), "int64_t value is too large to be converted to int"); + TORCH_CHECK(arg <= std::numeric_limits::max(), "int64_t value is too large to be converted to int"); TORCH_CHECK(arg >= std::numeric_limits::min(), "int64_t value is too small to be converted to int"); return arg; } diff --git a/sgl-kernel/benchmark/bench_moe_align_block_size.py b/sgl-kernel/benchmark/bench_moe_align_block_size.py index ab4a909cc..274502221 100644 --- a/sgl-kernel/benchmark/bench_moe_align_block_size.py +++ b/sgl-kernel/benchmark/bench_moe_align_block_size.py @@ -177,7 +177,7 @@ def calculate_diff(num_tokens, num_experts=256, block_size=128, topk=8): expert_ids_vllm = torch.zeros_like(expert_ids_cuda) num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_cuda) - # compare the performance of CUDA, triton and vllm implementation + # compare the performance of cuda, triton and vllm implementation sgl_moe_align_block_size( topk_ids, num_experts, @@ -349,7 +349,7 @@ def benchmark(num_tokens, num_experts, 
topk, provider): ), quantiles=quantiles, ) - else: # vLLM + else: # vllm try: ms, min_ms, max_ms = triton.testing.do_bench( lambda: ops.moe_align_block_size( diff --git a/sgl-kernel/csrc/allreduce/custom_all_reduce.cuh b/sgl-kernel/csrc/allreduce/custom_all_reduce.cuh index 460b0c584..ec223bdeb 100644 --- a/sgl-kernel/csrc/allreduce/custom_all_reduce.cuh +++ b/sgl-kernel/csrc/allreduce/custom_all_reduce.cuh @@ -280,8 +280,8 @@ class CustomAllreduce { std::unordered_map buffers_; Signal* self_sg_; - // Stores rank data from all ranks. This is mainly for CUDA graph purposes. - // For CUDA graph to work, all kernel arguments must be fixed during graph + // Stores rank data from all ranks. This is mainly for cuda graph purposes. + // For cuda graph to work, all kernel arguments must be fixed during graph // capture time. However, the peer pointers are not known during graph capture // time. Therefore, during capture, we increment the rank data pointer and use // that as the argument to the kernel. The kernel arguments are stored in @@ -291,7 +291,7 @@ class CustomAllreduce { // // The overall process looks like this: // 1. Graph capture. - // 2. Each rank obtains the IPC handles for each addresses used during CUDA + // 2. Each rank obtains the IPC handles for each addresses used during cuda // graph capture using get_graph_buffer_ipc_meta. // 3. (In Python) all gather the IPC handles. // 4. Obtain the peer pointers by opening the IPC handles, and store them in diff --git a/sgl-kernel/python/sgl_kernel/__init__.py b/sgl-kernel/python/sgl_kernel/__init__.py index 898de0cb1..0aaf09042 100755 --- a/sgl-kernel/python/sgl_kernel/__init__.py +++ b/sgl-kernel/python/sgl_kernel/__init__.py @@ -65,5 +65,5 @@ from sgl_kernel.speculative import ( from sgl_kernel.version import __version__ build_tree_kernel = ( - None # TODO(ying): remove this after updating the SGLang python code. + None # TODO(ying): remove this after updating the sglang python code. 
) diff --git a/sgl-kernel/python/sgl_kernel/flash_attn.py b/sgl-kernel/python/sgl_kernel/flash_attn.py index 27dbf1250..fbf0b0d3f 100644 --- a/sgl-kernel/python/sgl_kernel/flash_attn.py +++ b/sgl-kernel/python/sgl_kernel/flash_attn.py @@ -10,14 +10,14 @@ except: def is_fa3_supported(device=None) -> bool: - # There some FA3 FYI + # There some fa3 FYI # FA3 can fail without a enough shared memory for a some shapes, such as higher # hidden_dim or some special cases. - # Right now, FA3 is supported for sm80/sm87 and sm86/sm89. The main different + # Right now, fa3 is supported for sm80/sm87 and sm86/sm89. The main different # Between sm80/sm87 and sm86/sm89 is the shared memory size. you can follow the link below for more information # https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared-memory-8-x - # And for sgl-kernel right now, we can build FA3 on sm80/sm86/sm89/sm90a. - # That means if you use A100/A*0/L20/L40/L40s/4090 you can use FA3. + # And for sgl-kernel right now, we can build fa3 on sm80/sm86/sm89/sm90a. + # That means if you use A100/A*0/L20/L40/L40s/4090 you can use fa3. 
return ( torch.cuda.get_device_capability(device)[0] == 9 or torch.cuda.get_device_capability(device)[0] == 8 diff --git a/sgl-kernel/tests/test_merge_state_v2.py b/sgl-kernel/tests/test_merge_state_v2.py index abc057d75..f5c7a30dd 100644 --- a/sgl-kernel/tests/test_merge_state_v2.py +++ b/sgl-kernel/tests/test_merge_state_v2.py @@ -197,7 +197,7 @@ def test_merge_attn_states( if not torch.cuda.is_available(): pytest.skip( "Currently only support compare triton merge_attn_states " - "with custom CUDA merge_attn_states kernel" + "with custom cuda merge_attn_states kernel" ) NUM_TOKENS = num_tokens diff --git a/test/srt/models/lora/test_lora_cuda_graph.py b/test/srt/models/lora/test_lora_cuda_graph.py index 68eaaed79..ba68df59a 100644 --- a/test/srt/models/lora/test_lora_cuda_graph.py +++ b/test/srt/models/lora/test_lora_cuda_graph.py @@ -47,8 +47,8 @@ TEST_CUDA_GRAPH_PADDING_PROMPTS = [ class TestLoRACudaGraph(CustomTestCase): def _run_without_cuda_graph_on_model_cases(self, model_cases: List[LoRAModelCase]): - # Since we have already enabled CUDA graph by default in other LoRA tests, - # we only need to run LoRA tests without CUDA graph here. + # Since we have already enabled CUDA graph by default in other lora tests, + # we only need to run lora tests without CUDA graph here. for model_case in model_cases: # If skip_long_prompt is True, filter out prompts longer than 1000 characters prompts = ( diff --git a/test/srt/models/lora/utils.py b/test/srt/models/lora/utils.py index d111821ce..642b8731e 100644 --- a/test/srt/models/lora/utils.py +++ b/test/srt/models/lora/utils.py @@ -154,7 +154,7 @@ def run_lora_test_one_by_one( model_case (LoRAModelCase): The model case to test. torch_dtype (torch.dtype): The torch dtype to use. max_new_tokens (int): The maximum number of new tokens to generate. - backend (str): The LoRA backend to use. + backend (str): The lora backend to use. disable_cuda_graph (bool, optional): Whether to disable CUDA graph. Defaults to False. 
disable_radix_cache (bool, optional): Whether to disable radix cache. Defaults to True. mem_fraction_static (float, optional): The fraction of memory to use. Defaults to 0.88. @@ -289,7 +289,7 @@ def run_lora_test_by_batch( test_tag: str = "", ): """ - Run LoRA tests as a batch. + Run lora tests as a batch. For prompt0, prompt1, ..., promptN, we will use adaptor0, adaptor1, ..., adaptorN included in model case, We will then compare the outputs of HF and SRT with LoRA. @@ -301,7 +301,7 @@ def run_lora_test_by_batch( model_case (LoRAModelCase): The model case to test. torch_dtype (torch.dtype): The torch dtype to use. max_new_tokens (int): The maximum number of new tokens to generate. - backend (str): The LoRA backend to use. + backend (str): The lora backend to use. disable_cuda_graph (bool, optional): Whether to disable CUDA graph. Defaults to False. disable_radix_cache (bool, optional): Whether to disable radix cache. Defaults to True. mem_fraction_static (float, optional): The fraction of memory to use. Defaults to 0.88. @@ -372,8 +372,8 @@ def run_lora_test_by_batch( print("ROUGE-L score:", rouge_score) print("SRT output:", srt_output_str) print("HF output:", hf_output_str) - print("SRT no LoRA output:", srt_no_lora_outputs.output_strs[i].strip()) - print("HF no LoRA output:", hf_no_lora_outputs.output_strs[i].strip()) + print("SRT no lora output:", srt_no_lora_outputs.output_strs[i].strip()) + print("HF no lora output:", hf_no_lora_outputs.output_strs[i].strip()) assert srt_outputs.output_strs[i].strip(" ") == hf_outputs.output_strs[i].strip( " " ), ( diff --git a/test/srt/test_srt_engine_with_quant_args.py b/test/srt/test_srt_engine_with_quant_args.py index 3416c6e4a..47baf5688 100644 --- a/test/srt/test_srt_engine_with_quant_args.py +++ b/test/srt/test_srt_engine_with_quant_args.py @@ -8,7 +8,7 @@ class TestSRTEngineWithQuantArgs(CustomTestCase): def test_1_quantization_args(self): - # we only test fp8 because other methods are currently dependent on vLLM. 
We can add other methods back to test after vLLM dependency is resolved. + # we only test fp8 because other methods are currently dependent on vllm. We can add other methods back to test after vllm dependency is resolved. quantization_args_list = [ # "awq", "fp8", @@ -34,7 +34,7 @@ class TestSRTEngineWithQuantArgs(CustomTestCase): def test_2_torchao_args(self): - # we don't test int8dq because currently there is conflict between int8dq and capture CUDA graph + # we don't test int8dq because currently there is conflict between int8dq and capture cuda graph torchao_args_list = [ # "int8dq", "int8wo", diff --git a/test/srt/test_triton_attention_kernels.py b/test/srt/test_triton_attention_kernels.py index 3f469a5fc..47eb16a9b 100644 --- a/test/srt/test_triton_attention_kernels.py +++ b/test/srt/test_triton_attention_kernels.py @@ -277,7 +277,7 @@ class TestTritonAttention(CustomTestCase): def test_decode_attention(self): # Here we just to ensure there is no error - # TODO: correctness test + # TODO: correctnesss test # Test configurations configs = [ diff --git a/test/srt/test_update_weights_from_distributed.py b/test/srt/test_update_weights_from_distributed.py index 96f954db0..064406703 100644 --- a/test/srt/test_update_weights_from_distributed.py +++ b/test/srt/test_update_weights_from_distributed.py @@ -189,7 +189,7 @@ def init_process_hf( print(f"[hf] {rank=} {broadcast_time=:.3f}s") param_queue.put(("broadcast_time", broadcast_time)) - # Delete the HuggingFace models to free up memory. + # Delete the huggingface models to free up memory. del hf_instruct_model del hf_base_model gc.collect()