From 706bd69cc58aefb7c0a4d7b269f1cbe2908f955b Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Fri, 8 Aug 2025 19:56:50 -0700
Subject: [PATCH] Clean up server_args.py to have a dedicated function for model specific adjustments (#8983)

---
 .github/workflows/execute-notebook.yml | 5 +-
 .github/workflows/pr-test-pd-router.yml | 2 +-
 .github/workflows/pr-test.yml | 74 +++---
 .github/workflows/vllm-dependency-test.yml | 7 +-
 .gitmodules | 0
 README.md | 9 +-
 docs/backend/server_arguments.md | 4 +-
 docs/index.rst | 6 +-
 python/pyproject.toml | 9 +-
 python/sglang/srt/configs/model_config.py | 12 +-
 python/sglang/srt/entrypoints/context.py | 4 +-
 .../layers/attention/hybrid_attn_backend.py | 2 +-
 .../srt/layers/quantization/__init__.py | 2 +-
 python/sglang/srt/server_args.py | 214 ++++++++----
 scripts/ci_install_dependency.sh | 75 +++---
 scripts/fix_corrupted_json.py | 40 ----
 scripts/killall_sglang.sh | 1 -
 test/lang/run_suite.py | 2 -
 test/srt/run_suite.py | 4 -
 test/srt/test_ascend_tp1_bf16.py | 2 +-
 test/srt/test_bench_serving.py | 1 -
 test/srt/test_mla_deepseek_v3.py | 61 -----
 test/srt/test_mla_flashinfer.py | 3 +-
 test/srt/test_mla_int8_deepseek_v3.py | 2 +
 24 files changed, 201 insertions(+), 340 deletions(-)
 delete mode 100644 .gitmodules
 delete mode 100644 scripts/fix_corrupted_json.py

diff --git a/.github/workflows/execute-notebook.yml b/.github/workflows/execute-notebook.yml
index 627650417..d8381b12e 100644
--- a/.github/workflows/execute-notebook.yml
+++ b/.github/workflows/execute-notebook.yml
@@ -26,10 +26,7 @@ jobs:
         run: |
           bash scripts/ci_install_dependency.sh
           pip install -r docs/requirements.txt
-          apt-get update
-          apt-get install -y pandoc
-          apt-get update && apt-get install -y parallel retry
-
+          apt-get update && apt-get install -y pandoc parallel retry
           ln -sf "$(which python3)" /usr/bin/python

       - name: Setup Jupyter Kernel
diff --git a/.github/workflows/pr-test-pd-router.yml b/.github/workflows/pr-test-pd-router.yml
index c7fa1e666..570dc4e21 100644
--- a/.github/workflows/pr-test-pd-router.yml
+++ b/.github/workflows/pr-test-pd-router.yml
@@ -1,4 +1,4 @@
-name: Test Disaggregation Mode
+name: PR Test (PD Router)

 on:
   push:
diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index d99d68bc7..f58fb2377 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -61,7 +61,7 @@ jobs:
           python3 run_suite.py --suite per-commit

   unit-test-backend-1-gpu:
-    needs: check-changes
+    needs: [check-changes, unit-test-frontend]
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false &&
       needs.check-changes.outputs.src == 'true'
@@ -85,7 +85,7 @@ jobs:
           python3 run_suite.py --suite per-commit --auto-partition-id ${{ matrix.part }} --auto-partition-size 10

   unit-test-backend-2-gpu:
-    needs: check-changes
+    needs: [check-changes, unit-test-frontend]
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false &&
       needs.check-changes.outputs.src == 'true'
@@ -105,7 +105,7 @@ jobs:
           python3 run_suite.py --suite per-commit-2-gpu

   unit-test-backend-4-gpu:
-    needs: [check-changes, unit-test-frontend, unit-test-backend-2-gpu]
+    needs: [check-changes, unit-test-backend-2-gpu]
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false &&
       needs.check-changes.outputs.src == 'true'
@@ -125,7 +125,7 @@ jobs:
           python3 run_suite.py --suite per-commit-4-gpu

   unit-test-backend-8-gpu:
-    needs: [check-changes, unit-test-frontend, unit-test-backend-2-gpu]
+    needs: [check-changes, unit-test-backend-2-gpu]
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false &&
       needs.check-changes.outputs.src == 'true'
@@ -245,7 +245,7 @@ jobs:
           python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_online_latency

   performance-test-2-gpu:
-    needs: check-changes
+    needs: [check-changes, unit-test-backend-2-gpu]
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false &&
       needs.check-changes.outputs.src == 'true'
@@ -282,13 +282,13 @@ jobs:
           cd test/srt
           python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache

-      - name: Benchmark offline decode throughput (PP=2)
+      - name: Benchmark offline PP decode throughput (PP=2)
         timeout-minutes: 10
         run: |
           cd test/srt
           python3 -m unittest test_bench_serving.TestBenchServing.test_pp_offline_throughput_default_decode

-      - name: Benchmark offline prefill throughput (PP=2)
+      - name: Benchmark offline PP prefill throughput (PP=2)
         timeout-minutes: 10
         run: |
           cd test/srt
@@ -318,7 +318,7 @@ jobs:
           python3 test_eval_accuracy_large.py

   accuracy-test-2-gpu:
-    needs: check-changes
+    needs: [check-changes, accuracy-test-1-gpu]
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false &&
       needs.check-changes.outputs.src == 'true'
@@ -341,7 +341,7 @@ jobs:
           python3 test_moe_eval_accuracy_large.py

   unit-test-deepep-4-gpu:
-    needs: check-changes
+    needs: [check-changes, unit-test-backend-2-gpu]
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false &&
       needs.check-changes.outputs.src == 'true'
@@ -361,7 +361,7 @@ jobs:
           python3 run_suite.py --suite per-commit-4-gpu-deepep

   unit-test-deepep-8-gpu:
-    needs: [check-changes, unit-test-deepep-4-gpu]
+    needs: [check-changes, unit-test-backend-2-gpu]
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false &&
       needs.check-changes.outputs.src == 'true'
@@ -380,12 +380,38 @@ jobs:
           cd test/srt
           python3 run_suite.py --suite per-commit-8-gpu-deepep

+  unit-test-backend-8-gpu-b200:
+    needs: [check-changes, unit-test-backend-2-gpu]
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+      github.event.pull_request.draft == false &&
+      needs.check-changes.outputs.src == 'true'
+    runs-on: b200-runner
+    strategy:
+      fail-fast: false
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          IS_BLACKWELL=1 bash scripts/ci_install_dependency.sh
+
+      - name: Run test
+        timeout-minutes: 20
+        run: |
+          cd test/srt
+          python3 run_suite.py --suite per-commit-8-gpu-b200 --auto-partition-id 0 --auto-partition-size 1
+
+
   pr-test-finish:
     needs: [
       check-changes,
-      unit-test-frontend, unit-test-backend-1-gpu, unit-test-backend-2-gpu, unit-test-backend-4-gpu,
-      unit-test-backend-8-gpu, performance-test-1-gpu-part-1, performance-test-1-gpu-part-2, performance-test-2-gpu,
-      accuracy-test-1-gpu, accuracy-test-2-gpu, unit-test-deepep-4-gpu, unit-test-deepep-8-gpu,
+      unit-test-frontend, unit-test-backend-1-gpu,
+      unit-test-backend-2-gpu, unit-test-backend-4-gpu, unit-test-backend-8-gpu,
+      performance-test-1-gpu-part-1, performance-test-1-gpu-part-2, performance-test-2-gpu,
+      accuracy-test-1-gpu, accuracy-test-2-gpu,
+      unit-test-deepep-4-gpu, unit-test-deepep-8-gpu,
+      unit-test-backend-8-gpu-b200,
     ]
     if: needs.check-changes.outputs.src == 'true'
     runs-on: ubuntu-latest
@@ -401,25 +427,3 @@ jobs:
           done
           echo "All jobs completed successfully"
           exit 0
-
-  unit-test-backend-8-gpu-b200:
-    needs: [check-changes, unit-test-frontend, unit-test-backend-2-gpu]
-    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
-      github.event.pull_request.draft == false &&
-      needs.check-changes.outputs.src == 'true'
-    runs-on: b200-runner
-    strategy:
-      fail-fast: false
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Install dependencies
-        run: |
-          MODE_BLACKWELL=1 bash scripts/ci_install_dependency.sh
-
-      - name: Run test
-        timeout-minutes: 20
-        run: |
-          cd test/srt
-          python3 run_suite.py --suite per-commit-8-gpu-b200 --auto-partition-id 0 --auto-partition-size 1
diff --git a/.github/workflows/vllm-dependency-test.yml b/.github/workflows/vllm-dependency-test.yml
index 1c5630fb6..785b42efc 100644
--- a/.github/workflows/vllm-dependency-test.yml
+++ b/.github/workflows/vllm-dependency-test.yml
@@ -30,11 +30,12 @@ jobs:
       - name: Install dependencies
         run: |
           bash scripts/ci_install_dependency.sh
-          pip install "vllm==0.10.0"
+          pip install "vllm==0.9.0"
           pip install "bitsandbytes>=0.44.0"
-          pip3 install torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu126
+          pip install torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu126
+          pip install "openai==1.99.1"

-      - name: Run VLLM dependency tests
+      - name: Run vLLM dependency tests
         timeout-minutes: 60
         run: |
           cd test/srt
diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index e69de29bb..000000000
diff --git a/README.md b/README.md
index 974eaf12f..3b3a226b9 100644
--- a/README.md
+++ b/README.md
@@ -20,13 +20,13 @@
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |

 ## News
+- [2025/08] 🔥 SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
 - [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
 - [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).
 - [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
 - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
 - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
 - [2024/12] v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
-- [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).

 More
@@ -35,6 +35,7 @@
 - [2025/01] SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
 - [2024/10] The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
 - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
+- [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
@@ -46,10 +47,10 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:

-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor parallelism, pipeline parallelism, expert parallelism, structured outputs, chunked prefill, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor/pipeline/expert/data parallelism, structured outputs, chunked prefill, quantization (FP4/FP8/INT4/AWQ/GPTQ), and multi-lora batching.
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, Qwen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
-- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Qwen, DeepSeek, Kimi, GPT, Gemma, Mistral, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
+- **Active Community**: SGLang is open-source and backed by an active community with wide industry adoption.

 ## Getting Started
 - [Install SGLang](https://docs.sglang.ai/start/install.html)
diff --git a/docs/backend/server_arguments.md b/docs/backend/server_arguments.md
index 008953bc2..3d951a9f7 100644
--- a/docs/backend/server_arguments.md
+++ b/docs/backend/server_arguments.md
@@ -189,8 +189,8 @@ Please consult the documentation below and [server_args.py](https://github.com/s
 | Arguments | Description | Defaults |
 |-----------|-------------|----------|
 | `--attention-backend` | Choose the kernels for attention layers. | None |
-| `decode_attention_backend` | (Experimental) This argument specifies the backend for decode attention computation. Note that this argument has priority over `attention_backend`. | None |
-| `prefill_attention_backend` | (Experimental) This argument specifies the backend for prefill attention computation. Note that this argument has priority over `attention_backend`. | None |
+| `--prefill-attention-backend` | (Experimental) This argument specifies the backend for prefill attention computation. Note that this argument has priority over `attention_backend`. | None |
+| `--decode-attention-backend` | (Experimental) This argument specifies the backend for decode attention computation. Note that this argument has priority over `attention_backend`. | None |
 | `--sampling-backend` | Choose the kernels for sampling layers. | None |
 | `--grammar-backend` | Choose the backend for grammar-guided decoding. | None |
 | `--mm-attention-backend` | Set multimodal attention backend. | None |
diff --git a/docs/index.rst b/docs/index.rst
index 380b58b1b..4530e7442 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -5,10 +5,10 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:

-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor parallelism, pipeline parallelism, expert parallelism, structured outputs, chunked prefill, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor/pipeline/expert/data parallelism, structured outputs, chunked prefill, quantization (FP4/FP8/INT4/AWQ/GPTQ), and multi-lora batching.
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, Qwen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
-- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Qwen, DeepSeek, Kimi, GPT, Gemma, Mistral, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
+- **Active Community**: SGLang is open-source and backed by an active community with wide industry adoption.

 .. toctree::
    :maxdepth: 1
diff --git a/python/pyproject.toml b/python/pyproject.toml
index f688803dd..376c8b2d5 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -21,6 +21,7 @@ runtime_common = [
     "build",
     "compressed-tensors",
     "datasets",
+    "einops",
     "fastapi",
     "hf_transfer",
     "huggingface_hub",
@@ -29,6 +30,7 @@ runtime_common = [
     "modelscope",
     "msgspec",
     "ninja",
+    "openai==1.99.1",
     "openai-harmony==0.0.3",
     "orjson",
     "outlines==0.1.11",
@@ -48,6 +50,7 @@ runtime_common = [
     "torchao==0.9.0",
     "transformers==4.55.0",
     "timm==1.0.16",
+    "tiktoken",
     "uvicorn",
     "uvloop",
     "xgrammar==0.1.22",
@@ -60,7 +63,6 @@ srt = [
     "torchaudio==2.8.0",
     "torchvision",
     "cuda-python",
-    "einops",
     "flashinfer_python==0.2.10",
 ]

@@ -71,10 +73,7 @@ blackwell = [
     "torchaudio==2.8.0",
     "torchvision",
     "cuda-python",
-    "einops",
     "flashinfer_python==0.2.10",
-    "tiktoken",
-    "openai==1.99.1",
 ]

 # HIP (Heterogeneous-computing Interface for Portability) for AMD
@@ -101,7 +100,7 @@ srt_npu = ["sglang[runtime_common]"]
 openai = ["openai==1.99.1", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
-torch_memory_saver = ["torch_memory_saver>=0.0.8"]
+torch_memory_saver = ["torch_memory_saver==0.0.8"]
 decord = ["decord"]
 test = [
     "accelerate",
diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py
index 812f07103..da70ec740 100644
--- a/python/sglang/srt/configs/model_config.py
+++ b/python/sglang/srt/configs/model_config.py
@@ -64,13 +64,12 @@ class ModelConfig:
         hybrid_kvcache_ratio: Optional[float] = None,
         model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
     ) -> None:
-
+        # Parse args
         self.model_path = model_path
         self.revision = revision
         self.quantization = quantization
         self.model_impl = model_impl

-        # Parse args
         self.maybe_pull_model_tokenizer_from_remote()
         self.model_override_args = json.loads(model_override_args)
         kwargs = {}
@@ -139,6 +138,7 @@ class ModelConfig:
             and self.hf_config.architectures[0] == "Ernie4_5_MoeForCausalLM"
         ):
             self.hf_config.architectures[0] = "Ernie4_5_MoeForCausalLMMTP"
+
         # Check model type
         self.is_generation = is_generation_model(
             self.hf_config.architectures, is_embedding
@@ -282,12 +282,10 @@ class ModelConfig:

         # Cache attributes
         self.hf_eos_token_id = self.get_hf_eos_token_id()
-        config = self.hf_config
-
         # multimodal
-        self.image_token_id = getattr(config, "image_token_id", None) or getattr(
-            config, "image_token_index", None
-        )
+        self.image_token_id = getattr(
+            self.hf_config, "image_token_id", None
+        ) or getattr(self.hf_config, "image_token_index", None)

     @staticmethod
     def from_server_args(server_args: ServerArgs, model_path: str = None, **kwargs):
diff --git a/python/sglang/srt/entrypoints/context.py b/python/sglang/srt/entrypoints/context.py
index 0c8bc116d..e7a0c07cf 100644
--- a/python/sglang/srt/entrypoints/context.py
+++ b/python/sglang/srt/entrypoints/context.py
@@ -9,8 +9,8 @@ logger = logging.getLogger(__name__)

 try:
     from mcp import ClientSession
-except ImportError:
-    logger.warning("Ignoring mcp import error")
+except ImportError as e:
+    mcp = e

 from openai_harmony import Author, Message, Role, StreamState, TextContent
diff --git a/python/sglang/srt/layers/attention/hybrid_attn_backend.py b/python/sglang/srt/layers/attention/hybrid_attn_backend.py
index 370961864..b9f829e41 100644
--- a/python/sglang/srt/layers/attention/hybrid_attn_backend.py
+++ b/python/sglang/srt/layers/attention/hybrid_attn_backend.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING, Optional, Union
+from typing import Optional, Union

 import torch
diff --git a/python/sglang/srt/layers/quantization/__init__.py b/python/sglang/srt/layers/quantization/__init__.py
index 19977012a..7be1572da 100644
--- a/python/sglang/srt/layers/quantization/__init__.py
+++ b/python/sglang/srt/layers/quantization/__init__.py
@@ -3,7 +3,7 @@ from __future__ import annotations

 import builtins
 import inspect
-from typing import TYPE_CHECKING, Callable, Dict, Optional, Type, Union
+from typing import TYPE_CHECKING, Dict, Optional, Type

 import torch
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 7bfd443bf..b6c5156ff 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -37,7 +37,6 @@ from sglang.srt.utils import (
     is_hip,
     is_port_available,
     is_remote_url,
-    is_triton_kernels_available,
     is_valid_ipv6_address,
     nullable_str,
 )
@@ -109,7 +108,7 @@ class ServerArgs:
     log_level: str = "info"
     log_level_http: Optional[str] = None
     log_requests: bool = False
-    log_requests_level: int = 0
+    log_requests_level: int = 2
     crash_dump_folder: Optional[str] = None
     show_time_cost: bool = False
     enable_metrics: bool = False
@@ -131,6 +130,7 @@ class ServerArgs:
     enable_cache_report: bool = False
     reasoning_parser: Optional[str] = None
     tool_call_parser: Optional[str] = None
+    tool_server: Optional[str] = None

     # Data parallelism
     dp_size: int = 1
@@ -278,15 +278,11 @@ class ServerArgs:
     enable_pdmux: bool = False
     sm_group_num: int = 3

-    # For tool server
-    tool_server: Optional[str] = None
-
     # Deprecated arguments
     enable_ep_moe: bool = False
     enable_deepep_moe: bool = False

     def __post_init__(self):
-        # Check deprecated arguments
         def print_deprecated_warning(message: str):
             logger.warning(f"\033[33m{message}\033[0m")
@@ -392,6 +388,9 @@ class ServerArgs:
             self.attention_backend = "torch_native"
             self.sampling_backend = "pytorch"

+        # Model-specific adjustments
+        self.model_specific_adjustments()
+
         # Set kernel backends
         if self.device == "cpu":
             if self.attention_backend is None:
@@ -470,55 +469,9 @@ class ServerArgs:
                 "trtllm_mha backend does not support speculative decoding yet."
             )

-        model_arch = self.get_hf_config().architectures[0]
-        if model_arch in ["GptOssForCausalLM"]:
-            if self.attention_backend is None:
-                # default is triton, but we could have trtllm_mha as an option
-                self.attention_backend = "triton"
-            assert (
-                self.attention_backend == "trtllm_mha"
-                or self.attention_backend == "triton"
-            )
-            quantization_config = getattr(
-                self.get_hf_config(), "quantization_config", None
-            )
-            is_mxfp4_quant_format = (
-                quantization_config is not None
-                and quantization_config.get("quant_method") == "mxfp4"
-            )
-
-            if is_sm100_supported() and is_mxfp4_quant_format:
-                self.enable_flashinfer_mxfp4_moe = True
-                self.enable_triton_kernel_moe = False
-                logger.info(
-                    "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
-                )
-            else:
-                if self.enable_triton_kernel_moe:
-                    assert (
-                        self.ep_size == 1
-                    ), "Triton kernel MoE is only supported when ep_size == 1"
-                if not self.enable_triton_kernel_moe and self.ep_size == 1:
-                    self.enable_triton_kernel_moe = True
-                    logger.info(
-                        "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
-                    )
-
-            self.disable_hybrid_swa_memory = True
-
-            if is_mxfp4_quant_format:
-                # use bf16 for mxfp4 triton kernels
-                self.dtype = "bfloat16"
-
         if self.attention_backend == "dual_chunk_flash_attn":
             logger.warning(
-                "Mixed chunk is disabled because of using dual chunk flash attention backend"
-            )
-            logger.warning(
-                "Radix cache is disabled because of using dual chunk flash attention backend"
-            )
-            logger.warning(
-                "Cuda graph is disabled because of using dual chunk flash attention backend"
+                "Mixed chunk, radix cache, and cuda graphs are disabled because of using dual chunk flash attention backend"
             )
             self.enable_mixed_chunk = False
             self.disable_cuda_graph = True
@@ -583,7 +536,7 @@ class ServerArgs:

         if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
             self.expert_distribution_recorder_mode = "stat"
-            logger.info(
+            logger.warning(
                 "EPLB is enabled. The expert_distribution_recorder_mode is automatically set."
             )

@@ -591,9 +544,6 @@ class ServerArgs:
             self.ep_dispatch_algorithm is None
         ):
             self.ep_dispatch_algorithm = "static"
-            logger.info(
-                "EPLB is enabled or init_expert_location is provided. ep_dispatch_algorithm is configured."
-            )

         if self.enable_eplb:
             assert self.ep_size > 1 or self.moe_a2a_backend is not None
@@ -1112,7 +1062,7 @@ class ServerArgs:
         parser.add_argument(
             "--log-requests-level",
             type=int,
-            default=0,
+            default=ServerArgs.log_requests_level,
             help="0: Log metadata (no sampling parameters). 1: Log metadata and sampling parameters. 2: Log metadata, sampling parameters and partial input/output. 3: Log every input/output.",
             choices=[0, 1, 2, 3],
         )
@@ -1245,6 +1195,12 @@ class ServerArgs:
             default=ServerArgs.tool_call_parser,
             help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', 'kimi_k2', 'qwen3_coder', 'glm45', and 'step3'.",
         )
+        parser.add_argument(
+            "--tool-server",
+            type=str,
+            default=None,
+            help="Either 'demo' or a comma-separated list of tool server urls to use for the model. If not specified, no tool server will be used.",
+        )

         # Data parallelism
         parser.add_argument(
@@ -1344,55 +1300,41 @@ class ServerArgs:
         )

         # Kernel backend
+        ATTN_BACKENDS = [
+            "aiter",
+            "cutlass_mla",
+            "fa3",
+            "flashinfer",
+            "flashmla",
+            "intel_amx",
+            "torch_native",
+            "ascend",
+            "triton",
+            "trtllm_mla",
+            "trtllm_mha",
+            "dual_chunk_flash_attn",
+        ]
         parser.add_argument(
             "--attention-backend",
             type=str,
-            choices=[
-                "aiter",
-                "cutlass_mla",
-                "fa3",
-                "flashinfer",
-                "flashmla",
-                "intel_amx",
-                "torch_native",
-                "ascend",
-                "triton",
-                "trtllm_mla",
-                "trtllm_mha",
-                "dual_chunk_flash_attn",
-            ],
+            choices=ATTN_BACKENDS,
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
         )
-        parser.add_argument(
-            "--decode-attention-backend",
-            type=str,
-            choices=[
-                "flashinfer",
-                "triton",
-                "torch_native",
-                "fa3",
-                "flashmla",
-                "cutlass_mla",
-            ],
-            default=ServerArgs.decode_attention_backend,
-            help="Choose the kernels for decode attention layers (have priority over --attention-backend).",
-        )
-
         parser.add_argument(
             "--prefill-attention-backend",
             type=str,
-            choices=[
-                "flashinfer",
-                "triton",
-                "torch_native",
-                "fa3",
-                "flashmla",
-                "cutlass_mla",
-            ],
+            choices=ATTN_BACKENDS,
             default=ServerArgs.prefill_attention_backend,
             help="Choose the kernels for prefill attention layers (have priority over --attention-backend).",
         )
+        parser.add_argument(
+            "--decode-attention-backend",
+            type=str,
+            choices=ATTN_BACKENDS,
+            default=ServerArgs.decode_attention_backend,
+            help="Choose the kernels for decode attention layers (have priority over --attention-backend).",
+        )
         parser.add_argument(
             "--sampling-backend",
             type=str,
@@ -1612,7 +1554,6 @@ class ServerArgs:
             default=ServerArgs.hicache_mem_layout,
             help="The layout of host memory pool for hierarchical cache.",
         )
-
         parser.add_argument(
             "--hicache-storage-backend",
             type=str,
@@ -1985,14 +1926,6 @@ class ServerArgs:
             help="Disable mmap while loading weight using safetensors.",
         )

-        # For tool server
-        parser.add_argument(
-            "--tool-server",
-            type=str,
-            default=None,
-            help="Either 'demo' or a comma-separated list of tool server urls to use for the model. If not specified, no tool server will be used.",
-        )
-
         # Deprecated arguments
         parser.add_argument(
             "--enable-ep-moe",
@@ -2056,25 +1989,6 @@ class ServerArgs:
             None,
         }, "moe_dense_tp_size only support 1 and None currently"

-        # Check model architecture
-        model_arch = self.get_hf_config().architectures[0]
-        if "Llama4" in model_arch:
-            assert self.attention_backend == "fa3", "fa3 is required for Llama4 model"
-
-        if model_arch in [
-            "Gemma2ForCausalLM",
-            "Gemma3ForCausalLM",
-            "Gemma3ForConditionalGeneration",
-            "Gemma3nForCausalLM",
-            "Gemma3nForConditionalGeneration",
-        ]:
-            # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
-            # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
-            logger.warning(
-                f"Disable hybrid SWA memory for {model_arch} as it is not yet supported."
-            )
-            self.disable_hybrid_swa_memory = True
-
         # Check LoRA
         self.check_lora_server_args()
@@ -2100,7 +2014,7 @@ class ServerArgs:
         if self.lora_paths:
             if self.enable_lora is None:
                 self.enable_lora = True
-                logger.info(
+                logger.warning(
                     "--enable-lora is set to True because --lora-paths is provided."
                 )
             elif self.enable_lora is False:
@@ -2172,6 +2086,58 @@ class ServerArgs:
             f"decode_tp={decode_tp}, prefill_tp={prefill_tp}"
         )

+    def model_specific_adjustments(self):
+        hf_config = self.get_hf_config()
+        model_arch = hf_config.architectures[0]
+        if model_arch in ["GptOssForCausalLM"]:
+            if self.attention_backend is None:
+                self.attention_backend = "triton"
+            assert self.attention_backend in [
+                "triton",
+                "trtllm_mha",
+            ], f"GptOssForCausalLM requires 'triton' or 'trtllm_mha' attention backend, but got {self.attention_backend}"
+            quantization_config = getattr(hf_config, "quantization_config", None)
+            is_mxfp4_quant_format = (
+                quantization_config is not None
+                and quantization_config.get("quant_method") == "mxfp4"
+            )
+
+            if is_sm100_supported() and is_mxfp4_quant_format:
+                self.enable_flashinfer_mxfp4_moe = True
+                self.enable_triton_kernel_moe = False
+                logger.warning(
+                    "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
+                )
+            else:
+                if self.enable_triton_kernel_moe:
+                    assert (
+                        self.ep_size == 1
+                    ), "Triton kernel MoE is only supported when ep_size == 1"
+                if not self.enable_triton_kernel_moe and self.ep_size == 1:
+                    self.enable_triton_kernel_moe = True
+                    logger.warning(
+                        "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
+                    )
+            self.disable_hybrid_swa_memory = True
+            if is_mxfp4_quant_format:
+                # use bf16 for mxfp4 triton kernels
+                self.dtype = "bfloat16"
+        elif "Llama4" in model_arch:
+            assert self.attention_backend == "fa3", "fa3 is required for Llama4 model"
+        elif model_arch in [
+            "Gemma2ForCausalLM",
+            "Gemma3ForCausalLM",
+            "Gemma3ForConditionalGeneration",
+            "Gemma3nForCausalLM",
+            "Gemma3nForConditionalGeneration",
+        ]:
+            # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
+            # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
+            logger.warning(
+                f"Disable hybrid SWA memory for {model_arch} as it is not yet supported."
+            )
+            self.disable_hybrid_swa_memory = True
+
     def adjust_mem_fraction_for_vlm(self, model_config):
         vision_config = getattr(model_config.hf_config, "vision_config", None)
         if vision_config is None:
diff --git a/scripts/ci_install_dependency.sh b/scripts/ci_install_dependency.sh
index 7cad2775d..0ad51c7a3 100755
--- a/scripts/ci_install_dependency.sh
+++ b/scripts/ci_install_dependency.sh
@@ -2,68 +2,71 @@
 # Install the dependency in CI.
 set -euxo pipefail

-MODE_BLACKWELL=${MODE_BLACKWELL:-0}
+IS_BLACKWELL=${IS_BLACKWELL:-0}

-CU_VERSION="cu126"
-if [ "$MODE_BLACKWELL" = "1" ]; then
+if [ "$IS_BLACKWELL" = "1" ]; then
     CU_VERSION="cu129"
+else
+    CU_VERSION="cu126"
 fi

 # Kill existing processes
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 bash "${SCRIPT_DIR}/killall_sglang.sh"

-if [ "$MODE_BLACKWELL" = "1" ]; then
-    apt-get install -y git libnuma-dev
-fi
+# Install apt packages
+apt install -y git libnuma-dev

-# Update pip
-if [ "$MODE_BLACKWELL" != "1" ]; then
+# Install uv
+if [ "$IS_BLACKWELL" = "1" ]; then
+    # The blackwell CI runner has some issues with pip and uv,
+    # so we can only use pip with `--break-system-packages`
+    PIP_CMD="pip"
+    PIP_INSTALL_SUFFIX="--break-system-packages"
+
+    # Clean up existing installations
+    $PIP_CMD uninstall -y flashinfer_python sgl-kernel sglang vllm $PIP_INSTALL_SUFFIX || true
+else
+    # In normal cases, we use uv, which is much faster than pip.
     pip install --upgrade pip
-fi
+    pip install uv
+    export UV_SYSTEM_PYTHON=true

-# Clean up existing installations
-pip uninstall -y flashinfer flashinfer_python sgl-kernel sglang vllm --break-system-packages || true
-pip cache purge || true
-rm -rf /root/.cache/flashinfer
-# TODO handle other python versions
-rm -rf /usr/local/lib/python3.10/dist-packages/flashinfer*
-rm -rf /usr/local/lib/python3.10/dist-packages/sgl_kernel*
+    PIP_CMD="uv pip"
+    PIP_INSTALL_SUFFIX="--index-strategy unsafe-best-match"
+
+    # Clean up existing installations
+    $PIP_CMD uninstall flashinfer_python sgl-kernel sglang vllm || true
+fi

 # Install the main package
-pip install -e "python[dev]" --extra-index-url https://download.pytorch.org/whl/${CU_VERSION} --break-system-packages
+$PIP_CMD install -e "python[dev]" --extra-index-url https://download.pytorch.org/whl/${CU_VERSION} $PIP_INSTALL_SUFFIX

-if [ "$MODE_BLACKWELL" = "1" ]; then
+if [ "$IS_BLACKWELL" = "1" ]; then
     # TODO auto determine sgl-kernel version
     SGL_KERNEL_VERSION=0.3.2
-    pip3 install https://github.com/sgl-project/whl/releases/download/v${SGL_KERNEL_VERSION}/sgl_kernel-${SGL_KERNEL_VERSION}-cp39-abi3-manylinux2014_x86_64.whl --break-system-packages
+    $PIP_CMD install https://github.com/sgl-project/whl/releases/download/v${SGL_KERNEL_VERSION}/sgl_kernel-${SGL_KERNEL_VERSION}-cp39-abi3-manylinux2014_x86_64.whl --force-reinstall $PIP_INSTALL_SUFFIX
 fi

 # Show current packages
-pip list
+$PIP_CMD list

 # Install additional dependencies
-pip install mooncake-transfer-engine==0.3.5 nvidia-cuda-nvrtc-cu12 --break-system-packages
+$PIP_CMD install mooncake-transfer-engine==0.3.5 nvidia-cuda-nvrtc-cu12 py-spy huggingface_hub[hf_xet] $PIP_INSTALL_SUFFIX

-if [ "$MODE_BLACKWELL" != "1" ]; then
+if [ "$IS_BLACKWELL" != "1" ]; then
     # For lmms_evals evaluating MMMU
     git clone --branch v0.3.3 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
-    pip install -e lmms-eval/ --break-system-packages
+    $PIP_CMD install -e lmms-eval/ $PIP_INSTALL_SUFFIX
+
+    # Install xformers
+    $PIP_CMD install xformers --index-url https://download.pytorch.org/whl/${CU_VERSION} --no-deps $PIP_INSTALL_SUFFIX
 fi

 # Install FlashMLA for attention backend tests
-# pip install git+https://github.com/deepseek-ai/FlashMLA.git --break-system-packages
-
-# Install hf_xet
-pip install huggingface_hub[hf_xet] --break-system-packages
-
-if [ "$MODE_BLACKWELL" != "1" ]; then
-    # Install xformers
-    pip install -U xformers --index-url https://download.pytorch.org/whl/${CU_VERSION} --no-deps --force-reinstall --break-system-packages
-fi
-
-# To help dumping traces when timeout occurred
-pip install py-spy --break-system-packages
+# $PIP_CMD install git+https://github.com/deepseek-ai/FlashMLA.git $PIP_INSTALL_SUFFIX

 # Show current packages
-pip list
+$PIP_CMD list
+
+echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}"
diff --git a/scripts/fix_corrupted_json.py b/scripts/fix_corrupted_json.py
deleted file mode 100644
index 67c2980f5..000000000
--- a/scripts/fix_corrupted_json.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import json
-import re
-import sys
-
-
-def clean_json_file(input_file, output_file):
-    try:
-        # Open the input file with 'replace' option for handling bad characters
-        with open(input_file, "r", encoding="utf-8", errors="replace") as f:
-            data = f.read()
-
-        # Replace bad characters (represented by '�' after decoding) with a space
-        cleaned_data = data.replace("�", " ")
-
-        # Remove control characters (e.g., ASCII control characters like \x00 to \x1F)
-        # These can cause issues in JSON parsing.
-        cleaned_data = re.sub(r"[\x00-\x1F]+", " ", cleaned_data)
-
-        # Parse cleaned data as JSON
-        json_data = json.loads(cleaned_data)
-
-        # Write the cleaned JSON to a new output file
-        with open(output_file, "w", encoding="utf-8") as f:
-            json.dump(json_data, f, ensure_ascii=False, indent=4)
-
-        print(f"Cleaned JSON file has been saved to {output_file}")
-
-    except Exception as e:
-        print(f"Error: {e}")
-
-
-if __name__ == "__main__":
-    assert len(sys.argv) > 1, "please give the input file path"
-    if len(sys.argv) == 3:
-        input_file = sys.argv[1]
-        output_file = sys.argv[2]
-    else:
-        input_file = output_file = sys.argv[1]
-
-    clean_json_file(input_file, output_file)
diff --git a/scripts/killall_sglang.sh b/scripts/killall_sglang.sh
index 669fb00e4..7d0fe8bca 100755
--- a/scripts/killall_sglang.sh
+++ b/scripts/killall_sglang.sh
@@ -27,7 +27,6 @@ else
         lsof /dev/nvidia* | awk '{print $2}' | xargs kill -9 2>/dev/null
     fi

-
     # Show GPU status after clean up
     nvidia-smi
 fi
diff --git a/test/lang/run_suite.py b/test/lang/run_suite.py
index 67a477a11..04efba51f 100644
--- a/test/lang/run_suite.py
+++ b/test/lang/run_suite.py
@@ -8,8 +8,6 @@ suites = {
         TestFile("test_srt_backend.py"),
         # Skip this due to some OPENAI_API_KEY issues
         # "test_openai_backend.py",
-        TestFile("test_separate_reasoning.py"),
-        TestFile("test_separate_reasoning_execution.py"),
     ],
 }
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index aecea4498..ab06ec596 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -38,7 +38,6 @@ suites = {
         TestFile("openai_server/basic/test_serving_embedding.py", 10),
         TestFile("openai_server/basic/test_openai_embedding.py", 141),
         TestFile("openai_server/basic/test_openai_server.py", 149),
-        TestFile("openai_server/features/test_cache_report.py", 100),
         TestFile("openai_server/features/test_enable_thinking.py", 70),
         TestFile("openai_server/features/test_json_constrained.py", 98),
         TestFile("openai_server/features/test_json_mode.py", 90),
@@ -103,7 +102,6 @@ suites = {
         TestFile("test_update_weights_from_disk.py", 114),
         TestFile("test_update_weights_from_tensor.py", 48),
         TestFile("test_utils_update_weights.py", 48),
-        TestFile("test_vertex_endpoint.py", 31),
         TestFile("test_vision_chunked_prefill.py", 175),
         TestFile("test_vlm_input_format.py", 300),
         TestFile("test_vision_openai_server_a.py", 584),
@@ -167,7 +165,6 @@ suites = {
         TestFile("models/lora/test_lora_tp.py", 116),
         TestFile("test_data_parallelism.py", 73),
         TestFile("test_dp_attention.py", 277),
-        TestFile("test_mla_tp.py", 170),
         TestFile("test_patch_torch.py", 19),
         TestFile("test_update_weights_from_distributed.py", 103),
         TestFile("test_release_memory_occupation.py", 127),
@@ -175,7 +172,6 @@ suites = {
     "per-commit-2-gpu-amd": [
         TestFile("models/lora/test_lora_tp.py", 116),
         TestFile("test_data_parallelism.py", 73),
-        TestFile("test_mla_tp.py", 170),
         TestFile("test_patch_torch.py", 19),
         TestFile("test_update_weights_from_distributed.py", 103),
     ],
diff --git a/test/srt/test_ascend_tp1_bf16.py b/test/srt/test_ascend_tp1_bf16.py
index 90fde7a80..f854605ce 100644
--- a/test/srt/test_ascend_tp1_bf16.py
+++ b/test/srt/test_ascend_tp1_bf16.py
@@ -15,7 +15,7 @@ from sglang.test.test_utils import (

 TEST_MODEL_MATRIX = {
     "Qwen/Qwen2.5-7B-Instruct": {
-        "accuracy": 0.85,
+        "accuracy": 0.84,
         "latency": 150,
         "output_throughput": 30,
     },
diff --git a/test/srt/test_bench_serving.py b/test/srt/test_bench_serving.py
index 581238a01..400571713 100644
--- a/test/srt/test_bench_serving.py
+++ b/test/srt/test_bench_serving.py
@@ -1,7 +1,6 @@
 import asyncio
 import itertools
 import unittest
-from random import random, uniform

 import requests
diff --git a/test/srt/test_mla_deepseek_v3.py b/test/srt/test_mla_deepseek_v3.py
index c2d659a78..0ebb191fb 100644
--- a/test/srt/test_mla_deepseek_v3.py
+++ b/test/srt/test_mla_deepseek_v3.py
@@ -149,66 +149,5 @@ class TestDeepseekV3MTP(CustomTestCase):
         self.assertGreater(avg_spec_accept_length, 2.5)


-# compatible with old APIs
-class TestDeepseekV3MTPWithDraft(CustomTestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = "lmsys/sglang-ci-dsv3-test"
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        other_args = [
-            "--trust-remote-code",
-            "--cuda-graph-max-bs",
-            "2",
-            "--disable-radix",
-            "--enable-torch-compile",
-            "--torch-compile-max-bs",
-            "1",
-            "--speculative-algorithm",
-            "EAGLE",
-            "--speculative-draft",
-            "lmsys/sglang-ci-dsv3-test-NextN",
-            "--speculative-num-steps",
-            "2",
-            "--speculative-eagle-topk",
-            "4",
-            "--speculative-num-draft-tokens",
-            "4",
-        ]
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=other_args,
-        )
-
-    @classmethod
-    def tearDownClass(cls):
-        kill_process_tree(cls.process.pid)
-
-    def test_gsm8k(self):
-        requests.get(self.base_url + "/flush_cache")
-
-        args = SimpleNamespace(
-            num_shots=5,
-            data_path=None,
-            num_questions=200,
-            max_new_tokens=512,
-            parallel=128,
-            host="http://127.0.0.1",
-            port=int(self.base_url.split(":")[-1]),
-        )
-        metrics = run_eval_few_shot_gsm8k(args)
-        print(metrics)
-
-        self.assertGreater(metrics["accuracy"], 0.60)
-
-        server_info = requests.get(self.base_url + "/get_server_info")
-        avg_spec_accept_length = server_info.json()["internal_states"][0][
-            "avg_spec_accept_length"
-        ]
-        print(f"{avg_spec_accept_length=}")
-        self.assertGreater(avg_spec_accept_length, 2.5)
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/srt/test_mla_flashinfer.py b/test/srt/test_mla_flashinfer.py
index d04cf37fb..f72aef5a5 100644
--- a/test/srt/test_mla_flashinfer.py
+++ b/test/srt/test_mla_flashinfer.py
@@ -25,7 +25,7 @@ class TestFlashinferMLA(CustomTestCase):
             [
                 "--enable-torch-compile",
                 "--cuda-graph-max-bs",
-                "2",
+                "4",
                 "--attention-backend",
                 "flashinfer",
             ]
@@ -68,7 +68,6 @@ class TestFlashinferMLAMTP(CustomTestCase):
             [
                 "--cuda-graph-max-bs",
                 "4",
-                "--disable-radix",
                 "--enable-torch-compile",
                 "--torch-compile-max-bs",
                 "1",
diff --git a/test/srt/test_mla_int8_deepseek_v3.py b/test/srt/test_mla_int8_deepseek_v3.py
index 38207cdf6..a528a64be 100644
--- a/test/srt/test_mla_int8_deepseek_v3.py
+++ b/test/srt/test_mla_int8_deepseek_v3.py
@@ -10,6 +10,7 @@ from sglang.test.test_utils import (
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
     CustomTestCase,
+    is_in_ci,
     popen_launch_server,
 )

@@ -112,6 +113,7 @@ class TestDeepseekV3MTPChannelInt8(CustomTestCase):
         self.assertGreater(avg_spec_accept_length, 2.5)


+@unittest.skipIf(is_in_ci(), "To reduce the CI execution time.")
 class TestMLADeepseekV3BlockInt8(CustomTestCase):
     @classmethod
     def setUpClass(cls):