From 706bd69cc58aefb7c0a4d7b269f1cbe2908f955b Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Fri, 8 Aug 2025 19:56:50 -0700
Subject: [PATCH] Clean up server_args.py to have a dedicated function for model specific adjustments (#8983)

---
 .github/workflows/execute-notebook.yml | 5 +-
 .github/workflows/pr-test-pd-router.yml | 2 +-
 .github/workflows/pr-test.yml | 74 +++---
 .github/workflows/vllm-dependency-test.yml | 7 +-
 .gitmodules | 0
 README.md | 9 +-
 docs/backend/server_arguments.md | 4 +-
 docs/index.rst | 6 +-
 python/pyproject.toml | 9 +-
 python/sglang/srt/configs/model_config.py | 12 +-
 python/sglang/srt/entrypoints/context.py | 4 +-
 .../layers/attention/hybrid_attn_backend.py | 2 +-
 .../srt/layers/quantization/__init__.py | 2 +-
 python/sglang/srt/server_args.py | 214 ++++++++----
 scripts/ci_install_dependency.sh | 75 +++---
 scripts/fix_corrupted_json.py | 40 ----
 scripts/killall_sglang.sh | 1 -
 test/lang/run_suite.py | 2 -
 test/srt/run_suite.py | 4 -
 test/srt/test_ascend_tp1_bf16.py | 2 +-
 test/srt/test_bench_serving.py | 1 -
 test/srt/test_mla_deepseek_v3.py | 61 -----
 test/srt/test_mla_flashinfer.py | 3 +-
 test/srt/test_mla_int8_deepseek_v3.py | 2 +
 24 files changed, 201 insertions(+), 340 deletions(-)
 delete mode 100644 .gitmodules
 delete mode 100644 scripts/fix_corrupted_json.py

diff --git a/.github/workflows/execute-notebook.yml b/.github/workflows/execute-notebook.yml
index 627650417..d8381b12e 100644
--- a/.github/workflows/execute-notebook.yml
+++ b/.github/workflows/execute-notebook.yml
@@ -26,10 +26,7 @@ jobs:
         run: |
           bash scripts/ci_install_dependency.sh
           pip install -r docs/requirements.txt
-          apt-get update
-          apt-get install -y pandoc
-          apt-get update && apt-get install -y parallel retry
-
+          apt-get update && apt-get install -y pandoc parallel retry
           ln -sf "$(which python3)" /usr/bin/python

       - name: Setup Jupyter Kernel
diff --git a/.github/workflows/pr-test-pd-router.yml b/.github/workflows/pr-test-pd-router.yml
index c7fa1e666..570dc4e21 100644
--- a/.github/workflows/pr-test-pd-router.yml
+++ b/.github/workflows/pr-test-pd-router.yml
@@ -1,4 +1,4 @@
-name: Test Disaggregation Mode
+name: PR Test (PD Router)

 on:
   push:
diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index d99d68bc7..f58fb2377 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -61,7 +61,7 @@ jobs:
           python3 run_suite.py --suite per-commit

   unit-test-backend-1-gpu:
-    needs: check-changes
+    needs: [check-changes, unit-test-frontend]
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false &&
       needs.check-changes.outputs.src == 'true'
@@ -85,7 +85,7 @@ jobs:
           python3 run_suite.py --suite per-commit --auto-partition-id ${{ matrix.part }} --auto-partition-size 10

   unit-test-backend-2-gpu:
-    needs: check-changes
+    needs: [check-changes, unit-test-frontend]
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false &&
       needs.check-changes.outputs.src == 'true'
@@ -105,7 +105,7 @@ jobs:
           python3 run_suite.py --suite per-commit-2-gpu

   unit-test-backend-4-gpu:
-    needs: [check-changes, unit-test-frontend, unit-test-backend-2-gpu]
+    needs: [check-changes, unit-test-backend-2-gpu]
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false &&
       needs.check-changes.outputs.src == 'true'
@@ -125,7 +125,7 @@ jobs:
           python3 run_suite.py --suite per-commit-4-gpu

   unit-test-backend-8-gpu:
-    needs: [check-changes, unit-test-frontend, unit-test-backend-2-gpu]
+    needs: [check-changes, unit-test-backend-2-gpu]
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false &&
       needs.check-changes.outputs.src == 'true'
@@ -245,7 +245,7 @@ jobs:
           python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_online_latency

   performance-test-2-gpu:
-    needs: check-changes
+    needs: [check-changes, unit-test-backend-2-gpu]
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false &&
       needs.check-changes.outputs.src == 'true'
@@ -282,13 +282,13 @@ jobs:
           cd test/srt
           python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache

-      - name: Benchmark offline decode throughput (PP=2)
+      - name: Benchmark offline PP decode throughput (PP=2)
         timeout-minutes: 10
         run: |
           cd test/srt
           python3 -m unittest test_bench_serving.TestBenchServing.test_pp_offline_throughput_default_decode

-      - name: Benchmark offline prefill throughput (PP=2)
+      - name: Benchmark offline PP prefill throughput (PP=2)
         timeout-minutes: 10
         run: |
           cd test/srt
@@ -318,7 +318,7 @@ jobs:
           python3 test_eval_accuracy_large.py

   accuracy-test-2-gpu:
-    needs: check-changes
+    needs: [check-changes, accuracy-test-1-gpu]
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false &&
       needs.check-changes.outputs.src == 'true'
@@ -341,7 +341,7 @@ jobs:
           python3 test_moe_eval_accuracy_large.py

   unit-test-deepep-4-gpu:
-    needs: check-changes
+    needs: [check-changes, unit-test-backend-2-gpu]
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false &&
       needs.check-changes.outputs.src == 'true'
@@ -361,7 +361,7 @@ jobs:
           python3 run_suite.py --suite per-commit-4-gpu-deepep

   unit-test-deepep-8-gpu:
-    needs: [check-changes, unit-test-deepep-4-gpu]
+    needs: [check-changes, unit-test-backend-2-gpu]
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false &&
       needs.check-changes.outputs.src == 'true'
@@ -380,12 +380,38 @@ jobs:
           cd test/srt
           python3 run_suite.py --suite per-commit-8-gpu-deepep

+  unit-test-backend-8-gpu-b200:
+    needs: [check-changes, unit-test-backend-2-gpu]
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+      github.event.pull_request.draft == false &&
+      needs.check-changes.outputs.src == 'true'
+    runs-on: b200-runner
+    strategy:
+      fail-fast: false
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          IS_BLACKWELL=1 bash scripts/ci_install_dependency.sh
+
+      - name: Run test
+        timeout-minutes: 20
+        run: |
+          cd test/srt
+          python3 run_suite.py --suite per-commit-8-gpu-b200 --auto-partition-id 0 --auto-partition-size 1
+
+
   pr-test-finish:
     needs: [
       check-changes,
-      unit-test-frontend, unit-test-backend-1-gpu, unit-test-backend-2-gpu, unit-test-backend-4-gpu,
-      unit-test-backend-8-gpu, performance-test-1-gpu-part-1, performance-test-1-gpu-part-2, performance-test-2-gpu,
-      accuracy-test-1-gpu, accuracy-test-2-gpu, unit-test-deepep-4-gpu, unit-test-deepep-8-gpu,
+      unit-test-frontend, unit-test-backend-1-gpu,
+      unit-test-backend-2-gpu, unit-test-backend-4-gpu, unit-test-backend-8-gpu,
+      performance-test-1-gpu-part-1, performance-test-1-gpu-part-2, performance-test-2-gpu,
+      accuracy-test-1-gpu, accuracy-test-2-gpu,
+      unit-test-deepep-4-gpu, unit-test-deepep-8-gpu,
+      unit-test-backend-8-gpu-b200,
     ]
     if: needs.check-changes.outputs.src == 'true'
     runs-on: ubuntu-latest
@@ -401,25 +427,3 @@ jobs:
           done
           echo "All jobs completed successfully"
           exit 0
-
-  unit-test-backend-8-gpu-b200:
-    needs: [check-changes, unit-test-frontend, unit-test-backend-2-gpu]
-    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
-      github.event.pull_request.draft == false &&
-      needs.check-changes.outputs.src == 'true'
-    runs-on: b200-runner
-    strategy:
-      fail-fast: false
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Install dependencies
-        run: |
-          MODE_BLACKWELL=1 bash scripts/ci_install_dependency.sh
-
-      - name: Run test
-        timeout-minutes: 20
-        run: |
-          cd test/srt
-          python3 run_suite.py --suite per-commit-8-gpu-b200 --auto-partition-id 0 --auto-partition-size 1
diff --git a/.github/workflows/vllm-dependency-test.yml b/.github/workflows/vllm-dependency-test.yml
index 1c5630fb6..785b42efc 100644
--- a/.github/workflows/vllm-dependency-test.yml
+++ b/.github/workflows/vllm-dependency-test.yml
@@ -30,11 +30,12 @@ jobs:
       - name: Install dependencies
         run: |
           bash scripts/ci_install_dependency.sh
-          pip install "vllm==0.10.0"
+          pip install "vllm==0.9.0"
           pip install "bitsandbytes>=0.44.0"
-          pip3 install torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu126
+          pip install torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu126
+          pip install "openai==1.99.1"

-      - name: Run VLLM dependency tests
+      - name: Run vLLM dependency tests
         timeout-minutes: 60
         run: |
           cd test/srt
diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index e69de29bb..000000000
diff --git a/README.md b/README.md
index 974eaf12f..3b3a226b9 100644
--- a/README.md
+++ b/README.md
@@ -20,13 +20,13 @@
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |

 ## News
+- [2025/08] 🔥 SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
 - [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
 - [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).
 - [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
 - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
 - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
 - [2024/12] v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
-- [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).

 More
@@ -35,6 +35,7 @@
 - [2025/01] SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
 - [2024/10] The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
 - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
+- [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
@@ -46,10 +47,10 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:

-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor parallelism, pipeline parallelism, expert parallelism, structured outputs, chunked prefill, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor/pipeline/expert/data parallelism, structured outputs, chunked prefill, quantization (FP4/FP8/INT4/AWQ/GPTQ), and multi-lora batching.
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, Qwen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
-- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Qwen, DeepSeek, Kimi, GPT, Gemma, Mistral, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
+- **Active Community**: SGLang is open-source and backed by an active community with wide industry adoption.

 ## Getting Started
 - [Install SGLang](https://docs.sglang.ai/start/install.html)
diff --git a/docs/backend/server_arguments.md b/docs/backend/server_arguments.md
index 008953bc2..3d951a9f7 100644
--- a/docs/backend/server_arguments.md
+++ b/docs/backend/server_arguments.md
@@ -189,8 +189,8 @@ Please consult the documentation below and [server_args.py](https://github.com/s
 | Arguments | Description | Defaults |
 |-----------|-------------|----------|
 | `--attention-backend` | Choose the kernels for attention layers. | None |
-| `decode_attention_backend` | (Experimental) This argument specifies the backend for decode attention computation. Note that this argument has priority over `attention_backend`. | None |
-| `prefill_attention_backend` | (Experimental) This argument specifies the backend for prefill attention computation. Note that this argument has priority over `attention_backend`. | None |
+| `--prefill-attention-backend` | (Experimental) This argument specifies the backend for prefill attention computation. Note that this argument has priority over `attention_backend`. | None |
+| `--decode-attention-backend` | (Experimental) This argument specifies the backend for decode attention computation. Note that this argument has priority over `attention_backend`. | None |
 | `--sampling-backend` | Choose the kernels for sampling layers. | None |
 | `--grammar-backend` | Choose the backend for grammar-guided decoding. | None |
 | `--mm-attention-backend` | Set multimodal attention backend. | None |
diff --git a/docs/index.rst b/docs/index.rst
index 380b58b1b..4530e7442 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -5,10 +5,10 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:

-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor parallelism, pipeline parallelism, expert parallelism, structured outputs, chunked prefill, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor/pipeline/expert/data parallelism, structured outputs, chunked prefill, quantization (FP4/FP8/INT4/AWQ/GPTQ), and multi-lora batching.
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, Qwen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
-- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Qwen, DeepSeek, Kimi, GPT, Gemma, Mistral, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
+- **Active Community**: SGLang is open-source and backed by an active community with wide industry adoption.

 .. toctree::
    :maxdepth: 1
diff --git a/python/pyproject.toml b/python/pyproject.toml
index f688803dd..376c8b2d5 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -21,6 +21,7 @@ runtime_common = [
     "build",
     "compressed-tensors",
     "datasets",
+    "einops",
     "fastapi",
     "hf_transfer",
     "huggingface_hub",
@@ -29,6 +30,7 @@ runtime_common = [
     "modelscope",
     "msgspec",
     "ninja",
+    "openai==1.99.1",
     "openai-harmony==0.0.3",
     "orjson",
     "outlines==0.1.11",
@@ -48,6 +50,7 @@ runtime_common = [
     "torchao==0.9.0",
     "transformers==4.55.0",
     "timm==1.0.16",
+    "tiktoken",
     "uvicorn",
     "uvloop",
     "xgrammar==0.1.22",
@@ -60,7 +63,6 @@ srt = [
     "torchaudio==2.8.0",
     "torchvision",
     "cuda-python",
-    "einops",
     "flashinfer_python==0.2.10",
 ]

@@ -71,10 +73,7 @@ blackwell = [
     "torchaudio==2.8.0",
     "torchvision",
     "cuda-python",
-    "einops",
     "flashinfer_python==0.2.10",
-    "tiktoken",
-    "openai==1.99.1",
 ]

 # HIP (Heterogeneous-computing Interface for Portability) for AMD
@@ -101,7 +100,7 @@ srt_npu = ["sglang[runtime_common]"]
 openai = ["openai==1.99.1", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
-torch_memory_saver = ["torch_memory_saver>=0.0.8"]
+torch_memory_saver = ["torch_memory_saver==0.0.8"]
 decord = ["decord"]
 test = [
     "accelerate",
diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py
index 812f07103..da70ec740 100644
--- a/python/sglang/srt/configs/model_config.py
+++ b/python/sglang/srt/configs/model_config.py
@@ -64,13 +64,12 @@ class ModelConfig:
         hybrid_kvcache_ratio: Optional[float] = None,
         model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
     ) -> None:
-
+        # Parse args
         self.model_path = model_path
         self.revision = revision
         self.quantization = quantization
         self.model_impl = model_impl

-        # Parse args
         self.maybe_pull_model_tokenizer_from_remote()
         self.model_override_args = json.loads(model_override_args)
         kwargs = {}
@@ -139,6 +138,7 @@ class ModelConfig:
             and self.hf_config.architectures[0] == "Ernie4_5_MoeForCausalLM"
         ):
             self.hf_config.architectures[0] = "Ernie4_5_MoeForCausalLMMTP"
+
         # Check model type
         self.is_generation = is_generation_model(
             self.hf_config.architectures, is_embedding
@@ -282,12 +282,10 @@ class ModelConfig:

         # Cache attributes
         self.hf_eos_token_id = self.get_hf_eos_token_id()
-        config = self.hf_config
-
         # multimodal
-        self.image_token_id = getattr(config, "image_token_id", None) or getattr(
-            config, "image_token_index", None
-        )
+        self.image_token_id = getattr(
+            self.hf_config, "image_token_id", None
+        ) or getattr(self.hf_config, "image_token_index", None)

     @staticmethod
     def from_server_args(server_args: ServerArgs, model_path: str = None, **kwargs):
diff --git a/python/sglang/srt/entrypoints/context.py b/python/sglang/srt/entrypoints/context.py
index 0c8bc116d..e7a0c07cf 100644
--- a/python/sglang/srt/entrypoints/context.py
+++ b/python/sglang/srt/entrypoints/context.py
@@ -9,8 +9,8 @@ logger = logging.getLogger(__name__)

 try:
     from mcp import ClientSession
-except ImportError:
-    logger.warning("Ignoring mcp import error")
+except ImportError as e:
+    mcp = e

 from openai_harmony import Author, Message, Role, StreamState, TextContent
diff --git a/python/sglang/srt/layers/attention/hybrid_attn_backend.py b/python/sglang/srt/layers/attention/hybrid_attn_backend.py
index 370961864..b9f829e41 100644
--- a/python/sglang/srt/layers/attention/hybrid_attn_backend.py
+++ b/python/sglang/srt/layers/attention/hybrid_attn_backend.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING, Optional, Union
+from typing import Optional, Union

 import torch
diff --git a/python/sglang/srt/layers/quantization/__init__.py b/python/sglang/srt/layers/quantization/__init__.py
index 19977012a..7be1572da 100644
--- a/python/sglang/srt/layers/quantization/__init__.py
+++ b/python/sglang/srt/layers/quantization/__init__.py
@@ -3,7 +3,7 @@ from __future__ import annotations

 import builtins
 import inspect
-from typing import TYPE_CHECKING, Callable, Dict, Optional, Type, Union
+from typing import TYPE_CHECKING, Dict, Optional, Type

 import torch
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 7bfd443bf..b6c5156ff 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -37,7 +37,6 @@ from sglang.srt.utils import (
     is_hip,
     is_port_available,
     is_remote_url,
-    is_triton_kernels_available,
     is_valid_ipv6_address,
     nullable_str,
 )
@@ -109,7 +108,7 @@ class ServerArgs:
     log_level: str = "info"
     log_level_http: Optional[str] = None
     log_requests: bool = False
-    log_requests_level: int = 0
+    log_requests_level: int = 2
     crash_dump_folder: Optional[str] = None
     show_time_cost: bool = False
     enable_metrics: bool = False
@@ -131,6 +130,7 @@ class ServerArgs:
     enable_cache_report: bool = False
     reasoning_parser: Optional[str] = None
     tool_call_parser: Optional[str] = None
+    tool_server: Optional[str] = None

     # Data parallelism
     dp_size: int = 1
@@ -278,15 +278,11 @@ class ServerArgs:
     enable_pdmux: bool = False
     sm_group_num: int = 3

-    # For tool server
-    tool_server: Optional[str] = None
-
     # Deprecated arguments
     enable_ep_moe: bool = False
     enable_deepep_moe: bool = False

     def __post_init__(self):
-        # Check deprecated arguments
         def print_deprecated_warning(message: str):
             logger.warning(f"\033[33m{message}\033[0m")
@@ -392,6 +388,9 @@ class ServerArgs:
             self.attention_backend = "torch_native"
             self.sampling_backend = "pytorch"

+        # Model-specific adjustments
+        self.model_specific_adjustments()
+
         # Set kernel backends
         if self.device == "cpu":
             if self.attention_backend is None:
@@ -470,55 +469,9 @@ class ServerArgs:
                 "trtllm_mha backend does not support speculative decoding yet."
             )

-        model_arch = self.get_hf_config().architectures[0]
-        if model_arch in ["GptOssForCausalLM"]:
-            if self.attention_backend is None:
-                # default is triton, but we could have trtllm_mha as an option
-                self.attention_backend = "triton"
-            assert (
-                self.attention_backend == "trtllm_mha"
-                or self.attention_backend == "triton"
-            )
-            quantization_config = getattr(
-                self.get_hf_config(), "quantization_config", None
-            )
-            is_mxfp4_quant_format = (
-                quantization_config is not None
-                and quantization_config.get("quant_method") == "mxfp4"
-            )
-
-            if is_sm100_supported() and is_mxfp4_quant_format:
-                self.enable_flashinfer_mxfp4_moe = True
-                self.enable_triton_kernel_moe = False
-                logger.info(
-                    "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
-                )
-            else:
-                if self.enable_triton_kernel_moe:
-                    assert (
-                        self.ep_size == 1
-                    ), "Triton kernel MoE is only supported when ep_size == 1"
-                if not self.enable_triton_kernel_moe and self.ep_size == 1:
-                    self.enable_triton_kernel_moe = True
-                    logger.info(
-                        "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
-                    )
-
-            self.disable_hybrid_swa_memory = True
-
-            if is_mxfp4_quant_format:
-                # use bf16 for mxfp4 triton kernels
-                self.dtype = "bfloat16"
-
         if self.attention_backend == "dual_chunk_flash_attn":
             logger.warning(
-                "Mixed chunk is disabled because of using dual chunk flash attention backend"
-            )
-            logger.warning(
-                "Radix cache is disabled because of using dual chunk flash attention backend"
-            )
-            logger.warning(
-                "Cuda graph is disabled because of using dual chunk flash attention backend"
+                "Mixed chunk, radix cache, and cuda graphs are disabled because of using dual chunk flash attention backend"
             )
             self.enable_mixed_chunk = False
             self.disable_cuda_graph = True
@@ -583,7 +536,7 @@ class ServerArgs:

         if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
             self.expert_distribution_recorder_mode = "stat"
-            logger.info(
+            logger.warning(
                 "EPLB is enabled. The expert_distribution_recorder_mode is automatically set."
             )

@@ -591,9 +544,6 @@ class ServerArgs:
             self.ep_dispatch_algorithm is None
         ):
             self.ep_dispatch_algorithm = "static"
-            logger.info(
-                "EPLB is enabled or init_expert_location is provided. ep_dispatch_algorithm is configured."
-            )

         if self.enable_eplb:
             assert self.ep_size > 1 or self.moe_a2a_backend is not None
@@ -1112,7 +1062,7 @@ class ServerArgs:
         parser.add_argument(
             "--log-requests-level",
             type=int,
-            default=0,
+            default=ServerArgs.log_requests_level,
             help="0: Log metadata (no sampling parameters). 1: Log metadata and sampling parameters. 2: Log metadata, sampling parameters and partial input/output. 3: Log every input/output.",
             choices=[0, 1, 2, 3],
         )
@@ -1245,6 +1195,12 @@ class ServerArgs:
             default=ServerArgs.tool_call_parser,
             help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', 'kimi_k2', 'qwen3_coder', 'glm45', and 'step3'.",
         )
+        parser.add_argument(
+            "--tool-server",
+            type=str,
+            default=None,
+            help="Either 'demo' or a comma-separated list of tool server urls to use for the model. If not specified, no tool server will be used.",
+        )

         # Data parallelism
         parser.add_argument(
@@ -1344,55 +1300,41 @@ class ServerArgs:
         )

         # Kernel backend
+        ATTN_BACKENDS = [
+            "aiter",
+            "cutlass_mla",
+            "fa3",
+            "flashinfer",
+            "flashmla",
+            "intel_amx",
+            "torch_native",
+            "ascend",
+            "triton",
+            "trtllm_mla",
+            "trtllm_mha",
+            "dual_chunk_flash_attn",
+        ]
         parser.add_argument(
             "--attention-backend",
             type=str,
-            choices=[
-                "aiter",
-                "cutlass_mla",
-                "fa3",
-                "flashinfer",
-                "flashmla",
-                "intel_amx",
-                "torch_native",
-                "ascend",
-                "triton",
-                "trtllm_mla",
-                "trtllm_mha",
-                "dual_chunk_flash_attn",
-            ],
+            choices=ATTN_BACKENDS,
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
         )
-        parser.add_argument(
-            "--decode-attention-backend",
-            type=str,
-            choices=[
-                "flashinfer",
-                "triton",
-                "torch_native",
-                "fa3",
-                "flashmla",
-                "cutlass_mla",
-            ],
-            default=ServerArgs.decode_attention_backend,
-            help="Choose the kernels for decode attention layers (have priority over --attention-backend).",
-        )
-
         parser.add_argument(
             "--prefill-attention-backend",
             type=str,
-            choices=[
-                "flashinfer",
-                "triton",
-                "torch_native",
-                "fa3",
-                "flashmla",
-                "cutlass_mla",
-            ],
+            choices=ATTN_BACKENDS,
             default=ServerArgs.prefill_attention_backend,
             help="Choose the kernels for prefill attention layers (have priority over --attention-backend).",
         )
+        parser.add_argument(
+            "--decode-attention-backend",
+            type=str,
+            choices=ATTN_BACKENDS,
+            default=ServerArgs.decode_attention_backend,
+            help="Choose the kernels for decode attention layers (have priority over --attention-backend).",
+        )
         parser.add_argument(
             "--sampling-backend",
             type=str,
@@ -1612,7 +1554,6 @@ class ServerArgs:
             default=ServerArgs.hicache_mem_layout,
             help="The layout of host memory pool for hierarchical cache.",
         )
-
         parser.add_argument(
             "--hicache-storage-backend",
             type=str,
@@ -1985,14 +1926,6 @@ class ServerArgs:
             help="Disable mmap while loading weight using safetensors.",
         )

-        # For tool server
-        parser.add_argument(
-            "--tool-server",
-            type=str,
-            default=None,
-            help="Either 'demo' or a comma-separated list of tool server urls to use for the model. If not specified, no tool server will be used.",
-        )
-
         # Deprecated arguments
         parser.add_argument(
             "--enable-ep-moe",
@@ -2056,25 +1989,6 @@ class ServerArgs:
             None,
         }, "moe_dense_tp_size only support 1 and None currently"

-        # Check model architecture
-        model_arch = self.get_hf_config().architectures[0]
-        if "Llama4" in model_arch:
-            assert self.attention_backend == "fa3", "fa3 is required for Llama4 model"
-
-        if model_arch in [
-            "Gemma2ForCausalLM",
-            "Gemma3ForCausalLM",
-            "Gemma3ForConditionalGeneration",
-            "Gemma3nForCausalLM",
-            "Gemma3nForConditionalGeneration",
-        ]:
-            # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
-            # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
-            logger.warning(
-                f"Disable hybrid SWA memory for {model_arch} as it is not yet supported."
-            )
-            self.disable_hybrid_swa_memory = True
-
         # Check LoRA
         self.check_lora_server_args()
@@ -2100,7 +2014,7 @@ class ServerArgs:
         if self.lora_paths:
             if self.enable_lora is None:
                 self.enable_lora = True
-                logger.info(
+                logger.warning(
                     "--enable-lora is set to True because --lora-paths is provided."
                 )
             elif self.enable_lora is False:
@@ -2172,6 +2086,58 @@ class ServerArgs:
             f"decode_tp={decode_tp}, prefill_tp={prefill_tp}"
         )

+    def model_specific_adjustments(self):
+        hf_config = self.get_hf_config()
+        model_arch = hf_config.architectures[0]
+        if model_arch in ["GptOssForCausalLM"]:
+            if self.attention_backend is None:
+                self.attention_backend = "triton"
+            assert self.attention_backend in [
+                "triton",
+                "trtllm_mha",
+            ], f"GptOssForCausalLM requires 'triton' or 'trtllm_mha' attention backend, but got {self.attention_backend}"
+            quantization_config = getattr(hf_config, "quantization_config", None)
+            is_mxfp4_quant_format = (
+                quantization_config is not None
+                and quantization_config.get("quant_method") == "mxfp4"
+            )
+
+            if is_sm100_supported() and is_mxfp4_quant_format:
+                self.enable_flashinfer_mxfp4_moe = True
+                self.enable_triton_kernel_moe = False
+                logger.warning(
+                    "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
+                )
+            else:
+                if self.enable_triton_kernel_moe:
+                    assert (
+                        self.ep_size == 1
+                    ), "Triton kernel MoE is only supported when ep_size == 1"
+                if not self.enable_triton_kernel_moe and self.ep_size == 1:
+                    self.enable_triton_kernel_moe = True
+                    logger.warning(
+                        "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
+                    )
+            self.disable_hybrid_swa_memory = True
+            if is_mxfp4_quant_format:
+                # use bf16 for mxfp4 triton kernels
+                self.dtype = "bfloat16"
+        elif "Llama4" in model_arch:
+            assert self.attention_backend == "fa3", "fa3 is required for Llama4 model"
+        elif model_arch in [
+            "Gemma2ForCausalLM",
+            "Gemma3ForCausalLM",
+            "Gemma3ForConditionalGeneration",
+            "Gemma3nForCausalLM",
+            "Gemma3nForConditionalGeneration",
+        ]:
+            # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
+            # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
+            logger.warning(
+                f"Disable hybrid SWA memory for {model_arch} as it is not yet supported."
+            )
+            self.disable_hybrid_swa_memory = True
+
     def adjust_mem_fraction_for_vlm(self, model_config):
         vision_config = getattr(model_config.hf_config, "vision_config", None)
         if vision_config is None:
diff --git a/scripts/ci_install_dependency.sh b/scripts/ci_install_dependency.sh
index 7cad2775d..0ad51c7a3 100755
--- a/scripts/ci_install_dependency.sh
+++ b/scripts/ci_install_dependency.sh
@@ -2,68 +2,71 @@
 # Install the dependency in CI.
 set -euxo pipefail

-MODE_BLACKWELL=${MODE_BLACKWELL:-0}
+IS_BLACKWELL=${IS_BLACKWELL:-0}

-CU_VERSION="cu126"
-if [ "$MODE_BLACKWELL" = "1" ]; then
+if [ "$IS_BLACKWELL" = "1" ]; then
     CU_VERSION="cu129"
+else
+    CU_VERSION="cu126"
 fi

 # Kill existing processes
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 bash "${SCRIPT_DIR}/killall_sglang.sh"

-if [ "$MODE_BLACKWELL" = "1" ]; then
-    apt-get install -y git libnuma-dev
-fi
+# Install apt packages
+apt install -y git libnuma-dev

-# Update pip
-if [ "$MODE_BLACKWELL" != "1" ]; then
+# Install uv
+if [ "$IS_BLACKWELL" = "1" ]; then
+    # The blackwell CI runner has some issues with pip and uv,
+    # so we can only use pip with `--break-system-packages`
+    PIP_CMD="pip"
+    PIP_INSTALL_SUFFIX="--break-system-packages"
+
+    # Clean up existing installations
+    $PIP_CMD uninstall -y flashinfer_python sgl-kernel sglang vllm $PIP_INSTALL_SUFFIX || true
+else
+    # In normal cases, we use uv, which is much faster than pip.
     pip install --upgrade pip
-fi
+    pip install uv
+    export UV_SYSTEM_PYTHON=true

-# Clean up existing installations
-pip uninstall -y flashinfer flashinfer_python sgl-kernel sglang vllm --break-system-packages || true
-pip cache purge || true
-rm -rf /root/.cache/flashinfer
-# TODO handle other python versions
-rm -rf /usr/local/lib/python3.10/dist-packages/flashinfer*
-rm -rf /usr/local/lib/python3.10/dist-packages/sgl_kernel*
+    PIP_CMD="uv pip"
+    PIP_INSTALL_SUFFIX="--index-strategy unsafe-best-match"
+
+    # Clean up existing installations
+    $PIP_CMD uninstall flashinfer_python sgl-kernel sglang vllm || true
+fi

 # Install the main package
-pip install -e "python[dev]" --extra-index-url https://download.pytorch.org/whl/${CU_VERSION} --break-system-packages
+$PIP_CMD install -e "python[dev]" --extra-index-url https://download.pytorch.org/whl/${CU_VERSION} $PIP_INSTALL_SUFFIX

-if [ "$MODE_BLACKWELL" = "1" ]; then
+if [ "$IS_BLACKWELL" = "1" ]; then
     # TODO auto determine sgl-kernel version
     SGL_KERNEL_VERSION=0.3.2
-    pip3 install https://github.com/sgl-project/whl/releases/download/v${SGL_KERNEL_VERSION}/sgl_kernel-${SGL_KERNEL_VERSION}-cp39-abi3-manylinux2014_x86_64.whl --break-system-packages
+    $PIP_CMD install https://github.com/sgl-project/whl/releases/download/v${SGL_KERNEL_VERSION}/sgl_kernel-${SGL_KERNEL_VERSION}-cp39-abi3-manylinux2014_x86_64.whl --force-reinstall $PIP_INSTALL_SUFFIX
 fi

 # Show current packages
-pip list
+$PIP_CMD list

 # Install additional dependencies
-pip install mooncake-transfer-engine==0.3.5 nvidia-cuda-nvrtc-cu12 --break-system-packages
+$PIP_CMD install mooncake-transfer-engine==0.3.5 nvidia-cuda-nvrtc-cu12 py-spy huggingface_hub[hf_xet] $PIP_INSTALL_SUFFIX

-if [ "$MODE_BLACKWELL" != "1" ]; then
+if [ "$IS_BLACKWELL" != "1" ]; then
     # For lmms_evals evaluating MMMU
     git clone --branch v0.3.3 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
-    pip install -e lmms-eval/ --break-system-packages
+    $PIP_CMD install -e lmms-eval/ $PIP_INSTALL_SUFFIX
+
+    # Install xformers
+    $PIP_CMD install xformers --index-url https://download.pytorch.org/whl/${CU_VERSION} --no-deps $PIP_INSTALL_SUFFIX
 fi

 # Install FlashMLA for attention backend tests
-# pip install git+https://github.com/deepseek-ai/FlashMLA.git --break-system-packages
-
-# Install hf_xet
-pip install huggingface_hub[hf_xet] --break-system-packages
-
-if [ "$MODE_BLACKWELL" != "1" ]; then
-    # Install xformers
-    pip install -U xformers --index-url https://download.pytorch.org/whl/${CU_VERSION} --no-deps --force-reinstall --break-system-packages
-fi
-
-# To help dumping traces when timeout occurred
-pip install py-spy --break-system-packages
+# $PIP_CMD install git+https://github.com/deepseek-ai/FlashMLA.git $PIP_INSTALL_SUFFIX

 # Show current packages
-pip list
+$PIP_CMD list
+
+echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}"
diff --git a/scripts/fix_corrupted_json.py b/scripts/fix_corrupted_json.py
deleted file mode 100644
index 67c2980f5..000000000
--- a/scripts/fix_corrupted_json.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import json
-import re
-import sys
-
-
-def clean_json_file(input_file, output_file):
-    try:
-        # Open the input file with 'replace' option for handling bad characters
-        with open(input_file, "r", encoding="utf-8", errors="replace") as f:
-            data = f.read()
-
-        # Replace bad characters (represented by '�' after decoding) with a space
-        cleaned_data = data.replace("�", " ")
-
-        # Remove control characters (e.g., ASCII control characters like \x00 to \x1F)
-        # These can cause issues in JSON parsing.
-        cleaned_data = re.sub(r"[\x00-\x1F]+", " ", cleaned_data)
-
-        # Parse cleaned data as JSON
-        json_data = json.loads(cleaned_data)
-
-        # Write the cleaned JSON to a new output file
-        with open(output_file, "w", encoding="utf-8") as f:
-            json.dump(json_data, f, ensure_ascii=False, indent=4)
-
-        print(f"Cleaned JSON file has been saved to {output_file}")
-
-    except Exception as e:
-        print(f"Error: {e}")
-
-
-if __name__ == "__main__":
-    assert len(sys.argv) > 1, "please give the input file path"
-    if len(sys.argv) == 3:
-        input_file = sys.argv[1]
-        output_file = sys.argv[2]
-    else:
-        input_file = output_file = sys.argv[1]
-
-    clean_json_file(input_file, output_file)
diff --git a/scripts/killall_sglang.sh b/scripts/killall_sglang.sh
index 669fb00e4..7d0fe8bca 100755
--- a/scripts/killall_sglang.sh
+++ b/scripts/killall_sglang.sh
@@ -27,7 +27,6 @@ else
         lsof /dev/nvidia* | awk '{print $2}' | xargs kill -9 2>/dev/null
     fi

-
     # Show GPU status after clean up
     nvidia-smi
 fi
diff --git a/test/lang/run_suite.py b/test/lang/run_suite.py
index 67a477a11..04efba51f 100644
--- a/test/lang/run_suite.py
+++ b/test/lang/run_suite.py
@@ -8,8 +8,6 @@ suites = {
         TestFile("test_srt_backend.py"),
         # Skip this due to some OPENAI_API_KEY issues
         # "test_openai_backend.py",
-        TestFile("test_separate_reasoning.py"),
-        TestFile("test_separate_reasoning_execution.py"),
     ],
 }
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index aecea4498..ab06ec596 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -38,7 +38,6 @@ suites = {
         TestFile("openai_server/basic/test_serving_embedding.py", 10),
         TestFile("openai_server/basic/test_openai_embedding.py", 141),
         TestFile("openai_server/basic/test_openai_server.py", 149),
-        TestFile("openai_server/features/test_cache_report.py", 100),
         TestFile("openai_server/features/test_enable_thinking.py", 70),
         TestFile("openai_server/features/test_json_constrained.py", 98),
         TestFile("openai_server/features/test_json_mode.py", 90),
@@ -103,7 +102,6 @@ suites = {
         TestFile("test_update_weights_from_disk.py", 114),
         TestFile("test_update_weights_from_tensor.py", 48),
         TestFile("test_utils_update_weights.py", 48),
-        TestFile("test_vertex_endpoint.py", 31),
         TestFile("test_vision_chunked_prefill.py", 175),
         TestFile("test_vlm_input_format.py", 300),
         TestFile("test_vision_openai_server_a.py", 584),
@@ -167,7 +165,6 @@ suites = {
         TestFile("models/lora/test_lora_tp.py", 116),
         TestFile("test_data_parallelism.py", 73),
         TestFile("test_dp_attention.py", 277),
-        TestFile("test_mla_tp.py", 170),
         TestFile("test_patch_torch.py", 19),
         TestFile("test_update_weights_from_distributed.py", 103),
         TestFile("test_release_memory_occupation.py", 127),
@@ -175,7 +172,6 @@ suites = {
     "per-commit-2-gpu-amd": [
         TestFile("models/lora/test_lora_tp.py", 116),
         TestFile("test_data_parallelism.py", 73),
-        TestFile("test_mla_tp.py", 170),
         TestFile("test_patch_torch.py", 19),
         TestFile("test_update_weights_from_distributed.py", 103),
     ],
diff --git a/test/srt/test_ascend_tp1_bf16.py b/test/srt/test_ascend_tp1_bf16.py
index 90fde7a80..f854605ce 100644
--- a/test/srt/test_ascend_tp1_bf16.py
+++ b/test/srt/test_ascend_tp1_bf16.py
@@ -15,7 +15,7 @@ from sglang.test.test_utils import (

 TEST_MODEL_MATRIX = {
     "Qwen/Qwen2.5-7B-Instruct": {
-        "accuracy": 0.85,
+        "accuracy": 0.84,
         "latency": 150,
         "output_throughput": 30,
     },
diff --git a/test/srt/test_bench_serving.py b/test/srt/test_bench_serving.py
index 581238a01..400571713 100644
--- a/test/srt/test_bench_serving.py
+++ b/test/srt/test_bench_serving.py
@@ -1,7 +1,6 @@
 import asyncio
 import itertools
 import unittest
-from random import random, uniform

 import requests
diff --git a/test/srt/test_mla_deepseek_v3.py b/test/srt/test_mla_deepseek_v3.py
index c2d659a78..0ebb191fb 100644
--- a/test/srt/test_mla_deepseek_v3.py
+++ b/test/srt/test_mla_deepseek_v3.py
@@ -149,66 +149,5 @@ class TestDeepseekV3MTP(CustomTestCase):
         self.assertGreater(avg_spec_accept_length, 2.5)


-# compatible with old APIs
-class TestDeepseekV3MTPWithDraft(CustomTestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = "lmsys/sglang-ci-dsv3-test"
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        other_args = [
-            "--trust-remote-code",
-            "--cuda-graph-max-bs",
-            "2",
-            "--disable-radix",
-            "--enable-torch-compile",
-            "--torch-compile-max-bs",
-            "1",
-            "--speculative-algorithm",
-            "EAGLE",
-            "--speculative-draft",
-            "lmsys/sglang-ci-dsv3-test-NextN",
-            "--speculative-num-steps",
-            "2",
-            "--speculative-eagle-topk",
-            "4",
-            "--speculative-num-draft-tokens",
-            "4",
-        ]
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=other_args,
-        )
-
-    @classmethod
-    def tearDownClass(cls):
-        kill_process_tree(cls.process.pid)
-
-    def test_gsm8k(self):
-        requests.get(self.base_url + "/flush_cache")
-
-        args = SimpleNamespace(
-            num_shots=5,
-            data_path=None,
-            num_questions=200,
-            max_new_tokens=512,
-            parallel=128,
-            host="http://127.0.0.1",
-            port=int(self.base_url.split(":")[-1]),
-        )
-        metrics = run_eval_few_shot_gsm8k(args)
-        print(metrics)
-
-        self.assertGreater(metrics["accuracy"], 0.60)
-
-        server_info = requests.get(self.base_url + "/get_server_info")
-        avg_spec_accept_length = server_info.json()["internal_states"][0][
-            "avg_spec_accept_length"
-        ]
-        print(f"{avg_spec_accept_length=}")
-        self.assertGreater(avg_spec_accept_length, 2.5)
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/srt/test_mla_flashinfer.py b/test/srt/test_mla_flashinfer.py
index d04cf37fb..f72aef5a5 100644
--- a/test/srt/test_mla_flashinfer.py
+++ b/test/srt/test_mla_flashinfer.py
@@ -25,7 +25,7 @@ class TestFlashinferMLA(CustomTestCase):
             [
                 "--enable-torch-compile",
                 "--cuda-graph-max-bs",
-                "2",
+                "4",
                 "--attention-backend",
                 "flashinfer",
             ]
@@ -68,7 +68,6 @@ class TestFlashinferMLAMTP(CustomTestCase):
             [
                 "--cuda-graph-max-bs",
                 "4",
-                "--disable-radix",
                 "--enable-torch-compile",
                 "--torch-compile-max-bs",
                 "1",
diff --git a/test/srt/test_mla_int8_deepseek_v3.py b/test/srt/test_mla_int8_deepseek_v3.py
index 38207cdf6..a528a64be 100644
--- a/test/srt/test_mla_int8_deepseek_v3.py
+++ b/test/srt/test_mla_int8_deepseek_v3.py
@@ -10,6 +10,7 @@ from sglang.test.test_utils import (
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
     CustomTestCase,
+    is_in_ci,
     popen_launch_server,
 )

@@ -112,6 +113,7 @@ class TestDeepseekV3MTPChannelInt8(CustomTestCase):
         self.assertGreater(avg_spec_accept_length, 2.5)


+@unittest.skipIf(is_in_ci(), "To reduce the CI execution time.")
 class TestMLADeepseekV3BlockInt8(CustomTestCase):
     @classmethod
     def setUpClass(cls):